In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

import nltk
nltk.download('punkt_tab')  # This line is only needed the first time you run the code
nltk.download('stopwords') # This line is only needed the first time you run the code

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import accuracy_score, classification_report

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/lopezgg/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lopezgg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Utils

In [3]:
# English stopword list from NLTK's stopwords corpus; read by `tokenizer` below.
arr_stopwords = nltk.corpus.stopwords.words('english')

In [4]:
def tokenizer(text):
    """
    Tokenize ``text`` into lowercase, purely alphabetic word tokens.

    The text is lowercased and split with NLTK's ``word_tokenize``. A token is
    kept only if it is longer than one character, is not listed in the
    module-level ``arr_stopwords``, and consists solely of alphabetic
    characters (which also discards punctuation and numbers).

    Args:
        text (str): The input text to tokenize.

    Returns:
        list: The surviving tokens, in their original order.
    """
    # Membership tests against a list are linear in its length; building a set
    # once per call makes every stopword lookup constant-time.
    stopword_set = set(arr_stopwords)
    # Single pass applying the same three filters (length, stopword, alphabetic).
    return [
        token
        for token in word_tokenize(text.lower())
        if len(token) > 1 and token not in stopword_set and token.isalpha()
    ]

# Data loading

In [5]:
# Directory holding the CSV files loaded below (relative path, resolved against the working directory).
data_dir = "../../data/cancer_type"

In [6]:
# Semicolon-separated table with "Study Abbreviation" and "Study Name" columns.
df_cancer_type = pd.read_csv(
    filepath_or_buffer=os.path.join(data_dir, "tcga-tumor-types.csv"),
    sep=";",
    header=0,
)
# Lookup from study abbreviation to full study name.
dict_cancer_type = dict(
    zip(
        df_cancer_type["Study Abbreviation"].to_numpy(),
        df_cancer_type["Study Name"].to_numpy(),
    )
)

## Training

In [7]:
# Training split of the report/cancer-type dataset.
df_train = pd.read_csv(
    filepath_or_buffer=os.path.join(data_dir, "train_tcga_reports_cancer_type.csv"),
    sep=",",
    header=0,
)

In [8]:
# Sanity check: (rows, columns) of the training split.
df_train.shape

(4761, 4)

In [9]:
# Plain Python lists: report texts (model input) and cancer-type labels (target).
arr_train_corpus = df_train["text"].tolist()
arr_train_labels = df_train["cancer_type"].tolist()

In [10]:
# Texts and labels should have the same length.
len(arr_train_corpus), len(arr_train_labels)

(4761, 4761)

In [11]:
# Number of distinct cancer-type labels in the training split.
len(set(arr_train_labels))

32

## Validation

In [12]:
# Validation split of the report/cancer-type dataset.
df_val = pd.read_csv(
    filepath_or_buffer=os.path.join(data_dir, "val_tcga_reports_cancer_type.csv"),
    sep=",",
    header=0,
)

In [13]:
# Sanity check: (rows, columns) of the validation split.
df_val.shape

(1905, 4)

In [14]:
# Plain Python lists: report texts (model input) and cancer-type labels (target).
arr_val_corpus = df_val["text"].tolist()
arr_val_labels = df_val["cancer_type"].tolist()

In [15]:
# Texts and labels should have the same length.
len(arr_val_corpus), len(arr_val_labels)

(1905, 1905)

In [16]:
# Number of distinct cancer-type labels in the validation split.
len(set(arr_val_labels))

32

## Test

In [17]:
# Test split of the report/cancer-type dataset.
df_test = pd.read_csv(
    filepath_or_buffer=os.path.join(data_dir, "test_tcga_reports_cancer_type.csv"),
    sep=",",
    header=0,
)

In [18]:
# Sanity check: (rows, columns) of the test split.
df_test.shape

(2857, 4)

In [19]:
# Plain Python lists: report texts (model input) and cancer-type labels (target).
arr_test_corpus = df_test["text"].tolist()
arr_test_labels = df_test["cancer_type"].tolist()

In [20]:
# Texts and labels should have the same length.
len(arr_test_corpus), len(arr_test_labels)

(2857, 2857)

In [21]:
# Number of distinct cancer-type labels in the test split.
len(set(arr_test_labels))

32

# NLP pipeline

In [22]:
# Baseline: bag-of-words counts fed into a logistic-regression classifier.
nlp_pipeline = Pipeline(
    steps=[
        (
            "bow",
            # The custom tokenizer already lowercases and drops stopwords, so the
            # vectorizer's own lowercasing and stop-word handling are switched off;
            # token_pattern is not used when a tokenizer is supplied.
            CountVectorizer(
                tokenizer=tokenizer,
                token_pattern=None,
                lowercase=False,
                stop_words=None,
            ),
        ),
        ("classifier", LogisticRegression(random_state=0, max_iter=500)),
    ]
)

In [23]:
# Fit the baseline pipeline on the training split only.
nlp_pipeline.fit(arr_train_corpus, arr_train_labels)

## Evaluation

In [24]:
# Predict labels for the validation texts with the fitted baseline pipeline.
arr_val_pred = nlp_pipeline.predict(arr_val_corpus)

In [25]:
# Fraction of validation reports whose predicted label matches the true one.
accuracy_score(arr_val_labels, arr_val_pred)

0.952755905511811

# Hyper-parameter tuning

## Logistic Regression

In [26]:
# Pipeline to be tuned: bag-of-words counts + logistic regression.
# Classifier hyper-parameters other than random_state are left for the search to set.
lr_pipeline = Pipeline(
    steps=[
        (
            "bow",
            # The custom tokenizer already lowercases and drops stopwords;
            # token_pattern is not used when a tokenizer is supplied.
            CountVectorizer(
                tokenizer=tokenizer,
                token_pattern=None,
                lowercase=False,
                stop_words=None,
            ),
        ),
        ("classifier", LogisticRegression(random_state=0)),
    ]
)

In [27]:
# Candidate values for the "classifier" step, addressed as <step>__<parameter>.
# NOTE(review): max_iter only caps solver iterations; both candidates reach the
# same validation accuracy in the recorded run, so this search cannot separate
# them. Consider searching a regularisation parameter such as
# `classifier__C` instead.
param_grid = dict(classifier__max_iter=[200, 500])

In [28]:
# Custom CV split using fixed train/validation split
X_data = arr_train_corpus + arr_val_corpus
y_data = arr_train_labels + arr_val_labels
train_indices = list(range(len(arr_train_corpus)))
val_indices = list(range(len(arr_train_corpus), len(X_data)))

In [29]:
# A single (train, validation) fold: every candidate is fit on the training
# rows and scored on the validation rows.
custom_cv = [(train_indices, val_indices)]

### Grid Search

In [30]:
# Exhaustive search over `param_grid`, scored by accuracy on the single
# predefined fold in `custom_cv`; the best candidate is refit afterwards.
grid_search = GridSearchCV(
    lr_pipeline,
    param_grid,
    cv=custom_cv,
    scoring="accuracy",
    refit=True,
    verbose=2,
)

In [31]:
# Run the search. Because refit=True, the best candidate is afterwards refit
# on all of X_data (training + validation rows).
grid_search.fit(X_data, y_data)

Fitting 1 folds for each of 2 candidates, totalling 2 fits
[CV] END ...........................classifier__max_iter=200; total time=  45.5s
[CV] END ...........................classifier__max_iter=500; total time=  44.9s


In [32]:
# Per-candidate timings and validation scores as a table.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_iter,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,38.812995,0.0,6.669173,0.0,200,{'classifier__max_iter': 200},0.952756,0.952756,0.0,1
1,38.343075,0.0,6.585908,0.0,500,{'classifier__max_iter': 500},0.952756,0.952756,0.0,1


In [33]:
# Parameter setting ranked first by validation accuracy.
# NOTE: in the recorded run both candidates tie and the first one is reported.
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'classifier__max_iter': 200}


In [34]:
# Pipeline refit by the search (refit=True) with the best parameters on the
# data passed to grid_search.fit.
best_model = grid_search.best_estimator_

In [35]:
# Predict labels for the held-out test texts.
arr_test_pred = best_model.predict(arr_test_corpus)

In [36]:
# Accuracy of the tuned logistic-regression pipeline on the test split.
print("Test Accuracy:", accuracy_score(arr_test_labels, arr_test_pred))

Test Accuracy: 0.9530976548827441


In [37]:
# Per-class precision / recall / F1 on the test split.
# The label is printed on its own line: printing it on the same line as the
# report pushes the report's header row out of alignment with its columns.
print("Classification Report:")
print(classification_report(
    y_true=arr_test_labels,
    y_pred=arr_test_pred
))

Classification Report:               precision    recall  f1-score   support

         ACC       0.97      0.94      0.95        32
        BLCA       0.98      0.99      0.99       129
        BRCA       0.99      1.00      0.99       270
        CESC       0.94      0.94      0.94        89
        CHOL       0.85      0.85      0.85        13
        COAD       0.85      0.91      0.88       120
        DLBC       1.00      0.88      0.93        16
        ESCA       0.97      0.92      0.94        36
         GBM       0.99      0.98      0.98       135
        HNSC       0.97      0.99      0.98       148
        KICH       0.89      0.82      0.85        39
        KIRC       0.93      0.96      0.94       159
        KIRP       0.90      0.88      0.89        82
         LGG       0.97      0.99      0.98       136
        LIHC       0.98      0.99      0.99       102
        LUAD       0.92      0.92      0.92       154
        LUSC       0.91      0.93      0.92       139
    

## Random Forest

In [38]:
# Pipeline to be tuned: bag-of-words counts + random forest.
rf_pipeline = Pipeline(
    steps=[
        (
            "bow",
            # The custom tokenizer already lowercases and drops stopwords;
            # token_pattern is not used when a tokenizer is supplied.
            CountVectorizer(
                tokenizer=tokenizer,
                token_pattern=None,
                lowercase=False,
                stop_words=None,
            ),
        ),
        ("classifier", RandomForestClassifier(random_state=0)),
    ]
)

In [39]:
# Search space for the "classifier" step (3 x 2 x 3 = 18 combinations),
# addressed as <step>__<parameter>.
param_grid = dict(
    classifier__n_estimators=[10, 100, 500],
    classifier__max_features=["sqrt", "log2"],
    classifier__min_samples_split=[2, 5, 10],
)

In [40]:
# Custom CV split using fixed train/validation split
X_data = arr_train_corpus + arr_val_corpus
y_data = arr_train_labels + arr_val_labels
train_indices = list(range(len(arr_train_corpus)))
val_indices = list(range(len(arr_train_corpus), len(X_data)))

In [41]:
# A single (train, validation) fold: every candidate is fit on the training
# rows and scored on the validation rows.
custom_cv = [(train_indices, val_indices)]

### Randomized Search

In [None]:
# Randomized search: samples n_iter parameter combinations from `param_grid`
# (only 2 here) and scores each by accuracy on the single fold in `custom_cv`.
# random_state fixes which combinations are drawn.
rand_search = RandomizedSearchCV(
    rf_pipeline,
    param_grid,
    n_iter=2,
    cv=custom_cv,
    scoring="accuracy",
    refit=True,
    random_state=0,
    verbose=2,
)

In [43]:
# Run the search. Because refit=True, the best candidate is afterwards refit
# on all of X_data (training + validation rows).
rand_search.fit(X_data, y_data)

Fitting 1 folds for each of 2 candidates, totalling 2 fits
[CV] END classifier__max_features=sqrt, classifier__min_samples_split=10, classifier__n_estimators=10; total time=  26.0s
[CV] END classifier__max_features=sqrt, classifier__min_samples_split=2, classifier__n_estimators=500; total time=  42.4s


In [44]:
# Per-candidate timings and validation scores as a table.
pd.DataFrame(rand_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_estimators,param_classifier__min_samples_split,param_classifier__max_features,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,19.498699,0.0,6.529873,0.0,10,10,sqrt,"{'classifier__n_estimators': 10, 'classifier__...",0.891339,0.891339,0.0,2
1,35.550951,0.0,6.803493,0.0,500,2,sqrt,"{'classifier__n_estimators': 500, 'classifier_...",0.917585,0.917585,0.0,1


In [45]:
# Sampled parameter combination ranked first by validation accuracy.
print("Best Parameters:", rand_search.best_params_)

Best Parameters: {'classifier__n_estimators': 500, 'classifier__min_samples_split': 2, 'classifier__max_features': 'sqrt'}


In [46]:
# Pipeline refit by the search (refit=True) with the best sampled parameters.
# NOTE(review): this rebinds `best_model`, which previously referred to the
# logistic-regression estimator from the grid search.
best_model = rand_search.best_estimator_

In [47]:
# Predict labels for the held-out test texts (overwrites the earlier predictions).
arr_test_pred = best_model.predict(arr_test_corpus)

In [48]:
# Accuracy of the tuned random-forest pipeline on the test split.
print("Test Accuracy:", accuracy_score(arr_test_labels, arr_test_pred))

Test Accuracy: 0.9264963248162408


In [49]:
# Per-class precision / recall / F1 on the test split.
# The label is printed on its own line: printing it on the same line as the
# report pushes the report's header row out of alignment with its columns.
print("Classification Report:")
print(classification_report(
    y_true=arr_test_labels,
    y_pred=arr_test_pred
))

Classification Report:               precision    recall  f1-score   support

         ACC       1.00      0.81      0.90        32
        BLCA       0.99      0.99      0.99       129
        BRCA       0.97      1.00      0.99       270
        CESC       0.99      0.84      0.91        89
        CHOL       1.00      0.46      0.63        13
        COAD       0.82      0.98      0.89       120
        DLBC       1.00      0.69      0.81        16
        ESCA       0.89      0.86      0.87        36
         GBM       0.95      0.96      0.95       135
        HNSC       0.98      0.99      0.99       148
        KICH       1.00      0.10      0.19        39
        KIRC       0.75      0.97      0.85       159
        KIRP       0.86      0.74      0.80        82
         LGG       0.94      0.96      0.95       136
        LIHC       0.92      1.00      0.96       102
        LUAD       0.93      0.93      0.93       154
        LUSC       0.91      0.94      0.92       139
    