In [1]:
import pandas as pd
import os
import nibabel as nib
import pydicom as dicom
import h5py
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
from tqdm.notebook import tqdm_notebook
from torch.cuda.amp import GradScaler, autocast
import wandb
from skimage.transform import resize
from sklearn.metrics import roc_auc_score
from loguru import logger
import random
from catboost import CatBoostClassifier
import time
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

In [2]:
torch.manual_seed(383)
random.seed(383)
np.random.seed(383)

# Read data

In [48]:
df = pd.read_csv("Data/MGMT_TCGA_FEATURES.csv")
# df = pd.read_csv("Data/MGMT_FEATURES_REAL_VALUES_ONLY.csv")
df.head()

Unnamed: 0,ID,Date,VOLUME_ET,VOLUME_NET,VOLUME_ED,VOLUME_TC,VOLUME_WT,VOLUME_BRAIN,VOLUME_ET_OVER_NET,VOLUME_ET_OVER_ED,...,TGM_Cog_X_5,TGM_Cog_Y_5,TGM_Cog_Z_5,TGM_T_5,TGM_Cog_X_6,TGM_Cog_Y_6,TGM_Cog_Z_6,TGM_T_6,files,labels
0,TCGA-02-0006,1996.08.23,1662,384,36268,2046,38314,1469432,4.328125,0.045826,...,,,,,,,,,TCGA-02-0006,Unmethylated
1,TCGA-02-0009,1997.06.14,4362,4349,15723,8711,24434,1295721,1.002989,0.277428,...,,,,,,,,,TCGA-02-0009,Unmethylated
2,TCGA-02-0011,1998.02.01,33404,48612,45798,82016,127814,1425843,0.687155,0.729377,...,,,,,,,,,TCGA-02-0011,Methylated
3,TCGA-02-0027,1999.03.28,12114,7587,34086,19701,53787,1403429,1.596679,0.355395,...,,,,,,,,,TCGA-02-0027,Unmethylated
4,TCGA-02-0033,1997.05.26,34538,7137,65653,41675,107328,1365237,4.839288,0.526069,...,,,,,,,,,TCGA-02-0033,Methylated


In [49]:
cols_to_drop = [
    "Date",
    "files"
]

In [50]:
df = df.drop(columns=cols_to_drop)

In [51]:
train = pd.read_csv("Data/TRAIN_TCGA.csv")
val = pd.read_csv("Data/VAL_TCGA.csv")
test = pd.read_csv("Data/TEST_TCGA.csv")

In [52]:
df_train = df[df["ID"].isin(train["files"].to_list())].sample(frac=1, random_state=0).reset_index(drop=True)
df_val = df[df["ID"].isin(val["files"].to_list())].sample(frac=1, random_state=0).reset_index(drop=True)
df_test = df[df["ID"].isin(test["files"].to_list())].sample(frac=1, random_state=0).reset_index(drop=True)

In [54]:
x_train = df_train.drop(columns=["ID", "labels"])
y_train = df_train["labels"]

x_val = df_val.drop(columns=["ID", "labels"])
y_val = df_val["labels"]

x_test = df_test.drop(columns=["ID", "labels"])
y_test = df_test["labels"]

In [55]:
x = pd.concat([x_train, x_val]).reset_index(drop=True)
y = pd.concat([y_train, y_val]).reset_index(drop=True)

## LogisticRegression

In [61]:
search_space = {
    "logisticregression__C": [0.001, 0.01, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.5, 1, 2, 3, 4, 5, 10, 15, 20, 25],
    "logisticregression__penalty": ["l2", "elasticnet", None]
}

In [62]:
MP = make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000))

In [64]:
# %%capture
# GRD_LR = RandomizedSearchCV(MP, search_space, n_iter=250, cv=5, scoring="roc_auc", verbose=3, return_train_score=True, n_jobs=-1)
# GRD_LR.fit(x, y)

In [19]:
pd.DataFrame(GRD_LR.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]].head()

Unnamed: 0,params,mean_test_score,mean_train_score,rank_test_score
9,"{'logisticregression__penalty': 'l2', 'logisti...",0.60657,0.733239,1
13,"{'logisticregression__penalty': 'l2', 'logisti...",0.606002,0.74132,2
17,"{'logisticregression__penalty': 'l2', 'logisti...",0.603161,0.747141,3
21,"{'logisticregression__penalty': 'l2', 'logisti...",0.602502,0.751441,4
25,"{'logisticregression__penalty': 'l2', 'logisti...",0.602039,0.754616,5


In [20]:
LR = GRD_LR.best_estimator_
LR.fit(x, y)

auc_test = roc_auc_score(y_test, LR.predict_proba(x_test)[:, 1])

print(f"AUC test: {100*auc_test:.2f}%.")

AUC test: 57.38%.


## CatBoost

In [74]:
search_space = {
    "depth": [1, 2, 3, 4, 5, 6],
#     "early_stopping_rounds": [5, 10, 15, 20],
    "n_estimators": [5, 10, 15, 20, 25, 50, 75, 100, 200, 300],
    "l2_leaf_reg": [1, 2, 5, 10, 15, 20, 25, 50, 100, 250],
    "learning_rate": [None, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
    "loss_function": ["AUC", "Logloss"],
    "min_data_in_leaf": [5, 10, 15, 20, 25, 30, 35, 40, 45, 100, 150, 200, None]
}

In [75]:
%%capture
GRD_CB = RandomizedSearchCV(CatBoostClassifier(verbose=False), 
                   search_space, 
                   n_iter=10,
                   scoring="roc_auc", 
                   cv=5, 
                   verbose=3, 
                   return_train_score=True, 
                   n_jobs=-1)
GRD_CB.fit(x, y)

In [76]:
pd.DataFrame(GRD_CB.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]].head()

Unnamed: 0,params,mean_test_score,mean_train_score,rank_test_score
9,"{'n_estimators': 15, 'min_data_in_leaf': 5, 'l...",0.496,1.0,1
8,"{'n_estimators': 50, 'min_data_in_leaf': 5, 'l...",0.422667,1.0,2
1,"{'n_estimators': 200, 'min_data_in_leaf': 40, ...",0.339,1.0,3
7,"{'n_estimators': 300, 'min_data_in_leaf': 15, ...",0.312333,1.0,4
0,"{'n_estimators': 100, 'min_data_in_leaf': 10, ...",,,5


In [77]:
CB = GRD_CB.best_estimator_
CB.fit(x, y)

auc_test = roc_auc_score(y_test, CB.predict_proba(x_test)[:, 1])

print(f"AUC test: {100*auc_test:.2f}%.")

AUC test: 64.29%.


# RandomForests

In [25]:
search_space = {
    "max_depth": [1, 2, 3, 4, 5, 6],
    "min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40],
    "min_samples_leaf": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40],    
    "n_estimators": [5, 10, 15, 20, 25, 50, 75, 100, 200, 300],
    "max_features": ["auto", "sqrt", "log2"],
}

In [26]:
%%capture
GRD_RF = RandomizedSearchCV(RandomForestClassifier(), 
                   search_space, 
                   n_iter=1000,
                   scoring="roc_auc", 
                   cv=5, 
                   verbose=3, 
                   return_train_score=True, 
                   n_jobs=-1)
GRD_RF.fit(x, y)

In [27]:
pd.DataFrame(GRD_RF.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]].head()

Unnamed: 0,params,mean_test_score,mean_train_score,rank_test_score
269,"{'n_estimators': 50, 'min_samples_split': 2, '...",0.596096,0.847311,1
42,"{'n_estimators': 20, 'min_samples_split': 8, '...",0.595302,0.733304,2
53,"{'n_estimators': 200, 'min_samples_split': 18,...",0.595267,0.950108,3
482,"{'n_estimators': 15, 'min_samples_split': 14, ...",0.594036,0.915202,4
429,"{'n_estimators': 75, 'min_samples_split': 20, ...",0.593306,0.907622,5


In [28]:
RF = GRD_RF.best_estimator_
RF.fit(x, y)

auc_test = roc_auc_score(y_test, RF.predict_proba(x_test)[:, 1])

print(f"AUC test: {100*auc_test:.2f}%.")

AUC test: 58.78%.


# DecisionTree

In [29]:
search_space = {
    "max_depth": [1, 2, 3, 4, 5, 6],
    "min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40],
    "min_samples_leaf": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40],    
    "max_features": ["auto", "sqrt", "log2"],
    "random_state": [0, 1, 2, 3, 4, 5, 6],
    "criterion": ["gini", "entropy"]
}

In [30]:
%%capture
GRD_DT = RandomizedSearchCV(DecisionTreeClassifier(), 
                   search_space, 
                   n_iter=250,
                   scoring="roc_auc", 
                   cv=5, 
                   verbose=3, 
                   return_train_score=True, 
                   n_jobs=-1)
GRD_DT.fit(x, y)

In [31]:
pd.DataFrame(GRD_DT.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]].head()

Unnamed: 0,params,mean_test_score,mean_train_score,rank_test_score
109,"{'random_state': 1, 'min_samples_split': 4, 'm...",0.584646,0.680943,1
91,"{'random_state': 0, 'min_samples_split': 30, '...",0.581859,0.723021,2
80,"{'random_state': 0, 'min_samples_split': 14, '...",0.581438,0.674811,3
246,"{'random_state': 4, 'min_samples_split': 6, 'm...",0.578262,0.77288,4
108,"{'random_state': 1, 'min_samples_split': 2, 'm...",0.574494,0.718059,5


In [32]:
DT = GRD_DT.best_estimator_
DT.fit(x, y)

auc_test = roc_auc_score(y_test, DT.predict_proba(x_test)[:, 1])

print(f"AUC test: {100*auc_test:.2f}%.")

AUC test: 56.92%.


# XGBoost

In [33]:
search_space = {
    "min_child_weight": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "gamma": [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 6],
    "subsample": [0, 0.2, 0.4, 0.8, 1],
    "colsample_by_tree": [0, 0.2, 0.4, 0.6, 0.8, 1],
    "max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
    "n_estimators": range(100, 2000, 100),
}

In [None]:
%%capture
GRD_XGB = RandomizedSearchCV(XGBClassifier(), 
                   search_space, 
                   n_iter=100,
                   scoring="roc_auc", 
                   cv=5, 
                   verbose=3, 
                   return_train_score=True, 
                   n_jobs=-1)
GRD_XGB.fit(x, y)

In [None]:
pd.DataFrame(GRD_XGB.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]].head()

In [None]:
XGB = GRD_XGB.best_estimator_
XGB.fit(x, y)

auc_test = roc_auc_score(y_test, XGB.predict_proba(x_test)[:, 1])

print(f"AUC test: {100*auc_test:.2f}%.")