# Classification of pediatric brain tumors

We used a dataset from a study that analyzed frozen or formalin-fixed paraffin-embedded (FFPE) tissue from 571 patient samples,
including 559 pediatric brain tumors and 12 non-tumor brain control samples.

The dataset contains 103 features, gene or protein names, which are represented by positive integers. The dataset is imbalanced. There are only 12 non-tumor samples and 16 Ependymoma samples.

In [1]:
%pip install GEOparse
%pip install xgboost
%pip install lightgbm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Collecting lightgbm
  Using cached lightgbm-4.3.0.tar.gz (1.7 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h[31mERROR: Exception:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/pip/_internal/cli/base_command.py", line 160, in exc_logg

In [2]:
%pip install imblearn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
# imports
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import GEOparse
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns


import pickle


random_seed = 13

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Data exploration
We used Geoparse library, which easily loaded the dataset from the cloud and parsed all the values.
We explored the dataset features and label distribution. We found out that there is a strong class imbalance and the features are numerical not categorical.

In [4]:
# loading dataset
GeO_key = 'GSE227756'
gse = GEOparse.get_GEO(geo=GeO_key, destdir="./", silent=True)

In [5]:
df_dataset = gse.pivot_samples('VALUE').T

In [6]:
# Number of features
print(f"Number of features: {df_dataset.shape[1]}")

Number of features: 103


In [7]:
# data sample
sample = random.sample(list(gse.gsms.keys()), 5)
df_dataset.loc[sample]

ID_REF,ADAMDEC1,APOL3,ARID2,B2M,BRD7,CCL4,CCL5,CCR4,CD14,CD163,CD19,CD27,CD274,CD276,CD28,CD38,CD3E,CD4,CD47,CD68,CD7,CD74,CD80,CD84,CD86,CD8A,CDK4,CHAD,CKMT2,CMKLR1,CSF1R,CSF2,CTLA4,CXCL10,CXCL9,CXCR6,DNER,FASLG,FCMR,FGL2,FOXP3,FZR1,GBP1,GPR171,GZMA,GZMB,GZMK,HAVCR2,HHLA2,HLA-DQA1,HLA-DRB1,HLA-E,HPGD,ICOS,ICOSLG,IDO1,IFNG,IGL,IL10,IL12A,IL12B,IL17A,IL2,IL4,ITGAL,ITGAM,ITM2A,JAK1,JAK2,LAG3,LAMP1,LCK,MRC1,MS4A1,NCR3LG1,NFKB1,NKG7,NT5E,OMD,OR7A5,PBRM1,PDCD1,PDCD1LG2,PRKCA,PSMB10,SPOP,STAT1,STAT3,STAT5A,STING1,TAP1,TAP2,TAPBP,TGFB1,TIGIT,TLR8,TNF,TNFRSF9,TNNC2,TNXB,VEGFA,VSIR,VTCN1
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1
GSM7107463,30,743,3206,7557,1139,360,66,38,1946,1365,20,58,81,1542,8,383,51,525,758,1492,38,35135,61,38,119,79,1712,791,104,294,2098,23,58,58,30,23,1999,46,36,464,165,1322,1185,48,61,20,56,424,23,38,4868,56,58,23,249,41,61,941,33,112,36,66,23,23,89,348,505,3024,779,91,274,36,769,18,66,172,15,761,63,74,969,43,71,817,28,1867,2504,11496,692,86,1134,997,2521,6131,10,101,315,84,43,720,3435,358,8
GSM7107387,26,1408,1887,36819,1538,4367,559,38,9294,3744,35,143,109,2829,8,1160,256,2551,1488,3973,62,194857,80,19,433,537,1687,301,365,1240,8007,16,70,70,109,72,1655,72,109,1226,135,1692,1854,91,244,61,184,2110,10,104,50400,61,75,13,759,62,61,39883,64,109,62,69,27,16,439,848,2615,4409,1343,67,516,103,694,21,328,327,88,1740,128,125,1053,43,229,2430,29,1403,4306,12482,910,402,1485,1644,4340,6815,13,195,971,82,58,756,15151,1027,16
GSM7107353,57,1145,2466,45995,1976,9080,334,39,5285,7089,17,118,94,1303,20,789,208,2208,2304,3795,72,167319,61,24,619,304,1961,2833,321,859,7863,20,157,96,70,111,5100,35,90,1856,133,1550,2031,98,170,37,129,1819,24,2853,38507,81,98,24,649,42,94,4693,68,57,42,50,31,15,278,614,3731,6350,1344,70,2313,85,1491,22,280,354,61,2184,326,48,1121,37,136,4483,22,1733,3622,11931,944,313,1541,1406,4330,6835,9,304,1043,77,42,1209,10800,1460,13
GSM7107512,610,120,1639,34633,2211,24,38,22,1170,204,12,29,22,1053,4,218,41,306,1509,680,26,10404,22,10,58,77,4857,130,24,113,186,12,30,56,34,10,1233,9,32,370,92,1332,197,30,30,22,12,215,12,8,2431,15,45,7,68,9,27,849,19,103,12,25,19,8,52,52,1167,2644,1096,120,803,25,62,13,378,232,9,959,21,26,1562,13,28,3229,7,1693,3111,5005,349,48,1017,711,1620,476,3,29,15,14,22,49,29770,108,9
GSM7107494,25,1154,2756,42321,1681,65,225,27,1953,2001,27,95,57,1593,5,830,149,1303,1911,1610,69,102012,51,13,222,170,1377,5603,187,791,4303,18,57,42,109,72,2867,36,58,772,100,1587,1327,64,124,39,57,1001,12,3667,23303,64,56,19,291,37,34,42564,27,84,53,42,18,13,353,270,3439,3753,1268,52,845,44,704,13,281,244,79,2507,67,41,992,19,120,2011,20,1305,8671,11619,582,285,2476,1889,3271,3522,5,113,73,40,37,523,4301,709,12


In [8]:
df_labels = gse.phenotype_data[['source_name_ch1','characteristics_ch1.0.tumor type']].rename(columns={'source_name_ch1':'Tumor/Non-tumor','characteristics_ch1.0.tumor type':'Tumor type'})

In [9]:
# brain type distribution
df_brain = df_labels.groupby(['Tumor/Non-tumor']).size().reset_index(name='count')
df_brain

Unnamed: 0,Tumor/Non-tumor,count
0,non-tumor brain control,12
1,pediatric brain tumor,559


In [10]:
# tumor type distribution
df_tumor = gse.phenotype_data[['source_name_ch1','characteristics_ch1.0.tumor type']].rename(columns={'source_name_ch1':'Tumor/Non-tumor','characteristics_ch1.0.tumor type':'Tumor type'}).groupby(['Tumor type']).size().reset_index(name='count')
df_tumor

Unnamed: 0,Tumor type,count
0,Ependymoma,16
1,HGG,170
2,LGG,224
3,MMRD HGG,83
4,Medulloblastoma,66
5,Normal brain,12


In [11]:
# plot distribution
cat_distribution = df_labels.groupby(['Tumor/Non-tumor','Tumor type']).size().reset_index(name='count')
fig = px.bar(cat_distribution, x="Tumor/Non-tumor", y="count", color="Tumor type", title="Cancer type distribution in dataset")
fig.show()

## Data preprocessing
Observing that the data exhibited varying values, we utilized a standard scaler to normalize the data.

To eliminate redundant data and capture the essential information, we employed Principal Component Analysis (PCA). Additionally, we visualized the data in three dimensions to assess whether the data points were separable.

By using the first 44 components from PCA, we achieved a cummulative explained variance of 95%, leading us to select these components for further analysis.

In order to resolve the data imbalance, we used SMOTE oversampling technique.
We utilized all these steps into a single data preprocessing pipeline.


In [12]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

cat_distribution_tumor=cat_distribution[cat_distribution["Tumor type"] != "Normal brain"]
fig = make_subplots(rows=1, cols=2, subplot_titles=("Original distribution", "SMOTE distribution"))

fig.add_trace(
    go.Bar(x=cat_distribution_tumor["Tumor type"], y=cat_distribution_tumor["count"], name="Tumor type"),
    row=1, col=1
)

fig.add_trace(
    go.Bar(x=cat_distribution_tumor["Tumor type"], y=[224]*5, name="Tumor type"),
    row=1, col=2
)

fig.update_layout(height=600, width=1200, title_text="Category Distribution")
fig.show()

In [13]:
# create numpy dataset
def create_dataset():
    df = gse.pivot_samples('VALUE').T
    df_labels = gse.phenotype_data[['source_name_ch1','characteristics_ch1.0.tumor type']].rename(columns={'source_name_ch1':'brain_type','characteristics_ch1.0.tumor type':'tumor_type'})
    df = df.merge(df_labels, left_index=True, right_index=True)
    # mappings
    brain_mapping = {cat: idx for idx, cat in enumerate(df['brain_type'].unique())}
    tumor_mapping = {cat: idx for idx, cat in enumerate(df['tumor_type'].unique())}

    #reversed mapping
    r_brain_mapping = {idx: cat for cat, idx in brain_mapping.items()}
    r_tumor_mapping = {idx: cat for cat, idx in tumor_mapping.items()}

    # replace
    df['brain_type'] = df['brain_type'].map(brain_mapping)
    df['tumor_type'] = df['tumor_type'].map(tumor_mapping)
    # output
    X = df.drop(['brain_type', 'tumor_type'], axis=1).to_numpy()
    y_brain = df['brain_type'].to_numpy()
    y_tumor = df['tumor_type'].to_numpy()
    return X, y_brain, y_tumor, r_brain_mapping, r_tumor_mapping

X, y_brain, y_tumor, brain_mapping, tumor_mapping = create_dataset()

In [14]:
for col in range(X.shape[1]):
  unique_values = np.unique(X[:, col])
  uv_count = len(unique_values)

  if uv_count < 50:
    print(f'column_idx: {col} unique_count: {uv_count}')

column_idx: 14 unique_count: 48


In [15]:
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y_tumor, test_size=test_size, random_state=random_seed)
y_train_labelled = np.array([tumor_mapping[idx] for idx in y_train])

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)


In [17]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(random_state=random_seed))
])


In [18]:
X_pca = pipeline.fit_transform(X_train)

In [19]:
def plot_reduced_X(X_pca, y_train_labelled):
  fig = px.scatter_3d(x=X_pca[:,0], y=X_pca[:,1], z=X_pca[:,2], labels={'x': 'PC1', 'y': 'PC2', 'z':'PCA3'},
                  color=y_train_labelled)

  fig.update_layout(legend_title_text='Tumor type')
  fig.update_layout(xaxis_title='PC1')
  fig.update_layout(yaxis_title='PC2')
  fig.update_layout(yaxis_title='PC3')

  fig.update_layout(title='PCA of tumor types')

  fig.show()

In the graph below, we can see that the data is some what separable.

In [20]:
plot_reduced_X(X_pca, y_train_labelled)

In [21]:
import plotly.graph_objects as go
import numpy as np

# Example data
explained_variance = pipeline.named_steps['pca'].explained_variance_ratio_

def plot_variance(explained_variance, num_feature=10):
  explained_variance = explained_variance[:num_feature]
  cumulative_variance = np.cumsum(explained_variance)

  # Create bar plot for individual explained variance
  fig = go.Figure(data=[
      go.Bar(
          name='Individual Explained Variance',
          x=list(range(1, len(explained_variance) + 1)),
          y=explained_variance,
          marker_color='lightsalmon',
          text=explained_variance,
          textposition='auto'
      )
  ])

  # Create step line plot for cumulative explained variance
  fig.add_trace(go.Scatter(
      name='Cumulative Explained Variance',
      x=list(range(1, len(cumulative_variance) + 1)),
      y=cumulative_variance,
      mode='lines+markers',
      line=dict(color='red', width=2, shape='hv')
  ))

  # Update layout
  fig.update_layout(
      title='PCA Individual Explained Variance',
      xaxis_title='Principal Components',
      yaxis_title='Explained Variance Ratio',
      barmode='overlay',
      template='plotly_white'
  )

  fig.show()


In [22]:
plot_variance(explained_variance, 10)

In [23]:
from matplotlib import pyplot
from numpy import where
from collections import Counter

original_class_distribution = Counter(y_train)
print("Original class distribution:", original_class_distribution)

oversample = SMOTE(random_state=random_seed)
X_train_oversampled, y_train_oversampled = oversample.fit_resample(X_train, y_train)

oversampled_class_distribution = Counter(y_train_oversampled)
print("Class distribution after applying SMOTE:", oversampled_class_distribution)


Original class distribution: Counter({1: 192, 0: 128, 2: 67, 4: 47, 3: 14, 5: 8})
Class distribution after applying SMOTE: Counter({1: 192, 0: 192, 4: 192, 2: 192, 5: 192, 3: 192})


In [24]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(random_state=random_seed, n_components=44))
])

In [25]:
X_train_final = pipeline.fit_transform(X_train_oversampled)
X_test_final = pipeline.transform(X_test)
y_train_os_labelled = np.array([tumor_mapping[idx] for idx in y_train_oversampled])

### Graph of using PCA after oversampling

In [26]:
plot_reduced_X(X_train_final, y_train_os_labelled)

## Tumor type classification
As there is a small number of non-tumor samples, we decided to create a classifier that will predict tumor type from sick individuals.

Hence we will build a 5 class classifier.

## Model selection & evaluation

In this part we selected multiple classifiers with different hyperparameters, trained them using the train data and evaluated them using F1 score and 5-fold cross-validation.

Models used:
- Logistic regression
- Perceptron
- Decision tree
- Random forest
- Gradient boosting
- SVM
- K neighbours
- MLP
- XGB
- LGBM


In [29]:
%pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-4.3.0.tar.gz (1.7 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h[31mERROR: Exception:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/pip/_internal/cli/base_command.py", line 160, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/pip/_internal/cli/req_command.py", line 247, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/pip/_internal/commands/install.py", line 400, in run
    requirement_set = resolver.resolve(
                      ^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/pip/_internal/resolution/resolvelib/resolver.py", line 92, in resolve
    result = self._result = resolver.resolve(
                            ^^^^^^^^^^^^^^^^^
  File "

In [30]:
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

log_reg = LogisticRegression(random_state=random_seed, max_iter=1000)

log_reg_params = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

dec_tree = DecisionTreeClassifier(random_state=random_seed)

dec_tree_params = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

rand_forest = RandomForestClassifier(random_state=random_seed)

rand_forest_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

svm = SVC(random_state=random_seed)

svm_params = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
}

knn = KNeighborsClassifier()

knn_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance']
}

mlp = MLPClassifier(random_state=random_seed, max_iter=1000)

mlp_params = {
    'hidden_layer_sizes': [(16,), (32,), (64,)],
    'alpha': [0.0001, 0.001, 0.01]
}

perceptron = Perceptron(random_state=random_seed)

perceptron_params = {
    'alpha': [0.0001, 0.001, 0.01],
    'penalty': ['l1', 'l2']
}

grad_boost = GradientBoostingClassifier(random_state=random_seed, n_estimators=30)

grad_boost_params = {
    'max_depth': [3, 5, 7],
    'n_estimators': [32, 64]
}

xgb = XGBClassifier(random_state=random_seed, use_label_encoder=False)

xgb_params = {
    'max_depth': [3, 5, 7],
    'n_estimators': [32, 64]
}

lgbm = LGBMClassifier(random_state=random_seed)

lgbm_params = {
    'max_depth': [3, 5, 7],
    'n_estimators': [32, 64]
}

models = [
    (log_reg, log_reg_params),
    (dec_tree, dec_tree_params),
    (rand_forest, rand_forest_params),
    (svm, svm_params),
    (knn, knn_params),
    (mlp, mlp_params),
    (perceptron, perceptron_params),
    (grad_boost, grad_boost_params),
    (xgb, xgb_params),
    (lgbm, lgbm_params)
]




ModuleNotFoundError: No module named 'lightgbm'

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

def grid_search(models, X_train, y_train):
    best_models = []
    all_results = pd.DataFrame()
    for model, params in models:
        grid_search = GridSearchCV(model, params, cv=5, n_jobs=-1, verbose=1, scoring='f1_macro')
        grid_search.fit(X_train, y_train)
        best_models.append(grid_search.best_estimator_)
        results = pd.DataFrame(grid_search.cv_results_)
        results['model'] = model.__class__.__name__
        all_results = pd.concat([all_results, results])
        all_results.to_csv('grid_search_results.csv', index=False)

    return best_models, all_results

def eval_best_models(models, X_test, y_test, path='model_eval_results.csv'):
    results = pd.DataFrame()
    for model in models:
        y_pred = model.predict(X_test)
        accuracy = f1_score(y_test, y_pred, average='macro')
        results = pd.concat([results, pd.DataFrame({'model': [model.__class__.__name__], 'accuracy': [accuracy]})])
        results.to_csv(path, index=False)
    return results

def eval_best_model_without_smote(grid_search_results):
  df_best_params = grid_search_results.groupby('model').apply(lambda x: x.sort_values('mean_test_score', ascending=False).head(1))
  df_best_params = df_best_params.sort_values('mean_test_score', ascending=False)[['model', 'params']]

  models_without_smote = []

  for _, x in df_best_params.iterrows():
    model_type = globals()[x.model]
    X_train_pip = pipeline.fit_transform(X_train)
    model = model_type(**x.params).fit(X_train_pip, y_train)
    models_without_smote.append(model)

  X_test_pip = pipeline.transform(X_test)
  eval_best_models(models_without_smote, X_test_pip, y_test, 'model_without_smote_eval_results.csv')


In [34]:
import os

if os.path.exists('model_eval_results.csv'):
    grid_search_results = pd.read_csv('grid_search_results.csv')
    eval_results = pd.read_csv('model_eval_results.csv')
    eval_without_smote_results = pd.read_csv('model_without_smote_eval_results.csv')
else:
    best_models, grid_search_results = grid_search(models, X_train_final, y_train_oversampled)
    eval_results = eval_best_models(best_models, X_test_final, y_test)
    eval_without_smote_results = eval_best_model_without_smote(grid_search_results)

In [35]:
df_smote = eval_results.rename(columns={'accuracy': 'F1 macro'})[['model', 'F1 macro']]
df_smote['type'] = 'SMOTE'

df_imbalanced = eval_without_smote_results.rename(columns={'accuracy': 'F1 macro'})[['model', 'F1 macro']]
df_imbalanced['type'] = 'Imbalanced'

df_both_results = pd.concat([df_smote, df_imbalanced]).sort_values('F1 macro', ascending=False)

In [36]:
import plotly.express as px

wide_df = px.data.medals_wide()

fig = px.bar(df_both_results, x="model", y='F1 macro', color='type', barmode='group', title="F1 scores")
fig.show()

## Best model evaluation
The best performing model was an MLP with 32 neurons in the hidden layer, 0.0001 learning rate, data preprocessed using standard scaler, 44 principal components and SMOTE oversampling

In [37]:
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(model, X_test, y_test, label_mapping):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    labels = [label_mapping[idx] for idx in model.classes_]
    
    # Create the heatmap
    heatmap = go.Heatmap(
        z=cm,
        x=labels,
        y=labels,
        colorscale='Blues',
        showscale=False,
        text=cm,
        texttemplate="%{text}",
        textfont={"size": 12}
    )

    # Create the layout
    layout = go.Layout(
        title='Confusion Matrix',
        xaxis=dict(
            title='Predicted labels',
            tickangle=-45  # Rotate the x-axis labels by -45 degrees
        ),
        yaxis=dict(title='True labels'),
        width=800,
        height=600,
    )

    # Create the figure
    fig = go.Figure(data=[heatmap], layout=layout)

    # Show the plot
    fig.show()

# Example usage (assuming you have a trained model, X_test, and y_test)
# plot_confusion_matrix(trained_model, X_test, y_test)



In [38]:
best_model = MLPClassifier(random_state=random_seed, max_iter=1000, alpha=0.0001, hidden_layer_sizes=32)
best_model.fit(X_train_final, y_train_oversampled)

In [39]:
X_test_pip = pipeline.transform(X_test)

In [40]:
y_test_pred = best_model.predict(X_test_pip)
f1 = f1_score(y_test, y_test_pred, average='macro')

f1

0.763247452136341

In [41]:
plot_confusion_matrix(best_model, X_test_pip, y_test, tumor_mapping)

## Binary classification
We tried to interpret the binary classificator for tumor and non-tumor samples in order to identify important features for each class.

In [42]:

X_train_brain, X_test_brain, y_train_brain, y_test_brain = train_test_split(X, y_brain, test_size=test_size, random_state=random_seed)
y_train_brain_labelled = np.array([brain_mapping[idx] for idx in y_train_brain])

logreg = LogisticRegression(random_state=random_seed, max_iter=1000)
scaler = StandardScaler()
X_train_brain_scaled = scaler.fit_transform(X_train_brain)
X_test_brain_scaled = scaler.transform(X_test_brain)
logreg.fit(X_train_brain_scaled, y_train_brain)

df_interpret = pd.DataFrame({'feature': df_dataset.columns, 'weight': logreg.coef_.squeeze()})
df_interpret.head()

y_pred_brain = logreg.predict(X_test_brain_scaled)
f1 = f1_score(y_test_brain, y_pred_brain, average='macro')
print(f'F1 macro: {f1}')

fig = px.bar(df_interpret, x='feature', y='weight')
fig.show()

F1 macro: 0.7789878283151825
