# Decision Tree Classifier

In [41]:
# NumPy is used further down to build the hyper-parameter value ranges.
import numpy as np
# Enable IPython's autoreload extension in mode 2, so edited modules are re-imported
# automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# Add the project root (the parent of the current working directory) to the Python
# import path so that project packages can be imported from this notebook.
import sys
import os

PROJECT_ROOT = os.path.abspath('..')
# Guard the insertion so that re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [43]:
# Directory where the datasets are stored (relative to the notebook's working directory).
DATASET_PATH: str = "../../dataset/first_disease_sel/"
# Name of the column that holds the disease label.
DISEASE_COLNAME: str = 'DISEASE'

## Starting the pipeline

In [44]:
from analysis.preprocess import PreprocessPipeline

# Build the preprocessing pipeline over the dataset directory, telling it which column
# carries the disease label.
pipeline = PreprocessPipeline(datasets_path=DATASET_PATH, disease_col_name=DISEASE_COLNAME)
# NOTE(review): force_recompute=True presumably bypasses any cached result and re-runs
# every step -- confirm against PreprocessPipeline before relying on it.
pipeline.execute_pipeline(force_recompute=True)

INFO:root:Starting pipeline
INFO:root:Loading datasets
INFO:root:Inspecting directory ../../dataset/first_disease_sel/GS
INFO:root:Setting disease as GS
INFO:root:Loading file ../../dataset/first_disease_sel/GS/eGSE117146m.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/MCM
INFO:root:Setting disease as MCM
INFO:root:Loading file ../../dataset/first_disease_sel/MCM/MCM_GSE149607.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/NALD
INFO:root:Setting disease as NALD
INFO:root:Loading file ../../dataset/first_disease_sel/NALD/eGSE85804m.csv
INFO:root:Loading file ../../dataset/first_disease_sel/NALD/eGSE34308m.csv
INFO:root:Loading file ../../dataset/first_disease_sel/NALD/eGSE117647m.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/A1A
INFO:root:Setting disease as A1A
INFO:root:Loading file ../../dataset/first_disease_sel/A1A/A1A_deficiency_GSE109516_pbmc.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/DIABETE
INFO

## Inspecting the dataset we have built

In [5]:
# Report the dimensions of the two splits exposed by the pipeline, then display the training frame.
train_shape = pipeline.train_set.shape
print(f"Train DS shape {train_shape}")
test_shape = pipeline.test_set.shape
print(f"Test DS shape {test_shape}")
pipeline.train_set

Train DS shape (299, 4602)
Test DS shape (100, 4602)


Unnamed: 0,THEM6,LRCH3,EMP3,BACE1,MECOM,HEY1,CTDNEP1,KIF1B,GALNT11,ZNF75D,...,CPOX,FST,DENND1A,PDLIM7,HEXA,RHOB,RNF6,CTBP1,ZNF682,HDAC5
DIABETE_80,0.620290,0.059441,0.521277,0.651639,0.487069,0.291667,0.367601,0.254658,0.485356,0.718830,...,0.150472,0.000000,0.504098,0.297297,0.733152,0.369865,0.380090,0.402464,0.052270,0.136833
MCM_24,0.578108,0.934587,0.124425,0.724288,0.785330,0.612562,0.735189,0.695082,0.615750,0.290716,...,0.285329,0.369730,0.829310,0.201783,0.668518,0.167614,0.283751,0.368374,0.673551,0.518351
DIABETE_117,0.325345,0.359597,0.410920,0.386558,0.822021,0.431340,0.432444,0.661756,0.578084,0.364746,...,0.277087,0.630112,0.501568,0.514458,1.000000,0.436902,0.507504,0.617611,0.390654,1.000000
A1A_39,0.115685,0.623519,0.108385,0.476984,0.177652,0.334364,0.092554,0.461684,0.454631,0.550013,...,0.397106,0.047939,0.878667,0.195274,0.180170,0.156603,0.787521,0.098047,0.101160,0.231414
GS_13,0.527060,0.547211,0.049552,0.987860,0.335767,0.951435,0.644609,0.697937,0.238370,0.542026,...,0.516340,0.183675,0.149159,0.721127,0.617795,0.751385,0.455786,0.060050,0.675331,0.886771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DIABETE_30,0.542751,0.416667,0.725768,0.679012,0.496689,0.615108,0.439560,0.529412,0.692308,0.300971,...,0.232044,0.481848,0.490741,0.629108,0.300000,0.743590,0.635514,0.464286,0.394872,0.506024
MCM_20,0.434244,0.734991,0.812388,0.472825,0.365695,0.473242,0.189573,0.550290,0.784009,0.278576,...,0.481948,0.440288,0.512212,0.748510,1.000000,0.373996,0.372025,0.104247,0.666289,0.881041
A1A_84,0.195141,0.745189,0.209474,0.323790,0.073676,0.231938,0.266692,0.458932,0.609652,0.328510,...,0.267758,0.034018,0.823018,0.353656,0.287921,0.179384,0.566963,0.206276,0.382937,0.139778
DIABETE_37,0.724907,0.216667,0.846336,0.753086,0.685430,0.543165,0.769231,0.305882,0.326923,0.339806,...,0.364641,0.072607,0.814815,0.910798,0.318750,0.637821,0.271028,0.794643,0.651282,0.692771


## Building the DT classifier


In [45]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

In [46]:
# Seed that makes the hold-out split reproducible across kernel restarts.
SPLIT_SEED = 42

dataset = pipeline.dataset
# Reuse the label column configured at the top of the notebook instead of repeating the literal.
disease_colname = DISEASE_COLNAME
x_cols = [col_name for col_name in dataset.columns if col_name != disease_colname]
# Hold out 25% of the samples as the test set (x_test / y_test). The remaining 75%
# (train_cv / target_cv) is what the cross-validated grid search is fitted on.
# Stratifying on the label keeps the class proportions the same in both parts.
# NOTE(review): the pipeline also exposes its own train_set / test_set; if any
# preprocessing step was fitted on that split, re-splitting pipeline.dataset here
# could leak information -- confirm against PreprocessPipeline.
train_cv, x_test, target_cv, y_test = train_test_split(
    dataset[x_cols],
    dataset[disease_colname],
    test_size=.25,
    stratify=dataset[disease_colname],
    random_state=SPLIT_SEED,
)

In [48]:
# Metric(s) evaluated during the search. Passing a list makes GridSearchCV use
# multi-metric scoring, so result columns carry the metric name as a suffix
# (e.g. "mean_test_accuracy").
scoring = ['accuracy']
# Candidate hyper-parameter values for DecisionTreeClassifier.
# NOTE(review): the Cartesian product of these lists is well over a hundred million
# candidates, which is not feasible for an exhaustive GridSearchCV -- trim the ranges
# or switch to a randomized search before running the full grid.
params = [
    {
        # In scikit-learn "entropy" and "log_loss" both select the Shannon information gain.
        "criterion": ["gini", "entropy", "log_loss"],
        "splitter": ["best", "random"],
        "max_depth": list(np.arange(1, 200, 1)) + [None],
        # An integer min_samples_split must be >= 2; 0 and 1 are rejected by scikit-learn.
        "min_samples_split": list(np.arange(2, 5, 1)),
        # An integer min_samples_leaf must be >= 1.
        "min_samples_leaf": list(np.arange(1, 5, 1)),
        # Valid range is [0, 0.5]. Built from integers so the upper bound is exactly 0.5
        # (a float-step arange can overshoot it through rounding).
        "min_weight_fraction_leaf": list(np.arange(10, 51, 1) / 100),
        # "auto" was deprecated (it was an alias of "sqrt" for classifiers) and later removed;
        # None means "consider all features".
        "max_features": [None, "sqrt", "log2"],
        "ccp_alpha": list(np.arange(0.1, 1, .01))
        # add another...
    }
]

In [None]:
from sklearn.tree import DecisionTreeClassifier

# 5-fold stratified cross-validation; without shuffling the folds are deterministic.
cv = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(
    # Seed the tree so that candidates using splitter="random" or feature sub-sampling
    # are scored reproducibly; otherwise the ranking changes from run to run.
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=cv,
    scoring=scoring,
    # With a list of metrics, refit must be False or the name of one metric. No best
    # estimator is refitted here; the final model is built explicitly further down.
    refit=False,
    # Use all available CPU cores.
    n_jobs=-1,
    # Needed for the mean/std train-score columns shown in the report.
    return_train_score=True
)
grid_search.fit(train_cv, target_cv)

In [29]:
# Return compact form for GridSearchCV results
# It should receive clf.cv_results_ after GridSearchCV
import pandas as pd
def grid_search_report(results, metric='accuracy'):
    """Return a compact view of GridSearchCV results, best candidate first.

    Parameters
    ----------
    results : dict
        The ``cv_results_`` attribute of a fitted ``GridSearchCV``. It must contain
        the ``rank_test_<metric>``, ``mean_test_<metric>``, ``mean_train_<metric>``,
        ``std_test_<metric>``, ``std_train_<metric>``, ``mean_fit_time`` and
        ``params`` entries.
    metric : str, default 'accuracy'
        Name of the scoring metric whose columns are reported and ranked on.

    Returns
    -------
    pandas.DataFrame
        One row per candidate, sorted by ascending ``rank_test_<metric>`` and
        restricted to the columns listed above. The original row labels are kept.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``results``.
    """
    rank_col = f'rank_test_{metric}'
    columns = [
        rank_col,
        f'mean_test_{metric}', f'mean_train_{metric}',
        f'std_test_{metric}', f'std_train_{metric}',
        'mean_fit_time', 'params',
    ]
    results_df = pd.DataFrame(results)
    missing = [name for name in columns if name not in results_df.columns]
    if missing:
        raise KeyError(f"cv_results_ is missing expected columns: {missing}")
    # A stable sort keeps tied candidates in their original order, so the row
    # reported first is deterministic.
    return results_df.sort_values(by=rank_col, kind='stable')[columns]

# Display the search results ordered by their test-accuracy rank.
grid_search_report(grid_search.cv_results_)

Unnamed: 0,rank_test_accuracy,mean_test_accuracy,mean_train_accuracy,std_test_accuracy,std_train_accuracy,mean_fit_time,params
1,1,0.709153,1.0,0.051657,0.0,0.113445,"{'criterion': 'gini', 'splitter': 'random'}"
4,2,0.708927,1.0,0.023937,0.0,0.535414,"{'criterion': 'log_loss', 'splitter': 'best'}"
5,3,0.702429,1.0,0.020927,0.0,0.127193,"{'criterion': 'log_loss', 'splitter': 'random'}"
3,4,0.702203,1.0,0.050763,0.0,0.114782,"{'criterion': 'entropy', 'splitter': 'random'}"
2,5,0.68904,1.0,0.046155,0.0,0.600106,"{'criterion': 'entropy', 'splitter': 'best'}"
0,6,0.668814,1.0,0.027644,0.0,0.389216,"{'criterion': 'gini', 'splitter': 'best'}"


## Best parameters

Printing the best parameters found for the Decision Tree model.

In [31]:
# The report is sorted by rank, so its first row holds the top-ranked parameter set.
ranked_results = grid_search_report(grid_search.cv_results_)
best_params = ranked_results.iloc[0]['params']
best_params

{'criterion': 'gini', 'splitter': 'random'}

In [33]:
# Build the final model with the best parameters found by the search.
# The seed is fixed so that the fitted tree (and the metrics reported below) can be
# reproduced; an unseeded tree differs between runs when the splitter is random or
# when candidate splits tie.
decision_tree = DecisionTreeClassifier(**best_params, random_state=42)
# Fit on the whole cross-validation portion of the data (the test set stays untouched).
decision_tree.fit(train_cv, target_cv)

# Model inspection and metrics

In [34]:
# plotting the tree
from sklearn import tree

In [35]:
# Exporting with graphviz
import graphviz
# Label the nodes with the real column names and class labels; without them the
# rendered tree only shows positional feature indices (X[i]) and raw value counts.
dot_data = tree.export_graphviz(
    decision_tree,
    out_file=None,
    feature_names=[str(col) for col in train_cv.columns],
    # classes_ is already in the ascending order export_graphviz expects.
    class_names=[str(label) for label in decision_tree.classes_],
)
graph = graphviz.Source(dot_data)
# Render the DOT source to disk and return the path of the rendered file.
graph.render("DecisionTree")

'DecisionTree.pdf'

## Evaluating the model

Start by predicting the labels of the held-out test data with the fitted model.

In [38]:
# Predict the disease label of every held-out test sample with the fitted tree.
y_pred = decision_tree.predict(x_test)

In [39]:
import sklearn.metrics as metrics
from sklearn.metrics import classification_report

In [40]:
# Per-class precision, recall, F1 and support on the held-out test set, plus the overall averages.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         A1A       0.97      0.94      0.95        32
     DIABETE       0.71      0.55      0.62        44
          GS       0.00      0.00      0.00         7
         MCM       0.27      0.57      0.36         7
        NALD       0.25      0.30      0.27        10

    accuracy                           0.61       100
   macro avg       0.44      0.47      0.44       100
weighted avg       0.66      0.61      0.63       100

