# Classification model using Decision Tree

### 1. Import libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### 2. Load the encoded RNA data

In [2]:
# Location of the autoencoder-encoded CSV, relative to this notebook.
path = "../../../../Data_preprocessing/RNA_post_autoencoder/encoded_data_binary_200_review.csv"
data = pd.read_csv(path)
# Skip the first CSV column and keep columns 1..201 (Target followed by the
# encoded features). The former `data.reset_index` and `data.round(4)` lines
# were dropped: the first never called the method and the second discarded its
# result, so neither had any effect on `data`.
data = data.iloc[:, 1:202]
data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,1,0.442789,0.226327,-3.395011,0.761912,-1.181912,-2.057796,1.90103,1.522585,1.42352,...,-0.868301,-0.086499,-1.845631,0.055275,-0.643941,1.154629,0.672262,-1.960425,0.978955,1.969951
1,1,-0.028265,1.410163,-2.657692,0.825858,0.435628,-0.972779,1.769565,1.036833,1.384876,...,-1.698915,-1.105859,-0.437742,0.587427,0.04823,-0.216112,1.084762,-2.144046,1.397851,0.873189
2,1,0.269204,0.884609,-1.665391,1.609233,-0.564921,0.837579,1.007009,-0.8865,0.638139,...,-0.221767,-2.244491,-1.593955,-0.136215,-0.294638,-0.84912,-0.267188,-1.294744,0.276453,-0.562008
3,0,0.239312,0.799451,-2.839295,0.810844,0.533907,-0.948322,1.99509,0.853652,1.67033,...,-1.495095,-1.396998,-0.734805,0.479203,-0.163358,-0.100398,0.64768,-2.374888,1.707838,0.507836
4,1,-0.351865,-1.245096,-0.744767,-1.102401,0.138185,1.691464,0.830067,0.261012,0.59356,...,1.819882,-0.315093,-2.720363,-1.11918,0.080892,-1.040733,0.136569,-1.674414,0.958555,-1.271945


In [3]:
# Feature matrix: positional columns 1..201 of `data`, i.e. everything after
# the leading Target column.
X = data.iloc[:, 1:202]

# Label vector: the Target column.
Y = data['Target']

print(X)
print('Numero de pacientes: ', len(Y))

            0         1         2         3         4         5         6  \
0    0.442789  0.226327 -3.395011  0.761912 -1.181912 -2.057796  1.901030   
1   -0.028265  1.410163 -2.657692  0.825858  0.435628 -0.972779  1.769565   
2    0.269204  0.884609 -1.665391  1.609233 -0.564921  0.837579  1.007009   
3    0.239312  0.799451 -2.839295  0.810844  0.533907 -0.948322  1.995090   
4   -0.351865 -1.245096 -0.744767 -1.102401  0.138185  1.691464  0.830067   
..        ...       ...       ...       ...       ...       ...       ...   
176  0.121525  0.789393 -1.679044 -0.338311 -0.033060  0.143954  0.686666   
177 -0.217280  0.614284 -3.202434  1.415873 -0.034867 -1.669631  1.756469   
178 -0.015025  0.468370 -1.550519 -0.117665 -0.567333 -0.207721  0.605682   
179 -0.244144  1.576056 -1.483863  0.089200 -0.107631 -0.831589  0.260183   
180 -0.161056  0.517091 -1.964447  0.106236 -0.606570 -0.446199  0.921929   

            7         8         9  ...       190       191       192  \
0  

### 3. Train-Test dataset split

In [4]:
# Stratified hold-out split: 20% of the rows go to the test set, seed fixed at 125.
XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=125,
    stratify=Y,
)

# From here on the feature matrices are plain NumPy arrays.
XTrain, XTest = XTrain.values, XTest.values

# Report how many rows ended up in each partition.
print('Training set size:', len(XTrain))
print('Target column size of the training set:', len(yTrain))
print('Test set size:', len(XTest))
print('Target column size of the test set:', len(yTest))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [5]:
# Candidate values for each decision-tree hyper-parameter. `random_state` has
# a single candidate, so every tree in the search uses the same seed.
param_grid = dict(
    min_samples_leaf=[2, 5, 7, 10],
    min_samples_split=[2, 5, 7, 10],
    max_depth=[2, 5, 10, 12, 15, 20, 100],
    criterion=['entropy', 'gini'],
    splitter=['best', 'random'],
    random_state=[125],
)

# GridSearchCV evaluates every combination in `param_grid` with 5-fold
# cross-validation on the training data, so the parameters can be selected
# systematically from the cross-validated performance.
base_tree = DecisionTreeClassifier()
clf = GridSearchCV(estimator=base_tree, param_grid=param_grid, cv=5)

clf.fit(XTrain, yTrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [2, 5, 10, 12, 15, 20, 100],
                         'min_samples_leaf': [2, 5, 7, 10],
                         'min_samples_split': [2, 5, 7, 10],
                         'random_state': [125],
                         'splitter': ['best', 'random']})

In [6]:
print("Best estimate of parameters according to GridSearchCV:")
# The search was built without a `refit` argument, so it uses the default
# refit=True: best_estimator_ has already been refit on the full training set
# with the winning parameters. Fitting it again on the same data with the same
# fixed random_state only repeated that work, so the extra fit is dropped.
model = clf.best_estimator_
# Display the selected estimator (same repr the former `model.fit(...)` call returned).
model

Best estimate of parameters according to GridSearchCV:


DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=10,
                       random_state=125)

In [7]:
# Report the best_score_ recorded by the grid search (message and value printed back to back).
print("Best result of the cross validation of the model with the best paramters:", clf.best_score_, sep="")

Best result of the cross validation of the model with the best paramters:0.5623152709359605


### 5. Prediction

In [8]:
# Making predictions with the optimal model on the training dataset
# Predict on the training set with the selected model and count the hits.
# np.asarray accepts both a pandas Series and an ndarray, so this cell can be
# re-run safely (the previous `yTrain.to_numpy()` raised AttributeError on a
# second run because `yTrain` had already been rebound to an ndarray).
yTrain = np.asarray(yTrain)
yhatTrain = model.predict(XTrain)

# Element-wise comparison with the true labels; the number of matches is the hit count.
contTrain = int(np.sum(yhatTrain == yTrain))

In [9]:
# Making predictions with the optimal model on the test dataset
# Predict on the held-out test set with the selected model and count the hits.
yhatTest = model.predict(XTest)
# np.asarray accepts both a pandas Series and an ndarray, so this cell can be
# re-run safely (the previous `yTest.to_numpy()` raised AttributeError on a
# second run because `yTest` had already been rebound to an ndarray).
yTest = np.asarray(yTest)

# Element-wise comparison with the true labels; the number of matches is the hit count.
contTest = int(np.sum(yhatTest == yTest))


### 6. Results

In [10]:
# Accuracy = correctly predicted samples / total samples, for each partition.
train_accuracy = contTrain / len(yTrain)
test_accuracy = contTest / len(yTest)

print('Final accuracy on the training dataset:' + str(train_accuracy))
print('Final accuracy on the testing dataset: ' + str(test_accuracy))

Final accuracy on the training dataset:0.8333333333333334
Final accuracy on the testing dataset: 0.5405405405405406


In [11]:
# classification_report and confusion_matrix are imported in the imports cell
# at the top of the notebook, so every dependency is declared in one place.

# Confusion matrix on the training set, followed by the raw true and predicted labels.
print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(yTrain, yhatTrain))
print('Input data:  ' + str(np.array(yTrain)))
print('Prediction:        ' + str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[55 13]
 [11 65]]
Input data:  [1 0 1 0 1 1 0 0 1 0 1 0 1 0 1 1 0 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 1
 0 0 1 1 1 1 1 0 0 1 1 1 0 0 0 1 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 1 0 1 0 0
 0 1 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 1
 1 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 1 0 1 0 0 0 1 1 1 1 1 0 1]
Prediction:        [1 0 1 0 1 1 0 1 1 0 1 0 1 0 1 0 0 1 1 1 0 0 0 1 1 0 1 0 0 1 1 0 0 1 0 1 1
 0 0 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 0 1 1 0 0 0 0 1 1 1 1 1 0 0
 0 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0 1 0 0 1 0 1 1 0 1 1 1 1 0 1 1
 1 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 1 0 0 1 0 0 1 0 0 0 1 1 1 1 1 0 1]


In [12]:
# Text report comparing training labels with training predictions.
train_report = classification_report(yTrain, yhatTrain)
print(train_report)

              precision    recall  f1-score   support

           0       0.83      0.81      0.82        68
           1       0.83      0.86      0.84        76

    accuracy                           0.83       144
   macro avg       0.83      0.83      0.83       144
weighted avg       0.83      0.83      0.83       144



In [13]:
# Confusion matrix on the test set, followed by the raw true and predicted labels.
print('----------------Confusion Matrix (Test)------------------')
test_confusion = confusion_matrix(yTest, yhatTest)
print(test_confusion)
true_labels_text = str(np.array(yTest))
predicted_labels_text = str(yhatTest)
print('Input data:  ' + true_labels_text)
print('Prediction:        ' + predicted_labels_text)

----------------Confusion Matrix (Test)------------------
[[ 6 11]
 [ 6 14]]
Input data:  [0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 0 1 0]
Prediction:        [1 0 0 1 1 1 1 1 1 1 0 0 1 0 1 0 1 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1]


In [14]:
# Text report comparing test labels with test predictions.
test_report = classification_report(yTest, yhatTest)
print(test_report)

              precision    recall  f1-score   support

           0       0.50      0.35      0.41        17
           1       0.56      0.70      0.62        20

    accuracy                           0.54        37
   macro avg       0.53      0.53      0.52        37
weighted avg       0.53      0.54      0.53        37

