# Classification model using Decision Tree

### 1. Import libraries

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV


### 2. Load the autoencoder-encoded RNA data

In [48]:
# Load the autoencoder-encoded RNA dataset from CSV.
path = "../../../../Data_preprocessing/RNA_post_autoencoder/encoded_data_multiclass_43.csv"
data = pd.read_csv(path)

# Drop the first CSV column by position (presumably the saved row id,
# "Unnamed: 0" -- confirm against the file). The slice is open-ended so it
# follows the file's real width instead of a hard-coded upper bound.
# NOTE: values are deliberately left unrounded. The earlier `data.round(4)`
# never stored its result, so the model has always seen full-precision data.
data = data.iloc[:, 1:]
data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,33,34,35,36,37,38,39,40,41,42
0,2,-0.366403,-0.529891,1.224275,-0.170353,2.797608,-2.96038,0.503309,2.175944,1.066994,...,2.206299,-0.502463,-2.004801,-0.438421,0.785737,-1.988872,-0.89365,1.52871,3.073825,1.899409
1,1,-0.022685,-0.080576,1.521277,0.333096,2.073513,-2.504466,0.212547,1.684705,0.283884,...,1.657404,-0.800799,-1.17654,-0.398753,0.854077,-1.42104,-0.487865,1.479854,2.290627,1.117899
2,2,-0.052578,-0.362782,1.127761,-0.077959,2.745393,-3.656872,0.171195,2.509113,0.844297,...,2.001973,-0.640431,-2.522731,-0.346004,0.709842,-1.786768,-0.986672,1.240316,3.191606,2.450312
3,0,-0.095125,-0.152655,1.517417,0.131297,2.404263,-2.807642,0.213807,2.105238,0.44596,...,1.930249,-0.709466,-1.687836,-0.319494,0.988847,-1.690042,-0.947813,1.495447,2.861046,1.617567
4,1,0.172483,-0.59903,1.382951,-0.103038,2.338006,-2.98872,0.623872,2.488591,0.634746,...,1.437723,-0.653985,-2.120162,-0.160325,0.505564,-1.804533,-0.608725,1.377656,2.063588,1.264385


In [49]:
# Label vector: the `Target` column.
Y = data.Target

# Feature matrix: every column after the first one.
# Assumes `Target` is the first column of `data` at this point -- the selection
# is positional, exactly as before, but open-ended so it no longer depends on a
# hard-coded column count that does not match this dataset.
X = data.iloc[:, 1:]

print(X)
print('Numero de pacientes: ',len(Y))

            0         1         2         3         4         5         6  \
0   -0.366403 -0.529891  1.224275 -0.170353  2.797608 -2.960380  0.503309   
1   -0.022685 -0.080576  1.521277  0.333096  2.073513 -2.504466  0.212547   
2   -0.052578 -0.362782  1.127761 -0.077959  2.745393 -3.656872  0.171195   
3   -0.095125 -0.152655  1.517417  0.131297  2.404263 -2.807642  0.213807   
4    0.172483 -0.599030  1.382951 -0.103038  2.338006 -2.988720  0.623872   
..        ...       ...       ...       ...       ...       ...       ...   
176 -0.038708 -0.484153  1.432885  0.085147  2.330397 -2.537074  0.430618   
177 -0.067312 -0.207609  1.119932 -0.150279  2.498550 -3.748665  0.099322   
178  0.037973 -0.539611  1.428888 -0.079274  2.637841 -3.243155  0.248875   
179 -0.066098 -0.426008  1.787303  0.068520  2.061454 -1.809541  0.379197   
180 -0.103007 -0.591417  0.982493 -0.205921  2.387277 -3.304925  0.182727   

            7         8         9  ...        33        34        35  \
0  

In [50]:
# Stratified 80/20 hold-out split with a fixed seed so the partition is reproducible.
XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=125,
    stratify=Y,
)

# Work with plain NumPy arrays from here on (features and labels alike).
XTrain, XTest = XTrain.to_numpy(), XTest.to_numpy()
yTrain, yTest = yTrain.to_numpy(), yTest.to_numpy()

# Sanity check: features and labels must have matching lengths in each split.
print('Training set size:', XTrain.shape[0])
print('Target column size of the training set:', yTrain.shape[0])
print('Test set size:', XTest.shape[0])
print('Target column size of the test set:', yTest.shape[0])

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [51]:
# Hyper-parameter search space for the decision tree. `random_state` is listed
# with a single value so every candidate tree is built with the same seed.
param_grid = {
    'min_samples_leaf': [2, 5, 7, 10],
    'min_samples_split': [2, 5, 7, 10],
    'max_depth': [2, 5, 10, 12, 15, 20, 100],
    'criterion': ['entropy', 'gini'],
    'splitter': ['best', 'random'],
    'random_state': [125],
}

# GridSearchCV evaluates every combination in `param_grid` on a fresh
# DecisionTreeClassifier using 5-fold cross-validation over the training set,
# which lets us pick the configuration systematically.
clf = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=5,
)

clf.fit(XTrain, yTrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [2, 5, 10, 12, 15, 20, 100],
                         'min_samples_leaf': [2, 5, 7, 10],
                         'min_samples_split': [2, 5, 7, 10],
                         'random_state': [125],
                         'splitter': ['best', 'random']})

In [52]:
print("Best estimate of parameters according to GridSearchCV:")
# GridSearchCV was created without `refit=...`, so its default applies: the
# winning configuration has already been refit on the whole training set.
# Calling `fit` again would only repeat that work, so the estimator is used as is.
model = clf.best_estimator_
# Display the selected estimator (its repr lists the non-default parameters).
model

Best estimate of parameters according to GridSearchCV:


DecisionTreeClassifier(max_depth=12, min_samples_leaf=2, min_samples_split=5,
                       random_state=125, splitter='random')

In [53]:
# Mean cross-validated score achieved by the selected parameter combination.
print("Best result of the cross validation of the model with the best paramters:", clf.best_score_, sep="")

Best result of the cross validation of the model with the best paramters:0.4862068965517241


### 5. Prediction

In [54]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(XTrain)
contTrain = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTrain),1) :
    if (yhatTrain[i] == yTrain[i]):
        contTrain = contTrain + 1

In [55]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(XTest)
contTest = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTest),1) :
    if (yhatTest[i] == yTest[i]):
        contTest = contTest + 1


### 6. Results

In [56]:
# Accuracy = correctly classified samples / total samples, for each split.
print('Final accuracy on the training dataset:', contTrain / len(yTrain), sep='')
print('Final accuracy on the testing dataset: ', contTest / len(yTest), sep='')

Final accuracy on the training dataset:0.8333333333333334
Final accuracy on the testing dataset: 0.40540540540540543


In [57]:
# Confusion matrix for the training set, followed by the raw true and
# predicted label vectors. `confusion_matrix` is imported in the notebook's
# import cell at the top.
print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(yTrain, yhatTrain))
# `yTrain` is already a NumPy array, so it is printed directly (no extra copy).
print('Input data:  ' + str(yTrain))
print('Prediction:        ' + str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[65  1  1]
 [10 15  0]
 [11  1 40]]
Input data:  [1 0 2 0 2 2 0 0 2 0 2 0 1 0 2 2 0 2 2 2 0 0 0 2 0 0 0 0 1 1 2 2 2 2 0 1 2
 0 0 2 1 2 2 1 0 0 2 2 1 0 0 0 2 0 0 2 0 2 1 2 0 2 1 0 0 0 0 2 2 2 0 2 0 0
 0 2 0 2 0 0 0 1 2 0 0 1 2 2 1 1 0 2 0 2 0 0 1 0 1 0 0 2 0 0 2 2 2 1 0 2 2
 1 2 0 1 0 0 0 2 0 1 0 2 0 0 2 1 1 0 0 0 2 0 1 0 0 0 2 2 2 1 2 0 1]
Prediction:        [1 0 2 0 2 2 0 0 0 0 2 0 0 0 0 2 0 2 0 2 0 0 0 2 0 0 0 0 0 0 2 2 2 0 0 1 0
 0 0 0 1 2 2 0 0 0 0 2 0 0 0 0 2 0 0 2 0 1 0 2 0 2 1 0 0 0 0 2 2 2 0 2 0 0
 0 0 0 2 0 0 0 1 2 0 0 0 2 0 1 1 0 2 0 2 0 2 1 0 0 0 0 2 0 0 2 2 2 1 1 2 0
 0 0 0 1 0 0 0 2 0 1 0 2 0 0 2 1 1 0 0 0 2 0 1 0 0 0 2 2 2 1 2 0 0]


In [58]:
# Per-class precision, recall and F1 on the training set.
print(classification_report(y_true=yTrain, y_pred=yhatTrain))

              precision    recall  f1-score   support

           0       0.76      0.97      0.85        67
           1       0.88      0.60      0.71        25
           2       0.98      0.77      0.86        52

    accuracy                           0.83       144
   macro avg       0.87      0.78      0.81       144
weighted avg       0.86      0.83      0.83       144



In [59]:
# Confusion matrix for the held-out test set, followed by the raw true and
# predicted label vectors for a sample-by-sample comparison.
print('----------------Confusion Matrix (Test)------------------')
print(confusion_matrix(y_true=yTest, y_pred=yhatTest))
print('Input data:  ', np.asarray(yTest), sep='')
print('Prediction:        ', yhatTest, sep='')

----------------Confusion Matrix (Test)------------------
[[11  0  7]
 [ 3  1  2]
 [10  0  3]]
Input data:  [0 1 0 2 0 2 1 0 1 0 2 2 1 2 2 0 1 2 0 0 0 2 1 0 0 2 0 2 2 0 0 0 2 2 0 0 0]
Prediction:        [2 0 0 0 0 0 0 2 0 0 2 2 1 2 0 0 2 0 0 2 2 0 2 0 0 0 0 0 0 2 0 2 0 0 2 0 0]


In [60]:
# Per-class precision, recall and F1 on the held-out test set.
print(classification_report(y_true=yTest, y_pred=yhatTest))

              precision    recall  f1-score   support

           0       0.46      0.61      0.52        18
           1       1.00      0.17      0.29         6
           2       0.25      0.23      0.24        13

    accuracy                           0.41        37
   macro avg       0.57      0.34      0.35        37
weighted avg       0.47      0.41      0.39        37

