# Classification model using Decision Tree

### 1. Import libraries

In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

### 2. Load the encoded RNA data

In [10]:
# Load the autoencoder-encoded dataset; the path is relative to this notebook's directory.
path = "../../../../Data_preprocessing/RNA_post_autoencoder/encoded_data_binary_200.csv"
data = pd.read_csv(path)

# The earlier `data.reset_index` (attribute access, never called) and `data.round(4)`
# (result discarded) had no effect on `data`, so they were removed instead of being
# silently "activated" -- the values below are therefore still unrounded, as before.

# Keep the columns at positions 1..201 and drop position 0.
# NOTE(review): position 0 is presumably the unnamed row-id column written when the CSV
# was saved, leaving Target followed by the 200 encoded features -- confirm against the file.
data = data.iloc[:, 1:202]
data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,1,0.832239,1.233726,1.140988,2.16568,-1.597017,-0.743907,-0.071826,0.116781,-0.635038,...,0.077455,0.066486,2.200836,2.124988,-0.88225,0.367745,1.330217,0.660291,-0.899856,1.536773
1,1,-0.137469,-0.010969,0.843637,0.96603,0.409513,-0.847683,-1.006673,0.749325,-1.147702,...,-0.583738,-1.836914,1.90809,1.263731,-1.759173,-0.579072,1.043836,0.452179,0.771279,1.454416
2,1,-0.405876,0.109203,1.359791,1.22549,-0.138266,-1.679678,0.182724,-0.188638,-1.128553,...,0.342459,-0.801281,2.540216,0.93361,-1.590028,0.056035,1.889005,1.706726,0.912262,1.206671
3,0,-0.37618,0.196114,0.779141,1.123773,0.374653,-0.947631,-0.892223,0.621932,-0.999321,...,-0.272015,-2.344969,2.089854,1.595235,-1.722826,-0.317329,1.074202,0.198977,0.91791,1.435312
4,1,-1.964517,1.128601,-0.194799,-1.433777,-0.143446,-1.161713,-2.380596,0.026122,-1.105232,...,0.160001,-1.498763,2.403551,1.303866,-1.757287,0.949241,0.272723,2.77102,1.826682,1.85936


In [11]:
# Labels: the "Target" column of the loaded frame.
Y = data["Target"]

# Features: positional slice that skips column 0 and keeps everything after it
# (the end bound 202 may exceed the frame width; iloc slices are clipped).
# NOTE(review): column 0 is presumably Target at this point, since the row-id column
# appears to have been dropped when `data` was loaded -- confirm.
X = data.iloc[:, 1:202]

print(X)
n_patients = len(Y)
print('Numero de pacientes: ', n_patients)

            0         1         2         3         4         5         6  \
0    0.832239  1.233726  1.140988  2.165680 -1.597017 -0.743907 -0.071826   
1   -0.137469 -0.010969  0.843637  0.966030  0.409513 -0.847683 -1.006673   
2   -0.405876  0.109203  1.359791  1.225490 -0.138266 -1.679678  0.182724   
3   -0.376180  0.196114  0.779141  1.123773  0.374653 -0.947631 -0.892223   
4   -1.964517  1.128601 -0.194799 -1.433777 -0.143446 -1.161713 -2.380596   
..        ...       ...       ...       ...       ...       ...       ...   
176 -0.442013  0.276400  0.702629  0.145038  0.253912 -1.027387 -0.954011   
177  0.122423  0.435671  1.195094  1.099650 -0.651924 -1.125421 -0.957930   
178 -0.338142  0.528099  0.594873  0.742365 -0.105854 -1.069006 -0.974031   
179  0.151963  0.431527  0.093600  0.226152 -0.943175 -1.319361 -1.243683   
180 -0.074435  0.889761  1.095850  0.759461 -0.669753 -0.916490 -0.742355   

            7         8         9  ...       190       191       192  \
0  

### 3. Train-Test dataset split

In [12]:
# Hold out 20% of the samples for testing. `stratify=Y` asks for the class proportions
# of Y to be preserved in both parts; `random_state` makes the shuffle repeatable.
XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=125,
    stratify=Y,
)

# Turn the feature frames into plain NumPy arrays (the label Series are left untouched here).
XTrain, XTest = XTrain.to_numpy(), XTest.to_numpy()

# Report the size of every part of the split.
for label, part in (
    ('Training set size:', XTrain),
    ('Target column size of the training set:', yTrain),
    ('Test set size:', XTest),
    ('Target column size of the test set:', yTest),
):
    print(label, len(part))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [13]:
# Candidate hyper-parameters for the decision tree. `random_state` has a single value,
# so it is fixed for every candidate rather than searched over.
param_grid = dict(
    min_samples_leaf=[2, 5, 7, 10],
    min_samples_split=[2, 5, 7, 10],
    max_depth=[2, 5, 10, 12, 15, 20, 100],
    criterion=['entropy', 'gini'],
    splitter=['best', 'random'],
    random_state=[125],
)

# GridSearchCV tries every combination in `param_grid` on the given estimator and
# scores each one with 5-fold cross-validation (cv=5) on the data passed to fit().
base_tree = DecisionTreeClassifier()
clf = GridSearchCV(estimator=base_tree, param_grid=param_grid, cv=5)

clf.fit(XTrain, yTrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [2, 5, 10, 12, 15, 20, 100],
                         'min_samples_leaf': [2, 5, 7, 10],
                         'min_samples_split': [2, 5, 7, 10],
                         'random_state': [125],
                         'splitter': ['best', 'random']})

In [14]:
print("Best estimate of parameters according to GridSearchCV:")
# Take the winning estimator from the search and fit it on the training data.
# fit() returns the estimator itself, so `model` is the same object as clf.best_estimator_.
model = clf.best_estimator_.fit(XTrain, yTrain)
# Last expression of the cell: shows the selected estimator and its parameters.
model

Best estimate of parameters according to GridSearchCV:


DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=5,
                       random_state=125, splitter='random')

In [15]:
# Cross-validation score that GridSearchCV recorded for the selected parameter combination.
best_cv_score = clf.best_score_
print("Best result of the cross validation of the model with the best paramters:" + str(best_cv_score))

Best result of the cross validation of the model with the best paramters:0.5832512315270936


### 5. Prediction

In [16]:
# Making predictions with the optimal model on the training dataset
yTrain=yTrain.to_numpy()
yhatTrain = model.predict(XTrain)
contTrain = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTrain),1) :
    if (yhatTrain[i] == yTrain[i]):
        contTrain = contTrain + 1

In [17]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(XTest)
contTest = 0
yTest=yTest.to_numpy()

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTest),1) :
    if (yhatTest[i] == yTest[i]):
        contTest = contTest + 1


### 6. Results

In [18]:
# Accuracy = number of correct predictions divided by the number of samples in that set.
train_accuracy = contTrain / len(yTrain)
print('Final accuracy on the training dataset:' + str(train_accuracy))

test_accuracy = contTest / len(yTest)
print('Final accuracy on the testing dataset: ' + str(test_accuracy))

Final accuracy on the training dataset:0.8958333333333334
Final accuracy on the testing dataset: 0.5945945945945946


In [19]:
# Confusion matrix for the training set, followed by the raw true and predicted labels.
# classification_report / confusion_matrix come from the notebook's top import cell, so
# every later cell that uses them works without depending on this one having run.
print('----------------Confusion Matrix (Training)------------------')
# Arguments are (true labels, predicted labels).
print(confusion_matrix(yTrain, yhatTrain))
print('Input data:  ' + str(np.array(yTrain)))
print('Prediction:        ' + str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[58 10]
 [ 5 71]]
Input data:  [1 0 1 0 1 1 0 0 1 0 1 0 1 0 1 1 0 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 1
 0 0 1 1 1 1 1 0 0 1 1 1 0 0 0 1 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 1 0 1 0 0
 0 1 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 1
 1 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 1 0 1 0 0 0 1 1 1 1 1 0 1]
Prediction:        [0 0 1 1 1 1 0 0 1 0 1 0 1 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 1 1 1 0 1 1
 0 0 1 1 1 1 1 0 0 1 1 1 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0
 0 1 0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 0 0 1 0 0 1 0 1 1 1 1 1
 1 1 0 1 0 0 0 1 0 1 0 1 0 1 1 1 1 0 0 0 1 0 1 0 0 0 1 1 1 1 0 0 1]


In [20]:
# Per-class metrics report for the training-set predictions (true labels first, predictions second).
train_report = classification_report(yTrain, yhatTrain)
print(train_report)

              precision    recall  f1-score   support

           0       0.92      0.85      0.89        68
           1       0.88      0.93      0.90        76

    accuracy                           0.90       144
   macro avg       0.90      0.89      0.89       144
weighted avg       0.90      0.90      0.90       144



In [21]:
# Confusion matrix for the test set, followed by the raw true and predicted labels.
print('----------------Confusion Matrix (Test)------------------')
# Arguments are (true labels, predicted labels).
test_confusion = confusion_matrix(yTest, yhatTest)
print(test_confusion)

true_labels_text = str(np.array(yTest))
print('Input data:  ' + true_labels_text)
predicted_labels_text = str(yhatTest)
print('Prediction:        ' + predicted_labels_text)

----------------Confusion Matrix (Test)------------------
[[ 7 10]
 [ 5 15]]
Input data:  [0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 0 1 0]
Prediction:        [0 1 0 1 1 0 1 0 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1]


In [22]:
# Per-class metrics report for the test-set predictions (true labels first, predictions second).
test_report = classification_report(yTest, yhatTest)
print(test_report)

              precision    recall  f1-score   support

           0       0.58      0.41      0.48        17
           1       0.60      0.75      0.67        20

    accuracy                           0.59        37
   macro avg       0.59      0.58      0.57        37
weighted avg       0.59      0.59      0.58        37

