# Classification model using Decision Tree

### 1. Import libraries

In [23]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


### 2. Load RNA data

In [24]:
# Location of the input CSV. Set the MRCC_PROTEIN_MATRIX_CSV environment
# variable to read the file from another location; when it is not set, the
# original local path is used, so existing runs behave as before.
path = os.environ.get(
    'MRCC_PROTEIN_MATRIX_CSV',
    'C:/Users/sandr/Documents/ART_project/GNN model/Data/PPT-Ohmnet/mRCC_big_pool/Second big pool/mrcc_protein_matrix_84_genes_32_nodes.csv',
)
data = pd.read_csv(path)
# Show the first rows as a quick sanity check of the loaded table
data.head(5)

Unnamed: 0.1,Unnamed: 0,APAF1,ARID1A,ATM,BAP1,EPAS1,ERBB2,FLT1,FLT4,GSTP1,...,SETD2,SLC2A1,SOD2,TGM2,TP53,TSC1,TSC2,VEGFA,VHL,Y
0,0,32.668769,33.848026,35.942429,33.677294,37.95811,35.32243,33.69326,30.79376,36.48088,...,32.58565,33.38586,38.67433,38.50142,33.83518,32.93402,34.9352,37.79678,32.30615,1
1,1,32.337493,33.843513,35.988225,32.643149,38.83281,33.71706,35.56873,33.38444,36.21403,...,33.19915,33.69538,38.64559,34.33752,34.4481,33.1663,35.08304,40.09193,32.19988,1
2,2,31.818198,33.516005,36.193587,32.368866,37.19345,33.38917,34.21918,33.3467,35.34069,...,32.84628,36.23588,40.50559,35.50178,35.4198,33.63282,34.79244,38.22308,31.49147,1
3,3,32.601293,34.197698,36.578348,31.8954,39.46713,33.2234,36.25593,34.21029,35.36208,...,33.68316,34.41938,38.99231,35.77236,34.18862,32.8825,35.02014,39.94908,32.11538,0
4,4,33.593121,33.35146,36.807497,33.968348,38.49884,33.40876,35.39769,34.92401,34.26885,...,34.63036,34.59911,38.41437,33.47112,34.91241,33.44515,35.0131,39.31564,33.33646,1


In [25]:
# Feature matrix: the columns at positions 1 to 32 (the slice end, 33, is exclusive)
X = data.iloc[:, 1:33]

# Binary target derived from the Y column: 0 stays 0, any other value becomes 1.
# Per the original notebook's notes, 0 means PFS lower than 3 months
# (NonResponder, NR) and 1 means PFS over 3 months (Responder, R).
Y = [0 if label == 0 else 1 for label in data.Y]

print(X)
print('Numero de pacientes: ',len(Y))

         APAF1     ARID1A        ATM       BAP1     EPAS1     ERBB2      FLT1  \
0    32.668769  33.848026  35.942429  33.677294  37.95811  35.32243  33.69326   
1    32.337493  33.843513  35.988225  32.643149  38.83281  33.71706  35.56873   
2    31.818198  33.516005  36.193587  32.368866  37.19345  33.38917  34.21918   
3    32.601293  34.197698  36.578348  31.895400  39.46713  33.22340  36.25593   
4    33.593121  33.351460  36.807497  33.968348  38.49884  33.40876  35.39769   
..         ...        ...        ...        ...       ...       ...       ...   
176  32.603769  34.133940  35.318612  33.843872  39.13826  33.62978  35.17642   
177  33.619701  32.373330  35.771711  32.519967  35.86338  31.25871  32.24347   
178  33.316811  34.118843  36.008091  33.115209  37.91340  32.66502  35.55199   
179  33.046782  33.833796  37.008936  32.895151  37.96870  33.57688  35.18870   
180  32.170042  33.739764  35.937812  33.404526  38.75226  32.10887  32.99715   

         FLT4     GSTP1    

### 3. Train-Test dataset split

In [26]:
# Hold out 20% of the samples for testing. Stratifying on Y keeps the class
# proportions similar in both subsets; the fixed seed makes the split repeatable.
XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=125,
    stratify=Y,
)

# Work with plain NumPy arrays from here on
XTrain, XTest = XTrain.to_numpy(), XTest.to_numpy()

# Report how many rows ended up in each feature matrix and target list
for caption, subset in (
    ('Training set size:', XTrain),
    ('Target column size of the training set:', yTrain),
    ('Test set size:', XTest),
    ('Target column size of the test set:', yTest),
):
    print(caption, len(subset))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [27]:
# Candidate hyper-parameter values; every combination of them is evaluated.
# random_state has a single candidate so that all fitted trees share one seed.
param_grid = {
    'min_samples_leaf': [2, 5, 7],
    'min_samples_split': [2, 5],
    'max_depth': [2, 5, 10, 12],
    'criterion': ['entropy', 'gini'],
    'splitter': ['best', 'random'],
    'random_state': [125],
}

# GridSearchCV systematically evaluates the model over the parameter grid:
# given an estimator and the parameters to test, it scores each combination
# with 5-fold cross validation and keeps track of the best one.
clf = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=5,
)

clf.fit(XTrain, yTrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [2, 5, 10, 12],
                         'min_samples_leaf': [2, 5, 7],
                         'min_samples_split': [2, 5], 'random_state': [125],
                         'splitter': ['best', 'random']})

In [28]:
print("Best estimate of parameters according to GridSearchCV:")
# GridSearchCV was created without overriding refit (default True), so
# best_estimator_ has already been retrained on the whole training set with
# the winning parameters. Fitting it again is redundant, so it is only
# retrieved here.
model = clf.best_estimator_
# Display the selected estimator and its parameters
model

Best estimate of parameters according to GridSearchCV:


DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, random_state=125,
                       splitter='random')

In [29]:
# Cross-validation score achieved by the best parameter combination
best_cv_score = clf.best_score_
print("Best result of the cross validation of the model with the best paramters:" + str(best_cv_score))

Best result of the cross validation of the model with the best paramters:0.5834975369458127


### 5. Prediction

In [30]:
# Predict the training samples with the selected model
yhatTrain = model.predict(XTrain)

# Number of hits: predictions that agree with the true training labels
contTrain = sum(
    1 for predicted, actual in zip(yhatTrain, yTrain) if predicted == actual
)

In [31]:
# Predict the held-out test samples with the selected model
yhatTest = model.predict(XTest)

# Number of hits: predictions that agree with the true test labels
contTest = sum(
    1 for predicted, actual in zip(yhatTest, yTest) if predicted == actual
)


### 6. Results

In [32]:
# Accuracy = number of correct predictions / number of samples
train_accuracy = contTrain / len(yTrain)
test_accuracy = contTest / len(yTest)

print('Final accuracy on the training dataset:' + str(train_accuracy))
print('Final accuracy on the testing dataset: ' + str(test_accuracy))

Final accuracy on the training dataset:0.7916666666666666
Final accuracy on the testing dataset: 0.6216216216216216


In [33]:
from sklearn.metrics import classification_report,confusion_matrix

print('----------------Confusion Matrix (Training)------------------')
# Confusion matrix of true training labels versus the model's predictions
train_confusion = confusion_matrix(yTrain, yhatTrain)
print(train_confusion)

# Raw label vectors, printed one under the other for visual comparison
train_targets = np.array(yTrain)
print('Input data:  ' + str(train_targets))
print('Prediction:        ' + str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[56 12]
 [18 58]]
Input data:  [1 0 1 0 1 1 0 0 1 0 1 0 1 0 1 1 0 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 1
 0 0 1 1 1 1 1 0 0 1 1 1 0 0 0 1 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 1 0 1 0 0
 0 1 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 1
 1 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 1 0 1 0 0 0 1 1 1 1 1 0 1]
Prediction:        [1 0 1 0 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 1 1 1 1 0 1 1 0 1 0
 0 0 1 0 1 1 1 0 1 1 1 0 0 0 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0
 0 1 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 1 0 1 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 0 1 1 0 0 1 1 1 0 1 1 1]


In [34]:
# Per-class precision, recall and F1 summary for the training predictions
train_report = classification_report(yTrain, yhatTrain)
print(train_report)

              precision    recall  f1-score   support

           0       0.76      0.82      0.79        68
           1       0.83      0.76      0.79        76

    accuracy                           0.79       144
   macro avg       0.79      0.79      0.79       144
weighted avg       0.79      0.79      0.79       144



In [35]:
print('----------------Confusion Matrix (Test)------------------')
# Confusion matrix of true test labels versus the model's predictions
test_confusion = confusion_matrix(yTest, yhatTest)
print(test_confusion)

# Raw label vectors, printed one under the other for visual comparison
test_targets = np.array(yTest)
print('Input data:  ' + str(test_targets))
print('Prediction:        ' + str(yhatTest))

----------------Confusion Matrix (Test)------------------
[[12  5]
 [ 9 11]]
Input data:  [0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 0 1 0]
Prediction:        [1 0 0 0 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1 0]


In [36]:
# Per-class precision, recall and F1 summary for the test predictions
test_report = classification_report(yTest, yhatTest)
print(test_report)

              precision    recall  f1-score   support

           0       0.57      0.71      0.63        17
           1       0.69      0.55      0.61        20

    accuracy                           0.62        37
   macro avg       0.63      0.63      0.62        37
weighted avg       0.63      0.62      0.62        37

