# Classification model using Decision Tree

### 1. Import libraries

In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


### 2. Load RNA data

In [2]:
# Folder that holds the input matrix. It defaults to the original author's
# local directory; set the MRCC_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    'MRCC_DATA_DIR',
    'C:/Users/sandr/Documents/ART_project/GNN model/Data/PPT-Ohmnet/mRCC_big_pool/Second big pool',
))

# as_posix() keeps `path` a forward-slash string, exactly as it was before
# when the environment variable is not set.
path = (DATA_DIR / 'mrcc_protein_matrix_24_genes_12_nodes.csv').as_posix()
data = pd.read_csv(path)
data.head(5)

Unnamed: 0.1,Unnamed: 0,BAP1,EPAS1,MTOR,NF2,PIK3CA,PTEN,PTGS2,RNF139,SETD2,TP53,TSC1,VHL,Y
0,0,33.677294,37.95811,33.01718,33.9608,32.42544,36.73944,31.08504,32.46554,32.58565,33.83518,32.93402,32.30615,1
1,1,32.643149,38.83281,33.17883,33.69899,33.21465,37.13114,30.16993,32.2719,33.19915,34.4481,33.1663,32.19988,1
2,2,32.368866,37.19345,34.06093,34.07472,32.46705,37.91878,30.76766,32.55514,32.84628,35.4198,33.63282,31.49147,1
3,3,31.8954,39.46713,33.50445,33.14612,33.87549,37.77827,30.54053,33.19823,33.68316,34.18862,32.8825,32.11538,0
4,4,33.968348,38.49884,34.22502,32.58079,34.24976,37.99008,30.95478,30.89813,34.63036,34.91241,33.44515,33.33646,1


In [3]:
# Feature matrix: columns 1..12 by position. The printed frame below shows
# these are the twelve gene columns BAP1 ... VHL.
X = data.iloc[:, 1:13]

# Binary target, built in one vectorized step instead of a per-row loop.
# Per the original notebook: Y == 0 means PFS lower than 3 months, treated as
# NonResponder (NR, label 0); any other value means PFS over 3 months, treated
# as Responder (R, label 1).
# Y is kept as a plain list of Python ints because later cells index the
# split targets positionally (yTrain[i], yTest[i]).
Y = (data['Y'] != 0).astype(int).tolist()

print(X)
print('Numero de pacientes: ',len(Y))

          BAP1     EPAS1      MTOR       NF2    PIK3CA      PTEN     PTGS2  \
0    33.677294  37.95811  33.01718  33.96080  32.42544  36.73944  31.08504   
1    32.643149  38.83281  33.17883  33.69899  33.21465  37.13114  30.16993   
2    32.368866  37.19345  34.06093  34.07472  32.46705  37.91878  30.76766   
3    31.895400  39.46713  33.50445  33.14612  33.87549  37.77827  30.54053   
4    33.968348  38.49884  34.22502  32.58079  34.24976  37.99008  30.95478   
..         ...       ...       ...       ...       ...       ...       ...   
176  33.843872  39.13826  33.84510  33.58214  32.89218  37.99666  30.11854   
177  32.519967  35.86338  32.98942  33.10420  33.35177  34.65038  31.17902   
178  33.115209  37.91340  34.30048  33.80118  32.93922  36.77314  33.02287   
179  32.895151  37.96870  32.81197  33.51366  32.59420  36.08937  31.14709   
180  33.404526  38.75226  33.43997  33.67890  32.98728  36.87734  34.94678   

       RNF139     SETD2      TP53      TSC1       VHL  
0    32

### 3. Train-Test dataset split

In [4]:
# Hold out 20% of the samples for testing. The split is stratified on Y and
# uses a fixed seed so that it is the same on every run.
XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=125,
    stratify=Y,
)

# Convert the feature sets to arrays
XTrain, XTest = XTrain.values, XTest.values

# Report the size of each piece of the split
for caption, subset in (
    ('Training set size:', XTrain),
    ('Target column size of the training set:', yTrain),
    ('Test set size:', XTest),
    ('Target column size of the test set:', yTest),
):
    print(caption, len(subset))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [5]:
# Hyper-parameter values to explore. random_state has a single value, so every
# candidate tree is built with the same seed.
param_grid = dict(
    min_samples_leaf=[2, 5, 7],
    min_samples_split=[2, 5],
    max_depth=[2, 5, 10, 12],
    criterion=['entropy', 'gini'],
    splitter=['best', 'random'],
    random_state=[125],
)

# GridSearchCV systematically evaluates the parameter combinations above:
# given a model and the parameters to test, it scores each combination of
# parameters with 5-fold cross-validation on the training data.
base_tree = DecisionTreeClassifier()
clf = GridSearchCV(base_tree, param_grid, cv=5)

clf.fit(XTrain, yTrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [2, 5, 10, 12],
                         'min_samples_leaf': [2, 5, 7],
                         'min_samples_split': [2, 5], 'random_state': [125],
                         'splitter': ['best', 'random']})

In [6]:
print("Best estimate of parameters according to GridSearchCV:")
# clf was built with the default refit=True, so best_estimator_ has already
# been fitted on the whole training set with the winning parameters. Fitting
# it again would only repeat that work.
model = clf.best_estimator_
# Show the selected estimator (and its non-default parameters) as cell output
model

Best estimate of parameters according to GridSearchCV:


DecisionTreeClassifier(max_depth=12, min_samples_leaf=2, min_samples_split=5,
                       random_state=125, splitter='random')

In [7]:
# Mean cross-validated score obtained by the best parameter combination
print(f"Best result of the cross validation of the model with the best paramters:{clf.best_score_!s}")

Best result of the cross validation of the model with the best paramters:0.5556650246305418


### 5. Prediction

In [8]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(XTrain)
contTrain = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTrain),1) :
    if (yhatTrain[i] == yTrain[i]):
        contTrain = contTrain + 1

In [9]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(XTest)
contTest = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTest),1) :
    if (yhatTest[i] == yTest[i]):
        contTest = contTest + 1


### 6. Results

In [10]:
# Accuracy = number of hits / number of samples, for each split
train_accuracy = contTrain / len(yTrain)
test_accuracy = contTest / len(yTest)

print(f'Final accuracy on the training dataset:{train_accuracy!s}')
print(f'Final accuracy on the testing dataset: {test_accuracy!s}')

Final accuracy on the training dataset:0.8680555555555556
Final accuracy on the testing dataset: 0.4864864864864865


In [11]:
# classification_report and confusion_matrix are imported in the notebook's
# top import cell, together with the other dependencies.

# Confusion matrix on the training set, followed by the raw true labels and
# the predictions.
print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(yTrain,yhatTrain))
print('Input data:  ' + str(np.array(yTrain)))
print('Prediction:        ' +str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[66  2]
 [17 59]]
Input data:  [1 0 1 0 1 1 0 0 1 0 1 0 1 0 1 1 0 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 1
 0 0 1 1 1 1 1 0 0 1 1 1 0 0 0 1 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 1 0 1 0 0
 0 1 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 1
 1 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 1 0 1 0 0 0 1 1 1 1 1 0 1]
Prediction:        [0 0 1 0 1 0 0 0 1 0 1 0 1 0 1 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 1
 0 0 1 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1 1 0 1 0 1
 0 1 0 1 0 0 0 1 0 0 0 1 1 1 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 1 0 0 1 1
 1 1 0 1 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 0]


In [12]:
# Per-class precision, recall and F1 on the training set
train_report = classification_report(yTrain, yhatTrain)
print(train_report)

              precision    recall  f1-score   support

           0       0.80      0.97      0.87        68
           1       0.97      0.78      0.86        76

    accuracy                           0.87       144
   macro avg       0.88      0.87      0.87       144
weighted avg       0.89      0.87      0.87       144



In [13]:
# Confusion matrix on the held-out test set, followed by the raw true labels
# and the predictions.
test_confusion = confusion_matrix(yTest, yhatTest)
true_test_labels = np.array(yTest)

print('----------------Confusion Matrix (Test)------------------')
print(test_confusion)
print(f'Input data:  {true_test_labels!s}')
print(f'Prediction:        {yhatTest!s}')

----------------Confusion Matrix (Test)------------------
[[12  5]
 [14  6]]
Input data:  [0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 0 1 0]
Prediction:        [1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0]


In [14]:
# Per-class precision, recall and F1 on the held-out test set
test_report = classification_report(yTest, yhatTest)
print(test_report)

              precision    recall  f1-score   support

           0       0.46      0.71      0.56        17
           1       0.55      0.30      0.39        20

    accuracy                           0.49        37
   macro avg       0.50      0.50      0.47        37
weighted avg       0.51      0.49      0.47        37

