# Classification model using Logistic Regression

### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Load clinical and RNA data

In [2]:
# Joined clinical + RNA feature table, located relative to this notebook.
# Only forward slashes are used: the previous mix of '/' and backslashes put
# unrecognised escape sequences (\P, \R, \C) inside a normal string literal and
# could only be resolved on Windows. Forward slashes work on every platform.
path = '../../../../Data_preprocessing/Prediction PFS/RNA+Clinic joined/Clinical_data_and_RNA_30_Features_PFS.csv'
data = pd.read_csv(path)
# Last expression of the cell: shows the first rows as a rich table.
data.head()

Unnamed: 0.1,Unnamed: 0,RNA_ID,Age,Sarc,Rhab,Number_of_Prior_Therapies,Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy,Tumor_Shrinkage,PFS,TM_TC_Ratio,...,RP11-96O20.4,RP5-827C21.1,RPS12P26,SDR42E1,SNORD113-5,TARBP2,TRAV41,TRIM43B,VN2R17P,XPC
0,0,G138701_RCCBMS-00020-T_v1_RNA_OnPrem,62.0,0.0,0.0,2.0,707.0,0.0,12.164384,1.374775,...,21.18753,27.40814,31.71139,30.26124,21.32743,29.60417,22.22154,21.9744,21.12857,32.24472
1,1,G138701_RCCBMS-00097-T_v1_RNA_OnPrem,62.0,0.0,0.0,2.0,707.0,0.0,5.490411,1.374775,...,21.18753,29.25015,29.15373,29.1273,21.32743,29.61188,22.22154,21.9744,21.12857,32.15619
2,2,G138701_RCCBMS-00141-T_v1_RNA_OnPrem,62.0,0.0,0.0,1.0,707.0,0.0,61.906849,1.374775,...,21.18753,23.03429,29.37858,27.22907,21.32743,31.90343,22.22154,21.9744,21.12857,31.86536
3,3,G138701_RCCBMS-00099-T_v1_RNA_OnPrem,62.0,0.0,0.0,3.0,707.0,0.0,1.249315,1.374775,...,21.18753,23.03429,30.19136,22.83731,21.32743,30.35424,22.22154,21.9744,21.12857,31.63561
4,4,G138701_RCCBMS-00163-T_v1_RNA_OnPrem,62.0,0.0,0.0,2.0,707.0,0.0,5.654795,1.374775,...,21.18753,23.03429,31.82172,22.83731,21.32743,31.60946,22.22154,21.9744,21.12857,33.12717


In [3]:
# Target column, derived from PFS (in months):
#   0 -> PFS < 3         NonResponder (NR)
#   1 -> 3 <= PFS < 6    SemiResponder (SR)
#   2 -> PFS >= 6        Responder (R)
# np.select returns the choice of the FIRST condition that holds, so the
# "< 6" test only decides rows that already failed "< 3". Rows matching neither
# condition get the default 2; this includes missing PFS values, because any
# comparison with NaN is False (same outcome as an if/elif/else chain).
pfs = data['PFS'].to_numpy()
# Kept as a plain Python list: later cells index the split targets by position.
Y = np.select([pfs < 3, pfs < 6], [0, 1], default=2).tolist()

# PFS is the source of the target and RNA_ID is an identifier, so neither may
# remain among the model inputs.
data = data.drop(columns=['PFS', 'RNA_ID'])

print(data.head())
# Positional slice: skips column 0 ('Unnamed: 0', the row id) and keeps
# positions 1..56.
# NOTE(review): the upper bound 57 is hard-coded; presumably it equals the
# number of columns left after the drop - confirm, otherwise trailing feature
# columns are silently excluded.
X = data.iloc[:, 1:57]
# Report the size of the feature matrix instead of dumping the whole frame.
print(X.shape)
print('Numero de pacientes: ', len(Y))

   Unnamed: 0   Age  Sarc  Rhab  Number_of_Prior_Therapies  \
0           0  62.0   0.0   0.0                        2.0   
1           1  62.0   0.0   0.0                        2.0   
2           2  62.0   0.0   0.0                        1.0   
3           3  62.0   0.0   0.0                        3.0   
4           4  62.0   0.0   0.0                        2.0   

   Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy  \
0                                              707.0             
1                                              707.0             
2                                              707.0             
3                                              707.0             
4                                              707.0             

   Tumor_Shrinkage  TM_TC_Ratio  Cohort_CM-009  Cohort_CM-010  ...  \
0              0.0     1.374775              0              1  ...   
1              0.0     1.374775              0              1  ...   
2              0.0  

In [4]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=125
)

# Sanity check: features and targets of each split must have matching lengths.
for label, part in (
    ('Training set size:', X_train),
    ('Target column size of the training set:', y_train),
    ('Test set size:', X_test),
    ('Target column size of the test set:', y_test),
):
    print(label, len(part))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [5]:
# Candidate hyper-parameter values explored by the grid search.
param_grid = {
    'C': [1, 2, 5, 7, 10],
    'solver': ['newton-cg', 'liblinear'],
    'max_iter': [20000, 50000, 100000],
    'random_state': [125],
}

# GridSearchCV systematically evaluates and selects the parameters of the
# model: given an estimator and the parameter values to try, it scores every
# combination with cross-validation (5 folds here).
clf = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
# Last expression of the cell: fit() returns the fitted search object, whose
# repr is displayed as the cell output.
clf.fit(X_train, y_train)



GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 2, 5, 7, 10],
                         'max_iter': [20000, 50000, 100000],
                         'random_state': [125],
                         'solver': ['newton-cg', 'liblinear']})

In [6]:
print("Best estimate of parameters according to GridSearchCV:")
# The search was created with the default refit=True, so best_estimator_ has
# already been retrained on the whole training set using the winning
# parameters. Calling fit() on it again would only repeat that work.
model = clf.best_estimator_
# Last expression of the cell: displays the selected estimator and its parameters.
model

Best estimate of parameters according to GridSearchCV:


LogisticRegression(C=1, max_iter=20000, random_state=125, solver='liblinear')

In [7]:
# Mean cross-validated score obtained by the best parameter combination.
print("Best result of the cross validation of the model with the best paramters:", clf.best_score_, sep="")

Best result of the cross validation of the model with the best paramters:0.6672413793103449


In [8]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(X_train)
contTrain = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(y_train),1) :
    if (yhatTrain[i] == y_train[i]):
        contTrain = contTrain + 1

In [9]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(X_test)
contTest = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(y_test),1) :
    if (yhatTest[i] == y_test[i]):
        contTest = contTest + 1

In [10]:
# Accuracy = number of hits / number of samples, for each split.
print(f'Final accuracy on the training dataset:{contTrain / len(y_train)}')
print(f'Final accuracy on the testing dataset: {contTest / len(y_test)}')

Final accuracy on the training dataset:0.875
Final accuracy on the testing dataset: 0.7027027027027027


In [11]:
# classification_report and confusion_matrix are already imported in the
# notebook's import cell, so the import is not repeated here.

# Confusion matrix on the training split: rows are true classes, columns are
# predicted classes.
print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(y_train, yhatTrain))
# Raw label vectors, for inspection: true classes first, then predictions.
print('Input data:  ' + str(np.array(y_train)))
print('Prediction:        ' + str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[65  0  0]
 [ 1 14 13]
 [ 1  3 47]]
Input data:  [0 2 1 1 1 1 0 2 0 1 2 2 1 0 2 0 2 0 0 0 2 1 0 0 2 2 0 1 0 0 1 1 0 2 1 0 1
 2 0 2 2 0 0 1 0 1 1 0 1 0 0 1 0 0 0 2 0 2 0 2 1 0 2 2 0 2 2 2 0 2 0 2 0 2
 2 0 0 2 0 0 1 2 1 0 0 0 0 2 1 2 2 0 0 2 2 2 2 0 0 0 2 0 2 0 0 0 0 1 0 2 0
 2 0 2 1 0 0 0 0 1 2 2 0 2 0 2 2 0 1 2 0 1 2 0 0 2 2 1 2 1 2 2 0 0]
Prediction:        [0 2 0 1 1 2 0 2 0 2 2 2 1 0 2 0 2 0 0 0 2 2 0 0 1 2 0 1 0 0 1 2 0 2 2 0 1
 2 0 2 2 0 0 1 0 2 1 0 2 0 0 2 0 0 0 2 0 2 0 2 2 0 2 2 0 2 2 1 0 2 0 0 0 2
 2 0 0 2 0 0 2 2 2 0 0 0 0 2 1 2 2 0 0 2 2 1 2 0 0 0 2 0 2 0 0 0 0 1 0 2 0
 2 0 2 1 0 0 0 0 2 2 2 0 2 0 2 2 0 1 2 0 2 2 0 0 2 2 1 2 1 2 2 0 0]


In [12]:
# Per-class precision, recall and F1 on the training split.
print(classification_report(y_true=y_train, y_pred=yhatTrain))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        65
           1       0.82      0.50      0.62        28
           2       0.78      0.92      0.85        51

    accuracy                           0.88       144
   macro avg       0.86      0.81      0.82       144
weighted avg       0.88      0.88      0.87       144



In [13]:
# Confusion matrix on the held-out test split, followed by the raw label
# vectors (true classes first, then predictions).
print('----------------Confusion Matrix (Test)------------------')
print(confusion_matrix(y_true=y_test, y_pred=yhatTest))
print('Input data:  ', np.array(y_test), sep='')
print('Prediction:        ', yhatTest, sep='')

----------------Confusion Matrix (Test)------------------
[[17  3  0]
 [ 0  2  1]
 [ 2  5  7]]
Input data:  [1 0 0 2 2 1 0 2 2 2 0 0 0 0 0 0 2 0 1 0 0 0 0 2 0 0 2 2 2 0 0 2 0 0 2 2 2]
Prediction:        [1 0 0 0 2 2 0 2 2 1 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 2 1 2 0 0 2 1 0 2 1 1]


In [14]:
# Per-class precision, recall and F1 on the held-out test split.
print(classification_report(y_true=y_test, y_pred=yhatTest))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87        20
           1       0.20      0.67      0.31         3
           2       0.88      0.50      0.64        14

    accuracy                           0.70        37
   macro avg       0.66      0.67      0.61        37
weighted avg       0.83      0.70      0.74        37



### 5. Test on the everolimus dataset

In [15]:
# Everolimus feature table, located relative to this notebook.
path = '../../../../Data_preprocessing/Testing with everolimus/RNA+Clinic joined/Clinical_data_and_RNA_30_Features_everolimus.csv'
data = pd.read_csv(path)

# Target column, derived from PFS (in months):
#   0 -> PFS < 3         NonResponder (NR)
#   1 -> 3 <= PFS < 6    SemiResponder (SR)
#   2 -> PFS >= 6        Responder (R)
# np.select returns the choice of the FIRST condition that holds; rows matching
# neither condition (including missing PFS, since comparisons with NaN are
# False) get the default 2, same as an if/elif/else chain.
pfs = data['PFS'].to_numpy()
# Kept as a plain Python list so it can be indexed by position.
Y = np.select([pfs < 3, pfs < 6], [0, 1], default=2).tolist()

# Remove the target source, the sample identifier and the row-id column so that
# only model input columns remain.
data = data.drop(columns=['PFS', 'RNA_ID', 'Unnamed: 0'])

In [16]:
# Preview the first five rows of the everolimus feature table (head() defaults to 5).
data.head()

Unnamed: 0,Age,Sarc,Rhab,Number_of_Prior_Therapies,Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy,Tumor_Shrinkage,TM_TC_Ratio,Cohort_CM-009,Cohort_CM-010,Cohort_CM-025,...,RP11-96O20.4,RP5-827C21.1,RPS12P26,SDR42E1,SNORD113-5,TARBP2,TRAV41,TRIM43B,VN2R17P,XPC
0,63.0,0.0,0.0,1.0,727.5,-1.204819,1.410021,0.0,0.0,1,...,20.77248,27.09131,31.45042,28.23012,21.27932,31.08288,25.86902,25.5149,20.93829,31.96084
1,63.0,0.0,0.0,1.0,724.75,-1.204819,1.410021,0.0,0.0,1,...,21.03524,26.79393,28.60666,27.55631,20.88605,30.93771,29.24738,23.29578,20.99449,32.62522
2,63.0,0.0,0.0,1.0,723.375,-1.204819,1.410021,0.0,0.0,1,...,21.03524,25.96827,29.55728,28.44482,20.88605,32.2035,30.68048,20.68433,20.99449,32.66731
3,63.0,0.0,0.0,1.0,722.6875,-1.204819,1.410021,0.0,0.0,1,...,21.03524,26.26191,28.32744,30.9253,20.88605,30.41685,29.69705,23.83079,20.99449,32.59007
4,63.0,0.0,0.0,1.0,723.03125,-1.204819,1.410021,0.0,0.0,1,...,21.03524,25.45149,29.25251,29.51644,20.88605,31.69236,21.4775,20.68433,20.99449,32.31981


In [17]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(data)
contTest = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(Y),1) :
    if (yhatTest[i] == Y[i]):
        contTest = contTest + 1

print('Final accuracy on the testing dataset: ' + str(contTest/len(Y)))

Final accuracy on the testing dataset: 0.35384615384615387


In [18]:
# Confusion matrix, raw label vectors (true classes, then predictions) and
# per-class metrics for the everolimus predictions.
print('----------------Confusion Matrix (Test)------------------')
print(confusion_matrix(y_true=Y, y_pred=yhatTest))
print('Input data:  ', np.array(Y), sep='')
print('Prediction:        ', yhatTest, sep='')
print(classification_report(y_true=Y, y_pred=yhatTest))

----------------Confusion Matrix (Test)------------------
[[12 25 23]
 [ 0  7 20]
 [ 0 16 27]]
Input data:  [2 2 0 0 0 1 2 1 0 0 1 2 2 1 1 1 2 1 0 0 2 0 0 1 2 2 0 1 2 0 0 0 0 2 2 0 0
 2 0 2 0 2 2 0 0 0 1 0 1 0 1 1 0 2 0 0 0 0 2 0 0 0 0 1 1 2 1 1 0 0 2 2 1 0
 0 2 0 0 0 2 2 0 0 2 1 2 2 0 0 2 0 2 1 0 2 1 2 1 2 0 0 1 2 2 2 2 1 0 0 0 2
 0 1 0 2 0 0 2 2 1 1 2 0 2 0 2 0 0 0 0]
Prediction:        [1 2 2 1 1 2 2 2 1 2 1 2 2 2 2 2 1 2 1 1 2 2 2 1 2 1 1 2 2 2 0 1 0 1 1 1 0
 1 2 2 2 2 1 2 1 2 1 1 2 0 2 2 0 1 0 1 1 1 1 1 1 1 0 2 2 2 2 2 1 1 2 2 2 1
 2 2 2 2 1 2 2 2 2 1 2 2 2 0 1 2 1 1 2 2 2 1 2 1 1 2 0 2 2 2 2 2 2 1 2 0 1
 2 1 2 1 2 1 1 2 1 2 2 2 1 0 2 2 0 1 2]
              precision    recall  f1-score   support

           0       1.00      0.20      0.33        60
           1       0.15      0.26      0.19        27
           2       0.39      0.63      0.48        43

    accuracy                           0.35       130
   macro avg       0.51      0.36      0.33       130
weighted avg     