# Classification model using Logistic Regression

### 1. Import libraries

In [169]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Load clinical data

In [170]:
# Location of the categorized clinical table, relative to this notebook.
# The literal is split only for readability; the pieces join to one path.
path = (
    '../../../Data_preprocessing/Prediction PFS/RNA+Clinic joined/'
    'Clinical_data_categorized_PFS.csv'
)
data = pd.read_csv(path)
# Preview the first rows to check the file was parsed as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,RNA_ID,Age,Sarc,Rhab,Number_of_Prior_Therapies,Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy,Tumor_Shrinkage,PFS,TM_TC_Ratio,...,MSKCC_NA,MSKCC_POOR,IMDC_FAVORABLE,IMDC_INTERMEDIATE,IMDC_NOT_REPORTED,IMDC_POOR,ImmunoPhenotype_Desert,ImmunoPhenotype_Excluded,ImmunoPhenotype_Infiltrated,ImmunoPhenotype_NA
0,0,G138701_RCCBMS-00020-T_v1_RNA_OnPrem,62.0,0.0,0.0,2.0,962.0,-16.40625,12.164384,0.766909,...,0,0,0,0,1,0,0,0,1,0
1,1,G138701_RCCBMS-00097-T_v1_RNA_OnPrem,67.0,0.0,0.0,2.0,254.0,-40.0,5.490411,0.773269,...,0,0,0,0,1,0,0,0,1,0
2,2,G138701_RCCBMS-00141-T_v1_RNA_OnPrem,62.0,0.0,0.0,1.0,800.0,-89.43662,61.906849,1.782859,...,0,1,0,0,1,0,0,0,1,0
3,3,G138701_RCCBMS-00099-T_v1_RNA_OnPrem,60.0,0.0,0.0,3.0,790.0,11.787072,1.249315,5.890573,...,0,0,0,0,1,0,0,1,0,0
4,4,G138701_RCCBMS-00163-T_v1_RNA_OnPrem,76.0,0.0,0.0,2.0,1241.0,0.0,5.654795,1.374775,...,0,0,0,0,1,0,0,0,0,1


In [171]:
# Build the target column by binning PFS (the thresholds are months, per the
# original author's notes):
#   0 -> PFS < 3         NonResponder (NR)
#   1 -> 3 <= PFS < 6    SemiResponder (SR)
#   2 -> PFS >= 6        Responder (R)
# np.select keeps the first matching condition, so the second test only
# applies to rows with PFS >= 3. A missing PFS fails both comparisons and
# falls through to class 2, exactly as the former row-by-row loop did.
pfs = data['PFS']
# Kept as a plain Python list so that the train/test splits remain
# positionally indexable in the later cells.
Y = np.select([pfs < 3, pfs < 6], [0, 1], default=2).tolist()

# PFS is the source of the target, so it must not remain among the features.
# NOTE: this rebinds `data`; re-running this cell without reloading the CSV
# fails because the PFS column is already gone.
data = data.drop('PFS', axis=1)

# Positional slice that skips the leading identifier columns.
# NOTE(review): confirm that positions 2..25 hold exactly the intended
# features (no ID column and no outcome-derived column).
X = data.iloc[:, 2:26]
print(X)
print('Numero de pacientes: ', len(Y))

      Age  Sarc  Rhab  Number_of_Prior_Therapies  \
0    62.0   0.0   0.0                        2.0   
1    67.0   0.0   0.0                        2.0   
2    62.0   0.0   0.0                        1.0   
3    60.0   0.0   0.0                        3.0   
4    76.0   0.0   0.0                        2.0   
..    ...   ...   ...                        ...   
176  77.0   0.0   0.0                        1.0   
177  54.0   0.0   0.0                        1.0   
178  64.0   0.0   0.0                        1.0   
179  75.0   0.0   0.0                        1.0   
180  50.0   0.0   0.0                        1.0   

     Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy  \
0                                                962.0             
1                                                254.0             
2                                                800.0             
3                                                790.0             
4                                  

### 3. Train-Test dataset split

In [172]:
# Hold out 20% of the patients for testing. Stratifying on Y keeps the class
# proportions similar in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=126,
    stratify=Y,
)

# Report the size of every partition (features and targets must match).
for caption, part in (
    ('Training set size:', X_train),
    ('Target column size of the training set:', y_train),
    ('Test set size:', X_test),
    ('Target column size of the test set:', y_test),
):
    print(caption, len(part))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [173]:
# Hyper-parameter grid explored by the search.
# Only 'liblinear' is used: 'lbfgs', 'sag' and 'saga' were discarded because
# they did not finish running ("TOTAL NO. of ITERATIONS REACHED LIMIT").
param_grid = {
    'C': [1, 2, 5, 7, 10],
    'solver': ['liblinear'],
    'max_iter': [25, 50, 100, 200, 500, 1000],
    'random_state': [1225],
}

# GridSearchCV evaluates every parameter combination with 5-fold
# cross-validation on the training set and keeps the best-scoring one.
clf = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
clf.fit(X_train, y_train)



GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 2, 5, 7, 10],
                         'max_iter': [25, 50, 100, 200, 500, 1000],
                         'random_state': [1225], 'solver': ['liblinear']})

In [174]:
print("Best estimate of parameters according to GridSearchCV:")
# GridSearchCV refits by default (refit=True), so best_estimator_ has already
# been retrained on the whole training set with the winning parameters.
# Fitting it again would only repeat that work.
model = clf.best_estimator_
# Display the selected estimator together with its parameters.
model

Best estimate of parameters according to GridSearchCV:


LogisticRegression(C=1, max_iter=25, random_state=1225, solver='liblinear')

In [175]:
# Mean cross-validated score obtained by the best parameter combination.
best_cv_score = clf.best_score_
print("Best result of the cross validation of the model with the best paramters:", best_cv_score, sep="")

Best result of the cross validation of the model with the best paramters:0.7152709359605911


### 5. Prediction

In [176]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(X_train)
contTrain = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(y_train),1) :
    if (yhatTrain[i] == y_train[i]):
        contTrain = contTrain + 1

In [177]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(X_test)
contTest = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(y_test),1) :
    if (yhatTest[i] == y_test[i]):
        contTest = contTest + 1

In [178]:
# Accuracy = correct predictions / number of samples, for each partition.
train_accuracy = contTrain / len(y_train)
print('Final accuracy on the training dataset:', train_accuracy, sep='')
test_accuracy = contTest / len(y_test)
print('Final accuracy on the testing dataset: ', test_accuracy, sep='')

Final accuracy on the training dataset:0.7708333333333334
Final accuracy on the testing dataset: 0.6216216216216216


In [179]:
# confusion_matrix is already imported in the notebook's import cell, so it
# is not imported again here.
# Rows of the matrix are the true classes, columns the predicted classes.
print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(y_train, yhatTrain))
# True labels followed by the model's predictions, for a side-by-side check.
print('Input data:  ' + str(np.array(y_train)))
print('Prediction:        ' + str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[63  1  3]
 [12  2 11]
 [ 6  0 46]]
Input data:  [2 2 1 0 2 2 0 0 2 2 1 2 2 0 2 1 1 2 1 0 2 0 1 0 0 0 0 2 0 0 0 0 0 0 0 0 2
 1 0 2 0 2 0 1 0 0 2 2 0 0 0 0 0 2 1 2 1 0 1 0 1 2 1 0 0 0 2 2 1 0 2 2 1 1
 2 0 0 0 0 0 2 2 2 2 2 0 1 0 2 2 1 2 2 2 0 1 1 2 1 0 2 0 1 1 0 0 0 2 2 0 2
 2 0 0 0 2 2 2 0 0 0 0 0 0 2 0 2 0 1 2 0 0 2 2 2 2 0 0 2 0 0 0 0 1]
Prediction:        [2 2 2 0 2 0 0 0 0 2 0 2 2 0 0 2 1 2 2 0 2 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 2
 2 0 2 0 2 0 0 0 0 2 2 0 0 0 0 0 0 0 2 2 0 1 0 2 2 2 0 0 0 2 2 2 0 2 2 0 0
 2 0 0 0 0 0 2 2 2 2 2 2 0 0 2 2 0 2 2 2 2 0 0 2 2 0 0 0 0 0 0 0 0 2 2 0 2
 2 0 0 0 2 2 2 0 0 0 0 0 0 2 0 2 0 2 2 0 2 2 0 2 2 1 0 2 0 0 0 0 0]


In [180]:
# Per-class precision, recall and F1 on the training split.
train_report = classification_report(y_train, yhatTrain)
print(train_report)

              precision    recall  f1-score   support

           0       0.78      0.94      0.85        67
           1       0.67      0.08      0.14        25
           2       0.77      0.88      0.82        52

    accuracy                           0.77       144
   macro avg       0.74      0.63      0.61       144
weighted avg       0.75      0.77      0.72       144



In [181]:
# Confusion matrix for the held-out split: rows are the true classes,
# columns the predicted classes.
test_confusion = confusion_matrix(y_test, yhatTest)
print('----------------Confusion Matrix (Test)------------------')
print(test_confusion)
# True labels followed by the model's predictions, for a side-by-side check.
print('Input data:  ', np.array(y_test), sep='')
print('Prediction:        ', yhatTest, sep='')

----------------Confusion Matrix (Test)------------------
[[15  1  2]
 [ 0  1  5]
 [ 4  2  7]]
Input data:  [0 2 0 1 0 2 1 0 0 2 2 0 0 0 0 0 0 0 2 0 2 2 2 2 1 1 2 2 0 0 0 0 1 2 2 1 0]
Prediction:        [0 1 0 2 0 0 2 0 0 2 0 0 2 0 0 0 0 0 2 2 1 2 0 2 2 1 2 2 0 0 0 1 2 2 0 2 0]


In [182]:
# Per-class precision, recall and F1 on the held-out test split.
test_report = classification_report(y_test, yhatTest)
print(test_report)

              precision    recall  f1-score   support

           0       0.79      0.83      0.81        18
           1       0.25      0.17      0.20         6
           2       0.50      0.54      0.52        13

    accuracy                           0.62        37
   macro avg       0.51      0.51      0.51        37
weighted avg       0.60      0.62      0.61        37

