# Classification model using Logistic Regression

### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Load the encoded RNA data

In [2]:
# Location of the autoencoder-encoded dataset, relative to this notebook.
path = "../../../../Data_preprocessing/RNA_post_autoencoder/encoded_data_multiclass_43_review.csv"
raw_data = pd.read_csv(path)

# The original cell also contained `data.reset_index` (an attribute access that
# was never called) and `data.round(4)` (whose result was discarded). Neither
# statement changed `data`, so they were removed rather than "fixed": rounding
# for real would alter the model inputs.
# NOTE(review): if 4-decimal rounding was actually intended, apply it explicitly
# with `data = data.round(4)` and re-run the downstream cells.

# Keep the columns at positions 1-44, i.e. drop the first column of the CSV
# (presumably the saved row index - confirm against the file).
data = raw_data.iloc[:, 1:45]
data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,33,34,35,36,37,38,39,40,41,42
0,2,1.303475,0.320223,0.69162,-0.71522,0.269225,-0.053473,-0.916276,0.361488,1.341855,...,0.093131,0.798992,-0.733875,0.038262,0.433046,-0.558861,-0.318921,0.082476,0.361348,-1.581269
1,1,1.040682,0.433916,-0.177336,-0.800547,1.629002,-0.725473,-0.681308,0.043363,0.905286,...,-0.873726,0.304849,0.064908,-0.519554,-0.863431,0.0661,-0.439551,0.713993,-0.246427,-0.186777
2,2,-0.5592,0.636671,-0.067989,-1.036499,-0.306244,0.535286,0.646122,-0.509084,1.033889,...,-0.161364,0.682148,-0.9732,-0.753369,-0.353522,0.262463,-0.381236,-0.342806,0.985693,-0.042482
3,0,1.391767,0.104729,-0.06312,-1.088009,0.555148,-0.69876,-0.12583,-0.17368,0.955284,...,-0.880376,0.879043,0.222619,-0.724652,-0.580264,0.19751,0.039846,1.042282,-0.152176,-0.321515
4,1,1.6213,0.988073,-0.638373,-1.677556,0.2884,0.13792,-0.20117,0.558159,0.453544,...,-0.7508,-0.738778,1.238205,1.384416,-0.896759,-1.147495,0.979291,1.397885,1.220256,-0.773419


In [3]:
# Class labels to predict.
Y = data["Target"]

# Feature matrix: every column of `data` except the label. The row-id column
# was already dropped when `data` was built, so only "Target" has to go.
# Dropping by name replaces the former positional slice `iloc[:, 1:439]`, whose
# upper bound was far larger than the actual column count and only worked
# because slices are clamped. The result is the same as long as "Target" is the
# first column, as the `data.head()` output shows.
X = data.drop(columns=["Target"])


print(X)
print('Numero de pacientes: ',len(Y))

            0         1         2         3         4         5         6  \
0    1.303475  0.320223  0.691620 -0.715220  0.269225 -0.053473 -0.916276   
1    1.040682  0.433916 -0.177336 -0.800547  1.629002 -0.725473 -0.681308   
2   -0.559200  0.636671 -0.067989 -1.036499 -0.306244  0.535286  0.646122   
3    1.391767  0.104729 -0.063120 -1.088009  0.555148 -0.698760 -0.125830   
4    1.621300  0.988073 -0.638373 -1.677556  0.288400  0.137920 -0.201170   
..        ...       ...       ...       ...       ...       ...       ...   
176  0.312103  0.656807 -0.522374 -0.846966  0.896216 -0.312821 -0.150635   
177  0.234944 -0.094377  0.260074 -0.793957  0.243106 -0.228022  0.059169   
178  0.420867  0.195804 -0.247219 -0.115416  0.472313 -0.002184  0.153051   
179  0.508123  0.810161 -0.569704 -0.666671  1.194661 -0.734187  0.211813   
180 -0.029691 -0.012050  0.024318 -0.584142  0.235847 -0.015007  0.447715   

            7         8         9  ...        33        34        35  \
0  

In [4]:
# Hold out 20% of the samples for testing; stratifying on Y keeps the class
# proportions equal in both partitions, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=125,
    stratify=Y,
)

# Plain NumPy copies of the labels, used later for positional comparison
# against the predictions.
yTrain = y_train.to_numpy()
yTest = y_test.to_numpy()

# Report the size of every partition.
for label, subset in (
    ('Training set size:', X_train),
    ('Target column size of the training set:', y_train),
    ('Test set size:', X_test),
    ('Target column size of the test set:', y_test),
):
    print(label, len(subset))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [5]:
# Hyper-parameter values to explore for the logistic regression.
param_grid = {
    'C': [1, 2, 5, 7, 10],
    'solver': ['newton-cg', 'liblinear'],
    'max_iter': list(range(100, 1001, 100)) + [2000, 3000, 4000, 5000, 20000],
    'random_state': [125],
}

# GridSearchCV systematically evaluates and selects the parameters of the
# model: given an estimator and the parameter values to try, it scores every
# combination with cross-validation (5 folds here).
base_estimator = LogisticRegression()
clf = GridSearchCV(estimator=base_estimator, param_grid=param_grid, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 2, 5, 7, 10],
                         'max_iter': [100, 200, 300, 400, 500, 600, 700, 800,
                                      900, 1000, 2000, 3000, 4000, 5000,
                                      20000],
                         'random_state': [125],
                         'solver': ['newton-cg', 'liblinear']})

In [6]:
# `clf` was created with GridSearchCV's default refit=True, so best_estimator_
# has already been refitted on the whole training set with the winning
# parameters. Fitting it a second time would only repeat that work.
model = clf.best_estimator_

# The original cell printed this header without ever printing the parameters.
print("Best estimate of parameters according to GridSearchCV:")
print(clf.best_params_)

# Bare last expression so the notebook still displays the fitted estimator.
model

Best estimate of parameters according to GridSearchCV:


LogisticRegression(C=1, random_state=125, solver='liblinear')

In [7]:
# Mean cross-validated score obtained by the best parameter combination.
print("Best result of the cross validation of the model with the best paramters:", clf.best_score_, sep="")

Best result of the cross validation of the model with the best paramters:0.43103448275862066


In [9]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(X_train)
contTrain = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTrain),1) :
    if (yhatTrain[i] == yTrain[i]):
        contTrain = contTrain + 1

In [11]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(X_test)
contTest = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTest),1) :
    if (yhatTest[i] == yTest[i]):
        contTest = contTest + 1

In [12]:
# Accuracy = correct predictions / number of samples, for each partition.
train_accuracy = contTrain / len(y_train)
test_accuracy = contTest / len(y_test)

print('Final accuracy on the training dataset:', train_accuracy, sep='')
print('Final accuracy on the testing dataset: ', test_accuracy, sep='')

Final accuracy on the training dataset:0.6597222222222222
Final accuracy on the testing dataset: 0.32432432432432434


In [None]:
# classification_report and confusion_matrix are already imported in the
# notebook's import cell, so the duplicate mid-notebook import was removed.

# Confusion matrix on the training set (rows: true class, columns: predicted
# class), followed by the raw true labels and predictions for inspection.
print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(y_train,yhatTrain))
print('Input data:  ' + str(np.array(y_train)))
print('Prediction:        ' +str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[58  0  9]
 [ 8  8  9]
 [16  3 33]]
Input data:  [1 0 2 0 2 2 0 0 2 0 2 0 1 0 2 2 0 2 2 2 0 0 0 2 0 0 0 0 1 1 2 2 2 2 0 1 2
 0 0 2 1 2 2 1 0 0 2 2 1 0 0 0 2 0 0 2 0 2 1 2 0 2 1 0 0 0 0 2 2 2 0 2 0 0
 0 2 0 2 0 0 0 1 2 0 0 1 2 2 1 1 0 2 0 2 0 0 1 0 1 0 0 2 0 0 2 2 2 1 0 2 2
 1 2 0 1 0 0 0 2 0 1 0 2 0 0 2 1 1 0 0 0 2 0 1 0 0 0 2 2 2 1 2 0 1]
Prediction:        [0 0 2 2 0 0 0 0 2 0 2 2 0 0 0 2 0 0 0 2 0 0 0 2 0 0 0 0 2 0 2 2 0 0 0 0 2
 0 0 1 0 2 2 2 2 0 0 2 2 0 0 0 0 0 0 2 0 2 2 2 0 0 2 0 0 0 0 2 2 2 2 1 0 0
 0 2 0 0 0 0 0 1 0 0 0 1 2 0 1 1 0 2 0 2 0 2 1 0 1 2 2 2 0 0 2 1 0 2 0 2 2
 2 2 0 0 0 0 2 0 0 1 0 2 0 0 2 1 2 0 2 0 2 0 0 0 0 0 2 0 2 0 2 0 2]


In [None]:
# Per-class precision / recall / F1 on the training set.
train_report = classification_report(y_train, yhatTrain)
print(train_report)

              precision    recall  f1-score   support

           0       0.71      0.87      0.78        67
           1       0.73      0.32      0.44        25
           2       0.65      0.63      0.64        52

    accuracy                           0.69       144
   macro avg       0.69      0.61      0.62       144
weighted avg       0.69      0.69      0.67       144



In [None]:
# Confusion matrix on the test set (rows: true class, columns: predicted
# class), followed by the raw true labels and predictions for inspection.
test_confusion = confusion_matrix(y_test, yhatTest)
true_test_labels = np.array(y_test)

print('----------------Confusion Matrix (Test)------------------')
print(test_confusion)
print('Input data:  ' + str(true_test_labels))
print('Prediction:        ' + str(yhatTest))

----------------Confusion Matrix (Test)------------------
[[ 6  2 10]
 [ 4  0  2]
 [ 8  0  5]]
Input data:  [0 1 0 2 0 2 1 0 1 0 2 2 1 2 2 0 1 2 0 0 0 2 1 0 0 2 0 2 2 0 0 0 2 2 0 0 0]
Prediction:        [2 0 1 0 2 0 0 0 0 2 0 0 0 2 2 2 2 0 1 2 0 0 2 2 0 2 2 2 0 0 0 2 2 0 2 2 0]


In [None]:
# Per-class precision / recall / F1 on the held-out test set.
test_report = classification_report(y_test, yhatTest)
print(test_report)

              precision    recall  f1-score   support

           0       0.33      0.33      0.33        18
           1       0.00      0.00      0.00         6
           2       0.29      0.38      0.33        13

    accuracy                           0.30        37
   macro avg       0.21      0.24      0.22        37
weighted avg       0.27      0.30      0.28        37

