# Classification model using Logistic Regression

### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Load the encoded RNA data

In [2]:
# Location of the encoded dataset (CSV).
path ="../../../../Data_preprocessing/RNA_post_autoencoder/encoded_data_multiclass_200_review.csv"

# Read the CSV and keep positional columns 1..201 only, i.e. drop the first
# column of the file. Values are kept at full precision and the default
# RangeIndex produced by read_csv is used as-is.
data = pd.read_csv(path).iloc[:, 1:202]

# Preview the first rows.
data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,2,0.090383,-0.649882,-0.316971,0.177905,-0.953876,0.335641,0.547794,-0.553937,0.110278,...,2.09292,-0.902635,-1.113107,-0.413884,-0.589024,1.206955,-0.300583,0.270012,-0.130936,0.180399
1,1,0.477048,-2.02071,1.091671,1.287272,-0.363761,0.754296,0.507876,0.777283,-1.12003,...,0.697679,0.224512,-1.192957,0.06101,0.960343,0.241303,0.719193,0.983675,-1.161567,-0.246087
2,2,0.635105,-1.455384,1.860401,0.632697,-0.930232,0.295499,0.756204,0.523225,-2.107392,...,0.579307,-1.044159,-1.480045,-0.010595,0.363854,0.695291,-0.760165,0.963433,-1.901634,-0.315921
3,0,0.961507,-2.004696,0.853696,1.317209,-0.5838,1.132528,0.550134,0.217618,-1.55247,...,1.360498,-0.087889,-1.876129,0.023887,0.442273,1.212686,-0.353484,0.908717,-1.918175,-0.237121
4,1,-0.525586,-1.646895,0.137172,0.377502,-0.305213,0.422324,0.28272,-0.810817,-2.39663,...,1.815167,-2.681982,-0.615784,-0.857696,-2.06251,0.060919,-0.82684,1.52223,-1.23677,0.269057


In [3]:
# Feature matrix: positional columns 1 through 201 of `data`, so the first
# column is skipped (presumably the Target column -- confirm against the preview).
X = data.iloc[:, 1:202]

# Label vector: the "Target" column.
Y = data["Target"]

print(X)
print('Numero de pacientes: ', Y.shape[0])

            0         1         2         3         4         5         6  \
0    0.090383 -0.649882 -0.316971  0.177905 -0.953876  0.335641  0.547794   
1    0.477048 -2.020710  1.091671  1.287272 -0.363761  0.754296  0.507876   
2    0.635105 -1.455384  1.860401  0.632697 -0.930232  0.295499  0.756204   
3    0.961507 -2.004696  0.853696  1.317209 -0.583800  1.132528  0.550134   
4   -0.525586 -1.646895  0.137172  0.377502 -0.305213  0.422324  0.282720   
..        ...       ...       ...       ...       ...       ...       ...   
176  0.236947 -1.546189  0.735707  0.337749 -0.865903  0.177455  0.514749   
177  0.524165 -2.116409 -0.105561  0.699807 -0.680601  0.442997  0.912442   
178  0.392108 -1.538342  1.038728  0.582032 -0.505431 -0.056508  0.191502   
179 -0.151722 -1.736164  0.687119  0.422451 -0.804183 -0.165308  0.940421   
180  0.356281 -1.524765  0.030108 -0.003902 -0.441868 -0.106647  0.691161   

            7         8         9  ...       190       191       192  \
0  

In [4]:
# Stratified 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=125,
)

# NumPy copies of the label vectors.
yTrain, yTest = y_train.to_numpy(), y_test.to_numpy()

# Report the size of every partition.
for label, part in (
    ('Training set size:', X_train),
    ('Target column size of the training set:', y_train),
    ('Test set size:', X_test),
    ('Target column size of the test set:', y_test),
):
    print(label, len(part))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [5]:
# Hyper-parameter values explored by the grid search.
param_grid = dict(
    C=[1, 2, 5, 7, 10],
    solver=['newton-cg', 'liblinear'],
    max_iter=[*range(100, 1001, 100), 2000, 3000, 4000, 5000, 20000],
    random_state=[125],
)

# GridSearchCV evaluates and selects the model's parameters systematically:
# given a model and the parameter values to try, it scores the former as a
# function of the latter using cross-validation (5 folds here).
clf = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 2, 5, 7, 10],
                         'max_iter': [100, 200, 300, 400, 500, 600, 700, 800,
                                      900, 1000, 2000, 3000, 4000, 5000,
                                      20000],
                         'random_state': [125],
                         'solver': ['newton-cg', 'liblinear']})

In [6]:
print("Best estimate of parameters according to GridSearchCV:")
# GridSearchCV was created without overriding `refit`, so by default it has
# already refitted the best configuration on the whole training set;
# `best_estimator_` is therefore trained and needs no second call to fit().
model = clf.best_estimator_
# Display the selected estimator as the cell output.
model

Best estimate of parameters according to GridSearchCV:


LogisticRegression(C=5, random_state=125, solver='liblinear')

In [7]:
# Mean cross-validated score obtained by the best parameter combination.
print("Best result of the cross validation of the model with the best paramters:", clf.best_score_, sep="")

Best result of the cross validation of the model with the best paramters:0.4798029556650246


In [8]:
# Predict on the training set with the selected model.
yhatTrain = model.predict(X_train)

# np.asarray accepts both a pandas Series and an ndarray, so this cell can be
# re-run safely (calling .to_numpy() on an already-converted ndarray would
# raise AttributeError).
y_train = np.asarray(y_train)

# Number of training samples whose prediction equals the true label.
contTrain = int(np.sum(yhatTrain == y_train))

In [9]:
# Predict on the held-out test set with the selected model.
yhatTest = model.predict(X_test)

# np.asarray accepts both a pandas Series and an ndarray, so this cell can be
# re-run safely (calling .to_numpy() on an already-converted ndarray would
# raise AttributeError).
y_test = np.asarray(y_test)

# Number of test samples whose prediction equals the true label.
contTest = int(np.sum(yhatTest == y_test))

In [10]:
# Accuracy = number of correct predictions divided by the number of samples.
train_accuracy = contTrain / len(y_train)
print('Final accuracy on the training dataset:', train_accuracy, sep='')

test_accuracy = contTest / len(y_test)
print('Final accuracy on the testing dataset: ', test_accuracy, sep='')

Final accuracy on the training dataset:0.9722222222222222
Final accuracy on the testing dataset: 0.35135135135135137


In [11]:
from sklearn.metrics import classification_report,confusion_matrix

# Confusion matrix of true vs. predicted labels on the training set,
# followed by the raw label vectors for visual comparison.
train_cm = confusion_matrix(y_train, yhatTrain)

print('----------------Confusion Matrix (Training)------------------')
print(train_cm)
print('Input data:  ', np.array(y_train), sep='')
print('Prediction:        ', yhatTrain, sep='')

----------------Confusion Matrix (Training)------------------
[[66  0  1]
 [ 2 23  0]
 [ 1  0 51]]
Input data:  [1 0 2 0 2 2 0 0 2 0 2 0 1 0 2 2 0 2 2 2 0 0 0 2 0 0 0 0 1 1 2 2 2 2 0 1 2
 0 0 2 1 2 2 1 0 0 2 2 1 0 0 0 2 0 0 2 0 2 1 2 0 2 1 0 0 0 0 2 2 2 0 2 0 0
 0 2 0 2 0 0 0 1 2 0 0 1 2 2 1 1 0 2 0 2 0 0 1 0 1 0 0 2 0 0 2 2 2 1 0 2 2
 1 2 0 1 0 0 0 2 0 1 0 2 0 0 2 1 1 0 0 0 2 0 1 0 0 0 2 2 2 1 2 0 1]
Prediction:        [1 0 2 0 2 2 0 0 2 0 2 0 1 0 2 2 0 2 0 2 0 0 0 2 0 0 0 0 1 1 2 2 2 2 0 1 2
 0 0 2 0 2 2 1 0 0 2 2 1 0 0 0 2 0 0 2 0 2 1 2 0 2 1 0 0 0 0 2 2 2 0 2 0 0
 0 2 0 2 0 0 0 1 2 0 0 1 2 2 1 0 0 2 0 2 0 0 1 0 1 0 0 2 0 0 2 2 2 1 0 2 2
 1 2 0 1 0 0 0 2 0 1 0 2 0 0 2 1 1 0 0 2 2 0 1 0 0 0 2 2 2 1 2 0 1]


In [12]:
# Per-class precision / recall / F1 on the training set.
train_report = classification_report(y_train, yhatTrain)
print(train_report)

              precision    recall  f1-score   support

           0       0.96      0.99      0.97        67
           1       1.00      0.92      0.96        25
           2       0.98      0.98      0.98        52

    accuracy                           0.97       144
   macro avg       0.98      0.96      0.97       144
weighted avg       0.97      0.97      0.97       144



In [13]:
# Confusion matrix of true vs. predicted labels on the test set,
# followed by the raw label vectors for visual comparison.
test_cm = confusion_matrix(y_test, yhatTest)

print('----------------Confusion Matrix (Test)------------------')
print(test_cm)
print('Input data:  ', np.array(y_test), sep='')
print('Prediction:        ', yhatTest, sep='')

----------------Confusion Matrix (Test)------------------
[[9 4 5]
 [2 1 3]
 [7 3 3]]
Input data:  [0 1 0 2 0 2 1 0 1 0 2 2 1 2 2 0 1 2 0 0 0 2 1 0 0 2 0 2 2 0 0 0 2 2 0 0 0]
Prediction:        [0 1 0 0 2 2 2 1 0 0 1 0 0 2 0 1 2 2 0 2 1 1 2 1 0 0 0 0 1 2 0 2 0 0 0 2 0]


In [14]:
# Per-class precision / recall / F1 on the test set.
test_report = classification_report(y_test, yhatTest)
print(test_report)

              precision    recall  f1-score   support

           0       0.50      0.50      0.50        18
           1       0.12      0.17      0.14         6
           2       0.27      0.23      0.25        13

    accuracy                           0.35        37
   macro avg       0.30      0.30      0.30        37
weighted avg       0.36      0.35      0.35        37

