# Classification model using Logistic Regression

### 1. Import libraries

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Load the encoded RNA data

In [24]:
# Read the encoded multiclass dataset from a CSV file; the path is relative to
# the directory the notebook is run from.
path = "../../../../Data_preprocessing/RNA_post_autoencoder/encoded_data_multiclass_200.csv"
data = pd.read_csv(path)
# Keep the columns at positions 1..201, i.e. drop the first CSV column
# (presumably a saved row index - confirm against the file).
# Values stay at full precision and the default index is kept: a bare
# `data.reset_index` (no call) or an unassigned `data.round(4)` would not
# modify `data`, so no such statements appear here.
data = data.iloc[:, 1:202]
data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,2,0.436414,-1.576482,2.176638,1.04665,-0.967788,-0.33276,-0.586029,1.43057,-0.895583,...,-0.92876,0.075799,-0.263159,-0.65543,1.285033,1.158913,-0.501004,-1.361494,0.612884,0.746754
1,1,-0.032149,-0.995621,1.88806,1.284731,0.07914,0.064063,-0.199813,1.521006,0.564534,...,0.483468,-0.893442,0.575146,-0.778792,0.32417,0.177823,-1.233956,-1.483099,-0.341608,-0.996777
2,2,0.476933,-2.010775,1.879018,0.38309,0.207985,-0.992632,0.020097,0.387833,-1.122965,...,0.017044,0.602836,-1.916266,1.144584,1.06957,0.53593,-0.897938,-1.371405,-0.231251,-0.27436
3,0,0.547395,-0.711453,1.527116,1.052659,0.077064,0.006284,-0.112697,1.186751,0.211799,...,0.633414,-0.52401,0.202187,-0.838269,-0.029785,0.492356,-1.486459,-1.970962,0.530672,-0.985897
4,1,-0.624718,-2.406139,2.824231,-0.012635,-1.314704,-0.748785,0.687588,2.262579,-0.263327,...,0.45467,-0.789785,-0.452545,1.738423,0.543774,0.753638,-1.839001,-0.347618,0.589409,-1.081583


In [25]:
# Label vector: the class of every sample.
Y = data["Target"]

# Feature matrix: every column from position 1 onwards. Position 0 is skipped
# (presumably the Target column, since the row-id column was already dropped
# when `data` was loaded - confirm against the frame's column order).
X = data.iloc[:, 1:202]

print(X)
print('Numero de pacientes: ', len(Y))

            0         1         2         3         4         5         6  \
0    0.436414 -1.576482  2.176638  1.046650 -0.967788 -0.332760 -0.586029   
1   -0.032149 -0.995621  1.888060  1.284731  0.079140  0.064063 -0.199813   
2    0.476933 -2.010775  1.879018  0.383090  0.207985 -0.992632  0.020097   
3    0.547395 -0.711453  1.527116  1.052659  0.077064  0.006284 -0.112697   
4   -0.624718 -2.406139  2.824231 -0.012635 -1.314704 -0.748785  0.687588   
..        ...       ...       ...       ...       ...       ...       ...   
176  0.152039 -1.448956  1.631890  0.438465 -0.797802 -0.450635 -0.031156   
177  1.139950 -1.364696  0.767903  0.954542  0.358079 -0.439770  0.195097   
178  0.291298 -2.023891  1.954019  1.072944 -0.490579 -0.466329  0.094765   
179  0.102469 -0.451807  1.057516  0.105644 -0.375693 -0.814802  0.114747   
180  0.794935 -1.928761  0.771840  0.929302 -0.237190 -0.890115  0.127341   

            7         8         9  ...       190       191       192  \
0  

In [26]:
# Stratified hold-out split: 20% of the samples go to the test set, class
# proportions follow Y, and the seed is fixed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=125,
    stratify=Y,
)
# NumPy versions of the label vectors.
yTrain = y_train.to_numpy()
yTest = y_test.to_numpy()

# Report the number of rows in each piece of the split.
print('Training set size:', X_train.shape[0])
print('Target column size of the training set:', y_train.shape[0])
print('Test set size:', X_test.shape[0])
print('Target column size of the test set:', y_test.shape[0])

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [27]:
# Candidate hyper-parameter values; every combination is evaluated.
# NOTE(review): max_iter looks like an iteration budget rather than a knob that
# changes the fitted solution - confirm whether searching over it is intended.
param_grid = {
    'C': [1, 2, 5, 7, 10],
    'solver': ['newton-cg', 'liblinear'],
    'max_iter': list(range(100, 1001, 100)) + [2000, 3000, 4000, 5000, 20000],
    'random_state': [125],
}

# GridSearchCV evaluates and selects the model's parameters systematically:
# given an estimator and the parameter values to try, it scores every
# combination with cross-validation (5 folds here).
clf = GridSearchCV(LogisticRegression(), param_grid, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 2, 5, 7, 10],
                         'max_iter': [100, 200, 300, 400, 500, 600, 700, 800,
                                      900, 1000, 2000, 3000, 4000, 5000,
                                      20000],
                         'random_state': [125],
                         'solver': ['newton-cg', 'liblinear']})

In [28]:
print("Best estimate of parameters according to GridSearchCV:")
# With the default refit=True, GridSearchCV has already retrained the winning
# parameter combination on the whole training set, so best_estimator_ is a
# fitted model and a second fit on (X_train, y_train) is unnecessary.
model = clf.best_estimator_
# Show the selected estimator and its hyper-parameters as the cell output.
model

Best estimate of parameters according to GridSearchCV:


LogisticRegression(C=1, random_state=125, solver='newton-cg')

In [29]:
# Cross-validation score obtained by the best parameter combination.
print("Best result of the cross validation of the model with the best paramters:", clf.best_score_, sep="")

Best result of the cross validation of the model with the best paramters:0.45197044334975367


In [30]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(X_train)
# np.asarray accepts both a pandas Series and an ndarray, so this cell can be
# re-run safely (calling .to_numpy() a second time would fail on an ndarray).
y_train = np.asarray(y_train)
# Number of hits: positions where the prediction equals the true label.
contTrain = int(np.count_nonzero(yhatTrain == y_train))

In [31]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(X_test)
# np.asarray accepts both a pandas Series and an ndarray, so this cell can be
# re-run safely (calling .to_numpy() a second time would fail on an ndarray).
y_test = np.asarray(y_test)
# Number of hits: positions where the prediction equals the true label.
contTest = int(np.count_nonzero(yhatTest == y_test))

In [32]:
# Accuracy per split = number of correct predictions / number of samples.
print('Final accuracy on the training dataset:', contTrain / len(y_train), sep='')
print('Final accuracy on the testing dataset: ', contTest / len(y_test), sep='')

Final accuracy on the training dataset:0.9791666666666666
Final accuracy on the testing dataset: 0.43243243243243246


In [33]:
# confusion_matrix and classification_report are already imported in the
# notebook's import cell, so no import is repeated here.
# Print the confusion matrix for the training split, then the true labels and
# the predicted labels so individual samples can be compared.
print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(y_train, yhatTrain))
print('Input data:  ' + str(np.array(y_train)))
print('Prediction:        ' + str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[67  0  0]
 [ 1 24  0]
 [ 2  0 50]]
Input data:  [1 0 2 0 2 2 0 0 2 0 2 0 1 0 2 2 0 2 2 2 0 0 0 2 0 0 0 0 1 1 2 2 2 2 0 1 2
 0 0 2 1 2 2 1 0 0 2 2 1 0 0 0 2 0 0 2 0 2 1 2 0 2 1 0 0 0 0 2 2 2 0 2 0 0
 0 2 0 2 0 0 0 1 2 0 0 1 2 2 1 1 0 2 0 2 0 0 1 0 1 0 0 2 0 0 2 2 2 1 0 2 2
 1 2 0 1 0 0 0 2 0 1 0 2 0 0 2 1 1 0 0 0 2 0 1 0 0 0 2 2 2 1 2 0 1]
Prediction:        [1 0 2 0 0 2 0 0 2 0 2 0 1 0 2 2 0 2 0 2 0 0 0 2 0 0 0 0 1 1 2 2 2 2 0 1 2
 0 0 2 0 2 2 1 0 0 2 2 1 0 0 0 2 0 0 2 0 2 1 2 0 2 1 0 0 0 0 2 2 2 0 2 0 0
 0 2 0 2 0 0 0 1 2 0 0 1 2 2 1 1 0 2 0 2 0 0 1 0 1 0 0 2 0 0 2 2 2 1 0 2 2
 1 2 0 1 0 0 0 2 0 1 0 2 0 0 2 1 1 0 0 0 2 0 1 0 0 0 2 2 2 1 2 0 1]


In [34]:
# Per-class precision / recall / F1 summary for the training split.
print(classification_report(y_true=y_train, y_pred=yhatTrain))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        67
           1       1.00      0.96      0.98        25
           2       1.00      0.96      0.98        52

    accuracy                           0.98       144
   macro avg       0.99      0.97      0.98       144
weighted avg       0.98      0.98      0.98       144



In [35]:
# Print the confusion matrix for the test split, then the true labels and the
# predicted labels so individual samples can be compared.
print('----------------Confusion Matrix (Test)------------------')
print(confusion_matrix(y_true=y_test, y_pred=yhatTest))
print('Input data:  ', np.array(y_test), sep='')
print('Prediction:        ', yhatTest, sep='')

----------------Confusion Matrix (Test)------------------
[[11  1  6]
 [ 3  1  2]
 [ 6  3  4]]
Input data:  [0 1 0 2 0 2 1 0 1 0 2 2 1 2 2 0 1 2 0 0 0 2 1 0 0 2 0 2 2 0 0 0 2 2 0 0 0]
Prediction:        [0 0 0 0 2 2 2 0 1 0 2 2 0 2 0 0 0 0 0 2 1 0 2 2 0 1 0 1 1 2 0 0 0 0 2 2 0]


In [36]:
# Per-class precision / recall / F1 summary for the test split.
print(classification_report(y_true=y_test, y_pred=yhatTest))

              precision    recall  f1-score   support

           0       0.55      0.61      0.58        18
           1       0.20      0.17      0.18         6
           2       0.33      0.31      0.32        13

    accuracy                           0.43        37
   macro avg       0.36      0.36      0.36        37
weighted avg       0.42      0.43      0.42        37

