# Classification model using Logistic Regression

### 1. Import libraries

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Load the autoencoder-encoded RNA data

In [16]:
# Location of the autoencoder-encoded RNA dataset, relative to this notebook.
path = "../../../../Data_preprocessing/RNA_post_autoencoder/encoded_data_multiclass_43.csv"

# Read the CSV and drop its first column, keeping every remaining column.
# NOTE(review): the first column is presumably the row id saved with the file — confirm.
# The earlier version also had `data.reset_index` (attribute access, never called) and
# `data.round(4)` (result discarded). Neither changed `data`, so both were removed.
# The hard-coded upper bound 440 was replaced by an open-ended slice: the file holds far
# fewer columns, so the bound never took effect.
data = pd.read_csv(path).iloc[:, 1:]
data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,33,34,35,36,37,38,39,40,41,42
0,2,0.022976,-0.96531,-1.23871,-0.9806,-1.223086,-0.737987,-0.887323,-0.675689,-0.028122,...,0.269411,-0.928447,-0.028176,0.111155,-0.279341,0.982918,-0.837571,-0.242425,-0.231228,0.046297
1,1,-0.151407,-0.861814,-0.970073,-0.646467,0.316065,0.509519,-0.708765,-0.422852,-0.436029,...,-0.513224,0.174376,0.292357,1.021724,1.833745,1.062946,0.196452,0.012724,-0.118445,0.739934
2,2,-0.06821,-1.288656,-1.196822,-0.350951,-0.518477,0.425649,-0.675744,0.307187,-0.279795,...,-0.291583,-0.543025,0.626299,0.95717,0.78782,0.170295,0.837617,-0.622304,-0.296431,-0.324272
3,0,-0.392235,-0.980445,-0.998436,-0.137651,0.285102,0.807072,-0.229599,0.20383,-0.296264,...,-0.53684,0.085051,0.21271,0.40995,1.947804,1.063359,-0.320035,-0.099703,-0.466355,0.805688
4,1,0.263224,0.673038,-1.974711,1.167531,-1.717946,1.500515,-1.023599,-0.077397,-0.726597,...,-0.237098,-0.97087,-1.468881,0.098396,1.068838,-0.387483,-0.999648,0.46018,-1.353426,-0.283846


In [17]:
# Labels: the `Target` column of the loaded frame.
Y = data["Target"]

# Features: the columns at positions 1 to 438 of `data` (fewer if the frame is narrower),
# i.e. everything after the first column. The row-id column was already dropped when the
# file was loaded.
# NOTE(review): the skipped first column is presumably `Target` — confirm against the CSV.
X = data.iloc[:, 1:439]

# Show the feature matrix and the number of samples (patients).
print(X)
print('Numero de pacientes: ', len(Y))

            0         1         2         3         4         5         6  \
0    0.022976 -0.965310 -1.238710 -0.980600 -1.223086 -0.737987 -0.887323   
1   -0.151407 -0.861814 -0.970073 -0.646467  0.316065  0.509519 -0.708765   
2   -0.068210 -1.288656 -1.196822 -0.350951 -0.518477  0.425649 -0.675744   
3   -0.392235 -0.980445 -0.998436 -0.137651  0.285102  0.807072 -0.229599   
4    0.263224  0.673038 -1.974711  1.167531 -1.717946  1.500515 -1.023599   
..        ...       ...       ...       ...       ...       ...       ...   
176 -0.152809 -0.701253 -1.527834  0.090276 -0.717189  0.777892 -0.739791   
177 -0.569668 -0.485702 -1.285647 -0.943146 -0.530570 -0.204337 -0.205112   
178 -0.196661 -0.540016 -1.396883  0.021088 -1.211431  0.308100 -0.929356   
179 -0.007758 -0.201605 -1.513765 -0.789999 -0.198751  0.963480 -0.752608   
180  0.007880 -1.269928 -1.413523 -1.124486 -0.896541 -0.050075 -0.325665   

            7         8         9  ...        33        34        35  \
0  

In [18]:
# Hold out 20% of the samples for testing. The split is stratified on the labels and uses
# a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=125,
    stratify=Y,
)

# NumPy copies of the label Series (positional indexing, no pandas index attached).
yTrain = y_train.to_numpy()
yTest = y_test.to_numpy()

# Report the size of each partition.
print(f'Training set size: {len(X_train)}')
print(f'Target column size of the training set: {len(y_train)}')
print(f'Test set size: {len(X_test)}')
print(f'Target column size of the test set: {len(y_test)}')

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [29]:
# Candidate hyperparameter values for the logistic-regression classifier.
param_grid = dict(
    C=[1, 2, 5, 7, 10],
    solver=['newton-cg', 'liblinear'],
    max_iter=[
        100, 200, 300, 400, 500, 600, 700, 800, 900, 1000,
        2000, 3000, 4000, 5000, 20000,
    ],
    random_state=[125],
)

# GridSearchCV evaluates and selects model parameters systematically: given an estimator
# and the parameter values to try, it scores every combination with cross-validation
# (5 folds here).
clf = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 2, 5, 7, 10],
                         'max_iter': [100, 200, 300, 400, 500, 600, 700, 800,
                                      900, 1000, 2000, 3000, 4000, 5000,
                                      20000],
                         'random_state': [125],
                         'solver': ['newton-cg', 'liblinear']})

In [30]:
print("Best estimate of parameters according to GridSearchCV:")
# The search was created without overriding `refit`, so `best_estimator_` has already been
# refit on the whole training set with the winning parameters. The extra `model.fit(...)`
# call that used to follow was redundant and has been dropped.
model = clf.best_estimator_
# Last expression of the cell: displays the selected estimator and its parameters.
model

Best estimate of parameters according to GridSearchCV:


LogisticRegression(C=1, random_state=125, solver='liblinear')

In [31]:
# Mean cross-validated score obtained by the best parameter combination.
print("Best result of the cross validation of the model with the best paramters:", clf.best_score_, sep="")

Best result of the cross validation of the model with the best paramters:0.520935960591133


In [33]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(X_train)
contTrain = 0
# y_train=y_train.to_numpy()
# Comparing with the Target column and check how many hits there have been
for i in range(0,len(y_train),1) :
    if (yhatTrain[i] == y_train[i]):
        contTrain = contTrain + 1

In [35]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(X_test)
contTest = 0
# y_test=y_test.to_numpy()
# Comparing with the Target column and check how many hits there have been
for i in range(0,len(y_test),1) :
    if (yhatTest[i] == y_test[i]):
        contTest = contTest + 1

In [36]:
# Accuracy = number of correct predictions divided by the number of samples in each partition.
print('Final accuracy on the training dataset:', contTrain / len(y_train), sep='')
print('Final accuracy on the testing dataset: ', contTest / len(y_test), sep='')

Final accuracy on the training dataset:0.6875
Final accuracy on the testing dataset: 0.2972972972972973


In [25]:
# These names are also imported in the first cell; the import is kept so this cell runs on its own.
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix for the training-set predictions, followed by the raw true labels
# and the predicted labels for visual comparison.
print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(y_true=y_train, y_pred=yhatTrain))
print('Input data:  ', np.asarray(y_train), sep='')
print('Prediction:        ', yhatTrain, sep='')

----------------Confusion Matrix (Training)------------------
[[58  0  9]
 [ 8  8  9]
 [16  3 33]]
Input data:  [1 0 2 0 2 2 0 0 2 0 2 0 1 0 2 2 0 2 2 2 0 0 0 2 0 0 0 0 1 1 2 2 2 2 0 1 2
 0 0 2 1 2 2 1 0 0 2 2 1 0 0 0 2 0 0 2 0 2 1 2 0 2 1 0 0 0 0 2 2 2 0 2 0 0
 0 2 0 2 0 0 0 1 2 0 0 1 2 2 1 1 0 2 0 2 0 0 1 0 1 0 0 2 0 0 2 2 2 1 0 2 2
 1 2 0 1 0 0 0 2 0 1 0 2 0 0 2 1 1 0 0 0 2 0 1 0 0 0 2 2 2 1 2 0 1]
Prediction:        [0 0 2 2 0 0 0 0 2 0 2 2 0 0 0 2 0 0 0 2 0 0 0 2 0 0 0 0 2 0 2 2 0 0 0 0 2
 0 0 1 0 2 2 2 2 0 0 2 2 0 0 0 0 0 0 2 0 2 2 2 0 0 2 0 0 0 0 2 2 2 2 1 0 0
 0 2 0 0 0 0 0 1 0 0 0 1 2 0 1 1 0 2 0 2 0 2 1 0 1 2 2 2 0 0 2 1 0 2 0 2 2
 2 2 0 0 0 0 2 0 0 1 0 2 0 0 2 1 2 0 2 0 2 0 0 0 0 0 2 0 2 0 2 0 2]


In [26]:
# Per-class precision, recall and F1 for the training-set predictions.
print(classification_report(y_true=y_train, y_pred=yhatTrain))

              precision    recall  f1-score   support

           0       0.71      0.87      0.78        67
           1       0.73      0.32      0.44        25
           2       0.65      0.63      0.64        52

    accuracy                           0.69       144
   macro avg       0.69      0.61      0.62       144
weighted avg       0.69      0.69      0.67       144



In [27]:
# Confusion matrix for the test-set predictions, followed by the raw true labels
# and the predicted labels for visual comparison.
print('----------------Confusion Matrix (Test)------------------')
print(confusion_matrix(y_true=y_test, y_pred=yhatTest))
print('Input data:  ', np.asarray(y_test), sep='')
print('Prediction:        ', yhatTest, sep='')

----------------Confusion Matrix (Test)------------------
[[ 6  2 10]
 [ 4  0  2]
 [ 8  0  5]]
Input data:  [0 1 0 2 0 2 1 0 1 0 2 2 1 2 2 0 1 2 0 0 0 2 1 0 0 2 0 2 2 0 0 0 2 2 0 0 0]
Prediction:        [2 0 1 0 2 0 0 0 0 2 0 0 0 2 2 2 2 0 1 2 0 0 2 2 0 2 2 2 0 0 0 2 2 0 2 2 0]


In [28]:
# Per-class precision, recall and F1 for the held-out test-set predictions.
print(classification_report(y_true=y_test, y_pred=yhatTest))

              precision    recall  f1-score   support

           0       0.33      0.33      0.33        18
           1       0.00      0.00      0.00         6
           2       0.29      0.38      0.33        13

    accuracy                           0.30        37
   macro avg       0.21      0.24      0.22        37
weighted avg       0.27      0.30      0.28        37

