# Classification model using Decision Tree

### 1. Import libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Upload Clinic data

In [6]:
path ="../../../Data_preprocessing/RNA_post_autoencoder/encoded_data_multiclass.csv"
data = pd.read_csv(path)
data.reset_index
data.round(4)
data=data.iloc[:,1:440  ] 
data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,428,429,430,431,432,433,434,435,436,437
0,1,-0.315205,-0.407536,0.531673,0.808408,-0.810542,0.193779,0.034024,-0.164311,0.014579,...,0.646639,-0.484226,0.024023,0.428195,-0.997664,-0.198929,0.558472,-0.71109,-0.625051,1.313173
1,0,-0.399706,-0.359715,0.589118,0.689009,-0.934274,0.590328,0.00047,-0.57949,-0.01404,...,0.367153,-0.127431,0.45308,0.222101,0.063686,-0.350376,0.585483,-0.723964,-0.614908,1.04182
2,2,-0.257777,0.017325,-0.369965,0.256681,-0.647283,-0.009628,0.178241,0.039518,-0.395371,...,-0.180099,0.149861,0.336687,0.759315,-0.011072,0.19597,0.454717,0.462148,-0.54828,0.754466
3,0,0.193269,-0.121839,-0.275106,0.06398,-0.25909,-0.19594,0.075242,0.029794,0.122252,...,0.021265,0.002938,0.010013,0.112763,0.223452,-0.481063,0.16617,0.020349,-0.113602,-0.069602
4,2,-0.054664,-0.326593,-0.267536,0.490474,-0.725889,-0.157597,0.261997,0.182627,-0.072347,...,0.146497,-0.214596,0.316881,0.412167,0.322099,-0.438647,0.382818,-0.143397,-0.46916,0.3822


In [7]:
Y = data.Target # Target column

X = data.iloc[:,1:439] # I selected all the columns by removing the Unnamed column (row id) and the Target column.


print(X)
print('Numero de pacientes: ',len(Y))

            0         1         2         3         4         5         6  \
0   -0.315205 -0.407536  0.531673  0.808408 -0.810542  0.193779  0.034024   
1   -0.399706 -0.359715  0.589118  0.689009 -0.934274  0.590328  0.000470   
2   -0.257777  0.017325 -0.369965  0.256681 -0.647283 -0.009628  0.178241   
3    0.193269 -0.121839 -0.275106  0.063980 -0.259090 -0.195940  0.075242   
4   -0.054664 -0.326593 -0.267536  0.490474 -0.725889 -0.157597  0.261997   
..        ...       ...       ...       ...       ...       ...       ...   
176  0.015191 -0.250067 -0.247737  0.428315 -0.673884 -0.122022  0.161610   
177  0.025750 -0.248474 -0.427581  0.390085 -0.543150 -0.102631  0.184679   
178 -0.380961 -0.304403  0.498039  0.549402  0.040175  0.282321 -0.007317   
179 -0.394782 -0.050161  0.336075  0.524383 -0.065044  0.317751 -0.014208   
180 -0.000122 -0.274288 -0.315658  0.305076 -0.167984 -0.064459  0.178613   

            7         8         9  ...       428       429       430  \
0  

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=125, stratify=Y)
yTrain=y_train.to_numpy()
yTest=y_test.to_numpy()

print('Training set size:', len(X_train))
print('Target column size of the training set:', len(y_train))
print('Test set size:', len(X_test))
print('Target column size of the test set:', len(y_test))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [9]:
param_grid = {'C': [1,2,5,7,10],
            'solver': ['newton-cg','liblinear'],  
            'max_iter':[20000,50000,100000],
            'random_state':[125]}

# Creamos un GridSearchCV que permite evaluar y seleccionar de forma sistemática los parámetros de nuestro modelo. 
# Indicándole un modelo y los parámetros a probar, puede evaluar el rendimiento del primero en función de los 
# segundos mediante validación cruzada.  
clf = GridSearchCV(LogisticRegression(), param_grid, cv = 5)
clf.fit(X_train , y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 2, 5, 7, 10],
                         'max_iter': [20000, 50000, 100000],
                         'random_state': [125],
                         'solver': ['newton-cg', 'liblinear']})

In [10]:
print("Best estimate of parameters according to GridSearchCV:")
model = clf.best_estimator_
# Fit the model with the best parameters
model.fit(X_train , y_train)

Best estimate of parameters according to GridSearchCV:


LogisticRegression(C=1, max_iter=20000, random_state=125, solver='liblinear')

In [11]:
print("Best result of the cross validation of the model with the best paramters:" +str(clf.best_score_))

Best result of the cross validation of the model with the best paramters:0.354679802955665


In [12]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(X_train)
contTrain = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(y_train),1) :
    if (yhatTrain[i] == y_train[i]):
        contTrain = contTrain + 1

KeyError: 1

In [None]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(X_test)
contTest = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(y_test),1) :
    if (yhatTest[i] == y_test[i]):
        contTest = contTest + 1

In [None]:
print('Final accuracy on the training dataset:' + str(contTrain/len(y_train)))
print('Final accuracy on the testing dataset: ' + str(contTest/len(y_test)))

Final accuracy on the training dataset:0.6260869565217392
Final accuracy on the testing dataset: 0.6551724137931034


In [None]:
from sklearn.metrics import classification_report,confusion_matrix

print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(y_train,yhatTrain))
print('Input data:  ' + str(np.array(y_train)))
print('Prediction:        ' +str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[66  0  0]
 [22  5  0]
 [18  3  1]]
Input data:  [0 0 2 2 0 1 0 1 1 1 0 1 0 0 0 0 2 0 0 1 0 0 0 0 0 2 2 2 0 0 0 2 1 0 0 0 2
 1 0 0 0 0 2 1 1 0 1 0 1 0 0 2 1 2 0 0 0 0 1 0 2 2 1 0 0 0 0 2 0 2 2 0 0 0
 0 2 1 0 0 1 1 0 2 0 0 0 0 2 1 1 0 0 2 1 0 2 0 1 1 0 2 0 0 0 1 0 1 0 0 0 1
 0 1 0 0]
Prediction:        [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0]


In [None]:
print(classification_report(y_train,yhatTrain))

              precision    recall  f1-score   support

           0       0.62      1.00      0.77        66
           1       0.62      0.19      0.29        27
           2       1.00      0.05      0.09        22

    accuracy                           0.63       115
   macro avg       0.75      0.41      0.38       115
weighted avg       0.70      0.63      0.52       115



In [None]:
print('----------------Confusion Matrix (Test)------------------')
print(confusion_matrix(y_test,yhatTest))
print('Input data:  ' + str(np.array(y_test)))
print('Prediction:        ' +str(yhatTest))

----------------Confusion Matrix (Test)------------------
[[19  0  0]
 [ 4  0  0]
 [ 4  2  0]]
Input data:  [2 0 2 0 0 2 2 0 2 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 2 0 0 1 0]
Prediction:        [0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
print(classification_report(y_test, yhatTest))

              precision    recall  f1-score   support

           0       0.70      1.00      0.83        19
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         6

    accuracy                           0.66        29
   macro avg       0.23      0.33      0.28        29
weighted avg       0.46      0.66      0.54        29



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
