# Classification model using Multilayer Perceptron

### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

### 2. Upload Clinic data

In [2]:
path ="../../../../Data_preprocessing/RNA_post_autoencoder/encoded_data_binary_43.csv"
data = pd.read_csv(path)
data.reset_index
data.round(4)
data=data.iloc[:,1:440  ] 
data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,33,34,35,36,37,38,39,40,41,42
0,1,0.153994,-0.164904,-0.452284,0.206749,-0.857082,-0.603124,-0.473103,-0.075513,0.05475,...,-0.0376,-0.128832,-0.385064,0.116692,1.490244,1.05952,-1.272441,0.11096,-0.101009,0.161909
1,1,-1.300332,-0.080441,-0.885911,-0.168431,-0.529857,0.184833,-0.464027,-0.361125,0.125864,...,-0.44348,-0.222192,-0.44074,-0.388554,0.997358,0.816687,-1.113903,0.362072,-0.397896,1.188868
2,1,-0.876637,0.587383,-0.925414,0.488304,-0.181991,0.184868,-0.708685,1.060296,0.210958,...,0.440405,0.441728,0.479298,-0.410674,-0.082885,1.063791,-0.764444,0.862694,0.316916,1.723327
3,0,-1.598357,0.092854,-0.4734,-0.009476,-1.107011,0.699234,-1.599456,-0.22061,0.247359,...,-0.878015,-0.766315,-0.725781,-0.124278,0.621617,1.027331,-1.433755,0.882461,0.018591,1.787526
4,1,-1.735836,0.207939,-0.211755,-0.703383,-0.802869,-0.273099,-1.267936,-0.063031,-0.612482,...,-0.671282,1.156629,1.303665,0.926794,0.740702,-0.010802,0.001432,1.500137,0.100971,0.812671


In [3]:
Y = data.Target # Target column

X = data.iloc[:,1:439] # I selected all the columns by removing the Unnamed column (row id) and the Target column.

print(X)
print('Numero de pacientes: ',len(Y))

            0         1         2         3         4         5         6  \
0    0.153994 -0.164904 -0.452284  0.206749 -0.857082 -0.603124 -0.473103   
1   -1.300332 -0.080441 -0.885911 -0.168431 -0.529857  0.184833 -0.464027   
2   -0.876637  0.587383 -0.925414  0.488304 -0.181991  0.184868 -0.708685   
3   -1.598357  0.092854 -0.473400 -0.009476 -1.107011  0.699234 -1.599456   
4   -1.735836  0.207939 -0.211755 -0.703383 -0.802869 -0.273099 -1.267936   
..        ...       ...       ...       ...       ...       ...       ...   
176 -1.222742  0.033713 -0.588393 -0.369282 -0.914620  0.613994 -1.088500   
177 -0.916295  0.632799  0.010232  1.049555 -0.321448 -0.182999 -1.356915   
178 -0.499931  0.487778 -0.365115 -0.074394 -0.231141  0.391114 -0.622813   
179 -0.790637  0.102222 -1.135601 -0.301596 -0.794871  0.188501 -0.760856   
180 -0.297094  0.420879 -0.209357  0.319454 -0.118073 -0.411617 -1.117591   

            7         8         9  ...        33        34        35  \
0  

### 3. Train-Test dataset split

In [4]:
XTrain, XTest, yTrain, yTest = train_test_split(X, Y, test_size=0.20, random_state=125, stratify=Y)

print('Training set size:', len(XTrain))
print('Target column size of the training set:', len(yTrain))
print('Test set size:', len(XTest))
print('Target column size of the test set:', len(yTest))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [5]:
param_grid = {'max_iter': [30000, 50000,10000000],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'random_state': [125],
            'max_fun': [ 500,1000, 5000, 10000, 15000, 20000],
            'hidden_layer_sizes': [1,2,3]}

# I created a GridSearchCV which allows us to systematically evaluate and select the parameters of our model.
# By indicating a model and the parameters to test, you can evaluate the performance of the first one based on the
# seconds through cross validation.
clf = GridSearchCV(MLPClassifier(), param_grid, cv =5)

clf.fit(XTrain , yTrain)



GridSearchCV(cv=5, estimator=MLPClassifier(),
             param_grid={'activation': ['identity', 'logistic', 'tanh', 'relu'],
                         'hidden_layer_sizes': [1, 2, 3],
                         'max_fun': [500, 1000, 5000, 10000, 15000, 20000],
                         'max_iter': [30000, 50000, 10000000],
                         'random_state': [125]})

In [6]:
print("Best estimate of parameters according to GridSearchCV:")
model = clf.best_estimator_
# Fit the model with the best parameters
model.fit(XTrain , yTrain)

Best estimate of parameters according to GridSearchCV:


MLPClassifier(activation='tanh', hidden_layer_sizes=2, max_fun=500,
              max_iter=30000, random_state=125)

In [7]:
print("Best result of the cross validation of the model with the best paramters:" +str(clf.best_score_))

Best result of the cross validation of the model with the best paramters:0.6327586206896552


In [8]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(XTrain)
contTrain = 0
yTrain=yTrain.to_numpy()
# yTrain=yTrain.to_numpy()
# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTrain),1) :
    if (yhatTrain[i] == yTrain[i]):
        contTrain = contTrain + 1

In [9]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(XTest)
contTest = 0
yTest=yTest.to_numpy()

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTest),1) :
    if (yhatTest[i] == yTest[i]):
        contTest = contTest + 1


In [10]:
print('Final accuracy on the training dataset:' + str(contTrain/len(yTrain)))
print('Final accuracy on the testing dataset: ' + str(contTest/len(yTest)))

Final accuracy on the training dataset:0.8263888888888888
Final accuracy on the testing dataset: 0.6486486486486487


In [11]:
from sklearn.metrics import classification_report,confusion_matrix

print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(yTrain,yhatTrain))
print('Input data:  ' + str(np.array(yTrain)))
print('Prediction:        ' +str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[64  4]
 [21 55]]
Input data:  [1 0 1 0 1 1 0 0 1 0 1 0 1 0 1 1 0 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 1
 0 0 1 1 1 1 1 0 0 1 1 1 0 0 0 1 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 1 0 1 0 0
 0 1 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 1
 1 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 1 0 1 0 0 0 1 1 1 1 1 0 1]
Prediction:        [1 0 1 0 1 1 0 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 0
 0 0 1 1 1 1 1 0 1 1 1 1 0 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0 0 0 1 1 1 0 1 0 0
 0 0 0 1 0 0 0 1 1 0 0 1 0 1 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1
 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1]


In [12]:
print(classification_report(yTrain,yhatTrain))

              precision    recall  f1-score   support

           0       0.75      0.94      0.84        68
           1       0.93      0.72      0.81        76

    accuracy                           0.83       144
   macro avg       0.84      0.83      0.83       144
weighted avg       0.85      0.83      0.83       144



In [13]:
print('----------------Confusion Matrix (Test)------------------')
print(confusion_matrix(yTest,yhatTest))
print('Input data:  ' + str(np.array(yTest)))
print('Prediction:        ' +str(yhatTest))

----------------Confusion Matrix (Test)------------------
[[ 8  9]
 [ 4 16]]
Input data:  [0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 0 1 0]
Prediction:        [1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 0 1 0 1 0 0 1 1 1 0 1 1 0]


In [14]:
print(classification_report(yTest,yhatTest))

              precision    recall  f1-score   support

           0       0.67      0.47      0.55        17
           1       0.64      0.80      0.71        20

    accuracy                           0.65        37
   macro avg       0.65      0.64      0.63        37
weighted avg       0.65      0.65      0.64        37

