# Classification model using Multilayer Perceptron

### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

### 2. Upload Clinic data

In [2]:
path ="../../../../Data_preprocessing/RNA_post_autoencoder/encoded_data_multiclass_43_review.csv"
data = pd.read_csv(path)
data.reset_index
data.round(4)
data=data.iloc[:,1:45]

In [3]:
Y = data.Target # Target column

X = data.iloc[:,1:439] # I selected all the columns by removing the Unnamed column (row id) and the Target column.


print(X)
print('Numero de pacientes: ',len(Y))

            0         1         2         3         4         5         6  \
0    1.303475  0.320223  0.691620 -0.715220  0.269225 -0.053473 -0.916276   
1    1.040682  0.433916 -0.177336 -0.800547  1.629002 -0.725473 -0.681308   
2   -0.559200  0.636671 -0.067989 -1.036499 -0.306244  0.535286  0.646122   
3    1.391767  0.104729 -0.063120 -1.088009  0.555148 -0.698760 -0.125830   
4    1.621300  0.988073 -0.638373 -1.677556  0.288400  0.137920 -0.201170   
..        ...       ...       ...       ...       ...       ...       ...   
176  0.312103  0.656807 -0.522374 -0.846966  0.896216 -0.312821 -0.150635   
177  0.234944 -0.094377  0.260074 -0.793957  0.243106 -0.228022  0.059169   
178  0.420867  0.195804 -0.247219 -0.115416  0.472313 -0.002184  0.153051   
179  0.508123  0.810161 -0.569704 -0.666671  1.194661 -0.734187  0.211813   
180 -0.029691 -0.012050  0.024318 -0.584142  0.235847 -0.015007  0.447715   

            7         8         9  ...        33        34        35  \
0  

### 3. Train-Test dataset split

In [4]:
XTrain, XTest, yTrain, yTest = train_test_split(X, Y, test_size=0.20, random_state=125, stratify=Y)
yTrain=yTrain.to_numpy()
yTest=yTest.to_numpy()
print('Training set size:', len(XTrain))
print('Target column size of the training set:', len(yTrain))
print('Test set size:', len(XTest))
print('Target column size of the test set:', len(yTest))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


In [5]:
param_grid = {'max_iter': [200000, 500000, 10000000],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'random_state': [125],
            'max_fun': [300, 500,1000, 5000, 10000, 15000, 20000],
            'hidden_layer_sizes': [3,5],
            'solver': ['lbfgs']}

# I created a GridSearchCV which allows us to systematically evaluate and select the parameters of our model.
# By indicating a model and the parameters to test, you can evaluate the performance of the first one based on the
# seconds through cross validation.
clf = GridSearchCV(MLPClassifier(), param_grid, cv =5)

clf.fit(XTrain , yTrain)


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  sel

GridSearchCV(cv=5, estimator=MLPClassifier(),
             param_grid={'activation': ['identity', 'logistic', 'tanh', 'relu'],
                         'hidden_layer_sizes': [3, 5],
                         'max_fun': [300, 500, 1000, 5000, 10000, 15000, 20000],
                         'max_iter': [200000, 500000, 10000000],
                         'random_state': [125], 'solver': ['lbfgs']})

In [6]:
print("Best estimate of parameters according to GridSearchCV:")
print(clf.best_estimator_)
# Fit the model with the best parameters
model = clf.best_estimator_

Best estimate of parameters according to GridSearchCV:
MLPClassifier(activation='logistic', hidden_layer_sizes=3, max_fun=300,
              max_iter=200000, random_state=125, solver='lbfgs')


In [7]:
print("Best result of the cross validation of the model with the best paramters:" +str(clf.best_score_))

Best result of the cross validation of the model with the best paramters:0.4379310344827586


### 5. Prediction

In [8]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(XTrain)
contTrain = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTrain),1) :
    if (yhatTrain[i] == yTrain[i]):
        contTrain = contTrain + 1

In [9]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(XTest)
contTest = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTest),1) :
    if (yhatTest[i] == yTest[i]):
        contTest = contTest + 1


### 6. Results

In [10]:
print('Final accuracy on the training dataset:' + str(contTrain/len(yTrain)))
print('Final accuracy on the testing dataset: ' + str(contTest/len(yTest)))

Final accuracy on the training dataset:0.7986111111111112
Final accuracy on the testing dataset: 0.32432432432432434


In [11]:
from sklearn.metrics import classification_report,confusion_matrix

print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(yTrain,yhatTrain))
print('Input data:  ' + str(np.array(yTrain)))
print('Prediction:        ' +str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[54  7  6]
 [ 2 18  5]
 [ 8  1 43]]
Input data:  [1 0 2 0 2 2 0 0 2 0 2 0 1 0 2 2 0 2 2 2 0 0 0 2 0 0 0 0 1 1 2 2 2 2 0 1 2
 0 0 2 1 2 2 1 0 0 2 2 1 0 0 0 2 0 0 2 0 2 1 2 0 2 1 0 0 0 0 2 2 2 0 2 0 0
 0 2 0 2 0 0 0 1 2 0 0 1 2 2 1 1 0 2 0 2 0 0 1 0 1 0 0 2 0 0 2 2 2 1 0 2 2
 1 2 0 1 0 0 0 2 0 1 0 2 0 0 2 1 1 0 0 0 2 0 1 0 0 0 2 2 2 1 2 0 1]
Prediction:        [1 0 2 0 2 0 0 0 0 0 2 0 2 0 2 2 2 2 2 2 0 0 0 2 0 0 0 0 1 2 2 2 2 2 0 1 2
 0 0 2 0 2 2 2 0 0 0 0 1 0 0 0 0 0 0 0 1 2 1 0 0 2 1 0 0 0 0 0 2 2 0 2 1 2
 2 2 0 2 0 0 2 1 2 0 0 1 2 2 2 0 1 2 0 2 1 2 1 0 1 0 1 2 0 0 2 2 2 1 0 2 2
 1 2 1 1 0 0 0 1 2 1 0 2 0 0 2 1 1 0 0 0 2 0 1 0 0 0 2 2 2 1 2 1 2]


In [12]:
print(classification_report(yTrain,yhatTrain))

              precision    recall  f1-score   support

           0       0.84      0.81      0.82        67
           1       0.69      0.72      0.71        25
           2       0.80      0.83      0.81        52

    accuracy                           0.80       144
   macro avg       0.78      0.78      0.78       144
weighted avg       0.80      0.80      0.80       144



In [13]:
print('----------------Confusion Matrix (Test)------------------')
print(confusion_matrix(yTest,yhatTest))
print('Input data:  ' + str(np.array(yTest)))
print('Prediction:        ' +str(yhatTest))

----------------Confusion Matrix (Test)------------------
[[7 8 3]
 [4 0 2]
 [8 0 5]]
Input data:  [0 1 0 2 0 2 1 0 1 0 2 2 1 2 2 0 1 2 0 0 0 2 1 0 0 2 0 2 2 0 0 0 2 2 0 0 0]
Prediction:        [0 0 1 0 2 2 2 0 2 1 0 0 0 0 2 1 0 0 1 1 1 2 0 1 0 2 0 0 2 0 2 0 0 0 1 2 0]


In [14]:
print(classification_report(yTest,yhatTest))

              precision    recall  f1-score   support

           0       0.37      0.39      0.38        18
           1       0.00      0.00      0.00         6
           2       0.50      0.38      0.43        13

    accuracy                           0.32        37
   macro avg       0.29      0.26      0.27        37
weighted avg       0.35      0.32      0.34        37

