# Classification model using Multilayer Perceptron

### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

### 2. Upload RNA data

In [2]:
path ='../../../../Data_preprocessing/Prediction PFS/RNA+Clinic joined/New/Clinical_data_and_RNA_total_Features_PFS.csv'
data = pd.read_csv(path)
data.head(5)

Unnamed: 0.1,Unnamed: 0,RNA_ID,Age,Sarc,Rhab,Number_of_Prior_Therapies,Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy,Tumor_Shrinkage,PFS,TM_TC_Ratio,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,0,G138701_RCCBMS-00020-T_v1_RNA_OnPrem,62.0,0.0,0.0,2.0,962.0,-16.40625,12.164384,0.766909,...,33.26816,30.04056,30.36292,30.98129,32.38442,28.01561,31.24379,33.81903,33.22469,31.23383
1,1,G138701_RCCBMS-00097-T_v1_RNA_OnPrem,67.0,0.0,0.0,2.0,254.0,-40.0,5.490411,0.773269,...,31.53633,29.94062,30.80829,30.74475,32.0947,26.32526,32.70057,33.67262,33.10198,32.44643
2,2,G138701_RCCBMS-00141-T_v1_RNA_OnPrem,62.0,0.0,0.0,1.0,800.0,-89.43662,61.906849,1.782859,...,30.74474,31.95812,30.73898,30.15898,31.69473,26.33098,32.6833,33.96047,32.97135,32.20607
3,3,G138701_RCCBMS-00099-T_v1_RNA_OnPrem,60.0,0.0,0.0,3.0,790.0,11.787072,1.249315,5.890573,...,31.63196,30.71732,31.09654,30.25757,31.90774,27.53819,32.64194,34.54254,33.24885,32.78278
4,4,G138701_RCCBMS-00163-T_v1_RNA_OnPrem,76.0,0.0,0.0,2.0,1241.0,0.0,5.654795,1.374775,...,31.7562,30.75149,30.55896,30.17101,31.01545,21.45312,32.8662,34.24375,34.8672,32.73361


In [3]:
X = data[['AC005307.1', 'ATP5G2P1', 'DLGAP4', 'EIF4A1P9', 'FAM172BP', 'FAM224A', 'GJA9', 'GPR155', 'GYPE', 'IL25', 'KLHL5', 'LANCL1-AS1', 'LEMD1', 'PCMT1', 'RP11-120J4.1', 'RP11-20G6.3', 'RP11-349G13.2', 'RP11-374M1.2', 'RP11-536C10.21', 'RP11-95G17.2', 'RP11-96O20.4', 'RP5-827C21.1', 'RPS12P26', 'SDR42E1', 'SNORD113-5', 'TARBP2', 'TRAV41', 'TRIM43B', 'VN2R17P', 'XPC']] #Seleccionamos todas las columnas quitando la columna Unnamed (id de la fila ) y la columna Target.
Y = []
# X = data[['AC002486.3', 'AC003986.5', 'AC004158.3', 'AC004221.2', 'AC005392.13', 'AC005498.4', 'AC006296.2', 'AC006372.6', 'AC011516.2', 'AC012363.13', 'AC017048.2', 'AC018359.1', 'AC091814.2', 'AC092933.4', 'BSN-AS2', 'CDKN2AIPNLP2', 'CTB-49A3.1', 'GLUD1P4', 'GLULP6', 'LRRC34P1', 'MARCKSL1P2', 'OR10B1P', 'OR2B8P', 'OSBPL9P1', 'RNA5SP205', 'RNF212', 'RP11-123J14.1', 'RP3-406P24.4', 'RP4-581O6.1', 'XXyac-YR12DB5.1']]

Y = [] # Target column
# For each entry I classified it by its PFS value.
for i in range (len(data)):
    if data.PFS[i]<3: # If PFS is lower than 3 months, I will consider it as NonResponder (NR)
        Y.append(0)
    else:
        Y.append(1)# If PFS is over 3 months, I will consider it as Responder (R)

print(X)
print('Numero de pacientes: ',len(Y))

     AC005307.1   ATP5G2P1    DLGAP4  EIF4A1P9  FAM172BP   FAM224A      GJA9  \
0     21.055379  21.300337  34.54982  21.37925  21.62929  21.12373  29.68286   
1     21.055379  21.300337  34.06647  21.37925  21.62929  21.12373  26.80539   
2     21.055379  21.300337  34.42561  25.08766  29.29206  21.12373  30.09292   
3     21.055379  21.300337  34.41176  26.76156  21.62929  21.12373  29.29223   
4     21.055379  21.300337  34.29088  21.37925  21.62929  21.12373  32.00382   
..          ...        ...       ...       ...       ...       ...       ...   
176   20.944668  21.624881  34.55658  21.78113  21.49653  20.98542  28.18967   
177   20.944668  21.624881  33.15506  21.78113  21.49653  20.98542  28.18967   
178   20.944668  21.624881  34.68047  21.78113  25.64475  23.49110  28.18967   
179   20.944668  21.624881  35.60014  21.78113  21.49653  20.98542  33.92530   
180   20.944668  21.624881  35.23053  21.78113  21.49653  20.98542  28.18967   

       GPR155      GYPE      IL25  ... 

### 3. Train-Test dataset split

In [4]:
XTrain, XTest, yTrain, yTest = train_test_split(X, Y, test_size=0.15, random_state=125, stratify=Y)

print('Training set size:', len(XTrain))
print('Target column size of the training set:', len(yTrain))
print('Test set size:', len(XTest))
print('Target column size of the test set:', len(yTest))

Training set size: 153
Target column size of the training set: 153
Test set size: 28
Target column size of the test set: 28


In [5]:
param_grid = {'max_iter': [200, 500, 1000],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'random_state': [125],
            'max_fun': [300, 500,1000, 5000, 10000, 15000, 20000],
            'hidden_layer_sizes': [3,5],
            'solver': ['lbfgs']}

# I created a GridSearchCV which allows us to systematically evaluate and select the parameters of our model.
# By indicating a model and the parameters to test, you can evaluate the performance of the first one based on the
# seconds through cross validation.
clf = GridSearchCV(MLPClassifier(), param_grid, cv =5)

clf.fit(XTrain , yTrain)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

GridSearchCV(cv=5, estimator=MLPClassifier(),
             param_grid={'activation': ['identity', 'logistic', 'tanh', 'relu'],
                         'hidden_layer_sizes': [3, 5],
                         'max_fun': [300, 500, 1000, 5000, 10000, 15000, 20000],
                         'max_iter': [200, 500, 1000], 'random_state': [125],
                         'solver': ['lbfgs']})

In [6]:
print("Best estimate of parameters according to GridSearchCV:")
model = clf.best_estimator_
# Fit the model with the best parameters
model.fit(XTrain , yTrain)

Best estimate of parameters according to GridSearchCV:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='identity', hidden_layer_sizes=3, max_fun=1000,
              max_iter=500, random_state=125, solver='lbfgs')

In [7]:
print("Best result of the cross validation of the model with the best paramters:" +str(clf.best_score_))

Best result of the cross validation of the model with the best paramters:0.8438709677419356


### 5. Prediction

In [8]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(XTrain)
contTrain = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTrain),1) :
    if (yhatTrain[i] == yTrain[i]):
        contTrain = contTrain + 1

In [9]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(XTest)
contTest = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(yTest),1) :
    if (yhatTest[i] == yTest[i]):
        contTest = contTest + 1


### 6. Results

In [10]:
print('Final accuracy on the training dataset:' + str(contTrain/len(yTrain)))
print('Final accuracy on the testing dataset: ' + str(contTest/len(yTest)))

Final accuracy on the training dataset:0.954248366013072
Final accuracy on the testing dataset: 0.8928571428571429


In [11]:
from sklearn.metrics import classification_report,confusion_matrix

print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(yTrain,yhatTrain))
print('Input data:  ' + str(np.array(yTrain)))
print('Prediction:        ' +str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[68  4]
 [ 3 78]]
Input data:  [1 1 1 1 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 1
 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0 1 0 1 1 1 0 0 1 0 1 1 0 1 1 0 0 0 0 0 0
 1 1 1 0 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 0 0 1 1 1 1 1 1 0 0 1 0 1 0 0 1 0 1
 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 0 0 1 0 0 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 1 1
 1 1 1 0 1]
Prediction:        [1 1 1 1 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 1 1 1 0 0 1 0 0 0 0 0
 1 1 1 1 1 1 0 1 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0 0 1 0 1 1 0 1 1 0 0 0 1 0 0
 1 1 1 0 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 0 0 1 1 1 1 1 1 0 0 1 0 1 0 0 1 0 0
 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 0 0 1 0 1 1 1 0 1 0 0 1 0 1 0 1 0 1 1
 1 1 1 0 1]


In [12]:
print(classification_report(yTrain,yhatTrain))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95        72
           1       0.95      0.96      0.96        81

    accuracy                           0.95       153
   macro avg       0.95      0.95      0.95       153
weighted avg       0.95      0.95      0.95       153



In [13]:
print('----------------Confusion Matrix (Test)------------------')
print(confusion_matrix(yTest,yhatTest))
print('Input data:  ' + str(np.array(yTest)))
print('Prediction:        ' +str(yhatTest))

----------------Confusion Matrix (Test)------------------
[[12  1]
 [ 2 13]]
Input data:  [0 1 1 1 1 0 1 0 0 1 0 1 1 1 0 1 0 0 1 1 0 0 0 1 1 1 0 0]
Prediction:        [0 1 1 1 1 1 1 0 0 0 0 1 1 1 0 1 0 0 1 1 0 0 0 1 1 0 0 0]


In [14]:
print(classification_report(yTest,yhatTest))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89        13
           1       0.93      0.87      0.90        15

    accuracy                           0.89        28
   macro avg       0.89      0.89      0.89        28
weighted avg       0.90      0.89      0.89        28

