# Classification model using Logistic Regression

### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Upload RNA data

In [2]:
path ='../../../../Data_preprocessing/Prediction PFS/RNA+Clinic joined/New/Clinical_data_and_RNA_98_Features_PFS.csv'
data = pd.read_csv(path)
data.head(5)

Unnamed: 0.1,Unnamed: 0,RNA_ID,Age,Sarc,Rhab,Number_of_Prior_Therapies,Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy,Tumor_Shrinkage,PFS,TM_TC_Ratio,...,SYT10,TARBP2,TAS1R3,TAT,TRAV41,TRIM43B,VN2R17P,XPC,ZNF608,ZNF746
0,0,G138701_RCCBMS-00020-T_v1_RNA_OnPrem,62.0,0.0,0.0,2.0,962.0,-16.40625,12.164384,0.766909,...,21.82797,29.60417,26.87053,22.27506,22.22154,21.9744,21.12857,32.24472,34.46991,29.11848
1,1,G138701_RCCBMS-00097-T_v1_RNA_OnPrem,67.0,0.0,0.0,2.0,254.0,-40.0,5.490411,0.773269,...,23.53072,29.61188,26.87283,22.27506,22.22154,21.9744,21.12857,32.15619,34.95662,30.78421
2,2,G138701_RCCBMS-00141-T_v1_RNA_OnPrem,62.0,0.0,0.0,1.0,800.0,-89.43662,61.906849,1.782859,...,21.82797,31.90343,23.88665,27.7119,22.22154,21.9744,21.12857,31.86536,32.73029,31.9834
3,3,G138701_RCCBMS-00099-T_v1_RNA_OnPrem,60.0,0.0,0.0,3.0,790.0,11.787072,1.249315,5.890573,...,31.50567,30.35424,25.66465,22.27506,22.22154,21.9744,21.12857,31.63561,33.92535,30.6818
4,4,G138701_RCCBMS-00163-T_v1_RNA_OnPrem,76.0,0.0,0.0,2.0,1241.0,0.0,5.654795,1.374775,...,27.75696,31.60946,27.41015,22.27506,22.22154,21.9744,21.12857,33.12717,33.16473,30.01943


In [3]:
X = data.iloc[:,27:125]
Y=[]
# For each entry I classified it by its PFS value.
for i in range (len(data)):
    if data.PFS[i]<3: # If PFS is lower than 3 months, I will consider it as NonResponder (NR)
        Y.append(0)
    elif data.PFS[i]<6:
        Y.append(1)# If PFS is over 3 months, I will consider it as Responder (R)
    else:
        Y.append(2)
print(X)
print('Numero de pacientes: ',len(Y))

         ABHD5  AC005307.1  AC005550.3  AC090957.2      AGAP3  AP001055.6  \
0    30.497897   21.055379   21.270652   21.030369  32.969805   22.028180   
1    31.269076   21.055379   21.270652   21.030369  32.303016   22.028180   
2    32.103159   21.055379   21.270652   21.030369  31.523937   22.028180   
3    31.860291   21.055379   21.270652   21.030369  32.234884   22.028180   
4    31.130802   21.055379   21.270652   21.030369  33.158813   22.028180   
..         ...         ...         ...         ...        ...         ...   
176  32.097191   20.944668   20.782537   20.962109  33.553554   20.515641   
177  33.549605   20.944668   20.782537   20.962109  32.424865   24.317471   
178  32.023461   20.944668   22.065655   20.962109  33.420634   20.515641   
179  30.836938   20.944668   21.846195   20.962109  33.432114   23.692971   
180  31.554782   20.944668   23.384450   20.962109  33.223246   20.515641   

     AP003025.2     AQP7P4   ATP5G2P1       AVIL  ...     SYT10    TARBP2  

### 3. Train-Test dataset split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=125, stratify=Y)

print('Training set size:', len(X_train))
print('Target column size of the training set:', len(y_train))
print('Test set size:', len(X_test))
print('Target column size of the test set:', len(y_test))

Training set size: 144
Target column size of the training set: 144
Test set size: 37
Target column size of the test set: 37


### 4. Select the parameters of the model and fit it

In [5]:
param_grid = {'C': [16,17,18,19,20],
            'solver': ['newton-cg','liblinear'], 
            'max_iter':[10,11,12,13,18,19,20,25,50,75,100,200],
            'random_state':[125]}

# I created a GridSearchCV which allows us to systematically evaluate and select the parameters of our model.
# By indicating a model and the parameters to test, you can evaluate the performance of the first one based on the
# seconds through cross validation.
clf = GridSearchCV(LogisticRegression(), param_grid, cv = 5)
clf.fit(X_train , y_train)



GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [16, 17, 18, 19, 20],
                         'max_iter': [10, 11, 12, 13, 18, 19, 20, 25, 50, 75,
                                      100, 200],
                         'random_state': [125],
                         'solver': ['newton-cg', 'liblinear']})

In [6]:
print("Best estimate of parameters according to GridSearchCV:")
model = clf.best_estimator_
# Fit the model with the best parameters
model.fit(X_train , y_train)

Best estimate of parameters according to GridSearchCV:




LogisticRegression(C=16, max_iter=10, random_state=125, solver='liblinear')

In [7]:
print("Best result of the cross validation of the model with the best paramters:" +str(clf.best_score_))

Best result of the cross validation of the model with the best paramters:0.6667487684729064


### 5. Prediction

In [8]:
# Making predictions with the optimal model on the training dataset
yhatTrain = model.predict(X_train)
contTrain = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(y_train),1) :
    if (yhatTrain[i] == y_train[i]):
        contTrain = contTrain + 1

In [9]:
# Making predictions with the optimal model on the test dataset
yhatTest = model.predict(X_test)
contTest = 0

# Comparing with the Target column and check how many hits there have been
for i in range(0,len(y_test),1) :
    if (yhatTest[i] == y_test[i]):
        contTest = contTest + 1

### 6. Results

In [10]:
print('Final accuracy on the training dataset: ' + str(contTrain/len(y_train)))
print('Final accuracy on the testing dataset: ' + str(contTest/len(y_test)))

Final accuracy on the training dataset: 0.9652777777777778
Final accuracy on the testing dataset: 0.5945945945945946


In [11]:
from sklearn.metrics import classification_report,confusion_matrix
print('----------------Confusion Matrix (Training)------------------')
print(confusion_matrix(y_train,yhatTrain))
print('Input data:  ' + str(np.array(y_train)))
print('Prediction:        ' +str(yhatTrain))

----------------Confusion Matrix (Training)------------------
[[67  0  0]
 [ 0 22  3]
 [ 0  2 50]]
Input data:  [1 0 2 0 2 2 0 0 2 0 2 0 1 0 2 2 0 2 2 2 0 0 0 2 0 0 0 0 1 1 2 2 2 2 0 1 2
 0 0 2 1 2 2 1 0 0 2 2 1 0 0 0 2 0 0 2 0 2 1 2 0 2 1 0 0 0 0 2 2 2 0 2 0 0
 0 2 0 2 0 0 0 1 2 0 0 1 2 2 1 1 0 2 0 2 0 0 1 0 1 0 0 2 0 0 2 2 2 1 0 2 2
 1 2 0 1 0 0 0 2 0 1 0 2 0 0 2 1 1 0 0 0 2 0 1 0 0 0 2 2 2 1 2 0 1]
Prediction:        [1 0 2 0 1 2 0 0 2 0 2 0 1 0 2 2 0 2 2 2 0 0 0 2 0 0 0 0 1 1 2 2 2 2 0 1 2
 0 0 2 1 2 2 1 0 0 2 2 1 0 0 0 2 0 0 2 0 2 1 2 0 2 1 0 0 0 0 2 2 2 0 2 0 0
 0 2 0 2 0 0 0 1 2 0 0 2 2 2 1 1 0 2 0 2 0 0 1 0 1 0 0 2 0 0 2 2 2 2 0 2 2
 1 2 0 1 0 0 0 2 0 1 0 2 0 0 1 1 1 0 0 0 2 0 1 0 0 0 2 2 2 2 2 0 1]


In [12]:
print(classification_report(y_train,yhatTrain))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        67
           1       0.92      0.88      0.90        25
           2       0.94      0.96      0.95        52

    accuracy                           0.97       144
   macro avg       0.95      0.95      0.95       144
weighted avg       0.97      0.97      0.97       144



In [13]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, yhatTest)
print('----------------Confusion Matrix (Test)------------------')
print(confusion_matrix)
print('Input data:  ' + str(np.array(y_test)))
print('Prediction:        ' +str(yhatTest))

----------------Confusion Matrix (Test)------------------
[[11  4  3]
 [ 2  1  3]
 [ 2  1 10]]
Input data:  [0 1 0 2 0 2 1 0 1 0 2 2 1 2 2 0 1 2 0 0 0 2 1 0 0 2 0 2 2 0 0 0 2 2 0 0 0]
Prediction:        [1 2 2 2 0 2 2 0 0 0 2 2 0 2 2 0 1 2 0 0 0 0 2 0 2 2 1 1 0 1 0 0 2 2 0 2 1]


In [14]:
print(classification_report(y_test, yhatTest))

              precision    recall  f1-score   support

           0       0.73      0.61      0.67        18
           1       0.17      0.17      0.17         6
           2       0.62      0.77      0.69        13

    accuracy                           0.59        37
   macro avg       0.51      0.52      0.51        37
weighted avg       0.60      0.59      0.59        37



In [15]:
# En esta figrura se puede ver la proporción de falsos positivos con verdaderos positivos en el set de prueba
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, model.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

ValueError: multi_class must be in ('ovo', 'ovr')