In [82]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report

from sklearn.feature_selection import SelectKBest, chi2

from sklearn.decomposition import PCA

from sklearn.linear_model import Lasso

from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE, RFECV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from pandas.plotting import scatter_matrix 
from sklearn.metrics import confusion_matrix

## load tested data


In [83]:
# Load the data from the .csv file
data = pd.read_csv('cleaned_descriptor_data',index_col=0)
X_fingerprints=pd.read_csv('fingerprint_data',index_col=0)

# Scale the data with a standard scaling
scaling=MinMaxScaler()
scaling.fit(data)
scaled_data=scaling.transform(data)
df_data = pd.DataFrame(scaled_data, columns=data.columns,index=data.index)

# Choose the kinase of which you want to predict the inhibition
kinase_ERK2 = 'ERK2_inhibition'
kinase_PKM2 = 'PKM2_inhibition'    

# Define train and test data
X = df_data.drop(columns=['ERK2_inhibition','PKM2_inhibition']).copy()
y_pkm2 = data[kinase_PKM2].copy()
y_erk2 = data[kinase_ERK2].copy()


## Load untested data

In [84]:
untested_molecules=pd.read_csv('cleaned_descriptor_data_untested',index_col=0)
untested_molecules.head()
X_untested=untested_molecules.drop(columns=['ERK2_inhibition','PKM2_inhibition']).copy()

y_untested_erk2=untested_molecules[kinase_ERK2].copy()
y_untested_pkm2=untested_molecules[kinase_PKM2].copy()

In [85]:
X_untested_fingerprints=pd.read_csv('fingerprint_data_untested',index_col=0)

## Recursive Feature Exclusion

In [90]:
rfe_pmk = RFE(RandomForestClassifier(), n_features_to_select=50)
rfe_pmk.fit(X, y_pkm2)
selected_features_pmk = X.columns[rfe_pmk.support_]
print(selected_features_pmk)

rfe_erk = RFE(RandomForestClassifier(), n_features_to_select=50)
rfe_erk.fit(X, y_erk2)
selected_features_erk = X.columns[rfe_erk.support_]
print(selected_features_erk)

Index(['MaxAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'HeavyAtomMolWt',
       'MinPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW',
       'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_MRHI',
       'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0n', 'Chi3n',
       'Chi3v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'PEOE_VSA1', 'PEOE_VSA11',
       'PEOE_VSA5', 'PEOE_VSA7', 'PEOE_VSA9', 'SMR_VSA10', 'SMR_VSA3',
       'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SlogP_VSA2', 'SlogP_VSA3',
       'EState_VSA2', 'EState_VSA3', 'EState_VSA6', 'VSA_EState1',
       'VSA_EState2', 'VSA_EState3', 'VSA_EState5', 'VSA_EState6',
       'VSA_EState9', 'NumAromaticHeterocycles', 'MolLogP', 'fr_Ar_N'],
      dtype='object')
Index(['MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed',
       'SPS', 'MinPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1',
       'BCUT2D_MWHI', 'BCUT2D_M

In [91]:
X_cleaned_pmk=X[selected_features_pmk]
X_untested_cleaned_pmk=X_untested[selected_features_pmk]

X_cleaned_erk=X[selected_features_erk]
X_untested_cleaned_erk=X_untested[selected_features_erk]

# Random forests

## Random forest with descriptors evaluation

### Find n_estimators

In [121]:
# ERK2
X_train, X_test, y_train, y_test = train_test_split(X_cleaned_erk, y_erk2, test_size=0.2, stratify=y_erk2, random_state=40)

pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=10)],['classifier', RandomForestClassifier()]])

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=40)
    
param_grid = {'classifier__n_estimators':[100, 200, 300, 400, 500, 600]}
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='recall', cv=stratified_kfold, n_jobs=-1)

grid_search.fit(X_train, y_train)
n_estimators_erk = grid_search.best_params_['classifier__n_estimators']
print(n_estimators_erk)

# PMK2
X_train_pmk, X_test_pmk, y_train_pmk, y_test_pmk = train_test_split(X_cleaned_pmk, y_pkm2, test_size=0.2, stratify=y_pkm2, random_state=40)

pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=10)],['classifier', RandomForestClassifier()]])

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=40)
    
param_grid = {'classifier__n_estimators':[100, 200, 300, 400, 500, 600]}
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='recall', cv=stratified_kfold, n_jobs=-1)

grid_search.fit(X_train_pmk, y_train_pmk)
n_estimators_pmk = grid_search.best_params_['classifier__n_estimators']
print(n_estimators_pmk)


400
300


In [126]:
#ERK2:
# Split the data into training and test sets
X_train_erk, X_test_erk, y_train_erk, y_test_erk = train_test_split(X_cleaned_erk, y_erk2, test_size=0.2)

# Apply SMOTE to balance the training set
smote_erk = SMOTE(random_state=40)
X_train_balanced_erk, y_train_balanced_erk = smote_erk.fit_resample(X_train_erk, y_train_erk)

# Calculate the ratio between 0 and 1 in the balanced training set and the test set
ratio_train_balanced_erk = y_train_balanced_erk.value_counts(normalize=True)
ratio_test_erk = y_test_erk.value_counts(normalize=True)

print("Ratio of 0 and 1 inhibitors in the balanced training set:")
print(ratio_train_balanced_erk)
print("\nRatio of 0 and 1 inhibitors in the testing set:")
print(ratio_test_erk)

# Train the Random Forest model with the balanced training set
rf_descriptors = RandomForestClassifier(n_estimators_erk)

# Perform cross-validation
cv = StratifiedKFold(n_splits=2, shuffle=False)
cv_scores = cross_val_score(rf_descriptors, X_train_balanced_erk, y_train_balanced_erk, cv=cv, scoring='recall')

print(f'Cross-Validation Scores: {cv_scores}')

rf_descriptors.fit(X_train_balanced_erk, y_train_balanced_erk)
y_pred_erk = rf_descriptors.predict(X_test_erk)

# Evaluate the model using a confusion matrix
conf_mat_erk = confusion_matrix(y_test_erk, y_pred_erk)
print("Confusion Matrix:")
print(conf_mat_erk)
print(classification_report(y_test_erk, y_pred_erk))


#PMK:
# Split the data into training and test sets
X_train_pmk, X_test_pmk, y_train_pmk, y_test_pmk = train_test_split(X_cleaned_pmk, y_pkm2, test_size=0.2)

# Apply SMOTE to balance the training set
smote_pmk = SMOTE(random_state=40)
X_train_balanced_pmk, y_train_balanced_pmk = smote_pmk.fit_resample(X_train_pmk, y_train_pmk)

# Calculate the ratio between 0 and 1 in the balanced training set and the test set
ratio_train_balanced_pmk = y_train_balanced_pmk.value_counts(normalize=True)
ratio_test_pmk = y_test_pmk.value_counts(normalize=True)

print("Ratio of 0 and 1 inhibitors in the balanced training set:")
print(ratio_train_balanced_pmk)
print("\nRatio of 0 and 1 inhibitors in the testing set:")
print(ratio_test_pmk)

# Train the Random Forest model with the balanced training set
rf_descriptors_pmk = RandomForestClassifier(n_estimators_pmk, random_state=42)

# Perform cross-validation
cv = StratifiedKFold(n_splits=2, shuffle=False)
cv_scores_pmk= cross_val_score(rf_descriptors, X_train_balanced_pmk, y_train_balanced_pmk, cv=cv, scoring='recall')

print(f'Cross-Validation Scores: {cv_scores_pmk}')

rf_descriptors_pmk.fit(X_train_balanced_pmk, y_train_balanced_pmk)
y_pred_pmk = rf_descriptors_pmk.predict(X_test_pmk)

# Evaluate the model using a confusion matrix
conf_mat_pmk = confusion_matrix(y_test_pmk, y_pred_pmk)
print("Confusion Matrix:")
print(conf_mat_pmk)
print(classification_report(y_test_pmk, y_pred_pmk))

Ratio of 0 and 1 inhibitors in the balanced training set:
0    0.5
1    0.5
Name: ERK2_inhibition, dtype: float64

Ratio of 0 and 1 inhibitors in the testing set:
0    0.955357
1    0.044643
Name: ERK2_inhibition, dtype: float64
Cross-Validation Scores: [0.98117647 0.99061033]
Confusion Matrix:
[[212   2]
 [ 10   0]]
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       214
           1       0.00      0.00      0.00        10

    accuracy                           0.95       224
   macro avg       0.48      0.50      0.49       224
weighted avg       0.91      0.95      0.93       224

Ratio of 0 and 1 inhibitors in the balanced training set:
0    0.5
1    0.5
Name: PKM2_inhibition, dtype: float64

Ratio of 0 and 1 inhibitors in the testing set:
0    0.977679
1    0.022321
Name: PKM2_inhibition, dtype: float64
Cross-Validation Scores: [1. 1.]
Confusion Matrix:
[[218   1]
 [  5   0]]
              precision    recall  f1-score   suppo

## Random forest with fingerprints evaluation

In [116]:
#ERK2:
# Split the data into training and test sets
X_train_erk, X_test_erk, y_train_erk, y_test_erk = train_test_split(X_fingerprints, y_erk2, test_size=0.2)

# Apply SMOTE to balance the training set
smote_erk = SMOTE(random_state=40)
X_train_balanced_finger_erk, y_train_balanced_finger_erk = smote_erk.fit_resample(X_train_erk, y_train_erk)

# Calculate the ratio between 0 and 1 in the balanced training set and the test set
ratio_train_balanced_erk = y_train_balanced_erk.value_counts(normalize=True)
ratio_test_erk = y_test_erk.value_counts(normalize=True)

print("Ratio of 0 and 1 inhibitors in the balanced training set:")
print(ratio_train_balanced_erk)
print("\nRatio of 0 and 1 inhibitors in the testing set:")
print(ratio_test_erk)

# Train the Random Forest model with the balanced training set
rf_finger_erk = RandomForestClassifier(n_estimators_erk, random_state=42)

# Perform cross-validation
cv = StratifiedKFold(n_splits=2, shuffle=False)
cv_scores = cross_val_score(rf_descriptors, X_train_balanced_finger_erk, y_train_balanced_finger_erk, cv=cv, scoring='recall')

print(f'Cross-Validation Scores: {cv_scores}')

rf_finger_erk.fit(X_train_balanced_finger_erk, y_train_balanced_finger_erk)
y_pred_erk = rf_finger_erk.predict(X_test_erk)

# Evaluate the model using a confusion matrix
conf_mat_erk = confusion_matrix(y_test_erk, y_pred_erk)
print("Confusion Matrix:")
print(conf_mat_erk)
print(classification_report(y_test_erk, y_pred_erk))


#PMK:
# Split the data into training and test sets
X_train_pmk, X_test_pmk, y_train_pmk, y_test_pmk = train_test_split(X_fingerprints, y_pkm2, test_size=0.2)

# Apply SMOTE to balance the training set
smote_pmk = SMOTE(random_state=40)
X_train_balanced_finger_pmk, y_train_balanced_finger_pmk = smote_pmk.fit_resample(X_train_pmk, y_train_pmk)

# Calculate the ratio between 0 and 1 in the balanced training set and the test set
ratio_train_balanced_pmk = y_train_balanced_pmk.value_counts(normalize=True)
ratio_test_pmk = y_test_pmk.value_counts(normalize=True)

print("Ratio of 0 and 1 inhibitors in the balanced training set:")
print(ratio_train_balanced_pmk)
print("\nRatio of 0 and 1 inhibitors in the testing set:")
print(ratio_test_pmk)

# Train the Random Forest model with the balanced training set
rf_finger_pmk = RandomForestClassifier(n_estimators_pmk, random_state=42)

# Perform cross-validation
cv = StratifiedKFold(n_splits=2, shuffle=False)
cv_scores_pmk= cross_val_score(rf_descriptors, X_train_balanced_finger_pmk, y_train_balanced_finger_pmk, cv=cv, scoring='recall')

print(f'Cross-Validation Scores: {cv_scores_pmk}')

rf_finger_pmk.fit(X_train_balanced_finger_pmk, y_train_balanced_finger_pmk)
y_pred_pmk = rf_finger_pmk.predict(X_test_pmk)

# Evaluate the model using a confusion matrix
conf_mat_pmk = confusion_matrix(y_test_pmk, y_pred_pmk)
print("Confusion Matrix:")
print(conf_mat_pmk)
print(classification_report(y_test_pmk, y_pred_pmk))

Ratio of 0 and 1 inhibitors in the balanced training set:
0    0.5
1    0.5
Name: ERK2_inhibition, dtype: float64

Ratio of 0 and 1 inhibitors in the testing set:
0    0.959821
1    0.040179
Name: ERK2_inhibition, dtype: float64
Cross-Validation Scores: [0.94823529 0.99294118]
Confusion Matrix:
[[212   3]
 [  9   0]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       215
           1       0.00      0.00      0.00         9

    accuracy                           0.95       224
   macro avg       0.48      0.49      0.49       224
weighted avg       0.92      0.95      0.93       224

Ratio of 0 and 1 inhibitors in the balanced training set:
0    0.5
1    0.5
Name: PKM2_inhibition, dtype: float64

Ratio of 0 and 1 inhibitors in the testing set:
0    0.973214
1    0.026786
Name: PKM2_inhibition, dtype: float64
Cross-Validation Scores: [0.97241379 1.        ]
Confusion Matrix:
[[217   1]
 [  6   0]]
              precision    recall  

## Random forest with descriptors on actual data

In [117]:
rf_descriptors_pmk = RandomForestClassifier(n_estimators_pmk)
rf_descriptors_pmk.fit(X_cleaned_pmk, y_pkm2)
y_pred_descriptor_pmk = rf_descriptors_pmk.predict(X_untested_cleaned_pmk)
df_y_pred_descriptor_pmk=pd.DataFrame(y_pred_descriptor_pmk,index=X.index)

rf_descriptors_erk = RandomForestClassifier(n_estimators_erk)
rf_descriptors_erk.fit(X_cleaned_erk, y_erk2)
y_pred_descriptor_erk = rf_descriptors.predict(X_untested_cleaned_erk)
df_y_pred_descriptor_erk=pd.DataFrame(y_pred_descriptor_erk,index=X.index)


## Random forest with fingerprints on actual data

In [118]:
rf_fingerprints_pmk = RandomForestClassifier(n_estimators_pmk)
rf_fingerprints_pmk.fit(X_fingerprints, y_pkm2)
y_pred_fingerprint_pmk = rf_fingerprints_pmk.predict(X_untested_fingerprints)
df_y_pred_fingerprint_pmk=pd.DataFrame(y_pred_fingerprint_pmk,index=X.index)

rf_fingerprints_erk = RandomForestClassifier(n_estimators_erk)
rf_fingerprints_erk.fit(X_fingerprints, y_erk2)
y_pred_fingerprint_erk = rf_fingerprints_erk.predict(X_untested_fingerprints)
df_y_pred_fingerprint_erk=pd.DataFrame(y_pred_fingerprint_erk,index=X.index)

## Intersect

In [119]:
intersect_pmk=X[(df_y_pred_descriptor_pmk[0]==1)&(df_y_pred_fingerprint_pmk[0]==1)].index
intersect_pmk

intersect_erk=X[(df_y_pred_descriptor_erk[0]==1)&(df_y_pred_fingerprint_erk[0]==1)].index
intersect_erk

Index([], dtype='object', name='SMILES')

Writing outcomes to CSV file

In [120]:
# df_untested=pd.read_csv('untested_molecules.csv')
# for i in range(len(df_untested['SMILES'])):
#     molecule=df_untested.loc[i,'SMILES']
#     if molecule in intersect:
#         df_untested.loc[i,selected_kinase]=1
#         print(molecule)
#     else:
#         df_untested.loc[i,selected_kinase]=0

# df_untested.set_index('SMILES', inplace=True)
# df_untested.to_csv('predicted_molecules.csv')




df_untested=pd.read_csv('untested_molecules.csv')
for i in range(len(df_untested['SMILES'])):
    molecule=df_untested.loc[i,'SMILES']
    if molecule in intersect_erk:
        df_untested.loc[i,kinase_ERK2]=1
        print(molecule)
    else:
        df_untested.loc[i,kinase_ERK2]=0

df_untested.set_index('SMILES', inplace=True)
df_untested.to_csv('predicted_molecules.csv')



df_untested=pd.read_csv('untested_molecules.csv')
for i in range(len(df_untested['SMILES'])):
    molecule=df_untested.loc[i,'SMILES']
    if molecule in intersect_pmk:
        df_untested.loc[i,kinase_PKM2]=1
        print(molecule)
    else:
        df_untested.loc[i,kinase_PKM2]=0

df_untested.set_index('SMILES', inplace=True)
df_untested.to_csv('predicted_molecules.csv')
