In [1]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold


from sklearn.feature_selection import SelectKBest, chi2

from sklearn.decomposition import PCA

from sklearn.linear_model import Lasso

from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE, RFECV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from pandas.plotting import scatter_matrix 
from sklearn.metrics import confusion_matrix

## load tested data


In [5]:
# Load the data from the .csv file
data = pd.read_csv('cleaned_descriptor_data',index_col=0)

# Scale the data with a standard scaling
scaling=MinMaxScaler()
scaling.fit(data)
scaled_data=scaling.transform(data)
df_data = pd.DataFrame(scaled_data, columns=data.columns,index=data.index)

# Choose the kinase of which you want to predict the inhibition
#selected_kinase = 'ERK2_inhibition'
selected_kinase = 'PKM2_inhibition'     # Or 'ERK2_inhibition'

# Define train and test data
X = df_data.drop(columns=['ERK2_inhibition','PKM2_inhibition']).copy()
y = data[selected_kinase].copy()


In [6]:
X_fingerprints=pd.read_csv('fingerprint_data',index_col=0)

## Load untested data

In [7]:
untested_molecules=pd.read_csv('cleaned_descriptor_data_untested',index_col=0)
untested_molecules.head()
X_untested=untested_molecules.drop(columns=['ERK2_inhibition','PKM2_inhibition']).copy()
y_untested=untested_molecules[selected_kinase].copy()


In [8]:
X_untested_fingerprints=pd.read_csv('fingerprint_data_untested',index_col=0)

## Recursive Feature Exclusion

In [13]:
rfe = RFE(RandomForestClassifier(), n_features_to_select=50)
rfe.fit(X, y)
selected_features = X.columns[rfe.support_]
print(selected_features)

Index(['MaxAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MinPartialCharge',
       'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3',
       'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
       'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW',
       'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0n', 'Chi3v', 'HallKierAlpha',
       'Ipc', 'Kappa1', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA5', 'PEOE_VSA7',
       'SMR_VSA10', 'SMR_VSA3', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9',
       'SlogP_VSA2', 'SlogP_VSA5', 'SlogP_VSA6', 'TPSA', 'EState_VSA3',
       'EState_VSA5', 'EState_VSA6', 'VSA_EState1', 'VSA_EState2',
       'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState9',
       'NumAromaticHeterocycles', 'MolLogP', 'fr_Ar_N', 'fr_sulfonamd'],
      dtype='object')


In [14]:
X_cleaned=X[selected_features]
X_untested_cleaned=X_untested[selected_features]

# Random forests

## Random forest with descriptors evaluation

In [48]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y, test_size=0.2)

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Calculate the ratio between 0 and 1 in the balanced training set and the test set
ratio_train_balanced = y_train_balanced.value_counts(normalize=True)
ratio_test = y_test.value_counts(normalize=True)

print("Ratio of 0 and 1 inhibitors in the balanced training set:")
print(ratio_train_balanced)
print("\nRatio of 0 and 1 inhibitors in the testing set:")
print(ratio_test)

# Train the Random Forest model with the balanced training set
rf_descriptors = RandomForestClassifier(n_estimators=1100)

# Perform cross-validation
num_folds = 2
cv = StratifiedKFold(n_splits=num_folds, shuffle=True)
cv_scores = cross_val_score(rf_descriptors, X_train_balanced, y_train_balanced, cv=cv, scoring='accuracy')

print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean Accuracy: {cv_scores.mean():.2f} +/- {cv_scores.std():.2f}')

rf_descriptors.fit(X_train_balanced, y_train_balanced)
y_pred = rf_descriptors.predict(X_test)

# Evaluate the model using a confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_mat)


Ratio of 0 and 1 inhibitors in the balanced training set:
0    0.5
1    0.5
Name: PKM2_inhibition, dtype: float64

Ratio of 0 and 1 inhibitors in the testing set:
0    0.973214
1    0.026786
Name: PKM2_inhibition, dtype: float64
Cross-Validation Scores: [0.98965517 0.9954023 ]
Mean Accuracy: 0.99 +/- 0.00
Confusion Matrix:
[[216   2]
 [  5   1]]


## Random forest with fingerprints evaluation

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X_fingerprints, y, test_size=0.2)

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_balanced_finger, y_train_balanced_finger = smote.fit_resample(X_train, y_train)

# Calculate the ratio between 0 and 1 in the balanced training set and the test set
ratio_train_balanced_finger = y_train_balanced_finger.value_counts(normalize=True)
ratio_test_finger = y_test.value_counts(normalize=True)

print("Ratio of 0 and 1 inhibitors in the training set:")
print(ratio_train_balanced_finger)
print("\nRatio of 0 and 1 inhibitors in the testing set:")
print(ratio_test_finger)

rf_fingerprints= RandomForestClassifier(n_estimators=1100)

# Perform cross-validation
num_folds = 2
cv = StratifiedKFold(n_splits=num_folds, shuffle=True)
cv_scores = cross_val_score(rf_fingerprints, X_train_balanced_finger, y_train_balanced_finger, cv=cv, scoring='accuracy')

print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean Accuracy: {cv_scores.mean():.2f} +/- {cv_scores.std():.2f}')

rf_fingerprints.fit(X_train, y_train)
y_pred = rf_fingerprints.predict(X_test)

conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)


Ratio of 0 and 1 inhibitors in the training set:
0    0.5
1    0.5
Name: PKM2_inhibition, dtype: float64

Ratio of 0 and 1 inhibitors in the testing set:
0    0.973214
1    0.026786
Name: PKM2_inhibition, dtype: float64
Cross-Validation Scores: [0.98275862 0.9908046 ]
Mean Accuracy: 0.99 +/- 0.00
[[218   0]
 [  6   0]]


## Random forest with descriptors on actual data

In [37]:
rf_descriptors = RandomForestClassifier(n_estimators=1100)
rf_descriptors.fit(X_cleaned, y)
y_pred_descriptor = rf_descriptors.predict(X_untested_cleaned)
df_y_pred_descriptor=pd.DataFrame(y_pred_descriptor,index=X.index)


## Random forest with fingerprints on actual data

In [38]:
rf_fingerprints = RandomForestClassifier(n_estimators=1100)
rf_fingerprints.fit(X_fingerprints, y)
y_pred_fingerprint = rf_descriptors.predict(X_untested_cleaned)
df_y_pred_fingerprint=pd.DataFrame(y_pred_descriptor,index=X.index)

## Intersect

In [40]:
intersect=X[(df_y_pred_descriptor[0]==1)&(df_y_pred_fingerprint[0]==1)].index
intersect

Index(['C=CCOn1c(=O)c(C)[n+]([O-])c2ccccc21', 'C=CCn1cc(Cl)c(=O)n(CC=C)c1=O',
       'CCc1nnc(NCn2nc(C)cc2C)s1', 'COc1cccc(OC[C@H](O)CO)c1',
       'CSc1nnc2c(n1)OC(C)=Nc1ccccc1-2', 'Cc1csc2nc(=O)c(-c3ccccc3)nn12',
       'N#CCSc1nnc(N)s1', 'Nc1nc2sc3c(c2c(=O)s1)CCCC3',
       'O=C([O-])CCSc1nnnn1-c1ccccc1', 'O=c1c(-c2ccccc2)[n+]([O-])c2ccccc2n1O',
       ...
       'COCCSc1ccccc1C(=O)Nc1cc(C)on1', 'C=CCNc1nc(NCc2ccco2)nc(OCC)n1',
       'c1ccc(COc2nsnc2N2CCOCC2)cc1',
       'Cc1cc(C)n2cc(CSc3nnnn3-c3ccc(Cl)cc3)[nH+]c2n1',
       'Cc1cc(NC(=O)CSc2nnc(-c3ccoc3C)o2)no1',
       'c1ccc2c(c1)ncn2Cc1nnc2sc(-c3ccncc3)nn12',
       'O=C(COc1ccc(Cl)cc1)Nc1nc(-c2cccs2)cs1',
       'O=C1OCC2=C1[C@@H](c1ccc3c(c1)OCO3)Sc1ccccc1N2',
       'c1ccc(-c2csc(N3CCN(c4ccccn4)CC3)n2)cc1',
       'c1ccc(C2=Nn3c(nnc3-c3cc(-c4ccccc4)n[nH]3)SC2)cc1'],
      dtype='object', name='SMILES', length=189)