In [1]:
# dit moet je wel weghalen bij inleveren, die staat nu ook in de yml file...
#pip install imblearn

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report

from sklearn.feature_selection import SelectKBest, chi2

from sklearn.decomposition import PCA

from sklearn.linear_model import Lasso

from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE, RFECV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from pandas.plotting import scatter_matrix 
from sklearn.metrics import confusion_matrix

## load tested data


In [3]:
# Load the data from the .csv file
data = pd.read_csv('cleaned_descriptor_data',index_col=0)

# Scale the data with a standard scaling
scaling=MinMaxScaler()
scaling.fit(data)
scaled_data=scaling.transform(data)
df_data = pd.DataFrame(scaled_data, columns=data.columns,index=data.index)

# Choose the kinase of which you want to predict the inhibition
#selected_kinase = 'ERK2_inhibition'
selected_kinase = 'PKM2_inhibition'    

# Define train and test data
X = df_data.drop(columns=['ERK2_inhibition','PKM2_inhibition']).copy()
y = data[selected_kinase].copy()


In [4]:
X_fingerprints=pd.read_csv('fingerprint_data',index_col=0)

## Load untested data

In [5]:
untested_molecules=pd.read_csv('cleaned_descriptor_data_untested',index_col=0)
untested_molecules.head()
X_untested=untested_molecules.drop(columns=['ERK2_inhibition','PKM2_inhibition']).copy()
y_untested=untested_molecules[selected_kinase].copy()


In [6]:
X_untested_fingerprints=pd.read_csv('fingerprint_data_untested',index_col=0)

## Recursive Feature Exclusion

In [7]:
rfe = RFE(RandomForestClassifier(), n_features_to_select=50)
rfe.fit(X, y)
selected_features = X.columns[rfe.support_]
print(selected_features)

Index(['MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed',
       'SPS', 'MinPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2',
       'FpDensityMorgan3', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
       'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW',
       'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0n', 'Chi1v', 'Chi3v',
       'HallKierAlpha', 'Kappa1', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11',
       'PEOE_VSA5', 'PEOE_VSA9', 'SMR_VSA10', 'SMR_VSA3', 'SMR_VSA5',
       'SMR_VSA7', 'SlogP_VSA1', 'SlogP_VSA2', 'SlogP_VSA5', 'EState_VSA3',
       'EState_VSA5', 'EState_VSA6', 'VSA_EState1', 'VSA_EState2',
       'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6',
       'VSA_EState9', 'NumAromaticHeterocycles', 'MolLogP', 'fr_Ar_N',
       'fr_sulfonamd'],
      dtype='object')


In [8]:
X_cleaned=X[selected_features]
X_untested_cleaned=X_untested[selected_features]

# Random forests

## Random forest with descriptors evaluation

### Find n_estimators

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y, test_size=0.2, stratify=y,random_state=11)


pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],['classifier', RandomForestClassifier()]])

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)
    
param_grid = {'classifier__n_estimators':[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='recall', cv=stratified_kfold, n_jobs=None)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')


# Get the best estimator from GridSearchCV
best_model = grid_search.best_estimator_

# Predict on test set
y_pred = best_model.predict(X_test)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)
print(classification_report(y_pred, y_test))

{'classifier__n_estimators': 100}
Cross-validation score: 0.13999999999999999
Test score: 0.0
Confusion Matrix:
[[216   2]
 [  6   0]]
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       222
           1       0.00      0.00      0.00         2

    accuracy                           0.96       224
   macro avg       0.50      0.49      0.49       224
weighted avg       0.98      0.96      0.97       224



In [10]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y, test_size=0.2)

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=40)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Calculate the ratio between 0 and 1 in the balanced training set and the test set
ratio_train_balanced = y_train_balanced.value_counts(normalize=True)
ratio_test = y_test.value_counts(normalize=True)

print("Ratio of 0 and 1 inhibitors in the balanced training set:")
print(ratio_train_balanced)
print("\nRatio of 0 and 1 inhibitors in the testing set:")
print(ratio_test)

# Train the Random Forest model with the balanced training set
rf_descriptors = RandomForestClassifier(n_estimators=600, random_state=42)

# Perform cross-validation
cv = StratifiedKFold(n_splits=2, shuffle=False)
cv_scores = cross_val_score(rf_descriptors, X_train_balanced, y_train_balanced, cv=cv, scoring='recall')

print(f'Cross-Validation Scores: {cv_scores}')

rf_descriptors.fit(X_train_balanced, y_train_balanced)
y_pred = rf_descriptors.predict(X_test)

# Evaluate the model using a confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_mat)
print(classification_report(y_pred, y_test))


Ratio of 0 and 1 inhibitors in the balanced training set:
0    0.5
1    0.5
Name: PKM2_inhibition, dtype: float64

Ratio of 0 and 1 inhibitors in the testing set:
0    0.982143
1    0.017857
Name: PKM2_inhibition, dtype: float64
Cross-Validation Scores: [0.99078341 1.        ]
Confusion Matrix:
[[217   3]
 [  3   1]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       220
           1       0.25      0.25      0.25         4

    accuracy                           0.97       224
   macro avg       0.62      0.62      0.62       224
weighted avg       0.97      0.97      0.97       224



## Random forest with fingerprints evaluation

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_fingerprints, y, test_size=0.2)

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=40)
X_train_balanced_finger, y_train_balanced_finger = smote.fit_resample(X_train, y_train)

# Calculate the ratio between 0 and 1 in the balanced training set and the test set
ratio_train_balanced_finger = y_train_balanced_finger.value_counts(normalize=True)
ratio_test_finger = y_test.value_counts(normalize=True)

print("Ratio of 0 and 1 inhibitors in the training set:")
print(ratio_train_balanced_finger)
print("\nRatio of 0 and 1 inhibitors in the testing set:")
print(ratio_test_finger)

rf_fingerprints= RandomForestClassifier(n_estimators=600, random_state=40)

# Perform cross-validation
cv = StratifiedKFold(n_splits=2, shuffle=False)
cv_scores = cross_val_score(rf_fingerprints, X_train_balanced_finger, y_train_balanced_finger, cv=cv, scoring='recall')

print(f'Cross-Validation Scores: {cv_scores}')

rf_fingerprints.fit(X_train, y_train)
y_pred = rf_fingerprints.predict(X_test)

conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)
print(classification_report(y_pred, y_test))

Ratio of 0 and 1 inhibitors in the training set:
0    0.5
1    0.5
Name: PKM2_inhibition, dtype: float64

Ratio of 0 and 1 inhibitors in the testing set:
0    0.982143
1    0.017857
Name: PKM2_inhibition, dtype: float64
Cross-Validation Scores: [0.97926267 1.        ]
[[220   0]
 [  4   0]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       224
           1       0.00      0.00      0.00         0

    accuracy                           0.98       224
   macro avg       0.50      0.49      0.50       224
weighted avg       1.00      0.98      0.99       224



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Random forest with descriptors on actual data

In [12]:
rf_descriptors = RandomForestClassifier(n_estimators=600)
rf_descriptors.fit(X_cleaned, y)
y_pred_descriptor = rf_descriptors.predict(X_untested_cleaned)
df_y_pred_descriptor=pd.DataFrame(y_pred_descriptor,index=X.index)


## Random forest with fingerprints on actual data

In [15]:
rf_fingerprints = RandomForestClassifier(n_estimators=600)
rf_fingerprints.fit(X_fingerprints, y)
y_pred_fingerprint = rf_fingerprints.predict(X_untested_fingerprints)
df_y_pred_fingerprint=pd.DataFrame(y_pred_fingerprint,index=X.index)

## Intersect

In [16]:
intersect=X[(df_y_pred_descriptor[0]==1)&(df_y_pred_fingerprint[0]==1)].index
intersect

Index(['COc1ccccc1-n1cnnc1SCC(=O)N1CCN(S(=O)(=O)c2ccccc2)CC1',
       'O=C(CSc1n[nH]c(-c2cccs2)n1)N1CCN(S(=O)(=O)c2ccccc2)CC1',
       'C=CCn1c(SCc2csc(C)n2)nc2scc(-c3ccc(C)o3)c2c1=O',
       'COc1cccc(-c2nn(C)c3sc(C(=O)NCc4ccc(C)o4)cc23)c1',
       'C=CCn1c(SCC(=O)N2CCOCC2)nc2scc(-c3ccco3)c2c1=O',
       'COCCSc1ccccc1C(=O)Nc1cc(C)on1', 'Cc1nc2ccc(S(=O)(=O)Nc3ccccc3C)cc2s1',
       'N#CCc1ccc(NS(=O)(=O)c2cccs2)cc1',
       'Cc1cc(C)n2cc(CSc3nnnn3-c3ccc(Cl)cc3)[nH+]c2n1',
       'Cc1cc(NC(=O)CSc2nnc(-c3ccoc3C)o2)no1',
       'c1ccc2c(c1)ncn2Cc1nnc2sc(-c3ccncc3)nn12'],
      dtype='object', name='SMILES')

Writing outcomes to CSV file

In [18]:
df_untested=pd.read_csv('untested_molecules.csv')
for i in range(len(df_untested['SMILES'])):
    molecule=df_untested.loc[i,'SMILES']
    if molecule in intersect:
        df_untested.loc[i,selected_kinase]=1
        print(molecule)
    else:
        df_untested.loc[i,selected_kinase]=0

df_untested.set_index('SMILES', inplace=True)
df_untested.to_csv('predicted_molecules.csv')