In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import SelectKBest, chi2

from sklearn.decomposition import PCA

from sklearn.linear_model import Lasso

from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE, RFECV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from pandas.plotting import scatter_matrix 
from sklearn.metrics import confusion_matrix




## Load tested data


In [20]:
# Load the tested-molecule descriptor table; the first CSV column is used as the row index.
data = pd.read_csv('cleaned_descriptor_data', index_col=0)

# Rescale every column to the [0, 1] range (min-max scaling, not standardisation).
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so rows that are later held out still contribute to the min/max used here.
scaling = MinMaxScaler()
scaled_data = scaling.fit_transform(data)

# Keep the original row index: without it X would get a fresh RangeIndex while
# y (taken straight from `data`) keeps the CSV index, and any label-aligned
# operation between the two would silently misalign.
df_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)

# Choose the kinase of which you want to predict the inhibition
selected_kinase = 'PKM2_inhibition'     # Or 'ERK2_inhibition'

# Feature matrix: scaled columns with both inhibition labels removed.
X = df_data.drop(columns=['ERK2_inhibition', 'PKM2_inhibition']).copy()
# Target: the unscaled label column of the selected kinase.
y = data[selected_kinase].copy()


In [21]:
# Fingerprint table used as the alternative feature set; the first CSV column is the row index.
# NOTE(review): presumably one row per tested molecule, in the same order as y -- confirm.
X_fingerprints = pd.read_csv(
    'fingerprint_data',
    index_col=0,
)

## Load untested data

In [22]:
# Load the descriptor table of the untested molecules; the first CSV column is the row index.
untested_molecules = pd.read_csv('cleaned_descriptor_data_untested', index_col=0)

# Raw (unscaled) descriptors with both inhibition label columns removed.
X_untested_raw = untested_molecules.drop(columns=['ERK2_inhibition', 'PKM2_inhibition'])

# The models are trained on MinMax-scaled descriptors, so the untested
# descriptors must go through the SAME fitted transformation before prediction.
# A fitted MinMaxScaler maps each column as X * scale_ + min_; the per-column
# parameters are looked up by name so the label columns never have to be scaled.
column_scale = pd.Series(scaling.scale_, index=data.columns)[X_untested_raw.columns]
column_offset = pd.Series(scaling.min_, index=data.columns)[X_untested_raw.columns]
X_untested = X_untested_raw * column_scale + column_offset

# Label column of the selected kinase for the untested molecules.
y_untested = untested_molecules[selected_kinase].copy()


In [23]:
# Fingerprint table for the untested molecules; the first CSV column is the row index.
X_untested_fingerprints = pd.read_csv(
    'fingerprint_data_untested',
    index_col=0,
)

## Recursive Feature Elimination

In [24]:
# Recursive feature elimination with a random forest as the ranking estimator,
# keeping 50 descriptor columns.
# A fixed random_state makes the forest, and therefore the selected feature
# list, reproducible from one run to the next.
# NOTE(review): the selector is fitted on every row of X / y, including rows
# that are later used as the hold-out test set, so the evaluation further down
# is optimistic with respect to feature selection.
rfe = RFE(RandomForestClassifier(random_state=42), n_features_to_select=50)
rfe.fit(X, y)

# Boolean support mask -> names of the retained columns.
selected_features = X.columns[rfe.support_]
print(selected_features)

Index(['MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed',
       'SPS', 'MinPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2',
       'FpDensityMorgan3', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
       'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ',
       'BertzCT', 'Chi1v', 'Chi3n', 'Chi3v', 'Ipc', 'Kappa1', 'PEOE_VSA1',
       'PEOE_VSA10', 'PEOE_VSA5', 'PEOE_VSA7', 'PEOE_VSA9', 'SMR_VSA1',
       'SMR_VSA10', 'SMR_VSA3', 'SMR_VSA7', 'SMR_VSA9', 'SlogP_VSA1',
       'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA5', 'SlogP_VSA6', 'TPSA',
       'EState_VSA3', 'EState_VSA6', 'VSA_EState1', 'VSA_EState2',
       'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState9',
       'NumAromaticHeterocycles', 'MolLogP', 'fr_Ar_N'],
      dtype='object')


In [29]:
# Restrict the tested and the untested descriptor tables to the columns kept by
# the feature selection, so both expose exactly the same features.
X_cleaned = X.loc[:, selected_features]
X_untested_cleaned = X_untested.loc[:, selected_features]

# Random forests

## Random forest with descriptors evaluation

In [26]:
# Hold out 20% of the tested molecules, stratified on the label so both parts
# keep the same inhibitor ratio. The fixed random_state makes the split, and
# with it the confusion matrix below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned, y, test_size=0.2, stratify=y, random_state=42
)

# Fraction of class 0 and class 1 in the training labels
ratio_train = y_train.value_counts(normalize=True)

# Fraction of class 0 and class 1 in the test labels
ratio_test = y_test.value_counts(normalize=True)

print("Ratio of 0 and 1 inhibitors in the training set:")
print(ratio_train)
print("\nRatio of 0 and 1 inhibitors in the testing set:")
print(ratio_test)

# Class weighting is not used here (scikit-learn's default weights all classes equally).
# TODO: the labels are heavily imbalanced; consider passing class_weight to the forest.
rf_descriptors = RandomForestClassifier(n_estimators=1100, random_state=42)
rf_descriptors.fit(X_train, y_train)
y_pred = rf_descriptors.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)




Ratio of 0 and 1 inhibitors in the training set:
0    0.975336
1    0.024664
Name: PKM2_inhibition, dtype: float64

Ratio of 0 and 1 inhibitors in the testing set:
0    0.973214
1    0.026786
Name: PKM2_inhibition, dtype: float64
[[217   1]
 [  6   0]]


## Random forest with fingerprints evaluation

In [27]:
# Hold out 20% of the tested molecules, stratified on the label so both parts
# keep the same inhibitor ratio. The fixed random_state makes the split, and
# with it the confusion matrix below, reproducible across runs.
# NOTE(review): X_fingerprints and y are paired by row position -- confirm the
# fingerprint file lists the molecules in the same order as the descriptor file.
X_train, X_test, y_train, y_test = train_test_split(
    X_fingerprints, y, test_size=0.2, stratify=y, random_state=42
)

# Fraction of class 0 and class 1 in the training labels
ratio_train = y_train.value_counts(normalize=True)

# Fraction of class 0 and class 1 in the test labels
ratio_test = y_test.value_counts(normalize=True)

print("Ratio of 0 and 1 inhibitors in the training set:")
print(ratio_train)
print("\nRatio of 0 and 1 inhibitors in the testing set:")
print(ratio_test)

# Class weighting is not used here (scikit-learn's default weights all classes equally).
# TODO: the labels are heavily imbalanced; consider passing class_weight to the forest.
rf_fingerprints = RandomForestClassifier(n_estimators=1100, random_state=42)
rf_fingerprints.fit(X_train, y_train)
y_pred = rf_fingerprints.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)


Ratio of 0 and 1 inhibitors in the training set:
0    0.975336
1    0.024664
Name: PKM2_inhibition, dtype: float64

Ratio of 0 and 1 inhibitors in the testing set:
0    0.973214
1    0.026786
Name: PKM2_inhibition, dtype: float64
[[218   0]
 [  6   0]]


## Random forest with descriptors on actual data

In [35]:
# Final descriptor model: refit on ALL tested molecules (no hold-out), then
# predict the inhibition label of every untested molecule.
# The fixed random_state keeps these predictions identical between runs.
rf_descriptors = RandomForestClassifier(n_estimators=1100, random_state=42)
rf_descriptors.fit(X_cleaned, y)
y_pred_descriptor = rf_descriptors.predict(X_untested_cleaned)

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

## Random forest with fingerprints on actual data

In [32]:
# Final fingerprint model: refit on ALL tested molecules (no hold-out), then
# predict the inhibition label of every untested molecule.
# The fixed random_state keeps these predictions identical between runs.
rf_fingerprints = RandomForestClassifier(n_estimators=1100, random_state=42)
rf_fingerprints.fit(X_fingerprints, y)

# Predict with the fingerprint model on the untested fingerprints. Using the
# descriptor model and descriptor features here would only reproduce the
# descriptor predictions and leave the fitted fingerprint model unused.
y_pred_fingerprint = rf_fingerprints.predict(X_untested_fingerprints)

## Intersect