# Dependency installation

In [2]:
!pip install rdkit



In [3]:
import numpy as np
import pandas as pd
from rdkit import Chem

# Data Understanding

In [51]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# load_path = '/content/gdrive/MyDrive/ID2214/training_smiles.csv'

# train_df = pd.read_csv(load_path)

train_df = pd.read_csv("training_smiles.csv")
train_df

Unnamed: 0,INDEX,SMILES,ACTIVE
0,1,CCCc1sc(N)nc1-c1ccc(C)cc1,0.0
1,2,CCCCNC(=O)Cn1cnc2c(cnn2-c2ccc(C)c(C)c2)c1=O,0.0
2,3,O=C(NCCC1=CCCCC1)C1CCN(S(=O)(=O)N2CCOCC2)CC1,0.0
3,4,N#Cc1c(-c2c(Cl)cccc2Cl)noc1/C=C/Nc1ccc(C(=O)O)cc1,0.0
4,5,Cc1cc(C)cc(OCC(=O)Nc2ccc(F)cc2)c1,0.0
...,...,...,...
153225,153226,O=C(COC(=O)Cc1ccsc1)NC1CCCCCC1,0.0
153226,153227,CCCCCCN(C(=O)CCCCCN1C(=O)NC(c2cccc([N+](=O)[O-...,0.0
153227,153228,O=C(Cn1cnc([N+](=O)[O-])c1)NCc1ccc(F)cc1,0.0
153228,153229,CCOC(=O)CS(=O)(=O)CC(=O)Nc1ccc([N+](=O)[O-])cc1OC,0.0


In [52]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153230 entries, 0 to 153229
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   INDEX   153230 non-null  int64  
 1   SMILES  153230 non-null  object 
 2   ACTIVE  153230 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 3.5+ MB


In [53]:
train_df.describe()

Unnamed: 0,INDEX,ACTIVE
count,153230.0,153230.0
mean,76615.5,0.011643
std,44233.835211,0.107271
min,1.0,0.0
25%,38308.25,0.0
50%,76615.5,0.0
75%,114922.75,0.0
max,153230.0,1.0


In [54]:
train_df['ACTIVE'].value_counts()

0.0    151446
1.0      1784
Name: ACTIVE, dtype: int64

# Data Preprocessing and Feature Engineering

In [55]:
# Example: Convert SMILES to a molecule object
train_df['MOLECULE'] = train_df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))



In [56]:
train_df

Unnamed: 0,INDEX,SMILES,ACTIVE,MOLECULE
0,1,CCCc1sc(N)nc1-c1ccc(C)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aace0>
1,2,CCCCNC(=O)Cn1cnc2c(cnn2-c2ccc(C)c(C)c2)c1=O,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aac00>
2,3,O=C(NCCC1=CCCCC1)C1CCN(S(=O)(=O)N2CCOCC2)CC1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aac70>
3,4,N#Cc1c(-c2c(Cl)cccc2Cl)noc1/C=C/Nc1ccc(C(=O)O)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aaf10>
4,5,Cc1cc(C)cc(OCC(=O)Nc2ccc(F)cc2)c1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76ab530>
...,...,...,...,...
153225,153226,O=C(COC(=O)Cc1ccsc1)NC1CCCCCC1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657de00>
153226,153227,CCCCCCN(C(=O)CCCCCN1C(=O)NC(c2cccc([N+](=O)[O-...,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657de70>
153227,153228,O=C(Cn1cnc([N+](=O)[O-])c1)NCc1ccc(F)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657dee0>
153228,153229,CCOC(=O)CS(=O)(=O)CC(=O)Nc1ccc([N+](=O)[O-])cc1OC,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657df50>


In [57]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153230 entries, 0 to 153229
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   INDEX     153230 non-null  int64  
 1   SMILES    153230 non-null  object 
 2   ACTIVE    153230 non-null  float64
 3   MOLECULE  153230 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 4.7+ MB


In [58]:
# Molecular Descriptors
train_df_md = train_df.copy()
# Fingerprint
train_df_finger = train_df.copy()

## Molecular Descriptors Representation

In [59]:
from rdkit.Chem import rdMolDescriptors, Fragments, Lipinski, AllChem, Descriptors, rdChemReactions

train_df_md['FR_Al_COO'] = train_df_md['MOLECULE'].apply(lambda x: Fragments.fr_Al_COO(x))
train_df_md['FR_Al_OH'] = train_df_md['MOLECULE'].apply(lambda x: Fragments.fr_Al_OH(x))
train_df_md['FR_COO'] = train_df_md['MOLECULE'].apply(lambda x: Fragments.fr_COO(x))

# Example: Calculate molecular weight using rdMolDescriptors
train_df_md['MOL_WEIGHT'] = train_df_md['MOLECULE'].apply(lambda x: rdMolDescriptors.CalcExactMolWt(x))

# Example: Calculate number of heavy atoms using Lipinski
train_df_md['HEAVY_ATOM_COUNT'] = train_df_md['MOLECULE'].apply(lambda x: Lipinski.HeavyAtomCount(x))

# Example: Experiment with additional molecular descriptors
train_df_md['NUM_RINGS'] = train_df_md['MOLECULE'].apply(lambda x: rdMolDescriptors.CalcNumRings(x))

# Example: Number of Rotatable Bonds
train_df_md['NUM_ROTATABLE_BONDS'] = train_df_md['MOLECULE'].apply(lambda x: Descriptors.NumRotatableBonds(x))

# Example: Topological Polar Surface Area (TPSA)
train_df_md['TPSA'] = train_df_md['MOLECULE'].apply(lambda x: Descriptors.TPSA(x))

# Example: Number of Hydrogen Bond Donors and Acceptors
train_df_md['HBOND_DONORS'] = train_df_md['MOLECULE'].apply(lambda x: Lipinski.NumHDonors(x))
train_df_md['HBOND_ACCEPTORS'] = train_df_md['MOLECULE'].apply(lambda x: Lipinski.NumHAcceptors(x))

# Example: LogP (Lipophilicity)
train_df_md['LOGP'] = train_df_md['MOLECULE'].apply(lambda x: Descriptors.MolLogP(x))

# Example: Aromatic Proportion
train_df_md['AROMATIC_PROPORTION'] = train_df_md['MOLECULE'].apply(lambda x: Descriptors.FractionCSP3(x))

# Example: Number of atoms
train_df_md['NUM_ATOMS'] = train_df_md['MOLECULE'].apply(lambda x: x.GetNumAtoms())

# Example: Number of valence electrons
train_df_md['NUM_VALENCE_ELECTRONS'] = train_df_md['MOLECULE'].apply(lambda x: Descriptors.NumValenceElectrons(x))

# Example: Fraction of sp3-hybridized carbon atoms
train_df_md['FRACTION_SP3'] = train_df_md['MOLECULE'].apply(lambda x: Descriptors.FractionCSP3(x))

# Example: Fractional polar surface area
train_df_md['FRACTIONAL_POLAR_SURFACE_AREA'] = train_df_md['MOLECULE'].apply(lambda x: Descriptors.FractionCSP3(x))

# Example: Number of Saturated Rings
train_df_md['NUM_SATURATED_RINGS'] = train_df_md['MOLECULE'].apply(lambda x: rdMolDescriptors.CalcNumSaturatedRings(x))

# Example: Number of Amide Bonds
def count_amide_bonds(mol):
    amide_pattern = '[NX3][CX3](=[OX1])[#6]'
    return len(mol.GetSubstructMatches(Chem.MolFromSmarts(amide_pattern)))

train_df_md['NUM_AMIDE_BONDS'] = train_df_md['MOLECULE'].apply(lambda x: count_amide_bonds(x))

# Example: Number of aromatic rings
train_df_md['NUM_AROMATIC_RINGS'] = train_df_md['MOLECULE'].apply(lambda x: rdMolDescriptors.CalcNumAromaticRings(x))

# Example: Number of aliphatic rings
train_df_md['NUM_ALIPHATIC_RINGS'] = train_df_md['MOLECULE'].apply(lambda x: rdMolDescriptors.CalcNumAliphaticRings(x))

train_df_md.describe()

Unnamed: 0,INDEX,ACTIVE,FR_Al_COO,FR_Al_OH,FR_COO,MOL_WEIGHT,HEAVY_ATOM_COUNT,NUM_RINGS,NUM_ROTATABLE_BONDS,TPSA,...,LOGP,AROMATIC_PROPORTION,NUM_ATOMS,NUM_VALENCE_ELECTRONS,FRACTION_SP3,FRACTIONAL_POLAR_SURFACE_AREA,NUM_SATURATED_RINGS,NUM_AMIDE_BONDS,NUM_AROMATIC_RINGS,NUM_ALIPHATIC_RINGS
count,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,...,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0
mean,76615.5,0.011643,0.032526,0.060497,0.053775,349.270902,24.414083,2.951145,4.79224,72.827919,...,2.901459,0.29657,24.41409,127.618573,0.29657,0.29657,0.461848,0.728708,2.21439,0.736755
std,44233.835211,0.107271,0.187272,0.322565,0.237937,80.700601,5.724021,1.049867,2.34978,28.637325,...,1.278492,0.181896,5.724022,30.080849,0.181896,0.181896,0.737023,0.763031,0.966451,0.873352
min,1.0,0.0,0.0,0.0,0.0,32.026215,2.0,0.0,0.0,0.0,...,-18.5803,0.0,2.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,38308.25,0.0,0.0,0.0,0.0,295.031599,21.0,2.0,3.0,54.02,...,2.1178,0.16,21.0,108.0,0.16,0.16,0.0,0.0,2.0,0.0
50%,76615.5,0.0,0.0,0.0,0.0,345.147727,24.0,3.0,5.0,70.71,...,2.94156,0.274295,24.0,126.0,0.274295,0.274295,0.0,1.0,2.0,1.0
75%,114922.75,0.0,0.0,0.0,0.0,400.140994,28.0,4.0,6.0,89.35,...,3.733335,0.407407,28.0,146.0,0.407407,0.407407,1.0,1.0,3.0,1.0
max,153230.0,1.0,5.0,21.0,5.0,1766.302831,117.0,21.0,47.0,878.39,...,16.4285,1.0,117.0,670.0,1.0,1.0,21.0,14.0,12.0,21.0


In [60]:
train_df_md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153230 entries, 0 to 153229
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   INDEX                          153230 non-null  int64  
 1   SMILES                         153230 non-null  object 
 2   ACTIVE                         153230 non-null  float64
 3   MOLECULE                       153230 non-null  object 
 4   FR_Al_COO                      153230 non-null  int64  
 5   FR_Al_OH                       153230 non-null  int64  
 6   FR_COO                         153230 non-null  int64  
 7   MOL_WEIGHT                     153230 non-null  float64
 8   HEAVY_ATOM_COUNT               153230 non-null  int64  
 9   NUM_RINGS                      153230 non-null  int64  
 10  NUM_ROTATABLE_BONDS            153230 non-null  int64  
 11  TPSA                           153230 non-null  float64
 12  HBOND_DONORS                  

In [62]:
from sklearn.preprocessing import MinMaxScaler

# Example: Normalize a specific feature using Min-Max scaling
scaler = MinMaxScaler()

columns_to_modify = [
    'FR_Al_COO', 'FR_Al_OH', 'FR_COO',
    'MOL_WEIGHT', 'HEAVY_ATOM_COUNT', 'NUM_RINGS',
    'NUM_ROTATABLE_BONDS', 'TPSA', 'HBOND_DONORS',
    'HBOND_ACCEPTORS', 'LOGP', 'AROMATIC_PROPORTION',
    'NUM_ATOMS','NUM_VALENCE_ELECTRONS','FRACTION_SP3',
    'FRACTIONAL_POLAR_SURFACE_AREA','NUM_SATURATED_RINGS',
    'NUM_AMIDE_BONDS','NUM_AROMATIC_RINGS','NUM_ALIPHATIC_RINGS'
]

for column in columns_to_modify:
    normalized_column_name = f'{column}_NORMALIZED'
    train_df_md[normalized_column_name] = scaler.fit_transform(train_df_md[[column]])

train_df_md.drop(columns=columns_to_modify, inplace=True)

train_df_md.describe()

Unnamed: 0,INDEX,ACTIVE,FR_Al_COO_NORMALIZED,FR_Al_OH_NORMALIZED,FR_COO_NORMALIZED,MOL_WEIGHT_NORMALIZED,HEAVY_ATOM_COUNT_NORMALIZED,NUM_RINGS_NORMALIZED,NUM_ROTATABLE_BONDS_NORMALIZED,TPSA_NORMALIZED,...,LOGP_NORMALIZED,AROMATIC_PROPORTION_NORMALIZED,NUM_ATOMS_NORMALIZED,NUM_VALENCE_ELECTRONS_NORMALIZED,FRACTION_SP3_NORMALIZED,FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED,NUM_SATURATED_RINGS_NORMALIZED,NUM_AMIDE_BONDS_NORMALIZED,NUM_AROMATIC_RINGS_NORMALIZED,NUM_ALIPHATIC_RINGS_NORMALIZED
count,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,...,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0,153230.0
mean,76615.5,0.011643,0.006505,0.002881,0.010755,0.182926,0.194905,0.140531,0.101963,0.082911,...,0.61361,0.29657,0.194905,0.180693,0.29657,0.29657,0.021993,0.052051,0.184533,0.035084
std,44233.835211,0.107271,0.037454,0.01536,0.047587,0.046533,0.049774,0.049994,0.049995,0.032602,...,0.036519,0.181896,0.049774,0.045439,0.181896,0.181896,0.035096,0.054502,0.080538,0.041588
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,38308.25,0.0,0.0,0.0,0.0,0.151651,0.165217,0.095238,0.06383,0.061499,...,0.591226,0.16,0.165217,0.151057,0.16,0.16,0.0,0.0,0.166667,0.0
50%,76615.5,0.0,0.0,0.0,0.0,0.180549,0.191304,0.142857,0.106383,0.0805,...,0.614756,0.274295,0.191304,0.178248,0.274295,0.274295,0.0,0.071429,0.166667,0.047619
75%,114922.75,0.0,0.0,0.0,0.0,0.212258,0.226087,0.190476,0.12766,0.10172,...,0.637372,0.407407,0.226087,0.208459,0.407407,0.407407,0.047619,0.071429,0.25,0.047619
max,153230.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [63]:
target_variable = 'ACTIVE'
correlation_matrix = train_df_md.corr()
# simply use pearson correlation coefficient to measure linear correlation
print(correlation_matrix[target_variable].sort_values(ascending=False))

correlation_upper_threshold_1 = 0.01
correlation_lower_threshold_1 = - 0.01
columns_to_drop_1 = correlation_matrix[(correlation_matrix[target_variable] < correlation_upper_threshold_1) &
                                      (correlation_matrix[target_variable] > correlation_lower_threshold_1)].index
columns_to_drop_1 = [col for col in columns_to_drop_1 if col != 'INDEX']

columns_to_drop_1 = pd.Index(columns_to_drop_1)
print(columns_to_drop_1)

  correlation_matrix = train_df_md.corr()


ACTIVE                                      1.000000
LOGP_NORMALIZED                             0.045920
NUM_AROMATIC_RINGS_NORMALIZED               0.036922
NUM_RINGS_NORMALIZED                        0.032692
MOL_WEIGHT_NORMALIZED                       0.019481
HEAVY_ATOM_COUNT_NORMALIZED                 0.017444
NUM_ATOMS_NORMALIZED                        0.017444
NUM_VALENCE_ELECTRONS_NORMALIZED            0.011932
FR_Al_OH_NORMALIZED                         0.006992
INDEX                                       0.002599
HBOND_ACCEPTORS_NORMALIZED                  0.000511
NUM_ALIPHATIC_RINGS_NORMALIZED             -0.001558
HBOND_DONORS_NORMALIZED                    -0.003886
TPSA_NORMALIZED                            -0.008226
NUM_ROTATABLE_BONDS_NORMALIZED             -0.013421
FR_Al_COO_NORMALIZED                       -0.013653
FR_COO_NORMALIZED                          -0.015581
NUM_SATURATED_RINGS_NORMALIZED             -0.017082
NUM_AMIDE_BONDS_NORMALIZED                 -0.

In [64]:
from scipy.stats import spearmanr

# Specify the features you want to evaluate
normalized_features_to_evaluate = [
    'FR_Al_COO_NORMALIZED', 'FR_Al_OH_NORMALIZED', 'FR_COO_NORMALIZED',
    'MOL_WEIGHT_NORMALIZED', 'HEAVY_ATOM_COUNT_NORMALIZED', 'NUM_RINGS_NORMALIZED',
    'NUM_ROTATABLE_BONDS_NORMALIZED', 'TPSA_NORMALIZED', 'HBOND_DONORS_NORMALIZED',
    'HBOND_ACCEPTORS_NORMALIZED', 'LOGP_NORMALIZED', 'AROMATIC_PROPORTION_NORMALIZED',
    'NUM_ATOMS_NORMALIZED','NUM_VALENCE_ELECTRONS_NORMALIZED','FRACTION_SP3_NORMALIZED',
    'FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED','NUM_SATURATED_RINGS_NORMALIZED',
    'NUM_AMIDE_BONDS_NORMALIZED'
]

# Convert 'ACTIVE' column to numeric (0 or 1)
train_df_md['ACTIVE_NUMERIC'] = train_df_md['ACTIVE'].astype(int)

# Create a DataFrame with the specified features and the target variable
features_and_target = train_df_md[normalized_features_to_evaluate + ['ACTIVE_NUMERIC']]

# Calculate Spearman's rank correlation for each feature
spearman_corr = features_and_target.corr(method='spearman')['ACTIVE_NUMERIC'].sort_values(ascending=False)

# Display the correlation results
print("Spearman's Rank Correlation for the specified features:")
print(spearman_corr)

correlation_upper_threshold_2 = 0.01
correlation_lower_threshold_2 = - 0.01

columns_to_drop_2 = spearman_corr[
    (spearman_corr < correlation_upper_threshold_2) &
    (spearman_corr > correlation_lower_threshold_2)
].index

print(columns_to_drop_2)

Spearman's Rank Correlation for the specified features:
ACTIVE_NUMERIC                              1.000000
LOGP_NORMALIZED                             0.046828
NUM_RINGS_NORMALIZED                        0.032037
MOL_WEIGHT_NORMALIZED                       0.016465
HEAVY_ATOM_COUNT_NORMALIZED                 0.015329
NUM_ATOMS_NORMALIZED                        0.015328
NUM_VALENCE_ELECTRONS_NORMALIZED            0.009290
HBOND_ACCEPTORS_NORMALIZED                 -0.000455
FR_Al_OH_NORMALIZED                        -0.002118
HBOND_DONORS_NORMALIZED                    -0.009116
TPSA_NORMALIZED                            -0.009631
FR_Al_COO_NORMALIZED                       -0.015191
NUM_ROTATABLE_BONDS_NORMALIZED             -0.016600
FR_COO_NORMALIZED                          -0.017196
NUM_SATURATED_RINGS_NORMALIZED             -0.020774
AROMATIC_PROPORTION_NORMALIZED             -0.032688
FRACTION_SP3_NORMALIZED                    -0.032688
FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED   

In [65]:
from scipy.stats import kendalltau

# Convert 'ACTIVE' column to numeric (0 or 1)
train_df_md['ACTIVE_NUMERIC'] = train_df_md['ACTIVE'].astype(int)

# Create a DataFrame with the specified normalized features and the target variable
normalized_features_and_target = train_df_md[normalized_features_to_evaluate + ['ACTIVE_NUMERIC']]

# Calculate Kendall's Tau for each normalized feature
kendall_tau_normalized = normalized_features_and_target.apply(lambda x: kendalltau(x, normalized_features_and_target['ACTIVE_NUMERIC']).correlation).sort_values(ascending=False)

# Display the Kendall's Tau results for normalized features
print("Kendall's Tau for the specified normalized features:")
print(kendall_tau_normalized)

correlation_upper_threshold_3 = 0.01
correlation_lower_threshold_3 = - 0.01

columns_to_drop_3 = spearman_corr[
    (spearman_corr < correlation_upper_threshold_3) &
    (spearman_corr > correlation_lower_threshold_3)
].index

print(columns_to_drop_3)

Kendall's Tau for the specified normalized features:
ACTIVE_NUMERIC                              1.000000
LOGP_NORMALIZED                             0.038235
NUM_RINGS_NORMALIZED                        0.029355
MOL_WEIGHT_NORMALIZED                       0.013444
HEAVY_ATOM_COUNT_NORMALIZED                 0.012832
NUM_ATOMS_NORMALIZED                        0.012831
NUM_VALENCE_ELECTRONS_NORMALIZED            0.007660
HBOND_ACCEPTORS_NORMALIZED                 -0.000398
FR_Al_OH_NORMALIZED                        -0.002113
TPSA_NORMALIZED                            -0.007867
HBOND_DONORS_NORMALIZED                    -0.008509
NUM_ROTATABLE_BONDS_NORMALIZED             -0.014382
FR_Al_COO_NORMALIZED                       -0.015181
FR_COO_NORMALIZED                          -0.017178
NUM_SATURATED_RINGS_NORMALIZED             -0.020178
AROMATIC_PROPORTION_NORMALIZED             -0.026861
FRACTION_SP3_NORMALIZED                    -0.026861
FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED   -0.

In [66]:
from scipy.stats import pointbiserialr

# Convert 'ACTIVE' column to numeric (0 or 1)
train_df_md['ACTIVE_NUMERIC'] = train_df_md['ACTIVE'].astype(int)

# Create a DataFrame with the specified normalized features and the target variable
normalized_features_and_target = train_df_md[normalized_features_to_evaluate + ['ACTIVE_NUMERIC']]

# Calculate Point-Biserial Correlation for each normalized feature
point_biserial_corr = normalized_features_and_target.apply(lambda x: pointbiserialr(x, normalized_features_and_target['ACTIVE_NUMERIC']).correlation).sort_values(ascending=False)

# Display the Point-Biserial Correlation results for normalized features
print("Point-Biserial Correlation for the specified normalized features:")
print(point_biserial_corr)

correlation_upper_threshold_4 = 0.01
correlation_lower_threshold_4 = - 0.01

columns_to_drop_4 = spearman_corr[
    (spearman_corr < correlation_upper_threshold_4) &
    (spearman_corr > correlation_lower_threshold_4)
].index

print(columns_to_drop_4)

Point-Biserial Correlation for the specified normalized features:
ACTIVE_NUMERIC                              1.000000
LOGP_NORMALIZED                             0.045920
NUM_RINGS_NORMALIZED                        0.032692
MOL_WEIGHT_NORMALIZED                       0.019481
HEAVY_ATOM_COUNT_NORMALIZED                 0.017444
NUM_ATOMS_NORMALIZED                        0.017444
NUM_VALENCE_ELECTRONS_NORMALIZED            0.011932
FR_Al_OH_NORMALIZED                         0.006992
HBOND_ACCEPTORS_NORMALIZED                  0.000511
HBOND_DONORS_NORMALIZED                    -0.003886
TPSA_NORMALIZED                            -0.008226
NUM_ROTATABLE_BONDS_NORMALIZED             -0.013421
FR_Al_COO_NORMALIZED                       -0.013653
FR_COO_NORMALIZED                          -0.015581
NUM_SATURATED_RINGS_NORMALIZED             -0.017082
NUM_AMIDE_BONDS_NORMALIZED                 -0.030539
AROMATIC_PROPORTION_NORMALIZED             -0.031083
FRACTION_SP3_NORMALIZED          

In [67]:
columns_to_drop = (
    columns_to_drop_1
    .intersection(columns_to_drop_2)
    .intersection(columns_to_drop_3)
    .intersection(columns_to_drop_4)
)
print(columns_to_drop)

train_df_md = train_df_md.drop(columns=columns_to_drop)
train_df_md.drop(columns='ACTIVE_NUMERIC', inplace=True)

train_df_md.info()

Index(['FR_Al_OH_NORMALIZED', 'TPSA_NORMALIZED', 'HBOND_DONORS_NORMALIZED',
       'HBOND_ACCEPTORS_NORMALIZED'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153230 entries, 0 to 153229
Data columns (total 20 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   INDEX                                     153230 non-null  int64  
 1   SMILES                                    153230 non-null  object 
 2   ACTIVE                                    153230 non-null  float64
 3   MOLECULE                                  153230 non-null  object 
 4   FR_Al_COO_NORMALIZED                      153230 non-null  float64
 5   FR_COO_NORMALIZED                         153230 non-null  float64
 6   MOL_WEIGHT_NORMALIZED                     153230 non-null  float64
 7   HEAVY_ATOM_COUNT_NORMALIZED               153230 non-null  float64
 8   NUM_RINGS_NORMALIZED        

In [68]:
train_df_md

Unnamed: 0,INDEX,SMILES,ACTIVE,MOLECULE,FR_Al_COO_NORMALIZED,FR_COO_NORMALIZED,MOL_WEIGHT_NORMALIZED,HEAVY_ATOM_COUNT_NORMALIZED,NUM_RINGS_NORMALIZED,NUM_ROTATABLE_BONDS_NORMALIZED,LOGP_NORMALIZED,AROMATIC_PROPORTION_NORMALIZED,NUM_ATOMS_NORMALIZED,NUM_VALENCE_ELECTRONS_NORMALIZED,FRACTION_SP3_NORMALIZED,FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED,NUM_SATURATED_RINGS_NORMALIZED,NUM_AMIDE_BONDS_NORMALIZED,NUM_AROMATIC_RINGS_NORMALIZED,NUM_ALIPHATIC_RINGS_NORMALIZED
0,1,CCCc1sc(N)nc1-c1ccc(C)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aace0>,0.0,0.0,0.115366,0.121739,0.095238,0.063830,0.635084,0.307692,0.121739,0.114804,0.307692,0.307692,0.000000,0.000000,0.166667,0.000000
1,2,CCCCNC(=O)Cn1cnc2c(cnn2-c2ccc(C)c(C)c2)c1=O,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aac00>,0.0,0.0,0.185183,0.208696,0.142857,0.127660,0.591155,0.368421,0.208696,0.193353,0.368421,0.368421,0.000000,0.071429,0.250000,0.000000
2,3,O=C(NCCC1=CCCCC1)C1CCN(S(=O)(=O)N2CCOCC2)CC1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aac70>,0.0,0.0,0.203645,0.208696,0.142857,0.127660,0.567354,0.833333,0.208696,0.211480,0.833333,0.833333,0.095238,0.071429,0.000000,0.142857
3,4,N#Cc1c(-c2c(Cl)cccc2Cl)noc1/C=C/Nc1ccc(C(=O)O)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aaf10>,0.0,0.2,0.211611,0.217391,0.142857,0.106383,0.682154,0.000000,0.217391,0.190332,0.000000,0.000000,0.000000,0.000000,0.250000,0.000000
4,5,Cc1cc(C)cc(OCC(=O)Nc2ccc(F)cc2)c1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76ab530>,0.0,0.0,0.139015,0.156522,0.095238,0.085106,0.629566,0.187500,0.156522,0.145015,0.187500,0.187500,0.000000,0.071429,0.166667,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153225,153226,O=C(COC(=O)Cc1ccsc1)NC1CCCCCC1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657de00>,0.0,0.0,0.151705,0.156522,0.095238,0.106383,0.607079,0.600000,0.156522,0.154079,0.600000,0.600000,0.047619,0.071429,0.083333,0.047619
153226,153227,CCCCCCN(C(=O)CCCCCN1C(=O)NC(c2cccc([N+](=O)[O-...,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657de70>,0.0,0.0,0.500154,0.547826,0.190476,0.553191,0.749072,0.458333,0.547826,0.516616,0.458333,0.458333,0.000000,0.142857,0.250000,0.047619
153227,153228,O=C(Cn1cnc([N+](=O)[O-])c1)NCc1ccc(F)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657dee0>,0.0,0.0,0.141878,0.156522,0.095238,0.106383,0.566346,0.166667,0.156522,0.145015,0.166667,0.166667,0.000000,0.071429,0.166667,0.000000
153228,153229,CCOC(=O)CS(=O)(=O)CC(=O)Nc1ccc([N+](=O)[O-])cc1OC,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657df50>,0.0,0.0,0.189149,0.191304,0.047619,0.170213,0.545580,0.384615,0.191304,0.187311,0.384615,0.384615,0.000000,0.071429,0.083333,0.000000


In [69]:
train_df_md.drop(columns='MOLECULE', inplace=True)

In [70]:
# from google.colab import drive

# drive.mount('/content/gdrive')

# Specify the path for saving the DataFrame as a CSV file
# save_path = '/content/gdrive/MyDrive/ID2214/training_smiles_molecule.csv'

# Use to_csv method to save the DataFrame to a CSV file
# train_df_md.to_csv(save_path, index=False)

# Print the path where the DataFrame is saved
# print('DataFrame saved to:', save_path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
DataFrame saved to: /content/gdrive/MyDrive/ID2214/training_smiles_molecule.csv


## Fingerprint Representation

In [71]:
# Example: Generate Morgan fingerprints
train_df_finger['MORGAN_FP'] = train_df_finger['MOLECULE'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=128))

# breakdown fingerprints vector to numberic feature groups
fp_df = pd.DataFrame()
fp_df['INDEX'] = train_df_finger['INDEX']

for i in range(128):
    fp_df[f'MORGAN_FP_{i+1}'] = train_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)

fp_df.head()


  fp_df[f'MORGAN_FP_{i+1}'] = train_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df[f'MORGAN_FP_{i+1}'] = train_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df[f'MORGAN_FP_{i+1}'] = train_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df[f'MORGAN_FP_{i+1}'] = train_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df[f'MORGAN_FP_{i+1}'] = train_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df[f'MORGAN_FP_{i+1}'] = train_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df[f'MORGAN_FP_{i+1}'] = train_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df[f'MORGAN_FP_{i+1}'] = train_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df[f'MORGAN_FP_{i+1}'] = train_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df[f'MORGAN_FP_{i+1}'] = train_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)


Unnamed: 0,INDEX,MORGAN_FP_1,MORGAN_FP_2,MORGAN_FP_3,MORGAN_FP_4,MORGAN_FP_5,MORGAN_FP_6,MORGAN_FP_7,MORGAN_FP_8,MORGAN_FP_9,...,MORGAN_FP_119,MORGAN_FP_120,MORGAN_FP_121,MORGAN_FP_122,MORGAN_FP_123,MORGAN_FP_124,MORGAN_FP_125,MORGAN_FP_126,MORGAN_FP_127,MORGAN_FP_128
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,2,1,0,0,1,0,0,0,0,0,...,0,1,1,1,1,0,0,1,0,1
2,3,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
3,4,1,0,0,0,0,1,1,0,0,...,1,0,1,1,1,0,0,1,0,0
4,5,1,0,1,0,0,1,1,0,1,...,0,0,1,0,0,0,0,1,0,0


In [72]:
train_df_finger = pd.merge(train_df_finger, fp_df, on='INDEX', how='inner')
train_df_finger

Unnamed: 0,INDEX,SMILES,ACTIVE,MOLECULE,MORGAN_FP,MORGAN_FP_1,MORGAN_FP_2,MORGAN_FP_3,MORGAN_FP_4,MORGAN_FP_5,...,MORGAN_FP_119,MORGAN_FP_120,MORGAN_FP_121,MORGAN_FP_122,MORGAN_FP_123,MORGAN_FP_124,MORGAN_FP_125,MORGAN_FP_126,MORGAN_FP_127,MORGAN_FP_128
0,1,CCCc1sc(N)nc1-c1ccc(C)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aace0>,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...",1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2,CCCCNC(=O)Cn1cnc2c(cnn2-c2ccc(C)c(C)c2)c1=O,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aac00>,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",1,0,0,1,0,...,0,1,1,1,1,0,0,1,0,1
2,3,O=C(NCCC1=CCCCC1)C1CCN(S(=O)(=O)N2CCOCC2)CC1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aac70>,"[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, ...",1,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
3,4,N#Cc1c(-c2c(Cl)cccc2Cl)noc1/C=C/Nc1ccc(C(=O)O)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76aaf10>,"[1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...",1,0,0,0,0,...,1,0,1,1,1,0,0,1,0,0
4,5,Cc1cc(C)cc(OCC(=O)Nc2ccc(F)cc2)c1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d76ab530>,"[1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, ...",1,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153225,153226,O=C(COC(=O)Cc1ccsc1)NC1CCCCCC1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657de00>,"[1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, ...",1,0,1,1,1,...,0,1,0,0,1,1,0,1,0,0
153226,153227,CCCCCCN(C(=O)CCCCCN1C(=O)NC(c2cccc([N+](=O)[O-...,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657de70>,"[1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, ...",1,1,0,1,0,...,0,1,0,1,1,1,0,1,0,1
153227,153228,O=C(Cn1cnc([N+](=O)[O-])c1)NCc1ccc(F)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657dee0>,"[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, ...",1,0,1,0,0,...,0,0,0,0,1,0,0,1,1,1
153228,153229,CCOC(=O)CS(=O)(=O)CC(=O)Nc1ccc([N+](=O)[O-])cc1OC,0.0,<rdkit.Chem.rdchem.Mol object at 0x7994d657df50>,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",1,0,0,0,0,...,0,0,0,1,1,1,0,1,0,0


In [73]:
train_df_finger.drop(columns='MOLECULE', inplace=True)

In [79]:
# from google.colab import drive

# drive.mount('/content/gdrive')

# Specify the path for saving the DataFrame as a CSV file
# save_path = '/content/gdrive/MyDrive/ID2214/training_smiles_fingerprint.csv'

# Use to_csv method to save the DataFrame to a CSV file
# train_df_finger.to_csv(save_path, index=False)

# Print the path where the DataFrame is saved
# print('DataFrame saved to:', save_path)

DataFrame saved to: /content/gdrive/MyDrive/ID2214/training_smiles_fingerprint.csv


## Combined Representation

In [75]:
train_df_combined = pd.merge(train_df_md, train_df_finger, on='INDEX', how='inner')

In [45]:
train_df_combined

Unnamed: 0,INDEX,SMILES_x,ACTIVE_x,FR_Al_COO_NORMALIZED,FR_COO_NORMALIZED,MOL_WEIGHT_NORMALIZED,HEAVY_ATOM_COUNT_NORMALIZED,NUM_RINGS_NORMALIZED,NUM_ROTATABLE_BONDS_NORMALIZED,LOGP_NORMALIZED,...,MORGAN_FP_119,MORGAN_FP_120,MORGAN_FP_121,MORGAN_FP_122,MORGAN_FP_123,MORGAN_FP_124,MORGAN_FP_125,MORGAN_FP_126,MORGAN_FP_127,MORGAN_FP_128
0,1,CCCc1sc(N)nc1-c1ccc(C)cc1,0.0,0.0,0.0,0.115366,0.121739,0.095238,0.063830,0.635084,...,0,0,0,0,1,0,0,0,0,0
1,2,CCCCNC(=O)Cn1cnc2c(cnn2-c2ccc(C)c(C)c2)c1=O,0.0,0.0,0.0,0.185183,0.208696,0.142857,0.127660,0.591155,...,0,1,1,1,1,0,0,1,0,1
2,3,O=C(NCCC1=CCCCC1)C1CCN(S(=O)(=O)N2CCOCC2)CC1,0.0,0.0,0.0,0.203645,0.208696,0.142857,0.127660,0.567354,...,0,0,0,0,0,1,0,1,0,0
3,4,N#Cc1c(-c2c(Cl)cccc2Cl)noc1/C=C/Nc1ccc(C(=O)O)cc1,0.0,0.0,0.2,0.211611,0.217391,0.142857,0.106383,0.682154,...,1,0,1,1,1,0,0,1,0,0
4,5,Cc1cc(C)cc(OCC(=O)Nc2ccc(F)cc2)c1,0.0,0.0,0.0,0.139015,0.156522,0.095238,0.085106,0.629566,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153225,153226,O=C(COC(=O)Cc1ccsc1)NC1CCCCCC1,0.0,0.0,0.0,0.151705,0.156522,0.095238,0.106383,0.607079,...,0,1,0,0,1,1,0,1,0,0
153226,153227,CCCCCCN(C(=O)CCCCCN1C(=O)NC(c2cccc([N+](=O)[O-...,0.0,0.0,0.0,0.500154,0.547826,0.190476,0.553191,0.749072,...,0,1,0,1,1,1,0,1,0,1
153227,153228,O=C(Cn1cnc([N+](=O)[O-])c1)NCc1ccc(F)cc1,0.0,0.0,0.0,0.141878,0.156522,0.095238,0.106383,0.566346,...,0,0,0,0,1,0,0,1,1,1
153228,153229,CCOC(=O)CS(=O)(=O)CC(=O)Nc1ccc([N+](=O)[O-])cc1OC,0.0,0.0,0.0,0.189149,0.191304,0.047619,0.170213,0.545580,...,0,0,0,1,1,1,0,1,0,0


In [77]:
train_df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 153230 entries, 0 to 153229
Columns: 150 entries, INDEX to MORGAN_FP_128
dtypes: float64(18), int64(129), object(3)
memory usage: 176.5+ MB


In [80]:
# from google.colab import drive

# drive.mount('/content/gdrive')

# Specify the path for saving the DataFrame as a CSV file
# save_path = '/content/gdrive/MyDrive/ID2214/training_smiles_combined.csv'

# Use to_csv method to save the DataFrame to a CSV file
# train_df_combined.to_csv(save_path, index=False)

# Print the path where the DataFrame is saved
# print('DataFrame saved to:', save_path)

DataFrame saved to: /content/gdrive/MyDrive/ID2214/training_smiles_combined.csv


# Choose Learning Algorithms

In [4]:
from google.colab import drive

drive.mount('/content/gdrive')

load_path_1 = '/content/gdrive/MyDrive/ID2214/training_smiles_molecule.csv'
load_path_2 = '/content/gdrive/MyDrive/ID2214/training_smiles_fingerprint.csv'
load_path_3 = '/content/gdrive/MyDrive/ID2214/training_smiles_combined.csv'
train_df_md = pd.read_csv(load_path_1)
train_df_finger = pd.read_csv(load_path_2)
train_df_combined = pd.read_csv(load_path_3)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [11]:
# TODO: Replace svc with Multinomial Naïve Bayes

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

## Random Forest

### Molecular Descriptors Representation

In [14]:
columns_list = train_df_md.columns.tolist()
columns_list.remove('INDEX')
columns_list.remove('SMILES')
columns_list.remove('ACTIVE')

print(columns_list)

['FR_Al_COO_NORMALIZED', 'FR_COO_NORMALIZED', 'MOL_WEIGHT_NORMALIZED', 'HEAVY_ATOM_COUNT_NORMALIZED', 'NUM_RINGS_NORMALIZED', 'NUM_ROTATABLE_BONDS_NORMALIZED', 'LOGP_NORMALIZED', 'AROMATIC_PROPORTION_NORMALIZED', 'NUM_ATOMS_NORMALIZED', 'NUM_VALENCE_ELECTRONS_NORMALIZED', 'FRACTION_SP3_NORMALIZED', 'FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED', 'NUM_SATURATED_RINGS_NORMALIZED', 'NUM_AMIDE_BONDS_NORMALIZED', 'NUM_AROMATIC_RINGS_NORMALIZED', 'NUM_ALIPHATIC_RINGS_NORMALIZED']


In [15]:
from sklearn.model_selection import train_test_split

X = train_df_md[columns_list]
y = train_df_md['ACTIVE']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [67]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122584 entries, 143510 to 121958
Data columns (total 16 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   FR_Al_COO_NORMALIZED                      122584 non-null  float64
 1   FR_COO_NORMALIZED                         122584 non-null  float64
 2   MOL_WEIGHT_NORMALIZED                     122584 non-null  float64
 3   HEAVY_ATOM_COUNT_NORMALIZED               122584 non-null  float64
 4   NUM_RINGS_NORMALIZED                      122584 non-null  float64
 5   NUM_ROTATABLE_BONDS_NORMALIZED            122584 non-null  float64
 6   LOGP_NORMALIZED                           122584 non-null  float64
 7   AROMATIC_PROPORTION_NORMALIZED            122584 non-null  float64
 8   NUM_ATOMS_NORMALIZED                      122584 non-null  float64
 9   NUM_VALENCE_ELECTRONS_NORMALIZED          122584 non-null  float64
 10  FRACTION_SP3_NO

In [23]:
from sklearn.model_selection import GridSearchCV

# Example: Tune hyperparameters for Random Forest
# change the params several times to find the best
rf_params = {
    'n_estimators': [200, 400, 500],
    'max_depth': [5, 10, 15],
}

# Perform Grid Search
rf_grid = GridSearchCV(RandomForestClassifier(), param_grid=rf_params, scoring='roc_auc', cv=3, verbose=2)
rf_grid.fit(X_train, y_train)

# Get the best hyperparameters
best_rf_params_md = rf_grid.best_params_

# Get all results
results_df = pd.DataFrame(rf_grid.cv_results_)

# Display the results
print(results_df[['params', 'mean_test_score', 'rank_test_score']])

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] END ......................max_depth=5, n_estimators=200; total time=  16.5s
[CV] END ......................max_depth=5, n_estimators=200; total time=  15.0s
[CV] END ......................max_depth=5, n_estimators=200; total time=  15.8s
[CV] END ......................max_depth=5, n_estimators=400; total time=  28.9s
[CV] END ......................max_depth=5, n_estimators=400; total time=  30.3s
[CV] END ......................max_depth=5, n_estimators=400; total time=  30.4s
[CV] END ......................max_depth=5, n_estimators=500; total time=  43.6s
[CV] END ......................max_depth=5, n_estimators=500; total time=  37.4s
[CV] END ......................max_depth=5, n_estimators=500; total time=  37.3s
[CV] END .....................max_depth=10, n_estimators=200; total time=  24.8s
[CV] END .....................max_depth=10, n_estimators=200; total time=  26.7s
[CV] END .....................max_depth=10, n_est

In [24]:
print(best_rf_params_md)

{'max_depth': 10, 'n_estimators': 500}


In [25]:
rf_model = RandomForestClassifier(n_estimators=500,
 max_depth=best_rf_params_md['max_depth'])
rf_model.fit(X_train, y_train)

In [26]:
from sklearn.metrics import roc_auc_score

# Example: Evaluate Random Forest on the validation set
y_val_pred_rf = rf_model.predict_proba(X_val)[:, 1]
auc_rf = roc_auc_score(y_val, y_val_pred_rf)

print(auc_rf)

0.6970617873100708


The AUC (Area Under the ROC Curve) obtained during the cross-validation performed by GridSearchCV is typically a more reliable estimate of the model's performance compared to the AUC calculated on a single validation set.

### Combined Representation

In [16]:
columns_list = train_df_combined.columns.tolist()
columns_list.remove('INDEX')
columns_list.remove('SMILES_x')
columns_list.remove('ACTIVE_x')
columns_list.remove('SMILES_y')
columns_list.remove('ACTIVE_y')
columns_list.remove('MORGAN_FP')

print(columns_list)

['FR_Al_COO_NORMALIZED', 'FR_COO_NORMALIZED', 'MOL_WEIGHT_NORMALIZED', 'HEAVY_ATOM_COUNT_NORMALIZED', 'NUM_RINGS_NORMALIZED', 'NUM_ROTATABLE_BONDS_NORMALIZED', 'LOGP_NORMALIZED', 'AROMATIC_PROPORTION_NORMALIZED', 'NUM_ATOMS_NORMALIZED', 'NUM_VALENCE_ELECTRONS_NORMALIZED', 'FRACTION_SP3_NORMALIZED', 'FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED', 'NUM_SATURATED_RINGS_NORMALIZED', 'NUM_AMIDE_BONDS_NORMALIZED', 'NUM_AROMATIC_RINGS_NORMALIZED', 'NUM_ALIPHATIC_RINGS_NORMALIZED', 'MORGAN_FP_1', 'MORGAN_FP_2', 'MORGAN_FP_3', 'MORGAN_FP_4', 'MORGAN_FP_5', 'MORGAN_FP_6', 'MORGAN_FP_7', 'MORGAN_FP_8', 'MORGAN_FP_9', 'MORGAN_FP_10', 'MORGAN_FP_11', 'MORGAN_FP_12', 'MORGAN_FP_13', 'MORGAN_FP_14', 'MORGAN_FP_15', 'MORGAN_FP_16', 'MORGAN_FP_17', 'MORGAN_FP_18', 'MORGAN_FP_19', 'MORGAN_FP_20', 'MORGAN_FP_21', 'MORGAN_FP_22', 'MORGAN_FP_23', 'MORGAN_FP_24', 'MORGAN_FP_25', 'MORGAN_FP_26', 'MORGAN_FP_27', 'MORGAN_FP_28', 'MORGAN_FP_29', 'MORGAN_FP_30', 'MORGAN_FP_31', 'MORGAN_FP_32', 'MORGAN_FP_33', 'MORGA

In [17]:
from sklearn.model_selection import train_test_split

X = train_df_combined[columns_list]
y = train_df_combined['ACTIVE_x']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122584 entries, 143510 to 121958
Columns: 144 entries, FR_Al_COO_NORMALIZED to MORGAN_FP_128
dtypes: float64(16), int64(128)
memory usage: 135.6 MB


In [19]:
from sklearn.model_selection import GridSearchCV

# Example: Tune hyperparameters for Random Forest
# change the params several times to find the best
rf_params = {
    'n_estimators': [400, 500, 600],
    'max_depth': [10, 15, 20],
}

# Perform Grid Search
rf_grid = GridSearchCV(RandomForestClassifier(), param_grid=rf_params, scoring='roc_auc', cv=3, verbose=2)
rf_grid.fit(X_train, y_train)

# Get the best hyperparameters
best_rf_params_c = rf_grid.best_params_

# Get all results
results_df = pd.DataFrame(rf_grid.cv_results_)

# Display the results
print(results_df[['params', 'mean_test_score', 'rank_test_score']])

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] END .....................max_depth=10, n_estimators=400; total time= 1.2min
[CV] END .....................max_depth=10, n_estimators=400; total time= 1.1min
[CV] END .....................max_depth=10, n_estimators=400; total time= 1.1min
[CV] END .....................max_depth=10, n_estimators=500; total time= 1.3min
[CV] END .....................max_depth=10, n_estimators=500; total time= 1.3min
[CV] END .....................max_depth=10, n_estimators=500; total time= 1.4min
[CV] END .....................max_depth=10, n_estimators=600; total time= 1.7min
[CV] END .....................max_depth=10, n_estimators=600; total time= 1.7min
[CV] END .....................max_depth=10, n_estimators=600; total time= 1.7min
[CV] END .....................max_depth=15, n_estimators=400; total time= 1.5min
[CV] END .....................max_depth=15, n_estimators=400; total time= 1.5min
[CV] END .....................max_depth=15, n_est

In [29]:
print(best_rf_params_c)

{'max_depth': 15, 'n_estimators': 600}


In [21]:
rf_model = RandomForestClassifier(n_estimators=500,
 max_depth=best_rf_params_c['max_depth'])
rf_model.fit(X_train, y_train)

In [22]:
from sklearn.metrics import roc_auc_score

# Example: Evaluate Random Forest on the validation set
y_val_pred_rf = rf_model.predict_proba(X_val)[:, 1]
auc_rf = roc_auc_score(y_val, y_val_pred_rf)

print(auc_rf)

0.7618966166942699


The AUC (Area Under the ROC Curve) obtained during the cross-validation performed by GridSearchCV is typically a more reliable estimate of the model's performance compared to the AUC calculated on a single validation set.

## Multinomial Naïve Bayes

### Molecular Descriptors Representation

In [47]:
columns_list = train_df_md.columns.tolist()
columns_list.remove('INDEX')
columns_list.remove('SMILES')
columns_list.remove('ACTIVE')

print(columns_list)

['FR_Al_COO_NORMALIZED', 'FR_COO_NORMALIZED', 'MOL_WEIGHT_NORMALIZED', 'HEAVY_ATOM_COUNT_NORMALIZED', 'NUM_RINGS_NORMALIZED', 'NUM_ROTATABLE_BONDS_NORMALIZED', 'LOGP_NORMALIZED', 'AROMATIC_PROPORTION_NORMALIZED', 'NUM_ATOMS_NORMALIZED', 'NUM_VALENCE_ELECTRONS_NORMALIZED', 'FRACTION_SP3_NORMALIZED', 'FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED', 'NUM_SATURATED_RINGS_NORMALIZED', 'NUM_AMIDE_BONDS_NORMALIZED', 'NUM_AROMATIC_RINGS_NORMALIZED', 'NUM_ALIPHATIC_RINGS_NORMALIZED']


In [48]:
from sklearn.model_selection import train_test_split

X = train_df_md[columns_list]
y = train_df_md['ACTIVE']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
from sklearn.model_selection import GridSearchCV

nb_params = {
    'alpha': [0.01, 0.1, 0.5, 1.0],
}

# Perform Grid Search for Multinomial Naive Bayes
nb_grid = GridSearchCV(MultinomialNB(), param_grid=nb_params, scoring='roc_auc', cv=3, verbose=2)
nb_grid.fit(X_train, y_train)

# Get the best hyperparameters
best_nb_params_md = nb_grid.best_params_

# Get all results
nb_results_df = pd.DataFrame(nb_grid.cv_results_)

# Display the results
print(nb_results_df[['params', 'mean_test_score', 'rank_test_score']])

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END .........................................alpha=0.01; total time=   0.1s
[CV] END .........................................alpha=0.01; total time=   0.1s
[CV] END .........................................alpha=0.01; total time=   0.1s
[CV] END ..........................................alpha=0.1; total time=   0.1s
[CV] END ..........................................alpha=0.1; total time=   0.1s
[CV] END ..........................................alpha=0.1; total time=   0.1s
[CV] END ..........................................alpha=0.5; total time=   0.1s
[CV] END ..........................................alpha=0.5; total time=   0.1s
[CV] END ..........................................alpha=0.5; total time=   0.1s
[CV] END ..........................................alpha=1.0; total time=   0.1s
[CV] END ..........................................alpha=1.0; total time=   0.1s
[CV] END ........................................

In [50]:
print(best_nb_params_md)

{'alpha': 0.01}


### Combined Representation

In [53]:
columns_list = train_df_combined.columns.tolist()
columns_list.remove('INDEX')
columns_list.remove('SMILES_x')
columns_list.remove('ACTIVE_x')
columns_list.remove('SMILES_y')
columns_list.remove('ACTIVE_y')
columns_list.remove('MORGAN_FP')

print(columns_list)

['FR_Al_COO_NORMALIZED', 'FR_COO_NORMALIZED', 'MOL_WEIGHT_NORMALIZED', 'HEAVY_ATOM_COUNT_NORMALIZED', 'NUM_RINGS_NORMALIZED', 'NUM_ROTATABLE_BONDS_NORMALIZED', 'LOGP_NORMALIZED', 'AROMATIC_PROPORTION_NORMALIZED', 'NUM_ATOMS_NORMALIZED', 'NUM_VALENCE_ELECTRONS_NORMALIZED', 'FRACTION_SP3_NORMALIZED', 'FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED', 'NUM_SATURATED_RINGS_NORMALIZED', 'NUM_AMIDE_BONDS_NORMALIZED', 'NUM_AROMATIC_RINGS_NORMALIZED', 'NUM_ALIPHATIC_RINGS_NORMALIZED', 'MORGAN_FP_1', 'MORGAN_FP_2', 'MORGAN_FP_3', 'MORGAN_FP_4', 'MORGAN_FP_5', 'MORGAN_FP_6', 'MORGAN_FP_7', 'MORGAN_FP_8', 'MORGAN_FP_9', 'MORGAN_FP_10', 'MORGAN_FP_11', 'MORGAN_FP_12', 'MORGAN_FP_13', 'MORGAN_FP_14', 'MORGAN_FP_15', 'MORGAN_FP_16', 'MORGAN_FP_17', 'MORGAN_FP_18', 'MORGAN_FP_19', 'MORGAN_FP_20', 'MORGAN_FP_21', 'MORGAN_FP_22', 'MORGAN_FP_23', 'MORGAN_FP_24', 'MORGAN_FP_25', 'MORGAN_FP_26', 'MORGAN_FP_27', 'MORGAN_FP_28', 'MORGAN_FP_29', 'MORGAN_FP_30', 'MORGAN_FP_31', 'MORGAN_FP_32', 'MORGAN_FP_33', 'MORGA

In [54]:
from sklearn.model_selection import train_test_split

X = train_df_combined[columns_list]
y = train_df_combined['ACTIVE_x']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
from sklearn.model_selection import GridSearchCV

nb_params = {
    'alpha': [0.01, 0.1, 0.5, 1.0],
}

# Perform Grid Search for Multinomial Naive Bayes
nb_grid = GridSearchCV(MultinomialNB(), param_grid=nb_params, scoring='roc_auc', cv=3, verbose=2)
nb_grid.fit(X_train, y_train)

# Get the best hyperparameters
best_nb_params_c = nb_grid.best_params_

# Get all results
nb_results_df = pd.DataFrame(nb_grid.cv_results_)

# Display the results
print(nb_results_df[['params', 'mean_test_score', 'rank_test_score']])

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END .........................................alpha=0.01; total time=   0.4s
[CV] END .........................................alpha=0.01; total time=   0.4s
[CV] END .........................................alpha=0.01; total time=   1.1s
[CV] END ..........................................alpha=0.1; total time=   0.9s
[CV] END ..........................................alpha=0.1; total time=   0.4s
[CV] END ..........................................alpha=0.1; total time=   0.5s
[CV] END ..........................................alpha=0.5; total time=   0.4s
[CV] END ..........................................alpha=0.5; total time=   0.3s
[CV] END ..........................................alpha=0.5; total time=   0.3s
[CV] END ..........................................alpha=1.0; total time=   0.3s
[CV] END ..........................................alpha=1.0; total time=   0.3s
[CV] END ........................................

In [56]:
print(best_nb_params_c)

{'alpha': 0.01}


## Multilayer Perceptron Classifier

### Molecular Descriptors Representation

In [66]:
columns_list = train_df_md.columns.tolist()
columns_list.remove('INDEX')
columns_list.remove('SMILES')
columns_list.remove('ACTIVE')

print(columns_list)

['FR_Al_COO_NORMALIZED', 'FR_COO_NORMALIZED', 'MOL_WEIGHT_NORMALIZED', 'HEAVY_ATOM_COUNT_NORMALIZED', 'NUM_RINGS_NORMALIZED', 'NUM_ROTATABLE_BONDS_NORMALIZED', 'LOGP_NORMALIZED', 'AROMATIC_PROPORTION_NORMALIZED', 'NUM_ATOMS_NORMALIZED', 'NUM_VALENCE_ELECTRONS_NORMALIZED', 'FRACTION_SP3_NORMALIZED', 'FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED', 'NUM_SATURATED_RINGS_NORMALIZED', 'NUM_AMIDE_BONDS_NORMALIZED', 'NUM_AROMATIC_RINGS_NORMALIZED', 'NUM_ALIPHATIC_RINGS_NORMALIZED']


In [67]:
from sklearn.model_selection import train_test_split

X = train_df_md[columns_list]
y = train_df_md['ACTIVE']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [68]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122584 entries, 143510 to 121958
Data columns (total 16 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   FR_Al_COO_NORMALIZED                      122584 non-null  float64
 1   FR_COO_NORMALIZED                         122584 non-null  float64
 2   MOL_WEIGHT_NORMALIZED                     122584 non-null  float64
 3   HEAVY_ATOM_COUNT_NORMALIZED               122584 non-null  float64
 4   NUM_RINGS_NORMALIZED                      122584 non-null  float64
 5   NUM_ROTATABLE_BONDS_NORMALIZED            122584 non-null  float64
 6   LOGP_NORMALIZED                           122584 non-null  float64
 7   AROMATIC_PROPORTION_NORMALIZED            122584 non-null  float64
 8   NUM_ATOMS_NORMALIZED                      122584 non-null  float64
 9   NUM_VALENCE_ELECTRONS_NORMALIZED          122584 non-null  float64
 10  FRACTION_SP3_NO

In [78]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters for the MLP Classifier
mlp_params = {
    'hidden_layer_sizes': [(24, ), (30, ), (36, )],
    'activation': ['relu'],
    'alpha': [0.0001, 0.001],
    'max_iter': [30, 50, 100],
}


# Perform Grid Search for the MLP Classifier
mlp_grid = GridSearchCV(MLPClassifier(), param_grid=mlp_params, scoring='roc_auc', cv=3, verbose=2)
mlp_grid.fit(X_train, y_train)

# Get the best hyperparameters
best_mlp_params_md = mlp_grid.best_params_

# Get all results
results_df = pd.DataFrame(mlp_grid.cv_results_)

# Display the results
print(results_df[['param_hidden_layer_sizes', 'param_max_iter', 'param_alpha']])
print(results_df[['mean_test_score', 'rank_test_score']])

sorted_results_df = results_df.sort_values(by='rank_test_score')

print(sorted_results_df[['param_hidden_layer_sizes', 'param_max_iter', 'param_alpha']])

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(16,), max_iter=30; total time=   3.6s
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(16,), max_iter=30; total time=   5.4s
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(16,), max_iter=30; total time=   6.5s
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(16,), max_iter=50; total time=   4.3s
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(16,), max_iter=50; total time=   4.2s
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(16,), max_iter=50; total time=   6.4s
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(16,), max_iter=100; total time=   4.7s
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(16,), max_iter=100; total time=   4.3s
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(16,), max_iter=100; total time=   3.8s
[CV] END activation=relu, alpha=0.0001, hidden_layer_s

In [79]:
print(best_mlp_params_md)

{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (24,), 'max_iter': 100}


### Combined Representation

In [6]:
columns_list = train_df_combined.columns.tolist()
columns_list.remove('INDEX')
columns_list.remove('SMILES_x')
columns_list.remove('ACTIVE_x')
columns_list.remove('SMILES_y')
columns_list.remove('ACTIVE_y')
columns_list.remove('MORGAN_FP')

print(columns_list)

['FR_Al_COO_NORMALIZED', 'FR_COO_NORMALIZED', 'MOL_WEIGHT_NORMALIZED', 'HEAVY_ATOM_COUNT_NORMALIZED', 'NUM_RINGS_NORMALIZED', 'NUM_ROTATABLE_BONDS_NORMALIZED', 'LOGP_NORMALIZED', 'AROMATIC_PROPORTION_NORMALIZED', 'NUM_ATOMS_NORMALIZED', 'NUM_VALENCE_ELECTRONS_NORMALIZED', 'FRACTION_SP3_NORMALIZED', 'FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED', 'NUM_SATURATED_RINGS_NORMALIZED', 'NUM_AMIDE_BONDS_NORMALIZED', 'NUM_AROMATIC_RINGS_NORMALIZED', 'NUM_ALIPHATIC_RINGS_NORMALIZED', 'MORGAN_FP_1', 'MORGAN_FP_2', 'MORGAN_FP_3', 'MORGAN_FP_4', 'MORGAN_FP_5', 'MORGAN_FP_6', 'MORGAN_FP_7', 'MORGAN_FP_8', 'MORGAN_FP_9', 'MORGAN_FP_10', 'MORGAN_FP_11', 'MORGAN_FP_12', 'MORGAN_FP_13', 'MORGAN_FP_14', 'MORGAN_FP_15', 'MORGAN_FP_16', 'MORGAN_FP_17', 'MORGAN_FP_18', 'MORGAN_FP_19', 'MORGAN_FP_20', 'MORGAN_FP_21', 'MORGAN_FP_22', 'MORGAN_FP_23', 'MORGAN_FP_24', 'MORGAN_FP_25', 'MORGAN_FP_26', 'MORGAN_FP_27', 'MORGAN_FP_28', 'MORGAN_FP_29', 'MORGAN_FP_30', 'MORGAN_FP_31', 'MORGAN_FP_32', 'MORGAN_FP_33', 'MORGA

In [7]:
from sklearn.model_selection import train_test_split

X = train_df_combined[columns_list]
y = train_df_combined['ACTIVE_x']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122584 entries, 143510 to 121958
Columns: 144 entries, FR_Al_COO_NORMALIZED to MORGAN_FP_128
dtypes: float64(16), int64(128)
memory usage: 135.6 MB


In [12]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters for the MLP Classifier
mlp_params = {
    'hidden_layer_sizes': [(96, ), (144, ), (192, )],
    'activation': ['relu'],
    'alpha': [0.0001, 0.001],
    'max_iter': [50, 100, 200],
}


# Perform Grid Search for the MLP Classifier
mlp_grid = GridSearchCV(MLPClassifier(), param_grid=mlp_params, scoring='roc_auc', cv=3, verbose=2)
mlp_grid.fit(X_train, y_train)

# Get the best hyperparameters
best_mlp_params_c = mlp_grid.best_params_

# Get all results
results_df = pd.DataFrame(mlp_grid.cv_results_)

# Display the results
print(results_df[['params', 'mean_test_score', 'rank_test_score']])

Fitting 3 folds for each of 18 candidates, totalling 54 fits




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(96,), max_iter=50; total time=  51.1s




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(96,), max_iter=50; total time=  52.9s




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(96,), max_iter=50; total time=  52.9s
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(96,), max_iter=100; total time= 1.6min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(96,), max_iter=100; total time= 1.4min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(96,), max_iter=100; total time= 1.5min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(96,), max_iter=200; total time= 1.4min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(96,), max_iter=200; total time= 1.6min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(96,), max_iter=200; total time= 1.4min




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(144,), max_iter=50; total time= 1.1min




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(144,), max_iter=50; total time= 1.1min




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(144,), max_iter=50; total time= 1.3min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(144,), max_iter=100; total time= 1.5min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(144,), max_iter=100; total time= 1.8min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(144,), max_iter=100; total time= 1.4min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(144,), max_iter=200; total time= 1.9min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(144,), max_iter=200; total time= 1.8min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(144,), max_iter=200; total time= 2.2min




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(192,), max_iter=50; total time= 1.6min




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(192,), max_iter=50; total time= 1.6min




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(192,), max_iter=50; total time= 1.7min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(192,), max_iter=100; total time= 2.1min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(192,), max_iter=100; total time= 2.4min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(192,), max_iter=100; total time= 2.0min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(192,), max_iter=200; total time= 1.7min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(192,), max_iter=200; total time= 2.1min
[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(192,), max_iter=200; total time= 1.5min




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(96,), max_iter=50; total time=  59.4s




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(96,), max_iter=50; total time=  55.5s




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(96,), max_iter=50; total time= 1.1min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(96,), max_iter=100; total time= 1.9min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(96,), max_iter=100; total time= 1.7min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(96,), max_iter=100; total time= 1.7min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(96,), max_iter=200; total time= 1.8min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(96,), max_iter=200; total time= 1.9min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(96,), max_iter=200; total time= 1.6min




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(144,), max_iter=50; total time= 1.3min




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(144,), max_iter=50; total time= 1.2min




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(144,), max_iter=50; total time= 1.2min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(144,), max_iter=100; total time= 1.5min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(144,), max_iter=100; total time= 1.5min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(144,), max_iter=100; total time= 1.6min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(144,), max_iter=200; total time= 1.7min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(144,), max_iter=200; total time= 1.6min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(144,), max_iter=200; total time= 1.7min




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(192,), max_iter=50; total time= 1.4min




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(192,), max_iter=50; total time= 1.4min




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(192,), max_iter=50; total time= 1.4min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(192,), max_iter=100; total time= 1.5min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(192,), max_iter=100; total time= 1.9min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(192,), max_iter=100; total time= 1.6min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(192,), max_iter=200; total time= 2.3min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(192,), max_iter=200; total time= 1.6min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(192,), max_iter=200; total time= 2.3min
                                               params  mean_test_score  \
0   {'activation': 'relu', 'alpha': 0.0001, 'hidde...         0.685769   
1   {'activation': 'relu', 'alpha': 0.0001, 'hidde...         0.683917   
2   {'activation': 'relu', 'alpha': 0.0001, 'hidde...         0.683541   
3   {'activa

In [13]:
print(best_mlp_params_c)

{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (192,), 'max_iter': 200}


# Model Training

In [33]:
columns_list = train_df_combined.columns.tolist()
columns_list.remove('INDEX')
columns_list.remove('SMILES_x')
columns_list.remove('ACTIVE_x')
columns_list.remove('SMILES_y')
columns_list.remove('ACTIVE_y')
columns_list.remove('MORGAN_FP')

print(columns_list)

['FR_Al_COO_NORMALIZED', 'FR_COO_NORMALIZED', 'MOL_WEIGHT_NORMALIZED', 'HEAVY_ATOM_COUNT_NORMALIZED', 'NUM_RINGS_NORMALIZED', 'NUM_ROTATABLE_BONDS_NORMALIZED', 'LOGP_NORMALIZED', 'AROMATIC_PROPORTION_NORMALIZED', 'NUM_ATOMS_NORMALIZED', 'NUM_VALENCE_ELECTRONS_NORMALIZED', 'FRACTION_SP3_NORMALIZED', 'FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED', 'NUM_SATURATED_RINGS_NORMALIZED', 'NUM_AMIDE_BONDS_NORMALIZED', 'NUM_AROMATIC_RINGS_NORMALIZED', 'NUM_ALIPHATIC_RINGS_NORMALIZED', 'MORGAN_FP_1', 'MORGAN_FP_2', 'MORGAN_FP_3', 'MORGAN_FP_4', 'MORGAN_FP_5', 'MORGAN_FP_6', 'MORGAN_FP_7', 'MORGAN_FP_8', 'MORGAN_FP_9', 'MORGAN_FP_10', 'MORGAN_FP_11', 'MORGAN_FP_12', 'MORGAN_FP_13', 'MORGAN_FP_14', 'MORGAN_FP_15', 'MORGAN_FP_16', 'MORGAN_FP_17', 'MORGAN_FP_18', 'MORGAN_FP_19', 'MORGAN_FP_20', 'MORGAN_FP_21', 'MORGAN_FP_22', 'MORGAN_FP_23', 'MORGAN_FP_24', 'MORGAN_FP_25', 'MORGAN_FP_26', 'MORGAN_FP_27', 'MORGAN_FP_28', 'MORGAN_FP_29', 'MORGAN_FP_30', 'MORGAN_FP_31', 'MORGAN_FP_32', 'MORGAN_FP_33', 'MORGA

In [36]:
# Use the full training set for model training
X_train_full = train_df_combined[columns_list]
y_train_full = train_df_combined['ACTIVE_x']

In [37]:
# Example: Train Random Forest with the best-tuned hyperparameters using the full training set
rf_model_full = RandomForestClassifier(n_estimators=best_rf_params_c['n_estimators'], max_depth=best_rf_params_c['max_depth'])
rf_model_full.fit(X_train_full, y_train_full)

In [39]:
import joblib

# Save the model to Google Drive
model_path = '/content/gdrive/MyDrive/ID2214/{}'.format('random_forest_model_c.pkl')
joblib.dump(rf_model_full, model_path)

# Print the path where the model is saved
print('Model saved to:', model_path)

Model saved to: /content/gdrive/MyDrive/ID2214/random_forest_model_c.pkl


# Test Data Preprocessing and Feature Engineering

In [41]:
# Load the test set into a DataFrame
load_path_test = '/content/gdrive/MyDrive/ID2214/test_smiles.csv'

test_df = pd.read_csv(load_path_test)

In [42]:
# Example: Convert SMILES to a molecule object
test_df['MOLECULE'] = test_df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))



In [78]:
# Molecular Descriptors
test_df_md = test_df.copy()
# Fingerprint
test_df_finger = test_df.copy()

In [79]:
from rdkit.Chem import rdMolDescriptors, Fragments, Lipinski, AllChem, Descriptors, rdChemReactions

test_df_md['FR_Al_COO'] = test_df_md['MOLECULE'].apply(lambda x: Fragments.fr_Al_COO(x))
test_df_md['FR_Al_OH'] = test_df_md['MOLECULE'].apply(lambda x: Fragments.fr_Al_OH(x))
test_df_md['FR_COO'] = test_df_md['MOLECULE'].apply(lambda x: Fragments.fr_COO(x))

# Example: Calculate molecular weight using rdMolDescriptors
test_df_md['MOL_WEIGHT'] = test_df_md['MOLECULE'].apply(lambda x: rdMolDescriptors.CalcExactMolWt(x))

# Example: Calculate number of heavy atoms using Lipinski
test_df_md['HEAVY_ATOM_COUNT'] = test_df_md['MOLECULE'].apply(lambda x: Lipinski.HeavyAtomCount(x))

# Example: Experiment with additional molecular descriptors
test_df_md['NUM_RINGS'] = test_df_md['MOLECULE'].apply(lambda x: rdMolDescriptors.CalcNumRings(x))

# Example: Number of Rotatable Bonds
test_df_md['NUM_ROTATABLE_BONDS'] = test_df_md['MOLECULE'].apply(lambda x: Descriptors.NumRotatableBonds(x))

# Example: Topological Polar Surface Area (TPSA)
test_df_md['TPSA'] = test_df_md['MOLECULE'].apply(lambda x: Descriptors.TPSA(x))

# Example: Number of Hydrogen Bond Donors and Acceptors
test_df_md['HBOND_DONORS'] = test_df_md['MOLECULE'].apply(lambda x: Lipinski.NumHDonors(x))
test_df_md['HBOND_ACCEPTORS'] = test_df_md['MOLECULE'].apply(lambda x: Lipinski.NumHAcceptors(x))

# Example: LogP (Lipophilicity)
test_df_md['LOGP'] = test_df_md['MOLECULE'].apply(lambda x: Descriptors.MolLogP(x))

# Example: Aromatic Proportion
test_df_md['AROMATIC_PROPORTION'] = test_df_md['MOLECULE'].apply(lambda x: Descriptors.FractionCSP3(x))

# Example: Number of atoms
test_df_md['NUM_ATOMS'] = test_df_md['MOLECULE'].apply(lambda x: x.GetNumAtoms())

# Example: Number of valence electrons
test_df_md['NUM_VALENCE_ELECTRONS'] = test_df_md['MOLECULE'].apply(lambda x: Descriptors.NumValenceElectrons(x))

# Example: Fraction of sp3-hybridized carbon atoms
test_df_md['FRACTION_SP3'] = test_df_md['MOLECULE'].apply(lambda x: Descriptors.FractionCSP3(x))

# Example: Fractional polar surface area
test_df_md['FRACTIONAL_POLAR_SURFACE_AREA'] = test_df_md['MOLECULE'].apply(lambda x: Descriptors.FractionCSP3(x))

# Example: Number of Saturated Rings
test_df_md['NUM_SATURATED_RINGS'] = test_df_md['MOLECULE'].apply(lambda x: rdMolDescriptors.CalcNumSaturatedRings(x))

# Example: Number of Amide Bonds
def count_amide_bonds(mol):
    amide_pattern = '[NX3][CX3](=[OX1])[#6]'
    return len(mol.GetSubstructMatches(Chem.MolFromSmarts(amide_pattern)))

test_df_md['NUM_AMIDE_BONDS'] = test_df_md['MOLECULE'].apply(lambda x: count_amide_bonds(x))

# Example: Number of aromatic rings
test_df_md['NUM_AROMATIC_RINGS'] = test_df_md['MOLECULE'].apply(lambda x: rdMolDescriptors.CalcNumAromaticRings(x))

# Example: Number of aliphatic rings
test_df_md['NUM_ALIPHATIC_RINGS'] = test_df_md['MOLECULE'].apply(lambda x: rdMolDescriptors.CalcNumAliphaticRings(x))

test_df_md.describe()

Unnamed: 0,INDEX,FR_Al_COO,FR_Al_OH,FR_COO,MOL_WEIGHT,HEAVY_ATOM_COUNT,NUM_RINGS,NUM_ROTATABLE_BONDS,TPSA,HBOND_DONORS,...,LOGP,AROMATIC_PROPORTION,NUM_ATOMS,NUM_VALENCE_ELECTRONS,FRACTION_SP3,FRACTIONAL_POLAR_SURFACE_AREA,NUM_SATURATED_RINGS,NUM_AMIDE_BONDS,NUM_AROMATIC_RINGS,NUM_ALIPHATIC_RINGS
count,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,...,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0
mean,178768.5,0.031502,0.062123,0.052784,349.177953,24.409586,2.951758,4.781972,72.641826,1.13985,...,2.914651,0.294979,24.409703,127.57481,0.294979,0.294979,0.462566,0.72118,2.215757,0.736001
std,14744.515511,0.182671,0.345926,0.234043,80.317159,5.696362,1.051481,2.324923,28.716027,0.929592,...,1.282937,0.181047,5.696437,29.963805,0.181047,0.181047,0.745388,0.761845,0.963762,0.88273
min,153231.0,0.0,0.0,0.0,58.005479,1.0,0.0,0.0,0.0,0.0,...,-13.0548,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,165999.75,0.0,0.0,0.0,295.044919,21.0,2.0,3.0,53.93,1.0,...,2.137015,0.157895,21.0,108.0,0.157895,0.157895,0.0,0.0,2.0,0.0
50%,178768.5,0.0,0.0,0.0,345.114712,24.0,3.0,5.0,70.57,1.0,...,2.95566,0.272727,24.0,126.0,0.272727,0.272727,0.0,1.0,2.0,1.0
75%,191537.25,0.0,0.0,0.0,400.066382,28.0,4.0,6.0,89.19,2.0,...,3.747725,0.4,28.0,146.0,0.4,0.4,1.0,1.0,3.0,1.0
max,204306.0,3.0,18.0,4.0,1722.425568,122.0,22.0,43.0,777.98,25.0,...,17.8539,1.0,122.0,642.0,1.0,1.0,22.0,11.0,10.0,22.0


In [80]:
from sklearn.preprocessing import MinMaxScaler

# Example: Normalize a specific feature using Min-Max scaling
scaler = MinMaxScaler()

columns_to_modify = [
    'FR_Al_COO', 'FR_Al_OH', 'FR_COO',
    'MOL_WEIGHT', 'HEAVY_ATOM_COUNT', 'NUM_RINGS',
    'NUM_ROTATABLE_BONDS', 'TPSA', 'HBOND_DONORS',
    'HBOND_ACCEPTORS', 'LOGP', 'AROMATIC_PROPORTION',
    'NUM_ATOMS','NUM_VALENCE_ELECTRONS','FRACTION_SP3',
    'FRACTIONAL_POLAR_SURFACE_AREA','NUM_SATURATED_RINGS',
    'NUM_AMIDE_BONDS','NUM_AROMATIC_RINGS','NUM_ALIPHATIC_RINGS'
]

for column in columns_to_modify:
    normalized_column_name = f'{column}_NORMALIZED'
    test_df_md[normalized_column_name] = scaler.fit_transform(test_df_md[[column]])

test_df_md.drop(columns=columns_to_modify, inplace=True)

test_df_md.drop(columns=['FR_Al_OH_NORMALIZED', 'TPSA_NORMALIZED', 'HBOND_DONORS_NORMALIZED',
       'HBOND_ACCEPTORS_NORMALIZED'], inplace=True)

test_df_md.describe()

Unnamed: 0,INDEX,FR_Al_COO_NORMALIZED,FR_COO_NORMALIZED,MOL_WEIGHT_NORMALIZED,HEAVY_ATOM_COUNT_NORMALIZED,NUM_RINGS_NORMALIZED,NUM_ROTATABLE_BONDS_NORMALIZED,LOGP_NORMALIZED,AROMATIC_PROPORTION_NORMALIZED,NUM_ATOMS_NORMALIZED,NUM_VALENCE_ELECTRONS_NORMALIZED,FRACTION_SP3_NORMALIZED,FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED,NUM_SATURATED_RINGS_NORMALIZED,NUM_AMIDE_BONDS_NORMALIZED,NUM_AROMATIC_RINGS_NORMALIZED,NUM_ALIPHATIC_RINGS_NORMALIZED
count,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0,51076.0
mean,178768.5,0.010501,0.013196,0.174939,0.193468,0.134171,0.111209,0.516665,0.294979,0.193469,0.198715,0.294979,0.294979,0.021026,0.065562,0.221576,0.033455
std,14744.515511,0.06089,0.058511,0.048255,0.047077,0.047795,0.054068,0.041507,0.181047,0.047078,0.046673,0.181047,0.181047,0.033881,0.069259,0.096376,0.040124
min,153231.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,165999.75,0.0,0.0,0.142416,0.165289,0.090909,0.069767,0.491506,0.157895,0.165289,0.168224,0.157895,0.157895,0.0,0.0,0.2,0.0
50%,178768.5,0.0,0.0,0.172498,0.190083,0.136364,0.116279,0.517992,0.272727,0.190083,0.196262,0.272727,0.272727,0.0,0.090909,0.2,0.045455
75%,191537.25,0.0,0.0,0.205514,0.22314,0.181818,0.139535,0.543618,0.4,0.22314,0.227414,0.4,0.4,0.045455,0.090909,0.3,0.045455
max,204306.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [81]:
test_df_md.drop(columns='MOLECULE', inplace=True)

In [82]:
# Example: Generate Morgan fingerprints
test_df_finger['MORGAN_FP'] = test_df_finger['MOLECULE'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=128))

# breakdown fingerprints vector to numberic feature groups
fp_df_c = pd.DataFrame()
fp_df_c['INDEX'] = test_df_finger['INDEX']

for i in range(128):
    fp_df_c[f'MORGAN_FP_{i+1}'] = test_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)

  fp_df_c[f'MORGAN_FP_{i+1}'] = test_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df_c[f'MORGAN_FP_{i+1}'] = test_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df_c[f'MORGAN_FP_{i+1}'] = test_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df_c[f'MORGAN_FP_{i+1}'] = test_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df_c[f'MORGAN_FP_{i+1}'] = test_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df_c[f'MORGAN_FP_{i+1}'] = test_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df_c[f'MORGAN_FP_{i+1}'] = test_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df_c[f'MORGAN_FP_{i+1}'] = test_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df_c[f'MORGAN_FP_{i+1}'] = test_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).astype(int)
  fp_df_c[f'MORGAN_FP_{i+1}'] = test_df_finger['MORGAN_FP'].apply(lambda x: x.GetBit(i)).as

In [83]:
test_df_finger = pd.merge(test_df_finger, fp_df_c, on='INDEX', how='inner')
test_df_finger

Unnamed: 0,INDEX,SMILES,MOLECULE,MORGAN_FP,MORGAN_FP_1,MORGAN_FP_2,MORGAN_FP_3,MORGAN_FP_4,MORGAN_FP_5,MORGAN_FP_6,...,MORGAN_FP_119,MORGAN_FP_120,MORGAN_FP_121,MORGAN_FP_122,MORGAN_FP_123,MORGAN_FP_124,MORGAN_FP_125,MORGAN_FP_126,MORGAN_FP_127,MORGAN_FP_128
0,153231,O=C(N/N=C\c1cccc(Br)c1)c1ccccc1-n1cccc1,<rdkit.Chem.rdchem.Mol object at 0x7bd0a3711620>,"[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",1,0,0,1,0,1,...,0,0,0,0,0,0,0,1,1,0
1,153232,CCOc1ccc(/C=N/NC(=O)c2nnn(-c3nonc3N)c2COc2ccc(...,<rdkit.Chem.rdchem.Mol object at 0x7bd0a37114d0>,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...",1,0,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0
2,153233,Cc1cc2nc(Cl)c(Cl)nc2cc1C,<rdkit.Chem.rdchem.Mol object at 0x7bd0a3711460>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
3,153234,O=C1NC(=S)N/C1=C/c1cc([N+](=O)[O-])ccc1N1CCOCC1,<rdkit.Chem.rdchem.Mol object at 0x7bd0a37113f0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...",0,1,0,0,0,0,...,1,0,0,1,1,1,0,1,0,0
4,153235,Cc1c(C(=O)OCC(=O)C(C#N)=C2Nc3ccccc3N2)oc2ccccc12,<rdkit.Chem.rdchem.Mol object at 0x7bd0a3711540>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51071,204302,CCOc1ccc(-c2c[n+](=O)c3c(n2[O-])CCCC3)c(OCC)c1,<rdkit.Chem.rdchem.Mol object at 0x7bd0a3c6edc0>,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, ...",0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
51072,204303,COc1nc(C)nc2c1ccc1ccccc12,<rdkit.Chem.rdchem.Mol object at 0x7bd0a3c6ee30>,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
51073,204304,COc1cccc(-c2ccc(SCC(=O)NCCc3ccccc3)nn2)c1,<rdkit.Chem.rdchem.Mol object at 0x7bd0a3c6eea0>,"[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, ...",1,1,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
51074,204305,CSc1nc2ccccn2c(=O)c1C#N,<rdkit.Chem.rdchem.Mol object at 0x7bd0a3c6ef10>,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0


In [84]:
test_df_finger.drop(columns='MOLECULE', inplace=True)
# TODO: DROP 'MORGAN_FP'

In [85]:
test_df_combined = pd.merge(test_df_md, test_df_finger, on='INDEX', how='inner')

In [86]:
test_df_combined

Unnamed: 0,INDEX,SMILES_x,FR_Al_COO_NORMALIZED,FR_COO_NORMALIZED,MOL_WEIGHT_NORMALIZED,HEAVY_ATOM_COUNT_NORMALIZED,NUM_RINGS_NORMALIZED,NUM_ROTATABLE_BONDS_NORMALIZED,LOGP_NORMALIZED,AROMATIC_PROPORTION_NORMALIZED,...,MORGAN_FP_119,MORGAN_FP_120,MORGAN_FP_121,MORGAN_FP_122,MORGAN_FP_123,MORGAN_FP_124,MORGAN_FP_125,MORGAN_FP_126,MORGAN_FP_127,MORGAN_FP_128
0,153231,O=C(N/N=C\c1cccc(Br)c1)c1ccccc1-n1cccc1,0.0,0.0,0.185666,0.181818,0.136364,0.093023,0.551900,0.000000,...,0,0,0,0,0,0,0,1,1,0
1,153232,CCOc1ccc(/C=N/NC(=O)c2nnn(-c3nonc3N)c2COc2ccc(...,0.0,0.0,0.245218,0.272727,0.181818,0.209302,0.490736,0.142857,...,0,1,0,0,1,0,0,1,0,0
2,153233,Cc1cc2nc(Cl)c(Cl)nc2cc1C,0.0,0.0,0.100937,0.107438,0.090909,0.000000,0.537332,0.200000,...,0,0,0,1,1,0,0,0,0,0
3,153234,O=C1NC(=S)N/C1=C/c1cc([N+](=O)[O-])ccc1N1CCOCC1,0.0,0.0,0.165864,0.181818,0.136364,0.069767,0.447492,0.285714,...,1,0,0,1,1,1,0,1,0,0
4,153235,Cc1c(C(=O)OCC(=O)C(C#N)=C2Nc3ccccc3N2)oc2ccccc12,0.0,0.0,0.189316,0.223140,0.181818,0.093023,0.543365,0.095238,...,0,0,0,0,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51071,204302,CCOc1ccc(-c2c[n+](=O)c3c(n2[O-])CCCC3)c(OCC)c1,0.0,0.0,0.163512,0.190083,0.136364,0.116279,0.522393,0.444444,...,0,0,0,0,0,1,0,0,0,0
51072,204303,COc1nc(C)nc2c1ccc1ccccc12,0.0,0.0,0.099788,0.132231,0.136364,0.023256,0.522663,0.142857,...,0,0,0,0,1,0,0,0,0,0
51073,204304,COc1cccc(-c2ccc(SCC(=O)NCCc3ccccc3)nn2)c1,0.0,0.0,0.192938,0.214876,0.136364,0.186047,0.538942,0.190476,...,0,0,0,0,1,0,0,1,0,0
51074,204305,CSc1nc2ccccn2c(=O)c1C#N,0.0,0.0,0.095544,0.115702,0.090909,0.023256,0.464040,0.100000,...,0,0,1,0,1,0,0,0,0,0


In [87]:
test_df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51076 entries, 0 to 51075
Columns: 148 entries, INDEX to MORGAN_FP_128
dtypes: float64(16), int64(129), object(3)
memory usage: 58.1+ MB


In [88]:
from google.colab import drive

drive.mount('/content/gdrive')

# Specify the path for saving the DataFrame as a CSV file
save_path = '/content/gdrive/MyDrive/ID2214/test_smiles_combined.csv'

# Use to_csv method to save the DataFrame to a CSV file
test_df_combined.to_csv(save_path, index=False)

# Print the path where the DataFrame is saved
print('DataFrame saved to:', save_path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
DataFrame saved to: /content/gdrive/MyDrive/ID2214/test_smiles_combined.csv


# Generate results

In [89]:
import joblib
from sklearn.metrics import roc_auc_score

load_path_model = '/content/gdrive/MyDrive/ID2214/random_forest_model_c.pkl'
rf_model_full = joblib.load(load_path_model)

In [90]:
columns_list = test_df_combined.columns.tolist()
columns_list.remove('INDEX')
columns_list.remove('SMILES_x')
columns_list.remove('SMILES_y')
columns_list.remove('MORGAN_FP')

print(columns_list)

['FR_Al_COO_NORMALIZED', 'FR_COO_NORMALIZED', 'MOL_WEIGHT_NORMALIZED', 'HEAVY_ATOM_COUNT_NORMALIZED', 'NUM_RINGS_NORMALIZED', 'NUM_ROTATABLE_BONDS_NORMALIZED', 'LOGP_NORMALIZED', 'AROMATIC_PROPORTION_NORMALIZED', 'NUM_ATOMS_NORMALIZED', 'NUM_VALENCE_ELECTRONS_NORMALIZED', 'FRACTION_SP3_NORMALIZED', 'FRACTIONAL_POLAR_SURFACE_AREA_NORMALIZED', 'NUM_SATURATED_RINGS_NORMALIZED', 'NUM_AMIDE_BONDS_NORMALIZED', 'NUM_AROMATIC_RINGS_NORMALIZED', 'NUM_ALIPHATIC_RINGS_NORMALIZED', 'MORGAN_FP_1', 'MORGAN_FP_2', 'MORGAN_FP_3', 'MORGAN_FP_4', 'MORGAN_FP_5', 'MORGAN_FP_6', 'MORGAN_FP_7', 'MORGAN_FP_8', 'MORGAN_FP_9', 'MORGAN_FP_10', 'MORGAN_FP_11', 'MORGAN_FP_12', 'MORGAN_FP_13', 'MORGAN_FP_14', 'MORGAN_FP_15', 'MORGAN_FP_16', 'MORGAN_FP_17', 'MORGAN_FP_18', 'MORGAN_FP_19', 'MORGAN_FP_20', 'MORGAN_FP_21', 'MORGAN_FP_22', 'MORGAN_FP_23', 'MORGAN_FP_24', 'MORGAN_FP_25', 'MORGAN_FP_26', 'MORGAN_FP_27', 'MORGAN_FP_28', 'MORGAN_FP_29', 'MORGAN_FP_30', 'MORGAN_FP_31', 'MORGAN_FP_32', 'MORGAN_FP_33', 'MORGA

In [92]:
X_test = test_df_combined[columns_list]
test_df_combined['PREDICTED_ACTIVE'] = rf_model_full.predict_proba(X_test)[:, 1]

In [93]:
selected_columns = ['INDEX', 'PREDICTED_ACTIVE']
result_df = test_df_combined[selected_columns]

In [94]:
result_df

Unnamed: 0,INDEX,PREDICTED_ACTIVE
0,153231,0.021621
1,153232,0.034813
2,153233,0.010435
3,153234,0.029575
4,153235,0.024920
...,...,...
51071,204302,0.010257
51072,204303,0.048696
51073,204304,0.009050
51074,204305,0.022062


In [95]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51076 entries, 0 to 51075
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   INDEX             51076 non-null  int64  
 1   PREDICTED_ACTIVE  51076 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.2 MB


In [96]:
result_df.describe()

Unnamed: 0,INDEX,PREDICTED_ACTIVE
count,51076.0,51076.0
mean,178768.5,0.018626
std,14744.515511,0.016279
min,153231.0,0.000518
25%,165999.75,0.009268
50%,178768.5,0.01489
75%,191537.25,0.023397
max,204306.0,0.538466


In [97]:
from google.colab import drive

drive.mount('/content/gdrive')

# Specify the path for saving the DataFrame as a CSV file
save_path = '/content/gdrive/MyDrive/ID2214/result.csv'

# Use to_csv method to save the DataFrame to a CSV file
result_df.to_csv(save_path, index=False)

# Print the path where the DataFrame is saved
print('DataFrame saved to:', save_path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
DataFrame saved to: /content/gdrive/MyDrive/ID2214/result.csv


In [98]:
# Specify the file path where you want to save the text file
file_path = '/content/gdrive/MyDrive/ID2214/7.txt'

# Extract the 'PREDICTED_ACTIVE' column and write it to the text file
result_df['PREDICTED_ACTIVE'].to_csv(file_path, index=False, header=None, sep='\t')

In [None]:
# TODO: Paste the predicted AUC

In [100]:
predictions_df = pd.read_csv('/content/gdrive/MyDrive/ID2214/7.txt', header=None)
try:
    assert predictions_df.shape == (51077, 1)
    assert np.all((predictions_df.values >= 0) & (predictions_df.values <= 1))
    print("All assertions passed!")
except AssertionError as e:
    print(f"Assertion error: {e}")

All assertions passed!
