In [1]:
# Imports
import csv
import pickle

import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

import defaults

In [2]:
# Load the untested-molecules CSV into `df`; the later cells read its 'SMILES'
# column and append descriptor, fingerprint and prediction columns to it.
df = pd.read_csv('./datasets/untested_molecules.csv')

In [3]:
# Compute one descriptor record per molecule from its SMILES string, flatten
# the records into one column per descriptor, and append them to `df`.
descriptor_records = df['SMILES'].apply(defaults.compute_descriptors)
descriptors_df = pd.json_normalize(descriptor_records)
df = pd.concat([df, descriptors_df], axis=1)

In [29]:
def smiles_to_fingerprint(smiles, radius=2, n_bits=1024):
    """Convert a SMILES string into a Morgan fingerprint bit list.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, optional
        Morgan radius handed to RDKit. Defaults to 2.
    n_bits : int, optional
        Length of the returned bit list. Defaults to 1024.

    Returns
    -------
    list of int
        ``n_bits`` values taken from the RDKit bit vector. An all-zero list of
        the same length is returned when ``smiles`` is not a string (e.g. a
        missing value) or when RDKit cannot parse it.
    """
    # Non-string inputs (None, or NaN from an empty CSV cell) cannot be parsed
    # as SMILES; give them the same zero-vector fallback as unparseable strings
    # so one bad row does not abort a whole DataFrame.apply().
    if not isinstance(smiles, str):
        return [0] * n_bits

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        return [0] * n_bits

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return list(fp)

# Encode every SMILES string as a bit list, then spread the 1024 bits over
# individual columns named Fingerprint_0 ... Fingerprint_1023.
fingerprint_columns = [f'Fingerprint_{bit}' for bit in range(1024)]
fingerprint_rows = df['SMILES'].apply(smiles_to_fingerprint).tolist()
fingerprints_df = pd.DataFrame(fingerprint_rows, columns=fingerprint_columns)

# Append the bit columns to the right of the existing columns
df = pd.concat([df, fingerprints_df], axis=1)

df.head()

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition,FractionCSP3,NumAromaticRings,fr_C_O,BertzCT,fr_amide,MolLogP,fr_Ar_NH,...,Fingerprint_1014,Fingerprint_1015,Fingerprint_1016,Fingerprint_1017,Fingerprint_1018,Fingerprint_1019,Fingerprint_1020,Fingerprint_1021,Fingerprint_1022,Fingerprint_1023
0,C[C@@H](Sc1nc(=O)cc(N)[nH]1)C(=O)NC1CCCCC1,0,0,0.615385,1,1,525.347595,1,1.2816,1,...,0,0,0,0,0,1,0,0,0,0
1,O=C(CCN1C(=O)COc2ccccc21)NCc1cccs1,0,0,0.25,2,2,669.790259,2,2.18,0,...,0,0,0,0,0,0,0,0,0,0
2,Cn1nnnc1SCC(=O)N1CC[NH+](Cc2ccccc2)CC1,0,0,0.466667,2,1,638.426476,1,-0.7705,0,...,0,0,0,0,0,0,0,0,0,0
3,CCOC(=O)CCP(=O)([O-])[C@@H](O)c1ccc(OC)cc1,0,0,0.461538,1,1,484.139678,0,1.2777,0,...,0,0,0,0,0,0,0,0,0,0
4,C=CCNC(=O)c1cc(-c2ccccc2O)on1,0,0,0.076923,2,1,575.120263,1,1.963,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
def apply_model(inhibitor):
    """Predict one target for every row of the notebook-level ``df``.

    Unpickles ``./models/model_{inhibitor}.pkl``, feeds it the columns named by
    ``defaults.get_descriptors(inhibitor)``, thresholds the predictions at 0.5
    and stores the resulting 0/1 integers in ``df[inhibitor]``.

    Parameters
    ----------
    inhibitor : str
        Target name; used both in the model file name and as the name of the
        column written to ``df``.

    Returns
    -------
    None
        The result is written into ``df`` in place.
    """
    model_path = f"./models/model_{inhibitor}.pkl"
    # NOTE: pickle.load executes arbitrary code from the file; only load model
    # files that come from a trusted source.
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)

    feature_columns = defaults.get_descriptors(inhibitor)
    scores = model.predict(df[feature_columns])
    # A score of 0.5 or more counts as a positive prediction (1), otherwise 0.
    df[inhibitor] = (scores >= 0.5).astype(int)

In [6]:
# Run apply_model once per entry of defaults.y_data; each call writes a 0/1
# prediction column into `df` named after that entry.
for inhibitor in defaults.y_data:
    apply_model(inhibitor)

In [50]:
# Keep only the first three columns for the export.
# NOTE(review): this assumes they are SMILES followed by the two prediction
# columns, as in the df.head() output above; confirm if the input CSV changes.
# .copy() gives an independent frame, so the quoting below neither triggers
# SettingWithCopyWarning nor writes the quoted strings back into `df`.
result_df = df.iloc[:, :3].copy()

# Wrap the first column (the SMILES strings) in literal double quotes by hand.
result_df.iloc[:, 0] = '"' + result_df.iloc[:, 0].astype(str) + '"'

# csv.QUOTE_NONE stops pandas from adding quotes of its own, so only the
# first column ends up quoted and the header and other columns stay bare.
result_df.to_csv("./datasets/applied_molecules.csv", index=False, quoting=csv.QUOTE_NONE)