In [7]:
%%time
# importing dependencies
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, rdMolDescriptors, MACCSkeys, Descriptors, Draw

# Defining the SMILES strings for the cucumin varients
curcumin_variants = (
    "COc3cc(OC)c(C/C=C/C2=CC(/C=C/Cc1c(OC)cc(OC)cc1OC)=[O+][B-](F)(F)O2)c(OC)c3",
    "COc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1c(OC)cc(OC)cc1OC)=C2)cc3",
    "COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(OC)cc1OC)=[O+][B-](F)(F)O2)c(OC)c3",
    "COc5ccc(C/C=C/C4=[O+][B-](F)(F)OC(/C=C/Cc2c1ccccc1cc3ccccc23)=C4)cc5",
    "COc3ccc(OC)c(C/C=C/C2=CC(/C=C/Cc1cc(OC)ccc1OC)=[O+][B-](F)(F)O2)c3",
    "COc3cc(C/C=C/C2=CC(/C=C/Cc1ccc(O)c(OC)c1)=[O+][B-](F)(F)O2)ccc3O",
    "F[B-]3(F)OC(/C=C/Cc1ccc(Br)cc1)=CC(/C=C/Cc2ccc(Br)cc2)=[O+]3",
    "C=C(OC)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C(=C)OC)cc1)=[O+][B-](F)(F)O2)cc3",
    "C=C(OC)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C(=C)OC)cc1)=[O+][B-](F)(F)O2)cc3",
    "CN(C)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(N(C)C)cc1)=[O+][B-](F)(F)O2)cc3",
    "CCCOc1cc(N(CC)CC)ccc1C/C=C/C3=CC(/C=C/Cc2ccc(N(CC)CC)cc2OCCC)=[O+][B-](F)(F)O3",
    "CCCOc1cc(N(CC)CC)ccc1C/C=C/C3=CC(/C=C/Cc2ccc(N(CC)CC)cc2OCCC)=[O+][B-](F)(F)O3",
    "N#Cc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1ccccc1)=C2)cc3",
    "COc6ccc(C/C=C/C5=CC(/C=C/Cc1cc2ccc3cccc4ccc(c1)c2c34)=[O+][B-](F)(F)O5)cc6",
    "COc4ccc(C/C=C/C3=CC(/C=C/Cc1ccc(OC)c2ccccc12)=[O+][B-](F)(F)O3)c5ccccc45",
    "CN(C)c4ccc(C/C=C/C3=CC(/C=C/Cc1ccc(N(C)C)c2ccccc12)=[O+][B-](F)(F)O3)c5ccccc45",
    "N#Cc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C#N)cc1)=[O+][B-](F)(F)O2)cc3",
    "CCCCN(CCCC)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(N(CCCC)CCCC)cc1)=[O+][B-](F)(F)O2)cc3",
    "COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C#N)cc1)=[O+][B-](F)(F)O2)cc3",
    "CN5/C(=C\C\C=C\C3=CC(/C=C/C/C=C/2N(C)c1ccccc1C2(C)C)=[O+][B-](F)(F)O3)C(C)(C)c4ccccc45",
    "COc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1ccc(SC)cc1)=C2)cc3",
    "CSc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(SC)cc1)=[O+][B-](F)(F)O2)cc3",
    "COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(N(C)C)cc1)=[O+][B-](F)(F)O2)cc3",
    "COc5ccc(C/C=C/C4=[O+][B-](F)(F)OC(/C=C/Cc3ccc2c1ccccc1n(C)c2c3)=C4)cc5",
    "COc1ccccc1C/C=C/C3=CC(/C=C/Cc2ccccc2OC)=[O+][B-](F)(F)O3",
    "CCCCCC(CC)c5ccc(c4ccc(C/C=C/C3=CC(/C=C/Cc2ccc(c1ccc(C(CC)CCCCC)s1)s2)=[O+][B-](F)(F)O3)s4)s5"
)

# Defining the molecule names and HOMO-LUMO Gap
molecule_names = ['2-ADMeO3', '3-MR83a', 'AD-10', 'AD-1013', 'AD-1022', 'AD-11', 'AD-12', 'AD-13', 'AD-13-DMF', 'AD-14-Moore', 'AD-15', 'AD-15-DMF', 'AD-16-DMF', 'AD-18', 'AD-24', 'AD-25', 'AD-3', 'AD-35', 'AD-4', 'AD-48', 'AD-5', 'AD-6', 'AD-7', 'AD-8', 'AD-9', 'YD-30']
homo_lumo_gap = [3.077, 3.072, 3.259, 2.625, 2.938, 2.946, 3.191, 3.226, 3.228, 2.811, 2.971, 2.813, 3.231, 2.735, 2.878, 2.686, 3.215, 2.77, 3.001, 2.702, 2.97, 2.89, 2.859, 2.97, 3.137, 2.525]

# Molecules generation and descriptor calculations
molecules = [Chem.MolFromSmiles(smiles) for smiles in curcumin_variants]
mws = [round(Descriptors.MolWt(mol),3) for mol in molecules]
logp = [Descriptors.MolLogP(mol) for mol in molecules]
# legends = [f"{name} : {gap} eV" for name, gap in zip(molecule_names, homo_lumo_gap)]
# Draw.MolsToGridImage(molecules, molsPerRow=4, subImgSize=(300,250), legends=legends)

# Function to generate Morgan fingerprints
def generate_morgan_fingerprint(smiles, radius=4, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits, useChirality = True)
        return list(fingerprint)
    else:
        return None

# Creating the initial dataframe
data = {
    'Molecule': molecule_names,
    'Molecular Weight': mws,
    'LogP': logp,
    'Homo-Lumo Gap (eV)': homo_lumo_gap,
    'Smiles': curcumin_variants
}

curcumin_df = pd.DataFrame(data)

# Generate Morgan fingerprints and create the dataframe
curcumin_df['Morgan_Fingerprint'] = curcumin_df['Smiles'].apply(generate_morgan_fingerprint)
morgan_df = pd.DataFrame(curcumin_df['Morgan_Fingerprint'].tolist(), columns=[f'morgan_{i}' for i in range(2048)])
curcumin_df_mf = pd.concat([curcumin_df.drop(columns=['Morgan_Fingerprint']), morgan_df], axis=1)

# Harvard OPV dataset import
data = pd.read_csv('https://raw.githubusercontent.com/AjStephan/havard-smile-opv/main/Non-fullerene%20small-molecules%20acceptors.csv')
newdata = data.drop(columns=['index', 'inchikey',  'HOMO_calc', 'LUMO_calc', 'LUMO_calib',
       'LUMO_calib_stds', 'HOMO_calib', 'HOMO_calib_stds', 'molW',
       'PCE_calc', 'Voc_calc', 'Jsc_calc', 'FF_calc', 'EQE_calc',
       'PCE_calib', 'Voc_calib', 'Jsc_calib', 'FF_calib', 'EQE_calib',
       'PCE_cdiff', 'PCE_calib_plus'], axis=1)

# Generate New dataframe of morgan fingerprint for trainning
newdata["mol"] = newdata["smiles"].apply(Chem.MolFromSmiles)
newdata["Morgan_Fingerprints"] = newdata['smiles'].apply(generate_morgan_fingerprint)
morgan_df_opv = pd.DataFrame(newdata['Morgan_Fingerprints'].tolist(), columns=[f'morgan_{i}' for i in range(2048)])
opv_df_mf = pd.concat([newdata.drop(columns = ['Morgan_Fingerprints']), morgan_df_opv],axis = 1)

CPU times: total: 1min 7s
Wall time: 1min 32s


In [8]:
curcumin_sim = curcumin_df_mf.iloc[:,:5]
curcumin_sim["mol"] = curcumin_sim["Smiles"].apply(Chem.MolFromSmiles)

In [None]:
# Similarity loop
opv_df_sim = opv_df_mf.iloc[:,:3]
opv_df_sim["mol"] = opv_df_sim["smiles"].apply(Chem.MolFromSmiles)

# Define similarity algo
def Tanimoto_similarity(mol_fp, specific_fp):
    return DataStructs.TanimotoSimilarity(mol_fp, specific_fp)

def calculate_similarity(mol_fp, specific_fp):
    return DataStructs.DiceSimilarity(mol_fp, specific_fp)

def calculate_cosine(mol_fp, specific_fp):
    return DataStructs.CosineSimilarity(mol_fp, specific_fp)

def calculate_kulczynski(mol_fp, specific_fp):
    return DataStructs.KulczynskiSimilarity(mol_fp, specific_fp)

# Define similarity searching molecule
specific_smiles = 'Cc3ccc(/C=C/C2=CC(/C=C/c1ccc(C)cc1)=[O+][B-](F)(F)O2)cc3'
specific_mol = Chem.MolFromSmiles(specific_smiles)

# Define fingerprint for the specific molecule
specific_fp_morgan = generate_morgan_fingerprint(specific_smiles)
specific_fp_atompair = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(specific_mol)
specific_fp_maccs = MACCSkeys.GenMACCSKeys(specific_mol)

In [11]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors, DataStructs

# Function to generate Morgan fingerprint
def generate_morgan_fingerprint(mol, radius=2):
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=2048)

# Function to generate Atom Pair fingerprint
def generate_atom_pair_fingerprint(mol):
    return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=2048)

# Define similarity functions
def Tanimoto_similarity(mol_fp, specific_fp):
    return DataStructs.TanimotoSimilarity(mol_fp, specific_fp)

def calculate_similarity(mol_fp, specific_fp):
    return DataStructs.DiceSimilarity(mol_fp, specific_fp)

def calculate_cosine(mol_fp, specific_fp):
    return DataStructs.CosineSimilarity(mol_fp, specific_fp)

def calculate_kulczynski(mol_fp, specific_fp):
    return DataStructs.KulczynskiSimilarity(mol_fp, specific_fp)

# Define similarity searching molecule
specific_smiles = 'Cc3ccc(/C=C/C2=CC(/C=C/c1ccc(C)cc1)=[O+][B-](F)(F)O2)cc3'
specific_mol = Chem.MolFromSmiles(specific_smiles)

# Define fingerprints for the specific molecule
specific_fp_morgan = generate_morgan_fingerprint(specific_mol)
specific_fp_atompair = generate_atom_pair_fingerprint(specific_mol)
specific_fp_maccs = MACCSkeys.GenMACCSKeys(specific_mol)

# Initialize the resulting dictionary
resulting_dfs = {}

# Define a dictionary for fingerprint functions and their names
fingerprint_functions = {
    'morgan': generate_morgan_fingerprint,
    'atom_pair': generate_atom_pair_fingerprint,
    'maccs': MACCSkeys.GenMACCSKeys
}

# Define a dictionary for similarity functions and their names
similarity_functions = {
    'Tanimoto': Tanimoto_similarity,
    'Dice': calculate_similarity,
    'Cosine': calculate_cosine,
    'Kulczynski': calculate_kulczynski
}

# Process each fingerprint and similarity metric
for fp_name, fp_function in fingerprint_functions.items():
    for sim_name, sim_function in similarity_functions.items():
        # Create the fingerprint dataframe name
        fp_df_name = f'opv_df_sim_{fp_name}_{sim_name}'

        # Generate fingerprints for the dataframe
        opv_df_sim = opv_df_mf.iloc[:, :3].copy()
        opv_df_sim["mol"] = opv_df_sim["smiles"].apply(Chem.MolFromSmiles)
        
        # Generate the fingerprints
        if fp_name == 'morgan':
            opv_df_sim["fp"] = opv_df_sim["mol"].apply(lambda mol: generate_morgan_fingerprint(mol))
        elif fp_name == 'atom_pair':
            opv_df_sim["fp"] = opv_df_sim["mol"].apply(lambda mol: generate_atom_pair_fingerprint(mol))
        elif fp_name == 'maccs':
            opv_df_sim["fp"] = opv_df_sim["mol"].apply(lambda mol: MACCSkeys.GenMACCSKeys(mol))
        
        # Calculate similarity
        if fp_name == 'morgan':
            specific_fp = specific_fp_morgan
        elif fp_name == 'atom_pair':
            specific_fp = specific_fp_atompair
        elif fp_name == 'maccs':
            specific_fp = specific_fp_maccs

        opv_df_sim['similarity'] = opv_df_sim['fp'].apply(lambda x: sim_function(x, specific_fp))

        # Filter the top 10% similar molecules
        top10_df = opv_df_sim.sort_values(by='similarity', ascending=False).reset_index(drop=True)
        top10_df = top10_df[top10_df['similarity'] >= top10_df['similarity'].quantile(0.90)].copy()

        # Store in resulting_dfs
        resulting_dfs[f'{fp_name}_{sim_name}_top10'] = top10_df
        print(f'{fp_name}_{sim_name}_top10: {top10_df.shape}')

morgan_Tanimoto_top10: (5196, 6)
morgan_Dice_top10: (5196, 6)
morgan_Cosine_top10: (5160, 6)
morgan_Kulczynski_top10: (5139, 6)
atom_pair_Tanimoto_top10: (5129, 6)
atom_pair_Dice_top10: (5129, 6)
atom_pair_Cosine_top10: (5130, 6)
atom_pair_Kulczynski_top10: (5126, 6)
maccs_Tanimoto_top10: (5187, 6)
maccs_Dice_top10: (5187, 6)
maccs_Cosine_top10: (5188, 6)
maccs_Kulczynski_top10: (5168, 6)


In [14]:
for name, df in resulting_dfs.items():
    globals()[name] = df

In [21]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from pycaret.regression import *

# Function to generate Morgan fingerprints
def generate_morgan_fingerprint(smiles, radius=4, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits, useChirality=True)
        return list(fingerprint)
    else:
        return [0]*n_bits  # Return a list of zeros if the molecule is None

# Add Morgan fingerprints to each dataframe in resulting_dfs
fingerprint_dfs = {}
for name, df in resulting_dfs.items():
    df['Morgan_Fingerprint'] = df['smiles'].apply(generate_morgan_fingerprint)
    morgan_df = pd.DataFrame(df['Morgan_Fingerprint'].tolist(), columns=[f'morgan_{i}' for i in range(2048)])
    fingerprint_dfs[name] = pd.concat([df.drop(columns=['Morgan_Fingerprint']), morgan_df], axis=1)

# PyCaret setup and model comparison
results = []

for name, df in fingerprint_dfs.items():
    print(f"Processing {name}...")
    X = df.iloc[:, 6:]  # Adjust column index as needed
    y = df["GAP_calib"]
    
    setup_data = pd.concat([X, y], axis=1)
    regression_setup = setup(setup_data, target='GAP_calib', verbose=False, session_id=123, train_size=0.75)
    models_to_include = ['xgboost', 'lightgbm', 'knn', 'br', 'huber']
    best_model = compare_models(include=models_to_include)
    
    results.append({
        'dataset': name,
        'best_model': best_model
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
results_df.head()

Processing morgan_Tanimoto_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1495,0.0441,0.2086,0.8731,0.0553,0.0566,0.828
xgboost,Extreme Gradient Boosting,0.1532,0.0472,0.2158,0.864,0.0582,0.058,0.935
br,Bayesian Ridge,0.1732,0.0559,0.2352,0.8388,0.0635,0.0664,3.39
huber,Huber Regressor,0.1933,0.0706,0.2649,0.7958,0.0725,0.074,1.536
knn,K Neighbors Regressor,0.2283,0.1085,0.3283,0.6866,0.0905,0.0925,0.092


Processing morgan_Dice_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1495,0.0441,0.2086,0.8731,0.0553,0.0566,0.26
xgboost,Extreme Gradient Boosting,0.1532,0.0472,0.2158,0.864,0.0582,0.058,0.287
br,Bayesian Ridge,0.1732,0.0559,0.2352,0.8388,0.0635,0.0664,3.474
huber,Huber Regressor,0.1933,0.0706,0.2649,0.7958,0.0725,0.074,1.494
knn,K Neighbors Regressor,0.2283,0.1085,0.3283,0.6866,0.0905,0.0925,0.078


Processing morgan_Cosine_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1467,0.0425,0.205,0.8758,0.054,0.056,0.232
xgboost,Extreme Gradient Boosting,0.1523,0.0457,0.2128,0.8665,0.0567,0.0578,0.289
br,Bayesian Ridge,0.172,0.0545,0.2325,0.8409,0.0629,0.0664,3.397
huber,Huber Regressor,0.1898,0.0662,0.2566,0.8059,0.0699,0.073,1.513
knn,K Neighbors Regressor,0.2196,0.1045,0.3222,0.6942,0.0883,0.0894,0.077


Processing morgan_Kulczynski_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1443,0.0398,0.1991,0.8786,0.0549,0.0557,0.25
xgboost,Extreme Gradient Boosting,0.1488,0.0413,0.2028,0.8743,0.0555,0.0569,0.296
br,Bayesian Ridge,0.1666,0.049,0.2209,0.8507,0.0623,0.065,3.447
huber,Huber Regressor,0.1826,0.0596,0.2438,0.8181,0.0683,0.0707,1.501
knn,K Neighbors Regressor,0.2164,0.0964,0.3101,0.7064,0.0874,0.089,0.077


Processing atom_pair_Tanimoto_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1385,0.035,0.1871,0.8626,0.0493,0.0495,0.248
xgboost,Extreme Gradient Boosting,0.1454,0.0387,0.1967,0.8479,0.0517,0.0518,0.284
br,Bayesian Ridge,0.154,0.0427,0.2063,0.8328,0.0546,0.0551,3.382
huber,Huber Regressor,0.1668,0.0513,0.2263,0.7991,0.0601,0.0595,1.471
knn,K Neighbors Regressor,0.2124,0.0912,0.3013,0.6413,0.0805,0.0797,0.078


Processing atom_pair_Dice_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1385,0.035,0.1871,0.8626,0.0493,0.0495,0.256
xgboost,Extreme Gradient Boosting,0.1454,0.0387,0.1967,0.8479,0.0517,0.0518,0.306
br,Bayesian Ridge,0.154,0.0427,0.2063,0.8328,0.0546,0.0551,3.38
huber,Huber Regressor,0.1668,0.0513,0.2263,0.7991,0.0601,0.0595,1.476
knn,K Neighbors Regressor,0.2124,0.0912,0.3013,0.6413,0.0805,0.0797,0.081


Processing atom_pair_Cosine_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1447,0.041,0.202,0.8497,0.0567,0.0558,0.273
xgboost,Extreme Gradient Boosting,0.1478,0.0425,0.2054,0.845,0.0569,0.0563,0.31
br,Bayesian Ridge,0.161,0.048,0.2187,0.8243,0.0613,0.0618,3.412
huber,Huber Regressor,0.1761,0.0585,0.2414,0.7862,0.0675,0.067,1.461
knn,K Neighbors Regressor,0.2129,0.0912,0.3014,0.6655,0.0839,0.0848,0.075


Processing atom_pair_Kulczynski_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1517,0.0463,0.2135,0.8249,0.0597,0.0607,0.275
xgboost,Extreme Gradient Boosting,0.1558,0.0485,0.2187,0.8164,0.0615,0.0623,0.304
br,Bayesian Ridge,0.1707,0.0555,0.2331,0.791,0.0644,0.0681,3.413
huber,Huber Regressor,0.1866,0.0682,0.259,0.7416,0.0719,0.0739,1.496
knn,K Neighbors Regressor,0.2048,0.086,0.2926,0.6708,0.0828,0.0846,0.074


Processing maccs_Tanimoto_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1529,0.0429,0.2071,0.8611,0.0536,0.0538,0.256
xgboost,Extreme Gradient Boosting,0.1541,0.045,0.212,0.8547,0.0547,0.054,0.306
br,Bayesian Ridge,0.1798,0.0565,0.2376,0.8173,0.0618,0.0634,3.48
huber,Huber Regressor,0.1988,0.0731,0.2702,0.7637,0.07,0.0697,1.51
knn,K Neighbors Regressor,0.2137,0.0931,0.3046,0.6993,0.0797,0.079,0.078


Processing maccs_Dice_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1529,0.0429,0.2071,0.8611,0.0536,0.0538,0.256
xgboost,Extreme Gradient Boosting,0.1541,0.045,0.212,0.8547,0.0547,0.054,0.309
br,Bayesian Ridge,0.1798,0.0565,0.2376,0.8173,0.0618,0.0634,3.447
huber,Huber Regressor,0.1988,0.0731,0.2702,0.7637,0.07,0.0697,1.497
knn,K Neighbors Regressor,0.2137,0.0931,0.3046,0.6993,0.0797,0.079,0.081


Processing maccs_Cosine_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1484,0.041,0.2022,0.8683,0.0532,0.0534,0.267
xgboost,Extreme Gradient Boosting,0.1526,0.0439,0.209,0.8595,0.0542,0.0543,0.304
br,Bayesian Ridge,0.1746,0.0538,0.2318,0.8272,0.0608,0.0628,3.445
huber,Huber Regressor,0.195,0.0693,0.263,0.777,0.069,0.0698,1.496
knn,K Neighbors Regressor,0.2148,0.0944,0.3066,0.6975,0.082,0.0819,0.075


Processing maccs_Kulczynski_top10...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1432,0.0406,0.2007,0.8716,0.053,0.0524,0.251
xgboost,Extreme Gradient Boosting,0.1492,0.0437,0.2081,0.862,0.0547,0.0544,0.306
br,Bayesian Ridge,0.1667,0.0514,0.2262,0.837,0.0599,0.061,3.395
huber,Huber Regressor,0.1878,0.0669,0.2581,0.7874,0.068,0.0682,1.517
knn,K Neighbors Regressor,0.2066,0.0898,0.2986,0.7148,0.0808,0.0805,0.081


Unnamed: 0,dataset,best_model
0,morgan_Tanimoto_top10,"LGBMRegressor(n_jobs=-1, random_state=123)"
1,morgan_Dice_top10,"LGBMRegressor(n_jobs=-1, random_state=123)"
2,morgan_Cosine_top10,"LGBMRegressor(n_jobs=-1, random_state=123)"
3,morgan_Kulczynski_top10,"LGBMRegressor(n_jobs=-1, random_state=123)"
4,atom_pair_Tanimoto_top10,"LGBMRegressor(n_jobs=-1, random_state=123)"


In [24]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from pycaret.regression import *

# Step 1: Generate Morgan fingerprints for each dataframe in resulting_dfs
def generate_morgan_fingerprint(smiles, radius=4, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits, useChirality=True)
        return list(fingerprint)
    else:
        return [0]*n_bits  # Return a list of zeros if the molecule is None

fingerprint_dfs = {}
for name, df in resulting_dfs.items():
    df['Morgan_Fingerprint'] = df['smiles'].apply(generate_morgan_fingerprint)
    morgan_df = pd.DataFrame(df['Morgan_Fingerprint'].tolist(), columns=[f'morgan_{i}' for i in range(2048)])
    fingerprint_dfs[name] = pd.concat([df.drop(columns=['Morgan_Fingerprint']), morgan_df], axis=1)

# Step 2: Define a function to train models and get results
def train_models_and_get_results(df, models_to_include=['xgboost', 'lightgbm', 'knn', 'br', 'huber']):
    X = df.iloc[:, 6:] 
    y = df["GAP_calib"]
    
    setup_data = pd.concat([X, y], axis=1)
    regression_setup = setup(setup_data, target='GAP_calib', verbose=False, session_id=123, train_size=0.75)
    best_model = compare_models(include=models_to_include, verbose=False)

    model_results = pull()
    model_results['dataset'] = df['dataset_name'].iloc[0] 
    
    return model_results

# Step 3: Run the loop to apply the function to each dataframe in fingerprint_dfs
all_results = []
for name, df in fingerprint_dfs.items():
    print(f"Processing {name}...")
    df['dataset_name'] = name 
    model_results = train_models_and_get_results(df)
    all_results.append(model_results)
    print(f"Completed {name}!")

# Step 4: Collect final results and save
final_results = pd.concat(all_results, ignore_index=True)
final_results.to_csv(path_or_buf=r"C:\Users\Chamod Peiris\Documents\Curcumin_Scripts\Similarity_model_comparison_results.csv", index=False)
final_results.head()

Processing morgan_Tanimoto_top10...
Completed morgan_Tanimoto_top10!
Processing morgan_Dice_top10...
Completed morgan_Dice_top10!
Processing morgan_Cosine_top10...
Completed morgan_Cosine_top10!
Processing morgan_Kulczynski_top10...
Completed morgan_Kulczynski_top10!
Processing atom_pair_Tanimoto_top10...
Completed atom_pair_Tanimoto_top10!
Processing atom_pair_Dice_top10...
Completed atom_pair_Dice_top10!
Processing atom_pair_Cosine_top10...
Completed atom_pair_Cosine_top10!
Processing atom_pair_Kulczynski_top10...
Completed atom_pair_Kulczynski_top10!
Processing maccs_Tanimoto_top10...
Completed maccs_Tanimoto_top10!
Processing maccs_Dice_top10...
Completed maccs_Dice_top10!
Processing maccs_Cosine_top10...
Completed maccs_Cosine_top10!
Processing maccs_Kulczynski_top10...
Completed maccs_Kulczynski_top10!


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec),dataset
0,Light Gradient Boosting Machine,0.1495,0.0441,0.2086,0.8731,0.0553,0.0566,1.214,morgan_Tanimoto_top10
1,Extreme Gradient Boosting,0.1532,0.0472,0.2158,0.864,0.0582,0.058,1.035,morgan_Tanimoto_top10
2,Bayesian Ridge,0.1732,0.0559,0.2352,0.8388,0.0635,0.0664,3.593,morgan_Tanimoto_top10
3,Huber Regressor,0.1918,0.069,0.2619,0.8003,0.0715,0.0734,1.774,morgan_Tanimoto_top10
4,K Neighbors Regressor,0.2283,0.1085,0.3283,0.6866,0.0905,0.0925,0.325,morgan_Tanimoto_top10


In [48]:
import plotly.graph_objects as go
import pandas as pd

# Extract fingerprint and similarity from 'dataset' column
final_results['Fingerprint'] = final_results['dataset'].apply(lambda x: x.split('_')[0])
final_results['Similarity'] = final_results['dataset'].apply(lambda x: x.split('_')[1])

# Group by Fingerprint
fingerprint_group = final_results.groupby('Fingerprint').agg({'R2': 'mean', 'RMSE': 'mean'}).reset_index()

# Create the figure for Fingerprint-wise averages
fig_fingerprint = go.Figure()

# R2 Score
fig_fingerprint.add_trace(go.Bar(
    x=fingerprint_group['Fingerprint'],
    y=fingerprint_group['R2'],
    name='R2 Score (Fingerprint)',
    text=fingerprint_group['R2'],
    textposition='auto'
))

# RMSE Score
fig_fingerprint.add_trace(go.Scatter(
    x=fingerprint_group['Fingerprint'],
    y=fingerprint_group['RMSE'],
    name='RMSE Score (Fingerprint)',
    mode='lines+markers',
    yaxis='y2'
))

# Update layout for Fingerprint plot
fig_fingerprint.update_layout(
    title='Average R2 and RMSE Scores by Fingerprint',
    xaxis_title='Fingerprint',
    yaxis_title='R2 Score',
    yaxis=dict(range=[0.6, 0.85]),
    yaxis2=dict(
        title='RMSE Score',
        overlaying='y',
        side='right'
    ),
    barmode='group',
    width=900,height=600,
        legend=dict(
        orientation='h',
        x=1,
        xanchor='center',
        y=1.15
    )
)

fig_fingerprint.show()

In [50]:
# Group by Similarity
similarity_group = final_results.groupby('Similarity').agg({'R2': 'mean', 'RMSE': 'mean'}).reset_index()

# Create the figure for Similarity-wise averages
fig_similarity = go.Figure()

# R2 Score
fig_similarity.add_trace(go.Bar(
    x=similarity_group['Similarity'],
    y=similarity_group['R2'],
    name='R2 Score (Similarity)',
    text=similarity_group['R2'].map(lambda x: f"{x:.4f}"),
    textposition='auto'
))

# RMSE Score
fig_similarity.add_trace(go.Scatter(
    x=similarity_group['Similarity'],
    y=similarity_group['RMSE'],
    name='RMSE Score (Similarity)',
    mode='lines+markers',
    yaxis='y2'
))

# Update layout for Similarity plot
fig_similarity.update_layout(
    title='Average R2 and RMSE Scores by Similarity',
    xaxis_title='Similarity',
    yaxis_title='R2 Score',
    yaxis = dict(range=[0.6, 0.85]),
    yaxis2=dict(
        title='RMSE Score',
        overlaying='y',
        side='right'
    ),
    barmode='group',
    width=900,height=600,
    legend=dict(
        orientation='h',
        x=1,
        xanchor='center',
        y=1.15)
)

fig_similarity.show()

In [68]:
# Find the best model for each dataset
best_models = final_results.loc[final_results.groupby('dataset')['R2'].idxmax()]

fig = px.bar(best_models, x='dataset', y='R2', color='Model',
             text='R2', title='Best Model for Each Dataset with R2 Label',
             labels={'R2': 'R2 Score', 'dataset': 'Dataset'})
fig.update_traces(texttemplate='%{text:.4f}', textposition='auto')
fig.update_layout(width=1000,height=600,yaxis=dict(range=[0.6, 0.9]),
                   legend=dict(orientation='h', x=1, xanchor='center', y=1.15)
                   )      
fig.show()

In [63]:
overall_best = final_results.loc[final_results['R2'].idxmax()]

# Create the vertical bar plot
fig = px.bar(final_results, x='dataset', y='R2', color='Model',
             title='Best Model Across All Dataset Types',
             labels={'R2': 'R2 Score', 'dataset': 'Dataset'},
             text='R2')

# Highlight the best model
fig.add_trace(go.Scatter(x=[overall_best['dataset']], y=[overall_best['R2']],
                         mode='markers+text', text=[f'Best: {overall_best["Model"]} ({overall_best["R2"]:.2f})'],
                         marker=dict(color='red', size=5),
                         textposition='top center'))

# Update layout to show R2 scores on bars
fig.update_layout(barmode='group', yaxis=dict(range=[0.6, 0.9]),
                  width=1200, height=600,
                    legend=dict(orientation='h', x=1, xanchor='center', y=1.15)
                  )

# Show the figure
fig.show()