In [14]:
%reset -f
import rdkit
import pycaret
import joblib

print('rdkit: %s' % rdkit.__version__)
print('pycaret: %s' % pycaret.__version__)
print('joblib: %s' % joblib.__version__)

rdkit: 2023.09.5
pycaret: 3.2.0
joblib: 1.3.2


In [15]:
%%time
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys, Descriptors

# Defining the SMILES strings for the cucumin varients
curcumin_variants = (
    "COc3cc(OC)c(C/C=C/C2=CC(/C=C/Cc1c(OC)cc(OC)cc1OC)=[O+][B-](F)(F)O2)c(OC)c3",
    "COc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1c(OC)cc(OC)cc1OC)=C2)cc3",
    "COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(OC)cc1OC)=[O+][B-](F)(F)O2)c(OC)c3",
    "COc5ccc(C/C=C/C4=[O+][B-](F)(F)OC(/C=C/Cc2c1ccccc1cc3ccccc23)=C4)cc5",
    "COc3ccc(OC)c(C/C=C/C2=CC(/C=C/Cc1cc(OC)ccc1OC)=[O+][B-](F)(F)O2)c3",
    "COc3cc(C/C=C/C2=CC(/C=C/Cc1ccc(O)c(OC)c1)=[O+][B-](F)(F)O2)ccc3O",
    "F[B-]3(F)OC(/C=C/Cc1ccc(Br)cc1)=CC(/C=C/Cc2ccc(Br)cc2)=[O+]3",
    "C=C(OC)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C(=C)OC)cc1)=[O+][B-](F)(F)O2)cc3",
    "C=C(OC)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C(=C)OC)cc1)=[O+][B-](F)(F)O2)cc3",
    "CN(C)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(N(C)C)cc1)=[O+][B-](F)(F)O2)cc3",
    "CCCOc1cc(N(CC)CC)ccc1C/C=C/C3=CC(/C=C/Cc2ccc(N(CC)CC)cc2OCCC)=[O+][B-](F)(F)O3",
    "CCCOc1cc(N(CC)CC)ccc1C/C=C/C3=CC(/C=C/Cc2ccc(N(CC)CC)cc2OCCC)=[O+][B-](F)(F)O3",
    "N#Cc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1ccccc1)=C2)cc3",
    "COc6ccc(C/C=C/C5=CC(/C=C/Cc1cc2ccc3cccc4ccc(c1)c2c34)=[O+][B-](F)(F)O5)cc6",
    "COc4ccc(C/C=C/C3=CC(/C=C/Cc1ccc(OC)c2ccccc12)=[O+][B-](F)(F)O3)c5ccccc45",
    "CN(C)c4ccc(C/C=C/C3=CC(/C=C/Cc1ccc(N(C)C)c2ccccc12)=[O+][B-](F)(F)O3)c5ccccc45",
    "N#Cc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C#N)cc1)=[O+][B-](F)(F)O2)cc3",
    "CCCCN(CCCC)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(N(CCCC)CCCC)cc1)=[O+][B-](F)(F)O2)cc3",
    "COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C#N)cc1)=[O+][B-](F)(F)O2)cc3",
    "CN5/C(=C\C\C=C\C3=CC(/C=C/C/C=C/2N(C)c1ccccc1C2(C)C)=[O+][B-](F)(F)O3)C(C)(C)c4ccccc45",
    "COc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1ccc(SC)cc1)=C2)cc3",
    "CSc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(SC)cc1)=[O+][B-](F)(F)O2)cc3",
    "COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(N(C)C)cc1)=[O+][B-](F)(F)O2)cc3",
    "COc5ccc(C/C=C/C4=[O+][B-](F)(F)OC(/C=C/Cc3ccc2c1ccccc1n(C)c2c3)=C4)cc5",
    "COc1ccccc1C/C=C/C3=CC(/C=C/Cc2ccccc2OC)=[O+][B-](F)(F)O3",
    "CCCCCC(CC)c5ccc(c4ccc(C/C=C/C3=CC(/C=C/Cc2ccc(c1ccc(C(CC)CCCCC)s1)s2)=[O+][B-](F)(F)O3)s4)s5"
)

# Defining the molecule names and HOMO-LUMO Gap
molecule_names = ['2-ADMeO3', '3-MR83a', 'AD-10', 'AD-1013', 'AD-1022', 'AD-11', 'AD-12', 'AD-13', 'AD-13-DMF', 'AD-14-Moore', 'AD-15', 'AD-15-DMF', 'AD-16-DMF', 'AD-18', 'AD-24', 'AD-25', 'AD-3', 'AD-35', 'AD-4', 'AD-48', 'AD-5', 'AD-6', 'AD-7', 'AD-8', 'AD-9', 'YD-30']
homo_lumo_gap = [3.077, 3.072, 3.259, 2.625, 2.938, 2.946, 3.191, 3.226, 3.228, 2.811, 2.971, 2.813, 3.231, 2.735, 2.878, 2.686, 3.215, 2.77, 3.001, 2.702, 2.97, 2.89, 2.859, 2.97, 3.137, 2.525]

molecules = [Chem.MolFromSmiles(smiles) for smiles in curcumin_variants]
mws = [round(Descriptors.MolWt(mol),3) for mol in molecules]
logp = [Descriptors.MolLogP(mol) for mol in molecules]

# Create the initial DataFrame
data = {
    'Molecule': molecule_names,
    'Molecular Weight': mws,
    'LogP': logp,
    'Homo-Lumo Gap (eV)': homo_lumo_gap,
    'Smiles': curcumin_variants
}
curcumin_df = pd.DataFrame(data)
curcumin_df['mol'] = curcumin_df['Smiles'].apply(Chem.MolFromSmiles)

# Harvard OPV dataset import
data = pd.read_csv('https://raw.githubusercontent.com/AjStephan/havard-smile-opv/main/Non-fullerene%20small-molecules%20acceptors.csv')
opv_df = data.drop(columns=[
    'index', 'inchikey', 'HOMO_calc', 'LUMO_calc', 'LUMO_calib', 'LUMO_calib_stds',
    'HOMO_calib', 'HOMO_calib_stds','GAP_calc', 'molW', 'PCE_calc', 'Voc_calc', 'Jsc_calc',
    'FF_calc', 'EQE_calc', 'PCE_calib', 'Voc_calib', 'Jsc_calib', 'FF_calib',
    'EQE_calib', 'PCE_cdiff', 'PCE_calib_plus'], axis=1)

opv_df['mol'] = opv_df['smiles'].apply(Chem.MolFromSmiles)

CPU times: total: 11.8 s
Wall time: 15.1 s


In [16]:
%%time
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors
from rdkit.DataStructs import ExplicitBitVect
import numpy as np

# Functions to generate fingerprints
def generate_morgan_fingerprint(mol, radius=2, nBits=2048):
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)

def generate_maccs166_fingerprint(mol):
    return MACCSkeys.GenMACCSKeys(mol)

def generate_atom_pair_fingerprint(mol, nBits=2048):
    fp = rdMolDescriptors.GetAtomPairFingerprint(mol)
    return convert_to_bit_vector(fp, nBits)

def generate_fcfp_fingerprint(mol, radius=2, nBits=2048):
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits, useFeatures=True)

# Convert any fingerprint to a bit vector
def convert_to_bit_vector(fp, nBits=2048):
    bit_vector = ExplicitBitVect(nBits)
    for bit in fp.GetNonzeroElements().keys():
        bit_vector.SetBit(bit % nBits)
    return bit_vector

# Function to add fingerprint to dataframe
def add_fingerprint_to_df(df, fingerprint_func, fp_name):
    df[fp_name] = df['mol'].apply(fingerprint_func)
    return df

# Function to split fingerprints into separate bit columns
def split_fingerprint_bits(df, fp_column, prefix):
    bit_array = np.array([list(fp) for fp in df[fp_column].values])
    bit_columns = pd.DataFrame(bit_array, columns=[f'{prefix}_{i}' for i in range(bit_array.shape[1])])
    df = pd.concat([df.drop(columns=[fp_column]), bit_columns], axis=1)
    return df

# Assuming opv_df is already defined and 'mol' column has been created
# List of fingerprint generation functions and names
fingerprint_functions = [
    (generate_morgan_fingerprint, 'morgan_fp', 'morgan'),
    (generate_maccs166_fingerprint, 'maccs_fp', 'maccs'),
    (generate_atom_pair_fingerprint, 'atom_pair_fp', 'atom_pair'),
    (generate_fcfp_fingerprint, 'fcfp_fp', 'fcfp')
]

# List to store resulting dataframes
resulting_dfs = []

# Generate and store dataframes with fingerprints
for fp_func, fp_name, prefix in fingerprint_functions:
    df_copy = opv_df.copy()
    df_copy = add_fingerprint_to_df(df_copy, fp_func, fp_name)
    df_copy = split_fingerprint_bits(df_copy, fp_name, prefix)
    resulting_dfs.append(df_copy)
    print(f'Finished processing {prefix} fingerprints')

# The resulting_dfs list now contains your four dataframes with split fingerprint columns
opv_df_morgan = resulting_dfs[0]
opv_df_maccs = resulting_dfs[1]
opv_df_atom_pair = resulting_dfs[2]
opv_df_fcfp = resulting_dfs[3]

Finished processing morgan fingerprints
Finished processing maccs fingerprints
Finished processing atom_pair fingerprints
Finished processing fcfp fingerprints
CPU times: total: 4min 8s
Wall time: 4min 15s


In [6]:
%%time
import pandas as pd
from sklearn.model_selection import train_test_split
from pycaret.regression import *
import time

# Define the fingerprint datasets
fingerprint_datasets = [
    (opv_df_morgan, 'Morgan'),
    (opv_df_maccs, 'MACCS'),
    (opv_df_atom_pair, 'AtomPair'),
    (opv_df_fcfp, 'FCFP')
]

def train_models_and_get_results(fingerprint_data, fingerprint_name):
    print(f"Processing the training of {fingerprint_name}...")
    start_time = time.time()
    
    # Splitting the data
    X = fingerprint_data.iloc[:, 3:]
    y = fingerprint_data["GAP_calib"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Preparing the data for PyCaret
    setup_data = pd.concat([X_train, y_train], axis=1)

    # Setting up PyCaret regression
    regression_setup = setup(setup_data, target='GAP_calib', verbose=False, session_id=123)

    # Selecting and comparing models
    models_to_include = ['lightgbm', 'br', 'knn', 'par']
    best_model = compare_models(include=models_to_include, sort='r2', verbose=False)
    
    # Pulling the results
    results = pull()

    # Adding the fingerprint type and time taken to the results
    results['Fingerprint Type'] = fingerprint_name
    results['Time Taken (s)'] = time.time() - start_time

    print(f"Completed pulling results of {fingerprint_name}")

    return results, best_model

all_results = []
best_models = {}

for fp_data, fp_name in fingerprint_datasets:
    fp_results, best_model = train_models_and_get_results(fp_data, fp_name)
    all_results.append(fp_results)
    best_models[fp_name] = best_model

final_results = pd.concat(all_results, ignore_index=True)
final_results.to_csv(path_or_buf=r"C:\Users\Chamod Peiris\Documents\Curcumin_Scripts\Phase01.csv")
final_results.head()

Processing the training of Morgan...
Completed pulling results of Morgan
Processing the training of MACCS...
Completed pulling results of MACCS
Processing the training of AtomPair...
Completed pulling results of AtomPair
Processing the training of FCFP...
Completed pulling results of FCFP
CPU times: total: 54.5 s
Wall time: 8min


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec),Fingerprint Type,Time Taken (s)
0,Light Gradient Boosting Machine,0.1602,0.0508,0.2234,0.8673,0.0582,0.061,2.109,Morgan,161.588898
1,Bayesian Ridge,0.1793,0.0623,0.2482,0.8368,0.0675,0.0696,9.329,Morgan,161.588898
2,K Neighbors Regressor,0.1954,0.0787,0.2792,0.7938,0.076,0.0778,2.494,Morgan,161.588898
3,Passive Aggressive Regressor,0.2332,0.0959,0.3087,0.748,0.0841,0.0901,0.922,Morgan,161.588898
4,Light Gradient Boosting Machine,0.2315,0.0964,0.3095,0.7471,0.0827,0.0894,0.342,MACCS,11.351094


In [3]:
import pandas as pd
path = (r"C:\Users\Chamod Peiris\Documents\Curcumin_Scripts\Phase01.csv")
final_results = pd.read_csv(path)
final_results

Unnamed: 0.1,Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec),Fingerprint Type,Time Taken (s)
0,0,Light Gradient Boosting Machine,0.1602,0.0508,0.2234,0.8673,0.0582,0.061,2.109,Morgan,161.588898
1,1,Bayesian Ridge,0.1793,0.0623,0.2482,0.8368,0.0675,0.0696,9.329,Morgan,161.588898
2,2,K Neighbors Regressor,0.1954,0.0787,0.2792,0.7938,0.076,0.0778,2.494,Morgan,161.588898
3,3,Passive Aggressive Regressor,0.2332,0.0959,0.3087,0.748,0.0841,0.0901,0.922,Morgan,161.588898
4,4,Light Gradient Boosting Machine,0.2315,0.0964,0.3095,0.7471,0.0827,0.0894,0.342,MACCS,11.351094
5,5,K Neighbors Regressor,0.2415,0.1102,0.3311,0.7105,0.0899,0.0943,0.229,MACCS,11.351094
6,6,Bayesian Ridge,0.2992,0.1534,0.3912,0.5966,0.1065,0.1176,0.213,MACCS,11.351094
7,7,Passive Aggressive Regressor,0.3556,0.2114,0.4582,0.4441,0.1248,0.1415,0.076,MACCS,11.351094
8,8,Light Gradient Boosting Machine,0.2583,0.1271,0.3556,0.6663,0.0969,0.1026,0.965,AtomPair,144.205276
9,9,K Neighbors Regressor,0.2763,0.1525,0.3898,0.5994,0.1071,0.1103,2.397,AtomPair,144.205276


In [13]:
import plotly.graph_objects as go
# Create a bar plot
fig = go.Figure()

# Add bars for each fingerprint type
for fingerprint_type in final_results['Fingerprint Type'].unique():
    filtered_df = final_results[final_results['Fingerprint Type'] == fingerprint_type]
    fig.add_trace(go.Bar(
        x=filtered_df['Model'],
        y=filtered_df['R2'],
        name=fingerprint_type,
        text=filtered_df['R2'].round(4),
        textposition='auto' 
    ))

# Update layout
fig.update_layout(
    title='Model R2 Scores by Fingerprint Type',
    xaxis_title='Model',
    yaxis_title='R2 Score',
    barmode='group',  # Group bars together
    height=600,
    width=1150
)

# Show the plot
fig.show()

In [12]:
import plotly.graph_objects as go

# Find the best model and fingerprint type
best_row = final_results.loc[final_results['R2'].idxmax()]
best_model = best_row['Model']
best_fingerprint_type = best_row['Fingerprint Type']
best_r2 = best_row['R2']

# Create a bar plot
fig = go.Figure()

# Add bars for each fingerprint type
for fingerprint_type in final_results['Fingerprint Type'].unique():
    filtered_df = final_results[final_results['Fingerprint Type'] == fingerprint_type]
    fig.add_trace(go.Bar(
        x=filtered_df['Model'],
        y=filtered_df['R2'],
        name=fingerprint_type,
        text=filtered_df['R2'].round(4),
        textposition='auto',
        marker=dict(color=['red' if (row['Model'] == best_model and row['Fingerprint Type'] == best_fingerprint_type) else 'blue' for index, row in filtered_df.iterrows()])
    ))

# Add annotation for the best model
fig.add_annotation(
    x=best_model,
    y=best_r2,
    text=f"Best Model: {best_model}<br>Fingerprint Type: {best_fingerprint_type}<br>R2: {best_r2:.4f}",
    showarrow=True,
    arrowhead=2,
    ax=0,
    ay=-40
)

# Update layout
fig.update_layout(
    title='Model R2 Scores by Fingerprint Type',
    xaxis_title='Model',
    yaxis_title='R2 Score',
    barmode='group',  # Group bars together
    height=800,
    width=1000
)

# Show the plot
fig.show()