In [1]:
# Importing dependencies
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors, MACCSkeys
from rdkit.Chem import AllChem
import pandas as pd
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem.Fingerprints import FingerprintMols

# Defining the SMILES strings for the curcumin variants
curcumin_variants = (
    "COc3cc(OC)c(C/C=C/C2=CC(/C=C/Cc1c(OC)cc(OC)cc1OC)=[O+][B-](F)(F)O2)c(OC)c3",
    "COc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1c(OC)cc(OC)cc1OC)=C2)cc3",
    "COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(OC)cc1OC)=[O+][B-](F)(F)O2)c(OC)c3",
    "COc5ccc(C/C=C/C4=[O+][B-](F)(F)OC(/C=C/Cc2c1ccccc1cc3ccccc23)=C4)cc5",
    "COc3ccc(OC)c(C/C=C/C2=CC(/C=C/Cc1cc(OC)ccc1OC)=[O+][B-](F)(F)O2)c3",
    "COc3cc(C/C=C/C2=CC(/C=C/Cc1ccc(O)c(OC)c1)=[O+][B-](F)(F)O2)ccc3O",
    "F[B-]3(F)OC(/C=C/Cc1ccc(Br)cc1)=CC(/C=C/Cc2ccc(Br)cc2)=[O+]3",
    "C=C(OC)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C(=C)OC)cc1)=[O+][B-](F)(F)O2)cc3",
    "C=C(OC)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C(=C)OC)cc1)=[O+][B-](F)(F)O2)cc3",
    "CN(C)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(N(C)C)cc1)=[O+][B-](F)(F)O2)cc3",
    "CCCOc1cc(N(CC)CC)ccc1C/C=C/C3=CC(/C=C/Cc2ccc(N(CC)CC)cc2OCCC)=[O+][B-](F)(F)O3",
    "CCCOc1cc(N(CC)CC)ccc1C/C=C/C3=CC(/C=C/Cc2ccc(N(CC)CC)cc2OCCC)=[O+][B-](F)(F)O3",
    "N#Cc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1ccccc1)=C2)cc3",
    "COc6ccc(C/C=C/C5=CC(/C=C/Cc1cc2ccc3cccc4ccc(c1)c2c34)=[O+][B-](F)(F)O5)cc6",
    "COc4ccc(C/C=C/C3=CC(/C=C/Cc1ccc(OC)c2ccccc12)=[O+][B-](F)(F)O3)c5ccccc45",
    "CN(C)c4ccc(C/C=C/C3=CC(/C=C/Cc1ccc(N(C)C)c2ccccc12)=[O+][B-](F)(F)O3)c5ccccc45",
    "N#Cc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C#N)cc1)=[O+][B-](F)(F)O2)cc3",
    "CCCCN(CCCC)c3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(N(CCCC)CCCC)cc1)=[O+][B-](F)(F)O2)cc3",
    "COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(C#N)cc1)=[O+][B-](F)(F)O2)cc3",
    "CN5/C(=C\C\C=C\C3=CC(/C=C/C/C=C/2N(C)c1ccccc1C2(C)C)=[O+][B-](F)(F)O3)C(C)(C)c4ccccc45",
    "COc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1ccc(SC)cc1)=C2)cc3",
    "CSc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(SC)cc1)=[O+][B-](F)(F)O2)cc3",
    "COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(N(C)C)cc1)=[O+][B-](F)(F)O2)cc3",
    "COc5ccc(C/C=C/C4=[O+][B-](F)(F)OC(/C=C/Cc3ccc2c1ccccc1n(C)c2c3)=C4)cc5",
    "COc1ccccc1C/C=C/C3=CC(/C=C/Cc2ccccc2OC)=[O+][B-](F)(F)O3",
    "CCCCCC(CC)c5ccc(c4ccc(C/C=C/C3=CC(/C=C/Cc2ccc(c1ccc(C(CC)CCCCC)s1)s2)=[O+][B-](F)(F)O3)s4)s5"
)

# Defining the molecule names and HOMO-LUMO Gap
molecule_names = ['2-ADMeO3', '3-MR83a', 'AD-10', 'AD-1013', 'AD-1022', 'AD-11', 'AD-12', 'AD-13', 'AD-13-DMF', 'AD-14-Moore', 'AD-15', 'AD-15-DMF', 'AD-16-DMF', 'AD-18', 'AD-24', 'AD-25', 'AD-3', 'AD-35', 'AD-4', 'AD-48', 'AD-5', 'AD-6', 'AD-7', 'AD-8', 'AD-9', 'YD-30']
homo_lumo_gap = [3.077, 3.072, 3.259, 2.625, 2.938, 2.946, 3.191, 3.226, 3.228, 2.811, 2.971, 2.813, 3.231, 2.735, 2.878, 2.686, 3.215, 2.77, 3.001, 2.702, 2.97, 2.89, 2.859, 2.97, 3.137, 2.525]

# Molecules generation and descriptor calculations
molecules = [Chem.MolFromSmiles(smiles) for smiles in curcumin_variants]
mws = [round(Descriptors.MolWt(mol), 3) for mol in molecules]
logp = [Descriptors.MolLogP(mol) for mol in molecules]
legends = [f"{name} : {gap} eV" for name, gap in zip(molecule_names, homo_lumo_gap)]
Draw.MolsToGridImage(molecules, molsPerRow=4, subImgSize=(300, 250), legends=legends)

# Function to generate Morgan fingerprints
def generate_morgan_fingerprint(smiles, radius=4, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits, useChirality=True)
        return list(fingerprint)
    else:
        return None

# Function to generate MACCS keys
def generate_maccs_keys(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fingerprint = MACCSkeys.GenMACCSKeys(mol)
        return list(fingerprint)
    else:
        return None

# Function to generate FCFP (ECFP) fingerprints
def generate_fcfc_fingerprint(smiles, radius=4, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits, useFeatures=True)
        return list(fingerprint)
    else:
        return None

# Function to generate PubChem fingerprints
def generate_pubchem_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fingerprint = FingerprintMols.FingerprintMol(mol)
        return list(fingerprint)
    else:
        return None

# Creating the initial dataframe
data = {
    'Molecule': molecule_names,
    'Molecular Weight': mws,
    'LogP': logp,
    'Homo-Lumo Gap (eV)': homo_lumo_gap,
    'Smiles': curcumin_variants
}

curcumin_df = pd.DataFrame(data)

# Generate Morgan fingerprints and create the dataframe
curcumin_df['Morgan_Fingerprint'] = curcumin_df['Smiles'].apply(generate_morgan_fingerprint)
morgan_df = pd.DataFrame(curcumin_df['Morgan_Fingerprint'].tolist(), columns=[f'morgan_{i}' for i in range(2048)])
curcumin_df_mf = pd.concat([curcumin_df.drop(columns=['Morgan_Fingerprint']), morgan_df], axis=1)

# Generate MACCS keys and create the dataframe
curcumin_df['MACCS_Keys'] = curcumin_df['Smiles'].apply(generate_maccs_keys)
maccs_df = pd.DataFrame(curcumin_df['MACCS_Keys'].tolist(), columns=[f'maccs_{i}' for i in range(167)])
curcumin_df_mac = pd.concat([curcumin_df.drop(columns=['MACCS_Keys','Morgan_Fingerprint']), maccs_df], axis=1)

# Generate FCFP (ECFP) fingerprints and create the dataframe
curcumin_df['FCFP_Fingerprint'] = curcumin_df['Smiles'].apply(generate_fcfc_fingerprint)
fcfp_df = pd.DataFrame(curcumin_df['FCFP_Fingerprint'].tolist(), columns=[f'fcfp_{i}' for i in range(2048)])
curcumin_df_fcfp = pd.concat([curcumin_df.drop(columns=['FCFP_Fingerprint','MACCS_Keys','Morgan_Fingerprint']), fcfp_df], axis=1)

# Generate PubChem fingerprints and create the dataframe
curcumin_df['PubChem_Fingerprint'] = curcumin_df['Smiles'].apply(generate_pubchem_fingerprint)
pubchem_df = pd.DataFrame(curcumin_df['PubChem_Fingerprint'].tolist())
pubchem_df.columns = [f'pubchem_{i}' for i in range(pubchem_df.shape[1])]
curcumin_df_pubchem = pd.concat([curcumin_df.drop(columns=['PubChem_Fingerprint','MACCS_Keys','Morgan_Fingerprint','FCFP_Fingerprint']), pubchem_df], axis=1)

# # Save final dataframes
# curcumin_df_mf.to_csv('curcumin_morgan_fingerprints.csv', index=False)
# curcumin_df_mac.to_csv('curcumin_maccs_keys.csv', index=False)
# curcumin_df_fcfp.to_csv('curcumin_fcfp_fingerprints.csv', index=False)
# curcumin_df_pubchem.to_csv('curcumin_pubchem_fingerprints.csv', index=False)

# # Optionally, combine all descriptors into one dataframe
# combined_df = pd.concat([curcumin_df.drop(columns=['Morgan_Fingerprint', 'MACCS_Keys', 'FCFP_Fingerprint', 'PubChem_Fingerprint']), morgan_df, maccs_df, fcfp_df, pubchem_df], axis=1)
# combined_df.to_csv('curcumin_combined_fingerprints.csv', index=False)

In [2]:
curcumin_df_mf.head()

Unnamed: 0,Molecule,Molecular Weight,LogP,Homo-Lumo Gap (eV),Smiles,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,2-ADMeO3,532.345,5.0293,3.077,COc3cc(OC)c(C/C=C/C2=CC(/C=C/Cc1c(OC)cc(OC)cc1...,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,3-MR83a,472.293,5.0121,3.072,COc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1c(OC...,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,AD-10,472.293,5.0121,3.259,COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(OC)cc1OC)=[O+]...,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,AD-1013,482.335,7.2927,2.625,COc5ccc(C/C=C/C4=[O+][B-](F)(F)OC(/C=C/Cc2c1cc...,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,AD-1022,472.293,5.0121,2.938,COc3ccc(OC)c(C/C=C/C2=CC(/C=C/Cc1cc(OC)ccc1OC)...,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
curcumin_df_mac.head()

Unnamed: 0,Molecule,Molecular Weight,LogP,Homo-Lumo Gap (eV),Smiles,maccs_0,maccs_1,maccs_2,maccs_3,maccs_4,...,maccs_157,maccs_158,maccs_159,maccs_160,maccs_161,maccs_162,maccs_163,maccs_164,maccs_165,maccs_166
0,2-ADMeO3,532.345,5.0293,3.077,COc3cc(OC)c(C/C=C/C2=CC(/C=C/Cc1c(OC)cc(OC)cc1...,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
1,3-MR83a,472.293,5.0121,3.072,COc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1c(OC...,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
2,AD-10,472.293,5.0121,3.259,COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(OC)cc1OC)=[O+]...,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
3,AD-1013,482.335,7.2927,2.625,COc5ccc(C/C=C/C4=[O+][B-](F)(F)OC(/C=C/Cc2c1cc...,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
4,AD-1022,472.293,5.0121,2.938,COc3ccc(OC)c(C/C=C/C2=CC(/C=C/Cc1cc(OC)ccc1OC)...,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0


In [4]:
curcumin_df_fcfp.head()

Unnamed: 0,Molecule,Molecular Weight,LogP,Homo-Lumo Gap (eV),Smiles,fcfp_0,fcfp_1,fcfp_2,fcfp_3,fcfp_4,...,fcfp_2038,fcfp_2039,fcfp_2040,fcfp_2041,fcfp_2042,fcfp_2043,fcfp_2044,fcfp_2045,fcfp_2046,fcfp_2047
0,2-ADMeO3,532.345,5.0293,3.077,COc3cc(OC)c(C/C=C/C2=CC(/C=C/Cc1c(OC)cc(OC)cc1...,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,3-MR83a,472.293,5.0121,3.072,COc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1c(OC...,1,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0
2,AD-10,472.293,5.0121,3.259,COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(OC)cc1OC)=[O+]...,1,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0
3,AD-1013,482.335,7.2927,2.625,COc5ccc(C/C=C/C4=[O+][B-](F)(F)OC(/C=C/Cc2c1cc...,1,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0
4,AD-1022,472.293,5.0121,2.938,COc3ccc(OC)c(C/C=C/C2=CC(/C=C/Cc1cc(OC)ccc1OC)...,1,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0


In [5]:
curcumin_df_pubchem.head()

Unnamed: 0,Molecule,Molecular Weight,LogP,Homo-Lumo Gap (eV),Smiles,pubchem_0,pubchem_1,pubchem_2,pubchem_3,pubchem_4,...,pubchem_2038,pubchem_2039,pubchem_2040,pubchem_2041,pubchem_2042,pubchem_2043,pubchem_2044,pubchem_2045,pubchem_2046,pubchem_2047
0,2-ADMeO3,532.345,5.0293,3.077,COc3cc(OC)c(C/C=C/C2=CC(/C=C/Cc1c(OC)cc(OC)cc1...,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,3-MR83a,472.293,5.0121,3.072,COc3ccc(C/C=C/C2=[O+][B-](F)(F)OC(/C=C/Cc1c(OC...,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,AD-10,472.293,5.0121,3.259,COc3ccc(C/C=C/C2=CC(/C=C/Cc1ccc(OC)cc1OC)=[O+]...,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,AD-1013,482.335,7.2927,2.625,COc5ccc(C/C=C/C4=[O+][B-](F)(F)OC(/C=C/Cc2c1cc...,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,AD-1022,472.293,5.0121,2.938,COc3ccc(OC)c(C/C=C/C2=CC(/C=C/Cc1cc(OC)ccc1OC)...,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
