# Small molecule generation

Please make sure you've added your ANTHROPIC_API_KEY and MOLMIN_API_KEY to a .env file in the root of the project.

In [None]:
import functools
import glob
import os
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw

def plot_boxenplot(df, x, y, hue, title, order):
    """Draw a boxen plot of column *y* against column *x*, split by *hue*.

    A new 9x5 inch figure is created, the hue levels are drawn in the
    sequence given by *order* using a fixed colour palette, and the legend
    is anchored just outside the axes (upper-left corner of the legend at
    x=1.05). Nothing is returned; the figure stays the current pyplot figure.
    """
    palette = [
        "#B0B1B6",
        "#5359CC",
        "#DA6AF7",
        "#AA6BE0",
        "#F94156",
        "#FBAC3B",
        "#FFFFFF",
    ]
    figure, axes = plt.subplots(figsize=(9, 5))
    sns.boxenplot(data=df, x=x, y=y, hue=hue, hue_order=order, palette=palette, ax=axes)
    axes.set_title(title)
    # Anchor the legend outside the plotting area so it does not cover the boxes.
    axes.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)
    figure.tight_layout()
    
    
# Dataset short name -> path of the CSV file read for that dataset.
# The short names are the values matched against the "Dataset" column later on.
DATASET_FILE = dict(
    a2a="data/adenosineA2A.csv",
    aryl="data/Aryl piperazine.csv",
    sirt2="data/SIRT2.csv",
)

### Run the models
Run the run_models.py script to generate new molecules using the different models, e.g. MolMim.

In [None]:
# IPython shell escape: runs run_models.py with the `python` found on PATH.
# NOTE(review): presumably this writes the per-model CSV files under output/
# that the post-processing cell reads — confirm against run_models.py.
!python run_models.py

### Post processing and data cleaning
Join all the generated output together in one DataFrame and remove any invalid SMILES.

In [None]:
# Merge all data into a single DataFrame: every CSV under output/ plus the
# dataset files listed in DATASET_FILE.
# - dict.values() is a view, not a list, so it has to be converted before it
#   can be concatenated with the list returned by glob.
# - glob returns paths in arbitrary order; sorting keeps the row order (and so
#   any seeded sampling done later) reproducible between runs.
csv_paths = sorted(glob.glob("output/*.csv")) + list(DATASET_FILE.values())
# ignore_index=True gives every row a unique label; without it each file
# restarts at 0 and index-aligned operations on the merged frame break.
df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

# Rows without a model label are tagged "original". Missing values come back
# from read_csv/concat as NaN, which is truthy, so a plain truthiness test
# would leave them untouched; test for NaN (and the empty string) explicitly.
# NOTE(review): assumes at least one input file provides a "Model" column — confirm.
model_labels = df["Model"]
df["Model"] = model_labels.mask(model_labels.isna() | (model_labels == ""), "original")


# Clean up Smiles for analysis
def canonicalize_and_validate_smiles(smiles):
    """Return the canonical SMILES RDKit writes for *smiles*, or NaN if invalid.

    Args:
        smiles: The SMILES string to validate. Any non-string value (for
            example the float NaN pandas uses for missing cells) is treated
            as invalid.

    Returns:
        The string produced by ``Chem.MolToSmiles`` for the parsed molecule,
        or ``np.nan`` when the input is not a string, RDKit cannot parse it,
        or RDKit raises while processing it.
    """
    # Reject missing / non-text cells up front instead of relying on RDKit
    # raising an argument error for them.
    if not isinstance(smiles, str):
        return np.nan
    try:
        molecule = Chem.MolFromSmiles(smiles)
        canonical = Chem.MolToSmiles(molecule) if molecule else np.nan
    except Exception:
        # Best effort: any RDKit failure marks the row as invalid. Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        return np.nan
    return canonical

# Canonicalise every entry of the "Smiles" column, then keep only the rows for
# which a canonical form was produced (rejected entries come back as NaN).
canonical_column = df["Smiles"].apply(canonicalize_and_validate_smiles)
df = df.assign(**{"canonical smiles": canonical_column}).dropna(subset=["canonical smiles"])

### Visual inspection of generated molecules

In [None]:
# Show a random sample of molecules from each model for the "aryl" dataset.
subset = df[df["Dataset"] == "aryl"]
models = ["original", "crem", "molmin", "claude", "claude_scaffold", "reinvent"]

# `seed` was used without being defined anywhere in the notebook; pin it here
# so the sampled grid is reproducible.
seed = 42
n_per_model = 5

samples = {}
for model in models:
    model_rows = subset[subset["Model"] == model]
    # DataFrame.sample raises when asked for more rows than exist, so cap the
    # request at what this model actually produced.
    n_rows = min(n_per_model, len(model_rows))
    samples[model] = model_rows.sample(n_rows, random_state=seed)["Smiles"].values

# zip() interleaves the samples so each grid row holds one molecule per model,
# in `models` order; it stops at the model with the fewest samples.
grid_rows = list(zip(*samples.values()))
molecules = [Chem.MolFromSmiles(smiles) for row in grid_rows for smiles in row]
# One legend per grid cell: the model labels repeated once per grid row.
legends = [model.replace("_", " ").capitalize() for model in models] * len(grid_rows)

# Draw the grid image (one column per model). Chem.Draw is a submodule that
# `from rdkit import Chem` does not load, hence the explicit Draw import.
img = Draw.MolsToGridImage(
    molecules,
    molsPerRow=len(models),
    subImgSize=(300, 300),
    legends=legends,
    returnPNG=False,
)
# A bare expression on the last line makes the notebook render the image.
img



In [None]:
# Plot the results

# Calculate Tanimoto Similarities
@functools.lru_cache(maxsize=None)
def _morgan_fingerprint(smiles: str):
    """Return the radius-2 Morgan bit-vector fingerprint of *smiles*.

    Results are memoised per SMILES string so that the same molecule (in
    particular every original molecule, which is compared against on each
    call) is only parsed and fingerprinted once.
    """
    return AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles), radius=2)


def calculate_tanimoto_similarities(reference_smiles: str, original_smiles_list: List[str]) -> List[float]:
    """Return the Tanimoto similarity of one molecule to each molecule in a list.

    Args:
        reference_smiles: SMILES string of the molecule being scored.
        original_smiles_list: SMILES strings to compare against (any sized
            sequence, e.g. a list or a NumPy array).

    Returns:
        One similarity per entry of *original_smiles_list*, in the same
        order; an empty list when there is nothing to compare against.

    Note:
        Every SMILES must be parseable by RDKit; an unparseable string makes
        the fingerprint call raise.
    """
    # len() rather than truthiness: NumPy arrays do not support `not array`.
    if len(original_smiles_list) == 0:
        return []
    ref_fp = _morgan_fingerprint(reference_smiles)
    fps_list = [_morgan_fingerprint(smiles) for smiles in original_smiles_list]
    # BulkTanimotoSimilarity is provided by rdkit.DataStructs, not rdkit.Chem.
    return DataStructs.BulkTanimotoSimilarity(ref_fp, fps_list)


# Score every molecule by its highest Tanimoto similarity to any "original"
# molecule of the same dataset. Original molecules are scored against the same
# set and therefore against themselves.
df["Tanimoto Score"] = np.nan

for dataset in DATASET_FILE:
    in_dataset = df["Dataset"] == dataset
    original_smiles = df.loc[in_dataset & (df["Model"] == "original"), "Smiles"].tolist()
    if not original_smiles:
        # Nothing to compare against: leave the scores as NaN instead of
        # crashing on the maximum of an empty list.
        continue
    # Write through a boolean mask on df itself. Assigning a plain list is
    # positional, so this does not depend on the index labels being unique
    # (unlike filling a filtered copy and merging it back with df.update).
    df.loc[in_dataset, "Tanimoto Score"] = [
        np.max(calculate_tanimoto_similarities(smiles, original_smiles))
        for smiles in df.loc[in_dataset, "Smiles"]
    ]

# to_csv does not create missing directories.
os.makedirs("results", exist_ok=True)
df.to_csv(os.path.join("results", "combined_with_tanimoto_scores.csv"), index=False)

# Plotting
# NOTE: this repeats the plot_boxenplot definition from the first code cell.
def plot_boxenplot(df, x, y, hue, title, order):
    """Draw a boxen plot of column *y* against column *x*, split by *hue*.

    A new 9x5 inch figure is created, the hue levels are drawn in the
    sequence given by *order* using a fixed colour palette, and the legend
    is anchored just outside the axes (upper-left corner of the legend at
    x=1.05). Nothing is returned; the figure stays the current pyplot figure.
    """
    palette = [
        "#B0B1B6",
        "#5359CC",
        "#DA6AF7",
        "#AA6BE0",
        "#F94156",
        "#FBAC3B",
        "#FFFFFF",
    ]
    figure, axes = plt.subplots(figsize=(9, 5))
    sns.boxenplot(data=df, x=x, y=y, hue=hue, hue_order=order, palette=palette, ax=axes)
    axes.set_title(title)
    # Anchor the legend outside the plotting area so it does not cover the boxes.
    axes.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)
    figure.tight_layout()


# Sequence in which the models appear within each dataset group and in the legend.
order = [
    "original",
    "molmin",
    "claude",
    "claude_scaffold",
    "reinvent",
    "crem",
]
plot_boxenplot(
    df,
    x="Dataset",
    y="Tanimoto Score",
    hue="Model",
    title="Tanimoto Similarity Score",
    order=order,
)
