In [None]:
import pandas as pd
import json
from datetime import datetime
import plotly.express as px
from pathlib import Path

In [None]:
# NOTE(review): machine-specific absolute root; point this at your own checkout
# (or an environment variable) before re-running on another machine.
science_dir = Path("/Users/alexpayne/Scientific_Projects/asapdiscovery-sars-retrospective/science")
ligand_data_dir = science_dir / "20241025_ligand_analysis" / "data"

# Kept as plain strings so any code that treats them as text keeps working.
input_csv = str(ligand_data_dir / "generic_cluster_labels.csv")
compound_data_csv = str(ligand_data_dir / "unique_compounds.csv")
date_json = str(
    science_dir
    / "20240403_multi_pose_docking_v2"
    / "20240430_analyze_cross_docking_results"
    / "20240503_inputs_analysis"
    / "date_dict.json"
)

# Output directory for every figure written below; create it up front so the
# write_image calls do not fail on a fresh checkout.
figures = Path("./figures")
figures.mkdir(parents=True, exist_ok=True)

In [None]:
def date_processor(date_string):
    """Parse a date string into a ``datetime``.

    Two layouts are accepted, tried in order:
    ``"%Y-%m-%d %H:%M:%S"`` and ``"%d/%m/%Y %H:%M"``.

    Parameters
    ----------
    date_string : object
        Value to parse. Anything that is not a string (e.g. ``None`` or NaN),
        and the literal string ``"None"``, is treated as missing.

    Returns
    -------
    datetime.datetime or None
        The parsed timestamp, or ``None`` for missing input.

    Raises
    ------
    ValueError
        If ``date_string`` is a string that matches neither layout.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_.
    if not isinstance(date_string, str) or date_string == "None":
        return None

    known_formats = ("%Y-%m-%d %H:%M:%S", "%d/%m/%Y %H:%M")
    for fmt in known_formats:
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            continue

    raise ValueError(
        f"date string {date_string!r} does not match any of the formats {known_formats}"
    )

In [None]:
# Read the JSON mapping; its keys become "Name" and its values are parsed
# into dates by date_processor.
with open(date_json, "r") as f:
    raw_dates = json.load(f)

date_dict = [
    {"Name": structure, "Date": date_processor(stamp)}
    for structure, stamp in raw_dates.items()
]
date_df = pd.DataFrame.from_records(date_dict)

# Default (inner) merge: only compounds whose structure_name has a date entry
# are kept.
compound_data = pd.read_csv(compound_data_csv).merge(
    date_df, left_on="structure_name", right_on="Name"
)

# Left merge: every cluster-label row is kept, with compound/date columns
# attached where compound_name matches.
df = pd.read_csv(input_csv).merge(compound_data, on="compound_name", how="left")

## count number of structures per cluster

In [None]:
# Number of non-null compound_name entries per cluster, as a two-column frame.
cluster_counts = (
    df.groupby("cluster_id")["compound_name"]
    .count()
    .reset_index(name="count")
)

In [None]:
# Attach each row's cluster size as a "count" column.
df_ccounts = df.merge(cluster_counts, on="cluster_id", how="left")

## remove singlets

In [None]:
# Keep only rows whose cluster has more than one member.
has_multiple_members = df_ccounts["count"] > 1
no_singlets = df_ccounts[has_multiple_members]

In [None]:
# Same filter applied to the per-cluster count table.
is_multi_member_cluster = cluster_counts["count"] > 1
no_singlets_ccs = cluster_counts[is_multi_member_cluster]

# Plot Scaffolds Over Time

In [None]:
def make_image(df, x_col="Date", color="cluster_id"):
    """Build an ECDF figure of structures over time, one trace per colour group.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the ``x_col`` and ``color`` columns.
    x_col : str, default "Date"
        Column plotted on the x axis.
    color : str, default "cluster_id"
        Column used to split the data into separately coloured traces.

    Returns
    -------
    plotly.graph_objects.Figure
        An 800x600 figure using the ``simple_white`` template. With
        ``ecdfnorm=None`` the y axis is not normalised.
    """
    title_font_size = 24

    fig = px.ecdf(
        df,
        x=x_col,
        color=color,
        ecdfnorm=None,
        template="simple_white",
        height=600,
        width=800,
    )

    # Legend and axis titles (bold via HTML tags).
    fig.update_layout(legend_title_text="<b> Scaffold </b>")
    fig.update_xaxes(title_text="<b> Date of Crystal Structure Collection </b>")
    fig.update_yaxes(title_text="<b> Cumulative Number of Structures </b>")

    axis_style = dict(
        xaxis=dict(
            title_font=dict(size=title_font_size),
            color="black",
        ),
        yaxis=dict(
            title_font=dict(size=title_font_size),
            color="black",
        ),
    )

    # Anchor the legend's bottom-right corner at x=1.1, y=0.25.
    fig.update_layout(
        legend=dict(yanchor="bottom", y=0.25, xanchor="right", x=1.1),
        **axis_style,
    )

    return fig

In [None]:
# Largest cluster id across all rows.
df["cluster_id"].max()

In [None]:
# Largest cluster id that still has more than one member.
no_singlets["cluster_id"].max()

In [None]:
# Inspect the rows whose cluster id is above 4.
df.loc[df["cluster_id"] > 4]

In [None]:
max_id = 5
# Build the shared label once instead of recomputing the column maximum for
# every row inside the lambda.
grouped_label = f'Scaffold {max_id}-{df.cluster_id.max()}'
# Ids below max_id are kept unchanged; every other id is replaced by the one
# shared label, so the column mixes numeric ids and a string.
df['Simplified_Cluster_ID'] = df.cluster_id.apply(lambda x: x if x < max_id else grouped_label)

In [None]:
# NOTE(review): the Simplified_Cluster_ID column on df is not used here; traces
# are coloured by the raw cluster_id. Confirm this is the intended grouping.
fig = make_image(df, x_col="Date", color="cluster_id")

In [None]:
# Save the figure as both a raster and a vector image.
for filename in (
    "20250123_scaffold_over_time.png",
    "20250123_scaffold_over_time.svg",
):
    fig.write_image(figures / filename)

# make sideways bar

In [None]:
# Rows belonging to clusters with id 0-3.
top_clusters = df.loc[df["cluster_id"] < 4]

In [None]:
# Non-null compound_name entries per cluster.
count_df = top_clusters.groupby("cluster_id")["compound_name"].count().reset_index()
count_df.columns = ["Scaffold", "Count"]
# Relabel the ids with a 1-based "Scaffold N" name.
count_df["Scaffold"] = [f'Scaffold {cluster + 1}' for cluster in count_df["Scaffold"]]

In [None]:
# Human-readable names for the count_df columns.
labels = dict(
    Count="Number of Molecules with Scaffold",
    Scaffold="Scaffold ID",
)

In [None]:
# Horizontal bars (counts on x, scaffold names on y). Pass `labels` so the axis
# titles use the readable names instead of the raw column names.
fig = px.bar(
    count_df,
    x='Count',
    y='Scaffold',
    labels=labels,
    template='simple_white',
    height=400,
    width=600,
)
# NOTE(review): the x axis starts at 20 rather than 0, so bar lengths are not
# proportional to the counts; confirm this truncation is intended.
fig.update_xaxes(range=[20,65])
fig.show()
fig.write_image(figures / "20250123_count_scaffolds_sideways_bar.svg")

# Plot Structure Collection Dates

In [None]:
from datetime import datetime

# Reduce each timestamp to its calendar day so structures can be grouped per day.
df["Day"] = df["Date"].apply(lambda stamp: stamp.date())
# Rows whose series column equals 'P'.
df_pseries = df.loc[df["series"] == 'P']

In [None]:
# get date counts
date_counts = df_pseries.groupby('Day').count().reset_index()[['Day', 'compound_name']]
date_counts.columns = ['Day', 'Count']
# sort_values returns a new frame, so keep the result: a cumulative sum over
# these rows is only meaningful in chronological order.
date_counts = date_counts.sort_values('Day').reset_index(drop=True)
date_counts

In [None]:
# Running total of the per-day counts, in the current row order.
date_counts["cumulative"] = date_counts["Count"].cumsum()

In [None]:
# Display the per-day counts together with the cumulative column.
date_counts

# Plot just P-Series

In [None]:
# Rows whose series column equals 'P'.
p_series = df.loc[df["series"] == 'P']

In [None]:
# Same plot as above, restricted to the P series, saved as PNG and SVG.
fig = make_image(p_series)
for filename in (
    "20250123_scaffold_over_time_p_series.png",
    "20250123_scaffold_over_time_p_series.svg",
):
    fig.write_image(figures / filename)

# save svg of all scaffolds

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw, rdDepictor

In [None]:
# One row per cluster: cluster id plus the first non-null scaffold string.
scaffold_mols = df.groupby('cluster_id').first().reset_index()[['cluster_id', 'scaffold_smarts']]
# Only parse real strings: pandas represents a missing value as NaN (a float),
# which an `is not None` check would pass on to RDKit.
# NOTE(review): the column is named scaffold_smarts but is parsed as SMILES;
# confirm the strings are valid SMILES.
scaffold_mols['scaffold_mol'] = scaffold_mols.scaffold_smarts.apply(
    lambda x: Chem.MolFromSmiles(x) if isinstance(x, str) else None
)

In [None]:
def draw_single_mol(mol, fn, size=(400, 400), legend=None):
    """Draw one molecule to an SVG file and to a PNG file next to it.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to draw. Explicit hydrogens are removed and fresh 2D
        coordinates are computed on a copy before drawing.
    fn : str or pathlib.Path
        Path of the SVG file. The PNG is written to the same path with the
        suffix replaced by ``.png``.
    size : tuple of int, default (400, 400)
        Width and height of both images in pixels.
    legend : str, optional
        Text shown under the molecule in the PNG. Defaults to the molecule's
        ``_Name`` property, or an empty string if it has none.
    """
    fn = Path(fn)

    # Take the legend from the molecule itself rather than from a loop variable
    # in the calling cell, so the function works wherever it is called.
    if legend is None:
        legend = mol.GetProp("_Name") if mol.HasProp("_Name") else ""

    mol = Chem.RemoveHs(mol)
    rdDepictor.Compute2DCoords(mol)
    rdDepictor.StraightenDepiction(mol)

    # Vector output.
    drawer = Draw.rdMolDraw2D.MolDraw2DSVG(*size)
    drawer.DrawMolecule(mol)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    with open(fn, "w") as f:
        f.write(svg)

    # Raster output with the legend.
    img = Draw.MolsToImage([mol], subImgSize=size, legends=[legend])
    img.save(fn.with_suffix(".png"))

In [None]:
scaff_dir = figures / "scaffolds"
scaff_dir.mkdir(exist_ok=True)
# Draw every scaffold that has a molecule; rows without one are skipped.
for i, row in scaffold_mols.iterrows():
    scaffold = row.scaffold_mol
    if scaffold is None:
        continue
    scaffold.SetProp("_Name", f"Scaffold {row.cluster_id}")
    svg_path = scaff_dir / f"generic_scaffold_{row.cluster_id}.svg"
    draw_single_mol(scaffold, svg_path)

# Do the same thing, but with the default scaffolds

In [None]:
# Cluster labels from the default clustering, joined to the compound/date table
# the same way as the generic labels.
default_cluster_csv = "/Users/alexpayne/Scientific_Projects/asapdiscovery-sars-retrospective/science/20241025_ligand_analysis/data/default_cluster_labels.csv"
default_df = pd.read_csv(default_cluster_csv).merge(
    compound_data, on="compound_name", how="left"
)

In [None]:
# Rows of the default clustering whose series column equals 'P'.
p_series_default = default_df.loc[default_df["series"] == 'P']

In [None]:
# Display the P-series rows of the default clustering for inspection.
p_series_default

In [None]:
# Plot for the default clustering of the P series, saved as PNG and SVG.
fig = make_image(p_series_default)
for filename in (
    "20250123_scaffold_over_time_p_series_default.png",
    "20250123_scaffold_over_time_p_series_default.svg",
):
    fig.write_image(figures / filename)

In [None]:
# One row per cluster: cluster id plus the first non-null scaffold string.
scaffold_mols = default_df.groupby('cluster_id').first().reset_index()[['cluster_id', 'scaffold_smarts']]
# Only parse real strings: pandas represents a missing value as NaN (a float),
# which an `is not None` check would pass on to RDKit.
# NOTE(review): the column is named scaffold_smarts but is parsed as SMILES;
# confirm the strings are valid SMILES.
scaffold_mols['scaffold_mol'] = scaffold_mols.scaffold_smarts.apply(
    lambda x: Chem.MolFromSmiles(x) if isinstance(x, str) else None
)

In [None]:
scaff_dir = figures / "scaffolds"
scaff_dir.mkdir(exist_ok=True)
# Draw every scaffold that has a molecule; rows without one are skipped.
for i, row in scaffold_mols.iterrows():
    scaffold = row.scaffold_mol
    if scaffold is None:
        continue
    scaffold.SetProp("_Name", f"Scaffold {row.cluster_id}")
    svg_path = scaff_dir / f"default_scaffold_{row.cluster_id}.svg"
    draw_single_mol(scaffold, svg_path)