In [None]:
import json
from datetime import datetime
from pathlib import Path

import pandas as pd

In [None]:
# Input tables, relative to the notebook's working directory.
input_csv = "./data/generic_cluster_labels.csv"
compound_data_csv = "./data/unique_compounds.csv"

# JSON file that maps a name to a date string.
# NOTE(review): this is an absolute path on one developer's machine; the
# notebook will not run elsewhere until it is made relative or configurable.
date_json = (
    "/Users/alexpayne/Scientific_Projects/asapdiscovery-sars-retrospective/science"
    "/20240403_multi_pose_docking_v2"
    "/20240430_analyze_cross_docking_results"
    "/20240503_inputs_analysis"
    "/date_dict.json"
)

# NOTE(review): no cell visible in this notebook reads `output_dir` — confirm
# whether it is still needed.
output_dir = "test"

In [None]:
def date_processor(date_string):
    """Convert a raw date value into a ``datetime``.

    Parameters
    ----------
    date_string : object
        Raw value to parse. Only strings are parsed; any non-string value
        (e.g. ``None`` or a float NaN) and the literal string ``"None"`` are
        treated as "no date".

    Returns
    -------
    datetime.datetime or None
        The parsed timestamp, or ``None`` when the input carries no date.

    Raises
    ------
    ValueError
        If ``date_string`` is a string that matches neither
        ``"%Y-%m-%d %H:%M:%S"`` nor ``"%d/%m/%Y %H:%M"``.
    """
    # isinstance (not an exact type comparison) so that str subclasses such as
    # numpy.str_ are parsed instead of being silently turned into None.
    if not isinstance(date_string, str) or date_string == "None":
        return None

    # Tried in order: the ISO-like layout first, then the day-first layout.
    formats = ("%Y-%m-%d %H:%M:%S", "%d/%m/%Y %H:%M")
    for fmt in formats:
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            continue

    # Report every format that was attempted, not just the last one.
    raise ValueError(
        "time data {!r} does not match any supported format: {}".format(
            date_string, ", ".join(repr(fmt) for fmt in formats)
        )
    )

In [None]:
# Read the name -> date-string mapping and parse every date up front.
with open(date_json, "r") as f:
    date_dict = [
        {"Name": name, "Date": date_processor(date)}
        for name, date in json.load(f).items()
    ]
date_df = pd.DataFrame.from_records(date_dict)

# Attach a date to each compound by matching its structure name. This is an
# inner join (the pandas default), so compounds without a date record are dropped.
compound_data = pd.read_csv(compound_data_csv).merge(
    date_df,
    left_on="structure_name",
    right_on="Name",
)

# Left join: every cluster-label row is kept, and rows whose compound has no
# match in `compound_data` get NaN in the merged-in columns.
df = pd.read_csv(input_csv).merge(
    compound_data,
    on="compound_name",
    how="left",
)

In [None]:
def make_image(df):
    """Plot cumulative structure counts over time, one trace per cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Date`` column (x axis) and a ``cluster_id`` column
        (used to color/split the traces).

    Returns
    -------
    plotly.graph_objects.Figure
        The configured ECDF figure; it is returned, not shown or saved.
    """
    # Imported locally so the function also works when it is called before the
    # notebook's separate plotly import cell has been run.
    import plotly.express as px

    axis_title_font_size = 24

    fig = px.ecdf(
        df,
        x="Date",
        color="cluster_id",
        ecdfnorm=None,  # raw cumulative counts instead of a normalized fraction
        template="simple_white",
        height=600,
        width=800,
    )

    # Titles for the legend and both axes.
    fig.update_layout(legend_title_text="<b> Bemis-Murcko Cluster </b>")
    fig.update_xaxes(title_text="<b> Date of Crystal Structure Collection </b>")
    fig.update_yaxes(title_text="<b> Cumulative Number of Structures </b>")

    axis_style = dict(
        xaxis=dict(
            title_font=dict(size=axis_title_font_size),
            color="black",
        ),
        yaxis=dict(
            title_font=dict(size=axis_title_font_size),
            color="black",
        ),
    )

    # Pin the legend's bottom-right corner at x=1.1, y=0.25 and apply the
    # shared axis styling in the same layout update.
    fig.update_layout(
        legend=dict(yanchor="bottom", y=0.25, xanchor="right", x=1.1),
        **axis_style,
    )

    return fig


In [None]:
# Render the per-cluster cumulative structure-count plot for the full merged table.
make_image(df)

In [None]:
# Display the merged table for inspection.
df

In [None]:
import plotly.express as px

In [None]:
# Non-null value count of every column, per cluster; `cluster_id` is turned
# back into an ordinary column so it can be used for plotting.
cluster_counts = (
    df.groupby("cluster_id")
    .count()
    .reset_index()
)

In [None]:
# Display the per-cluster counts.
cluster_counts

In [None]:
# ECDF over cluster_id, weighted by the per-cluster `compound_name` count and
# left un-normalized (ecdfnorm=None).
px.ecdf(
    cluster_counts,
    x="cluster_id",
    y="compound_name",
    ecdfnorm=None,
    template="simple_white",
    height=600,
    width=800,
)

In [None]:
# Histogram of the per-cluster `compound_name` counts: x is the number of
# compounds in a cluster, y (log scale) is how many clusters have that size.
fig = px.histogram(
    cluster_counts,
    x="compound_name",
    template="simple_white",
    height=600,
    width=800,
    log_y=True,
    text_auto=True,
)
# Explicit ticks at 1, 2, 4, 6 and 8 times each power of ten from 10**0 to 10**2.
fig.update_yaxes(
    title_text="<b> Number of Clusters </b>",
    tickvals=[(10**big) * small for big in range(0, 3) for small in [1, 2, 4, 6, 8]],
)
fig.update_xaxes(title_text="<b> Number of Compounds in Cluster </b>")
fig.show()

# Make sure the output directory exists first, so that exporting does not fail
# on a fresh checkout where `figures/` has not been created yet.
Path("figures").mkdir(parents=True, exist_ok=True)
fig.write_image("figures/20241121_generic_cluster_bar.png")
fig.write_image("figures/20241121_generic_cluster_bar.svg")

In [None]:
from rdkit.Chem import MolFromSmiles

In [None]:
# SMILES string to inspect, parsed into an RDKit molecule.
# NOTE(review): presumably the scaffold that represents cluster 0 — confirm
# against the clustering output.
cluster0 = "CC(CC1CCCC2CCCCC21)C1CCCC2CCCCC21"
mol = MolFromSmiles(cluster0)

In [None]:
# Display the molecule parsed from `cluster0`.
mol

In [None]:
# Number of distinct values in each column, per cluster.
df.groupby('cluster_id').nunique()

In [None]:
# Keep only the rows that belong to cluster 0.
just_cluster0 = df.loc[df["cluster_id"] == 0]

In [None]:
# Same cumulative plot, restricted to the cluster 0 rows.
make_image(just_cluster0)

In [None]:
# View the cluster 0 rows ordered by date (missing dates sort last by default).
just_cluster0.sort_values('Date')

In [None]:
# Why are there so many NaN values?

In [None]:
# Compound names that survived the merge with the date table.
unique_names = compound_data["compound_name"]

In [None]:
# Cluster 0 rows whose compound_name is absent from `unique_names`.
just_cluster0[~just_cluster0['compound_name'].isin(unique_names)]

In [None]:
# A single compound name used as a test case in the lookups that follow.
example = "MAT-POS-96f51285-5"

In [None]:
# Plain Python list of the names, for a simple membership check.
name_list = unique_names.to_list()

In [None]:
# Is the example compound among the names in `name_list`?
example in name_list

In [None]:
# Reload the compound table straight from the CSV, without any merge applied.
raw_compound_data = pd.read_csv(compound_data_csv)

In [None]:
# Rows of the unmerged compound table for the example compound.
raw_compound_data[raw_compound_data['compound_name'] == example]

In [None]:
# Look up the date record whose Name is "Mpro-P0831".
date_df[date_df.Name == "Mpro-P0831"]

In [None]:
# Rebuild `compound_data` from the CSV, this time deriving `structure_name`
# as "Mpro-" + series + number zero-padded to four digits, and then attach
# dates with an inner join (the pandas default) on that derived name.
# NOTE: this rebinds `compound_data`, replacing the table built earlier in
# the notebook.
compound_data = (
    pd.read_csv(compound_data_csv)
    .assign(
        structure_name=lambda frame: (
            "Mpro-" + frame["series"] + frame["number"].astype(str).str.zfill(4)
        )
    )
    .merge(date_df, left_on="structure_name", right_on="Name")
)

In [None]:
# Rows of `compound_data` whose structure_name is "Mpro-P0831".
compound_data[compound_data['structure_name'] == "Mpro-P0831"]