In [None]:
import pandas as pd

# Function Definitions

In [None]:
import json
from datetime import datetime
def date_processor(date_string, formats=("%Y-%m-%d %H:%M:%S", "%d/%m/%Y %H:%M")):
    """Parse a date string into a ``datetime``, trying each known format in turn.

    Parameters
    ----------
    date_string : object
        The raw value to parse. Anything that is not a string, and the literal
        placeholder string ``'None'``, is treated as a missing date.
    formats : sequence of str, optional
        ``datetime.strptime`` format strings, tried in the order given. The
        first one that matches wins.

    Returns
    -------
    datetime or None
        The parsed timestamp, or ``None`` for a missing date.

    Raises
    ------
    ValueError
        If ``date_string`` is a real string that matches none of ``formats``.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses, which would otherwise be silently mapped to None.
    if not isinstance(date_string, str) or date_string == 'None':
        return None

    for fmt in formats:
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            # Not this format; fall through to the next candidate.
            continue

    # Name the offending value and every format tried, so a bad record is
    # easy to locate in the input.
    raise ValueError(
        f"date string {date_string!r} does not match any of the expected formats: {list(formats)}"
    )

# Plotting Variables

In [None]:
# Shared sizing constants for the figures in this notebook; `large_font` is
# used below for axis-title fonts and for plot margins.
large_font, small_font = 24, 18

# Execution

In [None]:
# Read the JSON mapping of names to raw date values and parse every date up front.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine unless it is edited.
with open("/Users/alexpayne/Scientific_Projects/asapdiscovery-sars-retrospective/science/20240403_multi_pose_docking_v2/20240430_analyze_cross_docking_results/20240503_inputs_analysis/date_dict.json", "r") as f:
    date_dict = [
        dict(Name=structure, Date=date_processor(raw_date))
        for structure, raw_date in json.load(f).items()
    ]

# One row per JSON entry, with columns "Name" and "Date".
date_df = pd.DataFrame.from_records(date_dict)

In [None]:
# Preview the first rows of the parsed date table.
date_df.head()

In [None]:
# Load the two cluster-label tables; both paths are relative to the
# notebook's working directory.
generic_scaffolds = pd.read_csv("generic_cluster_labels.csv")
scaffolds = pd.read_csv("default_cluster_labels.csv")

In [None]:
# Attach each structure's date to both label tables.
# merge defaults to an inner join, so only rows whose `structure_name` matches
# a `Name` in date_df are kept.
generic_scaffolds = pd.merge(generic_scaffolds, date_df, left_on="structure_name", right_on="Name")
scaffolds = pd.merge(scaffolds, date_df, left_on="structure_name", right_on="Name")

In [None]:
# Number of distinct clusters in the generic-scaffold label table.
generic_scaffolds.Cluster.nunique()

In [None]:
# Number of distinct clusters in the default-scaffold label table.
scaffolds.Cluster.nunique()

In [None]:
# Per-cluster row counts for the generic labels, largest cluster first.
gs_counts = (
    generic_scaffolds.groupby("Cluster")
    .count()
    .sort_values("name", ascending=False)
)
# Re-number the clusters 1..N in descending size order and tag the method.
gs_counts = gs_counts.assign(
    Cluster_ID=list(range(1, len(gs_counts) + 1)),
    Method="Generic",
)

# Same summary for the default labels.
scaffolds_counts = (
    scaffolds.groupby("Cluster")
    .count()
    .assign(Method="Default")
    .sort_values("name", ascending=False)
)
scaffolds_counts = scaffolds_counts.assign(
    Cluster_ID=list(range(1, len(scaffolds_counts) + 1)),
)

In [None]:
# Stack both per-method summaries into one long table; the `Method` column
# tells the rows apart.
cluster_counts = pd.concat([gs_counts, scaffolds_counts])

In [None]:
# Order rows by the size-ranked cluster ID so both methods run from the
# largest cluster to the smallest.
cluster_counts = cluster_counts.sort_values('Cluster_ID')

In [None]:
import plotly.express as px
# Cumulative molecule count as clusters are added in Cluster_ID order, one
# curve per method. ecdfnorm=None keeps the y-axis as raw sums of `name`
# instead of normalised fractions.
fig = px.ecdf(
    cluster_counts,
    x="Cluster_ID",
    y='name',
    color='Method',
    ecdfnorm=None,
    log_x=True,
    template='simple_white',
    width=800,
    height=600,
)
fig.update_xaxes(title_text="<b> Cluster ID (Ordered by Size) </b>")
fig.update_yaxes(title_text="<b> Cumulative Number of Molecules </b>")
# Top and bottom margins are set to `large_font`; left and right are left untouched.
fig.update_layout(margin={"t": large_font, "b": large_font})
fig.show()
fig.write_image("20241106_cluster_size_ecdf.svg")

In [None]:
import plotly.express as px
# Molecules per cluster, with the two methods drawn as grouped bars on a
# log-scaled cluster-ID axis.
fig = px.bar(
    cluster_counts,
    x="Cluster_ID",
    y='name',
    color='Method',
    barmode='group',
    log_x=True,
    template='simple_white',
    width=800,
    height=600,
)
fig.update_xaxes(title_text="<b> Cluster ID (Ordered by Size, Log Scale) </b>")
fig.update_yaxes(title_text="<b> Number of Molecules In the Cluster </b>")
# No gap between bars; only the top margin is overridden (set to `large_font`).
fig.update_layout(bargap=0, margin={"t": large_font})
fig.show()
fig.write_image("20241106_cluster_size_bar.svg")

## Overlay Both Figures

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
# Split the combined counts into one frame per method (alphabetical method order).
# Each frame is sorted by Cluster_ID so the cumulative sum below always runs
# from the largest cluster to the smallest, whatever row order cluster_counts has.
methods = cluster_counts['Method'].unique()
methods.sort()
data_dict = {
    method: cluster_counts[cluster_counts['Method'] == method].sort_values('Cluster_ID')
    for method in methods
}

palette = px.colors.qualitative.Dark2
bar_plots = []
ecdf_plots = []

for i, (method, data) in enumerate(data_dict.items()):
    # The bars and the CDF line of one method share a colour; wrap around if
    # there are more methods than palette entries.
    color = palette[i % len(palette)]

    # Bar trace: number of molecules in each cluster ('name' holds the count).
    bar_fig = go.Bar(
        x=data['Cluster_ID'],
        y=data['name'],
        name=f'{method} - Cluster Count',
        marker=dict(color=color),
        opacity=0.5,
    )
    bar_plots.append(bar_fig)

    # Running total of molecules over the clusters. It must stay the same
    # length as data['Cluster_ID'] so every y value has a matching x value.
    cumsum = data['name'].cumsum()

    # Line trace: cumulative number of molecules.
    ecdf_fig = go.Scatter(
        x=data['Cluster_ID'],
        y=cumsum,
        mode='lines',
        name=f'{method} - CDF',
        line=dict(color=color),
    )
    ecdf_plots.append(ecdf_fig)

# Single panel with two y-axes: bars on the primary axis, CDF lines on the secondary.
fig = make_subplots(
    rows=1, cols=1,
    shared_xaxes=True,
    vertical_spacing=0.1,
    subplot_titles=['Cluster Size Distribution by Bemis-Murcko Scaffold Type'],
    specs=[[{'secondary_y': True}]]
)

for ecdf_fig, bar_fig in zip(ecdf_plots, bar_plots):
    fig.add_trace(ecdf_fig, row=1, col=1, secondary_y=True)
    fig.add_trace(bar_fig, row=1, col=1)

# Update the x-axis. On a log axis the range is given in log10 units. The upper
# bound comes from ALL methods, not just the last one visited by the loop above,
# so no method's clusters are cut off.
max_cluster_id = cluster_counts['Cluster_ID'].max()
fig.update_xaxes(title_text="<b>Cluster ID (Ordered by Size, Log Scale)</b>", row=1, col=1,
                 type='log',
                 range=[0, np.log10(max_cluster_id)],
                 )

# Update the y-axes
fig.update_yaxes(title_text="<b>Number of Molecules in the Cluster</b>", row=1, col=1, secondary_y=False)
fig.update_yaxes(title_text="<b>Cumulative Number of Molecules</b>", row=1, col=1, secondary_y=True)

# Update the layout
fig.update_layout(
    template='simple_white',
    height=600,
    width=800,
    bargap=0,
    margin=dict(t=large_font),  # top margin only
    barmode='overlay',  # draw both methods' bars on top of each other
)

# Show the figure
fig.show()

In [None]:
# Scratch inspection: `data` is the per-method frame left over from the last
# iteration of the loop in the previous cell.
data['Cluster_ID']

In [None]:
import numpy as np
# Scratch computation: natural logarithm of 0.5.
np.log(0.5)

In [None]:
import plotly.graph_objects as go
# make a cdf plot and a bar plot with separate y axes




In [None]:
# Number of distinct clusters in the default-scaffold table, i.e. how many
# colour groups the next figure will have.
scaffolds.Cluster.nunique()

In [None]:
import plotly.express as px
# One cumulative curve per cluster over the `Date` column; ecdfnorm=None keeps
# raw counts on the (log-scaled) y-axis.
fig = px.ecdf(
    scaffolds,
    x='Date',
    color='Cluster',
    ecdfnorm=None,
    log_y=True,
    template='simple_white',
    width=800,
    height=600,
)
# Legend and axis titles.
fig.update_layout(legend_title_text="<b> Bemis-Murcko Cluster </b>")
fig.update_xaxes(title_text="<b> Date of Crystal Structure Collection </b>")
fig.update_yaxes(title_text="<b> Cumulative Number of Structures </b>")

# Both axes get the same styling: large title font and black axis colour.
update_layout_dict = {
    "xaxis": {"title_font": {"size": large_font}, "color": 'black'},
    "yaxis": {"title_font": {"size": large_font}, "color": 'black'},
}

# Position the legend by its bottom-right corner and apply the axis styling.
legend_position = {"yanchor": "bottom", "y": 0.25, "xanchor": "right", "x": 1.1}
fig.update_layout(legend=legend_position, **update_layout_dict)
fig.show()
fig.write_image("20241106_cumulative_cluster_by_date.svg")