In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the data

In [None]:
# Read 'tanimoto_combo.csv' (path is relative to the working directory) into `tc`.
tc = pd.read_csv(filepath_or_buffer='tanimoto_combo.csv')

In [None]:
# Read 'mcss_tanimoto.csv' (path is relative to the working directory) into `mcss`.
mcss = pd.read_csv(filepath_or_buffer='mcss_tanimoto.csv')

In [None]:
# Read 'tanimoto_fingerprint_comparison.csv' (path is relative to the working
# directory) into `fp`.
fp = pd.read_csv(filepath_or_buffer='tanimoto_fingerprint_comparison.csv')

In [None]:
# Show the frame read from 'tanimoto_combo.csv' as this cell's output.
tc

In [None]:
# Show the frame read from 'mcss_tanimoto.csv' as this cell's output.
mcss

In [None]:
# Show the frame read from 'tanimoto_fingerprint_comparison.csv' as this cell's output.
fp

In [None]:
import plotly.express as px

# Histogram of the highest "Tanimoto" value found for each "Mol1" in `fp`.
# groupby(...)[["Tanimoto"]].max() already yields a one-column frame, so it is
# handed to plotly directly without a second column selection.
px.histogram(
    data_frame=fp.groupby("Mol1")[["Tanimoto"]].max(),
    x="Tanimoto",
)

## Rename the "Fingerprint" column to "Similarity Metric"

In [None]:
# Relabel the 'Fingerprint' column as 'Similarity Metric'; rename() returns a
# new frame, which is bound back to `fp`.
fp = fp.rename({'Fingerprint': 'Similarity Metric'}, axis="columns")

### Remove the 1024-bit fingerprints and ECFP8

In [None]:
# Keep only rows whose "BitSize" is not "1024 bits" AND whose
# "Similarity Metric" is not "ECFP8" (both exclusions applied in one mask).
fp = fp[
    fp["BitSize"].ne("1024 bits")
    & fp["Similarity Metric"].ne("ECFP8")
]

In [None]:
# Histogram of the highest "Tanimoto" value found for each "Mol1" in the
# current `fp`. The grouped max is already a one-column frame, so no further
# column selection is needed.
px.histogram(
    data_frame=fp.groupby("Mol1")[["Tanimoto"]].max(),
    x="Tanimoto",
)

In [None]:
# Tag every row of `mcss` with a constant "Similarity Metric" label.
# The column name contains a space, hence the dict-unpacking form of assign().
mcss = mcss.assign(**{"Similarity Metric": "MCSS (N Atoms)"})

In [None]:
# Tag every row of `tc` with a constant "Similarity Metric" label.
# The column name contains a space, hence the dict-unpacking form of assign().
tc = tc.assign(**{"Similarity Metric": "0.5*TanimotoCombo (OpenEye)"})

### Divide the TanimotoCombo scores in `tc` by 2

In [None]:
# Replace the "Tanimoto" column of `tc` with half of its current values.
# NOTE: this cell is not idempotent: every execution halves the stored values
# again, so reload `tc` before re-running it.
tc["Tanimoto"] = tc["Tanimoto"].div(2)

In [None]:
# Stack the three frames row-wise into a single table.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# input keeps its own row labels, and labels that occur in more than one input
# end up duplicated in `df`, which makes label-based lookups and index
# alignment ambiguous.
df = pd.concat([tc, mcss, fp], ignore_index=True)

## Remove self-comparisons

In [None]:
# Drop rows in which "Mol1" and "Mol2" hold the same value.
df = df[df["Mol1"].ne(df["Mol2"])]

## Save the data

In [None]:
# Write the combined table to 'all_tanimoto.csv', leaving out the row index.
df.to_csv(path_or_buf='all_tanimoto.csv', index=False)

## Plot the data

In [None]:
import plotly.figure_factory as ff

# One Series per distinct "Similarity Metric" value, in order of first
# appearance: for that metric, the maximum "Tanimoto" of each "Mol1".
hist_data = [
    df.loc[df["Similarity Metric"] == metric].groupby("Mol1")["Tanimoto"].max()
    for metric in df["Similarity Metric"].unique()
]

In [None]:
# Distribution plot with one trace group per metric; histogram bars and rug
# marks are switched off, so only the curves are drawn.
fig = ff.create_distplot(
    hist_data,
    group_labels=df["Similarity Metric"].unique(),
    bin_size=0.1,
    histnorm="probability",
    show_hist=False,
    show_rug=False,
    colors=[
        "#5ba300", "#89ce00", "#0073e6",
        "#5928ed", "#e6308a", "#b51963",
    ],
)
# Pin both axes to [0, 1]. Each update_* call returns the figure, so the
# chained expression is still the cell's displayed value.
fig.update_xaxes(range=[0, 1]).update_yaxes(range=[0, 1])

In [None]:
# Styling: theme, fixed 600x400 canvas, axis titles, and a titled legend
# positioned at (x=0.1, y=0.9).
fig.update_layout(
    template="simple_white",
    width=600,
    height=400,
    xaxis_title="Tanimoto Coefficients",
    yaxis_title="Probability",
    legend={"title": "Similarity Metric", "x": 0.1, "y": 0.9},
)

In [None]:
# Export the current figure twice: once as SVG, once as PNG.
fig.write_image(file="20240208_combined_analysis_max_tanimoto.svg")
fig.write_image(file="20240208_combined_analysis_max_tanimoto.png")

# Use all pairwise values instead of the per-molecule maximum

In [None]:
# One Series per distinct "Similarity Metric" value, in order of first
# appearance: every "Tanimoto" value recorded for that metric (no aggregation).
hist_data = [
    df.loc[df["Similarity Metric"] == metric, "Tanimoto"]
    for metric in df["Similarity Metric"].unique()
]

In [None]:
# Distribution plot with one trace group per metric; histogram bars and rug
# marks are switched off, so only the curves are drawn.
fig = ff.create_distplot(
    hist_data,
    group_labels=df["Similarity Metric"].unique(),
    bin_size=0.1,
    histnorm="probability",
    show_hist=False,
    show_rug=False,
    colors=[
        "#5ba300", "#89ce00", "#0073e6",
        "#5928ed", "#e6308a", "#b51963",
    ],
)

In [None]:
# Pin both axes to [0, 1], then apply theme, canvas size, axis titles and
# legend placement. Every update_* call returns the figure, so the chain's
# final value is still the figure shown as the cell output.
(
    fig.update_xaxes(range=[0, 1])
    .update_yaxes(range=[0, 1])
    .update_layout(
        template="simple_white",
        width=600,
        height=400,
        xaxis_title="Tanimoto Coefficients",
        yaxis_title="Probability",
        legend={"title": "Similarity Metric", "x": 0.5, "y": 0.8},
    )
)

In [None]:
# Export the current figure twice: once as SVG, once as PNG.
fig.write_image(file="20240208_combined_analysis_all_tanimoto.svg")
fig.write_image(file="20240208_combined_analysis_all_tanimoto.png")