# Imports

In [None]:
import numpy as np
import pandas as pd
from asapdiscovery.docking import analysis as a
from importlib import reload
reload(a)
import plotly.figure_factory as ff
from openeye import oegraphsim
from asapdiscovery.data.readers.molfile import MolFileFactory
from pathlib import Path

# Load the data

In [None]:
# Location of the combined 2D SDF file that the ligands are loaded from.
# NOTE(review): absolute path into one user's home directory — edit before running elsewhere.
ogpath = Path("/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/mpro_fragalysis-04-01-24_curated_cache/combined_2d.sdf")

In [None]:
# Load the ligands from the SDF file; later cells take len() of the result,
# iterate over it, and use each entry's to_oemol() and compound_name.
ligs = MolFileFactory(filename=ogpath).load() 

In [None]:
# Sanity check: how many ligands were loaded
len(ligs)

In [None]:
def get_fp(mol, bit_size=2048, radius=2):
    """Build an OpenEye circular fingerprint for a single molecule.

    Parameters
    ----------
    mol
        Molecule passed straight to ``oegraphsim.OEMakeCircularFP`` (callers in
        this notebook pass the result of ``to_oemol()``).
    bit_size : int, optional
        Fingerprint length in bits (default 2048).
    radius : int, optional
        Maximum circular radius (default 2); the minimum radius is fixed at 0.

    Returns
    -------
    oegraphsim.OEFingerPrint
        The fingerprint object populated by ``OEMakeCircularFP``.
    """
    fingerprint = oegraphsim.OEFingerPrint()
    min_radius = 0
    # OEMakeCircularFP fills `fingerprint` in place
    oegraphsim.OEMakeCircularFP(
        fingerprint,
        mol,
        bit_size,
        min_radius,
        radius,
        oegraphsim.OEFPAtomType_DefaultCircularAtom,
        oegraphsim.OEFPBondType_DefaultCircularBond,
    )
    return fingerprint

In [None]:
# One fingerprint per ligand, using get_fp's default bit size and radius
fps = [get_fp(ligand.to_oemol()) for ligand in ligs]

In [None]:
def calculate_tanimoto(fp1, fp2):
    """Return the Tanimoto similarity of two fingerprints.

    Thin wrapper around ``oegraphsim.OETanimoto``; both arguments are passed
    through unchanged and its result is returned as-is.
    """
    similarity = oegraphsim.OETanimoto(fp1, fp2)
    return similarity

In [None]:
# Flattened all-against-all similarities: the first fingerprint varies slowest,
# so entry i * len(fps) + j compares fps[i] with fps[j] (self pairs included)
tc_list = [
    calculate_tanimoto(row_fp, col_fp)
    for row_fp in fps
    for col_fp in fps
]

In [None]:
# Fold the flat similarity list into a square (ligand x ligand) matrix
tc_matrix = np.reshape(np.array(tc_list), (len(fps), len(fps)))

In [None]:
def get_tc_df(mols, bit_size=2048, radius=2, filter_out_self=True):
    """Tabulate all-against-all Tanimoto similarities for a set of molecules.

    Parameters
    ----------
    mols
        Sequence of molecule objects; each must provide ``to_oemol()`` and a
        ``compound_name`` attribute.
    bit_size : int, optional
        Fingerprint length forwarded to ``get_fp`` (default 2048).
    radius : int, optional
        Circular fingerprint radius forwarded to ``get_fp`` (default 2).
    filter_out_self : bool, optional
        When True (default), drop rows whose reference and query names are
        equal. The comparison is made on ``compound_name``, so two different
        molecules that share a name are dropped as well.

    Returns
    -------
    pandas.DataFrame
        One row per ordered (reference, query) pair with the columns
        ``Reference_Ligand``, ``Query_Ligand``, ``Tanimoto``, ``Fingerprint``
        (``"ECFP<2*radius>"``) and ``BitSize`` (``"<bit_size> bits"``).
    """
    fingerprints = [get_fp(mol.to_oemol(), bit_size, radius) for mol in mols]
    names = [mol.compound_name for mol in mols]

    # Walk the full N x N grid row by row: the reference varies slowest,
    # the query fastest.
    reference_names = []
    query_names = []
    similarities = []
    for ref_name, ref_fp in zip(names, fingerprints):
        for query_name, query_fp in zip(names, fingerprints):
            reference_names.append(ref_name)
            query_names.append(query_name)
            similarities.append(calculate_tanimoto(ref_fp, query_fp))

    # The two scalar entries are broadcast to every row by pandas.
    pairwise = pd.DataFrame(
        {
            "Reference_Ligand": reference_names,
            "Query_Ligand": query_names,
            "Tanimoto": similarities,
            "Fingerprint": f"ECFP{2*radius}",
            "BitSize": f"{bit_size} bits",
        }
    )

    if not filter_out_self:
        return pairwise

    is_distinct_pair = pairwise["Reference_Ligand"] != pairwise["Query_Ligand"]
    return pairwise[is_distinct_pair]

In [None]:
# Sweep the fingerprint radius from 2 to 5 (labelled ECFP4 through ECFP10 by
# get_tc_df) at a single bit size, collecting one pairwise table per setting.
dfs = []
for radius in range(2, 6):
    for bit_size in (2048,):
        df = get_tc_df(ligs, bit_size, radius)
        dfs.append(df)

In [None]:
# Stack the per-setting tables into one long table. Each frame keeps its own
# index labels (ignore_index is not set), so labels repeat across settings.
all_df = pd.concat(dfs)

In [None]:
# Save the combined table to the current working directory; index=False leaves
# the DataFrame index out of the CSV.
all_df.to_csv("20240503_all_tc_comparison.csv", index=False)

# Plot

## Max

### For this to make sense, we need to remove the stereoisomeric pairs that have a 2D Tanimoto coefficient (TC) of 1

In [None]:
# Drop pairs whose Tanimoto is exactly 1.0 (the stereoisomeric pairs mentioned
# in the heading above), then keep, for each query ligand and fingerprint
# setting, the single row with the highest remaining similarity.
#
# The row is selected with idxmax so that Reference_Ligand stays paired with
# the Tanimoto value it produced. A plain groupby(...).max() takes the maximum
# of every column independently and would report the alphabetically greatest
# reference name instead of the actual nearest neighbour.
#
# The index is reset first because all_df is a concatenation whose index labels
# repeat; idxmax returns labels, so they must be unique for .loc to pick one row.
all_top_df = all_df[all_df.Tanimoto != 1.0].reset_index(drop=True)
all_top_df = all_top_df.loc[
    all_top_df.groupby(["Query_Ligand", "Fingerprint", "BitSize"])["Tanimoto"].idxmax(),
    # Same column order the previous groupby(...).max().reset_index() produced
    ["Query_Ligand", "Fingerprint", "BitSize", "Reference_Ligand", "Tanimoto"],
].reset_index(drop=True)

In [None]:
# One Series of per-query maximum similarities for each fingerprint type,
# in the same order as the plot's group labels
hist_data = [
    all_top_df.loc[all_top_df["Fingerprint"] == label, "Tanimoto"]
    for label in ("ECFP4", "ECFP6", "ECFP8", "ECFP10")
]

In [None]:
# Distribution plot with the histogram bars and rug both switched off,
# leaving one curve per fingerprint type
fig = ff.create_distplot(
    hist_data,
    group_labels=["ECFP4", "ECFP6", "ECFP8", "ECFP10"],
    show_hist=False,
    show_rug=False,
    histnorm="probability",
    bin_size=0.1,
)

In [None]:
# Pin both axes to [0, 1]
fig.update_xaxes(range=[0, 1]).update_yaxes(range=[0, 1])

# Titles, size, theme and legend placement
fig.update_layout(
    title="Maximum Tanimoto similarities in this dataset",
    xaxis_title="Tanimoto similarity",
    yaxis_title="Probability",
    template="simple_white",
    width=600,
    height=400,
    legend={"title": "Fingerprint Type", "y": 0.5, "x": 0.2},
)

# Export a static image to the current working directory
fig.write_image("tanimoto_max_kde.png")