In [None]:
import numpy as np
import pandas as pd
from asapdiscovery.docking import analysis as a
from importlib import reload
reload(a)
from asapdiscovery.data.schema.ligand import Ligand
from asapdiscovery.data.schema.complex import Complex
from asapdiscovery.data.operators.selectors.mcs_selector import MCSSelector
from openeye import oegraphsim
from tqdm import tqdm
from pathlib import Path
import json
from itertools import combinations, permutations, product

# Pairwise MCS Tanimoto similarity

## Load the data

In [None]:
# Directory (relative to this notebook) that is searched for "*.json" complex files.
data_path = Path("../../data/20240202_fragalysis_p_series_schema")

In [None]:
# Deserialize every JSON file in `data_path` into a Complex.
# - Path.read_text() closes each file as soon as it is read; the previous bare
#   open(p) left one unclosed handle per file.
# - glob() yields paths in filesystem-dependent order, so the paths are sorted to
#   keep the row/column order of all downstream matrices reproducible between runs.
complexes = [
    Complex.from_dict(json.loads(json_path.read_text()))
    for json_path in sorted(data_path.glob("*.json"))
]

In [None]:
# Keep only the ligand of each complex; list order follows `complexes`.
mols = [c.ligand for c in complexes]

In [None]:
def get_n_to_n_mcs(mols: list[Ligand]):
    """Count maximum-common-substructure atoms for every ordered pair of ligands.

    Each ligand is converted with ``to_oemol()`` and used in turn as the MCS
    search pattern, which is then matched against every ligand (itself included).

    Parameters
    ----------
    mols : list[Ligand]
        Ligands to compare all-against-all.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(mols), len(mols))``. Entry ``[i, j]`` is the
        atom count of the first match reported when the pattern built from
        ligand ``i`` is matched against ligand ``j``, or 0 if there is no match.
    """
    from asapdiscovery.data.backend.openeye import oechem

    # These are the default atom and bond expressions; they are spelled out
    # here only to be explicit about what counts as "the same" atom/bond.
    atom_options = (
        oechem.OEExprOpts_Aromaticity
        | oechem.OEExprOpts_AtomicNumber
        | oechem.OEExprOpts_FormalCharge
    )
    bond_options = oechem.OEExprOpts_Aromaticity | oechem.OEExprOpts_BondOrder

    # Two independent conversions: one set serves as patterns, the other as targets.
    references = [ligand.to_oemol() for ligand in mols]
    queries = [ligand.to_oemol() for ligand in mols]
    n_references = len(references)

    counts = np.zeros((n_references, len(queries)), dtype=int)
    for row, reference in tqdm(enumerate(references), total=n_references):
        # One search object per reference molecule, reused for all queries.
        pattern = oechem.OEQMol(reference)
        pattern.BuildExpressions(atom_options, bond_options)
        search = oechem.OEMCSSearch(pattern)
        search.SetMCSFunc(oechem.OEMCSMaxAtomsCompleteCycles())

        for col, query in enumerate(queries):
            # Only the first reported match is used; None means no match at all.
            first_match = next(iter(search.Match(query, True)), None)
            counts[row, col] = 0 if first_match is None else first_match.NumAtoms()
    return counts

In [None]:
# All-against-all MCS atom counts for the loaded ligands (n_mols x n_mols integer matrix).
mcs_num_atoms = get_n_to_n_mcs(mols)

In [None]:
# Display the raw MCS atom-count matrix as a quick sanity check.
mcs_num_atoms

In [None]:
# Atom count of each ligand, read from the molecule returned by to_oemol().
num_atoms = np.array([mol.to_oemol().NumAtoms() for mol in mols])

In [None]:
# Sum of atom counts for every ordered pair (i, j), flattened in row-major order
# (i varies slowest). Broadcasting replaces the Python-level loop over
# itertools.product and yields the same values in the same order.
total_atoms = (num_atoms[:, None] + num_atoms[None, :]).reshape(-1)

In [None]:
# Fold the flat pairwise sums into an (n_mols, n_mols) matrix so it lines up with mcs_num_atoms.
total_atoms_matrix = total_atoms.reshape((len(mols), len(mols)))

In [None]:
# Display the pairwise atom-count sums.
total_atoms_matrix

In [None]:
# Size of the atom "union" per pair: |A| + |B| - |A ∩ B|, with the MCS atom
# count standing in for the intersection.
union_matrix = total_atoms_matrix - mcs_num_atoms

In [None]:
# Element-wise Tanimoto coefficient: shared (MCS) atoms divided by union atoms.
tc_matrix = mcs_num_atoms / union_matrix

In [None]:
# Display the pairwise Tanimoto matrix.
tc_matrix

In [None]:
import plotly.express as px

In [None]:
# Heatmap of the pairwise Tanimoto matrix (rows/columns follow the order of `mols`).
px.imshow(tc_matrix)

In [None]:
def get_tc_matrix(mols):
    """Compute the pairwise MCS-based Tanimoto matrix for a list of ligands.

    For each ordered pair ``(i, j)`` the coefficient is::

        mcs_atoms[i, j] / (n_atoms[i] + n_atoms[j] - mcs_atoms[i, j])

    where ``mcs_atoms`` comes from ``get_n_to_n_mcs`` and ``n_atoms`` is the atom
    count of each ligand's ``to_oemol()`` molecule.

    Parameters
    ----------
    mols : list
        Ligands exposing ``to_oemol()``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(mols), len(mols))``.
    """
    mcs_num_atoms = get_n_to_n_mcs(mols)
    num_atoms = np.array([mol.to_oemol().NumAtoms() for mol in mols])
    # Broadcasting builds the matrix of pairwise sums directly, replacing the
    # Python-level itertools.product loop and reshape; entry [i, j] is still
    # num_atoms[i] + num_atoms[j].
    total_atoms_matrix = num_atoms[:, None] + num_atoms[None, :]
    # |A ∪ B| = |A| + |B| - |A ∩ B|, with the MCS size as the intersection.
    union_matrix = total_atoms_matrix - mcs_num_atoms
    tc_matrix = mcs_num_atoms / union_matrix
    return tc_matrix

In [None]:
def get_tc_df(mols, tc_matrix):
    """Flatten a pairwise similarity matrix into a long-format DataFrame.

    Parameters
    ----------
    mols : sequence
        Objects exposing ``compound_name``; ``mols[i]`` labels row/column ``i``.
    tc_matrix : numpy.ndarray
        Similarity matrix of shape ``(len(mols), len(mols))``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Mol1``, ``Mol2`` and ``Tanimoto``, one row per ordered pair.
        Rows whose two names are equal (e.g. the diagonal) are dropped; the
        original row index is kept.
    """
    names = [mol.compound_name for mol in mols]
    n_mols = len(names)

    # Row-major layout: Mol1 repeats each name n times, Mol2 cycles through all names.
    first_names = [name for name in names for _ in range(n_mols)]
    second_names = names * n_mols

    pairs = pd.DataFrame(
        {
            "Mol1": first_names,
            "Mol2": second_names,
            "Tanimoto": tc_matrix.reshape(-1),
        }
    )
    has_distinct_names = pairs["Mol1"] != pairs["Mol2"]
    return pairs[has_distinct_names]

In [None]:
# Long-format table of pairwise similarities; get_tc_df drops pairs whose names are equal.
df = get_tc_df(mols, tc_matrix)

## Save the Tanimoto similarity matrix

In [None]:
# Written to the current working directory; the DataFrame index is omitted.
df.to_csv("mcss_tanimoto.csv", index=False)

In [None]:
import plotly.figure_factory as ff

In [None]:
# Pairwise Tanimoto values only (a pandas Series), used for the distribution plot.
hist_data = df["Tanimoto"]

In [None]:
# Also store the values as a NumPy .npy file in the current working directory.
np.save("mcss_tanimoto.npy", hist_data)

In [None]:
# Distribution plot of the pairwise similarities. The histogram and rug are
# switched off, so presumably only the density (KDE) curve is drawn — confirm
# against the rendered figure.
fig = ff.create_distplot([hist_data], 
                         group_labels=["Maximum Common Substructure"], 
                         bin_size=0.1, 
                         histnorm="probability", 
                         show_rug=False,
                         show_hist=False)

In [None]:
# Clamp both axes to [0, 1].
fig.update_xaxes(range=[0, 1])
fig.update_yaxes(range=[0, 1])
# NOTE(review): the title says "Maximum", but hist_data holds every pairwise
# value from get_tc_df, not a per-molecule maximum — confirm the intended wording.
fig.update_layout(template="simple_white", 
                  title="Maximum Tanimoto similarities in this dataset", 
                  xaxis_title="Tanimoto similarity", 
                  yaxis_title="Probability", 
                  height=400, 
                  width=600,
                  )
# Export the figure as both vector (SVG) and raster (PNG) files in the current working directory.
fig.write_image("mcss_tanimoto_kde.svg")
fig.write_image("mcss_tanimoto_kde.png")