# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import yaml
from pathlib import Path
from rdkit import Chem
import plotly.express as px

## paths

In [None]:
# Root directory holding the input data files, relative to the notebook's
# working directory (two levels up, then "data").
data_path = Path("..") / ".." / "data"

In [None]:
# Load the RMSD table, using the first CSV column as the row index.
# NOTE(review): this file is read from the current working directory, not from
# `data_path` like the other inputs in this notebook - confirm that is intended.
df = pd.read_csv('rmsd_data_with_all_info.csv', index_col=0)

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# For every (Dataset, Compound_ID) pair, count the distinct values found in
# each of the remaining columns.
df.groupby(['Dataset', 'Compound_ID']).nunique()

In [None]:
# `dataset_to_cmpd` is only built in a later cell of this notebook, so on a
# fresh top-to-bottom run this cell would raise NameError. Build the lookup
# here from the same YAML file so the cell is self-contained.
with open(data_path / '202240129_p_only_cmpd_to_frag.yaml', 'r') as mapping_file:
    cmpd_to_dataset = yaml.safe_load(mapping_file)
dataset_to_cmpd = {v: k for k, v in cmpd_to_dataset.items()}

# Strip the last "_"-separated token from each Dataset name and look up the
# compound registered for the remaining prefix (KeyError if it is unknown).
df['Reference_Ligand'] = df.Dataset.apply(lambda x: dataset_to_cmpd["_".join(x.split('_')[:-1])])

In [None]:
# Number of rows in which the docked compound is the same as the reference
# ligand of its dataset.
(df['Compound_ID'] == df['Reference_Ligand']).sum()

# Data Preprocessing

## Remove pose duplicates

In [None]:
# Every column except Pose_ID; used as the key when looking for duplicate rows.
cols = [name for name in df.columns if name != "Pose_ID"]

In [None]:
# Keep only the first of any rows that agree on every column listed in `cols`.
# The explicit .copy() makes df1 an independent frame, so the column
# assignments performed on it later do not raise pandas'
# SettingWithCopyWarning.
df1 = df.drop_duplicates(subset=cols).copy()

In [None]:
# How many rows were removed by the de-duplication step.
df.shape[0] - df1.shape[0]

In [None]:
# Number of rows left after de-duplication.
len(df1)

## 297286 duplicate poses have been removed, leaving 544518

In [None]:
# Load the mapping stored in the YAML file (used below as a dict). The context
# manager guarantees the file handle is closed once parsing is done, instead of
# leaving it open until garbage collection.
with open(data_path / '202240129_p_only_cmpd_to_frag.yaml', 'r') as mapping_file:
    cmpd_to_dataset = yaml.safe_load(mapping_file)

In [None]:
# Invert the mapping so it can be queried by value. If several keys share the
# same value, the last one encountered wins.
dataset_to_cmpd = dict(zip(cmpd_to_dataset.values(), cmpd_to_dataset.keys()))

## load similarity metrics

In [None]:
# Load the pairwise similarity table; later cells read its Mol1, Mol2,
# 'Similarity Metric' and 'Tanimoto' columns.
similarity_metrics = pd.read_csv(data_path / 'all_tanimoto.csv')

In [None]:
# Sanity check: the same molecules should appear in both the Mol1 and Mol2
# columns of the similarity table.
set(similarity_metrics['Mol1'].unique()) == set(similarity_metrics['Mol2'].unique())

In [None]:
# Sanity check: the docking table and the similarity table should cover exactly
# the same compounds.
set(df1['Compound_ID'].unique()) == set(similarity_metrics['Mol1'].unique())

In [None]:
# Molecules present in the similarity table but absent from the docking table.
set(similarity_metrics['Mol1'].unique()).difference(df1['Compound_ID'].unique())

In [None]:
# Compounds present in the docking table but absent from the similarity table.
set(df1['Compound_ID'].unique()).difference(similarity_metrics['Mol1'].unique())

In [None]:
# Check whether compound MAT-POS-7174c657-5 occurs in the docking table.
'MAT-POS-7174c657-5' in df1.Compound_ID.unique()

## use BEN-DND-4f474d93-1 for ALP-POS-ce760d3f-2

In [None]:
# Relabel ALP-POS-ce760d3f-2 as BEN-DND-4f474d93-1; every other ID is left
# untouched.
df1['Compound_ID'] = df1['Compound_ID'].replace({'ALP-POS-ce760d3f-2': 'BEN-DND-4f474d93-1'})

In [None]:
# Re-check which docking-table compounds still have no entry in the similarity
# table.
set(df1['Compound_ID'].unique()).difference(similarity_metrics['Mol1'].unique())

## drop MAT-POS-7174c657-6

In [None]:
# Remove every row belonging to compound MAT-POS-7174c657-6. Boolean filtering
# hands back a slice of the previous frame, so take an explicit copy before new
# columns are assigned to df1 further down; otherwise pandas emits
# SettingWithCopyWarning on those assignments.
df1 = df1[df1.Compound_ID != 'MAT-POS-7174c657-6'].copy()

## reset Complex_ID

In [None]:
# Dataset name with its last "_"-separated token removed.
dataset_prefixes = ["_".join(name.split("_")[:-1]) for name in df1.Dataset]
# Complex_ID is "<Compound_ID>_<dataset prefix>".
df1['Complex_ID'] = df1['Compound_ID'] + '_' + dataset_prefixes

# Add Reference Compound

In [None]:
# For each row, strip the last "_"-separated token from the Dataset name and
# look up the compound registered for the remaining prefix.
df1['Reference_Compound'] = [
    dataset_to_cmpd["_".join(name.split('_')[:-1])] for name in df1.Dataset
]

# Add chemical similarity metrics

In [None]:
# Translate every Mol2 entry through cmpd_to_dataset; molecules without an
# entry get the placeholder 'FAIL'.
mol2_datasets = [cmpd_to_dataset.get(mol, 'FAIL') for mol in similarity_metrics.Mol2]
# Index is "<Mol1>_<mapped Mol2>".
similarity_metrics['Index'] = similarity_metrics['Mol1'] + '_' + mol2_datasets

In [None]:
# Keep only the rows whose Index does not contain the literal text "FAIL".
has_fail_marker = similarity_metrics['Index'].str.contains("FAIL", regex=False)
sim2 = similarity_metrics[~has_fail_marker]

In [None]:
# Reshape from long to wide: one row per Index, one column per similarity
# metric, filled with the Tanimoto values.
wide = sim2.pivot(index=['Index'], columns=['Similarity Metric'], values=['Tanimoto'])
# Passing `values` as a list yields two-level column labels; drop the outer
# level so that only the metric names remain.
wide.columns = wide.columns.droplevel(0)
# Expose the row index as an ordinary Complex_ID column.
wide = wide.assign(Complex_ID=wide.index)

In [None]:
# Attach the similarity columns to each docking row by Complex_ID; the left
# join keeps every row of df1 even when no similarity entry matches.
df2 = df1.merge(wide, how='left', on='Complex_ID')

In [None]:
# Count the distinct values in each column of the merged table.
df2.nunique()

## make Pose_ID an int

In [None]:
# Convert Pose_ID to an integer dtype.
# NOTE(review): astype(int) fails if the column holds missing values -
# presumably Pose_ID is always populated; confirm.
df2['Pose_ID'] = df2.Pose_ID.astype(int)

In [None]:
# List the columns of the merged table.
df2.columns

In [None]:
# Display the Pose_ID column to confirm the dtype conversion.
df2['Pose_ID']

# Goal: plot the number of poses (n) on the x-axis against the fraction of compounds with RMSD < 2 on the y-axis

In [None]:
# List the available columns before building the plot.
df2.columns

In [None]:
# Number of rows whose ECFP10 similarity value is exactly 1.
(df2['ECFP10'] == 1).sum()

In [None]:
# Rows in which the compound was docked against its own reference compound.
is_self_docked = df2['Compound_ID'].eq(df2['Reference_Compound'])
self_docked = df2[is_self_docked]

In [None]:
# Display the self-docked subset.
self_docked