In [1]:
# IMPORTS
# standard library
import ast
import json
import warnings

# base
import numpy as np
import pandas as pd

# plots
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from matplotlib import pyplot as plt

# importing from scripts
from src.scripts import targets

warnings.filterwarnings("ignore")

In [2]:
# Relative path of the BindingDB tab-separated export read by this notebook.
BINDING_DATASET = "../data/BindingDB_All.tsv"

In [3]:
# Columns to read from the BindingDB export; every other column of the file
# (ligand identifiers, affinity measurements, publication metadata, ...) is
# left out at load time.
usecols = [
    # record identifier and provenance
    "BindingDB Reactant_set_id",
    "Target Source Organism According to Curator or DataSource",
    "Patent Number",
    # UniProt annotation of the target chain
    "UniProt (SwissProt) Recommended Name of Target Chain",
    "UniProt (SwissProt) Entry Name of Target Chain",
    "UniProt (SwissProt) Primary ID of Target Chain",
    "UniProt (TrEMBL) Primary ID of Target Chain",
    "UniProt (TrEMBL) Submitted Name of Target Chain",
]

In [4]:
# Read only the columns named in `usecols` from the tab-separated export.
# Lines that cannot be parsed are skipped instead of raising
# (on_bad_lines="skip").
df = pd.read_csv(
    BINDING_DATASET,
    usecols=usecols,
    sep="\t",
    on_bad_lines="skip",
)

### Sankey diagram linking target classes to disease classes

In [5]:
from src.scripts.disease_plotting import (
    add_keywords_when_comments_missing,
    load_uniprotid_diseases,
    quantify_missing_diseases,
    sort_diseases,
)

# Disease annotations per UniProt ID, as returned by the project helper.
diseases = load_uniprotid_diseases()

# Kept for inspection; neither value is used further down in this section.
ids_missing_diseases, percentage_missing = quantify_missing_diseases(diseases)

# Use the back-filled comment column as the disease annotation, discard rows
# that still have none, and drop the two source columns it was built from.
diseases_df = (
    add_keywords_when_comments_missing(diseases)
    .rename(columns={"comments_bfill": "diseases"})
    .dropna(subset="diseases")
    .drop(columns=["comments", "keywords"])
)

# Map every entry of each row's disease list through `sort_diseases`.
diseases_df["Disease Classes"] = diseases_df["diseases"].apply(
    lambda entries: [sort_diseases(entry) for entry in entries]
)

# Only the join key and the derived classes are needed from here on.
diseases_df = diseases_df[
    ["UniProt (SwissProt) Primary ID of Target Chain", "Disease Classes"]
]

In [6]:
def _clean_disease_classes(value):
    """Normalise one "Disease Classes" cell.

    A string is parsed as a Python literal first (so a stringified list
    becomes a real list). If the result is a list, every "Disease variant"
    entry is removed from it. Any other value is returned unchanged.
    """
    if isinstance(value, str):
        value = ast.literal_eval(value)
    if isinstance(value, list):
        return [disease for disease in value if disease != "Disease variant"]
    return value


# TODO: find out where the "Disease variant" entries come from; until then
# they are filtered out here.
# `.assign` builds a new frame instead of writing a column into the result of
# the earlier column selection, which avoids the chained-assignment pattern.
diseases_df = diseases_df.assign(
    **{"Disease Classes": diseases_df["Disease Classes"].apply(_clean_disease_classes)}
)

In [7]:
# Re-import so this cell still works if the name `targets` has been rebound
# elsewhere in the session.
from src.scripts import targets

# Target-class mapping produced by the project helper for the rows of `df`.
mapped_names = targets.get_target_class(names_df=df)

# Index-aligned join. The "_y" suffix marks the recommended-name column that
# comes from `mapped_names` (pandas' default suffix for the right-hand frame).
columns_to_keep = [
    "UniProt (SwissProt) Recommended Name of Target Chain_y",
    "UniProt (SwissProt) Primary ID of Target Chain",
]
merged = df.merge(mapped_names, left_index=True, right_index=True)[columns_to_keep]

In [8]:
# Attach the disease classes to the mapped target names via the SwissProt
# primary ID, then give the mapped-name column its display label.
# NOTE(review): `merged` appears to carry one row per row of `df`, so each
# protein is probably repeated many times here — confirm whether
# de-duplicating `merged` first would be acceptable.
diseases_target_df = (
    diseases_df
    .merge(merged, on="UniProt (SwissProt) Primary ID of Target Chain")
    .rename(
        columns={
            "UniProt (SwissProt) Recommended Name of Target Chain_y": "Target Classes"
        }
    )
)

In [9]:
def _unique_diseases(disease_lists):
    """Flatten the disease lists of one target class into a duplicate-free list."""
    return list({disease for sublist in disease_lists for disease in sublist})


# One row per target class, holding every distinct disease seen for it.
diseases_target_df = (
    diseases_target_df
    .groupby("Target Classes")["Disease Classes"]
    .apply(_unique_diseases)
    .reset_index()
)

In [10]:
# Display the per-target-class disease lists.
# NOTE(review): the saved output below lists individual protein names under
# "Target Classes", while later outputs show class names such as
# "Neurotransmitter receptor" — the saved output may be stale; re-run to confirm.
diseases_target_df

Unnamed: 0,Target Classes,Disease Classes
0,"1,25-dihydroxyvitamin D(3) 24-hydroxylase, mit...","[Hypercalcemia, infantile, 1]"
1,1-acyl-sn-glycerol-3-phosphate acyltransferase...,[Lipodystrophy]
2,11-beta-hydroxysteroid dehydrogenase 1,[Cortisone reductase deficiency 2]
3,11-beta-hydroxysteroid dehydrogenase type 2,[Apparent mineralocorticoid excess]
4,14-3-3 protein gamma,[Epilepsy]
...,...,...
1139,cGMP-dependent protein kinase 1,[Aortic aneurysm]
1140,cGMP-dependent protein kinase 2,[Dysplasia]
1141,eIF-2-alpha kinase GCN2,[Pulmonary venoocclusive disease]
1142,m7GpppX diphosphatase,[Al-Raqad syndrome]


In [11]:
# Number of target classes kept for the diagram.
TOP_N_TARGET_CLASSES = 10
TARGET_NAME_COL = "UniProt (SwissProt) Recommended Name of Target Chain"

# Occurrences of every mapped target class; `value_counts` returns them most
# frequent first, so the first TOP_N_TARGET_CLASSES rows are the largest ones.
class_counts = mapped_names.value_counts().reset_index()
top_classes = class_counts.iloc[:TOP_N_TARGET_CLASSES].sort_values(
    "count", ascending=False
)

# Keep only the disease rows of those classes, ordered by class frequency.
# `diseases_target_df` is deliberately NOT overwritten: re-assigning it to the
# merged frame made this cell fail on a second run (the repeated merge yields
# "count_x"/"count_y" instead of "count") and silently changed a frame that an
# earlier cell displayed.
diseases_target_top10 = (
    diseases_target_df
    .merge(top_classes, left_on="Target Classes", right_on=TARGET_NAME_COL)
    .sort_values(by="count", ascending=False)
    .head(TOP_N_TARGET_CLASSES)
    .drop(columns=[TARGET_NAME_COL, "count"])
)

In [12]:
# Display the selected target classes with their disease lists, ordered by
# how often each class occurs in `mapped_names`.
diseases_target_top10

Unnamed: 0,Target Classes,Disease Classes
4,Neurotransmitter receptor,"[Obesity, Epilepsy, Neurodevelopmental disorde..."
6,Non Receptor Tyr Kinase,"[Polycythemia vera, Myelofibrosis, Autoinflamm..."
7,Other Protein Kinase,"[Epilepsy, Cardiomyopathy, Melorheostosis, iso..."
0,Growth Factor Receptor,"[Hypochondroplasia, Lymphatic malformation 1, ..."
2,Hormone Receptor,"[Diabetes, Glucocorticoid resistance, generali..."
3,Ion Channel,"[Pulmonary hypertension, primary, 4, Dehydrate..."
1,Histone Modifier,"[Neurodevelopmental disorder, Dysplasia, Cance..."
9,Purine Receptor,"[Deafness, Bleeding disorder]"
8,Phosphatidylinositol Kinase,[Megalencephaly-capillary malformation-polymic...
5,Neurotransmitter transporter,"[Neurodegeneration, Butyrylcholinesterase defi..."


In [13]:
# Unpack the disease lists: one row per (target class, disease) pair, with the
# original row label repeated for every disease of that class.
exploded = diseases_target_top10.explode(column="Disease Classes")
exploded

Unnamed: 0,Target Classes,Disease Classes
4,Neurotransmitter receptor,Obesity
4,Neurotransmitter receptor,Epilepsy
4,Neurotransmitter receptor,Neurodevelopmental disorder
4,Neurotransmitter receptor,Schizophrenia
4,Neurotransmitter receptor,Blindness
...,...,...
8,Phosphatidylinositol Kinase,Cancer
8,Phosphatidylinositol Kinase,Immunodeficiency
8,Phosphatidylinositol Kinase,SHORT syndrome
5,Neurotransmitter transporter,Neurodegeneration


In [14]:
# Count how many rows exist for each (target class, disease) combination and
# turn the result back into a flat frame with a "Count" column.
pair_sizes = exploded.groupby(["Target Classes", "Disease Classes"]).size()
to_plot = pair_sizes.reset_index(name="Count")
to_plot

Unnamed: 0,Target Classes,Disease Classes,Count
0,Growth Factor Receptor,Achondroplasia,1
1,Growth Factor Receptor,Antley-Bixler syndrome,1
2,Growth Factor Receptor,Apert syndrome,1
3,Growth Factor Receptor,Arthrogryposis,1
4,Growth Factor Receptor,Beare-Stevenson cutis gyrata syndrome,1
...,...,...,...
216,Phosphatidylinositol Kinase,SHORT syndrome,1
217,Phosphatidylinositol Kinase,"Spastic paraplegia 84, autosomal recessive",1
218,Phosphatidylinositol Kinase,Squamous cell carcinoma of the head and neck,1
219,Purine Receptor,Bleeding disorder,1


In [17]:
# Sankey diagram: target classes on the left, disease classes on the right.

# Maximum number of disease links drawn per target class.
MAX_DISEASES_PER_CLASS = 10

# For every target class keep the MAX_DISEASES_PER_CLASS pairs with the
# highest Count (ties keep their original order), with classes in sorted
# order. Stable sorts plus `groupby().head()` replace `groupby().apply()`,
# which relied on the grouping column being handed to the applied function —
# behaviour pandas has deprecated. The result gets its own name so `to_plot`,
# displayed by an earlier cell, is left untouched.
# NOTE(review): the disease lists are de-duplicated per target class upstream
# and the saved `to_plot` output shows Count == 1 throughout, so the link
# widths may carry no information — confirm whether that is intended.
sankey_links = (
    to_plot
    .sort_values("Count", ascending=False, kind="stable")
    .groupby("Target Classes", sort=False)
    .head(MAX_DISEASES_PER_CLASS)
    .sort_values("Target Classes", kind="stable")
    .reset_index(drop=True)
)

# Node indices: target classes first, disease classes offset after them.
target_classes = sankey_links["Target Classes"].unique()
disease_classes = sankey_links["Disease Classes"].unique()
node_labels = list(target_classes) + list(disease_classes)

target_mapping = {name: idx for idx, name in enumerate(target_classes)}
disease_mapping = {
    name: idx + len(target_classes) for idx, name in enumerate(disease_classes)
}

# Link endpoints and weights. The names carry a `link_` prefix so the imported
# `targets` module is not rebound to a Series (the original `targets = ...`
# assignment shadowed it).
link_sources = sankey_links["Target Classes"].map(target_mapping)
link_targets = sankey_links["Disease Classes"].map(disease_mapping)
link_values = sankey_links["Count"]

fig = go.Figure(
    go.Sankey(
        node=dict(
            pad=30,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=node_labels,
        ),
        link=dict(
            source=link_sources,
            target=link_targets,
            value=link_values,
        ),
    )
)
fig.update_layout(
    title_text="Target to Disease Classes Diagram",
    font_size=10,
    width=1000,
    height=800,
    title_x=0.5,
    plot_bgcolor="rgb(34, 37, 41)",
    paper_bgcolor="rgb(34, 37, 41)",
    template="plotly_dark",
)
fig.show()

In [None]:
# custom_style = """
# <style>
#   body, html {
#     background-color: rgb(34, 37, 41);
#   }
# </style>
# """

# # Save the HTML file and inject the custom style
# html_content = fig.to_html(full_html=True, include_plotlyjs="cdn")
# # Insert the style in the <head>
# html_content = html_content.replace("</head>", f"{custom_style}</head>")

# # Write to a file
# with open("Sankey_Targets_Diseases.html", "w") as f:
#     f.write(html_content)

# fig.write_image("Sankey_Targets_Diseases.svg")