# FDA Analysis

This notebook contains the analysis of FDA approved drugs for the KGs.

In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm

from pyenveda.chemicals import get_chemical_by_name

from utils import get_disease_map, KG_DATA_PATH, DATA_DIR, get_chem_map, create_venn_diagram

# Load mapping dicts

In [2]:
disease_dict = get_disease_map()
chemical_dict = get_chem_map()

# Load and normalize FDA file

In [3]:
df = pd.read_csv(
    os.path.join(DATA_DIR, 'gold-standard', 'drug_indication_df.tsv'),
    sep='\t',
    dtype=str
)

df['indication'] = df['indication'].apply(lambda x: f'umls:{x}')

pubchem = []
mondo_id = []
for row in tqdm(df[['drug', 'drug_name', 'indication']].values, desc='Normalizing chemicals'):
    drug_idx, drug_name, mesh_id = row
    pubchem_id = chemical_dict.get(drug_idx)
    
    if not pubchem_id:
        pubchem_id = get_chemical_by_name(drug_name)
        
    pubchem.append(pubchem_id)
    
    mondo_id.append(disease_dict.get(mesh_id))
    
    
df['pubchem_id'] = pubchem
df['mondo_id'] = mondo_id
df

Normalizing chemicals: 100%|██████████| 5926/5926 [01:28<00:00, 66.86it/s] 


Unnamed: 0,drug,drug_name,indication,indication_name,pubchem_id,mondo_id
0,DB00001,Lepirudin,umls:C0040038,Thromboembolism,pubchem.compound:118856773,
1,DB00001,Lepirudin,umls:C0002965,"Angina, Unstable",pubchem.compound:118856773,mondo:0006805
2,DB00001,Lepirudin,umls:C0040053,thrombosis,pubchem.compound:118856773,mondo:0000831
3,DB00002,Cetuximab,umls:C0007102,Malignant Tumor Of Colon,,mondo:0021063
4,DB00002,Cetuximab,umls:C1168401,head and neck squamous cell carcinoma (HNSCC),,mondo:0010150
...,...,...,...,...,...,...
5921,,zotarolimus,umls:C0948480,coronary artery restenosis,pubchem.compound:9876378,mondo:0005355
5922,DB09225,zotepine,umls:C0036341,schizophrenia,5736,mondo:0005090
5923,DB01624,zuclopenthixol,umls:C0036341,schizophrenia,5311507,mondo:0005090
5924,DB01624,zuclopenthixol,umls:C0005586,bipolar disorder,5311507,mondo:0004985


In [4]:
df.dropna(subset=['pubchem_id', 'mondo_id'], how='any', inplace=True)
df

Unnamed: 0,drug,drug_name,indication,indication_name,pubchem_id,mondo_id
1,DB00001,Lepirudin,umls:C0002965,"Angina, Unstable",pubchem.compound:118856773,mondo:0006805
2,DB00001,Lepirudin,umls:C0040053,thrombosis,pubchem.compound:118856773,mondo:0000831
45,DB00010,Sermorelin,umls:C0013338,Pituitary Dwarfism,pubchem.compound:16129620,mondo:0006909
46,DB00010,Sermorelin,umls:C0343755,Hiv Wasting Syndrome,pubchem.compound:16129620,mondo:0005797
47,DB00010,Sermorelin,umls:C0021359,infertility,pubchem.compound:16129620,mondo:0005047
...,...,...,...,...,...,...
5921,,zotarolimus,umls:C0948480,coronary artery restenosis,pubchem.compound:9876378,mondo:0005355
5922,DB09225,zotepine,umls:C0036341,schizophrenia,5736,mondo:0005090
5923,DB01624,zuclopenthixol,umls:C0036341,schizophrenia,5311507,mondo:0005090
5924,DB01624,zuclopenthixol,umls:C0005586,bipolar disorder,5311507,mondo:0004985


# Save pairs which overlap with KG

In [5]:
with open(os.path.join(KG_DATA_PATH, 'normalized', 'data.json')) as f:
    data_dict = json.load(f)

In [6]:
pair_overlaps = set()

for source, target in tqdm(df[['pubchem_id', 'mondo_id']].values):
    if source in data_dict['chemicals'] and target in data_dict['diseases']:
        pair_overlaps.add((source, target))
        
len(pair_overlaps)

100%|██████████| 4237/4237 [00:00<00:00, 13004.79it/s]


23

In [7]:
with open(os.path.join(DATA_DIR, 'gold-standard', 'fda_pairs.json'), 'w') as f:
    json.dump(list(pair_overlaps), f, ensure_ascii=True, indent=2)