# Prepare data for AI training

This is an exploratory analysis of the PharmGKB data.

* Only PK-related data is considered.
* Only significant interactions are considered.
* Only compounds with a valid smiles

In [105]:
import os
import pandas as pd

df = pd.read_csv("../data/pharmgkb_processed/final_tables/pgkb_merged.csv", low_memory=False)

In [106]:
print("Compounds:", len(set(df["cid"])))
print("Genes:    ", len(set(df["gid"])))
print("Variants: ", len(set(df["vid"])))

Compounds: 914
Genes:     1825
Variants:  6957


In [118]:
from rdkit import Chem
from tqdm import tqdm

cid2smi = {}

for r in df[["cid", "smiles"]].values:
    if str(r[1]) != "nan":
        cid2smi[r[0]] = r[1]

cid2key = {}
for k,v in tqdm(cid2smi.items()):
    mol = Chem.MolFromSmiles(v)
    inchi = Chem.rdinchi.MolToInchi(mol)[0]
    inchikey = Chem.rdinchi.InchiToInchiKey(inchi)
    cid2key[k] = inchikey

100%|██████████| 721/721 [00:00<00:00, 3673.77it/s]


In [133]:
hp = pd.read_csv("../data/other/human_proteome_with_genenames.tab", sep="\t")
cols = list(hp.columns)
hp = hp[(hp[cols[0]].notnull()) & (hp[cols[2]].notnull())]
g2p = {}
for r in hp[[cols[0], cols[2]]].values:
    for x in r[1].split(" "):
        g2p[x] = r[0]

gid2key = {}
for r in df[["gid", "gene"]].values:
    if str(r[0]) == "nan" or str(r[1]) == "nan":
        continue
    if r[1] not in g2p:
        continue
    gid2key[r[0]] = g2p[r[1]]

In [136]:
df["phenotype"].value_counts()

phenotype
Metabolism/PK    123239
Toxicity          88571
Efficacy          75688
Dosage            34507
Other             12819
PD                  183
Name: count, dtype: int64

In [137]:
df["significance"].value_counts()

significance
 1.0    127235
-1.0     88453
 0.0     29906
Name: count, dtype: int64

In [138]:
df = df[df["phenotype"].isin(["Metabolism/PK", "Toxicity", "Dosage"])]
df = df[df["significance"] != -1]

In [140]:
df = df[["cid", "gid", "vid"]]
df = df.drop_duplicates()

In [141]:
df.value_counts("cid")

cid
PA450401       351
PA450688       342
PA448497       332
PA451103       319
PA451363       310
              ... 
PA166250364      1
PA166256441      1
PA166258723      1
PA448018         1
PA448360         1
Name: count, Length: 658, dtype: int64

In [151]:
import random
triplets = set()
for r in df[["cid", "gid", "vid"]].values:
    r = tuple(r)
    if r[0] not in cid2key:
        continue
    if r[1] not in gid2key:
        continue
    triplets.update([(cid2key[r[0]], gid2key[r[1]], r[2])])
triplets = list(triplets)
random.shuffle(triplets)

pairs = set()
for r in df[["cid", "gid"]].values:
    r = tuple(r)
    if r[0] not in cid2key:
        continue
    if r[1] not in gid2key:
        continue
    pairs.update([(cid2key[r[0]], gid2key[r[1]])])
pairs = list(pairs)
random.shuffle(pairs)

AttributeError: 'list' object has no attribute 'shuffle'

In [146]:
print("Compound, Gene, Variant Triplets", len(triplets))
print("Compound, Gene Pairs", len(pairs))

Compound, Gene, Variant Triplets 23726
Compound, Gene Pairs 2635


In [149]:
pairs = list(pairs)