In [167]:
import pandas as pd
import os
from tqdm import tqdm
from rdkit import Chem
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*')


# Load the merged PharmGKB table. low_memory=False makes pandas parse the file
# in a single pass instead of in chunks.
df = pd.read_csv(
    "../data/pharmgkb_processed/final_tables/pgkb_merged.csv",
    low_memory=False,
)

# Report how many distinct identifiers each key column holds.
for label, column in (("Compounds:", "cid"), ("Genes:    ", "gid"), ("Variants: ", "vid")):
    print(label, len(set(df[column])))

df.shape

Compounds: 1190
Genes:     1949
Variants:  6986


(345420, 16)

In [168]:
# Tables restricting the analysis to the genes and compounds of interest.
focus_genes = pd.read_csv("../data/of_interest/adme_gene_list.tsv", sep="\t")
focus_compounds = pd.read_csv("../data/of_interest/curated_drugs_for_gradient.tsv", sep="\t")


def _smiles_to_inchikey(smiles):
    """Return the InChIKey for a SMILES string, or None if it cannot be computed.

    None is returned when `smiles` is not a string (e.g. a NaN from an empty
    cell), when RDKit cannot parse it, or when no InChI is produced.
    """
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None rather than
    # raising; passing that None on to MolToInchi would fail with an opaque error.
    if mol is None:
        return None
    inchi = Chem.rdinchi.MolToInchi(mol)[0]
    if not inchi:
        return None
    return Chem.rdinchi.InchiToInchiKey(inchi)


# One InChIKey per row of focus_compounds, aligned with the "SMILES" column;
# rows whose SMILES cannot be converted get None.
inchikeys = [_smiles_to_inchikey(smi) for smi in focus_compounds["SMILES"].tolist()]
focus_compounds["inchikey"] = inchikeys

In [169]:
# Keep only the rows whose gene id appears in the focus gene list.
focus_gene_ids = focus_genes["PharmGKB ID"].tolist()
is_focus_gene = df["gid"].isin(focus_gene_ids)
df = df.loc[is_focus_gene]
df.shape

(303189, 16)

In [170]:
# Map each compound id to a SMILES string, ignoring rows with a missing SMILES.
# When a cid occurs on several rows, the last row wins.
cid2smi = {
    cid: smi
    for cid, smi in df[["cid", "smiles"]].dropna(subset=["smiles"]).values
}

# Map each compound id to the InChIKey derived from its SMILES.
cid2key = {}
failed_cids = []
for cid, smi in tqdm(cid2smi.items()):
    mol = Chem.MolFromSmiles(smi)
    # MolFromSmiles returns None (it does not raise) when the SMILES cannot be
    # parsed. Such compounds are left out of cid2key instead of crashing the
    # whole loop on the next call.
    if mol is None:
        failed_cids.append(cid)
        continue
    inchi = Chem.rdinchi.MolToInchi(mol)[0]
    if not inchi:
        failed_cids.append(cid)
        continue
    cid2key[cid] = Chem.rdinchi.InchiToInchiKey(inchi)

# Surface skipped compounds so the loss is visible rather than silent.
if failed_cids:
    print("Could not derive an InChIKey for %d compound(s): %s" % (len(failed_cids), failed_cids))

100%|██████████| 635/635 [00:00<00:00, 3943.09it/s]


In [171]:
# Read the human proteome table once; previously the same file was parsed twice.
up = pd.read_csv(
    os.path.join(
        "..", "data", "other", "human_proteome_with_genenames.tab"
    ),
    sep="\t",
)

# `hp` (rows where the first and third columns are both present) and `cols` are
# not used below; they are still defined in case other cells reference them.
cols = list(up.columns)
hp = up[(up[cols[0]].notnull()) & (up[cols[2]].notnull())]

# Gene name -> UniProt entry. Every whitespace-separated token found in any of
# the three gene-name columns is mapped to the entry of its row.
# NOTE(review): assignments overwrite each other, so when a name occurs under
# several entries the last row wins (a synonym of a later entry can replace the
# primary name of an earlier one) -- confirm this is acceptable.
g2p = {}
gene_name_columns = [
    "Entry", "Gene names", "Gene names  (primary )", "Gene names  (synonym )"
]
for entry, *name_fields in up[gene_name_columns].values:
    for field in name_fields:
        if pd.isna(field):
            continue
        # split() without an argument drops the empty tokens that split(" ")
        # produced for consecutive spaces, so "" never becomes a key.
        for gene_name in str(field).split():
            g2p[gene_name] = entry

# PharmGKB gene id -> UniProt entry, resolved through the gene name.
# Only distinct (gid, gene) pairs are visited; keep="last" preserves the
# original "last matching row wins" outcome when a gid has several gene names.
gid2key = {}
gid_gene_pairs = df[["gid", "gene"]].dropna().drop_duplicates(keep="last")
for gid, gene in gid_gene_pairs.values:
    if gene in g2p:
        gid2key[gid] = g2p[gene]


In [172]:
# Keep only "Metabolism/PK" annotations. ("Toxicity" and "Dosage" were listed
# here in a commented-out part of the filter and are currently disabled.)
df = df[df["phenotype"].isin(["Metabolism/PK"])]
# Drop rows whose significance is -1 and rows whose evidence is "4".
# NOTE(review): the evidence comparison is against the string "4"; it only
# filters anything if the column holds strings -- confirm the column dtype.
df = df[df["significance"] != -1]
df = df[df["evidence"] != "4"]
print(df.shape)

# Collect the distinct (compound, protein, variant) records for which both the
# compound InChIKey and the protein UniProt entry are known.
triplets = set()
for cid, gid, gene, vid, variant in df[["cid", "gid", "gene", "vid", "variant"]].values:
    ckey = cid2key.get(cid)
    gkey = gid2key.get(gid)
    if gkey is None or ckey is None:
        continue
    triplets.add((ckey, cid, gkey, gene, gid, variant, vid))

# Set iteration order changes between kernel sessions (string hashing is
# randomized), so sort to give `dt` a reproducible row order. Sorting on the
# string form avoids comparison errors if a field mixes strings and NaN.
triplets = sorted(triplets, key=lambda record: tuple(map(str, record)))

dt = pd.DataFrame(triplets, columns=["inchikey", "cid", "uniprot_ac", "gene_name", "gid", "variant", "vid"])

(94948, 16)


In [177]:
# Number of distinct InChIKeys present both in dt and in focus_compounds.
dt_inchikeys = set(dt["inchikey"])
focus_inchikeys = set(focus_compounds["inchikey"])
len(dt_inchikeys & focus_inchikeys)

10

In [178]:
# Display the (n_rows, n_columns) shape of the focus_compounds table.
focus_compounds.shape

(32, 6)

In [149]:
# Print the number of distinct values in each key column of dt.
for label, column in (("Compounds", "inchikey"), ("Proteins", "uniprot_ac"), ("Variants", "vid")):
    print(label, len(set(dt[column])))

Compounds 324
Proteins 88
Variants 1035


In [182]:
# Count how many rows of dt each cid appears in, most frequent first.
dt.value_counts("cid")

cid
PA450401       301
PA451363       294
PA166124478    268
PA166127652    261
PA451581       234
              ... 
PA166131548      1
PA166131561      1
PA166131580      1
PA450644         1
PA10005          1
Length: 328, dtype: int64