# Prepare data for AI training

This is an exploratory analysis of the PharmGKB data.

* Only PK-related data is considered.
* Only significant interactions are considered.
* Only compounds with a valid smiles

In [75]:
import pandas as pd
import numpy as np
import random
from rdkit import Chem
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import sys
from lol import LOL

sys.path.append("../src")
from compound_structures import CompoundStructureEmbedding
from protein_sequences import ProteinSequenceEmbedding
from bioteque import BiotequeGeneEmbedding

In [76]:
df = pd.read_csv("../data/pharmgkb_processed/final_tables/pgkb_merged.csv", low_memory=False)
print("Compounds:", len(set(df["cid"])))
print("Genes:    ", len(set(df["gid"])))
print("Variants: ", len(set(df["vid"])))

Compounds: 914
Genes:     1825
Variants:  6957


In [29]:
cid2smi = {}

for r in df[["cid", "smiles"]].values:
    if str(r[1]) != "nan":
        cid2smi[r[0]] = r[1]

cid2key = {}
for k,v in tqdm(cid2smi.items()):
    mol = Chem.MolFromSmiles(v)
    inchi = Chem.rdinchi.MolToInchi(mol)[0]
    inchikey = Chem.rdinchi.InchiToInchiKey(inchi)
    cid2key[k] = inchikey

100%|██████████| 721/721 [00:00<00:00, 3603.67it/s]


In [30]:
hp = pd.read_csv("../data/other/human_proteome_with_genenames.tab", sep="\t")
cols = list(hp.columns)
hp = hp[(hp[cols[0]].notnull()) & (hp[cols[2]].notnull())]
g2p = {}
for r in hp[[cols[0], cols[2]]].values:
    for x in r[1].split(" "):
        g2p[x] = r[0]

gid2key = {}
for r in df[["gid", "gene"]].values:
    if str(r[0]) == "nan" or str(r[1]) == "nan":
        continue
    if r[1] not in g2p:
        continue
    gid2key[r[0]] = g2p[r[1]]

In [80]:
df["phenotype"].value_counts()

phenotype
Metabolism/PK    123239
Toxicity          88571
Efficacy          75688
Dosage            34507
Other             12819
PD                  183
Name: count, dtype: int64

In [79]:
df["significance"].value_counts()

significance
 1.0    127235
-1.0     88453
 0.0     29906
Name: count, dtype: int64

In [81]:
df = df[df["phenotype"].isin(["Metabolism/PK", "Toxicity", "Dosage"])]
df = df[df["significance"] != -1]
df = df[df["evidence"] != 4]

In [85]:
df[df["significance"].isnull()].value_counts("evidence")

evidence
3     38874
1A    22841
4      4378
2A     1728
1B     1431
2B       60
Name: count, dtype: int64

In [34]:
df = df[["cid", "gid", "vid"]]
df = df.drop_duplicates()

In [35]:
df.value_counts("cid")

cid
PA450401       351
PA450688       342
PA448497       332
PA451103       319
PA451363       310
              ... 
PA166250364      1
PA166256441      1
PA166258723      1
PA448018         1
PA448360         1
Name: count, Length: 658, dtype: int64

In [36]:
triplets = set()
for r in df[["cid", "gid", "vid"]].values:
    r = tuple(r)
    if r[0] not in cid2key:
        continue
    if r[1] not in gid2key:
        continue
    triplets.update([(cid2key[r[0]], gid2key[r[1]], r[2])])
triplets = list(triplets)
random.shuffle(triplets)

pairs = set()
for r in df[["cid", "gid"]].values:
    r = tuple(r)
    if r[0] not in cid2key:
        continue
    if r[1] not in gid2key:
        continue
    pairs.update([(cid2key[r[0]], gid2key[r[1]])])
pairs = list(pairs)

In [37]:
print("Compound, Gene, Variant Triplets", len(triplets))
print("Compound, Gene Pairs", len(pairs))

Compound, Gene, Variant Triplets 23726
Compound, Gene Pairs 2635


In [38]:
cemb = CompoundStructureEmbedding().get()
pemb = ProteinSequenceEmbedding().get()

In [39]:
class PairBalancer(object):

    def __init__(self, pairs):
        self.pairs = pairs

    def balanced_positives_negatives(self):
        x = list([x[0] for x in self.pairs])
        y = list([y[1] for y in self.pairs])
        random.shuffle(x)
        rand_pairs = [(x_, y_) for x_, y_ in zip(x, y)]
        all_pairs = self.pairs + rand_pairs
        y = [1]*len(self.pairs) + [0]*(len(all_pairs)-len(self.pairs))
        idxs = [i for i in range(len(y))]
        random.shuffle(idxs)
        all_pairs = [all_pairs[i] for i in idxs]
        y = [y[i] for i in idxs]
        return all_pairs, y

all_pairs, y = PairBalancer(pairs).balanced_positives_negatives()

In [44]:
class PairSplitter(object):

    def __init__(self, pairs, y=None):
        self.pairs = pairs
        self.y = y

    def random(self, test_size=0.2):
        idxs = [i for i in range(len(self.pairs))]
        random.shuffle(idxs)
        all_pairs = [self.pairs[i] for i in idxs]
        n = int(len(all_pairs)*(1-test_size))
        m = len(all_pairs) - n
        x_tr = all_pairs[:n]
        x_te = all_pairs[-m:]
        if self.y is not None:
            y = [self.y[i] for i in idxs]
            y_tr = y[:n]
            y_te = y[-m:]
            return x_tr, x_te, y_tr, y_te
        else:
            return x_tr, x_te

    def leave_drugs_out(self, test_size=0.2):
        uniq = list(set([x[0] for x in pairs]))
        random.shuffle(uniq)
        n = int(len(uniq)*(1-test_size))
        u_tr = set(uniq[:n])
        x_tr = []
        x_te = []
        y_tr = []
        y_te = []
        idxs = [i for i in range(len(self.pairs))]
        random.shuffle(idxs)
        all_pairs = [self.pairs[i] for i in idxs]
        if self.y is not None:
            y = [self.y[i] for i in idxs]
        else:
            y = None
        for i, p in enumerate(all_pairs):
            if p[0] in u_tr:
                x_tr += [p]
                if y is not None:
                    y_tr += [y[i]]
            else:
                x_te += [p]
                if y is not None:
                    y_te += [y[i]]
        if self.y is not None:
            return x_tr, x_te, y_tr, y_te
        else:
            return x_tr, x_te
        
    def leave_genes_out(self, test_size=0.2):
        uniq = list(set([x[1] for x in pairs]))
        random.shuffle(uniq)
        n = int(len(uniq)*(1-test_size))
        u_tr = set(uniq[:n])
        x_tr = []
        x_te = []
        y_tr = []
        y_te = []
        idxs = [i for i in range(len(self.pairs))]
        random.shuffle(idxs)
        all_pairs = [self.pairs[i] for i in idxs]
        if self.y is not None:
            y = [self.y[i] for i in idxs]
        else:
            y = None
        for i, p in enumerate(all_pairs):
            if p[1] in u_tr:
                x_tr += [p]
                if y is not None:
                    y_tr += [y[i]]
            else:
                x_te += [p]
                if y is not None:
                    y_te += [y[i]]
        if self.y is not None:
            return x_tr, x_te, y_tr, y_te
        else:
            return x_tr, x_te

    def leave_my_drugs_out(self, interest_list):
        pass

    def leave_my_targets_out(self, interest_list):
        pass


pairs_tr, pairs_te, y_tr, y_te = PairSplitter(all_pairs, y).leave_drugs_out(0.2)


In [45]:

from sklearn.ensemble import RandomForestClassifier


class BimodalStackedModel(object):

    def __init__(self, df_A, df_B):
        self.dict_A = dict((r[0], r[1:]) for r in df_A.values)
        self.dict_B = dict((r[0], r[1:]) for r in df_B.values)
        self.reducer = LOL(n_components=100)
        self.model = RandomForestClassifier()

    def fit(self, pairs, y):
        y = np.array(y)
        A = []
        B = []
        y_ = []
        for i, t in enumerate(pairs):
            if t[0] not in self.dict_A:
                continue
            if t[1] not in self.dict_B:
                continue
            A += [self.dict_A[t[0]]]
            B += [self.dict_B[t[1]]]
            y_ += [y[i]]
        A = np.array(A)
        B = np.array(B)
        X = np.hstack([A, B])
        y = np.array(y_).astype(int)
        self.reducer.fit(X, y)
        X = self.reducer.transform(X)
        self.model.fit(X, y)

    def predict(self, pairs):
        A = []
        B = []
        idxs = []
        for i, t in enumerate(pairs):
            if t[0] not in self.dict_A:
                continue
            if t[1] not in self.dict_B:
                continue
            A += [self.dict_A[t[0]]]
            B += [self.dict_B[t[1]]]
            idxs += [i]
        A = np.array(A)
        B = np.array(B)
        X = np.hstack([A, B])
        X = self.reducer.transform(X)
        y_ = self.model.predict_proba(X)[:,1]
        y_hat = np.full((len(pairs),), np.nan)
        y_hat[idxs] = y_
        return y_hat
    
    def evaluate(self, pairs, y):
        y_hat = self.predict(pairs)
        idxs = [i for i in range(len(y_hat)) if not np.isnan(y_hat[i])]
        y_hat = [y_hat[i] for i in idxs]
        y = [y[i] for i in idxs]
        return roc_auc_score(y, y_hat)
    
mdl = BimodalStackedModel(cemb, pemb)

mdl.fit(pairs_tr, y_tr)
mdl.evaluate(pairs_te, y_te)

0.5242395357430798

In [46]:
bge = BiotequeGeneEmbedding()
pse = ProteinSequenceEmbedding()
cse = CompoundStructureEmbedding()

In [50]:
cemb_list = []
for x in cse.available():
    cemb_list += [(x, CompoundStructureEmbedding(x).get())]

pemb_list = []
for x in pse.available():
    pemb_list += [(x, ProteinSequenceEmbedding(x).get())]

for x in bge.available().values:
    pemb_list += [(x, BiotequeGeneEmbedding(x[0], x[1]).get())]

In [51]:
for cemb in cemb_list:
    for pemb in pemb_list:
        model = BimodalStackedModel(cemb[1], pemb[1])
        model.fit(pairs_tr, y_tr)
        print(cemb[0], pemb[0], model.evaluate(pairs_te, y_te))

ersilia esm1b 0.5426716608438591
ersilia prot5 0.5580867038430529
ersilia ['GEN-pdf-TIS' 'hpa_proteome'] 0.5062456883808022
ersilia ['GEN-has-DOM' 'interpro'] 0.5416626851409461
ersilia ['GEN-pab-TIS' 'hpa_proteome'] 0.5321522839905193
ersilia ['GEN-has-MFN' 'gomf_goa_curated'] 0.5180914264873839
ersilia ['GEN-ppi-GEN' 'string'] 0.5547526651500959
ersilia ['GEN-ass-DIS' 'opentargets'] 0.5194402547343724
signaturizer esm1b 0.6007481567909134
signaturizer prot5 0.5620125375855738
signaturizer ['GEN-pdf-TIS' 'hpa_proteome'] 0.5954956171581507
signaturizer ['GEN-has-DOM' 'interpro'] 0.596139923027093
signaturizer ['GEN-pab-TIS' 'hpa_proteome'] 0.5775480356376127
signaturizer ['GEN-has-MFN' 'gomf_goa_curated'] 0.6425680840898232
signaturizer ['GEN-ppi-GEN' 'string'] 0.6339556183318675
signaturizer ['GEN-ass-DIS' 'opentargets'] 0.6289106545628286
grover esm1b 0.5318039820843841
grover prot5 0.5387878939950904
grover ['GEN-pdf-TIS' 'hpa_proteome'] 0.5310604758514007
grover ['GEN-has-DOM' 'int

In [52]:
df = pd.read_csv("../data/pharmgkb_processed/final_tables/pgkb_merged.csv", low_memory=False)

In [73]:
print(sorted([str(x) for x in list(set(df[df["chemical"] == "warfarin"]["gene"]))]))

['ABCB1', 'APOB', 'APOC1', 'APOE', 'ASPH', 'CACNA1C', 'CALU', 'CRP', 'CYP1A1', 'CYP1A2', 'CYP2A6', 'CYP2C18', 'CYP2C19', 'CYP2C9', 'CYP3A4', 'CYP4F11', 'CYP4F2', 'DDHD1', 'DNMT3A', 'EPHA7', 'EPHX1', 'F11', 'F13A1', 'F2', 'F5', 'F7', 'FGFBP2', 'FPGS', 'GATA4', 'GGCX', 'HNF4A', 'LRP1', 'MIR133B', 'MPZ', 'MYC', 'NEDD4', 'NQO1', 'NR1I2', 'NR1I3', 'OPN1SW', 'ORM1', 'ORM2', 'POR', 'PROC', 'PROCR', 'PROS1', 'PRSS53', 'SLC15A2', 'SLCO1B3', 'STX1B', 'STX4', 'THBD', 'UGT1A1', 'VDR', 'VEGFA', 'VKORC1', 'VKORC1L1', 'nan']


In [59]:
dg = pd.read_csv("../data/pharmgkb_processed/gene.csv")

In [62]:
dg[dg["gene"] == "CYB5R1"]

Unnamed: 0,gid,gene,hgnc_id,ensembl_id,vip,variant_annotation,dosing_guideline
4256,PA134979668,CYB5R1,HGNC:13397,ENSG00000159348,-1,-1,-1


In [63]:
d = pd.read_csv("../data/pharmgkb_processed/drug_labels.csv")

In [65]:
d[d["gene"] == "CYB5R1"]

Unnamed: 0,dlid,source,drug_label,prescribing_guideline,dosing_guideline,chemical,gene,variant,haplotype
321,PA166105195,FDA,3,-1,-1,metoclopramide,CYB5R1,,
442,PA166127702,HCSC,3,-1,-1,primaquine,CYB5R1,,
461,PA166104783,FDA,4,-1,-1,rasburicase,CYB5R1,,
785,PA166163431,FDA,4,-1,-1,isosorbide mononitrate,CYB5R1,,
809,PA166170934,FDA,3,-1,-1,primaquine,CYB5R1,,
1012,PA166163430,FDA,4,-1,-1,isosorbide dinitrate,CYB5R1,,
