In [1]:
import pandas as pd
import os

DATAPATH = "../data/pharmgkb_processed"

# Chemicals

We are missing the SMILES for ~1.7k chemicals out of 4.5k. Many of them are from Drug Classes, and potentially could be separated into the different classes they are referred to.
By searching with the PharmGKB API, we have only found 3 more SMILES. The PharmGKB API search is now incorporated in chemical.py for future uses.

In [22]:
#re-parse chemicals file to get smiles from API if they are not there and compare with original file

df = pd.read_csv(os.path.join(DATAPATH, "0_chemical.csv"))
print(len(df))
df = df[df["smiles"].isna()]
len(df)

4532


1753

In [19]:
import requests

cid2smi = {}
for cid in df["cid"].tolist():
    url = "https://api.pharmgkb.org/v1/data/chemical/{}?view=base".format(cid)
    response = requests.get(url)

    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()
        print(data)
        try:
            smiles = data['data']['smiles']
            print("SMILES from API!")
            cid2smi[cid] = smiles
        except:
            smiles=None
    else:
        print("Failed to fetch data from the API. Status code:", response.status_code)

{'data': {'objCls': 'Chemical', 'id': 'PA166131395', 'name': '10-hydroxy r-warfarin', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA166165005', 'name': '12-hydroxy-sirolimus', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA166160599', 'name': '14-hydroxyclarithromycin', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA166131541', 'name': '1-hydroxyibuprofen glucuronide', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA166131511', 'name': '2,3-diene valproic acid-coenzyme A', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA166131512', 'name': '2,4-diene valproic acid-coenzyme A', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA1647

In [20]:
cid2smi #we have manually modified these three to avoid the re-fetching of all smiles

{'PA164748138': 'C[C@H](CCCC(C)C)[C@H]1CC[C@@H]2[C@@]1(CCCC2=CC=C3C[C@H](CCC3=C)O)C',
 'PA449932': 'CC(=O)[C@]1(CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2CCC4=CC(=O)CC[C@]34C)C)O',
 'PA451652': 'CC1=C(SC=[N+]1CC2=CN=C(N=C2N)C)CCO'}

# Genes, haplotypes and variants

We have 24.5k genes, but only 1k have information about variants/haplotypes. We are assuming that if a gene does not have any described haplotype or variant, it will not have any PGx annotation in PharmGKB.
The genes do not have their variants listed when searching via the API, that is why we have parsed the rlx file as the source of truth for haplotypes, and from there we are getting the variants.

In [10]:
DATAPATH = "../data/pharmgkb_processed_revision1"

In [11]:
gene = pd.read_csv(os.path.join(DATAPATH,"0_gene.csv"))
rlx = pd.read_csv(os.path.join(DATAPATH,"1_haplotype_rlx.csv"))
haps = pd.read_csv(os.path.join(DATAPATH, "2_haplotype.csv"))

print("All genes in Pharmgkb: ", len(set(gene["gene"])))
print("All genes with haplotypes/variants in PharmGKB: ", len(set(rlx["gid"])))
print("All haplotypes in PharmGKB: ", len(set(haps["hid"])))

All genes in Pharmgkb:  24550
All genes with haplotypes/variants in PharmGKB:  45
All haplotypes in PharmGKB:  6471


In [12]:
#Manually check that indeed genes that are not in the RLX file do not have variants reported in PharmGKB
no_hap = gene[~gene['gid'].isin(rlx['gid'])]
print(len(no_hap))
print("Genes without haplotypes: ", no_hap.sample(n=10)['gid'].tolist())
print("Genes with haplotypes: ", rlx.sample(n=10)['gid'].tolist())

24505
Genes without haplotypes:  ['PA34140', 'PA165512406', 'PA24731', 'PA165393835', 'PA142671102', 'PA25487', 'PA26861', 'PA38663', 'PA33983', 'PA164722813']
Genes with haplotypes:  ['PA35057', 'PA34896', 'PA356', 'PA28469', 'PA37178', 'PA121', 'PA28469', 'PA28469', 'PA35056', 'PA28469']


# Haplotype to variant

We get the information of the variants found within an haplotype by downloading all haplotype files from PharmGKB (by gene)

In [13]:
#genes for which there is haplotype information:
assert len(list(set(gene["gene"].tolist()))) == len(no_hap)+len(list(set(rlx["gene"].tolist())))
print("Genes with Haplotypes: ", len(list(set(rlx["gene"].tolist()))))

Genes with Haplotypes:  45


In [92]:
folder_path = os.path.join(DATAPATH, "haplotypes")
files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
num_files = len(files)
print(f"There are {num_files} files in the haplotype folder.")
gene_names = [f.split('_')[0] for f in files]
df = pd.DataFrame({'gene': gene_names})
#df.to_csv(os.path.join(DATAPATH, "reference_alleles.csv"), index=False) #manually curate the reference alleles, get them in a list

There are 49 files in the haplotype folder.


In [4]:
ex = ["The CYP2D6*2 allele is assigned as a normal function allele by CPIC. AAAAAXjnedo",
"The CYP1C9*2 allele is assigned as a normal function allele by CPIC. bulrfbv"]

def get_allele(statements):
    haps = []
    for s in statements:
        if "allele is assigned as a normal function allele by CPIC." in s:
            hap = s.split(" ")[1]
            haps += [hap]
    return haps

haps = get_allele(ex)
print(haps)

['CYP2D6*2', 'CYP1C9*2']


In [90]:
gene_names_rlx = list(set(rlx["gene"].tolist()))

In [16]:
new_genes = set(gene_names) - set(gene_names_rlx)
print(new_genes)
print(len(list(new_genes))-1, " genes were not in RLX but have a Haplotype file in PharmGKB")

{'ABCG2', 'VKORC1', 'F5', 'manual'}
3  genes were not in RLX but have a Haplotype file in PharmGKB


In [17]:
new_genes = set(gene_names_rlx) - set(gene_names)
print(new_genes)
print(len(list(new_genes)), " genes were in RLX but do not have a Haplotype file in PharmGKB")

set()
0  genes were in RLX but do not have a Haplotype file in PharmGKB


In [18]:
#also quickly check we are not missing any gene from rlx
import sys 
cwd = os.getcwd()
sys.path.append(os.path.join(cwd, "..", "src"))
from pharmgkb import RawData

def get_raw_files():
    r = RawData()
    df = r.relationships
    return df

df = get_raw_files()

entity_types = list(set(df["Entity1_type"].tolist()))
print(entity_types)

['Haplotype', 'Disease', 'Gene', 'Chemical', 'Variant']


In [19]:
df1 = df[df["Entity1_type"] == "Haplotype"]
print(df1.shape)
df1.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)
print(df1.shape)

(11788, 11)
(1028, 11)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)


In [20]:
df2 = df[df["Entity2_type"] == "Haplotype"]
print(df2.shape)
df2.drop_duplicates(subset=["Entity2_name"], keep="first", inplace=True)
print(df2.shape)

(11788, 11)
(1029, 11)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop_duplicates(subset=["Entity2_name"], keep="first", inplace=True)


In [21]:
h2v = pd.read_csv(os.path.join(DATAPATH, "3_hid_vid_complete.csv"))
print(h2v.shape)
print(len(set(h2v["vid"])), " unique variants associated to an haplotype") 

#why the difference?
h2v_nohla = h2v[~h2v["haplotype"].str.contains('HLA')]
print(h2v_nohla.shape)

(8715, 12)
1111  unique variants associated to an haplotype
(3549, 12)


### Variants

In [68]:
vars = pd.read_csv(os.path.join(DATAPATH, "5_variant_complete.csv"))
vars.shape

(8320, 4)

In [31]:
df1 = df[df["Entity1_type"] == "Variant"]
print(df1.shape)
df1.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)
print(df1.shape)

df2 = df[df["Entity1_type"] == "Variant"]
print(df2.shape)
df2.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)
print(df2.shape)

(28032, 11)
(6307, 11)
(28032, 11)
(6307, 11)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)


In [32]:
vars_rlx = list(set(df1["Entity1_id"]))
print(len(vars_rlx), " Unique Variants in Relationship file")
print(len(set(vars["vid"])), "Unique Variants in Variants file")

6307  Unique Variants in Relationship file
7325 Unique Variants in Variants file


In [33]:
len(h2v)

8715

In [34]:
vars_h2v = list(set(h2v["vid"]))
vars_rlx = list(set(df1["Entity1_id"]))
vars_file = list(set(vars["vid"]))

print(len(vars_h2v), " Unique Variants in Hid to Vid file")
print(len(vars_rlx), " Unique Variants in Relationship file")
print(len(set(vars["vid"])), "Unique Variants in Variants file")

print(len(set(vars_h2v)-set(vars_file)), " Variants present in Hid2Vid but not in Variant File")
print(len(set(vars_rlx)-set(vars_file)), " Variants present in RLX file but not in Variant File")
print(len(set(vars_h2v)-set(vars_rlx)), " Variants present in H2V but not RLX")

1111  Unique Variants in Hid to Vid file
6307  Unique Variants in Relationship file
7325 Unique Variants in Variants file
0  Variants present in Hid2Vid but not in Variant File
63  Variants present in RLX file but not in Variant File
766  Variants present in H2V but not RLX


In [35]:
# Total Unique Variants

total_vars = set(vars_h2v+vars_rlx+vars_file)
print(len(total_vars))

7388


In [79]:
# we will also parse RLX to get these 63 variants that are not present in the variant_complete file

import sys 
cwd = os.getcwd()
sys.path.append(os.path.join(cwd, "..", "src"))
from pharmgkb import RawData
from utils import CsvCleaner

def get_raw_files():
    r = RawData()
    df = r.relationships
    return df

def parse_rlx():
    df = get_raw_files()
    df = df[df["Entity1_type"]=="Variant"]
    df = df[df["Entity2_type"]=="Haplotype"]
    return df

df = parse_rlx()


vars = pd.read_csv(os.path.join(DATAPATH, "5_variant_complete.csv"))
vars_file = list(set(vars["vid"]))
vars_rlx = list(set(df["Entity1_id"]))
print(len(set(vars_rlx)-set(vars_file)), " Variants present in RLX file but not in Variant File")

# 11 variants in RLX not in Var Complete are associated to a gene
# 1 variant in RLX not in Var Complete are associated to a haplotype
# we will need an online query to get the genes associated to the 68 variants

1  Variants present in RLX file but not in Variant File


## Clinical Annotations and Clinical Variants

From Clinical Annotations, we only consider the clinical_annotations.csv file

In [31]:
clann = pd.read_csv("../data/pharmgkb/clinicalAnnotations/clinical_annotations.csv")
clvar = pd.read_csv("../data/pharmgkb/clinicalVariants/clinicalVariants.csv")
dfa = pd.read_csv(os.path.join(DATAPATH, "6_clinical_annotation.csv"))
dfv = pd.read_csv(os.path.join(DATAPATH, "7_clinical_variant.csv"))

In [32]:
# Difference between clinical annotation and clincal variant?
clann = clann[['Variant/Haplotypes', 'Gene','Level of Evidence',  'Phenotype Category',  'Drug(s)','Phenotype(s)']]
clann.rename(columns={'Variant/Haplotypes': 'variant',    
                      'Gene': 'gene',
                      'Level of Evidence':'level of evidence',  
                      'Phenotype Category':'type', 
                      'Drug(s)': 'chemicals',
                      'Phenotype(s)':'phenotypes'
                    }, inplace=True
             )
clann = clann[['variant', 'gene', 'type', 'level of evidence', 'chemicals', 'phenotypes']]

In [33]:
print(clann.shape)
print(clvar.shape)

(5036, 6)
(5036, 6)


In [38]:
common_column = 'variant'
df1_sorted = clann.sort_values(by=common_column).reset_index(drop=True)
df2_sorted = clvar.sort_values(by=common_column).reset_index(drop=True)

# Compare if the sorted DataFrames are identical
if df1_sorted.equals(df2_sorted):
    print("DataFrames are identical.")
else:
    print("DataFrames are not identical.")

DataFrames are not identical.


In [43]:
#check if we have variants in one that are not present in the other
set(df1_sorted["variant"])-set(df2_sorted["variant"])
set(df2_sorted["variant"])-set(df1_sorted["variant"])

set()

# Compare pipeline results

In [36]:
DATAPATH = "../data"
df1 = pd.read_csv(os.path.join(DATAPATH, "pharmgkb_processed_revision1","14_pgkb_merged.csv"), low_memory=False)
df2 = pd.read_csv(os.path.join(DATAPATH, "pharmgkb_processed_old","final_tables", "pgkb_merged.csv"), low_memory=False)

In [37]:
print(df1.shape)
print(df2.shape)

(345091, 16)
(345420, 16)


In [38]:
print("Compounds:", len(set(df1["cid"])))
print("Genes:    ", len(set(df1["gid"])))
print("Variants: ", len(set(df1["vid"])))

Compounds: 1190
Genes:     1949
Variants:  6986


In [39]:
print("Compounds:", len(set(df2["cid"])))
print("Genes:    ", len(set(df2["gid"])))
print("Variants: ", len(set(df2["vid"])))

Compounds: 1190
Genes:     1949
Variants:  6986


# MANUAL CHECK OF DATA

we compare the data curated in our file with the annotations we can retrieve from PharmGKB to ensure we are not losing information in the process

In [40]:
df = pd.read_csv(os.path.join(DATAPATH, "pharmgkb_processed_revision1", "14_pgkb_merged.csv"))
print(df.columns)
df['caid'].fillna(0, inplace=True)
df['caid'] = df['caid'].astype(int)
df['vaid'].fillna(0, inplace=True)
df['vaid'] = df['vaid'].astype(int)

Index(['cid', 'chemical', 'smiles', 'gid', 'gene', 'ensembl_id', 'vid',
       'variant', 'evidence', 'significance', 'phenotype', 'did', 'disease',
       'biogroup', 'caid', 'vaid'],
      dtype='object')


  df = pd.read_csv(os.path.join(DATAPATH, "pharmgkb_processed_revision1", "14_pgkb_merged.csv"))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['caid'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['vaid'].fillna(0, inplace=True)


In [41]:
#randomly select 10 drugs and compare the information of each drug on PharmGKB
#cids = df['cid'].sample(n=10).tolist()
cids = ['PA450640', 'PA449015', 'PA451866', 'PA452233', 'PA450801', 'PA450550', 'PA165823907', 'PA450401', 'PA451906', 'PA451241']
print(cids)

['PA450640', 'PA449015', 'PA451866', 'PA452233', 'PA450801', 'PA450550', 'PA165823907', 'PA450401', 'PA451906', 'PA451241']


In [42]:
cpd = cids[0]

df_ = df[df["cid"]==cpd]
print(cpd)
print(df_["chemical"].tolist()[0])
print("Genes:    ", len(set(df_["gid"])))
print("Variants: ", len(set(df_["vid"])))

PA450640
nitrofurantoin
Genes:     1
Variants:  172


In [59]:
#check if we can reduce the variability
df2 = df_[['cid', 'chemical', 'smiles', 'gid', 'gene', 'ensembl_id', 'vid',
       'variant', 'evidence', 'phenotype']]
df2.drop_duplicates(keep="first", inplace=True)
df2.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop_duplicates(keep="first", inplace=True)


(165, 10)

In [30]:
df2

Unnamed: 0,cid,chemical,smiles,gid,gene,ensembl_id,vid,variant,evidence,phenotype
24739,PA450640,nitrofurantoin,C1C(=O)NC(=O)N1/N=C\C2=CC=C(O2)[N+](=O)[O-],PA28469,G6PD,ENSG00000160211,PA166157860,rs1050829,3,Toxicity
24740,PA450640,nitrofurantoin,C1C(=O)NC(=O)N1/N=C\C2=CC=C(O2)[N+](=O)[O-],PA28469,G6PD,ENSG00000160211,PA166157859,rs1050828,3,Toxicity
24741,PA450640,nitrofurantoin,C1C(=O)NC(=O)N1/N=C\C2=CC=C(O2)[N+](=O)[O-],PA28469,G6PD,ENSG00000160211,PA166287041,NC_000023.11:g.154532046A>C,3,Toxicity
24742,PA450640,nitrofurantoin,C1C(=O)NC(=O)N1/N=C\C2=CC=C(O2)[N+](=O)[O-],PA28469,G6PD,ENSG00000160211,PA166287042,NC_000023.11:g.154532058_154532060delTTC,3,Toxicity
24743,PA450640,nitrofurantoin,C1C(=O)NC(=O)N1/N=C\C2=CC=C(O2)[N+](=O)[O-],PA28469,G6PD,ENSG00000160211,PA166287043,NC_000023.11:g.154532082G>A,3,Toxicity
...,...,...,...,...,...,...,...,...,...,...
24909,PA450640,nitrofurantoin,C1C(=O)NC(=O)N1/N=C\C2=CC=C(O2)[N+](=O)[O-],PA28469,G6PD,ENSG00000160211,PA166287148,NC_000023.11:g.154546116C>T,3,Toxicity
24910,PA450640,nitrofurantoin,C1C(=O)NC(=O)N1/N=C\C2=CC=C(O2)[N+](=O)[O-],PA28469,G6PD,ENSG00000160211,PA166287149,NC_000023.11:g.154546122C>A,3,Toxicity
24911,PA450640,nitrofurantoin,C1C(=O)NC(=O)N1/N=C\C2=CC=C(O2)[N+](=O)[O-],PA28469,G6PD,ENSG00000160211,PA166287150,NC_000023.11:g.154546131G>A,3,Toxicity
93988,PA450640,nitrofurantoin,C1C(=O)NC(=O)N1/N=C\C2=CC=C(O2)[N+](=O)[O-],PA28469,G6PD,ENSG00000160211,PA166157959,G6PD deficiency,,Toxicity


In [31]:
print("Genes:    ", len(set(df2["gid"])))
print("Variants: ", len(set(df2["vid"])))

Genes:     1
Variants:  172


In [32]:
print(set(df2["variant"]))

{'rs34193178', 'rs782757170', 'rs398123546', 'NC_000023.11:g.154532699G>C', 'rs5030872', 'rs137852331', 'NC_000023.11:g.154532608C>T', 'NC_000023.11:g.154546057T>C', 'NC_000023.11:g.154535261C>A', 'rs137852320', 'NC_000023.11:g.154532279C>G', 'rs76645461', 'NC_000023.11:g.154533031C>T', 'NC_000023.11:g.154532773C>T', 'rs138687036', 'NC_000023.11:g.154535301A>G', 'rs267606835', 'NC_000023.11:g.154532231T>G', 'NC_000023.11:g.154534409G>C', 'rs137852326', 'rs267606836', 'NC_000023.11:g.154533619T>A', 'NC_000023.11:g.154533016G>T', 'NC_000023.11:g.154535247G>A', 'NC_000023.11:g.154532991_154532993delGGT', 'NC_000023.11:g.154532679A>G', 'NC_000023.11:g.154536151G>A', 'NC_000023.11:g.154532458A>C', 'rs137852347', 'NC_000023.11:g.154532086C>T', 'NC_000023.11:g.154532692T>C', 'NC_000023.11:g.154534036G>C', 'rs137852329', 'NC_000023.11:g.154532716T>C', 'NC_000023.11:g.154536008A>T', 'NC_000023.11:g.154536045C>G', 'NC_000023.11:g.154532392A>T', 'NC_000023.11:g.154532623T>C', 'NC_000023.11:g.1545

In [34]:
print(set(df2["evidence"]))

{'3', nan, nan}


In [78]:
# get data for testing

chemicals = ["warfarin", "rasburicase", "nitrofurantoin", "bupivacaine"]
chemicals_red = ["rasburicase", "nitrofurantoin", "bupivacaine"]

def select_rows_with_chemicals(df, chemicals):
    return df[df["Chemical Name"].str.contains("|".join(chemicals))]
df =pd.read_csv("../data/pharmgkb/automated_annotations/automated_annotations.csv")

outfile = "../scripts/test_autom_ann"

df_ = select_rows_with_chemicals(df, chemicals)
df__ = select_rows_with_chemicals(df, chemicals_red)

df_.to_csv(os.path.join(outfile, "autom_ann_test.csv"), index=False)
df__.to_csv(os.path.join(outfile, "autom_ann_test_.csv"), index=False)

# After Redoing the pipeline, check again:

In [3]:
import pandas as pd
import os
DATAPATH = "../data/pharmgkb_processed"
df = pd.read_csv(os.path.join(DATAPATH, "13_pgkb_merged.csv"))
print("Compounds:", len(set(df["cid"])))
print("Genes:    ", len(set(df["gid"])))
print("Variants: ", len(set(df["vid"])))

def count_pairs(df):
    x = []
    for r in df[df["gid"].notnull()][["cid", "gid"]].values:
        x += [(r[0], r[1])]
    return len(set(x))

print("Unique pairs:", count_pairs(df))

Compounds: 1117
Genes:     1634
Variants:  4526
Unique pairs: 7283


  df = pd.read_csv(os.path.join(DATAPATH, "13_pgkb_merged.csv"))


In [4]:
def report(data):
    print("Compounds:", len(set(data["cid"])))
    print("Genes:    ", len(set(data["gid"])))
    print("Variants: ", len(set(data["vid"])))
    print("Unique pairs:", count_pairs(data))

report(df[df["evidence"] == "1B"])

Compounds: 418
Genes:     182
Variants:  223
Unique pairs: 762


In [5]:
report(df[(df["phenotype"].notnull()) & (df["evidence"] == "1B")])

Compounds: 12
Genes:     9
Variants:  223
Unique pairs: 16


## finding repeated data

In [6]:
df2 = df[['cid', 'chemical', 'smiles', 'gid', 'gene', 'ensembl_id', 'vid',
       'variant', 'evidence', 'phenotype']]
df2.drop_duplicates(keep="first", inplace=True)
print(df2.shape)

df2 = df[['cid', 'chemical', 'smiles', 'gid', 'gene', 'ensembl_id', 'vid',
       'variant', 'evidence']]
df2.drop_duplicates(keep="first", inplace=True)
print(df2.shape)

df2 = df[['cid', 'chemical', 'smiles', 'gid', 'gene', 'ensembl_id', 'vid',
       'variant', 'phenotype']]
df2.drop_duplicates(keep="first", inplace=True)
print(df2.shape)

df2 = df[['cid', 'chemical', 'smiles', 'gid', 'gene', 'ensembl_id', 'vid',
       'variant']]
df2.drop_duplicates(keep="first", inplace=True)
print(df2.shape)

(80946, 10)
(63823, 9)
(58124, 9)
(34657, 8)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop_duplicates(keep="first", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop_duplicates(keep="first", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop_duplicates(keep="first", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop_duplicates(keep

## Manual check

In [3]:
cids = ['PA450640', 'PA449015', 'PA451866', 'PA452233', 'PA450801', 'PA450550', 'PA165823907', 'PA450401', 'PA451906', 'PA451241']
print(cids)

['PA450640', 'PA449015', 'PA451866', 'PA452233', 'PA450801', 'PA450550', 'PA165823907', 'PA450401', 'PA451906', 'PA451241']


In [45]:
cpd = cids[0]

df_ = df[df["cid"]==cpd]
print(cpd)
print(df_["chemical"].tolist()[0])
print("Genes:    ", len(set(df_["gid"])))
print("Variants: ", len(set(df_["vid"])))

PA450640
nitrofurantoin
Genes:     1
Variants:  3


In [46]:
# genes with clinical guidelines
clin_ev = df_[(df_["evidence"] == "1A") | (df_["evidence"] == "1B")]["gene"]
set(clin_ev)

{'G6PD'}

In [47]:
# genes with level 2
ev= df_[(df_["evidence"] == "2")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV2", ev)

# genes with level 3
ev= df_[(df_["evidence"] == "3")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV3", ev)

# genes with level 4
ev= df_[(df_["evidence"] == "4")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV4", ev)

# genes with other evidence 
evo = df_[~df_["evidence"].isin(["1A", "1B", "2", "3", "4"])]["gene"]
evo = [x for x in evo if x==x] 
evo = sorted(list(set(evo)))
print("genes with other EV", evo)

genes with EV2 []
genes with EV3 ['G6PD']
genes with EV4 []
genes with other EV []


### Citalopram

In [5]:
cpd = cids[1]

df_ = df[df["cid"]==cpd]
print(cpd)
print(df_["chemical"].tolist()[0])
print("Genes:    ", len(set(df_["gid"])))
print("Variants: ", len(set(df_["vid"])))

PA449015
citalopram
Genes:     39
Variants:  210


In [8]:
# genes with clinical guidelines
clin_ev = df_[(df_["evidence"] == "1A") | (df_["evidence"] == "1B")]["gene"]
set(clin_ev)

{'CYP2C19', 'CYP2D6'}

In [31]:
# genes with level 2
ev= df_[(df_["evidence"] == "2")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV2", ev)

# genes with level 3
ev= df_[(df_["evidence"] == "3")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV3", ev)

# genes with level 4
ev= df_[(df_["evidence"] == "4")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV4", ev)

# genes with other evidence 
evo = df_[~df_["evidence"].isin(["1A", "1B", "2", "3", "4"])]["gene"]
evo = [x for x in evo if x==x] 
evo = sorted(list(set(evo)))
print("genes with other EV", evo)

genes with EV2 []
genes with EV3 ['ABCB1', 'COMT', 'CYP2C19', 'CYP2D6', 'FKBP5', 'GABRA6', 'GABRP', 'GABRQ', 'GRIA3', 'HTR1B', 'HTR2A', 'SLC6A2', 'TPH2']
genes with EV4 ['CYP2C19']
genes with other EV ['ABCB1', 'ANKK1', 'COMT', 'CYP2C9', 'CYP2D6', 'DRD2', 'FKBP5', 'GABRQ', 'GRIA3', 'HTR1B', 'HTR2A', 'HTR7', 'LBP', 'MAOA', 'SLC6A2', 'SLC6A4', 'TPH2']


### Venlafaxine

In [27]:
cpd = cids[2]

df_ = df[df["cid"]==cpd]
print(cpd)
print(df_["chemical"].tolist()[0])
print("Genes:    ", len(set(df_["gid"])))
print("Variants: ", len(set(df_["vid"])))

PA451866
venlafaxine
Genes:     20
Variants:  183


In [28]:
# genes with clinical guidelines
clin_ev = df_[(df_["evidence"] == "1A") | (df_["evidence"] == "1B")]["gene"]
set(clin_ev)

## WE ARE MISSING TWO GENES, SLC6A4 and HTR2A, In prescribing info, but not in drug label? they are not in the clinical annotations either

{'CYP2D6'}

In [30]:
# genes with level 2
ev= df_[(df_["evidence"] == "2")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV2", ev)

# genes with level 3
ev= df_[(df_["evidence"] == "3")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV3", ev)

# genes with level 4
ev= df_[(df_["evidence"] == "4")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV4", ev)

# genes with other evidence 
evo = df_[~df_["evidence"].isin(["1A", "1B", "2", "3", "4"])]["gene"]
evo = [x for x in evo if x==x] 
evo = sorted(list(set(evo)))
print("genes with other EV", evo)

#missing SLC6A4 Evidence 4, but present in other ev

genes with EV2 []
genes with EV3 ['ABCB1', 'COMT', 'CYP2C19', 'CYP2D6', 'FKBP5', 'GABRA6', 'GABRP', 'GABRQ', 'GRIA3', 'HTR1B', 'HTR2A', 'SLC6A2', 'TPH2']
genes with EV4 ['CYP2C19']
genes with other EV ['ABCB1', 'ANKK1', 'COMT', 'CYP2C9', 'CYP2D6', 'DRD2', 'FKBP5', 'GABRQ', 'GRIA3', 'HTR1B', 'HTR2A', 'HTR7', 'LBP', 'MAOA', 'SLC6A2', 'SLC6A4', 'TPH2']


### Paroxetine

In [33]:
cpd = cids[4]

df_ = df[df["cid"]==cpd]
print(cpd)
print(df_["chemical"].tolist()[0])
print("Genes:    ", len(set(df_["gid"])))
print("Variants: ", len(set(df_["vid"])))

PA450801
paroxetine
Genes:     25
Variants:  193


In [34]:
# genes with clinical guidelines
clin_ev = df_[(df_["evidence"] == "1A") | (df_["evidence"] == "1B")]["gene"]
set(clin_ev)

## WE ARE MISSING TWO GENES, SLC6A4 and HTR2A, In prescribing info, but not in drug label? they are not in the clinical annotations either

{'CYP2D6'}

In [35]:
# genes with level 2
ev= df_[(df_["evidence"] == "2")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV2", ev)

# genes with level 3
ev= df_[(df_["evidence"] == "3")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV3", ev)

# genes with level 4
ev= df_[(df_["evidence"] == "4")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV4", ev)

# genes with other evidence 
evo = df_[~df_["evidence"].isin(["1A", "1B", "2", "3", "4"])]["gene"]
evo = [x for x in evo if x==x] 
evo = sorted(list(set(evo)))
print("genes with other EV", evo)

#missing SLC6A4 Evidence 4, but present in other ev

genes with EV2 []
genes with EV3 ['ABCB1', 'ADM', 'BDNF', 'COMT', 'CYP1A2', 'DRD3', 'FKBP5', 'GDNF', 'HTR1A', 'HTR1B', 'HTR2A', 'HTR7', 'MDGA2', 'REEP5', 'RPP30', 'SLC6A4', 'SRP19']
genes with EV4 ['HTR2A', 'HTR3B', 'TPH1']
genes with other EV ['ABCB1', 'ADM', 'BDNF', 'BDNF-AS', 'COMT', 'CYP1A2', 'CYP2D6', 'DRD3', 'FKBP5', 'GDNF', 'HTR1A', 'HTR1B', 'HTR2A', 'HTR3A', 'HTR3B', 'HTR7', 'IL1B', 'MAOA', 'MDGA2', 'REEP5', 'RPP30', 'SBF2', 'SLC6A4', 'SRP19']


### Morphine

In [36]:
cpd = cids[5]

df_ = df[df["cid"]==cpd]
print(cpd)
print(df_["chemical"].tolist()[0])
print("Genes:    ", len(set(df_["gid"])))
print("Variants: ", len(set(df_["vid"])))

PA450550
morphine
Genes:     24
Variants:  72


In [37]:
# genes with clinical guidelines
clin_ev = df_[(df_["evidence"] == "1A") | (df_["evidence"] == "1B")]["gene"]
set(clin_ev)

set()

In [38]:
# genes with level 2
ev= df_[(df_["evidence"] == "2")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV2", ev)

# genes with level 3
ev= df_[(df_["evidence"] == "3")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV3", ev)

# genes with level 4
ev= df_[(df_["evidence"] == "4")]["gene"]
ev = [x for x in ev if x==x] 
ev = sorted(list(set(ev)))
print("genes with EV4", ev)

# genes with other evidence 
evo = df_[~df_["evidence"].isin(["1A", "1B", "2", "3", "4"])]["gene"]
evo = [x for x in evo if x==x] 
evo = sorted(list(set(evo)))
print("genes with other EV", evo)

#missing SLC6A4 Evidence 4, but present in other ev

genes with EV2 []
genes with EV3 ['ABCB1', 'ABCC3', 'COMT', 'FAAH', 'IL1B', 'KCNJ6', 'OPRK1', 'OPRM1', 'RHBDF2', 'SLC6A4', 'SULT1A3', 'SULT1A4', 'TAOK3', 'TLR2', 'UGT2B7']
genes with EV4 ['ABCB1', 'COMT', 'OPRM1', 'UGT2B7']
genes with other EV ['ABCB1', 'ABCC3', 'ARVCF', 'COMT', 'CYP2C19', 'CYP2D6', 'FAAH', 'IL1B', 'KCNJ6', 'MTRF1L', 'OPRD1', 'OPRK1', 'OPRM1', 'RHBDF2', 'SLC22A1', 'SLC6A4', 'TAOK3', 'TLR2', 'TNFRSF1B', 'TXNRD2', 'UGT2B7']


# Deconvoluting Final Table

We want to further deconvolute the chemicals that are grouped into single groups

In [16]:
df = pd.read_csv(os.path.join(DATAPATH, "13_pgkb_merged.csv"), low_memory=False)

df_ = df[df["smiles"].isna()]
print(df_.shape)

no_smi = list(set(df_["cid"]))
print(len(no_smi))
print(no_smi)

(8112, 12)
274
['PA10176', 'PA452233', 'PA164924561', 'PA166190121', 'PA166165399', 'PA132595336', 'PA10505', 'PA164712720', 'PA166115442', 'PA164779048', 'PA164712817', 'PA166190221', 'PA166182624', 'PA166183787', 'PA164713347', 'PA452229', 'PA10715', 'PA165290928', 'PA164713404', 'PA164713175', 'PA166278341', 'PA165860521', 'PA166169875', 'PA166246281', 'PA166268781', 'PA451961', 'PA165906891', 'PA166189801', 'PA452200', 'PA452639', 'PA166184503', 'PA133822447', 'PA164712948', 'PA164776637', 'PA165815771', 'PA165110778', 'PA165948903', 'PA10402', 'PA166184501', 'C010792', 'PA166184498', 'PA452621', 'PA164712462', 'PA165291493', 'PA130232992', 'PA164713160', 'PA164712832', 'PA166184497', 'PA449924', 'PA164712669', 'PA452610', 'PA166190041', 'PA164920420', 'PA166104276', 'PA164713257', 'PA134521193', 'PA166251541', 'PA164712898', 'PA134687942', 'PA164712789', 'PA164712420', 'PA452640', 'PA166277741', 'PA164712704', 'PA164743704', nan, 'PA452634', 'D000077602', 'PA134687887', 'PA1519585

In [6]:
import requests

drug_classes = []
for cid in no_smi:
    url = f'https://api.pharmgkb.org/v1/data/chemical/{cid}'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        drug_type = data["data"]["types"][0]
        if drug_type == "Drug Class":
            drug_classes += [cid]
        print(f'Identifier: {cid}, Type: {drug_type}')
    else:
        print(f'Failed to fetch data for identifier: {cid}')


Identifier: PA10176, Type: Drug
Identifier: PA452233, Type: Drug Class
Identifier: PA164924561, Type: Drug
Identifier: PA166190121, Type: Drug
Identifier: PA166165399, Type: Biological Intermediate
Identifier: PA132595336, Type: Drug
Identifier: PA10505, Type: Drug Class
Identifier: PA164712720, Type: Drug Class
Identifier: PA166115442, Type: Prodrug
Identifier: PA164779048, Type: Drug
Identifier: PA164712817, Type: Drug Class
Identifier: PA166190221, Type: Drug
Identifier: PA166182624, Type: Drug
Identifier: PA166183787, Type: Drug
Identifier: PA164713347, Type: Drug Class
Identifier: PA452229, Type: Drug Class
Identifier: PA10715, Type: Drug
Identifier: PA165290928, Type: Drug
Identifier: PA164713404, Type: Drug Class
Identifier: PA164713175, Type: Drug Class
Identifier: PA166278341, Type: Drug
Identifier: PA165860521, Type: Drug
Identifier: PA166169875, Type: Drug
Identifier: PA166246281, Type: Drug
Identifier: PA166268781, Type: Drug
Identifier: PA451961, Type: Drug
Identifier: PA1

In [15]:
len(drug_classes)
drug_class_df = df[df['cid'].isin(drug_classes)]
dc = drug_class_df[["chemical", "cid"]].drop_duplicates()
dc.to_csv(os.path.join(DATAPATH, "drug_classes.csv"), index=False)

In [21]:
len(set(df[df["evidence"].str.contains("1")]["cid"]))

467

In [24]:
df_[df_["chemical"].isna()]

Unnamed: 0,cid,chemical,smiles,gid,gene,ensembl_id,vid,variant,evidence,phenotype,did,disease
64282,,,,PA26491,CHRNA5,ENSG00000169684,PA166154938,rs16969968,5,Other,,
64298,,,,PA43,AGTR1,ENSG00000144891,PA166156276,rs5186,5,Other,,
64373,,,,PA24415,ABO,,PA166157797,rs8176746,5,,,
64428,,,,PA189,HMGCR,ENSG00000113161,PA166156878,rs12654264,5,Other,,
64448,,,,PA31945,OPRM1,ENSG00000112038,PA166156991,rs1799971,5,Toxicity,,
...,...,...,...,...,...,...,...,...,...,...,...,...
79492,,,,PA121,CYP2A6,"ENSG00000198077,""ENSG00000255974""",PA166279706,rs28399479,5,Other,,
79493,,,,PA121,CYP2A6,"ENSG00000198077,""ENSG00000255974""",PA166279707,rs28399480,5,Other,,
79494,,,,PA121,CYP2A6,"ENSG00000198077,""ENSG00000255974""",PA166279708,NC_000019.10:g.40843693C>G,5,Other,,
79495,,,,PA121,CYP2A6,"ENSG00000198077,""ENSG00000255974""",PA166279709,NC_000019.10:g.40843692G>C,5,Other,,


In [23]:
df_[df_["chemical"].str.contains("/")]

ValueError: Cannot mask with non-boolean array containing NA / NaN values