In [2]:
import pandas as pd
import os

DATAPATH = "../data/pharmgkb_processed"

# Chemicals

We are missing the SMILES for ~1.7k chemicals out of 4.5k. Many of them are from Drug Classes, and potentially could be separated into the different classes they are referred to.
By searching with the PharmGKB API, we have only found 3 more SMILES. The PharmGKB API search is now incorporated in chemical.py for future uses.

In [22]:
#re-parse chemicals file to get smiles from API if they are not there and compare with original file

df = pd.read_csv(os.path.join(DATAPATH, "0_chemical.csv"))
print(len(df))
df = df[df["smiles"].isna()]
len(df)

4532


1753

In [19]:
import requests

cid2smi = {}
for cid in df["cid"].tolist():
    url = "https://api.pharmgkb.org/v1/data/chemical/{}?view=base".format(cid)
    response = requests.get(url)

    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()
        print(data)
        try:
            smiles = data['data']['smiles']
            print("SMILES from API!")
            cid2smi[cid] = smiles
        except:
            smiles=None
    else:
        print("Failed to fetch data from the API. Status code:", response.status_code)

{'data': {'objCls': 'Chemical', 'id': 'PA166131395', 'name': '10-hydroxy r-warfarin', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA166165005', 'name': '12-hydroxy-sirolimus', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA166160599', 'name': '14-hydroxyclarithromycin', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA166131541', 'name': '1-hydroxyibuprofen glucuronide', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA166131511', 'name': '2,3-diene valproic acid-coenzyme A', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA166131512', 'name': '2,4-diene valproic acid-coenzyme A', 'pediatric': False, 'types': ['Metabolite']}, 'status': 'success'}
{'data': {'objCls': 'Chemical', 'id': 'PA1647

In [20]:
cid2smi #we have manually modified these three to avoid the re-fetching of all smiles

{'PA164748138': 'C[C@H](CCCC(C)C)[C@H]1CC[C@@H]2[C@@]1(CCCC2=CC=C3C[C@H](CCC3=C)O)C',
 'PA449932': 'CC(=O)[C@]1(CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2CCC4=CC(=O)CC[C@]34C)C)O',
 'PA451652': 'CC1=C(SC=[N+]1CC2=CN=C(N=C2N)C)CCO'}

# Genes, haplotypes and variants

We have 24.5k genes, but only 1k have information about variants/haplotypes. We are assuming that if a gene does not have any described haplotype or variant, it will not have any PGx annotation in PharmGKB.
The genes do not have their variants listed when searching via the API, that is why we have parsed the rlx file as the source of truth for haplotypes, and from there we are getting the variants.

In [3]:
gene = pd.read_csv(os.path.join(DATAPATH,"0_gene.csv"))
rlx = pd.read_csv(os.path.join(DATAPATH,"1_haplotype_rlx.csv"))
haps = pd.read_csv(os.path.join(DATAPATH, "2_haplotype.csv"))

print("All genes in Pharmgkb: ", len(gene))
print("All genes with haplotypes/variants in PharmGKB: ", len(set(rlx["gid"])))

All genes in Pharmgkb:  24550
All genes with haplotypes/variants in PharmGKB:  45
All haplotypes in PharmGKB: 


In [26]:
#Manually check that indeed genes that are not in the RLX file do not have variants reported in PharmGKB
no_hap = gene[~gene['gid'].isin(rlx['gid'])]
print(len(no_hap))
print("Genes without haplotypes: ", no_hap.sample(n=10)['gid'].tolist())
print("Genes with haplotypes: ", rlx.sample(n=10)['gid'].tolist())

24505
Genes without haplotypes:  ['PA134880462', 'PA36352', 'PA26680', 'PA28694', 'PA25001', 'PA134861867', 'PA134958992', 'PA31958', 'PA30602', 'PA134957759']
Genes with haplotypes:  ['PA28469', 'PA28469', 'PA124', 'PA124', 'PA356', 'PA128', 'PA121', 'PA28469', 'PA124', 'PA126']


# Haplotype to variant

We get the information of the variants found within an haplotype by downloading all haplotype files from PharmGKB (by gene)

In [31]:
#genes for which there is haplotype information:
assert len(list(set(gene["gene"].tolist()))) == len(no_hap)+len(list(set(rlx["gene"].tolist())))
print("Genes with Haplotypes: ", len(list(set(rlx["gene"].tolist()))))

Genes with Haplotypes:  45


In [36]:
folder_path = os.path.join(DATAPATH, "haplotypes")
files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
num_files = len(files)
print(f"There are {num_files} files in the haplotype folder.")
gene_names = [f.split('_')[0] for f in files]

There are 49 files in the haplotype folder.


In [37]:
gene_names_rlx = list(set(rlx["gene"].tolist()))

In [51]:
new_genes = set(gene_names) - set(gene_names_rlx)
print(new_genes)
print(len(list(new_genes))-1, " genes were not in RLX but have a Haplotype file in PharmGKB")

{'VKORC1', 'manual', 'ABCG2', 'F5'}
3  genes were not in RLX but have a Haplotype file in PharmGKB


In [52]:
new_genes = set(gene_names_rlx) - set(gene_names)
print(new_genes)
print(len(list(new_genes)), " genes were in RLX but do not have a Haplotype file in PharmGKB")

set()
0  genes were in RLX but do not have a Haplotype file in PharmGKB


In [63]:
#also quickly check we are not missing any gene from rlx
import sys 
cwd = os.getcwd()
sys.path.append(os.path.join(cwd, "..", "src"))
from pharmgkb import RawData
def get_raw_files():
    r = RawData()
    df = r.relationships
    return df

df = get_raw_files()

entity_types = list(set(df["Entity1_type"].tolist()))
print(entity_types)

['Chemical', 'Variant', 'Haplotype', 'Disease', 'Gene']


In [65]:
df1 = df[df["Entity1_type"] == "Haplotype"]
print(df1.shape)
df1.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)
print(df1.shape)

(11788, 11)
(1028, 11)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)


In [66]:
df2 = df[df["Entity2_type"] == "Haplotype"]
print(df2.shape)
df2.drop_duplicates(subset=["Entity2_name"], keep="first", inplace=True)
print(df2.shape)

(11788, 11)
(1029, 11)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop_duplicates(subset=["Entity2_name"], keep="first", inplace=True)


In [70]:
h2v = pd.read_csv(os.path.join(DATAPATH, "3_hid_vid_complete.csv"))
print(h2v. shape)
print(len(set(h2v["vid"])), " unique variants associated to an haplotype") 

#why the difference?
h2v_nohla = h2v[~h2v["haplotype"].str.contains('HLA')]
print(h2v_nohla.shape)

(8715, 12)
1111  unique variants associated to an haplotype
(3549, 12)


### Variants

In [75]:
vars = pd.read_csv(os.path.join(DATAPATH, "5_variant_complete.csv"))
vars.shape

(7771, 4)

In [73]:
df1 = df[df["Entity1_type"] == "Variant"]
print(df1.shape)
df1.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)
print(df1.shape)

df2 = df[df["Entity1_type"] == "Variant"]
print(df2.shape)
df2.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)
print(df2.shape)

(28032, 11)
(6307, 11)
(28032, 11)
(6307, 11)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop_duplicates(subset=["Entity1_name"], keep="first", inplace=True)


In [76]:
vars_rlx = list(set(df1["Entity1_id"]))
print(len(vars_rlx), " Unique Variants in Relationship file")
print(len(set(vars["vid"])), "Unique Variants in Variants file")

6307  Unique Variants in Relationship file
6777 Unique Variants in Variants file


In [80]:
vars_h2v = list(set(h2v["vid"]))
vars_rlx = list(set(df1["Entity1_id"]))
vars_file = list(set(vars["vid"]))

print(len(vars_h2v), " Unique Variants in Hid to Vid file")
print(len(vars_rlx), " Unique Variants in Relationship file")
print(len(set(vars["vid"])), "Unique Variants in Variants file")

print(len(set(vars_h2v)-set(vars_file)), " Variants present in Hid2Vid but not in Variant File")
print(len(set(vars_rlx)-set(vars_file)), " Variants present in RLX file but not in Variant File")
print(len(set(vars_h2v)-set(vars_rlx)), " Variants present in H2V but not RLX")

1111  Unique Variants in Hid to Vid file
6307  Unique Variants in Relationship file
6777 Unique Variants in Variants file
548  Variants present in Hid2Vid but not in Variant File
63  Variants present in RLX file but not in Variant File
766  Variants present in H2V but not RLX


In [82]:
# Total Unique Variants

total_vars = set(vars_h2v+vars_rlx+vars_file)
print(len(total_vars))

7388
