In [2]:
import os
import sys
import pandas as pd

sys.path.append("../src")
from pharmgkb import RawData


data_path = os.path.join("..", "data")
pharmgkb_path = os.path.join(data_path, "pharmgkb")
processed_path = os.path.join(data_path, "pharmgkb_processed")

## Quick and dirty exploration of files for curation

### Chemicals

In [None]:
df = pd.read_csv(os.path.join(pharmgkb_path, "chemicals", "chemicals.csv"))
for i,c in enumerate(df.columns):
    print(i, c)

df["Top FDA Label Testing Level"].unique()

In [None]:
df[df["Name"]=="apremilast"]

In [None]:
# Retrieve SMILES for PubChem compounds
import requests

data = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/59272813/property/CanonicalSMILES/TXT")
smi = data.text.strip()
smi

In [None]:
sys.path.append(os.path.join("../src"))

from utils import CsvCleaner
import requests

c = CsvCleaner()

data_dict = {}
for r in df.values:
    chemical = c.stringify(r[1])
    crossr = c.inline_comma_splitter(r[6])
    if crossr is not None:
        for i in crossr:
            print(crossr)
            if len(crossr) == 1:
                if "PubChem Compound" in i:
                    cpd = (i.split(":")[-1])
                    data = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/property/CanonicalSMILES/TXT".format(cpd))
                    smi = data.text.strip()
                    print(smi)
            else:
                if "PubChem Compound" in i:
                    cpd = (i.split(":")[-1][:-1])
                    data = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/property/CanonicalSMILES/TXT".format(cpd))
                    smi = data.text.strip()
                    print(smi)


In [61]:
df = pd.read_csv(os.path.join(processed_path, "chemical.csv"))
print(df.shape)
print(len(df[df["smiles"].isna()]))

(4532, 6)
1756


In [49]:
df[(df["smiles"].isna())&(df["chemical_type"]=="Drug")].head(20)

Unnamed: 0,cid,chemical,chemical_type,smiles,dosing_guideline,drug_label
179,PA151958637,4-methylthioamphetamine,Drug,,-1,0
180,PA154081778,4-methylumbelliferone,Drug,,-1,0
222,PA151958362,"6,7-dihydroxybergamottin",Drug,,-1,0
270,PA152530740,"8-cyclopentyl-1,3-dipropylxanthine",Drug,,-1,0
293,PA164747080,abatacept,Drug,,-1,0
294,PA448006,abciximab,Drug,,-1,0
298,PA166104276,ABT-751,Drug,,-1,0
313,PA448014,acetamide mea,Drug,,-1,0
315,PA166246281,acetaminophen / caffeine / dihydrocodeine,Drug,,-1,3
320,PA165290927,acetaminophen/propoxyphene napsylate,Drug,,-1,0


In [52]:
data = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/adalimumab/property/CanonicalSMILES/TXT")
smi = data.text.strip()
print(smi)

Status: 404
Code: PUGREST.NotFound
Message: No CID found
Detail: No CID found that matches the given name


In [56]:
if "Status: 404" in smi:
    print("yes")

yes


### Genes

In [31]:
df = pd.read_csv(os.path.join(pharmgkb_path, "genes", "genes.csv"),encoding= 'unicode_escape')
for i,c in enumerate(df.columns):
    print(i, c)

0 PharmGKB Accession Id
1 NCBI Gene ID
2 HGNC ID
3 Ensembl Id
4 Name
5 Symbol
6 Alternate Names
7 Alternate Symbols
8 Is VIP
9 Has Variant Annotation
10 Cross-references
11 Has CPIC Dosing Guideline
12 Chromosome
13 Chromosomal Start - GRCh37
14 Chromosomal Stop - GRCh37
15 Chromosomal Start - GRCh38
16 Chromosomal Stop - GRCh38


### Disease

In [33]:
df = pd.read_csv(os.path.join(pharmgkb_path, "phenotypes", "phenotypes.csv"),encoding= 'unicode_escape')
for i,c in enumerate(df.columns):
    print(i, c)

0 PharmGKB Accession Id
1 Name
2 Alternate Names
3 Cross-references
4 External Vocabulary


### Variants

In [34]:
df = pd.read_csv(os.path.join(pharmgkb_path, "variants", "variants.csv"),encoding= 'unicode_escape')
for i,c in enumerate(df.columns):
    print(i, c)

0 Variant ID
1 Variant Name
2 Gene IDs
3 Gene Symbols
4 Location
5 Variant Annotation count
6 Clinical Annotation count
7 Level 1/2 Clinical Annotation count
8 Guideline Annotation count
9 Label Annotation count
10 Synonyms


### Haplotypes

In [29]:
df = pd.read_csv(os.path.join(pharmgkb_path, "relationships", "relationships.csv"),encoding= 'unicode_escape')
for i,c in enumerate(df.columns):
    print(i, c)

0 Entity1_id
1 Entity1_name
2 Entity1_type
3 Entity2_id
4 Entity2_name
5 Entity2_type
6 Evidence
7 Association
8 PK
9 PD
10 PMIDs


In [None]:
#We keep all the haplotypes that appear, either in entity 1 or in entity 2, and then get the gene from the Haplotype name
df1 = df["Entity1_name"][df["Entity1_type"]=="Haplotype"].tolist()
print(len(df1))
df2 = df["Entity2_name"][df["Entity2_type"]=="Haplotype"].tolist()
print(len(df2))
print(len(set(df1+df2)))

#get unique hids for either entity 1 or entity2 -- is the same
haps = list(set(df1+df2))
hids = []
for r in df.values:
    hap = r[1]
    hid = r[0]
    if hap in haps:
        hids += [hid]
len(hids)
len(set(hids))

haps = list(set(df1+df2))
hids = []
for r in df.values:
    hap = r[4]
    hid = r[3]
    if hap in haps:
        hids += [hid]
len(hids)
len(set(hids))
hids = list(set(hids))

In [34]:
#We keep all the haplotypes that appear, either in entity 1 or in entity 2, and then get the gene from the Haplotype name
df1 = df[df["Entity1_type"]=="Haplotype"]
print(df1.shape)
df1.drop_duplicates(subset= ["Entity1_name"], keep="first", inplace=True)
print(df1.shape)
df1 = df1[["Entity1_id", "Entity1_name"]]
df1.rename(columns={"Entity1_id": "hid", "Entity1_name":"haplotype"}, inplace=True)

(11788, 11)
(1028, 11)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop_duplicates(subset= ["Entity1_name"], keep="first", inplace=True)


In [47]:
gene = []
for h in df1["haplotype"]:
    if " " in h:
        g = h.split(" ")[0]
    elif "*" in h:
        g = h.split("*")[0]
    else:
        g = h
    gene += [g]
df1["gene"]= gene
gl = list(set(df1["gene"].tolist()))
gl

### Clincal variants

In [None]:
df = pd.read_csv(os.path.join(pharmgkb_path, "clinicalVariants", "clinicalVariants.csv"),encoding= 'unicode_escape')
for i,c in enumerate(df.columns):
    print(i, c)

### Clinical Annotations

In [60]:
#clinical annotations are identified by a single ID but can be  referenced in different files
al = pd.read_csv(os.path.join(pharmgkb_path, "clinicalAnnotations", "clinical_ann_alleles.csv"))
ev = pd.read_csv(os.path.join(pharmgkb_path, "clinicalAnnotations", "clinical_ann_evidence.csv"),encoding= 'unicode_escape')
his = pd.read_csv(os.path.join(pharmgkb_path, "clinicalAnnotations", "clinical_ann_history.csv"),encoding= 'unicode_escape')
cl = pd.read_csv(os.path.join(pharmgkb_path, "clinicalAnnotations", "clinical_annotations.csv"),encoding= 'unicode_escape')


print("Alleles")
for i,c in enumerate(al.columns):
    print(i, c)
print("Evidence")
for i,c in enumerate(ev.columns):
    print(i, c)
print("History")
for i,c in enumerate(his.columns):
    print(i, c)
print("ClinicalAnn")
for i,c in enumerate(cl.columns):
    print(i, c)

Alleles
0 Clinical Annotation ID
1 Genotype/Allele
2 Annotation Text
3 Allele Function
4 test
Evidence
0 Clinical Annotation ID
1 Evidence ID
2 Evidence Type
3 Evidence URL
4 PMID
5 Summary
6 Score
History
0 Clinical Annotation ID
1 Date (YYYY-MM-DD)
2 Type
3 Comment
ClinicalAnn
0 Clinical Annotation ID
1 Variant/Haplotypes
2 Gene
3 Level of Evidence
4 Level Override
5 Level Modifiers
6 Score
7 Phenotype Category
8 PMID Count
9 Evidence Count
10 Drug(s)
11 Phenotype(s)
12 Latest History Date (YYYY-MM-DD)
13 URL
14 Specialty Population


In [70]:
#Use a CAID (Clinical Ann ID) as an example see the information across files
caid = 1183615480
al[al["Clinical Annotation ID"]==caid]

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,test
2809,1183615480,*1,Patients carrying the CYP2D6*1 allele in combi...,Normal function,
2810,1183615480,*2,Patients carrying the CYP2D6*2 allele in combi...,Normal function,
2811,1183615480,*3,Patients carrying the CYP2D6*3 allele in combi...,No function,
2812,1183615480,*4,Patients carrying the CYP2D6*4 allele in combi...,No function,
2813,1183615480,*5,Patients carrying the CYP2D6*5 allele in combi...,No function,
2814,1183615480,*6,Patients carrying the CYP2D6*6 allele in combi...,No function,
2815,1183615480,*10,Patients carrying the CYP2D6*10 allele in comb...,Decreased function (AV 0.25),


In [71]:
ev[ev["Clinical Annotation ID"]==caid]

Unnamed: 0,Clinical Annotation ID,Evidence ID,Evidence Type,Evidence URL,PMID,Summary,Score
2765,1183615480,827807126,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/827...,21599570.0,CYP2D6 *1/*1 is associated with increased clea...,0.0
2766,1183615480,827807118,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/827...,21599570.0,CYP2D6 *1/*1 is associated with increased clea...,0.0
2767,1183615480,827807123,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/827...,21599570.0,CYP2D6 *1/*1 is associated with increased clea...,0.0
2768,1183615480,827807099,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/827...,21599570.0,CYP2D6 *1/*1 is associated with increased clea...,0.0
2769,1183615480,982036882,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/982...,16849011.0,CYP2D6 *10 is associated with decreased metabo...,1.0
2770,1183615480,982036911,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/982...,16849011.0,CYP2D6 *4 is not associated with metabolism of...,-0.125
2771,1183615480,982036924,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/982...,16849011.0,CYP2D6 *5 is not associated with metabolism of...,-0.125
2772,1183615480,982043452,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/982...,16595916.0,CYP2D6 *10/*10 is associated with decreased cl...,2.25
2773,1183615480,982044314,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/982...,17329852.0,CYP2D6 *1/*4 is associated with decreased clea...,1.5
2774,1183615480,982044321,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/982...,17329852.0,CYP2D6 *1/*5 is associated with decreased clea...,1.5


In [72]:
cl[cl["Clinical Annotation ID"]==caid]

Unnamed: 0,Clinical Annotation ID,Variant/Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
875,1183615480,"CYP2D6*1, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6...",CYP2D6,3,,Tier 1 VIP,7.5,Metabolism/PK,4,11,carvedilol,Heart Diseases,22/04/2021,https://www.pharmgkb.org/clinicalAnnotation/11...,


In [66]:
his[his["Clinical Annotation ID"]==caid]

Unnamed: 0,Clinical Annotation ID,Date (YYYY-MM-DD),Type,Comment
1989,1183615480,18/02/2017,Update,
1990,1183615480,12/09/2018,Update,Removed inaccurate variant annotation on PMID ...
1991,1183615480,05/11/2018,Update,Updated OMB race to appropriate biogeographica...
1992,1183615480,05/02/2021,Update,Added PMIDs 16849011 and 21599570 to evidence....
1993,1183615480,24/03/2021,Update,CA score added as part of scoring system relea...
1994,1183615480,22/04/2021,Update,Added sentence about DPWG 'no recommendation' ...


In [50]:
#create temporal smaller files to work on:

df = pd.read_csv(os.path.join(processed_path, "clinical_annotation.csv"))
df = df.head(1000)
df.to_csv(os.path.join(processed_path, "clinann_tmp.csv"), index=False)

df = pd.read_csv(os.path.join(pharmgkb_path, "clinicalAnnotations", "clinical_ann_alleles.csv"))
df = df.head(1000)
df.to_csv(os.path.join(processed_path, "clinann_all_tmp.csv"), index=False)

In [52]:
h = "CYP2C9*1"
"*"+h.split("*")[1]

'*1'

## PGX Relation Table

In [None]:
import pandas as pd
df = pd.read_csv("../data/pharmgkb_processed/pgx_relation.csv")
df.columns

In [None]:
print("Association IDs: "+str(len(set(df["aid"]))))
print("Genomic variation: "+str(len(set(df["genomic_variation"]))))
print("Variant IDs: "+str(len(set(df["vid"]))))
print("Haplotype IDs: "+str(len(set(df["hid"]))))
print("Gene IDs: "+str(len(set(df["gid"]))))
print("Chemical IDs: "+str(len(set(df["cid"]))))

In [None]:
import matplotlib.pyplot as plt

bid = ["AAC","AME","SAS","EAS","EUR","LAT","NEA","OCE","SSA","MG","UNK","CST"]
counts = []

for b in bid:
    count = (df['bid'] == b).sum()
    counts += [count]

colors = ['#5F4690', '#E6F598', '#F46D43', '#FDE08B', '#66C2A5', '#CC503E', '#ABDDA4', '#3288BD', '#D53D4F', "#808080", "#808080", "#808080"]
fig, ax = plt.subplots()
ax.bar(bid, counts, color=colors)

In [None]:
ev = df["evidence"].unique().tolist()
counts = []
for e in sorted(ev):
    count = (df['evidence'] == e).sum()
    counts += [count]

colors = ['#5F4690', '#1D6996', '#38A6A5', '#0F8554', '#73AF48', '#EDAD08', '#E17C05', '#CC503E', '#94346E']
fig, ax = plt.subplots()
ax.bar(sorted(ev), counts, color=colors)

In [None]:
#most abundant genes
counts = df["gene"].value_counts()
print(counts[:20])

#most abundant compounds

counts = df["chemical"].value_counts()
print(counts[:20])