# Map CircaDB and treatment data to Hetionet IDs

In [1]:
import numpy as np
import pandas as pd
import src.mapping_function as mf

### Circadian treatment data
Map each treatment to DrugBank IDs, and each therapeutic area to Disease Ontology IDs

In [2]:
# load the circadian treatment table from the first worksheet of the workbook
treatment = pd.read_excel(
    'downloads/HumCircMed2018v2.xlsx',
    sheet_name=0,
)
treatment.head(2)

Unnamed: 0,year,drug.trtmnt,roa,class,class.abbrev,halflife.hrs,halflife.hrs.min,therapeutic.area,therapeutic.area.bin,subjects,timepoints,effect,randomized,controlled,notes,reference
0,1997,"oxaliplatin, fluorouracil, folinic acid",iv,chemotherapy,CT,0.5,0.5,colorectal cancer,cancer,186,2,"less tox, more eff",yes,yes,circadian pump vs constant rate infusion,"F. Lévi, R. Zidani, J.-L. Misset, Randomised m..."
1,1993,oxaliplatin,iv,chemotherapy,CT,0.5,0.5,colorectal cancer,cancer,29,1,,no,no,circadian pump infusion,"F. Levi, B. Perpoint, C. Garufi, C. Focan, P. ..."


In [3]:
# load the DrugBank table (tab-separated, first row is the header);
# the URL points at a specific commit, so the download does not change over time
drugbank = pd.read_csv(
    'https://github.com/dhimmel/drugbank/raw/6b9ae386d6ba4a0eca2d66d4b0337a6e90fe81f4/data/drugbank.tsv',
    sep='\t',
    header=0,
)
drugbank.head(2)

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
0,DB00001,Lepirudin,biotech,approved,B01AE02,Antithrombins|Fibrinolytic Agents,,,Lepirudin is identical to natural hirudin exce...
1,DB00002,Cetuximab,biotech,approved,L01XC06,Antineoplastic Agents,,,Epidermal growth factor receptor binding FAB. ...


In [4]:
# load the therapeutic-area -> Disease Ontology ID lookup table (tab-separated)
disease_ontology = pd.read_csv(
    'data/disease_doid.tsv',
    sep='\t',
)
disease_ontology.head(2)

Unnamed: 0,therapeutic.area,DOID
0,advanced nasopharyngeal carcinoma (NPC),DOID:9261
1,allergic rhinitis,DOID:4481


In [5]:
# map treatments to DrugBank IDs, passing the DrugBank ID and name columns as the lookup
# (the matching itself happens inside mf.map_drugbank_id)
drug_trtmnt_id = mf.map_drugbank_id(
    treatment['drug.trtmnt'].tolist(),
    drugbank['drugbank_id'].tolist(),
    drugbank['name'].tolist(),
)
# map therapeutic areas to Disease Ontology IDs, passing the DOID and area columns as the lookup
disease_id = mf.map_disease_id(
    treatment['therapeutic.area'].tolist(),
    disease_ontology['DOID'].tolist(),
    disease_ontology['therapeutic.area'].tolist(),
)
# Place each ID column immediately after the column it was derived from.
# The position is computed from the source column name instead of a hard-coded
# number, and an existing ID column is overwritten rather than inserted again,
# so re-running this cell does not raise "cannot insert ..., already exists".
for source_col, id_col, mapped_ids in (
    ('drug.trtmnt', 'drug.trtmnt_drugbank_id', drug_trtmnt_id),
    ('therapeutic.area', 'therapeutic.area_doid', disease_id),
):
    if id_col in treatment.columns:
        treatment[id_col] = mapped_ids
    else:
        treatment.insert(treatment.columns.get_loc(source_col) + 1, id_col, mapped_ids)
treatment.head(2)

Unnamed: 0,year,drug.trtmnt,drug.trtmnt_drugbank_id,roa,class,class.abbrev,halflife.hrs,halflife.hrs.min,therapeutic.area,therapeutic.area_doid,therapeutic.area.bin,subjects,timepoints,effect,randomized,controlled,notes,reference
0,1997,"oxaliplatin, fluorouracil, folinic acid","DB00526, DB00544, NA",iv,chemotherapy,CT,0.5,0.5,colorectal cancer,DOID:9256,cancer,186,2,"less tox, more eff",yes,yes,circadian pump vs constant rate infusion,"F. Lévi, R. Zidani, J.-L. Misset, Randomised m..."
1,1993,oxaliplatin,DB00526,iv,chemotherapy,CT,0.5,0.5,colorectal cancer,DOID:9256,cancer,29,1,,no,no,circadian pump infusion,"F. Levi, B. Perpoint, C. Garufi, C. Focan, P. ..."


In [6]:
# write the treatment table (now including the mapped ID columns) as a TSV without the row index
treatment.to_csv(
    'data/HumCircMed2018v2_mapped.tsv',
    sep='\t',
    header=True,
    index=False,
)

### CircaDB data
Match CircaDB tissues to GTEx tissue names, then combine per-gene circadian FDR, relative amplitude, and GTEx median expression for every tissue

In [2]:
# load the CircaDB table from the first worksheet of the workbook
circa_db = pd.read_excel(
    'downloads/aat8806_Data_file_S1.xlsx',
    sheet_name=0,
)
circa_db.head(2)

Unnamed: 0,Gene.Symbol,Ensembl.ID,Entrez.ID,tissue,fdr,rsq,rAmp,ptr,phase,Gene.Type
0,WASH7P,ENSG00000227232,653635,Fat SQ,0.643199,0.056844,0.110338,1.248044,3.546497,unprocessed_pseudogene
1,FAM87B,ENSG00000177757,400728,Fat SQ,0.743341,0.174418,0.289392,1.814491,3.090241,lincRNA


In [3]:
# load the tissue lookup table (CircaDB tissue name, Uberon codes/terms, GTEx name)
tissue_uberon = pd.read_csv(
    'data/tissue_uberon.tsv',
    sep='\t',
)
tissue_uberon.head(2)

Unnamed: 0,tissue,exact_uberon_code,exact_uberon_term,hetionet_uberon_code,hetionet_uberon_term,gtex_name
0,Heart Atrial,UBERON:0006631,right atrium auricular region,UBERON:0002081,acardiac atrium,Heart - Atrial Appendage
1,Aorta,UBERON:0001496,ascending aorta,UBERON:0001515,thoracic aorta,Artery - Aorta


In [4]:
# load the gzipped GTEx median-TPM table; the first two lines of the file are skipped
# so that the third line is used as the column header
gtex_exp = pd.read_csv(
    'downloads/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_median_tpm.gct.gz',
    sep='\t',
    skiprows=2,
    compression='gzip',
)
gtex_exp.head(2)

Unnamed: 0,gene_id,Description,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
0,ENSG00000223972.4,DDX11L1,0.056945,0.05054,0.0746,0.03976,0.04386,0.04977,0.05878,0.089315,...,0.05417,0.05982,0.06089,0.07025,0.063895,1.76,0.05835,0.03849,0.03386,0.1175
1,ENSG00000227232.4,WASH7P,11.85,9.753,8.023,12.51,12.3,11.59,14.24,5.743,...,17.15,17.74,12.19,18.13,9.3425,17.14,19.255,21.81,17.28,8.439


In [5]:
# extract tissue-specific circadian scores by gene
all_genes = circa_db['Entrez.ID'].unique().tolist()
all_tissues = circa_db['tissue'].unique().tolist()
tissue_len = len(all_tissues)
gene_fdr_list = []
gene_amp_list = []
gene_list = []
all_genes_ensg = []

# Row positions of every gene, collected in a single pass over the table
# (instead of re-scanning the whole frame with a boolean mask once per gene).
# These are positions, not index labels, so they are valid for positional lookups
# regardless of what the frame's index looks like.
rows_by_gene = circa_db.groupby('Entrez.ID', sort=False).indices

# Plain arrays for fast positional access inside the loop; columns are addressed
# by name rather than by position.
tissue_order = {tissue: rank for rank, tissue in enumerate(all_tissues)}
tissue_rank = circa_db['tissue'].map(tissue_order).to_numpy()
ensembl_values = circa_db['Ensembl.ID'].to_numpy()
fdr_values = circa_db['fdr'].to_numpy()
amp_values = circa_db['rAmp'].to_numpy()
expected_ranks = np.arange(tissue_len)

for gene in all_genes:
    row_pos = rows_by_gene.get(gene)
    # keep only genes that have exactly one row per tissue
    # (genes with a missing Entrez ID have no group and are skipped as before)
    if row_pos is None or len(row_pos) != tissue_len:
        continue
    gene_ranks = tissue_rank[row_pos]
    if not np.array_equal(np.sort(gene_ranks), expected_ranks):
        # right number of rows but a tissue is repeated: the values could not be
        # lined up with the tissue columns, so skip rather than misalign
        continue

    # Ensembl ID taken from the gene's first row in the table
    all_genes_ensg.append(ensembl_values[row_pos[0]])
    gene_list.append(gene)

    # reorder the gene's rows to follow all_tissues, so that column k of the
    # score matrices always refers to all_tissues[k]
    aligned_pos = row_pos[np.argsort(gene_ranks, kind='stable')]
    gene_fdr_list.append(fdr_values[aligned_pos])
    gene_amp_list.append(amp_values[aligned_pos])

gene_list = np.array(gene_list)
gene_fdr_list = np.array(gene_fdr_list)
gene_amp_list = np.array(gene_amp_list)

# extract tissue-specific expression (median of all samples) by gene
all_gene_exp = mf.map_gtex_expression(
    all_genes_ensg,
    all_tissues,
    list(tissue_uberon.loc[:, 'gtex_name']),
    list(tissue_uberon.loc[:, 'tissue']),
    gtex_exp,
)
all_gene_exp = np.array(all_gene_exp)
# transposed so it can be joined column-wise with the per-gene score matrices
# (presumably the helper returns one row per tissue; verify against mf.map_gtex_expression)
all_gene_exp = np.transpose(all_gene_exp)

In [10]:
# place the FDR, amplitude and expression matrices side by side in one table
combine_array = np.concatenate(
    (gene_fdr_list, gene_amp_list, all_gene_exp),
    axis=1,
)
combine_data_df = pd.DataFrame(combine_array)
# column labels are '<tissue>_fdr', '<tissue>_amp', '<tissue>_exp',
# listed in the same block order as the matrices above
fdr_names = [tissue + '_fdr' for tissue in all_tissues]
amp_names = [tissue + '_amp' for tissue in all_tissues]
exp_names = [tissue + '_exp' for tissue in all_tissues]
combine_names = np.concatenate((fdr_names, amp_names, exp_names))
combine_data_df.columns = combine_names
# gene identifiers go in as the first column
combine_data_df.insert(0, 'gene_id', gene_list)
combine_data_df.head(2)

Unnamed: 0,gene_id,Fat SQ_fdr,Fat Visceral_fdr,Aorta_fdr,Artery Coronary_fdr,Artery Tibial_fdr,Colon_fdr,Esophagus_fdr,Heart Atrial_fdr,Liver_fdr,...,Artery Coronary_exp,Artery Tibial_exp,Colon_exp,Esophagus_exp,Heart Atrial_exp,Liver_exp,Lung_exp,Nerve Tibial_exp,Pituitary_exp,Thyroid_exp
0,653635,0.643199,0.152557,0.844302,0.771204,0.954905,0.505908,0.292828,0.695318,0.973172,...,12.3,11.59,12.72,12.303333,5.369,5.406,13.68,19.48,15.84,19.255
1,79854,0.765248,0.241189,0.047268,0.000238,0.631385,0.860202,0.081987,0.648068,0.640523,...,5.94,8.419,5.44075,4.660667,2.962,3.24,7.024,12.11,9.898,9.7615


In [11]:
# write the combined FDR / amplitude / expression table as a TSV,
# without the row index and with floats formatted to four decimal places
combine_data_df.to_csv(
    'data/circa_db_mapped.tsv',
    sep='\t',
    header=True,
    index=False,
    float_format='%.4f',
)