In [1]:
import numpy as np
import pandas as pd
import src.mapping_function as mf

In [2]:
# Load the three input tables needed for the treatment mapping.
# Circadian treatment data: first sheet of the Excel workbook.
treatment = pd.read_excel(
    'downloads/HumCircMed2018v2.xlsx',
    sheet_name=0,
)
# DrugBank ID map (comma-separated).
drugbank = pd.read_csv('downloads/drug_links.csv')
# Disease Ontology ID map (tab-separated).
disease_ontology = pd.read_csv(
    'data/disease_doid.tsv',
    sep='\t',
)

In [3]:
# Map each treatment name to a DrugBank ID and each therapeutic area to a
# Disease Ontology ID, then add both ID lists to `treatment` as new columns.

# map treatments to drugbank ID
drug_trtmnt_id = mf.map_drugbank_id(
    list(treatment.loc[:, 'drug.trtmnt']),
    list(drugbank.loc[:, 'DrugBank ID']),
    list(drugbank.loc[:, 'Name']),
)
# map therapeutic areas to DO ID
disease_id = mf.map_disease_id(
    list(treatment.loc[:, 'therapeutic.area']),
    list(disease_ontology.loc[:, 'DOID']),
    list(disease_ontology.loc[:, 'therapeutic.area']),
)

# DataFrame.insert raises ValueError when the column already exists, so this
# cell used to fail when executed a second time. Dropping any previously
# inserted ID columns first restores the original column layout, which keeps
# the insert positions (2 and 9) valid and makes the cell safe to re-run.
treatment = treatment.drop(
    columns=['drug.trtmnt.DrugBankID', 'therapeutic.area.DOID'],
    errors='ignore',
)
# insert mapped IDs as new columns
treatment.insert(2, 'drug.trtmnt.DrugBankID', drug_trtmnt_id)
treatment.insert(9, 'therapeutic.area.DOID', disease_id)

In [4]:
# Save the treatment table (including the mapped ID columns) as a
# tab-separated file with a header row and without the row index.
treatment.to_csv(
    'data/HumCircMed2018v2_mapped.tsv',
    sep='\t',
    header=True,
    index=False,
)

In [5]:
# Load the inputs for the CircaDB / GTEx mapping.
# CircaDB data: first sheet of the Excel workbook.
circa_db = pd.read_excel(
    'downloads/aat8806_Data_file_S1.xlsx',
    sheet_name=0,
)
# Tissue-to-Uberon map (tab-separated).
tissue_uberon = pd.read_csv(
    'data/tissue_uberon.tsv',
    sep='\t',
)
# GTEx median-TPM table (tab-separated); the first two lines of the file are
# skipped before the header row is read.
gtex_exp = pd.read_csv(
    'downloads/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_median_tpm.gct',
    sep='\t',
    skiprows=2,
)

In [6]:
# Extract tissue-specific circadian scores by gene.
#
# A gene is kept only when it has exactly as many rows in `circa_db` as there
# are distinct tissues. For each kept gene this collects:
#   - all_genes_ensg: the value in column position 1 of the gene's first row
#     (presumably the Ensembl gene ID -- confirm against the sheet layout)
#   - gene_list:      the Entrez ID
#   - gene_fdr_list:  the values in column position 4, one per row of the gene
#   - gene_amp_list:  the values in column position 6, one per row of the gene
all_genes = circa_db['Entrez.ID'].unique().tolist()
all_tissues = circa_db['tissue'].unique().tolist()
tissue_len = len(all_tissues)
gene_fdr_list = []
gene_amp_list = []
gene_list = []
all_genes_ensg = []

# A single groupby pass replaces the previous per-gene scan of the whole
# table (one full boolean mask per gene). sort=False yields genes in order of
# first appearance, i.e. the same order as `all_genes`; rows with a missing
# Entrez ID are skipped by groupby, and the old equality test never matched
# them either. Indexing within the group is purely positional, so the result
# no longer depends on `circa_db` having a default 0..n-1 index.
# NOTE(review): values are stored in each gene's row order; the column labels
# built later from `all_tissues` assume every gene lists its tissues in that
# same order -- confirm this holds for the input sheet.
for gene, gene_rows in circa_db.groupby('Entrez.ID', sort=False):
    if len(gene_rows) != tissue_len:
        continue

    all_genes_ensg.append(gene_rows.iloc[0, 1])
    gene_list.append(gene)

    gene_fdr_list.append(np.array(gene_rows.iloc[:, 4]))
    gene_amp_list.append(np.array(gene_rows.iloc[:, 6]))

gene_list = np.array(gene_list)
gene_fdr_list = np.array(gene_fdr_list)
gene_amp_list = np.array(gene_amp_list)

In [7]:
# Extract tissue-specific expression (median of all samples) by gene.
# The GTEx tissue names and the tissue labels from the uberon map are passed
# to the project helper together with the kept genes and the GTEx table.
gtex_name_col = list(tissue_uberon.loc[:, 'GTEx.name'])
tissue_col = list(tissue_uberon.loc[:, 'Tissue'])
all_gene_exp = mf.map_gtex_expression(
    all_genes_ensg,
    all_tissues,
    gtex_name_col,
    tissue_col,
    gtex_exp,
)
# Convert the helper's result to an ndarray and swap its axes.
all_gene_exp = np.array(all_gene_exp).T

In [8]:
# Join FDR, amplitude and expression side by side (column-wise) into one ndarray.
combine_array = np.concatenate(
    (gene_fdr_list, gene_amp_list, all_gene_exp),
    axis=1,
)

# Column labels for the combined array: one '<tissue>_fdr', '<tissue>_amp'
# and '<tissue>_exp' label per tissue, concatenated in the same block order
# as the data above.
fdr_names = [tissue + '_fdr' for tissue in all_tissues]
amp_names = [tissue + '_amp' for tissue in all_tissues]
exp_names = [tissue + '_exp' for tissue in all_tissues]
combine_names = np.concatenate((fdr_names, amp_names, exp_names))

In [9]:
# Output the ndarray that contains FDR, amplitude and expression of all genes
# measured, as a tab-separated file: a header row ('gene_id' followed by the
# column labels), then one row per gene with the gene ID written as an integer
# and every value formatted to four decimal places.
#
# The file is opened in a `with` block so the handle is closed even if
# formatting or writing a row raises part-way through.
with open('data/circa_db_mapped.tsv', "w") as f1:
    combine_names_string = '\t'.join(combine_names)
    f1.write('gene_id\t%s\n' % combine_names_string)

    for i, combine_row in enumerate(combine_array):
        combine_string = '\t'.join(format(ca, '.4f') for ca in combine_row)
        f1.write('%d\t%s\n' % (gene_list[i], combine_string))