In [1]:
import gzip
import numpy as np
import pandas as pd
import itertools
import sys
import os

In [2]:
# Make the l1ktools python package importable.
# Source hosted here: https://github.com/cmap/l1ktools
l1ktools_dir = os.path.abspath('../l1ktools/python/')
sys.path.append(l1ktools_dir)

# Project helper module (LINCS_functions.py).
# NOTE: Alter LINCS_functions.py so that it contains YOUR local path to
# l1ktools (as above) and to the LINCS modzs.gctx file.
import LINCS_functions as lf

In [3]:
# System specific path to the LINCS modzs.gctx matrix; adjust to your local download.
gctx_path = os.path.abspath('../../lincs/download/modzs.gctx')
# Fail early, and report the resolved absolute path so a wrong location is
# obvious from the error instead of a bare AssertionError.
assert os.path.exists(gctx_path), 'modzs.gctx not found at: {}'.format(gctx_path)

In [13]:
# Load the probe annotation table (pinned to a fixed commit of dhimmel/lincs)
path = 'https://github.com/dhimmel/lincs/raw/239199387156ba372dcf462c2fbc8bd9eb0682ab/data/geneinfo/geneinfo.tsv.gz'
gene_df = lf.url_to_df(path)

# Lookup table: probe id (pr_id) -> gene id string (pr_gene_id)
probe_ids = gene_df['pr_id']
gene_ids = gene_df['pr_gene_id']
probe_to_gene = dict(zip(probe_ids, gene_ids))

gene_df.head()

Unnamed: 0,pr_id,pr_gene_id,pr_gene_symbol,pr_gene_title,is_lm,is_l1000,is_bing,pr_pool_id
0,202938_x_at,100510314///100510451///27341///91695,,,False,True,False,inferred
1,204006_s_at,2214///2215,,,False,True,False,inferred
2,204060_s_at,5613///5616,,,False,True,False,inferred
3,204419_x_at,3047///3048,,,False,True,False,inferred
4,204438_at,414308///4360,,,False,True,False,inferred


In [5]:
# Probe ids flagged is_bing
is_bing = gene_df.loc[gene_df['is_bing'] == True]
bing_list = is_bing['pr_id'].tolist()

# Probe ids whose pool is one of the two epsilon pool labels
epsilon_pools = ['epsilon', 'epsilon|deltap']
is_epi = gene_df.loc[gene_df['pr_pool_id'].isin(epsilon_pools)]
epi_list = is_epi['pr_id'].tolist()

In [6]:
# construct pert_df: perturbagen id -> PubChem compound id
path = 'https://github.com/dhimmel/lincs/raw/239199387156ba372dcf462c2fbc8bd9eb0682ab/data/pertinfo/pertinfo.tsv.gz'
# Select and rename without inplace=True: rename() returns a new frame, so the
# column assignment below does not write into a slice of the downloaded table
# (the pattern pandas flags with SettingWithCopyWarning).
pert_df = lf.url_to_df(path)[['pert_id', 'pubchem_cid']].rename(columns={'pubchem_cid': 'pubchem_id'})
# when merging frames later, easier to merge strings
pert_df['pubchem_id'] = pert_df['pubchem_id'].astype(str)
pert_df.head()

Unnamed: 0,pert_id,pubchem_id
0,CSS001-ATTGCAT,
1,CSS001-GAGGATA,
2,CSS001-TCAATGA,
3,CSS001-TCAGTTC,
4,CSS001-TCCATCA,


In [7]:
# Load signature metadata and keep only gold compound-treatment signatures
path = 'https://github.com/dhimmel/lincs/raw/239199387156ba372dcf462c2fbc8bd9eb0682ab/data/siginfo/siginfo.tsv.gz'
sig_df = lf.url_to_df(path)

# Rows must satisfy both conditions: is_gold is True and pert_type is 'trt_cp'
keep_sig = (sig_df['is_gold'] == True) & (sig_df['pert_type'] == 'trt_cp')
sig_df = sig_df[keep_sig]

sig_df.head()

Unnamed: 0,sig_id,pert_id,pert_itime,distil_nsample,pert_idose,cell_id,pert_type,is_gold,distil_ss,ngenes_modulated_dn_lm,ngenes_modulated_up_lm
1,CVD001_HUH7_24H:BRD-K07762753-001-03-6:50,BRD-K07762753,24 h,2,50 µM,HUH7,trt_cp,True,14.5514,298,302
12,CPC004_VCAP_6H:BRD-A46393198-003-10-9:10,BRD-A46393198,6 h,4,10 µM,VCAP,trt_cp,True,7.06457,99,99
16,CPC005_VCAP_6H:BRD-A47494775-003-03-0:10,BRD-A47494775,6 h,4,10 µM,VCAP,trt_cp,True,3.10184,30,10
21,CPC005_VCAP_6H:BRD-A09925278-003-03-1:10,BRD-A09925278,6 h,4,10 µM,VCAP,trt_cp,True,2.72505,11,6
22,CPC005_VCAP_6H:BRD-A18419789-001-01-4:10,BRD-A18419789,6 h,4,10 µM,VCAP,trt_cp,True,3.53123,34,16


## Create consensus expression for perturbations

In [8]:
# Pull expression for the gold signatures kept in sig_df, restricted to the
# probes in epi_list, from the gctx file
sigs = sig_df['sig_id'].tolist()
sig_expr_df = lf.extract_from_gctx(gctx_path, epi_list, sigs)

                                                                                

In [9]:
# Group signature ids by perturbagen: {pert_id: [sig_id, ...]}
sigs_by_pert = sig_df.groupby('pert_id')['sig_id']
pert_to_sig_dic = {pert: members.tolist() for pert, members in sigs_by_pert}

# One consensus signature per perturbagen (pert_expr_df)
pert_expr_df = lf.get_consensus_signatures(sig_expr_df, pert_to_sig_dic)

In [14]:
# Condense to genes using the probe -> gene lookup built from gene_df.
# NOTE: this cell rebinds pert_expr_df to the helper's result, so running it a
# second time passes the already-converted frame back into probes_to_genes;
# re-run the consensus cell first if this cell needs to be repeated.
pert_expr_df = lf.probes_to_genes(pert_expr_df, probe_to_gene)

In [27]:
# Flip the frame so each row is a perturbagen, and label the row index accordingly
pert_expr_df = pert_expr_df.T
pert_expr_df.index.name = 'pert_id'

In [34]:
# Number of rows in the transposed frame, i.e. one per pert_id
len(pert_expr_df)

13072

In [28]:
# Preview the first rows: pert_id index, one column per gene id
pert_expr_df.head()

gene,10007,1001,10013,10038,10046,10049,10051,10057,10058,10059,...,9918,9924,9926,9928,993,994,9943,9961,998,9988
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BRD-A00100033,-0.065179,3.696332,-1.700519,-0.155817,-0.631788,0.214929,-0.05604,-0.741466,-1.280059,-0.242464,...,0.081531,0.690079,-0.256267,-0.289961,-0.092473,-0.878243,-0.949191,0.065545,0.069715,3.350655
BRD-A00150179,0.178658,1.264003,1.399285,0.492447,-0.336569,-0.907009,0.441127,-0.462054,0.010908,0.814729,...,-1.281036,1.16536,0.859917,-0.504816,1.617118,1.02373,0.229926,-0.482598,0.879627,0.19955
BRD-A00267231,-2.383586,2.685182,0.739994,-1.936597,-1.680068,1.42196,-1.536609,-0.622676,-1.369872,1.558413,...,0.064048,-0.44561,-1.440129,-2.776955,-0.227445,-0.67654,-0.112605,-0.083343,-1.780427,-0.523714
BRD-A00420644,2.18651,0.202994,-4.654153,-2.815903,2.550356,-0.307756,-4.25396,3.887502,-1.853218,0.478189,...,-2.109322,-1.722654,0.087142,-3.514419,-0.055212,-2.989028,1.203165,-2.632854,-0.106486,2.473428
BRD-A00474148,-0.704136,3.31484,-0.46252,0.71301,1.250462,0.955812,0.030453,-0.355433,-0.492126,-2.036731,...,-1.00996,-0.194795,-0.698215,-1.104338,0.087596,0.184683,-0.499116,-1.027581,0.513044,-0.211934


In [23]:
# SAVE pert_expr_df as a gzipped, tab-separated table with values rounded to 3 decimals.
# The path is handed straight to pandas with compression='gzip' instead of
# writing through gzip.open(path, "w"): that handle is binary on Python 3, so
# writing CSV text through it depends on the pandas version.
path = "../data/consensus-perts.tsv.gz"
pert_expr_df.reset_index().to_csv(path, sep='\t', index=False, float_format='%.3f', compression='gzip')

## Create consensus expression for DrugBank compounds

In [24]:
# Load the DrugBank -> PubChem mapping (pinned to a fixed commit of dhimmel/drugbank)
path = 'https://github.com/dhimmel/drugbank/raw/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/pubchem.tsv'
drugbank_df = pd.read_table(path)

# Store pubchem ids as strings, the same cast applied to pert_df.pubchem_id
drugbank_df['pubchem_id'] = drugbank_df['pubchem_id'].astype(str)

drugbank_df.head()

Unnamed: 0,drugbank_id,pubchem_id
0,DB00014,11980055
1,DB00014,11981235
2,DB00014,11982741
3,DB00014,16052011
4,DB00014,23581804


In [25]:
# create dataframe with meta-data for perturbagens which map to drugbank drugs
# Join keys are spelled out rather than left to pandas' default of "every
# shared column": pubchem_id links DrugBank to perturbagens, pert_id links
# perturbagens to their gold signatures. These are the only columns the frames
# share, so the result is unchanged, but an extra shared column appearing in an
# upstream table can no longer silently tighten the join.
db_meta_df = (
    drugbank_df
    .merge(pert_df, how='inner', on='pubchem_id')
    .merge(sig_df, how='inner', on='pert_id')
)
db_meta_df.head()

Unnamed: 0,drugbank_id,pubchem_id,pert_id,sig_id,pert_itime,distil_nsample,pert_idose,cell_id,pert_type,is_gold,distil_ss,ngenes_modulated_dn_lm,ngenes_modulated_up_lm
0,DB00014,23581804,BRD-A62434282,CPC010_MCF7_24H:BRD-A62434282-015-02-8:10,24 h,4,10 µM,MCF7,trt_cp,True,2.55441,10,12
1,DB00091,16404350,BRD-A38030642,CPC006_A375_24H:BRD-A38030642-001-02-0:10,24 h,5,10 µM,A375,trt_cp,True,2.79683,5,23
2,DB00091,16404350,BRD-A38030642,CPC001_PC3_24H:BRD-A38030642-001-01-2:10,24 h,6,10 µM,PC3,trt_cp,True,8.4797,169,134
3,DB00091,16404350,BRD-A38030642,CPC006_A549_24H:BRD-A38030642-001-02-0:10,24 h,3,10 µM,A549,trt_cp,True,4.6031,59,54
4,DB00091,16404350,BRD-A38030642,CPC006_HCC15_6H:BRD-A38030642-001-02-0:10,6 h,3,10 µM,HCC15,trt_cp,True,3.44276,19,21


In [26]:
# Group signature ids by DrugBank compound: {drugbank_id: [sig_id, ...]}
sigs_by_drug = db_meta_df.groupby('drugbank_id')['sig_id']
db_to_sig_dic = {drug: members.tolist() for drug, members in sigs_by_drug}

# One consensus signature per DrugBank compound (db_expr_df)
db_expr_df = lf.get_consensus_signatures(sig_expr_df, db_to_sig_dic)

In [29]:
# Condense to genes using the probe -> gene lookup built from gene_df.
# NOTE: this cell rebinds db_expr_df to the helper's result, so running it a
# second time passes the already-converted frame back into probes_to_genes;
# re-run the consensus cell first if this cell needs to be repeated.
db_expr_df = lf.probes_to_genes(db_expr_df, probe_to_gene)

In [30]:
# Transpose so each row is a DrugBank compound.
# The rows come from db_to_sig_dic, which is keyed by drugbank_id, so the index
# is labelled 'drugbank_id' (it was previously mislabelled 'pert_id', a
# copy-paste from the perturbagen section). This label becomes the first column
# header of the saved TSV; readers of that file that expect 'pert_id' need the
# matching rename.
db_expr_df = db_expr_df.transpose()
db_expr_df.index.name = 'drugbank_id'

In [35]:
# Number of rows in the transposed frame, i.e. one per DrugBank compound
len(db_expr_df)

1152

In [31]:
# Preview the first rows: one row per DrugBank compound, one column per gene id
db_expr_df.head()

Unnamed: 0_level_0,10007,1001,10013,10038,10046,10049,10051,10057,10058,10059,...,9918,9924,9926,9928,993,994,9943,9961,998,9988
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DB00014,0.648519,-1.951535,-0.132719,1.057276,0.555699,-0.983494,0.552265,1.179404,0.519932,-0.74057,...,-0.452899,0.336255,0.210925,1.084157,0.878788,-0.310546,0.334241,-1.174881,-0.560417,-0.128363
DB00091,-4.877796,2.527034,10.729866,-2.106381,-0.396075,0.993854,-11.172144,2.176819,1.137482,-9.118321,...,-3.132145,10.104579,1.223131,-9.262701,-10.679596,-6.41881,-4.387014,-1.646278,-5.402576,6.926191
DB00121,-1.899073,0.30637,0.953008,-1.185298,1.036692,-0.9174,-0.944869,0.66387,-2.268222,0.280179,...,0.197408,-0.846848,-1.732811,-0.784751,0.205345,0.027872,-0.952107,1.343383,0.180319,0.423003
DB00130,-2.340979,0.493963,0.244222,-2.472491,-0.193256,0.105368,-1.933467,-2.567367,-0.308503,1.824783,...,0.490856,-0.908186,-2.791054,-0.307965,-1.812491,-2.004951,-0.847606,0.533631,0.715944,3.000506
DB00132,-1.853676,2.649402,-1.991589,0.003451,1.16199,-3.276943,-0.517267,-0.923736,-1.964041,-0.506071,...,0.788454,-3.420558,-0.458041,1.700007,1.345593,1.042449,-4.680572,-1.013152,0.140201,-1.30939


In [33]:
# SAVE db_expr_df as a gzipped, tab-separated table with values rounded to 3 decimals.
# The path is handed straight to pandas with compression='gzip' instead of
# writing through gzip.open(path, "w"): that handle is binary on Python 3, so
# writing CSV text through it depends on the pandas version.
path = "../data/consensus-drugbank.tsv.gz"
db_expr_df.reset_index().to_csv(path, sep='\t', index=False, float_format='%.3f', compression='gzip')