# Compute perturbagen consensus signatures (consensi) from gold signatures

In [1]:
import gzip

import pandas
import sqlite3

import l1000

## Read probe information

In [2]:
# Load the probe annotation table and build the pr_id -> pr_gene_id lookup.
probe_df = pandas.read_table('data/geneinfo/geneinfo.tsv.gz')
# The lookup is built from every probe, before the epsilon filter below.
probe_to_gene = dict(zip(probe_df.pr_id, probe_df.pr_gene_id))
# Keep probes whose pool id starts with 'epsilon'. na=False makes rows with a
# missing pr_pool_id count as non-matches directly, so no separate fillna on
# an object-dtype mask is needed.
probe_df = probe_df[probe_df.pr_pool_id.str.startswith('epsilon', na=False)]
print(len(probe_df))
probe_df.head(2)

978


Unnamed: 0,pr_id,pr_gene_id,pr_gene_symbol,pr_gene_title,is_lm,is_l1000,is_bing,pr_pool_id
1849,201000_at,16,AARS,alanyl-tRNA synthetase,True,True,True,epsilon|deltap
1890,203192_at,10058,ABCB6,"ATP-binding cassette, sub-family B (MDR/TAP), ...",True,True,True,epsilon|deltap


# Preparations

In [3]:
def run_consensi(sig_expr_df, pert_to_sigs, name):
    """
    Compute consensus signatures per perturbagen and save them as a gzipped TSV.

    Parameters
    ----------
    sig_expr_df : DataFrame
        Signature-level expression values; passed unchanged to
        `l1000.get_consensus_signatures`.
    pert_to_sigs : dict
        Maps each perturbagen key to the list of sig_ids to combine.
    name : str
        Inserted into the output path `data/consensi/consensi-{name}.tsv.gz`.

    Returns
    -------
    DataFrame
        The transposed result with the index named 'perturbagen'. The
        returned values are unrounded; only the written file is formatted
        to three decimals.
    """
    pert_expr_df = l1000.get_consensus_signatures(sig_expr_df, pert_to_sigs)
    pert_expr_df = l1000.probes_to_genes(pert_expr_df, probe_to_gene)
    pert_expr_df = pert_expr_df.transpose()
    pert_expr_df.index.name = 'perturbagen'
    print(pert_expr_df.shape)
    path = 'data/consensi/consensi-{}.tsv.gz'.format(name)
    # Let pandas open and compress the file itself. Handing to_csv a handle
    # from gzip.open(path, "w") gives it a binary stream for text output,
    # which fails on Python 3 with pandas versions that do not wrap binary
    # handles.
    pert_expr_df.reset_index().to_csv(
        path, sep='\t', index=False, float_format='%.3f', compression='gzip')
    return pert_expr_df

In [4]:
# Open the database read-only. The visible cells only run SELECT statements,
# and mode=ro turns a missing or mistyped path into an immediate error instead
# of silently creating an empty database file.
connection = sqlite3.connect('file:data/l1000.db?mode=ro', uri=True)

In [5]:
# Signature ids flagged as gold in the database
query = "SELECT sigs.sig_id FROM sigs WHERE sigs.is_gold = 1"
sigs = pandas.read_sql(sql=query, con=connection)['sig_id'].tolist()

# Probe ids remaining in probe_df after the epsilon filter
probes = probe_df['pr_id'].tolist()

# Extract expression values for those probes and signatures from the GCTX file
path = 'download/modzs.gctx'
sig_expr_df = l1000.extract_from_gctx(path, probes, sigs)

                                                                                

## Consensi for all pert_ids

In [6]:
# Pair every gold signature with the perturbagen record it belongs to
query = """
SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id
FROM perts, sigs
WHERE sigs.pert_id = perts.pert_id
AND sigs.is_gold = 1
"""
sig_df = pandas.read_sql(sql=query, con=connection)
sig_df.head(n=2)

Unnamed: 0,pert_id,pert_iname,pert_type,sig_id
0,BRD-K07762753,aminopurvalanol-a,trt_cp,CVD001_HUH7_24H:BRD-K07762753-001-03-6:50
1,BRD-A46393198,tetramisole,trt_cp,CPC004_VCAP_6H:BRD-A46393198-003-10-9:10


In [7]:
# Collect the sig_ids of each pert_id, then compute one consensus per pert_id
pert_to_sigs = sig_df.groupby('pert_id')['sig_id'].apply(list).to_dict()
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='pert_id')
pert_expr_df.head()

(38327, 978)


Unnamed: 0_level_0,10007,1001,10013,10038,10046,10049,10051,10057,10058,10059,...,9918,9924,9926,9928,993,994,9943,9961,998,9988
perturbagen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56582,-1.254363,0.089211,0.111265,-0.300563,-0.24781,0.108298,0.282719,0.14503,0.476075,-1.392895,...,-1.159032,-0.5565,0.936144,0.720292,0.609779,-0.533146,-0.1531,0.339551,-0.207581,0.447996
5981,-0.76822,1.000292,-0.614021,-2.363492,1.701089,-0.670759,-1.373671,1.237618,-0.593345,-2.643236,...,-3.173138,-2.818864,-1.473524,0.507414,-4.459481,-0.430242,-1.134777,-0.909409,-0.952527,-1.290155
7150,2.227333,1.062283,0.167462,-0.325643,-0.847363,-0.714761,-2.032643,0.551846,2.737627,-3.651151,...,-0.378593,-0.405246,0.380839,-0.545139,-1.323891,-0.624912,0.240619,0.112557,-1.91077,-0.8957
ABL1_G2A,0.899407,0.193205,1.132389,-0.801951,1.524582,-1.449656,-0.671298,-0.210271,0.602201,-0.390101,...,-1.857257,-0.093534,-0.100065,-1.094285,-0.660739,-0.006031,1.791329,0.646212,-1.164645,-0.72246
ABL1_T315I,-1.018055,-0.579503,-2.612264,0.342139,1.826274,0.258127,3.421808,-1.228036,0.380587,0.579235,...,3.230517,-1.100459,-2.619114,-0.123058,2.44456,-0.035677,0.42922,-0.320637,0.736478,2.341499


## DrugBank consensi

In [8]:
# Signatures of perturbagens that map to a DrugBank id through unichem.
# The gold filter is applied in SQL, as in the other queries of this notebook,
# so non-gold rows are never loaded into pandas. is_gold stays in the SELECT
# list so the resulting columns are unchanged.
query = """
SELECT unichem.resource_id AS drugbank_id, perts.pert_id, sigs.sig_id, sigs.is_gold
FROM unichem, perts, sigs
WHERE unichem.resource = 'drugbank'
AND unichem.pert_uid = perts.pert_uid
AND sigs.pert_id = perts.pert_id
AND sigs.is_gold = 1
"""
sig_df = pandas.read_sql(query, connection)

In [9]:
# One consensus per DrugBank id, pooling the sig_ids of all mapped pert_ids
pert_to_sigs = {
    drugbank_id: group.sig_id.tolist()
    for drugbank_id, group in sig_df.groupby('drugbank_id')
}
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='drugbank')
pert_expr_df.head()

(1170, 978)


Unnamed: 0_level_0,10007,1001,10013,10038,10046,10049,10051,10057,10058,10059,...,9918,9924,9926,9928,993,994,9943,9961,998,9988
perturbagen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DB00014,0.648519,-1.951535,-0.132719,1.057276,0.555699,-0.983494,0.552265,1.179404,0.519932,-0.74057,...,-0.452899,0.336255,0.210925,1.084157,0.878788,-0.310546,0.334241,-1.174881,-0.560417,-0.128363
DB00091,-4.877796,2.527034,10.729866,-2.106381,-0.396075,0.993854,-11.172144,2.176819,1.137482,-9.118321,...,-3.132145,10.104579,1.223131,-9.262701,-10.679596,-6.41881,-4.387014,-1.646278,-5.402576,6.926191
DB00121,-1.899073,0.30637,0.953008,-1.185298,1.036692,-0.9174,-0.944869,0.66387,-2.268222,0.280179,...,0.197408,-0.846848,-1.732811,-0.784751,0.205345,0.027872,-0.952107,1.343383,0.180319,0.423003
DB00130,-2.340979,0.493963,0.244222,-2.472491,-0.193256,0.105368,-1.933467,-2.567367,-0.308503,1.824783,...,0.490856,-0.908186,-2.791054,-0.307965,-1.812491,-2.004951,-0.847606,0.533631,0.715944,3.000506
DB00131,0.038321,1.442435,-1.206446,2.251529,0.117281,0.145681,-0.305768,2.81782,-0.844508,-2.67401,...,0.542303,1.993044,3.488261,-0.3993,2.839586,1.458824,-1.480231,1.41819,0.781102,-1.202783


## Knockdown consensi

In [10]:
# Gold signatures restricted to perturbagens of type 'trt_sh'
query = """
SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id
FROM perts, sigs
WHERE sigs.pert_id = perts.pert_id
AND pert_type = 'trt_sh'
AND sigs.is_gold = 1
"""
sig_df = pandas.read_sql(sql=query, con=connection)
sig_df.head(n=2)

Unnamed: 0,pert_id,pert_iname,pert_type,sig_id
0,TRCN0000139323,PTMS,trt_sh,DER001_HA1E_96H:TRCN0000139323:-666
1,TRCN0000005420,RIOK3,trt_sh,DER001_HA1E_96H:TRCN0000005420:-666


In [11]:
# Pool signatures by pert_iname so that all pert_ids sharing a name
# contribute to a single consensus
pert_to_sigs = dict(
    (iname, group['sig_id'].tolist())
    for iname, group in sig_df.groupby('pert_iname')
)
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='knockdown')
pert_expr_df.head()

(4363, 978)


Unnamed: 0_level_0,10007,1001,10013,10038,10046,10049,10051,10057,10058,10059,...,9918,9924,9926,9928,993,994,9943,9961,998,9988
perturbagen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
61E3.4,0.798785,0.16528,-0.129336,-1.634618,0.399336,0.37825,0.135604,-0.225772,0.478346,-3.423953,...,-1.319634,0.400051,-0.5258,-0.631811,-0.750916,-0.333545,0.390227,0.41845,-0.498883,0.794844
A2M,0.648547,-0.247698,-0.295457,2.186343,2.914298,-3.642328,1.963178,0.602741,1.512531,0.575082,...,-3.000993,0.94821,0.96744,0.779634,2.604704,1.947117,3.807188,1.94233,0.849494,-0.612766
AAK1,-1.328346,2.701878,-0.015586,-1.444498,0.937115,1.162105,-2.515015,-2.138751,0.107793,-0.928829,...,1.847077,0.598482,-1.968194,-1.869371,-3.422654,-3.14469,0.02082,0.642789,-4.437894,-0.683096
AARS,2.261713,-0.159598,-0.895977,0.545963,0.485407,-0.245822,0.515724,1.539533,-0.862967,-0.906111,...,-0.243882,-0.30469,-1.207788,1.679649,2.389303,0.337703,1.64615,-0.140159,-0.676427,0.13483
AATF,0.315349,0.620911,0.101233,-3.402492,4.239397,1.968189,-5.048484,3.755463,-0.396552,-2.310335,...,-0.645077,-2.126604,-6.248747,-3.175084,-3.512846,-5.376611,-0.448866,-0.936199,-2.491926,1.230615


## Overexpression consensi

In [12]:
# Gold signatures restricted to perturbagens of type 'trt_oe'
query = """
SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id
FROM perts, sigs
WHERE sigs.pert_id = perts.pert_id
AND pert_type = 'trt_oe'
AND sigs.is_gold = 1
"""
sig_df = pandas.read_sql(sql=query, con=connection)
sig_df.head(n=2)

Unnamed: 0,pert_id,pert_iname,pert_type,sig_id
0,CMAP-HSF-DSTYK,DSTYK,trt_oe,HSF001_HEK293T_48H:CMAP-HSF-DSTYK:200
1,CMAP-HSF-DYRK1B,DYRK1B,trt_oe,HSF001_HEK293T_48H:CMAP-HSF-DYRK1B:200


In [13]:
# Pool signatures by pert_iname and compute one consensus per name
pert_to_sigs = {
    iname: sig_ids.tolist()
    for iname, sig_ids in sig_df.groupby('pert_iname')['sig_id']
}
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='overexpression')
pert_expr_df.head()

(2471, 978)


Unnamed: 0_level_0,10007,1001,10013,10038,10046,10049,10051,10057,10058,10059,...,9918,9924,9926,9928,993,994,9943,9961,998,9988
perturbagen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,0.284042,-0.605666,-0.382481,1.000352,-1.097887,1.173788,-0.229661,-0.339811,-0.003036,0.249115,...,-0.030893,1.540295,-1.099515,0.077211,0.42401,-0.480133,-0.393938,-0.94527,-0.323681,0.209589
ABAT,0.237393,0.771777,0.368586,-0.212831,0.965637,0.062695,0.164361,0.154535,0.180123,-0.308108,...,-0.055624,0.010791,-1.657803,-1.899302,0.21014,1.519556,-0.435645,0.513418,0.5921,-1.207425
ABCB5,-0.782926,0.755979,0.668592,1.746648,4.470254,-1.378287,0.407308,0.359877,0.119718,-2.23749,...,-0.364642,1.766631,1.006817,-0.81244,-0.179819,-0.64085,-0.330633,1.018453,-1.253745,-0.410475
ABCF2,-0.770663,-0.026031,-0.6003,0.540287,-1.18803,-0.264354,0.211165,-0.69999,-0.26466,0.393154,...,-0.860969,-1.411077,-0.155398,0.638356,0.37415,0.171606,0.18054,-0.037088,-1.103739,0.558472
ABCF3,0.951937,0.243125,0.511048,1.399538,3.078651,-1.630107,1.54619,0.068404,0.486363,-0.544521,...,-1.101537,0.391974,-0.043784,1.065096,0.80408,0.504707,0.686367,0.724826,-0.923785,-0.868042
