# Compute perturbagen signatures from gold signature consensi

In [1]:
import bz2
import json

import requests
import pandas
import sqlite3

import l1000

## Entrez Gene to Symbol mapping

In [2]:
# Read symbol to entrez_gene_id mapping
url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/symbol-map.json'
symbol_map = json.loads(requests.get(url).text)

# Read dataframe of entrez gene records
url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv'
entrez_gene_df = pandas.read_table(url)
renamer = {
    'GeneID': 'entrez_gene_id',
    'Symbol': 'symbol',
    'type_of_gene': 'type_of_gene',
    'description': 'description',
}
entrez_gene_df = entrez_gene_df.rename(columns=renamer)[list(renamer.values())]
entrez_gene_df.entrez_gene_id = entrez_gene_df.entrez_gene_id.astype(str)

## Read probe information

In [3]:
# construct probe_df
probe_df = pandas.read_table('data/geneinfo/geneinfo.tsv.gz')
probe_to_gene = dict(zip(probe_df.pr_id, probe_df.pr_gene_id))

# Landmark probes
landmark_probe_df = probe_df[probe_df.pr_pool_id.str.startswith('epsilon').fillna(False)]
print('landmark genes', len(landmark_probe_df))

# BIGS + Landmark probes
probe_df = probe_df[probe_df.is_bing.fillna(False)]
# If a gene is an epsilon landmark, do not include imputed probes
probe_df.query("pr_gene_id not in @landmark_probe_df.pr_gene_id or pr_id in @landmark_probe_df.pr_id")
print('best inferred gene set size', len(probe_df))

probe_df.head(2)

('landmark genes', 978)
('best inferred gene set size', 10638)


Unnamed: 0,pr_id,pr_gene_id,pr_gene_symbol,pr_gene_title,is_lm,is_l1000,is_bing,pr_pool_id
1831,218075_at,8086,AAAS,"achalasia, adrenocortical insufficiency, alacr...",False,True,True,inferred
1833,218434_s_at,65985,AACS,acetoacetyl-CoA synthetase,False,True,True,inferred


In [4]:
# Create a dataset of the genes represented by probes
gene_df = probe_df.groupby('pr_gene_id').apply(
    lambda df: pandas.Series({'status': 'measured' if df.pr_pool_id.iloc[0].startswith('epsilon') else 'imputed'})
)
gene_df.index.name = 'entrez_gene_id'
gene_df = gene_df.reset_index()
gene_df = gene_df.merge(entrez_gene_df).sort_values('entrez_gene_id')
gene_df.to_csv('data/consensi/genes.tsv', index=False, sep='\t')
gene_df.head(2)

Unnamed: 0,entrez_gene_id,status,symbol,type_of_gene,description
0,100,imputed,ADA,protein-coding,adenosine deaminase
1,1000,imputed,CDH2,protein-coding,"cadherin 2, type 1, N-cadherin (neuronal)"


In [5]:
# Number of genes after converting from probe to gene
gene_df.status.value_counts()

imputed     6489
measured     978
Name: status, dtype: int64

In [6]:
# Filter BING probes for valid entrez gene IDs
probe_df = probe_df.query("pr_gene_id in @gene_df.entrez_gene_id")

# Probes in BING
probe_df.pr_pool_id.value_counts()

inferred          9490
epsilon|deltap     786
epsilon            192
deltap             165
Name: pr_pool_id, dtype: int64

# Preparations

In [7]:
def run_consensi(sig_expr_df, pert_to_sigs, name):
    """Compute consensi signatures"""
    pert_expr_df = l1000.get_consensus_signatures(sig_expr_df, pert_to_sigs, weighting_subset=landmark_probe_df.pr_id)
    pert_expr_df = l1000.probes_to_genes(pert_expr_df, probe_to_gene)
    pert_expr_df = pert_expr_df.transpose()
    pert_expr_df.index.name = 'perturbagen'
    print(pert_expr_df.shape)
    path = 'data/consensi/consensi-{}.tsv.bz2'.format(name)
    with bz2.BZ2File(path, 'w') as write_file:
        pert_expr_df.reset_index().to_csv(write_file, sep='\t', index=False, float_format='%.3f')
    return pert_expr_df

In [8]:
# open database connection
connection = sqlite3.connect('data/l1000.db')

In [9]:
query = "SELECT sigs.sig_id FROM sigs WHERE sigs.is_gold = 1"
sigs = pandas.read_sql(query, connection).sig_id.tolist()

# get probes and extract signatures
probes = probe_df.pr_id.tolist()

In [10]:
import cmap.io.gct
path = 'download/l1000_n1328098x22268.gctx'
gct_object = cmap.io.gct.GCT(path)
rids = gct_object.get_gctx_rid()
cids = gct_object.get_gctx_cid()

In [11]:
# Does l1000_n1328098x22268.gctx contain all the necessary probes
set(probes) <= set(cids)

True

In [12]:
# Does l1000_n1328098x22268.gctx contain all the necessary sigs
set(sigs) <= set(rids)

False

In [13]:
len(rids)

1328098

In [14]:
len(sigs)

240901

In [15]:
len(set(rids) & set(sigs))

0

In [16]:
[x for x in rids if 'CPC005_VCAP_6H' in x][:5]

['CPC005_VCAP_6H_X1_F1B3_DUO52HI53LO:K06',
 'CPC005_VCAP_6H_X2_F1B3_DUO52HI53LO:K06',
 'CPC005_VCAP_6H_X4_B5_DUO52HI53LO:K06',
 'CPC005_VCAP_6H_X5_B5_DUO52HI53LO:K06',
 'CPC005_VCAP_6H_X1_F1B3_DUO52HI53LO:C19']

In [17]:
[x for x in sigs if 'CPC005_VCAP_6H' in x][:5]

[u'CPC005_VCAP_6H:BRD-A47494775-003-03-0:10',
 u'CPC005_VCAP_6H:BRD-A09925278-003-03-1:10',
 u'CPC005_VCAP_6H:BRD-A18419789-001-01-4:10',
 u'CPC005_VCAP_6H:BRD-A13133631-001-03-2:10',
 u'CPC005_VCAP_6H:BRD-A63998256-001-02-3:10']

# DO NOT CONTINUE WITHOUT RESOLVING IDENTIFIERS

In [None]:
%%time

# get all gold signatures
query = "SELECT sigs.sig_id FROM sigs WHERE sigs.is_gold = 1"
sigs = pandas.read_sql(query, connection).sig_id.tolist()

# get probes and extract signatures
probes = probe_df.pr_id.tolist()
path = 'download/l1000_n1328098x22268.gctx'
sig_expr_df = l1000.extract_from_gctx(path, probes, sigs)

## drugbank consensi

In [None]:
query = """
SELECT unichem.resource_id AS drugbank_id, perts.pert_id, sigs.sig_id, sigs.is_gold
FROM unichem, perts, sigs
WHERE unichem.resource = 'drugbank'
AND unichem.pert_uid = perts.pert_uid
AND sigs.pert_id = perts.pert_id
"""
sig_df = pandas.read_sql(query, connection)
sig_df = sig_df.query("is_gold == 1")

In [None]:
%%time
pert_to_sigs = {k: g['sig_id'].tolist() for k, g in sig_df.groupby('drugbank_id')}
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='drugbank')

## knockdown consensi

In [None]:
query = """
SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id
FROM perts, sigs
WHERE sigs.pert_id = perts.pert_id
AND pert_type = 'trt_sh'
AND sigs.is_gold = 1
"""
sig_df = pandas.read_sql(query, connection)
sig_df['pertubation_entrez_gene_id'] = sig_df.pert_iname.map(symbol_map.get)
sig_df.head(2)

In [None]:
%%time
# Condense to perturbagens
pert_to_sigs = {int(k): g['sig_id'].tolist() for k, g in sig_df.groupby('pertubation_entrez_gene_id')}
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='knockdown')

## overexpression consensi

In [None]:
query = """
SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id
FROM perts, sigs
WHERE sigs.pert_id = perts.pert_id
AND pert_type = 'trt_oe'
AND sigs.is_gold = 1
"""
sig_df = pandas.read_sql(query, connection)
sig_df['pertubation_entrez_gene_id'] = sig_df.pert_iname.map(symbol_map.get)
sig_df.head(2)

In [None]:
%%time
# Condense to perturbagens
pert_to_sigs = {int(k): g['sig_id'].tolist() for k, g in sig_df.groupby('pertubation_entrez_gene_id')}
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='overexpression')

## consensi for all pert_ids

In [None]:
query = """
SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id
FROM perts, sigs
WHERE sigs.pert_id = perts.pert_id
AND sigs.is_gold = 1
"""
sig_df = pandas.read_sql(query, connection)
sig_df.head(2)

In [None]:
%%time
# Condense to perturbagens
pert_to_sigs = {k: g['sig_id'].tolist() for k, g in sig_df.groupby('pert_id')}
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='pert_id')

In [None]:
pert_expr_df.head(2)