In [1]:
import collections
import functools
import gzip
import io
import json
import os

import pandas

from api import query_lincs_api

In [7]:
def get_full_query(service):
    """
    Return the full result set of an API service, cached on disk.

    The cache lives at ``data/<service>/<service>.json.gz``. When that file
    exists it is decoded and returned. Otherwise ``query_lincs_api`` is called
    with an empty query and a block size of 100, and its result is written to
    the cache before being returned.

    Parameters
    ----------
    service : str
        Name of the API service; also used as the cache directory and file
        name.

    Returns
    -------
    The JSON-decoded cache content, or on a cache miss the object returned by
    ``query_lincs_api``.
    """
    directory = os.path.join('data', service)
    # makedirs also creates the parent `data` directory on a fresh checkout,
    # where a plain mkdir would raise FileNotFoundError.
    os.makedirs(directory, exist_ok=True)

    path = os.path.join(directory, '{}.json.gz'.format(service))
    if os.path.exists(path):
        with gzip.open(path, 'rt') as read_file:
            return json.load(read_file)

    print(path, 'does not exist. Querying API')
    results = query_lincs_api(service=service, query='', block_size=100)

    # Dump to a sibling file and rename it into place, so an interrupted write
    # never leaves a truncated cache that later calls would fail to load.
    partial_path = path + '.partial'
    try:
        with gzip.open(partial_path, 'wt') as write_file:
            json.dump(results, write_file, indent=2)
        os.replace(partial_path, path)
    finally:
        # Only present here when the dump or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    return results

In [3]:
def as_dataframe(dictlist, keys):
    """
    Format a list of dictionaries returned by the api as a DataFrame.

    Only the fields named in ``keys`` are kept, in that order. A key missing
    from a record yields None. List values are flattened into a single
    ``'|'``-delimited string. The values ``-666`` and ``'-666'`` are replaced
    by None (presumably the API's missing-value marker -- confirm against the
    API documentation).

    Parameters
    ----------
    dictlist : iterable of dict
        Records to convert; may be empty.
    keys : sequence of str
        Fields to extract, which become the columns of the result.

    Returns
    -------
    pandas.DataFrame
        One row per record and one column per key. An empty ``dictlist``
        gives an empty frame that still carries the requested columns.
    """
    keys = list(keys)
    records = []
    for record in dictlist:
        row = {}
        for key in keys:
            value = record.get(key)
            if isinstance(value, list):
                # Coerce elements to str so lists of numbers can be joined too.
                value = '|'.join(map(str, value))
            if value == -666 or value == '-666':
                value = None
            row[key] = value
        records.append(row)
    # Passing `columns` fixes the column order and, unlike selecting with
    # `[keys]` afterwards, also works when there are no records at all.
    return pandas.DataFrame(records, columns=keys)

In [4]:
def key_intersect(dictlist):
    """
    Return the set of keys shared by every dictionary in ``dictlist``.

    Parameters
    ----------
    dictlist : iterable of dict
        Dictionaries whose keys are compared; may be empty.

    Returns
    -------
    set
        Keys present in all dictionaries. An empty ``dictlist`` gives an
        empty set.
    """
    key_sets = [set(d.keys()) for d in dictlist]
    if not key_sets:
        # reduce() without an initial value raises TypeError on empty input.
        return set()
    return functools.reduce(set.intersection, key_sets)

In [5]:
def write_gz_tsv(df, path):
    """
    Write a DataFrame to ``path`` as a gzip-compressed, tab-separated file.

    The index is not written. Text is encoded as UTF-8 regardless of the
    platform locale, so non-ASCII values are stored the same way everywhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export.
    path : str
        Destination file; overwritten if it already exists.
    """
    # newline='' leaves line endings to pandas; its to_csv documentation asks
    # for file objects to be opened this way to avoid doubled carriage returns.
    with gzip.open(path, 'wt', encoding='utf-8', newline='') as write_file:
        df.to_csv(write_file, sep='\t', index=False)

# `pertinfo`

In [64]:
# Load every `pertinfo` record; the API is only queried when no cached copy exists on disk.
pertinfo = get_full_query(service='pertinfo')

data/pertinfo/pertinfo.json.gz does not exist. Querying API


In [102]:
# Fields kept from each `pertinfo` record, in output column order.
keys = [
    'pert_id',
    'pert_iname',
    'pert_type',
    'num_gold',
    'num_inst',
    'num_sig',
    'in_summly',
    'pert_summary',
    'pubchem_cid',
    'molecular_formula',
    'molecular_wt',
    'pert_vendor',
    'canonical_smiles',
    'inchi_key',
    'inchi_string',
]
# Tabulate the records, export them as gzipped TSV, and preview the first rows.
pertinfo_df = as_dataframe(dictlist=pertinfo, keys=keys)
write_gz_tsv(df=pertinfo_df, path='data/pertinfo/pertinfo.tsv.gz')
pertinfo_df.head()

Unnamed: 0,pert_id,pert_iname,pert_type,num_gold,num_inst,num_sig,in_summly,pert_summary,pubchem_cid,molecular_formula,molecular_wt,pert_vendor,canonical_smiles,inchi_key,inchi_string
0,CSS001-ATTGCAT,ATTGCAT,trt_sh.css,0,0,7,,,,,,,,,
1,CSS001-GAGGATA,GAGGATA,trt_sh.css,0,0,1,,,,,,,,,
2,CSS001-TCAATGA,TCAATGA,trt_sh.css,0,0,7,,,,,,,,,
3,CSS001-TCAGTTC,TCAGTTC,trt_sh.css,0,0,7,,,,,,,,,
4,CSS001-TCCATCA,TCCATCA,trt_sh.css,0,0,1,,,,,,,,,


# `geneinfo`

In [56]:
# Load every `geneinfo` record; the API is only queried when no cached copy exists on disk.
geneinfo = get_full_query(service='geneinfo')

In [111]:
# Fields kept from each `geneinfo` record, in output column order.
keys = [
    'pr_id',
    'pr_gene_id',
    'pr_gene_symbol',
    'pr_gene_title',
    'is_lm',
    'is_l1000',
    'is_bing',
    'pr_pool_id',
]
# Tabulate the records, export them as gzipped TSV, and preview the last rows.
geneinfo_df = as_dataframe(dictlist=geneinfo, keys=keys)
write_gz_tsv(df=geneinfo_df, path='data/geneinfo/geneinfo.tsv.gz')
geneinfo_df.tail()

Unnamed: 0,pr_id,pr_gene_id,pr_gene_symbol,pr_gene_title,is_lm,is_l1000,is_bing,pr_pool_id
36585,215706_x_at,7791,ZYX,zyxin,False,True,True,inferred
36586,212601_at,23140,ZZEF1,"zinc finger, ZZ-type with EF-hand domain 1",False,True,True,inferred
36587,207189_s_at,23140,ZZEF1,"zinc finger, ZZ-type with EF-hand domain 1",False,True,False,inferred
36588,207190_at,23140,ZZEF1,"zinc finger, ZZ-type with EF-hand domain 1",False,True,False,inferred
36589,212893_at,26009,ZZZ3,"zinc finger, ZZ-type containing 3",False,True,True,inferred


In [110]:
# Tally how often each `pr_pool_id` value occurs; missing values are counted under None.
collections.Counter(geneinfo_df.pr_pool_id)

Counter({'inferred': 21097, None: 14322, 'epsilon|deltap': 786, 'deltap': 193, 'epsilon': 192})

# `siginfo`

In [None]:
# Load every `siginfo` record; the API is only queried when no cached copy exists on disk.
siginfo = get_full_query(service='siginfo')

In [99]:
# Fields kept from each `siginfo` record, in output column order.
keys = [
    'sig_id',
    'pert_id',
    'pert_itime',
    'distil_nsample',
    'pert_idose',
    'cell_id',
    'pert_type',
    'is_gold',
    'distil_ss',
    'ngenes_modulated_dn_lm',
    'ngenes_modulated_up_lm',
]
# Tabulate the records, export them as gzipped TSV, and preview the last rows.
siginfo_df = as_dataframe(dictlist=siginfo, keys=keys)
write_gz_tsv(df=siginfo_df, path='data/siginfo/siginfo.tsv.gz')
siginfo_df.tail()

Unnamed: 0,sig_id,pert_id,pert_itime,distil_nsample,pert_idose,cell_id,pert_type,is_gold,distil_ss,ngenes_modulated_dn_lm,ngenes_modulated_up_lm
476246,CGS001_VCAP_120H:NRAS:5,CGS001-4893,120 h,5,5 µL,VCAP,trt_sh.cgs,True,1.75965,0,0
476247,CGS001_VCAP_120H:PFKL:5,CGS001-5211,120 h,6,5 µL,VCAP,trt_sh.cgs,True,2.15781,2,0
476248,CGS001_VCAP_120H:STK17A:5,CGS001-9263,120 h,7,5 µL,VCAP,trt_sh.cgs,True,1.94439,0,0
476249,CGS001_VCAP_168H:FLT1:5,CGS001-2321,168 h,7,5 µL,VCAP,trt_sh.cgs,True,2.3042,2,2
476250,CGS001_VCAP_168H:PIK3CA:5,CGS001-5290,168 h,7,5 µL,VCAP,trt_sh.cgs,True,2.3303,2,1


# `cellinfo`

In [17]:
# Load every `cellinfo` record; the API is only queried when no cached copy exists on disk.
cellinfo = get_full_query(service='cellinfo')

In [19]:
# Fields kept from each `cellinfo` record, in output column order.
keys = [
    'cell_id',
    'cell_histology',
    'cell_lineage',
    'cell_source',
    'cell_source_id',
    'cell_type',
    'gender',
    'is_from_metastasis',
    'lincs_status',
    'metastatic_site',
]
# Tabulate the records, export them as gzipped TSV, and preview the last rows.
cellinfo_df = as_dataframe(dictlist=cellinfo, keys=keys)
write_gz_tsv(df=cellinfo_df, path='data/cellinfo/cellinfo.tsv.gz')
cellinfo_df.tail()

Unnamed: 0,cell_id,cell_histology,cell_lineage,cell_source,cell_source_id,cell_type,gender,is_from_metastasis,lincs_status,metastatic_site
2536,PC3,carcinoma,prostate,ATCC,CRL-1435,cancer,M,Y,core_cline|DIVR,bone
2537,RMUGS,carcinoma,ovary,HSRRB,,cancer,F,N,DIVR,
2538,THP1,"leukemia,acute monocytic","haematopoietic,lymphoid,monocyte",ATCC,TIB-202,cancer,M,,DIVR,
2539,1HAE,"lung, bronchial epithelial",lung,Jim Hogg (UBC) [obtained from DC Gruenert (UCSF)],,primary immortalized,M,N,,
2540,HPTEC,epithelium,kidney,Biopredic,,primary,male,,,
