In [1]:
import numpy as np
import pandas as pd
from cmapPy.pandasGEXpress.parse import parse
import cmapPy.pandasGEXpress.write_gct as wg
import cmapPy.pandasGEXpress.write_gctx as wgx

# Dataset used throughout this notebook: GSE92742.

# Expression matrix (Level5 COMPZ.MODZ, n473647x12328 per the file name); about 20 GB on disk.
gctx_file = "GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx"

# Companion metadata tables for the same dataset.
cell_info_file = "GSE92742_Broad_LINCS_cell_info.txt"
pert_info_file = "GSE92742_Broad_LINCS_pert_info.txt"
gene_info_file = "GSE92742_Broad_LINCS_gene_info.txt"
sig_info_file = "GSE92742_Broad_LINCS_sig_info.txt"

# GSE92743 is not used: it is RNA-seq data for verifying the accuracy of L1000
# and involves no perturbagens, so it is not very helpful here.

In [2]:
# get metadata

# columns: signature metadata, indexed by sig_id
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which is the remedy pandas itself suggests for a
# mixed-type DtypeWarning.
# NOTE(review): assumes this read is the one that raised the warning seen in
# this cell's output -- confirm on the next run.
sig_info = pd.read_csv(
    sig_info_file,
    sep="\t",
    low_memory=False
).set_index("sig_id")

# rows: gene metadata, indexed by pr_gene_id
# pr_gene_id is forced to str so the index holds text ids rather than integers.
# set_index is chained (not inplace) so each frame is built in one expression.
gene_info = pd.read_csv(
    gene_info_file,
    sep="\t",
    dtype={'pr_gene_id': 'str'}
).set_index("pr_gene_id")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# get perturbagen metadata, indexed by pert_id
# Both the string "-666" and the integer -666 are read as missing values.
pert_info = pd.read_csv(
    pert_info_file,
    sep="\t",
    na_values=["-666", -666],
    index_col="pert_id"
)

# Store PubChem CIDs as integers. Anything missing or non-numeric is coerced
# to NaN and then replaced by 0, so a CID of 0 means "no CID available".
# The fill value is the integer 0 (not the string '0') so the column stays
# numeric all the way into the Int64 conversion.
pert_info['pubchem_cid'] = (
    pd.to_numeric(pert_info['pubchem_cid'], errors='coerce')
    .fillna(0)
    .astype('Int64')
)

In [4]:
# Boolean mask over sig_info selecting the signatures whose cell_id is "NPC";
# built once and reused for both lookups below.
is_npc = sig_info["cell_id"] == "NPC"

# perturbagens that NPC cells were exposed to, for example
NPC_perts = sig_info.loc[is_npc, "pert_id"].unique()
# Stored in a variable: a notebook cell only displays its final expression, so
# leaving this as a bare expression in the middle of the cell discards it.
NPC_pert_names = pert_info.loc[NPC_perts, 'pert_iname'].values

# perturbagen exposure times
NPC_times = sig_info.loc[is_npc, "pert_itime"].unique()
NPC_times

array(['96 h', '24 h', '6 h'], dtype=object)

In [5]:
# Load the cell-line metadata table, keyed by cell_id.
# The -666 placeholder (as text or as a number) is read as a missing value.
cell_info = pd.read_csv(
    cell_info_file,
    index_col="cell_id",
    na_values=["-666", -666],
    sep="\t",
)

In [6]:
# Number of cell lines listed in the metadata table (one row per cell_id).
cell_info.shape[0]

98

In [7]:
# Distinct cell_id values that occur in the signature metadata, and how many
# of them there are.
cell_lines_used = sig_info['cell_id'].unique()
cell_lines_used.size

76

In [8]:
# Metadata rows (all columns) for every cell line that has signature data.
cell_info.loc[cell_lines_used, :]

Unnamed: 0_level_0,cell_type,base_cell_id,precursor_cell_id,modification,sample_type,primary_site,subtype,original_growth_pattern,provider_catalog_id,original_source_vendor,donor_age,donor_sex,donor_ethnicity
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
CD34,primary,CD34,,,normal,bone,bone marrow,suspension,,,,,
HL60,cell line,HL60,,,tumor,haematopoietic and lymphoid tissue,acute myelogenous leukemia (AML)| M3 (promyelo...,suspension,CCL-240,ATCC,36.0,F,Caucasian
PC3,cell line,PC3,,,tumor,prostate,adenocarcinoma,mix,CRL-1435,ATCC,62.0,M,Caucasian
U937,cell line,U937,,,tumor,haematopoietic and lymphoid tissue,lymphoma| B-cell| non-hodgkin's| histiocytic,suspension,CRL-1593.2,ATCC,37.0,M,Caucasian
MCF7,cell line,MCF7,,,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69.0,F,Caucasian
A375,cell line,A375,,,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54.0,F,
HEK293T,cell line,HEK293T,,immortalized normal,normal,kidney,embryonal kidney,adherent,,,,,
A549,cell line,A549,,,tumor,lung,non small cell lung cancer| carcinoma,adherent,CCL-185,ATCC,58.0,M,Caucasian
ASC,primary,ASC,,,primary,adipose,normal primary adipocyte stem cells,adherent,,,,,
HA1E,cell line,HA1E,,immortalized normal,normal,kidney,normal kidney,adherent,,,,,


In [10]:
# Perturbagens applied to NPC and/or NEU cells, with the cell lines each was used in.

# Optional: uncomment to show every row when the table is displayed.
#pd.set_option('display.max_rows', None)

# Map pert_id -> list of cell lines (among NPC, NEU) that have a signature for it.
all_perts = {}
for cell in ("NPC", "NEU"):
    perts_in_cell = set(sig_info.loc[sig_info["cell_id"] == cell, "pert_id"])
    for pert in perts_in_cell:
        all_perts.setdefault(pert, []).append(cell)

# Look up the metadata of those perturbagens, order by name, and attach the
# per-perturbagen cell-line lists (aligned on pert_id).
pert_info2 = (
    pert_info.loc[list(all_perts)]
    .sort_values(by='pert_iname')
    .assign(cell_lines=pd.Series(all_perts))
)
pert_info2

# Optional: uncomment to export the table.
#pert_info2.to_csv('all_perts_GSE92742.csv')

Unnamed: 0_level_0,pert_iname,pert_type,is_touchstone,inchi_key_prefix,inchi_key,canonical_smiles,pubchem_cid,cell_lines
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BRD-A76934284,(+)-3-(1-propyl-piperidin-3-yl)-phenol,trt_cp,0,HTSNFXAICLXZMA,HTSNFXAICLXZMA-UHFFFAOYSA-N,CCCN1CCCC(C1)c1cccc(O)c1,55445,"[NPC, NEU]"
BRD-A18795974,"(+/-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin",trt_cp,0,BLYMJBIZMIGWFK,BLYMJBIZMIGWFK-UHFFFAOYSA-N,CCCN(CCC)C1CCc2ccc(O)cc2C1,1219,"[NPC, NEU]"
BRD-K06817181,"1,2,3,4,5,6-hexabromocyclohexane",trt_cp,1,QFQZKISCBJKVHI,QFQZKISCBJKVHI-UHFFFAOYSA-N,BrC1C(Br)C(Br)C(Br)C(Br)C1Br,74603,[NPC]
BRD-K18436203,"1,2,3,4-tetrahydroisoquinoline",trt_cp,0,UWYZHKAOTLEWKK,UWYZHKAOTLEWKK-UHFFFAOYSA-N,C1Cc2ccccc2CN1,7046,"[NPC, NEU]"
BRD-K74430258,"1,2-dichlorobenzene",trt_cp,1,IVDUVYRBLGSJAO,IVDUVYRBLGSJAO-UHFFFAOYSA-N,Clc1cccc(C=NC=Nc2cccc(Cl)c2)c1,0,[NPC]
BRD-K02603382,1-methylisoquinoline,trt_cp,0,PBYMYAJONQZORL,PBYMYAJONQZORL-UHFFFAOYSA-N,Cc1nccc2ccccc12,15592,"[NPC, NEU]"
BRD-A80928489,1-monopalmitin,trt_cp,1,QHZLMUACJMDIAE,QHZLMUACJMDIAE-UHFFFAOYSA-N,CCCCCCCCCCCCCCCC(=O)OCC(O)CO,14900,[NPC]
BRD-K31491153,1-phenylbiguanide,trt_cp,1,CUQCMXFWIMOWRP,CUQCMXFWIMOWRP-UHFFFAOYSA-N,NC(=N)NC(=N)Nc1ccccc1,4780,[NPC]
BRD-K70792160,10-DEBC,trt_cp,0,GYBXAGDWMCJZJK,GYBXAGDWMCJZJK-UHFFFAOYSA-N,CCN(CC)CCCCN1c2ccccc2Oc2ccc(Cl)cc12,10521421,[NEU]
BRD-K63784565,10-hydroxycamptothecin,trt_cp,1,HAWSQZCWOQZXHI,HAWSQZCWOQZXHI-FQEVSTJZSA-N,CC[C@@]1(O)C(=O)OCc2c1cc1-c3nc4ccc(O)cc4cc3Cn1...,97226,[NPC]


In [14]:
# many more perturbagens in this dataset

# Size of the NPC/NEU perturbagen table. This used to read `df.shape`, but
# `df` is only assigned in a much later cell, so the cell failed on a fresh
# kernel. NOTE(review): pert_info2 is assumed to be the intended frame because
# it is the only 8-column table built before this point -- confirm.
pert_info2.shape

(8246, 8)

In [5]:
# Number of distinct pert_id values across all signatures (a missing value, if
# any, is counted as one value).
sig_info["pert_id"].nunique(dropna=False)

51219

In [88]:
# Do any perturbagens correspond to molecules similar to ketamine?
# The candidate list comes from this PubChem similarity search (threshold 80):
# https://pubchem.ncbi.nlm.nih.gov/#query=CID3821%20structure&tab=similarity&similaritythreshold=80&fullsearch=true

# Match on PubChem CID: keep the perturbagens whose CID is in the candidate list.
pubchem_similar = pd.read_csv("pubchem_similar_80.csv", index_col="cid")
has_similar_cid = pert_info['pubchem_cid'].isin(pubchem_similar.index)
pert_info[has_similar_cid]

Unnamed: 0_level_0,pert_iname,pert_type,is_touchstone,inchi_key_prefix,inchi_key,canonical_smiles,pubchem_cid
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BRD-A10355991,norketamine,trt_cp,1,BEQZHFIKTBVCAU,BEQZHFIKTBVCAU-UHFFFAOYSA-N,NC1(CCCCC1=O)c2ccccc2Cl,123767


In [194]:
# by name (also includes variant names for ketamine itself):
# Collect every compound name and every '|'-separated synonym from the PubChem
# candidate list, then match them case-insensitively against pert_iname.
cmpdnames = set()
for cmpdname, cmpdsynonym in zip(pubchem_similar['cmpdname'], pubchem_similar['cmpdsynonym']):
    # Empty cells are read from the CSV as float NaN; only real strings are
    # usable names, and anything else would break the .lower()/.split() calls.
    if isinstance(cmpdname, str):
        cmpdnames.add(cmpdname)
    if isinstance(cmpdsynonym, str):
        cmpdnames.update(cmpdsynonym.split('|'))

# Lower-case once, into a set, for the membership test.
cmpdnames_lower = {name.lower() for name in cmpdnames}
pert_info[pert_info['pert_iname'].str.lower().isin(cmpdnames_lower)]
#len(cmpdnames)

Unnamed: 0_level_0,pert_iname,pert_type,is_touchstone,inchi_key_prefix,inchi_key,canonical_smiles,pubchem_cid
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BRD-A05186015,bupropion,trt_cp,1,SNPPWIUOZRMYNY,SNPPWIUOZRMYNY-UHFFFAOYSA-N,CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1,0
BRD-A10355991,norketamine,trt_cp,1,BEQZHFIKTBVCAU,BEQZHFIKTBVCAU-UHFFFAOYSA-N,NC1(CCCCC1=O)c2ccccc2Cl,123767


In [112]:
# also check GSE70138 for similar molecules
# (bupropion is a hit but not for neuronal cell lines)

# A dedicated name is used for this path so the GSE92742 `pert_info_file`
# setting is not overwritten; otherwise re-running the cell that loads
# `pert_info` would silently read the GSE70138 table instead.
pert_info_file_GSE70138 = "GSE70138_Broad_LINCS_pert_info_2017-03-06.txt"
pert_info_GSE70138 = pd.read_csv(
    pert_info_file_GSE70138,
    sep="\t",
    na_values=["-666", -666],
    index_col="pert_id"
)

# Case-insensitive name match against the collected compound names; entries
# that are not strings (e.g. NaN) are skipped so .lower() cannot fail.
similar_names_lower = {c.lower() for c in cmpdnames if isinstance(c, str)}
pert_info_GSE70138[pert_info_GSE70138['pert_iname'].str.lower().isin(similar_names_lower)]

Unnamed: 0_level_0,canonical_smiles,inchi_key,pert_iname,pert_type
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BRD-A05186015,CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1,SNPPWIUOZRMYNY-UHFFFAOYSA-N,bupropion,trt_cp


In [118]:
# Full GSE92742 metadata rows for the two name/CID hits (norketamine, bupropion).
pert_info.loc[["BRD-A10355991", "BRD-A05186015"], :]

Unnamed: 0_level_0,pert_iname,pert_type,is_touchstone,inchi_key_prefix,inchi_key,canonical_smiles,pubchem_cid
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BRD-A10355991,norketamine,trt_cp,1,BEQZHFIKTBVCAU,BEQZHFIKTBVCAU-UHFFFAOYSA-N,NC1(CCCCC1=O)c2ccccc2Cl,123767
BRD-A05186015,bupropion,trt_cp,1,SNPPWIUOZRMYNY,SNPPWIUOZRMYNY-UHFFFAOYSA-N,CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1,0


In [148]:
# Gene list read from paper_bagot.csv; display the 'Gene' column upper-cased.
bagot = pd.read_csv("paper_bagot.csv")
bagot['Gene'].str.upper()

0      ARRDC2
1       PLIN4
2      SLC2A1
3     TSC22D3
4       HIF3A
5      MAP3K6
6       TXNIP
7        SGK1
8      NFKBIA
9     PLEKHF1
10       RHOJ
11     CSRNP1
12       RGCC
13     MFSD2A
14    DCLRE1B
15      FGF11
16      ITGAD
17    SLC27A3
18      DDIT4
19    GADD45G
20     POLR3E
21    PPP1R3G
22    SULT1A1
Name: Gene, dtype: object

In [129]:
# Parse norketamine.gct with cmapPy's parse(); later cells read the result's
# row_metadata_df and data_df attributes.
# NOTE(review): no cell visible here writes norketamine.gct -- confirm where the
# file comes from. Also confirm make_multiindex=True is needed, since no visible
# cell uses a multi-index frame.
norketamine = parse("norketamine.gct", make_multiindex=True)

In [192]:
# Restrict the norketamine data to genes named in the bagot list (symbols are
# compared after upper-casing the bagot names).
bagot_symbols = bagot['Gene'].str.upper()
criterion = norketamine.row_metadata_df['pr_gene_symbol'].isin(bagot_symbols)

# Side by side: the gene metadata of the matching rows and the first data
# column for those same rows.
matched_metadata = norketamine.row_metadata_df[criterion]
first_signature = norketamine.data_df[criterion].iloc[:, 0]
df = pd.concat([matched_metadata, first_signature], axis=1)
df

Unnamed: 0_level_0,pr_gene_symbol,pr_gene_title,pr_is_lm,pr_is_bing,CPC017_NPC_24H:BRD-A10355991-003-01-8:10
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4792,NFKBIA,nuclear factor of kappa light polypeptide gene...,1,1,-0.473
54541,DDIT4,DNA damage inducible transcript 4,1,1,-0.4441
1831,TSC22D3,TSC22 domain family member 3,1,1,-0.5525
11000,SLC27A3,solute carrier family 27 (fatty acid transport...,1,1,-0.0737
6446,SGK1,serum/glucocorticoid regulated kinase 1,0,1,0.1272
6513,SLC2A1,solute carrier family 2 (facilitated glucose t...,0,1,0.0486
10628,TXNIP,thioredoxin interacting protein,0,1,0.4338
6817,SULT1A1,sulfotransferase family 1A member 1,0,1,-0.2094
10912,GADD45G,growth arrest and DNA damage inducible gamma,0,1,-0.2059
55718,POLR3E,polymerase (RNA) III (DNA directed) polypeptid...,0,1,0.0


In [189]:
# Mean absolute value of each data column, taken over all rows; gives a
# baseline against which the per-gene values can be judged.
norketamine.data_df.abs().mean(axis=0)

cid
CPC017_NPC_24H:BRD-A10355991-003-01-8:10    0.464671
dtype: float32