In [1]:
import pandas as pd
from cmapPy.pandasGEXpress.parse import parse
import cmapPy.pandasGEXpress.write_gct as wg
import cmapPy.pandasGEXpress.write_gctx as wgx
# Input files for the GSE70138 release, given relative to the notebook's working directory.
# Expression matrix in GCTx format (parsed further down with cmapPy's `parse`).
gctx_file = "GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017-03-06.gctx"
# Tab-separated annotation tables that are loaded with pandas further down.
sig_info_file = "GSE70138_Broad_LINCS_sig_info_2017-03-06.txt"
gene_info_file = "GSE70138_Broad_LINCS_gene_info_2017-03-06.txt"
pert_info_file = "GSE70138_Broad_LINCS_pert_info_2017-03-06.txt"
cell_info_file = "GSE70138_Broad_LINCS_cell_info_2017-04-28.txt"

In [2]:
# Load the annotation tables used as column and row metadata for the matrix.

# Column annotations: one record per signature, indexed by "sig_id".
sig_info = pd.read_csv(sig_info_file, sep="\t").set_index("sig_id")

# Row annotations: one record per gene, indexed by "pr_gene_id". The id column
# is forced to text so the resulting index holds strings, not integers.
gene_info = pd.read_csv(
    gene_info_file, sep="\t", dtype={"pr_gene_id": "str"}
).set_index("pr_gene_id")

In [3]:
# Signature ids (index labels of sig_info) belonging to each of the two
# neuronal cell lines of interest.
NPC_ids = sig_info.loc[sig_info["cell_id"].eq("NPC")].index
NEU_ids = sig_info.loc[sig_info["cell_id"].eq("NEU")].index

In [26]:
# Parse only the NPC signatures (columns listed in NPC_ids) out of the GCTx file.
NPC_only_gctoo = parse(gctx_file, cid=NPC_ids)

# Attach metadata in exactly the order of the parsed matrix rather than in the
# order of the annotation tables.
# NOTE(review): cmapPy seems to validate metadata ids against data_df as sets
# only, and write_gct appears to pair ids from the metadata frames with values
# from data_df by position -- if so, a differing order would mislabel the
# output. Confirm against the cmapPy sources; the explicit alignment below is
# harmless either way. `.values` keeps the annotation tables' own index names.
NPC_only_gctoo.col_metadata_df = sig_info.loc[NPC_only_gctoo.data_df.columns.values]
NPC_only_gctoo.row_metadata_df = gene_info.loc[NPC_only_gctoo.data_df.index.values]

# Write the NPC subset out as a GCT file named "NPC".
wg.write(NPC_only_gctoo, "NPC")

In [27]:
# Release the NPC matrix before loading the next one. Guarded so this cell can
# be re-run (or run without the NPC cell) without failing on a missing name.
try:
    del NPC_only_gctoo
except NameError:
    pass

# Repeat for NEU: parse only the columns listed in NEU_ids.
NEU_only_gctoo = parse(gctx_file, cid=NEU_ids)

# Attach metadata in the order of the parsed matrix.
# NOTE(review): write_gct appears to pair metadata ids with data_df values by
# position, so the frames are aligned explicitly -- confirm against cmapPy.
# `.values` keeps the annotation tables' own index names.
NEU_only_gctoo.col_metadata_df = sig_info.loc[NEU_only_gctoo.data_df.columns.values]
NEU_only_gctoo.row_metadata_df = gene_info.loc[NEU_only_gctoo.data_df.index.values]

# Write the NEU subset out as a GCT file named "NEU".
wg.write(NEU_only_gctoo, "NEU")

In [4]:
# Perturbagen annotations, indexed by "pert_id". The -666 sentinel, whether it
# appears as text or as a number, is parsed as a missing value.
pert_info = pd.read_csv(
    pert_info_file, sep="\t", index_col="pert_id", na_values=["-666", -666]
)

In [141]:
# Example: names of the perturbagens that NPC cells were exposed to.
NPC_perts = sig_info.loc[sig_info["cell_id"] == "NPC", "pert_id"].unique()
pert_info.loc[NPC_perts, "pert_iname"].values

# (Selecting "pert_itime" instead of "pert_id" above would list the NPC
# exposure times -- left disabled in the original cell, untested here.)

array(['vorinostat', 'tanespimycin', 'DMSO', 'XMD-885', 'amuvatinib',
       'TPCA-1', 'sirolimus', 'mitoxantrone', 'ALW-II-49-7', 'tivantinib',
       'A-66', 'tivozanib', 'SB-216763', 'SAR-245409', 'CGP-60474',
       'SGI-1776', 'cabozantinib', 'BIX-02189', 'bosutinib', 'GDC-0068',
       'OSI-930', 'AZ-628', 'dinaciclib', 'PF-04217903', 'AZD-4547',
       'tyrphostin-AG-1478', 'KW-2449', 'dacomitinib', 'dasatinib',
       'KIN001-244', 'WZ-7043', 'ibrutinib', 'AST-1306', 'KIN001-266',
       'sunitinib', 'pazopanib', 'masitinib', 'rebastinib', 'PHA-767491',
       'KIN001-270', 'doramapimod', 'BMS-777607', 'GSK-1059615',
       'HG-9-91-01', 'everolimus', 'SB-525334', 'MLN-0128', 'HG-14-10-04',
       'WZ-4002', 'FR-180204', 'motesanib', 'NVP-TAE226', 'quizartinib',
       'GSK-3-inhibitor-II', 'BGT-226', 'osimertinib', 'PD-0325901',
       'BMS-509744', 'OTS-167', 'BS-181', 'idelalisib', 'AT7867',
       'vandetanib', 'NVP-BGJ398', 'veliparib', 'NPK76-II-72-1', 'MK-1775',
       '

In [130]:
# Cell line annotations, indexed by "cell_id". The -666 sentinel, whether it
# appears as text or as a number, is parsed as a missing value.
cell_info = pd.read_csv(
    cell_info_file, sep="\t", index_col="cell_id", na_values=["-666", -666]
)

In [127]:
# Number of cell lines described in the cell line metadata table (its row count).
cell_info.shape[0]

98

In [128]:
# Distinct cell lines that actually occur in the signature table, and how many
# of them there are.
cell_lines_used = pd.unique(sig_info["cell_id"])
len(cell_lines_used)

41

In [129]:
# Metadata rows for every cell line that has at least one signature.
cell_info.loc[cell_lines_used, :]

Unnamed: 0_level_0,cell_type,base_cell_id,precursor_cell_id,modification,sample_type,primary_site,subtype,original_growth_pattern,provider_catalog_id,original_source_vendor,donor_age,donor_sex,donor_ethnicity
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
A375,cell line,A375,,,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54.0,F,
A549,cell line,A549,,,tumor,lung,non small cell lung cancer| carcinoma,adherent,CCL-185,ATCC,58.0,M,Caucasian
BT20,cell line,BT20,,,tumor,breast,carcinoma,adherent,HTB-19,ATCC,74.0,F,Caucasian
HA1E,cell line,HA1E,,immortalized normal,normal,kidney,normal kidney,adherent,,,,,
HCC515,cell line,HCC515,,,normal,lung,carcinoma,adherent,,,,,
HEPG2,cell line,HEPG2,,,tumor,liver,hepatocellular carcinoma,adherent,HB-8065,ATCC,15.0,M,Caucasian
HS578T,cell line,HS578T,,,tumor,breast,carcinoma,adherent,HTB-126,ATCC,74.0,F,Caucasian
HT29,cell line,HT29,,,tumor,large intestine,colorectal adenocarcinoma,adherent,HTB-38,ATCC,44.0,F,Caucasian
MCF10A,cell line,MCF10A,,immortalized normal,normal,breast,epithelial,adherent,CRL-10317,ATCC,,,
MCF7,cell line,MCF7,,,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69.0,F,Caucasian


In [132]:
# MNEU.E, NPC.CAS9, and NPC.TAK might also be interesting: export one GCT file
# per cell line, named after the cell line.

# Release the NEU matrix first. Guarded so this cell can be re-run (or run
# without the NEU cell) without failing on a missing name.
try:
    del NEU_only_gctoo
except NameError:
    pass

for cell in ["MNEU.E", "NPC.CAS9", "NPC.TAK"]:
    # Signature ids recorded for this cell line.
    sig_ids = sig_info[sig_info["cell_id"] == cell].index
    gctoo = parse(gctx_file, cid=sig_ids)
    # Attach metadata in the order of the parsed matrix.
    # NOTE(review): write_gct appears to pair metadata ids with data_df values
    # by position, so the frames are aligned explicitly -- confirm against
    # cmapPy. `.values` keeps the annotation tables' own index names.
    gctoo.col_metadata_df = sig_info.loc[gctoo.data_df.columns.values]
    gctoo.row_metadata_df = gene_info.loc[gctoo.data_df.index.values]
    wg.write(gctoo, cell)

In [184]:
# Table of every perturbagen used in the five selected cell lines, with the
# list of lines each one appears in.

pd.set_option("display.max_rows", None)  # show the whole table, not a truncated preview
all_perts = {}
for cell in ["NPC", "NEU", "MNEU.E", "NPC.CAS9", "NPC.TAK"]:
    # set() collapses the repeated signatures of one perturbagen within a line
    for pert in set(sig_info.loc[sig_info["cell_id"] == cell, "pert_id"]):
        all_perts.setdefault(pert, []).append(cell)
df = (
    pert_info.loc[list(all_perts)]
    .sort_values("pert_iname")
    .assign(cell_lines=pd.Series(all_perts))  # aligned on pert_id
)
df
# Optional export: df.to_csv('all_perts.csv')

Unnamed: 0_level_0,canonical_smiles,inchi_key,pert_iname,pert_type,cell_lines
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BRD-K90860366,CCCc1cc2c(ncnc2s1)N1CCN(CC1)C1=NCC(C)(C)S1,SRQYLNYQAPCPIR-UHFFFAOYSA-N,1271738-62-5,trt_cp,[NPC.TAK]
BRD-K97118047,Brc1c(Br)c(Br)c2[nH]nnc2c1Br,OMZYUVOATZSGJY-UHFFFAOYSA-N,"4,5,6,7-tetrabromobenzotriazole",trt_cp,[NPC.TAK]
BRD-K49055432,Cc1nc(NC(=O)N2CCC[C@H]2C(N)=O)sc1-c1csc(n1)C(C...,HBPXWEPKNBHKAX-NSHDSACASA-N,A-66,trt_cp,"[NPC, NEU, MNEU.E, NPC.CAS9, NPC.TAK]"
BRD-K41918892,Oc1ccccc1-c1ccc(cc1)-c1csc2[nH]c(=O)c(C#N)c(O)c12,CTESJDQKVOEUOY-UHFFFAOYSA-N,A-769662,trt_cp,"[NPC, NEU, NPC.CAS9, NPC.TAK]"
BRD-K56301217,CN(C)CC[C@H](CSc1ccccc1)Nc1ccc(cc1[N+]([O-])=O...,HPLNQCPCUACXLM-PGUFJCEWSA-N,ABT-737,trt_cp,"[NPC, NEU, NPC.CAS9, NPC.TAK]"
BRD-K82928847,ONC(=O)CCCCCCNC(=O)c1cnc(nc1)N(c1ccccc1)c1ccccc1,QGZYDVAGYRLSKP-UHFFFAOYSA-N,ACY-1215,trt_cp,"[NPC, NEU, NPC.CAS9, NPC.TAK]"
BRD-K32536677,Clc1ccc(Cl)c(c1)c2ccc(/C=C(\C#N)/C(=O)Nc3cccc4...,SVENPFFEMUOOGK-SDNWHVSQSA-N,AGK-2,trt_cp,"[NPC, NEU, NPC.CAS9, NPC.TAK]"
BRD-K68191783,Cc1cn(cn1)-c1cc(cc(c1)C(F)(F)F)C(=O)Nc1ccc(C)c...,IYUFHBXMTTXZBE-UHFFFAOYSA-N,ALW-II-38-3,trt_cp,"[NPC, NEU, NPC.CAS9, NPC.TAK]"
BRD-K22096725,Cc1ccc(cc1Nc1cncc(c1)C(N)=O)C(=O)Nc1cccc(c1)C(...,SAAYRHKJHDIDPH-UHFFFAOYSA-N,ALW-II-49-7,trt_cp,"[NPC, NEU, MNEU.E, NPC.CAS9, NPC.TAK]"
BRD-K67860401,COc1ccc(CNC(=O)Nc2ncc(s2)[N+]([O-])=O)cc1,YAEMHJKFIIIULI-UHFFFAOYSA-N,AR-A014418,trt_cp,[NPC]


In [5]:
# Same perturbagen table, restricted to the NPC and NEU cell lines.
all_perts = {}
for cell in ["NPC", "NEU"]:
    # set() collapses the repeated signatures of one perturbagen within a line
    for pert in set(sig_info.loc[sig_info["cell_id"] == cell, "pert_id"]):
        all_perts.setdefault(pert, []).append(cell)
df = (
    pert_info.loc[list(all_perts)]
    .sort_values("pert_iname")
    .assign(cell_lines=pd.Series(all_perts))  # aligned on pert_id
)
df

Unnamed: 0_level_0,canonical_smiles,inchi_key,pert_iname,pert_type,cell_lines
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BRD-K49055432,Cc1nc(NC(=O)N2CCC[C@H]2C(N)=O)sc1-c1csc(n1)C(C...,HBPXWEPKNBHKAX-NSHDSACASA-N,A-66,trt_cp,"[NPC, NEU]"
BRD-K41918892,Oc1ccccc1-c1ccc(cc1)-c1csc2[nH]c(=O)c(C#N)c(O)c12,CTESJDQKVOEUOY-UHFFFAOYSA-N,A-769662,trt_cp,"[NPC, NEU]"
BRD-K56301217,CN(C)CC[C@H](CSc1ccccc1)Nc1ccc(cc1[N+]([O-])=O...,HPLNQCPCUACXLM-PGUFJCEWSA-N,ABT-737,trt_cp,"[NPC, NEU]"
BRD-K82928847,ONC(=O)CCCCCCNC(=O)c1cnc(nc1)N(c1ccccc1)c1ccccc1,QGZYDVAGYRLSKP-UHFFFAOYSA-N,ACY-1215,trt_cp,"[NPC, NEU]"
BRD-K32536677,Clc1ccc(Cl)c(c1)c2ccc(/C=C(\C#N)/C(=O)Nc3cccc4...,SVENPFFEMUOOGK-SDNWHVSQSA-N,AGK-2,trt_cp,"[NPC, NEU]"
BRD-K68191783,Cc1cn(cn1)-c1cc(cc(c1)C(F)(F)F)C(=O)Nc1ccc(C)c...,IYUFHBXMTTXZBE-UHFFFAOYSA-N,ALW-II-38-3,trt_cp,"[NPC, NEU]"
BRD-K22096725,Cc1ccc(cc1Nc1cncc(c1)C(N)=O)C(=O)Nc1cccc(c1)C(...,SAAYRHKJHDIDPH-UHFFFAOYSA-N,ALW-II-49-7,trt_cp,"[NPC, NEU]"
BRD-K67860401,COc1ccc(CNC(=O)Nc2ncc(s2)[N+]([O-])=O)cc1,YAEMHJKFIIIULI-UHFFFAOYSA-N,AR-A014418,trt_cp,[NPC]
BRD-K26838195,Fc1cccc(COc2ccc(Nc3ncnc4ccc(NC(=O)C=C)cc34)cc2...,MVZGYPSXNDCANY-UHFFFAOYSA-N,AST-1306,trt_cp,"[NPC, NEU]"
BRD-K12040459,Clc1ccc(cc1)C1(CCNCC1)c1ccc(cc1)-c1cn[nH]c1,LZMOSYUFVYJEPY-UHFFFAOYSA-N,AT7867,trt_cp,"[NPC, NEU]"


In [7]:
# Number of distinct perturbagen ids across all signatures (a missing id, if
# any, is counted as one value, as with len(unique())).
sig_info["pert_id"].nunique(dropna=False)

2170