# Parse LINCS L1000 metadata

**By Alexandra Lee** 

**created January 2019 **

Parse metadata to determine if there are sufficient samples with multiple dose concentrations to train autencoder

Also explore the metadata to determine the breakdown of different cancer types, tissue types, drug types that exist in order to determine the scope of the analysis

In [None]:
import pandas as pd
import os
import numpy as np

import sys
from cmapPy.pandasGEXpress.parse import parse

randomState = 123
from numpy.random import seed
seed(randomState)

In [2]:
# Load arguments
metadata_file = os.path.join(os.path.dirname(os.getcwd()), "metadata","GSE92742_Broad_LINCS_inst_info.txt")
cell_file = os.path.join(os.path.dirname(os.getcwd()), "metadata","GSE92742_Broad_LINCS_cell_info.txt")

In [3]:
# Read sample metadata
metadata = pd.read_table(metadata_file, index_col=None, dtype=str)
metadata.head(10)

Unnamed: 0,inst_id,rna_plate,rna_well,pert_id,pert_iname,pert_type,pert_dose,pert_dose_unit,pert_time,pert_time_unit,cell_id
0,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:F13,ASG001_MCF7_24H_X1,F13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
1,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:G13,ASG001_MCF7_24H_X1,G13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
2,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:I13,ASG001_MCF7_24H_X1,I13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
3,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:K13,ASG001_MCF7_24H_X1,K13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
4,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:N13,ASG001_MCF7_24H_X1,N13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
5,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P08,ASG001_MCF7_24H_X1,P08,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
6,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P12,ASG001_MCF7_24H_X1,P12,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
7,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P17,ASG001_MCF7_24H_X1,P17,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
8,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P18,ASG001_MCF7_24H_X1,P18,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
9,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P19,ASG001_MCF7_24H_X1,P19,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7


In [4]:
# Read cell line metadata
cell_line_metadata = pd.read_table(cell_file, index_col=None, dtype=str)
cell_line_metadata.head(10)

Unnamed: 0,cell_id,cell_type,base_cell_id,precursor_cell_id,modification,sample_type,primary_site,subtype,original_growth_pattern,provider_catalog_id,original_source_vendor,donor_age,donor_sex,donor_ethnicity
0,A375,cell line,A375,-666,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
1,A375.311,cell line,A375,A375,genetically modified to stably express Cas9 pr...,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
2,A549,cell line,A549,-666,-666,tumor,lung,non small cell lung cancer| carcinoma,adherent,CCL-185,ATCC,58,M,Caucasian
3,A549.311,cell line,A549,A549,genetically modified to stably express Cas9 p...,tumor,lung,non small cell lung cancer| carcinoma,adherent,CCL-185,ATCC,58,M,Caucasian
4,A673,cell line,A673,-666,-666,tumor,bone,ewing's sarcoma,adherent,CRL-1598,ATCC,-666,F,-666
5,AGS,cell line,AGS,-666,-666,tumor,stomach,adenocarcinoma,adherent,CRL-1739,ATCC,54,F,Caucasian
6,BT20,cell line,BT20,-666,-666,tumor,breast,carcinoma,adherent,HTB-19,ATCC,74,F,Caucasian
7,CL34,cell line,CL34,-666,-666,tumor,large intestine,colorectal adenocarcinoma,adherent,-666,DSMZ,-666,F,-666
8,CORL23,cell line,CORL23,-666,-666,tumor,lung,non small cell lung cancer| large cell carcinoma,adherent,92031919,ECACC,-666,M,-666
9,COV644,cell line,COV644,-666,-666,tumor,ovary,carcinoma| epithelial-mucinous,adherent,-666,ECACC,-666,F,-666


In [5]:
# Merge sample metadata and cell line metadata
metadata = metadata.merge(cell_line_metadata, on='cell_id', how='inner')
metadata.head(10)

Unnamed: 0,inst_id,rna_plate,rna_well,pert_id,pert_iname,pert_type,pert_dose,pert_dose_unit,pert_time,pert_time_unit,...,modification,sample_type,primary_site,subtype,original_growth_pattern,provider_catalog_id,original_source_vendor,donor_age,donor_sex,donor_ethnicity
0,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:F13,ASG001_MCF7_24H_X1,F13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,...,-666,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69,F,Caucasian
1,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:G13,ASG001_MCF7_24H_X1,G13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,...,-666,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69,F,Caucasian
2,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:I13,ASG001_MCF7_24H_X1,I13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,...,-666,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69,F,Caucasian
3,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:K13,ASG001_MCF7_24H_X1,K13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,...,-666,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69,F,Caucasian
4,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:N13,ASG001_MCF7_24H_X1,N13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,...,-666,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69,F,Caucasian
5,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P08,ASG001_MCF7_24H_X1,P08,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,...,-666,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69,F,Caucasian
6,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P12,ASG001_MCF7_24H_X1,P12,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,...,-666,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69,F,Caucasian
7,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P17,ASG001_MCF7_24H_X1,P17,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,...,-666,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69,F,Caucasian
8,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P18,ASG001_MCF7_24H_X1,P18,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,...,-666,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69,F,Caucasian
9,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P19,ASG001_MCF7_24H_X1,P19,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,...,-666,tumor,breast,adenocarcinoma,adherent,HTB-22,ATCC,69,F,Caucasian


In [6]:
# Get unique pairs of (drug, cell line)
drug_cell_line_pairs = (
    metadata
    .groupby(['pert_iname', 'pert_type', 'cell_id', 'sample_type', 'primary_site', 'subtype'])
    .size()
    .reset_index()
    .rename(columns={0:'count'})
)

drug_cell_line_pairs.head(5)

Unnamed: 0,pert_iname,pert_type,cell_id,sample_type,primary_site,subtype,count
0,(+)-3-(1-propyl-piperidin-3-yl)-phenol,trt_cp,FIBRNPC,normal,skin,-666,4
1,(+)-3-(1-propyl-piperidin-3-yl)-phenol,trt_cp,NEU,normal,-666,-666,7
2,(+)-3-(1-propyl-piperidin-3-yl)-phenol,trt_cp,NPC,primary,central nervous system,normal stem fibroblast-derived iPScs,7
3,"(+/-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin",trt_cp,FIBRNPC,normal,skin,-666,4
4,"(+/-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin",trt_cp,NEU,normal,-666,-666,7


In [7]:
%%time
# Filter by dose concentration and time points
num_pairs = drug_cell_line_pairs.shape[0]

multiple_dose_conc = pd.DataFrame(columns=['drug',
                                           'drug type',
                                           'cell line',
                                           'sample type', 
                                           'primary site',
                                           'subtype',
                                           'time point', 
                                           'drug dose',
                                           'count'])

for index, row in drug_cell_line_pairs.iterrows():
    
    # Select samples with specific drug and cell line
    drug, drug_type, cell_line, sample_type, primary_site, cancer_type = row['pert_iname'],row['pert_type'], row['cell_id'], row['sample_type'], row['primary_site'], row['subtype']
    selected_samples = metadata.query('pert_iname == @drug & cell_id == @cell_line', inplace=False)
    
    # Group samples by time point
    timept_counts = (
        selected_samples
        .groupby(['pert_time'])
        .size()
        .reset_index()
        .rename(columns={0:'count'})
    )
    
    # For each time point group determine if multiple dose concentrations were measured
    for timept in timept_counts['pert_time']:
        samples_per_timept = selected_samples.query('pert_time == @timept', inplace=False)   
        
        # Get counts for the different dose concentrations
        dose_conc_counts = (
            samples_per_timept
            .groupby(['pert_dose'])
            .size()
            .reset_index()
            .rename(columns={0:'count'})
        )

        # Keep track of how many samples have multiple dose concentrations
        num_dose_conc = dose_conc_counts.shape[0]
        
        if num_dose_conc > 1:
            
            for index,row in dose_conc_counts.iterrows():
                dose_conc, count = row['pert_dose'], row['count']
                multiple_dose_conc = multiple_dose_conc.append({'drug':drug,
                                                                'drug type': drug_type,
                                                                'cell line':cell_line,
                                                                'sample type': sample_type,
                                                                'primary site': primary_site,
                                                                'subtype': cancer_type,
                                                                'time point': timept,
                                                                'drug dose':dose_conc,
                                                                'count': count},
                                                               ignore_index=True)

CPU times: user 1h 39min 28s, sys: 1.48 s, total: 1h 39min 29s
Wall time: 1h 39min 29s


In [8]:
multiple_dose_conc.head(10)

Unnamed: 0,drug,drug type,cell line,sample type,primary site,subtype,time point,drug dose,count
0,"1,2-propylene-glycol",trt_cp,HA1E,normal,kidney,normal kidney,24,10.0,6
1,"1,2-propylene-glycol",trt_cp,HA1E,normal,kidney,normal kidney,24,19.4,12
2,"1,2-propylene-glycol",trt_cp,HA1E,normal,kidney,normal kidney,6,10.0,3
3,"1,2-propylene-glycol",trt_cp,HA1E,normal,kidney,normal kidney,6,19.4,6
4,15-delta-prostaglandin-j2,trt_cp,VCAP,tumor,prostate,carcinoma,24,1.0,4
5,15-delta-prostaglandin-j2,trt_cp,VCAP,tumor,prostate,carcinoma,24,10.0,15
6,15-delta-prostaglandin-j2,trt_cp,VCAP,tumor,prostate,carcinoma,24,20.0,4
7,15-delta-prostaglandin-j2,trt_cp,VCAP,tumor,prostate,carcinoma,24,5.0,4
8,15-delta-prostaglandin-j2,trt_cp,VCAP,tumor,prostate,carcinoma,48,1.0,4
9,15-delta-prostaglandin-j2,trt_cp,VCAP,tumor,prostate,carcinoma,48,20.0,4


In [9]:
# Get the number of samples that have multiple dose concentrations
multiple_dose_conc['count'].sum()

230865

In [10]:
# Get the number of conditions (same drug, cell line, time point with multiple drug dose concentrations)
conditions_count = (
    multiple_dose_conc
    .groupby(['drug', 'drug type', 'cell line', 'sample type', 'primary site', 'subtype', 'time point'])
    .size()
    .reset_index()
    .rename(columns={0:'number of dose concentrations'})
)

conditions_count.shape

(11957, 8)

In [11]:
conditions_count.head(20)

Unnamed: 0,drug,drug type,cell line,sample type,primary site,subtype,time point,number of dose concentrations
0,"1,2-propylene-glycol",trt_cp,HA1E,normal,kidney,normal kidney,24,2
1,"1,2-propylene-glycol",trt_cp,HA1E,normal,kidney,normal kidney,6,2
2,15-delta-prostaglandin-j2,trt_cp,VCAP,tumor,prostate,carcinoma,24,4
3,15-delta-prostaglandin-j2,trt_cp,VCAP,tumor,prostate,carcinoma,48,3
4,15-delta-prostaglandin-j2,trt_cp,VCAP,tumor,prostate,carcinoma,6,4
5,3-amino-benzamide,trt_cp,A549,tumor,lung,non small cell lung cancer| carcinoma,24,10
6,3-amino-benzamide,trt_cp,A549,tumor,lung,non small cell lung cancer| carcinoma,6,10
7,3-amino-benzamide,trt_cp,MCF7,tumor,breast,adenocarcinoma,24,10
8,3-amino-benzamide,trt_cp,MCF7,tumor,breast,adenocarcinoma,6,10
9,7-nitroindazole,trt_cp,A375,tumor,skin,malignant melanoma,6,2


In [12]:
# What is the counts for the different subtypes
top_subtype = conditions_count['subtype'].value_counts().index[0]
print(top_subtype)
conditions_count['subtype'].value_counts()

adenocarcinoma


adenocarcinoma                                          4288
carcinoma                                               2576
non small cell lung cancer| carcinoma                    917
malignant melanoma                                       734
colorectal adenocarcinoma                                732
hepatocellular carcinoma                                 701
epithelial                                               467
normal kidney                                            314
normal primary liver                                     260
kidney epithelial                                        233
embryonal kidney                                         138
lymphoma| B-cell| non-hodgkin's| histiocytic              87
bone marrow                                               81
acute myelogenous leukemia (AML)| M3 (promyelocytic)      80
acute lymphoblastic leukemia (ALL)| T-cell                64
myeloman, haematopoietic,lymphoid                         61
acute myeloid leukemia (

In [13]:
# What is the counts for the different drugs
top_drug = conditions_count['drug'].value_counts().index[0]
print(top_drug)
conditions_count['drug'].value_counts()

sirolimus


sirolimus              70
vorinostat             64
curcumin               62
parthenolide           59
manumycin-a            52
IL4                    35
INS                    35
IFNG                   35
TNF                    35
trichostatin-a         33
IGF2                   31
GAS6                   30
IL6                    30
IGF1                   30
EGF                    30
AR                     27
GDNF                   26
Y-27632                25
HGF                    25
HBEGF                  25
BTC                    25
FGF1                   25
IL1                    24
tamoxifen              24
MCSF                   24
SCF                    24
KGF                    24
TGFa                   24
IFNA                   24
BNGF                   24
                       ..
BRD-K86372560           1
CMA1                    1
tetracycline            1
BRD-K80767319           1
BRD-K88925856           1
TPO                     1
BRD-K62368279           1
CXCL13      

In [26]:
# What is the counts for the different primary tissue site
top_tissue = conditions_count['primary site'].value_counts().index[0]
print(top_tissue)
conditions_count['primary site'].value_counts()

breast


breast                                4642
prostate                              2408
lung                                  1227
liver                                  961
large intestine                        757
skin                                   736
kidney                                 685
haematopoietic and lymphoid tissue     325
bone                                    86
blood                                   61
ovary                                   29
endometrium                             15
-666                                    10
central nervous system                  10
stomach                                  5
Name: primary site, dtype: int64

In [27]:
# Filter by the top primary site (tissue types)
conditions_count[conditions_count['primary site'] == top_tissue]['subtype'].value_counts()

adenocarcinoma    3239
carcinoma          936
epithelial         467
Name: subtype, dtype: int64

In [28]:
conditions_count[(conditions_count['primary site'] == top_tissue) &
                 (conditions_count['subtype'] == top_subtype) &
                 (conditions_count['time point'] == '24')]

Unnamed: 0,drug,drug type,cell line,sample type,primary site,subtype,time point,number of dose concentrations
7,3-amino-benzamide,trt_cp,MCF7,tumor,breast,adenocarcinoma,24,10
16,7-nitroindazole,trt_cp,MCF7,tumor,breast,adenocarcinoma,24,2
35,A-443644,trt_cp,MCF7,tumor,breast,adenocarcinoma,24,4
37,A-443644,trt_cp,MDAMB231,tumor,breast,adenocarcinoma,24,3
39,A-443644,trt_cp,SKBR3,tumor,breast,adenocarcinoma,24,3
49,ABT-737,trt_cp,MCF7,tumor,breast,adenocarcinoma,24,5
51,ABT-737,trt_cp,MDAMB231,tumor,breast,adenocarcinoma,24,4
53,ABT-737,trt_cp,SKBR3,tumor,breast,adenocarcinoma,24,4
70,AG-14361,trt_cp,MCF7,tumor,breast,adenocarcinoma,24,10
76,AG-490,trt_cp,MCF7,tumor,breast,adenocarcinoma,24,4


In [41]:
conditions_count[(conditions_count['primary site'] == top_tissue) &
                 (conditions_count['subtype'] == top_subtype) &
                 (conditions_count['time point'] == '24')]['drug type'].value_counts()

trt_cp         1614
trt_lig          87
trt_sh            9
trt_oe            5
ctl_vehicle       1
Name: drug type, dtype: int64

In [17]:
multiple_dose_conc[(multiple_dose_conc['primary site'] == top_tissue) &
                 (multiple_dose_conc['subtype'] == top_subtype) &
                 (multiple_dose_conc['time point'] == '24')]

Unnamed: 0,drug,drug type,cell line,sample type,primary site,subtype,time point,drug dose,count
73,7-nitroindazole,trt_cp,PC3,tumor,prostate,adenocarcinoma,24,10.0,5
74,7-nitroindazole,trt_cp,PC3,tumor,prostate,adenocarcinoma,24,80.0,6
261,AG-14361,trt_cp,PC3,tumor,prostate,adenocarcinoma,24,10.0,4
262,AG-14361,trt_cp,PC3,tumor,prostate,adenocarcinoma,24,25.0,6
735,ASN-05257430,trt_cp,PC3,tumor,prostate,adenocarcinoma,24,10.0,4
736,ASN-05257430,trt_cp,PC3,tumor,prostate,adenocarcinoma,24,11.1,6
832,AT-9283,trt_cp,PC3,tumor,prostate,adenocarcinoma,24,0.0399999991059,3
833,AT-9283,trt_cp,PC3,tumor,prostate,adenocarcinoma,24,0.119999997318,3
834,AT-9283,trt_cp,PC3,tumor,prostate,adenocarcinoma,24,0.3700000047680001,3
835,AT-9283,trt_cp,PC3,tumor,prostate,adenocarcinoma,24,1.1100000143100002,3


In [33]:
top_drug = multiple_dose_conc[(multiple_dose_conc['primary site'] == top_tissue) &
                 (multiple_dose_conc['subtype'] == top_subtype) &
                 (multiple_dose_conc['time point'] == '24')]['drug'].value_counts().index[0]
print(top_drug)

vorinostat


In [35]:
multiple_dose_conc[(multiple_dose_conc['primary site'] == top_tissue) &
                 (multiple_dose_conc['subtype'] == top_subtype) &
                 (multiple_dose_conc['time point'] == '24') &
                  (multiple_dose_conc['drug'] == top_drug)]['count'].sum()

527

In [38]:
multiple_dose_conc[(multiple_dose_conc['primary site'] == top_tissue) &
                 (multiple_dose_conc['subtype'] == top_subtype) &
                 (multiple_dose_conc['time point'] == '24')]['count'].sum()

31219

In [19]:
conditions_count[conditions_count['primary site'] == top_tissue]['cell line'].value_counts()

VCAP    1364
PC3     1044
Name: cell line, dtype: int64

In [20]:
conditions_count['number of dose concentrations'].max()

44

In [21]:
conditions_count.loc[conditions_count['number of dose concentrations'] == 44]

Unnamed: 0,drug,drug type,cell line,sample type,primary site,subtype,time point,number of dose concentrations
11874,vorinostat,trt_cp,MCF7,tumor,breast,adenocarcinoma,24,44
11887,vorinostat,trt_cp,PC3,tumor,prostate,adenocarcinoma,24,44


In [22]:
# Samples from first condition with 44 doses
drug_name = 'vorinostat'
cell_line_name = 'MCF7'
time_pt = '24'
multiple_dose_conc.loc[(multiple_dose_conc['drug'] == drug_name) &
                       (multiple_dose_conc['cell line'] == cell_line_name) &
                       (multiple_dose_conc['time point'] == time_pt)]['count'].sum()

527

In [23]:
drug_name = 'vorinostat'
cell_line_name = 'PC3'
time_pt = '24'
multiple_dose_conc.loc[(multiple_dose_conc['drug'] == drug_name) &
                       (multiple_dose_conc['cell line'] == cell_line_name) &
                       (multiple_dose_conc['time point'] == time_pt)]['count'].sum()

403

In [24]:
# Output
dose_file = os.path.join(os.path.dirname(os.getcwd()), "metadata","multiple_dose_conc_counts.txt")
conditions_file = os.path.join(os.path.dirname(os.getcwd()), "metadata","conditions_counts.txt")

multiple_dose_conc.to_csv(dose_file, sep='\t')
conditions_count.to_csv(conditions_file, sep='\t')