In [1]:
# -----------------------------------------------------------------------------------------------------------------------
# Alexandra Lee 
# (created January 2019) 
#
# Parse metadata to determine if there are sufficient samples with multiple 
# dose concentrations to train autencoder
# -------------------------------------------------------------------------------------------------------------------
import pandas as pd
import os
import numpy as np

import sys
from cmapPy.pandasGEXpress.parse import parse

randomState = 123
from numpy.random import seed
seed(randomState)

In [2]:
# Load arguments
metadata_file = os.path.join(os.path.dirname(os.getcwd()), "metadata","GSE92742_Broad_LINCS_inst_info.txt")

In [3]:
# Read metadata
metadata = pd.read_csv(metadata_file, sep="\t", index_col=0, dtype=str)
metadata.head(10)

Unnamed: 0_level_0,rna_plate,rna_well,pert_id,pert_iname,pert_type,pert_dose,pert_dose_unit,pert_time,pert_time_unit,cell_id
inst_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ASG001_MCF7_24H_X1_B7_DUO52HI53LO:F13,ASG001_MCF7_24H_X1,F13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
ASG001_MCF7_24H_X1_B7_DUO52HI53LO:G13,ASG001_MCF7_24H_X1,G13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
ASG001_MCF7_24H_X1_B7_DUO52HI53LO:I13,ASG001_MCF7_24H_X1,I13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
ASG001_MCF7_24H_X1_B7_DUO52HI53LO:K13,ASG001_MCF7_24H_X1,K13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
ASG001_MCF7_24H_X1_B7_DUO52HI53LO:N13,ASG001_MCF7_24H_X1,N13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P08,ASG001_MCF7_24H_X1,P08,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P12,ASG001_MCF7_24H_X1,P12,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P17,ASG001_MCF7_24H_X1,P17,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P18,ASG001_MCF7_24H_X1,P18,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P19,ASG001_MCF7_24H_X1,P19,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7


In [4]:
# Get unique pairs of (drug, cell line)
drug_cell_line_pairs = metadata.groupby(['pert_iname','cell_id']).size().reset_index().rename(columns={0:'count'})
drug_cell_line_pairs.head(5)

Unnamed: 0,pert_iname,cell_id,count
0,(+)-3-(1-propyl-piperidin-3-yl)-phenol,FIBRNPC,4
1,(+)-3-(1-propyl-piperidin-3-yl)-phenol,NEU,7
2,(+)-3-(1-propyl-piperidin-3-yl)-phenol,NPC,7
3,"(+/-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin",FIBRNPC,4
4,"(+/-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin",NEU,7


In [5]:
%%time
# Filter by dose concentration and time points
num_pairs = drug_cell_line_pairs.shape[0]

multiple_dose_conc = pd.DataFrame(columns=['drug', 'cell line', 'time point', 'drug dose', 'count'])

for index, row in drug_cell_line_pairs.iterrows():
    
    # Select samples with specific drug and cell line
    drug, cell_line = row['pert_iname'], row['cell_id']
    selected_samples = metadata.query('pert_iname == @drug & cell_id == @cell_line', inplace=False)
    
    # Group samples by time point
    timept_counts = selected_samples.groupby(['pert_time']).size().reset_index().rename(columns={0:'count'})
    
    # For each time point group determine if multiple dose concentrations were measured
    for timept in timept_counts['pert_time']:
        samples_per_timept = selected_samples.query('pert_time == @timept', inplace=False)   
        
        # Get counts for the different dose concentrations
        dose_conc_counts = samples_per_timept.groupby(['pert_dose']).size().reset_index().rename(columns={0:'count'})

        # Keep track of how many samples have multiple dose concentrations
        num_dose_conc = dose_conc_counts.shape[0]
        
        if num_dose_conc > 1:
            
            for index,row in dose_conc_counts.iterrows():
                dose_conc, count = row['pert_dose'], row['count']
                multiple_dose_conc = multiple_dose_conc.append({'drug':drug,
                                                                'cell line':cell_line,
                                                                'time point': timept,
                                                                'drug dose':dose_conc,
                                                                'count': count},
                                                               ignore_index=True)

CPU times: user 2h 49min 45s, sys: 664 ms, total: 2h 49min 46s
Wall time: 2h 49min 44s


In [14]:
multiple_dose_conc.head(10)

Unnamed: 0,drug,cell line,time point,drug dose,count
0,"1,2-propylene-glycol",HA1E,24,10.0,6
1,"1,2-propylene-glycol",HA1E,24,19.4,12
2,"1,2-propylene-glycol",HA1E,6,10.0,3
3,"1,2-propylene-glycol",HA1E,6,19.4,6
4,15-delta-prostaglandin-j2,VCAP,24,1.0,4
5,15-delta-prostaglandin-j2,VCAP,24,10.0,15
6,15-delta-prostaglandin-j2,VCAP,24,20.0,4
7,15-delta-prostaglandin-j2,VCAP,24,5.0,4
8,15-delta-prostaglandin-j2,VCAP,48,1.0,4
9,15-delta-prostaglandin-j2,VCAP,48,20.0,4


In [21]:
# Get the number of samples that have multiple dose concentrations
multiple_dose_conc['count'].sum()

223574

In [23]:
# Get the number of conditions (same drug, cell line, time point with multiple drug dose concentrations)
conditions_count = multiple_dose_conc.groupby(['drug', 'cell line', 'time point']).size().reset_index().rename(columns={0:'number of dose concentrations'})
conditions_count.shape

(11739, 4)

In [24]:
conditions_count.head(20)

Unnamed: 0,drug,cell line,time point,number of dose concentrations
0,"1,2-propylene-glycol",HA1E,24,2
1,"1,2-propylene-glycol",HA1E,6,2
2,15-delta-prostaglandin-j2,VCAP,24,4
3,15-delta-prostaglandin-j2,VCAP,48,3
4,15-delta-prostaglandin-j2,VCAP,6,4
5,3-amino-benzamide,A549,24,10
6,3-amino-benzamide,A549,6,10
7,3-amino-benzamide,MCF7,24,10
8,3-amino-benzamide,MCF7,6,10
9,7-nitroindazole,A375,6,2


In [25]:
conditions_count['number of dose concentrations'].max()

44

In [34]:
conditions_count.loc[conditions_count['number of dose concentrations'] == 44]

Unnamed: 0,drug,cell line,time point,number of dose concentrations
11655,vorinostat,MCF7,24,44
11668,vorinostat,PC3,24,44


In [49]:
# Samples from first condition with 44 doses
drug_name = 'vorinostat'
cell_line_name = 'MCF7'
time_pt = '24'
multiple_dose_conc.loc[(multiple_dose_conc['drug'] == drug_name) &
                       (multiple_dose_conc['cell line'] == cell_line_name) &
                       (multiple_dose_conc['time point'] == time_pt)]['count'].sum()

527

In [50]:
drug_name = 'vorinostat'
cell_line_name = 'PC3'
time_pt = '24'
multiple_dose_conc.loc[(multiple_dose_conc['drug'] == drug_name) &
                       (multiple_dose_conc['cell line'] == cell_line_name) &
                       (multiple_dose_conc['time point'] == time_pt)]['count'].sum()

403

In [20]:
# Output
dose_file = os.path.join(os.path.dirname(os.getcwd()), "metadata","multiple_dose_conc_counts.txt")
conditions_file = os.path.join(os.path.dirname(os.getcwd()), "metadata","conditions_counts.txt")

multiple_dose_conc.to_csv(dose_file, sep='\t')
conditions_count.to_csv(conditions_file, sep='\t')