# Process LINCS Gene Expression Data into Training and Testing Data

Subset L1000 Gene Expression data to A549 and the Perturbations captured in the Cell Painting Pilot.
Split the resulting profiles into 85% training and 15% testing data.
The split is stratified by perturbation (`pert_id`), so each perturbation is represented in the same proportion in both sets.

In [1]:
import os
import numpy as np
import pandas as pd

from cmapPy.pandasGEXpress import parse
from sklearn.model_selection import train_test_split

In [2]:
# Seed NumPy's global random number generator so later random operations are repeatable.
# NOTE(review): train_test_split below is called without random_state and presumably
# draws from this global state — confirm against the scikit-learn documentation.
np.random.seed(123)

In [3]:
# Fraction of profiles held out for testing (passed to train_test_split as test_size)
test_proportion = 0.15

In [4]:
# Preview read: restrict the parse to the first two sample columns (cidx=[0, 1])
# so the row (gene) ids can be inspected without loading the whole matrix
gctx_name = "GSE92742_Broad_LINCS_Level4_ZSPCINF_mlr12k_n1319138x12328.gctx"
file = os.path.join("data", gctx_name)
df = parse.parse(file, cidx=[0, 1])

In [5]:
# Gene annotation table; the first column (pr_gene_id) becomes the index
gene_file = os.path.join("data", "GSE92742_Broad_LINCS_gene_info.txt.gz")
gene_df = pd.read_csv(gene_file, sep="\t", index_col=0)

# Landmark genes are the rows flagged with pr_is_lm == 1
is_landmark_gene = gene_df.pr_is_lm == 1
landmark_gene_df = gene_df.loc[is_landmark_gene]
landmark_gene_df.head(2)

Unnamed: 0_level_0,pr_gene_symbol,pr_gene_title,pr_is_lm,pr_is_bing
pr_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
780,DDR1,discoidin domain receptor tyrosine kinase 1,1,1
7849,PAX8,paired box 8,1,1


## Subset L1000 to Landmark Genes

In [6]:
# Integer row positions (within the GCTX row index) of the landmark genes.
# The gene ids are cast to str before being matched against the GCTX row ids.
landmark_ids = landmark_gene_df.index.astype(str)
row_is_landmark = df.data_df.index.isin(landmark_ids)
landmark_indices = list(np.flatnonzero(row_is_landmark))

In [7]:
# Second read of the GCTX file: all sample columns, landmark-gene rows only.
# NOTE: this rebinds `df` from the parsed object to its expression DataFrame.
landmark_gctoo = parse.parse(file, ridx=landmark_indices)
df = landmark_gctoo.data_df
del landmark_gctoo  # only the expression matrix is kept

print(df.shape)
df.head(2)

(978, 1319138)


cid,CPC005_A375_6H_X1_B3_DUO52HI53LO:K06,CPC005_A375_6H_X2_B3_DUO52HI53LO:K06,CPC005_A375_6H_X3_B3_DUO52HI53LO:K06,CPC005_A375_6H_X1_B3_DUO52HI53LO:C19,CPC005_A375_6H_X2_B3_DUO52HI53LO:C19,CPC005_A375_6H_X3_B3_DUO52HI53LO:C19,CPC004_A375_6H_X1_B3_DUO52HI53LO:K13,CPC004_A375_6H_X2_B3_DUO52HI53LO:K13,CPC004_A375_6H_X3_B3_DUO52HI53LO:K13,CPC005_A375_6H_X1_B3_DUO52HI53LO:K20,...,PCLB003_PC3_24H_X3_B13:P15,PCLB003_PC3_24H_X3_B13:P16,PCLB003_PC3_24H_X3_B13:P17,PCLB003_PC3_24H_X3_B13:P18,PCLB003_PC3_24H_X3_B13:P19,PCLB003_PC3_24H_X3_B13:P20,PCLB003_PC3_24H_X3_B13:P21,PCLB003_PC3_24H_X3_B13:P22,PCLB003_PC3_24H_X3_B13:P23,PCLB003_PC3_24H_X3_B13:P24
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5720,0.8509,0.6964,0.7763,-0.7216,-0.5528,-0.8905,1.1575,-10.4761,-0.222,-0.4631,...,-2.787,-0.9472,-1.201,0.1184,2.1614,0.4863,-0.0174,1.3497,0.7177,0.6071
466,0.0117,-1.1202,-1.2383,0.2762,-1.9274,-1.1556,1.6463,2.9997,-0.1231,-0.918,...,4.7663,4.8015,1.3907,0.8861,-2.5416,4.4342,-1.2144,4.7777,0.6917,0.2969


## Align with Cell Painting Data

In [8]:
# Identifiers of the Cell Painting project and of the batch within it
project_name = "2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad"
batch_name = "2016_04_01_a549_48hr_batch1"

# ~/bucket/projects/<project_name>/workspace/backend/<batch_name>
project_dir = os.path.join("~", "bucket", "projects", project_name)
data_dir = os.path.join(project_dir, "workspace", "backend", batch_name)

In [9]:
# The batch-level Cell Painting profile file is named after the batch itself
cp_file = os.path.join(data_dir, batch_name + ".csv")
cp_df = pd.read_csv(cp_file, low_memory=False)

# Overwrite the perturbation name recorded for two specific compounds
pert_iname_fixes = {
    "BRD-K60230970": "MG-132",
    "BRD-K50691590": "bortezomib",
}
for broad_id, fixed_iname in pert_iname_fixes.items():
    cp_df.loc[cp_df.Metadata_pert_id == broad_id, "Metadata_pert_iname"] = fixed_iname

# Label DMSO wells as "DMSO" in both the name and the id column
is_dmso = cp_df.Metadata_broad_sample == "DMSO"
cp_df.loc[is_dmso, ["Metadata_pert_iname", "Metadata_pert_id"]] = "DMSO"

print(cp_df.shape)
cp_df.head(2)

(10752, 426)


Unnamed: 0,Metadata_broad_sample,Metadata_mmoles_per_liter,Metadata_moa,Metadata_pert_id,Metadata_pert_id_vendor,Metadata_pert_idose,Metadata_pert_iname,Metadata_pert_mfc_desc,Metadata_pert_mfc_id,Metadata_pert_vehicle,...,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_DNA_20_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_20_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_5_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_RNA_10_0
0,BRD-A00147595-001-01-5,0.041152,insulin sensitizer|PPAR receptor partial agonist,BRD-A00147595,,0.041152,balaglitazone,balaglitazone,BRD-A00147595-001-01-5,DMSO,...,0.081673,0.650381,0.804025,-0.143705,0.595971,-0.874259,0.058965,-0.245046,-0.012409,-1.354363
1,BRD-A00147595-001-01-5,0.123457,insulin sensitizer|PPAR receptor partial agonist,BRD-A00147595,,0.123457,balaglitazone,balaglitazone,BRD-A00147595-001-01-5,DMSO,...,0.335284,0.827858,0.998955,-0.621075,0.773121,0.098758,0.650768,0.303148,0.588129,-1.264081


In [10]:
# Per-sample (instance) metadata for the L1000 experiments
experiment_file = os.path.join("data", "GSE92742_Broad_LINCS_inst_info.txt.gz")
experiment_df = pd.read_csv(
    experiment_file,
    sep="\t",
    low_memory=False,
)

print(experiment_df.shape)
experiment_df.head(5)

(1319138, 11)


Unnamed: 0,inst_id,rna_plate,rna_well,pert_id,pert_iname,pert_type,pert_dose,pert_dose_unit,pert_time,pert_time_unit,cell_id
0,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:F13,ASG001_MCF7_24H_X1,F13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
1,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:G13,ASG001_MCF7_24H_X1,G13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
2,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:I13,ASG001_MCF7_24H_X1,I13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
3,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:K13,ASG001_MCF7_24H_X1,K13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7
4,ASG001_MCF7_24H_X1_B7_DUO52HI53LO:N13,ASG001_MCF7_24H_X1,N13,DMSO,DMSO,ctl_vehicle,0.1,%,24,h,MCF7


## Subset to Perturbations measured in Cell Painting and A549 Profiles

In [11]:
# Keep L1000 samples that (1) use a perturbation present in the Cell Painting
# data and (2) were measured in A549 cells, then renumber the rows from zero
in_cell_painting = experiment_df.pert_id.isin(cp_df.Metadata_pert_id)
is_a549 = experiment_df.cell_id == "A549"
experiment_sub_df = experiment_df.loc[in_cell_painting & is_a549].reset_index(drop=True)

print(experiment_sub_df.shape)
experiment_sub_df.head(2)

(9326, 11)


Unnamed: 0,inst_id,rna_plate,rna_well,pert_id,pert_iname,pert_type,pert_dose,pert_dose_unit,pert_time,pert_time_unit,cell_id
0,CPC005_A549_24H_X1_B3_DUO52HI53LO:J07,CPC005_A549_24H_X1,J07,BRD-K53979406,ALX-5407,trt_cp,10.0,um,24,h,A549
1,CPC005_A549_24H_X2_B3_DUO52HI53LO:J07,CPC005_A549_24H_X2,J07,BRD-K53979406,ALX-5407,trt_cp,10.0,um,24,h,A549


In [12]:
# Pick the expression columns of the selected samples (in experiment_sub_df order)
# and flip the matrix so that samples are rows and genes are columns
selected_samples = experiment_sub_df.inst_id
subset_df = df.loc[:, selected_samples].T

print(subset_df.shape)
subset_df.head(2)

(9326, 978)


rid,5720,466,6009,2309,387,3553,427,5898,23365,6657,...,9738,6793,7358,58472,50865,23200,51293,10962,10153,874
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPC005_A549_24H_X1_B3_DUO52HI53LO:J07,1.2584,-0.1467,-1.1304,0.3373,0.6018,1.3294,-0.8248,-0.1333,0.5331,-0.2269,...,0.198,-1.4723,0.4588,-0.0359,-0.8057,-0.8344,0.4992,0.9141,0.6025,0.5556
CPC005_A549_24H_X2_B3_DUO52HI53LO:J07,0.4924,0.9291,-0.133,0.9097,-0.4239,0.6745,0.3605,0.7312,-0.8416,-0.4439,...,-0.8413,-1.0246,-0.2107,0.8534,-0.4369,0.2388,0.7536,0.2691,-0.4988,0.7143


## Output Training and Testing Data

In [13]:
# Stratified split on pert_id; the labels come from experiment_sub_df, whose row
# order subset_df was built to follow
pert_labels = experiment_sub_df.pert_id
train_x, test_x = train_test_split(
    subset_df,
    test_size=test_proportion,
    stratify=pert_labels,
)

In [14]:
print(train_x.shape)

# Write the training matrix as a gzip-compressed, tab-separated file.
# NOTE: this rebinds `file`, which earlier held the GCTX input path.
# NOTE(review): index=False leaves the sample ids (row index) out of the file, so
# rows cannot be traced back to a profile afterwards — confirm that is intended.
file = os.path.join("data", "expr_train_data.tsv.gz")
# Request gzip explicitly instead of relying on pandas inferring it from ".gz"
train_x.to_csv(file, sep='\t', index=False, compression="gzip")

(7927, 978)


In [15]:
print(test_x.shape)

# Write the testing matrix as a gzip-compressed, tab-separated file.
# NOTE: this rebinds `file`, which earlier held the GCTX input path.
# NOTE(review): index=False leaves the sample ids (row index) out of the file, so
# rows cannot be traced back to a profile afterwards — confirm that is intended.
file = os.path.join("data", "expr_test_data.tsv.gz")
# Request gzip explicitly instead of relying on pandas inferring it from ".gz"
test_x.to_csv(file, sep='\t', index=False, compression="gzip")

(1399, 978)
