# Subset training compendium

This notebook subsets the normalized compendium to only include those samples that use PAO1 strains. This filtered training compendium will be used to examine the hypothesis that the subset of genes that are found to be generic by SOPHIE are not generic using GAPE-curated experiments because SOPHIE is trained on a compendium containing multiple strains, whereas we suspect that the GAPE experiments are only from a single strain (PAO1). So genes that are generic in other strain contexts will not be detected by GAPE.

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pandas as pd

from ponyo import utils

### Set parameters for data processing

Most parameters are read from `config_filename`. 

In [2]:
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

# Read in config variables
config_filename = os.path.abspath(
    os.path.join(base_dir, "configs", "config_pseudomonas_pao1.tsv")
)

params = utils.read_config(config_filename)

local_dir = params["local_dir"]
dataset_name = params["dataset_name"]
project_id = params['project_id']
raw_compendium_filename = "https://raw.githubusercontent.com/greenelab/adage/master/Data_collection_processing/Pa_compendium_02.22.2014.pcl"

# Load metadata file with annotations per sample
metadata_filename = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "metadata",
    "sample_annotations.tsv")

# Load metadata file with annotations per sample
sample_id_filename = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "metadata",
    f"{project_id}_process_samples.tsv")

# Output filename
out_compendium_filename = params["raw_compendium_filename"]

## Read data and metadata

In [3]:
# Read normalized compendium
raw_compendium = pd.read_csv(raw_compendium_filename, sep="\t", index_col=0, header=0).T

# Read metadata file
metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0, header=0)

# Read sample id file
project_sample_ids = pd.read_csv(sample_id_filename, sep="\t", index_col=None, header=0)

In [4]:
raw_compendium.head()

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
05_PA14000-4-2_5-10-07_S2.CEL,9.62009,10.575783,9.296287,9.870074,8.512268,7.903954,7.039473,10.209826,9.784684,5.485688,...,7.740609,9.730384,10.516061,10.639916,9.746849,5.768592,9.224442,11.512176,12.529719,11.804896
54375-4-05.CEL,9.327996,10.781977,9.169988,10.269239,7.237999,7.663758,6.855194,9.631573,9.404465,5.684067,...,7.127736,9.687607,10.199612,9.457152,9.318372,5.523898,7.911031,10.828271,11.597643,11.26852
AKGlu_plus_nt_7-8-09_s1.CEL,9.368599,10.596248,9.714517,9.487155,7.804147,7.681754,6.714411,9.497601,9.523126,5.766331,...,7.343241,9.717993,10.419979,10.164667,10.305005,5.806817,8.57573,10.85825,12.255953,11.309662
anaerobic_NO3_1.CEL,9.083292,9.89705,8.068471,7.310218,6.723634,7.141148,8.492302,7.740717,7.640251,5.267993,...,7.37474,8.287819,9.437053,8.936576,9.418147,5.956482,7.481406,7.687985,9.205525,9.395773
anaerobic_NO3_2.CEL,8.854901,9.931392,8.167126,7.526595,6.864015,7.154523,8.492109,7.716687,7.268094,5.427256,...,7.425398,8.588969,9.313851,8.684602,9.272818,5.729479,7.699086,7.414436,9.363494,9.424762


In [5]:
metadata.head()

Unnamed: 0_level_0,sample_name,ml_data_source,description,nucleic_acid,medium,genotype,od,growth_setting_1,growth_setting_2,strain,temperature,treatment,additional_notes,variant_phenotype,abx_marker,biotic_int_lv_2,biotic_int_lv_1
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
E-GEOD-46947,GSM1141730 1,GSM1141730_PA01_ZnO_PZO_.CEL,Pseudomonas aeruginosa PAO1 LB aerated 5 h wi...,RNA,LB,,,planktonic,aerated,PAO1,37.0,1 mM ZnO nanoparticles,Grown for 5h,,,,
E-GEOD-46947,GSM1141729 1,GSM1141729_PA01_none_PC_.CEL,Pseudomonas aeruginosa PAO1 LB aerated 5 h,RNA,LB,,,planktonic,aerated,PAO1,37.0,,Grown for 5h,,,,
E-GEOD-65882,GSM1608059 1,GSM1608059_Planktonic_1.CEL,PAO1 WT. Planktonic. Rep1,RNA,PBM plus 1 g / L glucose.,WT,0.26,Planktonic,Aerated,PAO1,37.0,,Grown shaking at 200rpm,,,,
E-GEOD-65882,GSM1608060 1,GSM1608060_Planktonic_2.CEL,PAO1 WT. Planktonic. Rep2,RNA,PBM plus 1 g / L glucose.,WT,0.26,Planktonic,Aerated,PAO1,37.0,,Grown shaking at 200rpm,,,,
E-GEOD-65882,GSM1608061 1,GSM1608061_Planktonic_3.CEL,PAO1 WT. Planktonic. Rep3,RNA,PBM plus 1 g / L glucose.,WT,0.26,Planktonic,Aerated,PAO1,37.0,,Grown shaking at 200rpm,,,,


In [6]:
# Get sample ids that have PAO1 strain
pao1_metadata = metadata.loc[metadata["strain"].str.contains("PAO1")]
experiment_ids = list(pao1_metadata.index)

assert "E-GEOD-33245" in experiment_ids

In [7]:
# Get normalized data associated with PAO1 sample ids
pao1_sample_ids = list(pao1_metadata["ml_data_source"])
print(len(pao1_sample_ids))

pao1_raw_compendium = raw_compendium.loc[pao1_sample_ids]

assert len(pao1_sample_ids) == pao1_raw_compendium.shape[0]

719


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """


In [8]:
# Drop samples with no expression activity associated
pao1_raw_compendium.dropna(inplace=True)

print(pao1_raw_compendium.shape)
pao1_raw_compendium

(545, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM1146022_pJN105_1.CEL,9.469630,10.170614,8.468516,9.585662,8.353823,8.452549,7.354845,9.913560,8.258423,5.445693,...,8.790537,9.835683,9.652631,9.887377,9.919375,6.368828,8.233680,11.346424,12.364579,12.116378
GSM1146023_pJN105_2.CEL,9.617634,9.539554,9.922237,9.169949,8.071916,9.238627,7.923988,9.508516,8.174457,6.206660,...,7.957668,10.390467,9.521621,10.292927,9.616351,6.818172,7.236897,11.269168,11.292843,10.810757
GSM1146024_phaF_1.CEL,9.650212,10.489798,8.520705,9.066560,8.272158,8.476515,6.845883,9.777121,8.030193,5.532491,...,8.872988,9.832781,9.712714,10.327820,10.401750,6.825256,8.157127,11.204328,12.356186,11.888967
GSM1146025_phaF_2.CEL,9.450308,9.921177,8.465899,9.741961,8.759061,8.202494,7.625642,9.610623,8.068657,5.538607,...,8.763209,9.710935,10.013331,10.407397,9.906222,6.502036,7.509643,11.234006,12.497402,12.137581
GSM1244967_PAO1-22-replicate-01.CEL,9.049257,9.927143,8.885547,8.804660,5.988185,7.764461,8.350282,7.774165,7.774105,5.510923,...,9.356331,7.377822,9.644671,7.429156,7.654936,6.131965,6.241217,7.684121,9.072845,10.695175
GSM1244968_PAO1-22-replicate-02.CEL,8.833167,9.917035,9.009681,8.900852,6.096402,7.749163,8.319773,8.301315,7.600733,5.706577,...,9.486076,6.894396,9.723022,7.788555,7.605215,6.231041,6.303251,7.815412,8.896422,10.714222
GSM1244969_PAO1-22-replicate-03.CEL,8.884647,9.907316,8.737792,8.628758,6.270097,7.989337,8.304787,7.953646,7.694889,5.679747,...,9.080317,7.296979,9.655950,7.345836,7.659305,5.702764,6.390429,7.863996,9.196426,10.671757
GSM1244970_PAO1-37-replicate-01.CEL,8.778351,9.872437,8.755126,8.662006,7.212951,8.426300,8.670188,8.653405,7.737566,5.695070,...,9.153401,7.927784,9.572730,8.284933,8.580799,5.959742,6.265907,7.824123,10.907826,12.177075
GSM1244971_PAO1-37-replicate-02.CEL,9.061243,9.828194,8.342299,8.842026,6.465854,7.970151,8.432160,8.226710,7.877492,5.785088,...,9.337408,8.013215,9.553362,8.590636,8.628973,5.778791,6.830793,8.052581,10.929728,12.152272
GSM1244972_PAO1-37-replicate-03.CEL,8.808541,10.165703,8.627964,8.689108,6.905749,8.306635,8.780996,8.567739,7.190508,5.742351,...,9.013357,7.840225,9.405921,8.208101,8.480249,6.232756,5.991988,7.848818,10.879207,12.175344


In [9]:
# Check samples associated with project_id are still in subset
project_sample_ids = list(project_sample_ids["Sample"])

for sample_id in project_sample_ids:
    assert sample_id in list(pao1_raw_compendium.index)

In [10]:
# Save filtered compendium
pao1_raw_compendium.T.to_csv(out_compendium_filename, sep="\t")