# Process array data

This notebook will process array data to:
1. Include only those PAO1 samples, based on the metadata
2. Use only genes shared by both the array and RNA-seq compendia

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from core_acc_modules import paths

In [2]:
# Files
array_compendium_filename = paths.ARRAY_DATA
array_metadata_filename = paths.ARRAY_METADATA

pao1_rnaseq_compendium_filename = paths.PAO1_COMPENDIUM

In [3]:
array_compendium = pd.read_csv(
    array_compendium_filename, sep="\t", index_col=0, header=0
).T
array_metadata = pd.read_csv(array_metadata_filename, sep="\t", index_col=0, header=0)

In [4]:
array_compendium.head()

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
05_PA14000-4-2_5-10-07_S2.CEL,9.62009,10.575783,9.296287,9.870074,8.512268,7.903954,7.039473,10.209826,9.784684,5.485688,...,7.740609,9.730384,10.516061,10.639916,9.746849,5.768592,9.224442,11.512176,12.529719,11.804896
54375-4-05.CEL,9.327996,10.781977,9.169988,10.269239,7.237999,7.663758,6.855194,9.631573,9.404465,5.684067,...,7.127736,9.687607,10.199612,9.457152,9.318372,5.523898,7.911031,10.828271,11.597643,11.26852
AKGlu_plus_nt_7-8-09_s1.CEL,9.368599,10.596248,9.714517,9.487155,7.804147,7.681754,6.714411,9.497601,9.523126,5.766331,...,7.343241,9.717993,10.419979,10.164667,10.305005,5.806817,8.57573,10.85825,12.255953,11.309662
anaerobic_NO3_1.CEL,9.083292,9.89705,8.068471,7.310218,6.723634,7.141148,8.492302,7.740717,7.640251,5.267993,...,7.37474,8.287819,9.437053,8.936576,9.418147,5.956482,7.481406,7.687985,9.205525,9.395773
anaerobic_NO3_2.CEL,8.854901,9.931392,8.167126,7.526595,6.864015,7.154523,8.492109,7.716687,7.268094,5.427256,...,7.425398,8.588969,9.313851,8.684602,9.272818,5.729479,7.699086,7.414436,9.363494,9.424762


In [5]:
print(array_metadata.shape)
array_metadata.head()

(1217, 17)


Unnamed: 0_level_0,sample_name,ml_data_source,description,nucleic_acid,medium,genotype,od,growth_setting_1,growth_setting_2,strain,temperature,treatment,additional_notes,variant_phenotype,abx_marker,biotic_int_lv_2,biotic_int_lv_1
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
E-GEOD-46947,GSM1141730 1,GSM1141730_PA01_ZnO_PZO_.CEL,Pseudomonas aeruginosa PAO1 LB aerated 5 h wi...,RNA,LB,,,planktonic,aerated,PAO1,37.0,1 mM ZnO nanoparticles,Grown for 5h,,,,
E-GEOD-46947,GSM1141729 1,GSM1141729_PA01_none_PC_.CEL,Pseudomonas aeruginosa PAO1 LB aerated 5 h,RNA,LB,,,planktonic,aerated,PAO1,37.0,,Grown for 5h,,,,
E-GEOD-65882,GSM1608059 1,GSM1608059_Planktonic_1.CEL,PAO1 WT. Planktonic. Rep1,RNA,PBM plus 1 g / L glucose.,WT,0.26,Planktonic,Aerated,PAO1,37.0,,Grown shaking at 200rpm,,,,
E-GEOD-65882,GSM1608060 1,GSM1608060_Planktonic_2.CEL,PAO1 WT. Planktonic. Rep2,RNA,PBM plus 1 g / L glucose.,WT,0.26,Planktonic,Aerated,PAO1,37.0,,Grown shaking at 200rpm,,,,
E-GEOD-65882,GSM1608061 1,GSM1608061_Planktonic_3.CEL,PAO1 WT. Planktonic. Rep3,RNA,PBM plus 1 g / L glucose.,WT,0.26,Planktonic,Aerated,PAO1,37.0,,Grown shaking at 200rpm,,,,


In [6]:
pao1_strain_values = [
    strain_name
    for strain_name in array_metadata["strain"].unique()
    if ("PAO1" in strain_name) & ("MPAO1" not in strain_name)
]

In [7]:
pao1_sample_ids = array_metadata[
    array_metadata["strain"].apply(
        lambda strain_name: strain_name in pao1_strain_values
    )
]["ml_data_source"]

In [8]:
# Drop any sample ids that are na
pao1_sample_ids.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(result)


## Select subset of samples

Select only those sample ids that are using PAO1-like strains and have expression data available

In [9]:
pao1_array_compendium = array_compendium.loc[pao1_sample_ids]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


In [10]:
print(pao1_array_compendium.shape)
pao1_array_compendium.head()

(614, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM1141730_PA01_ZnO_PZO_.CEL,,,,,,,,,,,...,,,,,,,,,,
GSM1141729_PA01_none_PC_.CEL,,,,,,,,,,,...,,,,,,,,,,
GSM1608059_Planktonic_1.CEL,,,,,,,,,,,...,,,,,,,,,,
GSM1608060_Planktonic_2.CEL,,,,,,,,,,,...,,,,,,,,,,
GSM1608061_Planktonic_3.CEL,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Drop samples without expression data available
pao1_array_compendium.dropna(inplace=True)

In [12]:
print(pao1_array_compendium.shape)
pao1_array_compendium.head()

(524, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM1146022_pJN105_1.CEL,9.46963,10.170614,8.468516,9.585662,8.353823,8.452549,7.354845,9.91356,8.258423,5.445693,...,8.790537,9.835683,9.652631,9.887377,9.919375,6.368828,8.23368,11.346424,12.364579,12.116378
GSM1146023_pJN105_2.CEL,9.617634,9.539554,9.922237,9.169949,8.071916,9.238627,7.923988,9.508516,8.174457,6.20666,...,7.957668,10.390467,9.521621,10.292927,9.616351,6.818172,7.236897,11.269168,11.292843,10.810757
GSM1146024_phaF_1.CEL,9.650212,10.489798,8.520705,9.06656,8.272158,8.476515,6.845883,9.777121,8.030193,5.532491,...,8.872988,9.832781,9.712714,10.32782,10.40175,6.825256,8.157127,11.204328,12.356186,11.888967
GSM1146025_phaF_2.CEL,9.450308,9.921177,8.465899,9.741961,8.759061,8.202494,7.625642,9.610623,8.068657,5.538607,...,8.763209,9.710935,10.013331,10.407397,9.906222,6.502036,7.509643,11.234006,12.497402,12.137581
GSM1244967_PAO1-22-replicate-01.CEL,9.049257,9.927143,8.885547,8.80466,5.988185,7.764461,8.350282,7.774165,7.774105,5.510923,...,9.356331,7.377822,9.644671,7.429156,7.654936,6.131965,6.241217,7.684121,9.072845,10.695175


## Use only shared genes

Use only genes that are shared between the array and RNA-seq compendia so that we can compare modules

In [13]:
# Read in rnaseq compendium processing
pao1_rnaseq_compendium = pd.read_csv(
    pao1_rnaseq_compendium_filename, sep="\t", index_col=0, header=0
)
print(pao1_rnaseq_compendium.shape)
pao1_rnaseq_compendium.head()

(956, 5563)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA1905,PA0195,PA4812,PA0195.1,PA0457.1,PA1552.1,PA1555.1,PA3701,PA4724.1,PA5471.1
ERX541571,186.695122,41.039911,52.686904,42.559606,11.827122,49.68178,15.112466,18.976053,39.794426,7.900165,...,0.0,5.164847,8.341849,2.249546,16.787212,35.897737,35.380105,117.820814,133.373023,0.0
ERX541579,150.908951,63.968306,56.863002,86.01348,32.240954,48.322611,21.099413,21.219456,31.521424,12.16896,...,0.0,16.606783,38.218939,8.611779,28.396413,110.943698,137.58394,101.058282,65.210599,0.0
ERX541580,108.323515,56.079007,57.001674,73.42148,18.426169,46.57263,16.606703,18.595344,30.253393,15.936006,...,0.0,13.904292,37.246074,11.777708,45.733,94.799523,53.486191,82.753701,23.340646,38.113576
ERX541591,556.704228,182.309588,81.730476,157.348561,85.185006,116.643999,36.964524,31.973493,63.10993,16.140763,...,14.312595,73.032021,80.369628,49.31246,69.099677,61.622295,103.911157,186.030844,478.965167,0.0
ERX541592,428.47025,178.510235,74.953217,152.142022,92.393069,95.317522,33.668009,31.262234,81.131515,20.126563,...,6.691671,114.618032,62.583672,56.426453,58.473904,55.311934,101.620394,207.409539,345.05796,0.0


In [14]:
# Get shared genes
pao1_array_gene_ids = pao1_array_compendium.columns
pao1_rnaseq_gene_ids = pao1_rnaseq_compendium.columns

shared_gene_ids = list(set(pao1_array_gene_ids).intersection(set(pao1_rnaseq_gene_ids)))
print(len(shared_gene_ids))

5543


In [15]:
# Only include shared genes for both compendia
pao1_array_compendium_processed = pao1_array_compendium[shared_gene_ids]
pao1_rnaseq_compendium_processed = pao1_rnaseq_compendium[shared_gene_ids]

In [16]:
# Save to new paths
pao1_array_compendium_processed.to_csv(paths.ARRAY_COMPENDIUM_TO_COMPARE, sep="\t")
pao1_rnaseq_compendium_processed.to_csv(paths.RNASEQ_COMPENDIUM_TO_COMPARE, sep="\t")