# Create PAO1 and PA14 compendia

This notebook is using the thresholds from the previous notebook to bin samples into PAO1 or PA14 compendia.

In [1]:
import os
import pandas as pd
import seaborn as sns
from core_acc_modules import paths

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


## Load data

In [2]:
# Log files
pao1_logs_filename = paths.PAO1_LOGS
pa14_logs_filename = paths.PA14_LOGS

# Expression data files
pao1_expression_filename = paths.PAO1_GE
pa14_expression_filename = paths.PA14_GE

# File containing table to map sample id to strain name
sample_to_strain_filename = paths.SAMPLE_TO_STRAIN

In [3]:
# Load log files
pao1_logs = pd.read_csv(pao1_logs_filename, index_col=0, header=0)
pa14_logs = pd.read_csv(pa14_logs_filename, index_col=0, header=0)

# Load expression data
# Matrices will be sample x gene after taking the transpose
pao1_expression = pd.read_csv(pao1_expression_filename, index_col=0, header=0).T

pa14_expression = pd.read_csv(pa14_expression_filename, index_col=0, header=0).T

# Drop row with gene ensembl ids
pao1_expression.drop(["X"], inplace=True)
pa14_expression.drop(["X"], inplace=True)

# Load metadata
# Set index to experiment id, which is what we will use to map to expression data
sample_to_strain_table_full = pd.read_csv(sample_to_strain_filename, index_col=2)

In [4]:
sample_to_strain_table_full.head()

Unnamed: 0_level_0,Unnamed: 0,Run,Sample.Name,SRA_study,BioProject,GEO_Accession..exp.,source_name,Strain,PAO1,PA14,PAK,ClinicalIsolate
Experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SRX5057740,1,SRR8239475,CLJ1,SRP170343,PRJNA383797,,,CLJ1,False,False,False,False
SRX5057739,2,SRR8239476,CLJ1,SRP170343,PRJNA383797,,,CLJ1,False,False,False,False
SRX5057910,3,SRR8239645,CLJ3,SRP170410,PRJNA383798,,,CLJ3,False,False,False,False
SRX5057909,4,SRR8239646,CLJ3,SRP170410,PRJNA383798,,,CLJ3,False,False,False,False
SRX3573046,5,SRR6483189,PAO1 samples from Colistin/Doripenem treatment,SRP130183,PRJNA414673,,,PAO1,True,False,False,False


In [5]:
pao1_logs.head()

Unnamed: 0_level_0,lib_types,reads_processed,reads_mapped,mapping_rate,run,job
exp_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SRP090296/SRX2185323/SRX2185323,[u'SF'],47468450,5240148,11.039223,78041,2724
ERP105395/ERX2259197/ERX2259197,[u'U'],34109390,16420150,48.139676,78036,984
SRP017679/SRX2366135/SRX2366135,[u'U'],9912084,4322668,43.610082,78042,1232
SRP038697/SRX474161/SRX474161,[u'SF'],710589,93493,13.157113,78041,2560
SRP062215/SRX1140456/SRX1140456,[u'SF'],6598795,470748,7.133848,78042,1439


In [6]:
pao1_expression.head()

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA1905,PA0195,PA4812,PA0195.1,PA0457.1,PA1552.1,PA1555.1,PA3701,PA4724.1,PA5471.1
ERX541571.ERX541571.salmon,186.695,41.0399,52.6869,42.5596,11.8271,49.6818,15.1125,18.9761,39.7944,7.90016,...,0,5.16485,8.34185,2.24955,16.7872,35.8977,35.3801,117.821,133.373,0.0
ERX541572.ERX541572.salmon,200.587,36.65,65.5425,40.1072,15.0975,38.9103,9.27882,13.3374,35.814,4.29789,...,0,4.02765,8.1123,2.13563,13.5472,21.0032,26.6635,109.515,99.2929,0.0
ERX541573.ERX541573.salmon,111.42,27.4148,56.0137,25.2659,18.6952,22.6177,7.28033,8.25073,22.8004,2.72297,...,0,3.55578,8.70387,2.63218,20.6529,13.5422,33.1718,47.4317,34.1095,0.0
ERX541574.ERX541574.salmon,143.32,34.4773,83.4517,39.3797,23.2258,30.0783,12.3878,11.1666,26.4343,7.74561,...,0,7.79597,9.47563,4.12049,16.6373,15.0784,24.8545,97.8814,36.5146,5.00512
ERX541575.ERX541575.salmon,118.398,34.014,73.7324,32.2136,21.2469,23.1491,6.74443,13.0066,33.6489,4.74327,...,0,5.56938,9.57055,2.4523,32.6727,8.08117,47.4864,66.6335,26.2999,0.0


## Format data

Format index to only include experiment id. This will be used to map to expression data and labels

In [7]:
# Format log indices so that values can be mapped to expression data
pao1_index_processed = pao1_logs.index.str.split("/").str[-1]
pa14_index_processed = pa14_logs.index.str.split("/").str[-1]

print(f"No. of samples processed using PAO1 reference: {pao1_logs.shape[0]}")
print(f"No. of samples processed using PA14 reference: {pa14_logs.shape[0]}")
pao1_logs.index = pao1_index_processed
pa14_logs.index = pa14_index_processed

No. of samples processed using PAO1 reference: 2852
No. of samples processed using PA14 reference: 2852


In [8]:
# Format expression data indices so that values can be mapped to `sample_to_strain_table`
pao1_index_processed = pao1_expression.index.str.split(".").str[0]
pa14_index_processed = pa14_expression.index.str.split(".").str[0]

print(
    f"No. of samples processed using PAO1 reference after filtering: {pao1_expression.shape}"
)
print(
    f"No. of samples processed using PA14 reference after filtering: {pa14_expression.shape}"
)
pao1_expression.index = pao1_index_processed
pa14_expression.index = pa14_index_processed

No. of samples processed using PAO1 reference after filtering: (2643, 5563)
No. of samples processed using PA14 reference after filtering: (2619, 5891)


In [9]:
pao1_logs.head()

Unnamed: 0_level_0,lib_types,reads_processed,reads_mapped,mapping_rate,run,job
exp_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SRX2185323,[u'SF'],47468450,5240148,11.039223,78041,2724
ERX2259197,[u'U'],34109390,16420150,48.139676,78036,984
SRX2366135,[u'U'],9912084,4322668,43.610082,78042,1232
SRX474161,[u'SF'],710589,93493,13.157113,78041,2560
SRX1140456,[u'SF'],6598795,470748,7.133848,78042,1439


In [10]:
pao1_expression.head()

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA1905,PA0195,PA4812,PA0195.1,PA0457.1,PA1552.1,PA1555.1,PA3701,PA4724.1,PA5471.1
ERX541571,186.695,41.0399,52.6869,42.5596,11.8271,49.6818,15.1125,18.9761,39.7944,7.90016,...,0,5.16485,8.34185,2.24955,16.7872,35.8977,35.3801,117.821,133.373,0.0
ERX541572,200.587,36.65,65.5425,40.1072,15.0975,38.9103,9.27882,13.3374,35.814,4.29789,...,0,4.02765,8.1123,2.13563,13.5472,21.0032,26.6635,109.515,99.2929,0.0
ERX541573,111.42,27.4148,56.0137,25.2659,18.6952,22.6177,7.28033,8.25073,22.8004,2.72297,...,0,3.55578,8.70387,2.63218,20.6529,13.5422,33.1718,47.4317,34.1095,0.0
ERX541574,143.32,34.4773,83.4517,39.3797,23.2258,30.0783,12.3878,11.1666,26.4343,7.74561,...,0,7.79597,9.47563,4.12049,16.6373,15.0784,24.8545,97.8814,36.5146,5.00512
ERX541575,118.398,34.014,73.7324,32.2136,21.2469,23.1491,6.74443,13.0066,33.6489,4.74327,...,0,5.56938,9.57055,2.4523,32.6727,8.08117,47.4864,66.6335,26.2999,0.0


In [11]:
# Aggregate boolean labels into a single strain label
aggregated_label = []
for exp_id in list(sample_to_strain_table_full.index):
    if sample_to_strain_table_full.loc[exp_id, "PAO1"].all() == True:
        aggregated_label.append("PAO1")
    elif sample_to_strain_table_full.loc[exp_id, "PA14"].all() == True:
        aggregated_label.append("PA14")
    elif sample_to_strain_table_full.loc[exp_id, "PAK"].all() == True:
        aggregated_label.append("PAK")
    elif sample_to_strain_table_full.loc[exp_id, "ClinicalIsolate"].all() == True:
        aggregated_label.append("Clinical Isolate")
    else:
        aggregated_label.append("NA")

sample_to_strain_table_full["Strain type"] = aggregated_label

sample_to_strain_table = sample_to_strain_table_full["Strain type"].to_frame()

sample_to_strain_table.head()

Unnamed: 0_level_0,Strain type
Experiment,Unnamed: 1_level_1
SRX5057740,
SRX5057739,
SRX5057910,
SRX5057909,
SRX3573046,PAO1


## Bin samples as PAO1 or PA14

* Bin samples based on threshold from previous notebook
* Check if there are any samples that have a high mapping to both PAO1 and PA14 (i.e. ambiguous mapping)

In [12]:
threshold = 25

In [13]:
high_pao1_mapping_ids = list(pao1_logs.query("mapping_rate>=@threshold").index)
high_pa14_mapping_ids = list(pa14_logs.query("mapping_rate>=@threshold").index)

print(len(high_pao1_mapping_ids))
print(len(high_pa14_mapping_ids))

1892
1900


In [14]:
# Check if any ids have high mapping rate for both PAO1 and PA14
high_pao1_pa14_mapping_ids = list(
    set(high_pao1_mapping_ids).intersection(high_pa14_mapping_ids)
)

print(len(high_pao1_pa14_mapping_ids))

1843


Looks like there are many ids with high mapping rates for both PAO1 and PA14, lets look at what their mapping rates are and their SRA annotations. We suspect that these are mainly clinical and NA isolates as we saw in [exploratory analysis](https://github.com/greenelab/core-accessory-interactome/blob/master/explore_data/cluster_by_accessory_gene.ipynb)

In [15]:
pao1_logs.loc[high_pao1_pa14_mapping_ids].head(10)

Unnamed: 0_level_0,lib_types,reads_processed,reads_mapped,mapping_rate,run,job
exp_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SRX470356,[u'SF'],11441514,3051875,26.673699,78036,321
SRX5099519,[u'SF'],3713472,1459027,39.290104,78036,937
SRX4714405,[u'SR'],2862932,1793211,62.635473,78041,2636
SRX7098669,[u'U'],57297040,22356573,39.018722,78036,786
SRX5123742,[u'SF'],8834038,4421026,50.045359,78036,649
SRX6437666,[u'SF'],6244337,2420094,38.756621,78042,1766
SRX2940633,[u'U'],23692426,7813509,32.978932,78041,2508
SRX5123686,[u'SF'],9852051,5327960,54.079704,78036,737
SRX5123383,[u'SF'],27443517,13187683,48.05391,78036,736
SRX1074666,[u'U'],3277456,2176733,66.415323,78042,1587


In [16]:
pa14_logs.loc[high_pao1_pa14_mapping_ids].head(10)

Unnamed: 0_level_0,lib_types,reads_processed,reads_mapped,mapping_rate,run,job
exp_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SRX470356,[u'SF'],11441514,3347460,29.257142,77149,321
SRX5099519,[u'U'],3713472,2809735,75.663288,77223,937
SRX4714405,[u'SR'],2862932,1773477,61.94618,77697,2636
SRX7098669,[u'U'],57297040,22816881,39.822094,77223,786
SRX5123742,[u'SF'],8834038,4509508,51.046962,77223,649
SRX6437666,[u'SF'],6244337,2447920,39.20224,77490,1766
SRX2940633,[u'U'],23692426,7597396,32.066771,77697,2508
SRX5123686,[u'SF'],9852051,5310519,53.902675,77223,737
SRX5123383,[u'SF'],27443517,13283533,48.403173,77223,736
SRX1074666,[u'U'],3277456,2295442,70.037309,77546,1587


In [17]:
sample_to_strain_table.loc[high_pao1_pa14_mapping_ids]["Strain type"].value_counts()

NA                  582
PAO1                541
Clinical Isolate    461
PA14                273
PAK                  58
Name: Strain type, dtype: int64

## Create compendia

Create PAO1 and PA14 compendia

In [18]:
# Get expression data
pao1_expression_binned = pao1_expression.loc[high_pao1_mapping_ids]
pa14_expression_binned = pa14_expression.loc[high_pa14_mapping_ids]

# Drop ambiguously mapped samples
# pao1_expression_binned = pao1_expression_binned.drop(high_pao1_pa14_mapping_ids)
# pa14_expression_binned = pa14_expression_binned.drop(high_pao1_pa14_mapping_ids)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
# Label samples with SRA annotations
pao1_expression_label = pao1_expression_binned.merge(
    sample_to_strain_table, left_index=True, right_index=True
)
pa14_expression_label = pa14_expression_binned.merge(
    sample_to_strain_table, left_index=True, right_index=True
)
print(pao1_expression_label.shape)
pao1_expression_label.head()

(1964, 5564)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA0195,PA4812,PA0195.1,PA0457.1,PA1552.1,PA1555.1,PA3701,PA4724.1,PA5471.1,Strain type
ERX1272616,303.826,338.798,465.777,397.257,123.078,116.112,27.6101,92.3953,120.1,65.3267,...,27.2083,92.8974,33.9391,71.2049,657.254,1486.1,392.344,130.88,0.0,
ERX1272617,330.188,396.435,355.406,510.354,122.091,97.4159,54.6994,136.763,120.483,69.5939,...,38.5745,83.0111,44.8906,85.2286,756.769,849.959,412.308,107.523,0.0,
ERX1272618,135.018,243.605,387.461,326.783,62.5105,87.8338,100.757,41.0638,25.4338,47.1923,...,89.8114,1060.36,90.2585,51.5689,233.097,117.328,155.28,219.262,0.0,
ERX1272619,137.16,244.026,439.474,339.497,56.7303,85.7355,101.12,48.4013,28.0883,46.4248,...,94.4059,999.594,73.5905,54.9011,227.787,129.853,165.439,148.655,0.0,
ERX1296068,171.657,117.056,179.329,127.623,115.555,220.688,95.0127,73.628,109.02,95.2244,...,282.147,163.947,127.03,0.281338,426.511,672.65,263.38,346.185,10.5219,


In [20]:
pao1_expression_label["Strain type"].value_counts()

NA                  597
PAO1                563
Clinical Isolate    471
PA14                274
PAK                  59
Name: Strain type, dtype: int64

In [21]:
print(pa14_expression_label.shape)
pa14_expression_label.head()

(1972, 5892)


Unnamed: 0,PA14_55610,PA14_55600,PA14_55590,PA14_55580,PA14_55570,PA14_55560,PA14_55550,PA14_55540,PA14_55530,PA14_55520,...,PA14_17675,PA14_67975,PA14_36345,PA14_43405,PA14_38825,PA14_24245,PA14_28895,PA14_55117,PA14_59845,Strain type
ERX1272616,11.3888,5.60194,8.18787,646.528,172.504,25.6436,11.6876,12.3832,2.80314,0.688309,...,116.438,50.4695,10.484,6.3718,529.087,110.09,19.0519,15.702,0,
ERX1272617,12.3779,7.1858,10.1337,839.69,251.283,47.7741,19.5641,15.4955,4.10436,1.53655,...,83.4851,63.8452,10.8386,1.89429,940.833,100.974,34.5071,11.6007,0,
ERX1272618,36.7299,41.3198,40.741,41.6408,95.5985,14.2767,10.0638,15.3037,8.50456,5.95606,...,36.929,88.0115,20.7487,162.01,235.434,158.796,35.2944,15.4606,0,
ERX1272619,35.8203,40.1997,41.4905,40.058,100.634,13.8559,8.01068,15.3995,14.203,7.12301,...,32.9656,84.816,16.1881,272.396,398.229,141.718,26.0185,16.2675,0,
ERX1296067,40.5316,22.099,12.5838,93.8299,49.6454,38.456,87.3857,21.7692,18.8938,28.6754,...,152.636,68.3067,89.1003,680.312,255.148,382.703,35.181,64.4878,0,


In [22]:
pa14_expression_label["Strain type"].value_counts()

NA                  597
PAO1                550
Clinical Isolate    468
PA14                299
PAK                  58
Name: Strain type, dtype: int64

In [23]:
# Save compendia with label
pao1_expression_label.to_csv(paths.PAO1_COMPENDIUM_LABEL)
pa14_expression_label.to_csv(paths.PA14_COMPENDIUM_LABEL)

# Save compendia without label
pao1_expression_binned.to_csv(paths.PAO1_COMPENDIUM)
pa14_expression_binned.to_csv(paths.PA14_COMPENDIUM)

# Save metadata table
sample_to_strain_table.to_csv(paths.SAMPLE_TO_STRAIN_PROCESSED)