# Create PAO1 and PA14 compendia

This notebook is using the thresholds from the [previous notebook](0_decide_thresholds.ipynb) to bin samples into PAO1 or PA14 compendia.

A sample will be PAO1 if:
1. PAO1 mapping rate >= 30%
2. PAO1-PA14 mapping rate > 0%

A sample will be PA14 if:
1. PA14 mapping rate >= 30%
2. PA14-PAO1 mapping rate > 0%

In [1]:
import os
import pandas as pd
import seaborn as sns
from core_acc_modules import paths

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
# Params
mapping_threshold = 30
mapping_threshold_pa14 = 10
diff_mapping_threshold = 0

## Load data

In [3]:
# Log files
pao1_logs_filename = paths.PAO1_LOGS
pa14_logs_filename = paths.PA14_LOGS

# Expression data files
pao1_expression_filename = paths.PAO1_GE
pa14_expression_filename = paths.PA14_GE

# File containing table to map sample id to strain name
sample_to_strain_filename = paths.SAMPLE_TO_STRAIN

In [4]:
# Load log files
pao1_logs = pd.read_csv(pao1_logs_filename, index_col=0, header=0)
pa14_logs = pd.read_csv(pa14_logs_filename, index_col=0, header=0)

# Load expression data
# Matrices will be sample x gene after taking the transpose
pao1_expression = pd.read_csv(pao1_expression_filename, index_col=0, header=0).T

pa14_expression = pd.read_csv(pa14_expression_filename, index_col=0, header=0).T

# Drop row with gene ensembl ids
pao1_expression.drop(["X"], inplace=True)
pa14_expression.drop(["X"], inplace=True)

# Load metadata
# Set index to experiment id, which is what we will use to map to expression data
sample_to_strain_table_full = pd.read_csv(sample_to_strain_filename, index_col=2)

In [5]:
sample_to_strain_table_full.head()

Unnamed: 0_level_0,Unnamed: 0,Run,Sample.Name,SRA_study,BioProject,GEO_Accession..exp.,source_name,Strain,PAO1,PA14,PAK,ClinicalIsolate
Experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SRX5057740,1,SRR8239475,CLJ1,SRP170343,PRJNA383797,,,CLJ1,False,False,False,False
SRX5057739,2,SRR8239476,CLJ1,SRP170343,PRJNA383797,,,CLJ1,False,False,False,False
SRX5057910,3,SRR8239645,CLJ3,SRP170410,PRJNA383798,,,CLJ3,False,False,False,False
SRX5057909,4,SRR8239646,CLJ3,SRP170410,PRJNA383798,,,CLJ3,False,False,False,False
SRX3573046,5,SRR6483189,PAO1 samples from Colistin/Doripenem treatment,SRP130183,PRJNA414673,,,PAO1,True,False,False,False


In [6]:
pao1_logs.head()

Unnamed: 0_level_0,lib_types,reads_processed,reads_mapped,mapping_rate,run,job
exp_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SRP090296/SRX2185323/SRX2185323,[u'SF'],47468450,5240148,11.039223,78041,2724
ERP105395/ERX2259197/ERX2259197,[u'U'],34109390,16420150,48.139676,78036,984
SRP017679/SRX2366135/SRX2366135,[u'U'],9912084,4322668,43.610082,78042,1232
SRP038697/SRX474161/SRX474161,[u'SF'],710589,93493,13.157113,78041,2560
SRP062215/SRX1140456/SRX1140456,[u'SF'],6598795,470748,7.133848,78042,1439


In [7]:
pao1_expression.head()

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA1905,PA0195,PA4812,PA0195.1,PA0457.1,PA1552.1,PA1555.1,PA3701,PA4724.1,PA5471.1
ERX541571.ERX541571.salmon,186.695,41.0399,52.6869,42.5596,11.8271,49.6818,15.1125,18.9761,39.7944,7.90016,...,0,5.16485,8.34185,2.24955,16.7872,35.8977,35.3801,117.821,133.373,0.0
ERX541572.ERX541572.salmon,200.587,36.65,65.5425,40.1072,15.0975,38.9103,9.27882,13.3374,35.814,4.29789,...,0,4.02765,8.1123,2.13563,13.5472,21.0032,26.6635,109.515,99.2929,0.0
ERX541573.ERX541573.salmon,111.42,27.4148,56.0137,25.2659,18.6952,22.6177,7.28033,8.25073,22.8004,2.72297,...,0,3.55578,8.70387,2.63218,20.6529,13.5422,33.1718,47.4317,34.1095,0.0
ERX541574.ERX541574.salmon,143.32,34.4773,83.4517,39.3797,23.2258,30.0783,12.3878,11.1666,26.4343,7.74561,...,0,7.79597,9.47563,4.12049,16.6373,15.0784,24.8545,97.8814,36.5146,5.00512
ERX541575.ERX541575.salmon,118.398,34.014,73.7324,32.2136,21.2469,23.1491,6.74443,13.0066,33.6489,4.74327,...,0,5.56938,9.57055,2.4523,32.6727,8.08117,47.4864,66.6335,26.2999,0.0


## Format data

Format index to only include experiment id. This will be used to map to expression data and labels

In [8]:
# Format log indices so that values can be mapped to expression data
pao1_index_processed = pao1_logs.index.str.split("/").str[-1]
pa14_index_processed = pa14_logs.index.str.split("/").str[-1]

print(f"No. of samples processed using PAO1 reference: {pao1_logs.shape[0]}")
print(f"No. of samples processed using PA14 reference: {pa14_logs.shape[0]}")
pao1_logs.index = pao1_index_processed
pa14_logs.index = pa14_index_processed

No. of samples processed using PAO1 reference: 2852
No. of samples processed using PA14 reference: 2852


In [9]:
# Format expression data indices so that values can be mapped to `sample_to_strain_table`
pao1_index_processed = pao1_expression.index.str.split(".").str[0]
pa14_index_processed = pa14_expression.index.str.split(".").str[0]

print(
    f"No. of samples processed using PAO1 reference after filtering: {pao1_expression.shape}"
)
print(
    f"No. of samples processed using PA14 reference after filtering: {pa14_expression.shape}"
)
pao1_expression.index = pao1_index_processed
pa14_expression.index = pa14_index_processed

No. of samples processed using PAO1 reference after filtering: (2643, 5563)
No. of samples processed using PA14 reference after filtering: (2619, 5891)


In [10]:
pao1_logs.head()

Unnamed: 0_level_0,lib_types,reads_processed,reads_mapped,mapping_rate,run,job
exp_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SRX2185323,[u'SF'],47468450,5240148,11.039223,78041,2724
ERX2259197,[u'U'],34109390,16420150,48.139676,78036,984
SRX2366135,[u'U'],9912084,4322668,43.610082,78042,1232
SRX474161,[u'SF'],710589,93493,13.157113,78041,2560
SRX1140456,[u'SF'],6598795,470748,7.133848,78042,1439


In [11]:
pao1_expression.head()

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA1905,PA0195,PA4812,PA0195.1,PA0457.1,PA1552.1,PA1555.1,PA3701,PA4724.1,PA5471.1
ERX541571,186.695,41.0399,52.6869,42.5596,11.8271,49.6818,15.1125,18.9761,39.7944,7.90016,...,0,5.16485,8.34185,2.24955,16.7872,35.8977,35.3801,117.821,133.373,0.0
ERX541572,200.587,36.65,65.5425,40.1072,15.0975,38.9103,9.27882,13.3374,35.814,4.29789,...,0,4.02765,8.1123,2.13563,13.5472,21.0032,26.6635,109.515,99.2929,0.0
ERX541573,111.42,27.4148,56.0137,25.2659,18.6952,22.6177,7.28033,8.25073,22.8004,2.72297,...,0,3.55578,8.70387,2.63218,20.6529,13.5422,33.1718,47.4317,34.1095,0.0
ERX541574,143.32,34.4773,83.4517,39.3797,23.2258,30.0783,12.3878,11.1666,26.4343,7.74561,...,0,7.79597,9.47563,4.12049,16.6373,15.0784,24.8545,97.8814,36.5146,5.00512
ERX541575,118.398,34.014,73.7324,32.2136,21.2469,23.1491,6.74443,13.0066,33.6489,4.74327,...,0,5.56938,9.57055,2.4523,32.6727,8.08117,47.4864,66.6335,26.2999,0.0


## TO DO: check why duplicates are appearing

In [12]:
# Aggregate boolean labels into a single strain label
aggregated_label = []
for exp_id in list(sample_to_strain_table_full.index):
    if sample_to_strain_table_full.loc[exp_id, "PAO1"].all() == True:
        aggregated_label.append("PAO1")
    elif sample_to_strain_table_full.loc[exp_id, "PA14"].all() == True:
        aggregated_label.append("PA14")
    elif sample_to_strain_table_full.loc[exp_id, "PAK"].all() == True:
        aggregated_label.append("PAK")
    elif sample_to_strain_table_full.loc[exp_id, "ClinicalIsolate"].all() == True:
        aggregated_label.append("Clinical Isolate")
    else:
        aggregated_label.append("NA")

sample_to_strain_table_full["Strain type"] = aggregated_label

sample_to_strain_table = sample_to_strain_table_full["Strain type"].to_frame()
sample_to_strain_table = sample_to_strain_table.loc[
    sample_to_strain_table.index.drop_duplicates()
]

sample_to_strain_table.head()

Unnamed: 0_level_0,Strain type
Experiment,Unnamed: 1_level_1
SRX5057740,
SRX5057739,
SRX5057910,
SRX5057909,
SRX3573046,PAO1


## Bin samples as PAO1 or PA14

* Bin samples based on threshold from previous notebook
* Check if there are any samples that have a high mapping to both PAO1 and PA14 (i.e. ambiguous mapping)

In [13]:
# Add column calculating the difference in mapping rates
pao1_logs["diff_mapping_rate"] = pao1_logs["mapping_rate"] - pa14_logs["mapping_rate"]
pa14_logs["diff_mapping_rate"] = pa14_logs["mapping_rate"] - pao1_logs["mapping_rate"]

In [14]:
high_pao1_mapping_ids = list(
    pao1_logs.query("mapping_rate>=@mapping_threshold&diff_mapping_rate>0").index
)
high_pa14_mapping_ids = list(
    pa14_logs.query("mapping_rate>=@mapping_threshold&diff_mapping_rate>0").index
)

print(len(high_pao1_mapping_ids))
print(len(high_pa14_mapping_ids))

1209
622


In [15]:
# Check if any ids have high mapping rate for both PAO1 and PA14
high_pao1_pa14_mapping_ids = list(
    set(high_pao1_mapping_ids).intersection(high_pa14_mapping_ids)
)

print(len(high_pao1_pa14_mapping_ids))

0


**Some observations:**
* Looks like there are not any samples that map to both PAO1 and PA14 using our criteria
* The number of PA14 samples is much lower compared to PAO1. Does this mean that the mapping rates of PA14 samples mapped to PA14 reference lower?

## Create compendia

Create PAO1 and PA14 compendia

In [16]:
# Get expression data
# Note: reindexing needed here instead of .loc since samples from expression data
# were filtered out for low counts, but these samples still exist in log files
pao1_expression_binned = pao1_expression.reindex(high_pao1_mapping_ids)
pa14_expression_binned = pa14_expression.reindex(high_pa14_mapping_ids)

# Missing samples are dropped
pao1_expression_binned = pao1_expression_binned.dropna()
pa14_expression_binned = pa14_expression_binned.dropna()

# Drop ambiguously mapped samples
pao1_expression_binned = pao1_expression_binned.drop(high_pao1_pa14_mapping_ids)
pa14_expression_binned = pa14_expression_binned.drop(high_pao1_pa14_mapping_ids)

In [17]:
print(pao1_expression_binned.shape)
print(pa14_expression_binned.shape)

(1199, 5563)
(602, 5891)


In [18]:
# Label samples with SRA annotations
# pao1_expression_label = pao1_expression_binned.join(
#    sample_to_strain_table, how='left')
pao1_expression_label = pao1_expression_binned.merge(
    sample_to_strain_table, left_index=True, right_index=True
)
pa14_expression_label = pa14_expression_binned.merge(
    sample_to_strain_table, left_index=True, right_index=True
)
print(pao1_expression_label.shape)
pao1_expression_label.head()

(1254, 5564)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA0195,PA4812,PA0195.1,PA0457.1,PA1552.1,PA1555.1,PA3701,PA4724.1,PA5471.1,Strain type
ERX1272616,303.826,338.798,465.777,397.257,123.078,116.112,27.6101,92.3953,120.1,65.3267,...,27.2083,92.8974,33.9391,71.2049,657.254,1486.1,392.344,130.88,0.0,
ERX1272617,330.188,396.435,355.406,510.354,122.091,97.4159,54.6994,136.763,120.483,69.5939,...,38.5745,83.0111,44.8906,85.2286,756.769,849.959,412.308,107.523,0.0,
ERX1272618,135.018,243.605,387.461,326.783,62.5105,87.8338,100.757,41.0638,25.4338,47.1923,...,89.8114,1060.36,90.2585,51.5689,233.097,117.328,155.28,219.262,0.0,
ERX1272619,137.16,244.026,439.474,339.497,56.7303,85.7355,101.12,48.4013,28.0883,46.4248,...,94.4059,999.594,73.5905,54.9011,227.787,129.853,165.439,148.655,0.0,
ERX1296068,171.657,117.056,179.329,127.623,115.555,220.688,95.0127,73.628,109.02,95.2244,...,282.147,163.947,127.03,0.281338,426.511,672.65,263.38,346.185,10.5219,


In [19]:
# pao1_expression_label[pao1_expression_label.index.duplicated(keep=False)]

In [20]:
# sample_to_strain_table[sample_to_strain_table.index.duplicated(keep=False)]

In [21]:
print(pa14_expression_label.shape)
pa14_expression_label.head()

(610, 5892)


Unnamed: 0,PA14_55610,PA14_55600,PA14_55590,PA14_55580,PA14_55570,PA14_55560,PA14_55550,PA14_55540,PA14_55530,PA14_55520,...,PA14_17675,PA14_67975,PA14_36345,PA14_43405,PA14_38825,PA14_24245,PA14_28895,PA14_55117,PA14_59845,Strain type
ERX1296067,40.5316,22.099,12.5838,93.8299,49.6454,38.456,87.3857,21.7692,18.8938,28.6754,...,152.636,68.3067,89.1003,680.312,255.148,382.703,35.181,64.4878,0,
ERX1296069,39.0279,22.5293,15.8625,90.8143,43.6259,40.7302,79.5882,18.9671,18.9383,26.9428,...,146.595,67.2578,85.1823,629.025,223.785,327.791,32.6634,65.7526,0,
ERX1296502,10.2079,8.26682,7.5297,6.8905,10.1008,6.40562,6.24507,2.21737,1.13524,2.56469,...,84.0015,44.023,14.0472,88.1364,27.2587,485.295,3.26472,32.2662,0,
ERX1296503,43.0257,27.8671,17.7344,100.469,50.6015,47.2848,91.7137,19.1493,15.1365,28.4366,...,233.618,83.7615,111.818,801.637,275.615,431.902,29.7772,77.5779,0,
ERX1296504,44.756,30.2834,18.7402,100.747,52.4838,53.3943,111.878,23.683,17.3899,36.4087,...,234.389,86.674,123.033,813.9,283.062,364.885,34.3077,78.4528,0,


## Quick comparison

Quick check comparing our binned labels compared with SRA annotations

In [22]:
pao1_expression_label["Strain type"].value_counts()

PAO1                475
NA                  357
Clinical Isolate    349
PAK                  54
PA14                 19
Name: Strain type, dtype: int64

In [23]:
pa14_expression_label["Strain type"].value_counts()

PA14                232
NA                  210
Clinical Isolate    108
PAO1                 58
PAK                   2
Name: Strain type, dtype: int64

In [24]:
# Save compendia with label
pao1_expression_label.to_csv(paths.PAO1_COMPENDIUM_LABEL)
pa14_expression_label.to_csv(paths.PA14_COMPENDIUM_LABEL)

# Save compendia without label
pao1_expression_binned.to_csv(paths.PAO1_COMPENDIUM)
pa14_expression_binned.to_csv(paths.PA14_COMPENDIUM)

# Save metadata table
sample_to_strain_table.to_csv(paths.SAMPLE_TO_STRAIN_PROCESSED)