# Format expression

This notebook cleans and formats the data. There are two specific things that this notebook does:
1. We noticed that PAO1 gene ids appear among the feature/column headers of the PA14 gene expression matrix. This notebook removes those gene ids before we perform the rest of the downstream analyses.
2. This notebook also formats the data as sample x gene matrices.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import pandas as pd
from scripts import paths

In [2]:
# Raw (normalized counts) expression data files.
# Both locations come from the project-level `paths` module: one file for the
# PAO1 matrix and one for the PA14 matrix.
pao1_expression_filename = paths.PAO1_GE_RAW
pa14_expression_filename = paths.PA14_GE_RAW

In [3]:
# Read both raw matrices, using the first column as the index and the first
# row as the header, then transpose each one so that rows are samples and
# columns are genes (sample x gene).
pao1_expression = pd.read_csv(
    pao1_expression_filename, header=0, index_col=0
).transpose()
pa14_expression = pd.read_csv(
    pa14_expression_filename, header=0, index_col=0
).transpose()

## Format expression data

Format the index so that it only includes the experiment id. The experiment id is used later to map between the expression data and the SRA labels.

In [4]:
# Reduce every sample label to the text before its first "." so that the
# index values can be mapped to `sample_to_strain_table`.
pao1_index_processed = pao1_expression.index.str.split(".").str.get(0)
pao1_expression.index = pao1_index_processed

pa14_index_processed = pa14_expression.index.str.split(".").str.get(0)
pa14_expression.index = pa14_index_processed

# Report the (rows, columns) shape of each matrix; relabelling the index does
# not change the shape.
# NOTE(review): the message says "after filtering", but no rows or columns are
# filtered in this cell, and the value shown is the full shape rather than a
# sample count -- confirm whether the wording is intentional.
pao1_shape = pao1_expression.shape
pa14_shape = pa14_expression.shape
print(
    f"No. of samples processed using PAO1 reference after filtering: {pao1_shape}"
)
print(
    f"No. of samples processed using PA14 reference after filtering: {pa14_shape}"
)

No. of samples processed using PAO1 reference after filtering: (2767, 5563)
No. of samples processed using PA14 reference after filtering: (2767, 5891)


In [5]:
# Print the (rows, columns) shape of the PAO1 matrix, then display its first
# rows as the cell output.
print(pao1_expression.shape)
pao1_expression.head()

(2767, 5563)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA1905,PA0195,PA4812,PA0195.1,PA0457.1,PA1552.1,PA1555.1,PA3701,PA4724.1,PA5471.1
ERX541572,5793.218939,766.512255,1608.330977,1663.46607,176.163343,384.600886,295.846835,453.183561,611.865046,43.032267,...,1.344758,75.306467,447.804528,10.758067,65.893159,44.377025,56.47985,2033.274614,184.231893,1.344758
ERX541573,4416.506898,797.782811,1770.117221,1562.763979,313.958581,324.501966,333.873864,415.87797,550.599003,38.659079,...,1.171487,103.090877,698.206395,18.743796,138.235494,39.830566,96.061954,1315.580171,91.376005,1.171487
ERX541574,3825.086116,644.433113,1852.251003,1589.338107,260.936107,270.820051,363.729119,363.729119,423.03278,67.210815,...,1.976789,128.491265,468.49892,19.767887,73.141182,33.605408,55.350083,1759.341934,67.210815,3.953577
ERX541575,3834.097653,789.216207,1926.825153,1610.427665,289.734779,261.294555,250.629471,520.811596,666.567742,53.325419,...,1.777514,124.425979,611.464809,15.997626,177.751398,21.330168,108.428353,1486.001686,56.880447,1.777514
ERX541576,3515.165133,853.775186,2185.27713,1683.341246,183.98936,245.319146,253.388855,380.890253,551.968079,66.171612,...,1.613942,90.380738,745.641089,11.297592,130.729282,50.032194,95.222563,1273.400041,72.627379,1.613942


In [6]:
# Print the (rows, columns) shape of the PA14 matrix, then display its first
# rows as the cell output.
print(pa14_expression.shape)
pa14_expression.head()

(2767, 5891)


Unnamed: 0,PA14_55610,PA14_55600,PA14_55590,PA14_55580,PA14_55570,PA14_55560,PA14_55550,PA14_55540,PA14_55530,PA14_55520,...,PA14_19205,PA14_17675,PA14_67975,PA14_36345,PA14_43405,PA14_38825,PA14_24245,PA14_28895,PA14_55117,PA14_59845
ERX541572,204.761199,49.806778,8.30113,16.602259,22.136346,11.068173,13.835216,74.710167,77.47721,5.534086,...,58.107908,166.022594,2692.333064,204.761199,27.670432,16.602259,1090.215033,520.204128,110.681729,2.767043
ERX541573,163.421371,44.569465,18.908258,24.310617,9.454129,18.908258,6.752949,63.477723,56.724773,9.454129,...,55.374184,202.588477,1755.766798,67.529492,5.402359,13.505898,1493.752368,598.311301,62.127133,1.35059
ERX541574,201.758337,14.497605,7.248803,15.705739,6.040669,18.122006,9.66507,48.32535,59.198554,8.456936,...,67.65549,224.712879,2213.301042,77.32056,7.248803,13.289471,1680.514056,885.562044,164.306191,1.208134
ERX541575,186.502345,46.124236,10.027008,30.081023,10.027008,14.037811,6.016205,44.118834,42.113433,8.021606,...,40.108031,240.648187,1500.040368,64.17285,6.016205,12.032409,1251.370574,677.825728,140.378109,2.005402
ERX541576,223.958038,23.864381,12.850051,31.207268,14.685773,11.01433,11.01433,40.385876,55.071649,9.178608,...,69.757422,212.943708,1672.342397,62.414535,1.835722,14.685773,1325.39101,627.816794,104.636132,1.835722


## Clean

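Find any gene ids that are mismatched, i.e. ids that do not belong to the reference of the matrix they appear in: ids containing `PA14_` in the PAO1 matrix, or ids not containing `PA14_` in the PA14 matrix.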

In [7]:
# Sanity check on the PAO1 matrix: print every column header that contains
# the "PA14_" substring. Nothing is printed when no such header exists.
pa14_style_ids = (col for col in pao1_expression.columns if "PA14_" in col)
for gene_id in pa14_style_ids:
    print(gene_id)

In [8]:
# Collect, in column order, every PA14 column header that does not contain
# the "PA14_" substring; these are the ids removed in the next step.
mismatched_gene_ids = [
    column_id for column_id in pa14_expression.columns if "PA14_" not in column_id
]

# Show the collected ids, one per line
for gene_id in mismatched_gene_ids:
    print(gene_id)

PA0263.1
PA2170.1
PA0263.2
PA0263.3


In [9]:
# Remove the mismatched columns from the PA14 expression data. `drop` returns
# a new frame, so `pa14_expression` itself is left unchanged.
pa14_expression_new = pa14_expression.drop(mismatched_gene_ids, axis="columns")

In [10]:
# Print the (rows, columns) shape of the cleaned PA14 matrix, then display its
# first rows as the cell output.
print(pa14_expression_new.shape)
pa14_expression_new.head()

(2767, 5887)


Unnamed: 0,PA14_55610,PA14_55600,PA14_55590,PA14_55580,PA14_55570,PA14_55560,PA14_55550,PA14_55540,PA14_55530,PA14_55520,...,PA14_19205,PA14_17675,PA14_67975,PA14_36345,PA14_43405,PA14_38825,PA14_24245,PA14_28895,PA14_55117,PA14_59845
ERX541572,204.761199,49.806778,8.30113,16.602259,22.136346,11.068173,13.835216,74.710167,77.47721,5.534086,...,58.107908,166.022594,2692.333064,204.761199,27.670432,16.602259,1090.215033,520.204128,110.681729,2.767043
ERX541573,163.421371,44.569465,18.908258,24.310617,9.454129,18.908258,6.752949,63.477723,56.724773,9.454129,...,55.374184,202.588477,1755.766798,67.529492,5.402359,13.505898,1493.752368,598.311301,62.127133,1.35059
ERX541574,201.758337,14.497605,7.248803,15.705739,6.040669,18.122006,9.66507,48.32535,59.198554,8.456936,...,67.65549,224.712879,2213.301042,77.32056,7.248803,13.289471,1680.514056,885.562044,164.306191,1.208134
ERX541575,186.502345,46.124236,10.027008,30.081023,10.027008,14.037811,6.016205,44.118834,42.113433,8.021606,...,40.108031,240.648187,1500.040368,64.17285,6.016205,12.032409,1251.370574,677.825728,140.378109,2.005402
ERX541576,223.958038,23.864381,12.850051,31.207268,14.685773,11.01433,11.01433,40.385876,55.071649,9.178608,...,69.757422,212.943708,1672.342397,62.414535,1.835722,14.685773,1325.39101,627.816794,104.636132,1.835722


In [11]:
# Verify that we've removed the gene ids
assert pa14_expression_new.shape[1] == pa14_expression.shape[1] - len(
    mismatched_gene_ids
)
assert pa14_expression_new.shape[0] == pa14_expression.shape[0]

In [12]:
# Print any mismatched id that is still present among the cleaned PA14
# columns. No output means every mismatched id was removed.
remaining_columns = pa14_expression_new.columns
leftover_ids = (
    candidate for candidate in mismatched_gene_ids if candidate in remaining_columns
)
for gene_id in leftover_ids:
    print(gene_id)

## Save

In [13]:
# Write both matrices as tab-separated files to the locations given by the
# project's `paths` module. The PAO1 matrix is written with its re-labelled
# index; the PA14 matrix is the version with the mismatched columns dropped.
pao1_expression.to_csv(paths.PAO1_GE, sep="\t")
pa14_expression_new.to_csv(paths.PA14_GE, sep="\t")