# Differential expression validation
This notebook performs a differential expression (DE) analysis comparing PAO1 samples vs PA14 samples. We can compare our results with those published in the literature as an additional step to validate that our RNA-seq processing are reasonable.

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import pandas as pd
from core_acc_modules import utils, paths
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [2]:
# Load gene expression using PAO1 reference
expression_data = pd.read_csv(paths.PAO1_GE, sep='\t', header=0, index_col=0)
print(expression_data.shape)
expression_data.head()

(28, 5685)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
SRR7886564,231.716766,190.742049,251.434363,244.575451,232.66742,338.778285,154.619466,135.309196,197.297614,208.656767,...,384.006646,160.423009,217.461069,156.88598,207.875748,401.447266,243.877369,242.992502,246.314856,2505.558825
SRR7886563,221.861101,206.2282,262.510896,236.227489,218.772353,277.169311,172.770017,131.65515,203.906846,195.419829,...,422.431994,186.070549,221.798564,169.894611,206.04732,364.537806,220.715998,222.137392,290.767265,1822.432926
SRR7886556,239.03554,214.717657,274.756361,234.900934,211.425667,331.008247,182.534175,151.789441,177.38904,214.45934,...,313.613041,203.419543,219.460975,122.112693,216.61862,356.472812,204.935818,236.919636,344.232315,840.672326
SRR7886554,207.540231,225.539932,225.249527,219.545459,187.796107,288.503175,170.652752,152.999445,178.938258,189.161895,...,310.622609,162.000331,226.673993,151.294458,188.400366,344.227676,230.639268,215.712007,231.559415,2818.206826
SRR11809604,260.594079,246.812776,266.252334,265.965475,221.272292,326.553412,168.113136,173.746496,222.402769,211.535415,...,407.532934,218.539998,230.043668,167.621171,229.045395,376.02569,245.817934,271.991984,330.510694,3013.89892


### Get core genes

In [3]:
# Get mapping between PAO1 and PA14 genes using PAO1 reference
gene_annot_file = paths.GENE_PAO1_ANNOT
gene_mapping_pao1 = utils.get_pao1_pa14_gene_map(gene_annot_file, 'pao1')
gene_annot_file = paths.GENE_PA14_ANNOT
gene_mapping_pa14 = utils.get_pao1_pa14_gene_map(gene_annot_file, 'pa14')

core_pao1_genes, core_pa14_genes = utils.get_core_genes(gene_mapping_pao1,
                                                        gene_mapping_pa14,
                                                        False)
print(f"Number of PAO1 core genes: {len(core_pao1_genes)}")
print(f"Number of PA14 core genes: {len(core_pa14_genes)}")

expression_data = expression_data.reindex(columns=core_pao1_genes)
print(expression_data.shape)
expression_data.head()

Number of PAO1 core genes: 5366
Number of PA14 core genes: 5363
(28, 5366)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA4215,PA4214,PA4213
SRR7886564,231.716766,190.742049,251.434363,244.575451,232.66742,338.778285,154.619466,135.309196,197.297614,208.656767,...,217.461069,156.88598,207.875748,401.447266,243.877369,242.992502,246.314856,,,
SRR7886563,221.861101,206.2282,262.510896,236.227489,218.772353,277.169311,172.770017,131.65515,203.906846,195.419829,...,221.798564,169.894611,206.04732,364.537806,220.715998,222.137392,290.767265,,,
SRR7886556,239.03554,214.717657,274.756361,234.900934,211.425667,331.008247,182.534175,151.789441,177.38904,214.45934,...,219.460975,122.112693,216.61862,356.472812,204.935818,236.919636,344.232315,,,
SRR7886554,207.540231,225.539932,225.249527,219.545459,187.796107,288.503175,170.652752,152.999445,178.938258,189.161895,...,226.673993,151.294458,188.400366,344.227676,230.639268,215.712007,231.559415,,,
SRR11809604,260.594079,246.812776,266.252334,265.965475,221.272292,326.553412,168.113136,173.746496,222.402769,211.535415,...,230.043668,167.621171,229.045395,376.02569,245.817934,271.991984,330.510694,,,


In [4]:
gene_mapping_pa14.head()

Unnamed: 0_level_0,Name,Product.Name,GeneID.(PAO1),PAO1_ID,annotation,num_mapped_genes
PA14_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PA14_00010,dnaA,chromosomal replication initiation protein,878417.0,PA0001,core,1.0
PA14_00020,dnaN,DNA polymerase III subunit beta,879244.0,PA0002,core,1.0
PA14_00030,recF,recombination protein F,879229.0,PA0003,core,1.0
PA14_00050,gyrB,DNA gyrase subunit B,879230.0,PA0004,core,1.0
PA14_00060,,acyltransferase,877576.0,PA0005,core,1.0


In [5]:
# Save file
expression_data.to_csv(paths.PAO1_GE_DE, sep='\t')

### Differential expression analysis

In [None]:
%%R
# Select 59
# Run one time
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("limma")

Selection: 59







	‘/tmp/Rtmpa53YOn/downloaded_packages’















a





















In [None]:
%%R
library('limma')

In [None]:
# Load files
metadata_file = paths.SAMPLE_ANNOT
expression_data_file = paths.PAO1_GE_DE
out_file = paths.DE_STATS

In [None]:
%%R -i metadata_file -i expression_data_file -i out_file
source('../core_acc_modules/DE_analysis.R')

get_DE_stats(metadata_file,
             expression_data_file,
             out_file)

In [None]:
# Read in DE stats file
DE_stats = pd.read_csv(paths.DE_STATS, sep='\t', header=0, index_col=0)
print(DE_stats.shape)
DE_stats.head()
# Get number of DEGs
# Get list of DEGs

In [None]:
# Compare out results with publication
#https://jb.asm.org/content/201/21/e00362-19 found ~ 2K DEGs between 2 strains where QS genes were DEGs
# What have publications found?