# Format summary tables

This notebook reads the summary tables for PAO1 and PA14 and merges them into a single table. The merged table uses the transcriptional similarity values for the PAO1-aligned genes and includes expression statistics for both the PAO1-aligned and PA14-aligned genes.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import random
import pandas as pd
from scripts import paths, utils

# Fix the seed of Python's built-in `random` module so any use of it is repeatable
random.seed(1)

In [2]:
# Filenames of the per-strain tables holding similarity scores and annotations
# (one table for PAO1-aligned genes, one for PA14-aligned genes).
# The similarity results are similar between the two strain types, so only one
# strain's scores need to be looked at.
pao1_similarity_filename, pa14_similarity_filename = (
    "pao1_core_similarity_associations_final_spell.tsv",
    "pa14_core_similarity_associations_final_spell.tsv",
)

In [3]:
# Read both tab-separated tables with identical settings: the first row is the
# header and the first column (the gene id) becomes the index
pao1_similarity, pa14_similarity = (
    pd.read_csv(filename, sep="\t", index_col=0, header=0)
    for filename in (pao1_similarity_filename, pa14_similarity_filename)
)

In [4]:
# Report the (rows, columns) of the PAO1 table, then preview its first rows
print(pao1_similarity.shape)
pao1_similarity.head()

(5349, 16)


Unnamed: 0_level_0,PA14 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression,pathways present,Related acc genes
PAO1 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
PA2763,PA14_28370,0.223647,1.285228e-61,,,19.694848,24.743003,0.0,5.391723,12.263734,23.681063,278.097141,612.216209,278.097141,[],
PA0687,PA14_55430,0.186869,3.120286e-43,hxcS,,50.518187,124.934061,0.0,2.059509,10.258957,31.659012,1167.426739,15608.519591,1167.426739,['KEGG-Pathway-pae03070: Bacterial secretion s...,
PA0048,PA14_00600,0.172591,4.860103e-37,,,86.414619,96.94468,0.0,25.523418,53.040761,113.42086,823.451891,9398.270927,823.451891,[],
PA2363,PA14_34100,0.397806,2.600695e-202,hsiJ3,,327.426738,370.231163,0.0,69.472696,210.784639,457.7082,3176.807775,137071.114318,3176.807775,['KEGG-Module-M00334: Type VI secretion system'],
PA1171,PA14_49280,0.422021,4.773292e-230,sltB2,,298.032395,158.89245,14.169995,193.150705,276.005155,374.746932,1019.941686,25246.810693,1005.771691,[],


In [5]:
# Report the (rows, columns) of the PA14 table, then preview its first rows
print(pa14_similarity.shape)
pa14_similarity.head()

(5347, 16)


Unnamed: 0_level_0,PAO1 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression,pathways present,Related acc genes
PA14 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
PA14_54660,PA0744,0.416944,5.865822999999999e-224,,,803.543344,1081.81717,14.240564,142.068583,414.380788,913.522137,6057.518156,1170328.0,6043.277593,[],
PA14_30620,PA2588,0.494403,0.0,,,892.601508,1095.977874,15.326451,209.813434,484.465843,1002.523013,7104.684725,1201168.0,7089.358274,[],
PA14_68400,PA5178,0.520295,0.0,,,3134.472215,3559.745121,25.280375,695.070834,2015.410297,4014.206074,25298.790957,12671790.0,25273.510582,[],
PA14_38850,PA1983,0.23995,6.75451e-71,exaB,,35.467321,219.499403,0.0,0.0,3.238127,11.433134,4543.642316,48179.99,4543.642316,[],
PA14_04760,PA0363,0.465735,3.828645e-286,coaD,,355.33834,255.923231,19.537134,198.75217,284.643548,436.779949,2042.651981,65496.7,2023.114847,['KEGG-Pathway-pae00770: Pantothenate and CoA ...,


In [6]:
# Keep only the PA14-aligned expression statistics (plus the pathway and
# related-accessory-gene columns), keyed by the PAO1 homolog id so the rows can
# later be matched against the PAO1-indexed table
pa14_subset = pa14_similarity.set_index("PAO1 homolog id").loc[
    :,
    [
        "mean expression",
        "standard deviation expression",
        "25% expression",
        "50% expression",
        "75% expression",
        "min expression",
        "max expression",
        "variance expression",
        "range expression",
        "pathways present",
        "Related acc genes",
    ],
]

In [7]:
# Report the (rows, columns) of the PA14 subset, then preview its first rows
print(pa14_subset.shape)
pa14_subset.head()

(5347, 11)


Unnamed: 0_level_0,mean expression,standard deviation expression,25% expression,50% expression,75% expression,min expression,max expression,variance expression,range expression,pathways present,Related acc genes
PAO1 homolog id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
PA0744,803.543344,1081.81717,142.068583,414.380788,913.522137,14.240564,6057.518156,1170328.0,6043.277593,[],
PA2588,892.601508,1095.977874,209.813434,484.465843,1002.523013,15.326451,7104.684725,1201168.0,7089.358274,[],
PA5178,3134.472215,3559.745121,695.070834,2015.410297,4014.206074,25.280375,25298.790957,12671790.0,25273.510582,[],
PA1983,35.467321,219.499403,0.0,3.238127,11.433134,0.0,4543.642316,48179.99,4543.642316,[],
PA0363,355.33834,255.923231,198.75217,284.643548,436.779949,19.537134,2042.651981,65496.7,2023.114847,['KEGG-Pathway-pae00770: Pantothenate and CoA ...,


In [8]:
# Combine the PAO1 table with the PA14 expression statistics, matching rows on
# the PAO1 gene id stored in each frame's index. This is an inner join (the
# pandas default), so only ids present in both frames are kept.
# Column names shared by both frames get the "_pao1" / "_p14" suffixes.
# NOTE(review): "_p14" looks like a typo for "_pa14"; left unchanged because the
# suffix becomes part of the output column names -- confirm before renaming.
all_similarity = pd.merge(
    pao1_similarity,
    pa14_subset,
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_pao1", "_p14"),
)

In [9]:
# Report the (rows, columns) of the merged table, then preview its first rows
print(all_similarity.shape)
all_similarity.head()

(5347, 27)


Unnamed: 0,PA14 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression_pao1,standard deviation expression_pao1,min expression_pao1,25% expression_pao1,50% expression_pao1,...,standard deviation expression_p14,25% expression_p14,50% expression_p14,75% expression_p14,min expression_p14,max expression_p14,variance expression_p14,range expression_p14,pathways present_p14,Related acc genes_p14
PA2763,PA14_28370,0.223647,1.285228e-61,,,19.694848,24.743003,0.0,5.391723,12.263734,...,49.808029,7.991056,16.560478,40.186283,0.0,547.002967,2480.839706,547.002967,[],
PA0687,PA14_55430,0.186869,3.120286e-43,hxcS,,50.518187,124.934061,0.0,2.059509,10.258957,...,75.514622,2.101597,7.905863,25.592966,0.0,775.099584,5702.458064,775.099584,['KEGG-Pathway-pae03070: Bacterial secretion s...,
PA0048,PA14_00600,0.172591,4.860103e-37,,,86.414619,96.94468,0.0,25.523418,53.040761,...,55.776274,11.637363,24.35179,51.313567,0.0,593.557573,3110.99278,593.557573,[],
PA2363,PA14_34100,0.397806,2.600695e-202,hsiJ3,,327.426738,370.231163,0.0,69.472696,210.784639,...,745.908938,61.075563,154.455248,391.919946,0.0,11227.048304,556380.143242,11227.048304,['KEGG-Module-M00334: Type VI secretion system'],
PA1171,PA14_49280,0.422021,4.773292e-230,sltB2,,298.032395,158.89245,14.169995,193.150705,276.005155,...,142.557564,129.126722,206.885616,292.197842,24.874447,1057.094057,20322.659085,1032.21961,[],


In [10]:
# Some manual checks
# all_similarity.loc[["PA0744", "PA2588"]]
# pao1_similarity.loc[["PA0744", "PA2588"]]

In [11]:
# Save the merged table as a tab-separated file in the working directory;
# the index (gene ids) is written as the first column
all_similarity.to_csv(
    "all_core_similarity_associations_final_spell.tsv",
    sep="\t",
    index=True,
)