# Format summary tables

This notebook reads the summary tables for PAO1 and PA14 and merges them into a single table. The merged table uses the transcriptional similarity values for PAO1-aligned genes and includes expression statistics for both PAO1-aligned and PA14-aligned genes.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import random
import pandas as pd
from scripts import paths, utils

# Seed Python's built-in `random` generator with a fixed value.
# NOTE(review): no visible cell in this notebook draws from `random` -- confirm
# whether the seed is still needed.
random.seed(1)

In [2]:
# Input files: per-strain tables of similarity scores and annotations.
# The similarity results are similar for both strains, so only the scores from
# one strain type are needed; the other table is still loaded below for its
# expression statistics.
pao1_similarity_filename = "pao1_core_similarity_associations_final_spell.tsv"
pa14_similarity_filename = "pa14_core_similarity_associations_final_spell.tsv"

In [3]:
# Load both summary tables with identical parsing options: tab-separated,
# first row as the header, first column (the gene id) as the index.
tsv_read_options = {"sep": "\t", "index_col": 0, "header": 0}
pao1_similarity = pd.read_csv(pao1_similarity_filename, **tsv_read_options)
pa14_similarity = pd.read_csv(pa14_similarity_filename, **tsv_read_options)

In [4]:
# Sanity check of the PAO1-aligned table: (rows, columns), then the first 5 rows
print(pao1_similarity.shape)
pao1_similarity.head(n=5)

(5349, 16)


Unnamed: 0_level_0,PA14 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression,pathways present,Related acc genes
PAO1 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
PA0118,PA14_01440,0.290443,1.771504e-104,,,84.379666,87.012773,0.0,30.80525,56.083433,98.678794,743.594397,7571.223,743.594397,[],
PA1859,PA14_40440,0.143328,5.986934e-26,,,106.687908,91.482549,0.0,50.280627,78.021322,118.345717,584.117921,8369.057,584.117921,[],
PA3190,PA14_22980,0.257508,9.17062e-82,,,2974.728597,4619.591491,0.0,477.708748,1386.13806,3291.525419,41815.193043,21340630.0,41815.193043,['path:pae02010 : ABC transporters'],
PA1009,PA14_51280,0.54995,0.0,,,448.628362,387.755818,18.128903,238.075927,370.338444,543.930949,5826.229102,150354.6,5808.100199,[],
PA1065,PA14_50620,0.329633,9.188709e-136,,,105.604816,85.534956,0.0,47.418993,82.907779,144.322724,707.184303,7316.229,707.184303,[],


In [5]:
# Sanity check of the PA14-aligned table: (rows, columns), then the first 5 rows
print(pa14_similarity.shape)
pa14_similarity.head(n=5)

(5347, 16)


Unnamed: 0_level_0,PAO1 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression,pathways present,Related acc genes
PA14 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
PA14_23690,PA3125,0.185094,2.030432e-42,,,85.595452,101.121951,0.0,25.546082,51.093537,104.536787,709.117909,10225.65,709.117909,[],
PA14_11480,PA4049,0.327795,3.8691119999999996e-134,,,198.132451,121.601925,0.0,125.11967,168.451503,230.506602,1084.278966,14787.03,1084.278966,[],
PA14_01150,PA0094,0.418704,4.9895729999999995e-226,,,105.290577,111.218985,0.0,39.297006,75.807135,124.150007,860.205738,12369.66,860.205738,[],
PA14_11460,PA4051,0.541759,0.0,thiL,,234.39525,177.782645,16.650153,110.935215,188.025557,300.223243,1008.676242,31606.67,992.026089,['path:pau00730 : Thiamine metabolism'],
PA14_56780,PA4366,0.482227,1.451201e-309,sodB,,7781.835203,7633.163594,412.593023,3450.004296,5718.029115,8591.327252,72482.666616,58265190.0,72070.073593,[],


In [6]:
# Keep only the PA14-aligned expression statistics (plus the pathway and
# related-accessory-gene annotations) and re-key the rows by PAO1 homolog id,
# so the table can later be aligned with the PAO1-indexed table.
pa14_stat_columns = [
    "mean expression",
    "standard deviation expression",
    "25% expression",
    "50% expression",
    "75% expression",
    "min expression",
    "max expression",
    "variance expression",
    "range expression",
    "pathways present",
    "Related acc genes",
]
pa14_selected = pa14_similarity.loc[:, ["PAO1 homolog id", *pa14_stat_columns]]
pa14_subset = pa14_selected.set_index("PAO1 homolog id")

In [7]:
# Sanity check of the re-indexed PA14 subset: (rows, columns), then the first 5 rows
print(pa14_subset.shape)
pa14_subset.head(n=5)

(5347, 11)


Unnamed: 0_level_0,mean expression,standard deviation expression,25% expression,50% expression,75% expression,min expression,max expression,variance expression,range expression,pathways present,Related acc genes
PAO1 homolog id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
PA3125,85.595452,101.121951,25.546082,51.093537,104.536787,0.0,709.117909,10225.65,709.117909,[],
PA4049,198.132451,121.601925,125.11967,168.451503,230.506602,0.0,1084.278966,14787.03,1084.278966,[],
PA0094,105.290577,111.218985,39.297006,75.807135,124.150007,0.0,860.205738,12369.66,860.205738,[],
PA4051,234.39525,177.782645,110.935215,188.025557,300.223243,16.650153,1008.676242,31606.67,992.026089,['path:pau00730 : Thiamine metabolism'],
PA4366,7781.835203,7633.163594,3450.004296,5718.029115,8591.327252,412.593023,72482.666616,58265190.0,72070.073593,[],


In [8]:
# Join the PAO1 table with the PA14 statistics on their shared index (PAO1 gene
# id). This is an inner join, so genes missing from either table are dropped.
# Columns present in both tables get a strain suffix.
# NOTE(review): the PA14 suffix is spelled "_p14" (not "_pa14"). It is left
# unchanged because the column names of the written output depend on it --
# confirm with downstream consumers before renaming.
all_similarity = pd.merge(
    pao1_similarity,
    pa14_subset,
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=["_pao1", "_p14"],
)

In [9]:
# Sanity check of the merged table: (rows, columns), then the first 5 rows
print(all_similarity.shape)
all_similarity.head(n=5)

(5347, 27)


Unnamed: 0,PA14 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression_pao1,standard deviation expression_pao1,min expression_pao1,25% expression_pao1,50% expression_pao1,...,standard deviation expression_p14,25% expression_p14,50% expression_p14,75% expression_p14,min expression_p14,max expression_p14,variance expression_p14,range expression_p14,pathways present_p14,Related acc genes_p14
PA0118,PA14_01440,0.290443,1.771504e-104,,,84.379666,87.012773,0.0,30.80525,56.083433,...,80.368739,23.079151,38.107897,57.933632,0.0,662.245143,6459.134,662.245143,[],
PA1859,PA14_40440,0.143328,5.986934e-26,,,106.687908,91.482549,0.0,50.280627,78.021322,...,57.028168,51.071843,72.575326,105.626428,0.0,400.928716,3252.212,400.928716,[],
PA3190,PA14_22980,0.257508,9.17062e-82,,,2974.728597,4619.591491,0.0,477.708748,1386.13806,...,5428.865874,682.52039,2268.790469,5222.390596,0.0,77365.371337,29472580.0,77365.371337,['path:pau02010 : ABC transporters'],
PA1009,PA14_51280,0.54995,0.0,,,448.628362,387.755818,18.128903,238.075927,370.338444,...,282.615721,170.776065,274.443598,462.828788,13.871988,2095.679114,79871.65,2081.807126,[],
PA1065,PA14_50620,0.329633,9.188709e-136,,,105.604816,85.534956,0.0,47.418993,82.907779,...,86.037792,40.85693,74.832672,115.490996,3.104826,1253.535657,7402.502,1250.430831,[],


In [10]:
# Some manual checks
# all_similarity.loc[["PA0744", "PA2588"]]
# pao1_similarity.loc[["PA0744", "PA2588"]]

In [11]:
# Write the merged table as a tab-separated file (index = PAO1 gene id)
output_filename = "all_core_similarity_associations_final_spell.tsv"
all_similarity.to_csv(output_filename, sep="\t")