# Find KEGG associations

This notebook creates, for each strain (PAO1 and PA14), a table listing the KEGG pathways associated with every gene. We are particularly interested in the pathways associated with the most and least stable genes.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import random
import pandas as pd
from scripts import paths, utils, modules, annotations

# Seed Python's built-in `random` module with a fixed value.
# NOTE(review): `random` is not used in any other visible cell -- confirm
# whether this seed is still needed.
random.seed(1)

In [2]:
# Output files: one tab-separated table per strain, written by the final
# "Save" cell of this notebook
pao1_out_filename = "pao1_core_similarity_associations_spell.tsv"
pa14_out_filename = "pa14_core_similarity_associations_spell.tsv"

In [3]:
# Transcriptional similarity tables, one per strain.
# Each file is tab-separated with a header row; its first column (the gene
# id) becomes the DataFrame index.
pao1_similarity_scores_filename = "pao1_core_similarity_expression_stats_spell.tsv"
pa14_similarity_scores_filename = "pa14_core_similarity_expression_stats_spell.tsv"

pao1_similarity_scores, pa14_similarity_scores = (
    pd.read_csv(similarity_filename, sep="\t", header=0, index_col=0)
    for similarity_filename in (
        pao1_similarity_scores_filename,
        pa14_similarity_scores_filename,
    )
)

In [4]:
# Sanity check on the PAO1 similarity table: (rows, columns), then the first rows
print(pao1_similarity_scores.shape)
pao1_similarity_scores.head()

(5349, 14)


Unnamed: 0_level_0,PA14 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression
PAO1 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
PA0118,PA14_01440,0.290443,1.771504e-104,,,84.379666,87.012773,0.0,30.80525,56.083433,98.678794,743.594397,7571.223,743.594397
PA1859,PA14_40440,0.143328,5.986934e-26,,,106.687908,91.482549,0.0,50.280627,78.021322,118.345717,584.117921,8369.057,584.117921
PA3190,PA14_22980,0.257508,9.17062e-82,,,2974.728597,4619.591491,0.0,477.708748,1386.13806,3291.525419,41815.193043,21340630.0,41815.193043
PA1009,PA14_51280,0.54995,0.0,,,448.628362,387.755818,18.128903,238.075927,370.338444,543.930949,5826.229102,150354.6,5808.100199
PA1065,PA14_50620,0.329633,9.188709e-136,,,105.604816,85.534956,0.0,47.418993,82.907779,144.322724,707.184303,7316.229,707.184303


In [5]:
# Sanity check on the PA14 similarity table: (rows, columns), then the first rows
print(pa14_similarity_scores.shape)
pa14_similarity_scores.head()

(5347, 14)


Unnamed: 0_level_0,PAO1 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression
PA14 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
PA14_23690,PA3125,0.185094,2.030432e-42,,,85.595452,101.121951,0.0,25.546082,51.093537,104.536787,709.117909,10225.65,709.117909
PA14_11480,PA4049,0.327795,3.8691119999999996e-134,,,198.132451,121.601925,0.0,125.11967,168.451503,230.506602,1084.278966,14787.03,1084.278966
PA14_01150,PA0094,0.418704,4.9895729999999995e-226,,,105.290577,111.218985,0.0,39.297006,75.807135,124.150007,860.205738,12369.66,860.205738
PA14_11460,PA4051,0.541759,0.0,thiL,,234.39525,177.782645,16.650153,110.935215,188.025557,300.223243,1008.676242,31606.67,992.026089
PA14_56780,PA4366,0.482227,1.451201e-309,sodB,,7781.835203,7633.163594,412.593023,3450.004296,5718.029115,8591.327252,72482.666616,58265190.0,72070.073593


## Load KEGG annotations

In [6]:
# KEGG pathway annotation files, one per strain (read as tab-separated
# tables in the next cell)
pao1_pathway_filename = "pao1_kegg_annot.tsv"
pa14_pathway_filename = "pa14_kegg_annot.tsv"

In [7]:
# Read the per-strain KEGG annotation tables (tab-separated, header row,
# first column used as the index)
pao1_pathways, pa14_pathways = (
    pd.read_csv(pathway_filename, sep="\t", header=0, index_col=0)
    for pathway_filename in (pao1_pathway_filename, pa14_pathway_filename)
)

In [8]:
# Sanity check on the PAO1 KEGG table: one row per pathway. Note that
# `gene_ids` is displayed with quotes, i.e. it is held as a string here.
print(pao1_pathways.shape)
pao1_pathways.head()

(123, 3)


Unnamed: 0_level_0,pathway_name,num_genes,gene_ids
pathway_id_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
path:pae00010 : Glycolysis / Gluconeogenesis,Glycolysis / Gluconeogenesis,37,"['PA3193', 'PA4732', 'PA5110', 'PA0555', 'PA47..."
path:pae00020 : Citrate cycle (TCA cycle),Citrate cycle (TCA cycle),28,"['PA1580', 'PA1562', 'PA1787', 'PA2623', 'PA26..."
path:pae00030 : Pentose phosphate pathway,Pentose phosphate pathway,28,"['PA4732', 'PA5439', 'PA3183', 'PA3182', 'PA42..."
path:pae00040 : Pentose and glucuronate interconversions,Pentose and glucuronate interconversions,8,"['PA2022', 'PA3559', 'PA2023', 'PA0607', 'PA23..."
path:pae00051 : Fructose and mannose metabolism,Fructose and mannose metabolism,19,"['PA2344', 'PA3551', 'PA2232', 'PA5452', 'PA53..."


In [9]:
# Sanity check on the PA14 KEGG table: (rows, columns), then the first rows
print(pa14_pathways.shape)
pa14_pathways.head()

(123, 3)


Unnamed: 0_level_0,pathway_name,num_genes,gene_ids
pathway_id_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
path:pau00010 : Glycolysis / Gluconeogenesis,Glycolysis / Gluconeogenesis,37,"['PA14_22930', 'PA14_62620', 'PA14_67490', 'PA..."
path:pau00020 : Citrate cycle (TCA cycle),Citrate cycle (TCA cycle),28,"['PA14_44070', 'PA14_44290', 'PA14_41470', 'PA..."
path:pau00030 : Pentose phosphate pathway,Pentose phosphate pathway,28,"['PA14_62620', 'PA14_71800', 'PA14_23070', 'PA..."
path:pau00040 : Pentose and glucuronate interconversions,Pentose and glucuronate interconversions,8,"['PA14_38360', 'PA14_18300', 'PA14_38350', 'PA..."
path:pau00051 : Fructose and mannose metabolism,Fructose and mannose metabolism,18,"['PA14_34340', 'PA14_18380', 'PA14_71970', 'PA..."


## Get pathway associations for all genes

In [14]:
def get_associated_pathways(genes_, pathway_df):
    """
    Look up, for every gene, the pathways whose gene list contains it.

    Arguments
    ---------
    genes_: iterable of str
        Gene ids to annotate. The output has one row per element, in the
        same order (repeated ids give repeated rows).
    pathway_df: pandas.DataFrame
        One row per pathway (the index labels are what gets reported).
        Must have a "gene_ids" column holding the member genes of each
        pathway, either as a list-like of ids or as the string
        representation of a Python list (e.g. "['PA0001', 'PA0002']"),
        which is how the column looks after being read from a TSV file.
        Missing values (None/NaN) are treated as an empty gene list.

    Returns
    -------
    pandas.DataFrame
        Indexed by "gene id" with a single column "pathways present": the
        list of `pathway_df` index labels (in `pathway_df` order) whose gene
        list contains that gene; an empty list when there is none.

    Raises
    ------
    ValueError
        If a string in "gene_ids" cannot be parsed as a Python literal.
    """
    # Local import keeps this cell self-contained
    import ast

    # Parse every pathway's gene list once and invert it into a
    # gene -> [pathways] map. Membership is then an exact id match; testing
    # `gene_id in <string>` would be a substring search and could match
    # ids that merely share a prefix with a member gene.
    gene_to_pathways = {}
    for pathway, raw_gene_ids in pathway_df["gene_ids"].items():
        if isinstance(raw_gene_ids, str):
            try:
                member_ids = ast.literal_eval(raw_gene_ids)
            except (ValueError, SyntaxError) as err:
                raise ValueError(
                    f"Could not parse gene_ids for pathway {pathway!r}: "
                    f"{raw_gene_ids!r}"
                ) from err
        elif raw_gene_ids is None or (
            isinstance(raw_gene_ids, float) and raw_gene_ids != raw_gene_ids
        ):
            # None / NaN: pathway without any annotated gene
            member_ids = []
        else:
            member_ids = raw_gene_ids

        # set(): a gene listed twice in one pathway is still reported once
        for member_id in set(member_ids):
            gene_to_pathways.setdefault(member_id, []).append(pathway)

    rows = [
        {
            "gene id": gene_id,
            # Copy so that rows never share one list object
            "pathways present": list(gene_to_pathways.get(gene_id, [])),
        }
        for gene_id in genes_
    ]

    # Explicit columns so that an empty `genes_` still yields a valid
    # (empty) table instead of a KeyError in set_index
    return pd.DataFrame(rows, columns=["gene id", "pathways present"]).set_index(
        "gene id"
    )

In [15]:
# Get KEGG associations for all genes in PAO1
# (every gene id in the similarity table index, in table order)
all_pao1_gene_ids = list(pao1_similarity_scores.index)
pao1_associations = get_associated_pathways(all_pao1_gene_ids, pao1_pathways)

In [16]:
# One row per PAO1 gene; "pathways present" is an empty list for genes that
# are in no KEGG pathway
print(pao1_associations.shape)
pao1_associations.head()

(5349, 1)


Unnamed: 0_level_0,pathways present
gene id,Unnamed: 1_level_1
PA0118,[]
PA1859,[]
PA3190,[path:pae02010 : ABC transporters]
PA1009,[]
PA1065,[]


In [17]:
# Get KEGG associations for all genes in PA14
# (every gene id in the similarity table index, in table order)
all_pa14_gene_ids = list(pa14_similarity_scores.index)
pa14_associations = get_associated_pathways(all_pa14_gene_ids, pa14_pathways)

In [18]:
# One row per PA14 gene; "pathways present" is an empty list for genes that
# are in no KEGG pathway
print(pa14_associations.shape)
pa14_associations.head()

(5347, 1)


Unnamed: 0_level_0,pathways present
gene id,Unnamed: 1_level_1
PA14_23690,[]
PA14_11480,[]
PA14_01150,[]
PA14_11460,[path:pau00730 : Thiamine metabolism]
PA14_56780,[]


In [20]:
# Merge KEGG associations with transcriptional similarity information.
# `*_associations` is overwritten with the merged table, so only its
# "pathways present" column is taken as the right-hand side: re-running this
# cell then reproduces the same table instead of merging the already-merged
# frame back in (which would duplicate every column with `_x`/`_y` suffixes).
pao1_associations = pao1_similarity_scores.merge(
    pao1_associations[["pathways present"]],
    left_index=True,
    right_index=True,
    how="left",
)
pa14_associations = pa14_similarity_scores.merge(
    pa14_associations[["pathways present"]],
    left_index=True,
    right_index=True,
    how="left",
)

In [21]:
# After the left merge: same rows as the PAO1 similarity table, plus the
# "pathways present" column
print(pao1_associations.shape)
pao1_associations.head()

(5349, 15)


Unnamed: 0_level_0,PA14 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression,pathways present
PAO1 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
PA0118,PA14_01440,0.290443,1.771504e-104,,,84.379666,87.012773,0.0,30.80525,56.083433,98.678794,743.594397,7571.223,743.594397,[]
PA1859,PA14_40440,0.143328,5.986934e-26,,,106.687908,91.482549,0.0,50.280627,78.021322,118.345717,584.117921,8369.057,584.117921,[]
PA3190,PA14_22980,0.257508,9.17062e-82,,,2974.728597,4619.591491,0.0,477.708748,1386.13806,3291.525419,41815.193043,21340630.0,41815.193043,[path:pae02010 : ABC transporters]
PA1009,PA14_51280,0.54995,0.0,,,448.628362,387.755818,18.128903,238.075927,370.338444,543.930949,5826.229102,150354.6,5808.100199,[]
PA1065,PA14_50620,0.329633,9.188709e-136,,,105.604816,85.534956,0.0,47.418993,82.907779,144.322724,707.184303,7316.229,707.184303,[]


In [22]:
# After the left merge: same rows as the PA14 similarity table, plus the
# "pathways present" column
print(pa14_associations.shape)
pa14_associations.head()

(5347, 15)


Unnamed: 0_level_0,PAO1 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression,pathways present
PA14 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
PA14_23690,PA3125,0.185094,2.030432e-42,,,85.595452,101.121951,0.0,25.546082,51.093537,104.536787,709.117909,10225.65,709.117909,[]
PA14_11480,PA4049,0.327795,3.8691119999999996e-134,,,198.132451,121.601925,0.0,125.11967,168.451503,230.506602,1084.278966,14787.03,1084.278966,[]
PA14_01150,PA0094,0.418704,4.9895729999999995e-226,,,105.290577,111.218985,0.0,39.297006,75.807135,124.150007,860.205738,12369.66,860.205738,[]
PA14_11460,PA4051,0.541759,0.0,thiL,,234.39525,177.782645,16.650153,110.935215,188.025557,300.223243,1008.676242,31606.67,992.026089,[path:pau00730 : Thiamine metabolism]
PA14_56780,PA4366,0.482227,1.451201e-309,sodB,,7781.835203,7633.163594,412.593023,3450.004296,5718.029115,8591.327252,72482.666616,58265190.0,72070.073593,[]


In [23]:
# Save the merged tables as tab-separated files (index written as the first column).
# NOTE(review): "pathways present" holds Python lists, which are written as
# their string representation -- code reading these files back gets strings
# and has to parse them (e.g. with ast.literal_eval).
pao1_associations.to_csv(pao1_out_filename, sep="\t")
pa14_associations.to_csv(pa14_out_filename, sep="\t")