# Find KEGG associations

This notebook creates a table listing the KEGG pathways associated with every gene. We are particularly interested in the pathways associated with the most and least stable genes.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import random
import pandas as pd
from scripts import paths, utils, modules, annotations

# Seed Python's built-in `random` module so any use of it in this notebook is repeatable
random.seed(1)

In [2]:
# Output files: one table per strain (PAO1, PA14), written at the end of the notebook.
# The names are relative, so the files land in the current working directory.
pao1_out_filename = "pao1_core_similarity_associations_spell.tsv"
pa14_out_filename = "pa14_core_similarity_associations_spell.tsv"

In [3]:
# Load transcriptional similarity df
pao1_similarity_scores_filename = "pao1_core_similarity_expression_stats_spell.tsv"
pa14_similarity_scores_filename = "pa14_core_similarity_expression_stats_spell.tsv"

# Both tables share one layout: tab-separated, header on the first row,
# gene ids in the first column (used as the index).
tsv_read_options = {"sep": "\t", "header": 0, "index_col": 0}

pao1_similarity_scores = pd.read_csv(
    pao1_similarity_scores_filename, **tsv_read_options
)
pa14_similarity_scores = pd.read_csv(
    pa14_similarity_scores_filename, **tsv_read_options
)

In [4]:
# Sanity check: print (rows, columns) and preview the first rows of the PAO1 table
print(pao1_similarity_scores.shape)
pao1_similarity_scores.head()

(5349, 14)


Unnamed: 0_level_0,PA14 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression
PAO1 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
PA1094,PA14_50270,0.326832,2.285473e-133,fliD,,4867.543068,5825.361224,0.087195,2109.140118,3597.7815,6033.694993,87576.556941,33934830.0,87576.469746
PA0935,PA14_52160,0.391218,3.7219999999999996e-195,,,271.167829,218.917214,26.136121,162.395953,224.734757,312.468287,4387.771547,47924.75,4361.635426
PA4751,PA14_62860,0.509608,0.0,ftsH,,5598.197708,3969.209125,377.968058,3126.341934,4680.010442,7080.232604,33237.563838,15754620.0,32859.59578
PA3895,PA14_13510,0.29947,2.90347e-111,,,165.526283,73.590673,31.816253,121.229894,152.712377,189.405258,759.16222,5415.587,727.345967
PA4769,PA14_63070,0.28941,1.022809e-103,,,288.985482,192.99679,11.634211,189.881467,244.062901,342.301143,2237.704066,37247.76,2226.069854


In [5]:
# Sanity check: print (rows, columns) and preview the first rows of the PA14 table
print(pa14_similarity_scores.shape)
pa14_similarity_scores.head()

(5348, 14)


Unnamed: 0_level_0,PAO1 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression
PA14 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
PA14_62530,PA4725,0.480002,2.188351e-306,cbrA,,709.762609,407.267235,122.452334,394.988439,594.216562,946.771082,2080.228806,165866.600452,1957.776471
PA14_73030,PA5537,0.263706,8.781443e-86,,,76.710538,88.560858,4.603652,33.40274,53.869774,90.654684,710.651335,7843.025641,706.047683
PA14_21310,PA3302,0.366802,5.357493e-170,phaJ1,,201.544418,96.266755,10.148641,133.162931,186.305003,255.493097,577.839715,9267.288079,567.691075
PA14_46740,PA1357,0.364274,1.617233e-167,,,95.018186,47.996948,8.588149,61.6044,84.536122,118.411355,302.756086,2303.707038,294.167937
PA14_06660,PA0510,0.454295,1.0931199999999999e-270,nirE,,232.242707,558.402225,0.764329,16.743883,42.275799,127.1849,3587.86916,311813.045125,3587.104831


In [6]:
# Location of the KEGG pathway data: a raw GitHub URL in the greenelab/adage
# repository, pinned to a specific commit hash so the content cannot change.
pao1_pathway_filename = "https://raw.githubusercontent.com/greenelab/adage/7a4eda39d360b224268921dc1f2c14b32788ab16/Node_interpretation/pseudomonas_KEGG_terms.txt"

In [7]:
# Read the KEGG pathway file into a DataFrame using the project helper.
# Judging from the preview: rows are indexed by pathway name and column 2 holds
# the collection of PAO1 gene ids in that pathway (it is used for membership
# tests later). Column 1 looks like a gene count - TODO confirm against
# annotations.load_format_KEGG.
pao1_pathways = annotations.load_format_KEGG(pao1_pathway_filename)
pao1_pathways.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,10,"{PA2011, PA3925, PA2003, PA4785, PA1999, PA200..."
KEGG-Pathway-pae00071: Fatty acid degradation,32,"{PA1748, PA2574, PA5020, PA4435, PA3299, PA102..."
KEGG-Pathway-pae00903: Limonene and pinene degradation,9,"{PA2475, PA3426, PA1748, PA4899, PA3331, PA182..."
KEGG-Pathway-pae00380: Tryptophan metabolism,27,"{PA1748, PA3366, PA2147, PA4342, PA1027, PA200..."
KEGG-Pathway-pae00900: Terpenoid backbone biosynthesis,16,"{PA3925, PA3627, PA4669, PA3633, PA3803, PA365..."


## Map pathway annotations to PA14

The annotations we have are only for PAO1 genes, so we will map PAO1 core genes to PA14 core genes in order to annotate PA14. This is possible because we focus only on core genes, which have homologs in both PAO1 and PA14.

In [8]:
# Build the PAO1 -> PA14 gene id mapping from the PAO1 annotation file.
# NOTE(review): presumably the result is indexed by PAO1 gene id and has a
# "PA14_ID" column (later cells rely on both) - confirm against
# utils.get_pao1_pa14_gene_map.
pao1_annotation_filename = paths.GENE_PAO1_ANNOT
gene_mapping_pao1 = utils.get_pao1_pa14_gene_map(pao1_annotation_filename, "pao1")

In [9]:
# Keep only the "PA14_ID" column, as a one-column DataFrame, so that merging
# on this mapping adds the PA14 id and nothing else.
gene_mapping_pao1 = gene_mapping_pao1["PA14_ID"].to_frame()

## Get pathway associations for all genes

In [10]:
def get_associated_pathways(genes_, pathways=None):
    """Look up the pathways that each gene belongs to.

    Parameters
    ----------
    genes_ : iterable of str
        Gene ids to annotate. One output row is produced per entry, in the
        same order (duplicates are kept).
    pathways : pd.DataFrame, optional
        Pathway table indexed by pathway name whose column ``2`` holds the
        collection of gene ids in that pathway. Defaults to the notebook-level
        ``pao1_pathways`` table.

    Returns
    -------
    pd.DataFrame
        Indexed by "gene id", with a single column "pathways present" holding
        the list of pathway names (in table order) that contain the gene. The
        list is empty for genes found in no pathway. An empty ``genes_`` gives
        an empty frame with the same index name and column.
    """
    if pathways is None:
        pathways = pao1_pathways

    # Pair each pathway name with its gene collection once, up front, instead
    # of doing a `.loc` lookup for every (gene, pathway) combination.
    pathway_gene_sets = list(zip(pathways.index, pathways[2]))

    rows = [
        {
            "gene id": gene_id,
            "pathways present": [
                pathway_name
                for pathway_name, member_genes in pathway_gene_sets
                if gene_id in member_genes
            ],
        }
        for gene_id in genes_
    ]

    # Explicit columns keep `set_index` from failing when `genes_` is empty.
    return pd.DataFrame(rows, columns=["gene id", "pathways present"]).set_index(
        "gene id"
    )

In [11]:
# Get KEGG associations for all genes in PAO1
all_pao1_gene_ids = list(pao1_similarity_scores.index)
pao1_associations = get_associated_pathways(all_pao1_gene_ids)

In [12]:
# Sanity check: expect one row per PAO1 gene and a single "pathways present" column
print(pao1_associations.shape)
pao1_associations.head()

(5349, 1)


Unnamed: 0_level_0,pathways present
gene id,Unnamed: 1_level_1
PA1094,[KEGG-Pathway-pae02040: Flagellar assembly]
PA0935,"[KEGG-Pathway-pae00230: Purine metabolism, KEG..."
PA4751,[]
PA3895,[]
PA4769,[]


In [13]:
# Map PA14 gene ids
pa14_associations = pao1_associations.merge(
    gene_mapping_pao1, left_index=True, right_index=True
)
pa14_associations.set_index("PA14_ID", inplace=True)
print(pa14_associations.shape)
pa14_associations.head()

(5349, 1)


Unnamed: 0_level_0,pathways present
PA14_ID,Unnamed: 1_level_1
PA14_50270,[KEGG-Pathway-pae02040: Flagellar assembly]
PA14_52160,"[KEGG-Pathway-pae00230: Purine metabolism, KEG..."
PA14_62860,[]
PA14_13510,[]
PA14_63070,[]


In [14]:
# Merge KEGG associations with transcriptional similarity information
pao1_associations = pao1_similarity_scores.merge(
    pao1_associations, left_index=True, right_index=True, how="left"
)
pa14_associations = pa14_similarity_scores.merge(
    pa14_associations, left_index=True, right_index=True, how="left"
)

In [15]:
# Sanity check: the merged PAO1 table should keep its row count and gain the "pathways present" column
print(pao1_associations.shape)
pao1_associations.head()

(5349, 15)


Unnamed: 0_level_0,PA14 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression,pathways present
PAO1 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
PA1094,PA14_50270,0.326832,2.285473e-133,fliD,,4867.543068,5825.361224,0.087195,2109.140118,3597.7815,6033.694993,87576.556941,33934830.0,87576.469746,[KEGG-Pathway-pae02040: Flagellar assembly]
PA0935,PA14_52160,0.391218,3.7219999999999996e-195,,,271.167829,218.917214,26.136121,162.395953,224.734757,312.468287,4387.771547,47924.75,4361.635426,"[KEGG-Pathway-pae00230: Purine metabolism, KEG..."
PA4751,PA14_62860,0.509608,0.0,ftsH,,5598.197708,3969.209125,377.968058,3126.341934,4680.010442,7080.232604,33237.563838,15754620.0,32859.59578,[]
PA3895,PA14_13510,0.29947,2.90347e-111,,,165.526283,73.590673,31.816253,121.229894,152.712377,189.405258,759.16222,5415.587,727.345967,[]
PA4769,PA14_63070,0.28941,1.022809e-103,,,288.985482,192.99679,11.634211,189.881467,244.062901,342.301143,2237.704066,37247.76,2226.069854,[]


In [16]:
# Sanity check: the merged PA14 table should keep its row count and gain the "pathways present" column
print(pa14_associations.shape)
pa14_associations.head()

(5348, 15)


Unnamed: 0_level_0,PAO1 homolog id,Transcriptional similarity across strains,P-value,Name,label,mean expression,standard deviation expression,min expression,25% expression,50% expression,75% expression,max expression,variance expression,range expression,pathways present
PA14 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
PA14_62530,PA4725,0.480002,2.188351e-306,cbrA,,709.762609,407.267235,122.452334,394.988439,594.216562,946.771082,2080.228806,165866.600452,1957.776471,[]
PA14_73030,PA5537,0.263706,8.781443e-86,,,76.710538,88.560858,4.603652,33.40274,53.869774,90.654684,710.651335,7843.025641,706.047683,[]
PA14_21310,PA3302,0.366802,5.357493e-170,phaJ1,,201.544418,96.266755,10.148641,133.162931,186.305003,255.493097,577.839715,9267.288079,567.691075,[]
PA14_46740,PA1357,0.364274,1.617233e-167,,,95.018186,47.996948,8.588149,61.6044,84.536122,118.411355,302.756086,2303.707038,294.167937,[]
PA14_06660,PA0510,0.454295,1.0931199999999999e-270,nirE,,232.242707,558.402225,0.764329,16.743883,42.275799,127.1849,3587.86916,311813.045125,3587.104831,[KEGG-Pathway-pae00860: Porphyrin and chloroph...


In [17]:
# Save the annotated PAO1 and PA14 tables as tab-separated files
# (the gene-id index is written as the first column)
pao1_associations.to_csv(pao1_out_filename, sep="\t")
pa14_associations.to_csv(pa14_out_filename, sep="\t")