# Find KEGG associations

This notebook creates a table of the KEGG pathways that are associated with the most stable and least stable core genes.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import random
import pandas as pd
from core_acc_modules import paths, utils, modules

# Seed Python's built-in `random` module with a fixed value so any use of it is repeatable
random.seed(1)

In [2]:
# Output files: one tab-separated table per strain, written by the final "Save" cell.
# NOTE: the variable names say "similarity", but these files hold the pathway associations.
pao1_core_stable_similarity_filename = "pao1_core_stable_associations.tsv"
pa14_core_stable_similarity_filename = "pa14_core_stable_associations.tsv"

In [3]:
# Load the per-strain transcriptional similarity tables.
# Both are tab-separated, with a header row and the gene id in the first column.
pao1_similarity_scores_filename = "pao1_similarity_scores.tsv"
pa14_similarity_scores_filename = "pa14_similarity_scores.tsv"

# `read_table` uses a tab delimiter by default
pao1_similarity_scores = pd.read_table(
    pao1_similarity_scores_filename, header=0, index_col=0
)
pa14_similarity_scores = pd.read_table(
    pa14_similarity_scores_filename, header=0, index_col=0
)

In [4]:
# Preview the PAO1 table; its "label" column is what later cells use to pick
# the "most stable" / "least stable" genes
pao1_similarity_scores.head()

Unnamed: 0_level_0,PA14 homolog id,Transcriptional similarity across strains,P-value,Name,label
PAO1 id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PA1842,PA14_40690,0.269544,1.0643069999999999e-89,,
PA3037,PA14_24810,0.318441,2.444181e-126,,
PA2560,PA14_31420,0.321269,1.1074830000000001e-128,,
PA3677,PA14_16800,0.212203,1.6288020000000002e-55,mexJ,
PA5381,PA14_71080,0.380869,3.054638e-184,,


In [5]:
# Load KEGG pathway data from the ADAGE repository (URL is pinned to a specific commit).
# The file has no header row, so columns get integer labels; the first column becomes the index.
pao1_pathway_filename = "https://raw.githubusercontent.com/greenelab/adage/7a4eda39d360b224268921dc1f2c14b32788ab16/Node_interpretation/pseudomonas_KEGG_terms.txt"

# `read_table` uses a tab delimiter by default
pao1_pathways = pd.read_table(pao1_pathway_filename, header=None, index_col=0)

In [6]:
# Turn each pathway's ";"-delimited gene string (column 2) into a set of gene ids.
# Values that are already sets are kept as-is so this cell can be re-run without
# reloading the table (re-applying `.str.split` to sets yields NaN, and `set(NaN)` raises).
pao1_pathways[2] = pao1_pathways[2].apply(
    lambda genes: genes if isinstance(genes, set) else set(genes.split(";"))
)

# Keep only the text before the first " - " in each pathway label
pao1_pathways.index = pao1_pathways.index.str.split(" - ").str[0]
pao1_pathways.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,10,"{PA2553, PA3925, PA1736, PA2003, PA2011, PA200..."
KEGG-Pathway-pae00071: Fatty acid degradation,32,"{PA3300, PA3014, PA3589, PA5020, PA1736, PA392..."
KEGG-Pathway-pae00903: Limonene and pinene degradation,9,"{PA3014, PA2475, PA4899, PA1027, PA1748, PA342..."
KEGG-Pathway-pae00380: Tryptophan metabolism,27,"{PA1585, PA2579, PA3014, PA0704, PA2147, PA358..."
KEGG-Pathway-pae00900: Terpenoid backbone biosynthesis,16,"{PA2553, PA3925, PA1736, PA3650, PA3633, PA362..."


## Pathway annotations to PA14

The annotations we have are only for PAO1 genes, so we will map PAO1 core genes to PA14 core genes to add annotations to PA14. This is possible since we are focused only on core genes, which have homologs between PAO1 and PA14.

In [7]:
# Build the PAO1 -> PA14 gene id mapping from the PAO1 annotation file.
# NOTE(review): `get_pao1_pa14_gene_map` lives in core_acc_modules.utils; later cells
# rely on the result being indexed by PAO1 id and having a "PA14_ID" column — confirm.
pao1_annotation_filename = paths.GENE_PAO1_ANNOT
gene_mapping_pao1 = utils.get_pao1_pa14_gene_map(pao1_annotation_filename, "pao1")

In [8]:
# Keep only the PA14 id column, as a single-column DataFrame
gene_mapping_pao1 = gene_mapping_pao1[["PA14_ID"]]

## Get pathway associations for most and least stable genes

In [9]:
# Gene ids (index values) of the core genes labelled as most / least stable
most_stable_genes = pao1_similarity_scores.loc[
    pao1_similarity_scores["label"].eq("most stable")
].index
least_stable_genes = pao1_similarity_scores.loc[
    pao1_similarity_scores["label"].eq("least stable")
].index

In [10]:
def get_associated_pathways(genes_, pathways=None):
    """Find the pathways that contain each gene.

    Arguments
    ---------
    genes_: iterable of str
        Gene ids to look up.
    pathways: pd.DataFrame, optional
        Table indexed by pathway label whose column `2` holds the set of
        gene ids belonging to that pathway. Defaults to the notebook-level
        `pao1_pathways` table.

    Returns
    -------
    pd.DataFrame
        One row per gene in `genes_` (input order) with columns
        "gene id" and "pathways present". "pathways present" is a list of
        pathway labels, in table order, and is empty when no pathway
        contains the gene.
    """
    if pathways is None:
        pathways = pao1_pathways

    # Pair every pathway label with its gene set once, up front. Iterating the
    # values directly (instead of `.loc[label, 2]` per gene) avoids repeated
    # lookups and stays correct if a label occurs more than once in the index.
    pathway_members = list(zip(pathways.index, pathways[2]))

    rows = [
        {
            "gene id": gene_id,
            "pathways present": [
                label for label, members in pathway_members if gene_id in members
            ],
        }
        for gene_id in genes_
    ]

    # Explicit columns keep the schema stable even when `genes_` is empty
    return pd.DataFrame(rows, columns=["gene id", "pathways present"])

In [11]:
# Pathways containing each of the most stable core genes (empty list = no pathway found)
most_stable_associations = get_associated_pathways(most_stable_genes)
most_stable_associations.head()

Unnamed: 0,gene id,pathways present
0,PA4112,[]
1,PA4414,[KEGG-Pathway-pae00471: D-Glutamine and D-glut...
2,PA4481,[]
3,PA3449,"[KEGG-Pathway-pae00920: Sulfur metabolism, KEG..."
4,PA4563,"[KEGG-Pathway-pae03010: Ribosome, KEGG-Module-..."


In [12]:
# Pathways containing each of the least stable core genes (empty list = no pathway found)
least_stable_associations = get_associated_pathways(least_stable_genes)
least_stable_associations.head()

Unnamed: 0,gene id,pathways present
0,PA0850,[]
1,PA2283,[]
2,PA0346,[]
3,PA1633,[]
4,PA1195,[]


In [13]:
# Add label for most and least stable core genes
most_stable_associations["label"] = "most stable"
least_stable_associations["label"] = "least stable"

In [14]:
# Concatenate dataframes
pao1_all_associations = pd.concat([most_stable_associations, least_stable_associations])
pao1_all_associations.set_index("gene id", inplace=True)
pao1_all_associations.head()

# TO DO: Rename index col

Unnamed: 0_level_0,pathways present,label
gene id,Unnamed: 1_level_1,Unnamed: 2_level_1
PA4112,[],most stable
PA4414,[KEGG-Pathway-pae00471: D-Glutamine and D-glut...,most stable
PA4481,[],most stable
PA3449,"[KEGG-Pathway-pae00920: Sulfur metabolism, KEG...",most stable
PA4563,"[KEGG-Pathway-pae03010: Ribosome, KEGG-Module-...",most stable


In [15]:
# Map PA14 gene ids
pa14_all_associations = pao1_all_associations.merge(
    gene_mapping_pao1, left_index=True, right_index=True
)
pa14_all_associations.set_index("PA14_ID", inplace=True)
pa14_all_associations.head()

Unnamed: 0_level_0,pathways present,label
PA14_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
PA14_10770,[],most stable
PA14_57370,[KEGG-Pathway-pae00471: D-Glutamine and D-glut...,most stable
PA14_58150,[],most stable
PA14_19500,"[KEGG-Pathway-pae00920: Sulfur metabolism, KEG...",most stable
PA14_60400,"[KEGG-Pathway-pae03010: Ribosome, KEGG-Module-...",most stable


In [16]:
# Save
pao1_all_associations.to_csv(pao1_core_stable_similarity_filename, sep="\t")
pa14_all_associations.to_csv(pa14_core_stable_similarity_filename, sep="\t")