# Add annotations

This notebook takes the dataframe with information about module composition and their labels and adds additional annotations including:

1. Which gene is contained within the modules (both gene id and gene name)
2. KEGG pathways that genes are found in
3. GO pathways genes are found in
4. Regulon/operon genes are found in

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import random
import scipy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from core_acc_modules import utils, paths

random.seed(1)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
# User param
method = "affinity"

In [3]:
# Import module labels
pao1_module_label_filename = os.path.join(
    paths.LOCAL_DATA_DIR, "pao1_gene_module_labels.tsv"
)
pa14_module_label_filename = os.path.join(
    paths.LOCAL_DATA_DIR, "pa14_gene_module_labels.tsv"
)

pao1_module_labels = pd.read_csv(
    pao1_module_label_filename, sep="\t", index_col=0, header=0
)
pa14_module_labels = pd.read_csv(
    pa14_module_label_filename, sep="\t", index_col=0, header=0
)

In [4]:
# Import gene memberships
pao1_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR, f"pao1_modules_{method}.tsv"
)
pa14_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR, f"pa14_modules_{method}.tsv"
)

pao1_membership = pd.read_csv(pao1_membership_filename, sep="\t", index_col=0, header=0)
pa14_membership = pd.read_csv(pa14_membership_filename, sep="\t", index_col=0, header=0)

In [5]:
# Import gene metadata
pao1_gene_annot_filename = paths.GENE_PAO1_ANNOT
pa14_gene_annot_filename = paths.GENE_PA14_ANNOT

pao1_gene_annot = pd.read_csv(pao1_gene_annot_filename, index_col=0, header=0)
pa14_gene_annot = pd.read_csv(pa14_gene_annot_filename, index_col=0, header=0)

In [6]:
pao1_gene_annot = pao1_gene_annot["Name"].to_frame("gene name")
pa14_gene_annot = pa14_gene_annot["Name"].to_frame("gene name")

## Add module labels

In [7]:
# Add module labels
pao1_gene_module_labels = pao1_membership.merge(
    pao1_module_labels, left_on="module id", right_index=True
)
pa14_gene_module_labels = pa14_membership.merge(
    pa14_module_labels, left_on="module id", right_index=True
)

## Add gene names

In [8]:
# Add gene names
pao1_gene_module_labels = pao1_gene_module_labels.merge(
    pao1_gene_annot, left_index=True, right_index=True
)
pa14_gene_module_labels = pa14_gene_module_labels.merge(
    pa14_gene_annot, left_index=True, right_index=True
)

In [9]:
print(pao1_gene_module_labels.shape)
pao1_gene_module_labels.head()

(5563, 8)


Unnamed: 0,module id,num core,num acc,odds ratio,p-value,module label,size,gene name
PA0001,390,12,0,inf,1.0,mixed,12,dnaA
PA0006,390,12,0,inf,1.0,mixed,12,
PA1741,390,12,0,inf,1.0,mixed,12,
PA2558,390,12,0,inf,1.0,mixed,12,
PA3817,390,12,0,inf,1.0,mixed,12,


In [10]:
print(pa14_gene_module_labels.shape)
pa14_gene_module_labels.head()

(5891, 8)


Unnamed: 0,module id,num core,num acc,odds ratio,p-value,module label,size,gene name
PA14_55610,425,11,0,inf,0.614116,mixed,11,dnaE2
PA14_55590,425,11,0,inf,0.614116,mixed,11,
PA14_40370,425,11,0,inf,0.614116,mixed,11,
PA14_25160,425,11,0,inf,0.614116,mixed,11,lexA
PA14_25150,425,11,0,inf,0.614116,mixed,11,


## Add KEGG pathways

For each pathway, what genes are contained in it

In [11]:
pao1_pathway_filename = "https://raw.githubusercontent.com/greenelab/adage/master/Node_interpretation/pseudomonas_KEGG_terms.txt"

pao1_pathways = pd.read_csv(pao1_pathway_filename, sep="\t", index_col=0, header=None)

In [12]:
pao1_pathways[2] = pao1_pathways[2].str.split(";").apply(set)
pao1_pathways.index = pao1_pathways.index.str.split(" - ").str[0]
pao1_pathways.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,10,"{PA2003, PA3925, PA4785, PA1736, PA2000, PA255..."
KEGG-Pathway-pae00071: Fatty acid degradation,32,"{PA2550, PA4899, PA3454, PA3299, PA3014, PA173..."
KEGG-Pathway-pae00903: Limonene and pinene degradation,9,"{PA1737, PA2475, PA4899, PA1748, PA1821, PA301..."
KEGG-Pathway-pae00380: Tryptophan metabolism,27,"{PA4899, PA3014, PA4342, PA0421, PA1737, PA392..."
KEGG-Pathway-pae00900: Terpenoid backbone biosynthesis,16,"{PA3633, PA3925, PA4785, PA4669, PA3650, PA456..."


In [13]:
gene_to_pathways_df = pd.DataFrame(
    index=pao1_gene_module_labels.index, columns=list(pao1_pathways.index)
)

In [14]:
%%time
for gene in gene_to_pathways_df.index:
    gene_to_pathways_df.loc[gene] = [
        gene in pao1_pathways.loc[pathway, 2] for pathway in pao1_pathways.index
    ]

CPU times: user 2min 26s, sys: 3.96 ms, total: 2min 26s
Wall time: 2min 26s


## Add operon

For each operon, what genes are contained in it

NOTE: This code takes a while to run so for now its commented out

In [15]:
pao1_operon_filename = "https://raw.githubusercontent.com/greenelab/core-accessory-interactome/6635c0e357c0172c2cebd0368648030e0ee4beaf/data/metadata/operons_format.csv"

pao1_operons = pd.read_csv(pao1_operon_filename, index_col=0, header=0)

In [16]:
pao1_operons.head()

Unnamed: 0_level_0,Length,Genes
OperonID,Unnamed: 1_level_1,Unnamed: 2_level_1
12029,4,PA0001;PA0002;PA0003;PA0004
12030,2,PA0005;PA0006
12031,2,PA0008;PA0009
12032,3,PA0016;PA0017;PA0018
12033,2,PA0021;PA0022


In [17]:
pao1_operons["Genes"] = pao1_operons["Genes"].str.split(";").apply(set)
pao1_operons.head()

Unnamed: 0_level_0,Length,Genes
OperonID,Unnamed: 1_level_1,Unnamed: 2_level_1
12029,4,"{PA0004, PA0001, PA0002, PA0003}"
12030,2,"{PA0005, PA0006}"
12031,2,"{PA0009, PA0008}"
12032,3,"{PA0016, PA0017, PA0018}"
12033,2,"{PA0021, PA0022}"


In [18]:
# Remove operons with a single gene
pao1_operons = pao1_operons[pao1_operons["Genes"].apply(len) > 1]

In [19]:
gene_to_operons_df = pd.DataFrame(
    index=pao1_gene_module_labels.index, columns=list(pao1_operons.index)
)

In [20]:
%%time
for gene in gene_to_operons_df.index:
    gene_to_operons_df.loc[gene] = [
        gene in pao1_operons.loc[operon, "Genes"] for operon in pao1_operons.index
    ]

CPU times: user 18min 58s, sys: 1min 56s, total: 20min 55s
Wall time: 20min 54s


In [21]:
# Add operons to pathway annotations
pao1_gene_annot = gene_to_pathways_df.merge(
    gene_to_operons_df, left_index=True, right_index=True, how="outer"
)

In [22]:
print(pao1_gene_annot.shape)
pao1_gene_annot.head()

(5563, 1308)


Unnamed: 0,KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,KEGG-Pathway-pae00071: Fatty acid degradation,KEGG-Pathway-pae00903: Limonene and pinene degradation,KEGG-Pathway-pae00380: Tryptophan metabolism,KEGG-Pathway-pae00900: Terpenoid backbone biosynthesis,KEGG-Pathway-pae00660: C5-Branched dibasic acid metabolism,"KEGG-Pathway-pae00260: Glycine, serine and threonine metabolism",KEGG-Pathway-pae00780: Biotin metabolism,KEGG-Pathway-pae02060: Phosphotransferase system (PTS),KEGG-Pathway-pae00364: Fluorobenzoate degradation,...,13158,13159,13160,13161,13162,13163,13164,13165,13166,13167
PA0001,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA1741,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA2558,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA3817,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Add regulon

For each regulon, what genes are contained in it

In [23]:
pao1_regulon_filename = "https://raw.githubusercontent.com/greenelab/core-accessory-interactome/6635c0e357c0172c2cebd0368648030e0ee4beaf/data/metadata/regulons_format.csv"

pao1_regulons = pd.read_csv(pao1_regulon_filename, index_col=0, header=0)

In [24]:
pao1_regulons["Genes"] = pao1_regulons["Genes"].str.split(";").apply(set)

In [25]:
gene_to_regulons_df = pd.DataFrame(
    index=pao1_gene_module_labels.index, columns=list(pao1_regulons.index)
)

In [26]:
%%time
for gene in gene_to_regulons_df.index:
    gene_to_regulons_df.loc[gene] = [
        gene in pao1_regulons.loc[regulon, "Genes"] for regulon in pao1_regulons.index
    ]

CPU times: user 14.6 s, sys: 92 µs, total: 14.6 s
Wall time: 14.6 s


In [27]:
# Add regulons to other annotations
pao1_gene_annot = pao1_gene_annot.merge(
    gene_to_regulons_df, left_index=True, right_index=True, how="outer"
)

In [28]:
print(pao1_gene_annot.shape)
pao1_gene_annot.head()

(5563, 1325)


Unnamed: 0,KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,KEGG-Pathway-pae00071: Fatty acid degradation,KEGG-Pathway-pae00903: Limonene and pinene degradation,KEGG-Pathway-pae00380: Tryptophan metabolism,KEGG-Pathway-pae00900: Terpenoid backbone biosynthesis,KEGG-Pathway-pae00660: C5-Branched dibasic acid metabolism,"KEGG-Pathway-pae00260: Glycine, serine and threonine metabolism",KEGG-Pathway-pae00780: Biotin metabolism,KEGG-Pathway-pae02060: Phosphotransferase system (PTS),KEGG-Pathway-pae00364: Fluorobenzoate degradation,...,PqsR_regulon,QscR_regulon,VreI_regulon,Zur_regulon,Anr_short_list,PhoB_short_list,AlgU_short_list,LasR_short_list,RhlR_short_list,PqsR_short_list
PA0001,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA1741,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA2558,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA3817,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Map pathway, operon, regulon to PA14

The annotations we have are only for PAO1 genes, so we will map PAO1 core genes to PA14 core genes to add annotations to PA14

In [29]:
pao1_annotation_filename = paths.GENE_PAO1_ANNOT
gene_mapping_pao1 = utils.get_pao1_pa14_gene_map(pao1_annotation_filename, "pao1")

In [30]:
gene_mapping_pao1 = gene_mapping_pao1["PA14_ID"].to_frame()

In [31]:
# Map PA14 gene ids
pao1_pa14_gene_annot = pao1_gene_annot.merge(
    gene_mapping_pao1, left_index=True, right_index=True
)

In [32]:
pao1_pa14_gene_annot.head()

Unnamed: 0,KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,KEGG-Pathway-pae00071: Fatty acid degradation,KEGG-Pathway-pae00903: Limonene and pinene degradation,KEGG-Pathway-pae00380: Tryptophan metabolism,KEGG-Pathway-pae00900: Terpenoid backbone biosynthesis,KEGG-Pathway-pae00660: C5-Branched dibasic acid metabolism,"KEGG-Pathway-pae00260: Glycine, serine and threonine metabolism",KEGG-Pathway-pae00780: Biotin metabolism,KEGG-Pathway-pae02060: Phosphotransferase system (PTS),KEGG-Pathway-pae00364: Fluorobenzoate degradation,...,QscR_regulon,VreI_regulon,Zur_regulon,Anr_short_list,PhoB_short_list,AlgU_short_list,LasR_short_list,RhlR_short_list,PqsR_short_list,PA14_ID
PA0001,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,PA14_00010
PA0006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,PA14_00070
PA1741,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,PA14_42020
PA2558,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,PA14_31460
PA3817,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,PA14_14690


In [33]:
# Reset index to PA14 gene ids
pa14_gene_annot = pao1_pa14_gene_annot.set_index("PA14_ID")
print(pa14_gene_annot.shape)
pa14_gene_annot.head()

(5563, 1325)


Unnamed: 0_level_0,KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,KEGG-Pathway-pae00071: Fatty acid degradation,KEGG-Pathway-pae00903: Limonene and pinene degradation,KEGG-Pathway-pae00380: Tryptophan metabolism,KEGG-Pathway-pae00900: Terpenoid backbone biosynthesis,KEGG-Pathway-pae00660: C5-Branched dibasic acid metabolism,"KEGG-Pathway-pae00260: Glycine, serine and threonine metabolism",KEGG-Pathway-pae00780: Biotin metabolism,KEGG-Pathway-pae02060: Phosphotransferase system (PTS),KEGG-Pathway-pae00364: Fluorobenzoate degradation,...,PqsR_regulon,QscR_regulon,VreI_regulon,Zur_regulon,Anr_short_list,PhoB_short_list,AlgU_short_list,LasR_short_list,RhlR_short_list,PqsR_short_list
PA14_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PA14_00010,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA14_00070,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA14_42020,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA14_31460,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA14_14690,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [34]:
# Merge annotations with module labels
pao1_gene_summary = pao1_gene_module_labels.merge(
    pao1_gene_annot, left_index=True, right_index=True, how="left"
)
pa14_gene_summary = pa14_gene_module_labels.merge(
    pa14_gene_annot, left_index=True, right_index=True, how="left"
)

In [35]:
print(pao1_gene_summary.shape)
pao1_gene_summary.head()

(5563, 1333)


Unnamed: 0,module id,num core,num acc,odds ratio,p-value,module label,size,gene name,KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,KEGG-Pathway-pae00071: Fatty acid degradation,...,PqsR_regulon,QscR_regulon,VreI_regulon,Zur_regulon,Anr_short_list,PhoB_short_list,AlgU_short_list,LasR_short_list,RhlR_short_list,PqsR_short_list
PA0001,390,12,0,inf,1.0,mixed,12,dnaA,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0006,390,12,0,inf,1.0,mixed,12,,False,False,...,False,False,False,False,False,False,False,False,False,False
PA1741,390,12,0,inf,1.0,mixed,12,,False,False,...,False,False,False,False,False,False,False,False,False,False
PA2558,390,12,0,inf,1.0,mixed,12,,False,False,...,False,False,False,False,False,False,False,False,False,False
PA3817,390,12,0,inf,1.0,mixed,12,,False,False,...,False,False,False,False,False,False,False,False,False,False


In [36]:
print(pa14_gene_summary.shape)
pa14_gene_summary.head()

(5894, 1333)


Unnamed: 0,module id,num core,num acc,odds ratio,p-value,module label,size,gene name,KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,KEGG-Pathway-pae00071: Fatty acid degradation,...,PqsR_regulon,QscR_regulon,VreI_regulon,Zur_regulon,Anr_short_list,PhoB_short_list,AlgU_short_list,LasR_short_list,RhlR_short_list,PqsR_short_list
PA14_00010,516,7,0,inf,1.0,mixed,7,dnaA,False,False,...,False,False,False,False,False,False,False,False,False,False
PA14_00020,516,7,0,inf,1.0,mixed,7,dnaN,False,False,...,False,False,False,False,False,False,False,False,False,False
PA14_00030,516,7,0,inf,1.0,mixed,7,recF,False,False,...,False,False,False,False,False,False,False,False,False,False
PA14_00050,456,7,0,inf,1.0,mixed,7,gyrB,False,False,...,False,False,False,False,False,False,False,False,False,False
PA14_00060,245,20,0,inf,0.250157,mixed,20,,False,False,...,False,False,False,False,False,False,False,False,False,False


In [45]:
# Drop duplicates
pa14_gene_summary = pa14_gene_summary[~pa14_gene_summary.index.duplicated(keep=False)]

In [46]:
# Save
pao1_gene_summary.to_csv(
    os.path.join(paths.LOCAL_DATA_DIR, "pao1_gene_module_annotated.tsv"), sep="\t"
)
pa14_gene_summary.to_csv(
    os.path.join(paths.LOCAL_DATA_DIR, "pa14_gene_module_annotated.tsv"), sep="\t"
)