# Add annotations

This notebook takes the dataframe with information about module composition and their labels and adds additional annotations including:

1. Which genes are contained within each module (both gene id and gene name)
2. Baseline expression and expression in some context of interest
3. How clustered the module is on the genome
4. KEGG pathways that genes are found in
5. GO pathways genes are found in
6. Regulon/operon genes are found in

All this information will help _P. aeruginosa_ experts filter the modules and determine which ones might be interesting to explore.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import random
import scipy
import pandas as pd
import numpy as np
from itertools import product
from core_acc_modules import paths

random.seed(1)

In [2]:
# User param
# Clustering method label: it selects which `*_modules_{method}_acc.tsv` membership
# files are loaded and is embedded in the output filenames written at the end
method = "affinity"

In [3]:
# Load the module membership tables for both strains.
# Both files are tab-separated with a header row; the first column becomes the index.
membership_read_kwargs = dict(sep="\t", index_col=0, header=0)

pao1_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR, f"pao1_modules_{method}_acc.tsv"
)
pao1_membership = pd.read_csv(pao1_membership_filename, **membership_read_kwargs)

pa14_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR, f"pa14_modules_{method}_acc.tsv"
)
pa14_membership = pd.read_csv(pa14_membership_filename, **membership_read_kwargs)

In [4]:
# Load the gene annotation tables; the first column becomes the index
pao1_gene_annot_filename, pa14_gene_annot_filename = (
    paths.GENE_PAO1_ANNOT,
    paths.GENE_PA14_ANNOT,
)

pao1_gene_annot = pd.read_csv(pao1_gene_annot_filename, header=0, index_col=0)
pa14_gene_annot = pd.read_csv(pa14_gene_annot_filename, header=0, index_col=0)

In [5]:
# Import metadata of samples
# Only the path is stored here; the file itself is read inside `get_sample_ids`
metadata_filename = paths.SAMPLE_METADATA

In [6]:
# Keep only the "Name" column of each annotation table, relabelled as "gene name"
pao1_gene_annot = pao1_gene_annot[["Name"]].rename(columns={"Name": "gene name"})
pa14_gene_annot = pa14_gene_annot[["Name"]].rename(columns={"Name": "gene name"})

## Add gene names

In [7]:
# Add gene names
# Use a left join (like every later merge in this notebook) so that each gene in
# the module membership table is kept even when it has no row in the annotation
# table; such genes simply get a missing "gene name" instead of being dropped.
pao1_gene_module_labels = pao1_membership.merge(
    pao1_gene_annot, left_index=True, right_index=True, how="left"
)
pa14_gene_module_labels = pa14_membership.merge(
    pa14_gene_annot, left_index=True, right_index=True, how="left"
)

In [8]:
print(pao1_gene_module_labels.shape)
pao1_gene_module_labels.head()

(202, 2)


Unnamed: 0,module id,gene name
PA1224,8,
PA0497,0,
PA5149,8,
PA1391,14,
PA0188,1,


In [9]:
print(pa14_gene_module_labels.shape)
pa14_gene_module_labels.head()

(530, 2)


Unnamed: 0,module id,gene name
PA14_59860,35,
PA14_30880,17,
PA14_35820,21,tnpS
PA14_31280,0,
PA14_39670,22,


## Add expression information

1. What is the baseline level of expression for each gene in the module?
2. What is the expression level of genes in a clinical context (i.e. clinical samples)?

In [10]:
# Read in expression data
pao1_compendium = pd.read_csv(paths.PAO1_COMPENDIUM, sep="\t", index_col=0)
pa14_compendium = pd.read_csv(paths.PA14_COMPENDIUM, sep="\t", index_col=0)

In [11]:
pao1_compendium.head()

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA1905,PA0195,PA4812,PA0195.1,PA0457.1,PA1552.1,PA1555.1,PA3701,PA4724.1,PA5471.1
ERX541572,5793.218939,766.512255,1608.330977,1663.46607,176.163343,384.600886,295.846835,453.183561,611.865046,43.032267,...,1.344758,75.306467,447.804528,10.758067,65.893159,44.377025,56.47985,2033.274614,184.231893,1.344758
ERX541573,4416.506898,797.782811,1770.117221,1562.763979,313.958581,324.501966,333.873864,415.87797,550.599003,38.659079,...,1.171487,103.090877,698.206395,18.743796,138.235494,39.830566,96.061954,1315.580171,91.376005,1.171487
ERX541574,3825.086116,644.433113,1852.251003,1589.338107,260.936107,270.820051,363.729119,363.729119,423.03278,67.210815,...,1.976789,128.491265,468.49892,19.767887,73.141182,33.605408,55.350083,1759.341934,67.210815,3.953577
ERX541575,3834.097653,789.216207,1926.825153,1610.427665,289.734779,261.294555,250.629471,520.811596,666.567742,53.325419,...,1.777514,124.425979,611.464809,15.997626,177.751398,21.330168,108.428353,1486.001686,56.880447,1.777514
ERX541576,3515.165133,853.775186,2185.27713,1683.341246,183.98936,245.319146,253.388855,380.890253,551.968079,66.171612,...,1.613942,90.380738,745.641089,11.297592,130.729282,50.032194,95.222563,1273.400041,72.627379,1.613942


In [12]:
# Per-column median of each compendium, returned as a one-column frame
# named "median expression"
pao1_median_all = pao1_compendium.median().rename("median expression").to_frame()
pa14_median_all = pa14_compendium.median().rename("median expression").to_frame()

In [13]:
pao1_median_all.head()

Unnamed: 0,median expression
PA0001,1846.069803
PA0002,1631.296497
PA0003,1042.829403
PA0004,2863.865717
PA0005,226.541717


In [14]:
# Select subset of samples and calculate the median expression across that subset of samples
# TO DO: Move this into utils
def get_sample_ids(
    metadata_filename, experiment_colname, sample_colname, experiment_id
):
    """
    Returns sample ids (found in gene expression df) associated with
    one or more experiment ids (found in the metadata)

    Arguments
    ----------
    metadata_filename: str
        File containing metadata (comma-separated, with a header row)
    experiment_colname: str
        Column header that contains experiment id that maps expression data
        and metadata
    sample_colname: str
        Column header that contains sample id that maps expression data
        and metadata
    experiment_id: str or list-like of str
        Selected experiment id(s) to grab samples from

    Returns
    -------
    list
        Sample ids belonging to the requested experiment(s). When several
        experiment ids are given, samples are grouped in the order the
        experiment ids were requested.

    Raises
    ------
    KeyError
        If a requested experiment id does not occur in `experiment_colname`.
    """
    # Read in metadata
    metadata = pd.read_csv(metadata_filename, header=0)

    # Normalise to a list so a single id and a list of ids share one code path
    if pd.api.types.is_list_like(experiment_id):
        experiment_ids = list(experiment_id)
    else:
        experiment_ids = [experiment_id]

    sample_ids = []
    for exp_id in experiment_ids:
        # A boolean mask always yields a Series, even for a single matching row.
        # Label-based `.loc[exp_id]` collapses a one-sample experiment to a scalar,
        # which `list()` would then split into individual characters.
        is_selected = metadata[experiment_colname] == exp_id
        if not is_selected.any():
            raise KeyError(exp_id)
        sample_ids.extend(metadata.loc[is_selected, sample_colname].tolist())

    return sample_ids

In [15]:
# TO DO: Have Deb or Georgia select a study
# Looks like we removed many of the clinical isolates from this compendium with our strain binning
# selected_sample_ids = get_sample_ids(
#   metadata_filename, experiment_colname="SRA_study", sample_colname="Experiment", experiment_id="SRP063289")

In [16]:
# Subset compendium
# subset_pao1_compendium = pao1_compendium.loc[selected_sample_ids]
# subset_pa14_compendium = pa14_compendium.loc[selected_sample_ids]

In [17]:
# print(subset_pao1_compendium.shape)
# print(subset_pa14_compendium.shape)

In [18]:
# pao1_median_subset = subset_pao1_compendium.median().to_frame("median subset expression")
# pa14_median_subset = subset_pa14_compendium.median().to_frame("median subset expression")

In [19]:
# Attach the per-gene median expression to the module/gene-name table.
# Left join on the index: every module gene is kept, genes without expression get NaN.
pao1_gene_annot = pd.merge(
    pao1_gene_module_labels,
    pao1_median_all,
    how="left",
    left_index=True,
    right_index=True,
)
pa14_gene_annot = pd.merge(
    pa14_gene_module_labels,
    pa14_median_all,
    how="left",
    left_index=True,
    right_index=True,
)

In [20]:
# Add median subset expression to gene ids
# Disabled until a study is selected: `pao1_median_subset` / `pa14_median_subset`
# are only defined by the commented-out cells above. Kept as `#` comments (not a
# string literal) so the cell does not echo the disabled code as its output.
# pao1_gene_annot = pao1_gene_annot.merge(
#     pao1_median_subset, left_index=True, right_index=True, how="left"
# )
# pa14_gene_annot = pa14_gene_annot.merge(
#     pa14_median_subset, left_index=True, right_index=True, how="left"
# )

'pao1_gene_annot = pao1_gene_annot.merge(\n    pao1_median_subset, left_index=True, right_index=True, how="left"\n)\npa14_gene_annot = pa14_gene_annot.merge(\n    pa14_median_subset, left_index=True, right_index=True, how="left"\n)'

In [21]:
print(pao1_gene_annot.shape)
pao1_gene_annot.head()

(202, 3)


Unnamed: 0,module id,gene name,median expression
PA1224,8,,42.632516
PA0497,0,,57.609764
PA5149,8,,190.053787
PA1391,14,,60.082125
PA0188,1,,21.000574


In [22]:
print(pa14_gene_annot.shape)
pa14_gene_annot.head()

(530, 3)


Unnamed: 0,module id,gene name,median expression
PA14_59860,35,,9.022371
PA14_30880,17,,15.345832
PA14_35820,21,tnpS,840.90247
PA14_31280,0,,80.791518
PA14_39670,22,,3.942563


## Genome location information

How far are genes from other genes in the same module?

In [23]:
# TO DO: Move into scripts
# For genes in the same module, calculate the pairwise distance from each other
# Calculate the median pairwise distance to represent how spread the module is
# across the genome
# Other metrics?


def get_intra_module_dist(annot_df, pa_prefix):
    """
    Summarise how spread out the genes of each module are, based on the
    numeric part of their gene ids.

    For every module, the absolute difference between the id numbers of each
    pair of genes is computed, and the median, minimum and maximum of those
    differences are reported for every gene of the module. Distances are in
    id-number units, not base pairs.

    Arguments
    ----------
    annot_df: pandas.DataFrame
        Indexed by gene id, with a "module id" column used for grouping
    pa_prefix: str
        Text preceding the numeric part of each gene id (e.g. "PA" or "PA14_")

    Returns
    -------
    pandas.DataFrame
        Indexed by "gene id" with columns "median pairwise dist",
        "min pairwise dist" and "max pairwise dist". Modules without any pair
        of distinct id numbers (e.g. single-gene modules) get NaN in all three
        columns. An empty input yields an empty frame with the same columns.
    """
    dist_colnames = ["median pairwise dist", "min pairwise dist", "max pairwise dist"]

    rows = []
    for _, grp_df in annot_df.groupby("module id"):
        ids = grp_df.index

        # Strip the prefix and read the remainder as a float
        # (float rather than int so ids with a decimal suffix still parse)
        num_ids = np.array(
            [float(_id.split(pa_prefix)[1]) for _id in ids], dtype=float
        )

        # Visit each unordered pair once. Visiting both (a, b) and (b, a) only
        # duplicates every distance, which leaves median/min/max unchanged.
        first, second = np.triu_indices(len(num_ids), k=1)
        abs_dist = np.abs(num_ids[first] - num_ids[second])

        # Pairs with identical id numbers are ignored
        abs_dist = abs_dist[abs_dist != 0]

        if abs_dist.size > 0:
            median_module_dist = np.median(abs_dist)
            min_dist = np.min(abs_dist)
            max_dist = np.max(abs_dist)
        else:
            # No pair to measure: np.min/np.max would raise on an empty array
            median_module_dist = min_dist = max_dist = np.nan

        for _id in ids:
            rows.append(
                {
                    "gene id": _id,
                    "median pairwise dist": median_module_dist,
                    "min pairwise dist": min_dist,
                    "max pairwise dist": max_dist,
                }
            )

    # Naming the columns explicitly keeps `set_index` valid when `rows` is empty
    module_dist = pd.DataFrame(rows, columns=["gene id"] + dist_colnames)
    module_dist = module_dist.set_index("gene id")

    return module_dist

In [24]:
# Compute the per-module distance summary for each strain; `pa_prefix` is the text
# that `get_intra_module_dist` strips from each gene id to get its numeric part
pao1_module_dist = get_intra_module_dist(pao1_gene_annot, pa_prefix="PA")
pa14_module_dist = get_intra_module_dist(pa14_gene_annot, pa_prefix="PA14_")

In [25]:
pao1_module_dist.head(10)

Unnamed: 0_level_0,median pairwise dist,min pairwise dist,max pairwise dist
gene id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PA0497,163.0,1.0,326.0
PA0499,163.0,1.0,326.0
PA0823,163.0,1.0,326.0
PA0498,163.0,1.0,326.0
PA0188,1861.5,1.0,5165.0
PA3066,1861.5,1.0,5165.0
PA2771,1861.5,1.0,5165.0
PA3067,1861.5,1.0,5165.0
PA2119,1861.5,1.0,5165.0
PA2192,1861.5,1.0,5165.0


In [26]:
pa14_module_dist.head(10)

Unnamed: 0_level_0,median pairwise dist,min pairwise dist,max pairwise dist
gene id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PA14_31280,285.0,10.0,29210.0
PA14_30980,285.0,10.0,29210.0
PA14_31270,285.0,10.0,29210.0
PA14_53570,285.0,10.0,29210.0
PA14_31240,285.0,10.0,29210.0
PA14_49520,285.0,10.0,29210.0
PA14_31180,285.0,10.0,29210.0
PA14_31160,285.0,10.0,29210.0
PA14_30990,285.0,10.0,29210.0
PA14_31000,285.0,10.0,29210.0


In [27]:
# Attach the module distance summary to each gene (left join on the gene id index)
pao1_gene_annot = pd.merge(
    pao1_gene_annot,
    pao1_module_dist,
    how="left",
    left_index=True,
    right_index=True,
)
pa14_gene_annot = pd.merge(
    pa14_gene_annot,
    pa14_module_dist,
    how="left",
    left_index=True,
    right_index=True,
)

## Add KEGG pathways

For each pathway, what genes are contained in it. This information is only available for PAO1.

In [28]:
# KEGG term file pinned to a specific commit of the greenelab/adage repository.
# It is tab-separated with no header row, so columns get integer labels and the
# first column is used as the index.
pao1_pathway_filename = "https://raw.githubusercontent.com/greenelab/adage/7a4eda39d360b224268921dc1f2c14b32788ab16/Node_interpretation/pseudomonas_KEGG_terms.txt"

pao1_pathways = pd.read_csv(pao1_pathway_filename, header=None, index_col=0, sep="\t")

In [29]:
# Column 2 holds ";"-separated gene ids; turn each entry into a set of gene ids
# NOTE(review): this overwrites `pao1_pathways` in place, so the cell appears to be
# runnable only once per load of the file — confirm before re-running it on its own
pao1_pathways[2] = pao1_pathways[2].str.split(";").apply(set)
# Keep only the text before the first " - " of each pathway label
pao1_pathways.index = pao1_pathways.index.str.split(" - ").str[0]
pao1_pathways.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,10,"{PA2011, PA2000, PA3589, PA2553, PA1999, PA173..."
KEGG-Pathway-pae00071: Fatty acid degradation,32,"{PA1525, PA2940, PA0447, PA3299, PA3925, PA087..."
KEGG-Pathway-pae00903: Limonene and pinene degradation,9,"{PA1821, PA4899, PA3426, PA1027, PA1748, PA247..."
KEGG-Pathway-pae00380: Tryptophan metabolism,27,"{PA4613, PA2080, PA0202, PA0447, PA3366, PA392..."
KEGG-Pathway-pae00900: Terpenoid backbone biosynthesis,16,"{PA4557, PA4044, PA3589, PA4569, PA2553, PA466..."


In [30]:
# Empty gene x pathway table: one row per PAO1 module gene, one column per KEGG term
kegg_term_names = list(pao1_pathways.index)
gene_to_pathways_df = pd.DataFrame(
    columns=kegg_term_names, index=pao1_gene_module_labels.index
)

In [31]:
%%time
for gene in gene_to_pathways_df.index:
    gene_to_pathways_df.loc[gene] = [
        gene in pao1_pathways.loc[pathway, 2] for pathway in pao1_pathways.index
    ]

CPU times: user 217 ms, sys: 3.4 ms, total: 221 ms
Wall time: 220 ms


In [32]:
gene_to_pathways_df.head()

Unnamed: 0,KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,KEGG-Pathway-pae00071: Fatty acid degradation,KEGG-Pathway-pae00903: Limonene and pinene degradation,KEGG-Pathway-pae00380: Tryptophan metabolism,KEGG-Pathway-pae00900: Terpenoid backbone biosynthesis,KEGG-Pathway-pae00660: C5-Branched dibasic acid metabolism,"KEGG-Pathway-pae00260: Glycine, serine and threonine metabolism",KEGG-Pathway-pae00780: Biotin metabolism,KEGG-Pathway-pae02060: Phosphotransferase system (PTS),KEGG-Pathway-pae00364: Fluorobenzoate degradation,...,KEGG-Module-M00436: Sulfonate transport system,KEGG-Module-M00300: Putrescine transport system,KEGG-Module-M00200: Putative sorbitol/mannitol transport system,"KEGG-Module-M00360: Aminoacyl-tRNA biosynthesis, prokaryotes",KEGG-Module-M00238: D-Methionine transport system,KEGG-Module-M00208: Glycine betaine/proline transport system,"KEGG-Module-M00176: Assimilatory sulfate reduction, sulfate => H2S","KEGG-Module-M00570: Isoleucine biosynthesis, threonine => 2-oxobutanoate => isoleucine","KEGG-Module-M00572: Pimeloyl-ACP biosynthesis, BioC-BioH pathway, malonyl-ACP => pimeloyl-ACP","KEGG-Module-M00178: Ribosome, bacteria"
PA1224,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0497,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA5149,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA1391,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0188,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [33]:
# Attach the gene x KEGG-pathway membership columns to the PAO1 annotations
# (left join on the gene id index)
pao1_gene_annot = pd.merge(
    pao1_gene_annot,
    gene_to_pathways_df,
    how="left",
    left_index=True,
    right_index=True,
)

## Import and format operon

In [34]:
# Operon annotation files for each strain
pao1_operon_filename, pa14_operon_filename = paths.PAO1_OPERON, paths.PA14_OPERON

In [35]:
# Load the operon tables (comma-separated, header row, first column as index)
pao1_operon = pd.read_csv(pao1_operon_filename, header=0, index_col=0)
pa14_operon = pd.read_csv(pa14_operon_filename, header=0, index_col=0)

In [36]:
# Re-index both operon tables by their `locus_tag` column so they can be joined to
# the gene annotations on gene id
# NOTE(review): the names are overwritten in place, so this cell appears to be
# runnable only once per load — confirm before re-running it on its own
pao1_operon = pao1_operon.set_index("locus_tag")
pa14_operon = pa14_operon.set_index("locus_tag")

In [37]:
print(pao1_operon.shape)
pao1_operon.head()

(3816, 7)


Unnamed: 0_level_0,operon_name,start,end,strand,gene_name,source_database,pmid
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PA0001,dnaA-dnaN-recF-gyrB,483,2027,1,dnaA,DOOR,18988623
PA0002,dnaA-dnaN-recF-gyrB,2056,3159,1,dnaN,DOOR,18988623
PA0003,dnaA-dnaN-recF-gyrB,3169,4278,1,recF,DOOR,18988623
PA0004,dnaA-dnaN-recF-gyrB,4275,6695,1,gyrB,DOOR,18988623
PA0005,PA0006-lptA,7018,7791,-1,lptA,DOOR,18988623


In [38]:
print(pa14_operon.shape)
pa14_operon.head()

(3756, 7)


Unnamed: 0_level_0,operon_name,start,end,strand,gene_name,source_database,pmid
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PA14_00010,dnaA-dnaN-recF-gyrB,483,2027,1,dnaA,DOOR,18988623
PA14_00020,dnaA-dnaN-recF-gyrB,2056,3159,1,dnaN,DOOR,18988623
PA14_00030,dnaA-dnaN-recF-gyrB,3169,4278,1,recF,DOOR,18988623
PA14_00050,dnaA-dnaN-recF-gyrB,4275,6695,1,gyrB,DOOR,18988623
PA14_00060,PA14_00070-PA14_00060,7018,7791,-1,,DOOR,18988623


In [39]:
# There are 247 PAO1 genes with multiple annotations.
# This operon df contains annotations from predicted operons (based on DOOR database
# predictions, which make up the majority of the operons) as well as some that
# are curated (i.e. PseudoCAP).
# Some genes have multiple PseudoCAP annotations too.

# Here we keep only the last row for each locus tag (`keep="last"`).
# NOTE(review): this keeps the curated (PseudoCAP) annotation only if those rows come
# after the DOOR rows in the file — confirm the row order.
# Open question: do we want to discard these annotations altogether,
# or will they need to be carefully curated to determine which to keep?
pao1_operon = pao1_operon[~pao1_operon.index.duplicated(keep="last")]
pa14_operon = pa14_operon[~pa14_operon.index.duplicated(keep="last")]

In [40]:
# Reduce each operon table to a single "operon_name" column (gene id stays the index)
pao1_operon = pao1_operon[["operon_name"]]
pa14_operon = pa14_operon[["operon_name"]]

In [41]:
pao1_operon.head()

Unnamed: 0_level_0,operon_name
locus_tag,Unnamed: 1_level_1
PA0001,dnaA-dnaN-recF-gyrB
PA0002,dnaA-dnaN-recF-gyrB
PA0003,dnaA-dnaN-recF-gyrB
PA0004,dnaA-dnaN-recF-gyrB
PA0005,PA0006-lptA


In [42]:
# Attach the operon name to the PAO1 annotations (left join on the gene id index)
pao1_gene_annot = pd.merge(
    pao1_gene_annot,
    pao1_operon,
    how="left",
    left_index=True,
    right_index=True,
)

In [43]:
print(pao1_gene_annot.shape)
pao1_gene_annot.head()

(202, 176)


Unnamed: 0,module id,gene name,median expression,median pairwise dist,min pairwise dist,max pairwise dist,KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,KEGG-Pathway-pae00071: Fatty acid degradation,KEGG-Pathway-pae00903: Limonene and pinene degradation,KEGG-Pathway-pae00380: Tryptophan metabolism,...,KEGG-Module-M00300: Putrescine transport system,KEGG-Module-M00200: Putative sorbitol/mannitol transport system,"KEGG-Module-M00360: Aminoacyl-tRNA biosynthesis, prokaryotes",KEGG-Module-M00238: D-Methionine transport system,KEGG-Module-M00208: Glycine betaine/proline transport system,"KEGG-Module-M00176: Assimilatory sulfate reduction, sulfate => H2S","KEGG-Module-M00570: Isoleucine biosynthesis, threonine => 2-oxobutanoate => isoleucine","KEGG-Module-M00572: Pimeloyl-ACP biosynthesis, BioC-BioH pathway, malonyl-ACP => pimeloyl-ACP","KEGG-Module-M00178: Ribosome, bacteria",operon_name
PA1224,8,,42.632516,1708.0,1.0,5475.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,
PA0497,0,,57.609764,163.0,1.0,326.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,
PA5149,8,,190.053787,1708.0,1.0,5475.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,PA5146-mutY-PA5148-PA5149
PA1391,14,,60.082125,4.0,1.0,12.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,PA1390-PA1391
PA0188,1,,21.000574,1861.5,1.0,5165.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,PA0187-PA0188


In [44]:
# PA14 has no pathway annotations in this notebook, so only the operon name is
# attached (left join on the gene id index)
pa14_gene_annot = pd.merge(
    pa14_gene_annot,
    pa14_operon,
    how="left",
    left_index=True,
    right_index=True,
)

## Add regulon

For each regulon, what genes are contained in it. This information is only available for PAO1

In [45]:
# Regulon definitions pinned to a specific commit of the
# greenelab/core-accessory-interactome repository (comma-separated, header row,
# first column as index)
pao1_regulon_filename = "https://raw.githubusercontent.com/greenelab/core-accessory-interactome/6635c0e357c0172c2cebd0368648030e0ee4beaf/data/metadata/regulons_format.csv"

pao1_regulons = pd.read_csv(pao1_regulon_filename, header=0, index_col=0)

In [46]:
# "Genes" holds ";"-separated gene ids; turn each entry into a set of gene ids
# NOTE(review): the column is overwritten in place, so this cell appears to be
# runnable only once per load — confirm before re-running it on its own
pao1_regulons["Genes"] = pao1_regulons["Genes"].str.split(";").apply(set)

In [47]:
# Empty gene x regulon table: one row per PAO1 module gene, one column per regulon
regulon_names = list(pao1_regulons.index)
gene_to_regulons_df = pd.DataFrame(
    columns=regulon_names, index=pao1_gene_module_labels.index
)

In [48]:
%%time
for gene in gene_to_regulons_df.index:
    gene_to_regulons_df.loc[gene] = [
        gene in pao1_regulons.loc[regulon, "Genes"] for regulon in pao1_regulons.index
    ]

CPU times: user 65.5 ms, sys: 3.77 ms, total: 69.3 ms
Wall time: 68.2 ms


In [49]:
# Attach the gene x regulon membership columns to the PAO1 annotations
# (left join on the gene id index)
pao1_gene_annot = pd.merge(
    pao1_gene_annot,
    gene_to_regulons_df,
    how="left",
    left_index=True,
    right_index=True,
)

In [50]:
print(pao1_gene_annot.shape)
pao1_gene_annot.head()

(202, 193)


Unnamed: 0,module id,gene name,median expression,median pairwise dist,min pairwise dist,max pairwise dist,KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,KEGG-Pathway-pae00071: Fatty acid degradation,KEGG-Pathway-pae00903: Limonene and pinene degradation,KEGG-Pathway-pae00380: Tryptophan metabolism,...,PqsR_regulon,QscR_regulon,VreI_regulon,Zur_regulon,Anr_short_list,PhoB_short_list,AlgU_short_list,LasR_short_list,RhlR_short_list,PqsR_short_list
PA1224,8,,42.632516,1708.0,1.0,5475.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0497,0,,57.609764,163.0,1.0,326.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA5149,8,,190.053787,1708.0,1.0,5475.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA1391,14,,60.082125,4.0,1.0,12.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0188,1,,21.000574,1861.5,1.0,5165.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [51]:
print(pa14_gene_annot.shape)
pa14_gene_annot.head()

(530, 7)


Unnamed: 0,module id,gene name,median expression,median pairwise dist,min pairwise dist,max pairwise dist,operon_name
PA14_59860,35,,9.022371,215.0,10.0,1040.0,PA14_59860-PA14_59870-PA14_59880-PA14_59890-PA...
PA14_30880,17,,15.345832,180.0,10.0,17610.0,PA14_30880-PA14_30870-PA14_30860-PA14_30850
PA14_35820,21,tnpS,840.90247,1830.0,10.0,25880.0,tnpS-PA14_35810-PA14_35800
PA14_31280,0,,80.791518,285.0,10.0,29210.0,PA14_31280-PA14_31270
PA14_39670,22,,3.942563,18030.0,10.0,61000.0,


In [52]:
# Save both annotated tables as tab-separated files.
# The filenames are relative, so they are written to the current working directory.
pao1_out_filename = f"pao1_acc_gene_module_annotated_{method}.tsv"
pa14_out_filename = f"pa14_acc_gene_module_annotated_{method}.tsv"

pao1_gene_annot.to_csv(pao1_out_filename, sep="\t")
pa14_gene_annot.to_csv(pa14_out_filename, sep="\t")

These annotations will be used to help _P. aeruginosa_ experts, like our collaborators, to determine what accessory-accessory modules to focus on.


Note: Since a gene can be in multiple KEGG pathways and regulons, each pathway and each regulon is a separate column, whereas operons share a single column because a gene can belong to only one operon.