# Add annotations

This notebook takes the dataframe with information about module composition and their labels and adds additional annotations including:

1. Which gene is contained within the modules (both gene id and gene name)
2. Baseline expression and expression in some context of interest
3. How clustered the module is on the genome
4. KEGG pathways that genes are found in
5. GO pathways genes are found in
6. Regulon/operon genes are found in

All this information will help _P. aeruginosa_ experiments filter and determine which module might be interesting to explore.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import random
import scipy.stats
import statsmodels.stats.multitest
import pandas as pd
import numpy as np
from scripts import paths, utils, modules, annotations

random.seed(1)

In [2]:
# Clustering method used to obtain gene-module assignments
method = "affinity"

In [3]:
# Import gene memberships
pao1_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR, f"pao1_modules_{method}_acc.tsv"
)
pa14_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR, f"pa14_modules_{method}_acc.tsv"
)

pao1_membership = pd.read_csv(pao1_membership_filename, sep="\t", index_col=0, header=0)
pa14_membership = pd.read_csv(pa14_membership_filename, sep="\t", index_col=0, header=0)

In [4]:
# Import gene metadata
pao1_gene_annot_filename = paths.GENE_PAO1_ANNOT
pa14_gene_annot_filename = paths.GENE_PA14_ANNOT

pao1_gene_annot = pd.read_csv(pao1_gene_annot_filename, index_col=0, header=0)
pa14_gene_annot = pd.read_csv(pa14_gene_annot_filename, index_col=0, header=0)

In [5]:
# Import metadata of samples
metadata_filename = paths.SAMPLE_METADATA

In [6]:
# Get df with gene ids as indices and gene names as a column
# Having the data in a df instead of a series will just allow me to do my merges that are in the notebook
pao1_gene_annot = pao1_gene_annot["Name"].to_frame("gene name")
pa14_gene_annot = pa14_gene_annot["Name"].to_frame("gene name")

## Add gene names

In [7]:
# Add gene names
pao1_gene_module_labels = pao1_membership.merge(
    pao1_gene_annot, left_index=True, right_index=True
)
pa14_gene_module_labels = pa14_membership.merge(
    pa14_gene_annot, left_index=True, right_index=True
)

In [8]:
# Note: Many gene ids don't have an associated gene name and so are NaNs
print(pao1_gene_module_labels.shape)
pao1_gene_module_labels.head()

(202, 2)


Unnamed: 0,module id,gene name
PA1224,8,
PA0497,0,
PA5149,8,
PA1391,14,
PA0188,1,


In [9]:
# Note: Many gene ids don't have an associated gene name and so are NaNs
print(pa14_gene_module_labels.shape)
pa14_gene_module_labels.head()

(530, 2)


Unnamed: 0,module id,gene name
PA14_59860,35,
PA14_30880,17,
PA14_35820,21,tnpS
PA14_31280,0,
PA14_39670,22,


## Add expression information

1. What is the baseline level of expression for each gene in the module?
2. What is the expression level of genes in a clinical context (i.e. clinical samples)?

In [10]:
# Read in expression data
# Data is of the form SRA sample id x gene id
pao1_compendium = pd.read_csv(paths.PAO1_COMPENDIUM, sep="\t", index_col=0)
pa14_compendium = pd.read_csv(paths.PA14_COMPENDIUM, sep="\t", index_col=0)

In [11]:
pao1_compendium.head()

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA1905,PA0195,PA4812,PA0195.1,PA0457.1,PA1552.1,PA1555.1,PA3701,PA4724.1,PA5471.1
ERX541572,5793.218939,766.512255,1608.330977,1663.46607,176.163343,384.600886,295.846835,453.183561,611.865046,43.032267,...,1.344758,75.306467,447.804528,10.758067,65.893159,44.377025,56.47985,2033.274614,184.231893,1.344758
ERX541573,4416.506898,797.782811,1770.117221,1562.763979,313.958581,324.501966,333.873864,415.87797,550.599003,38.659079,...,1.171487,103.090877,698.206395,18.743796,138.235494,39.830566,96.061954,1315.580171,91.376005,1.171487
ERX541574,3825.086116,644.433113,1852.251003,1589.338107,260.936107,270.820051,363.729119,363.729119,423.03278,67.210815,...,1.976789,128.491265,468.49892,19.767887,73.141182,33.605408,55.350083,1759.341934,67.210815,3.953577
ERX541575,3834.097653,789.216207,1926.825153,1610.427665,289.734779,261.294555,250.629471,520.811596,666.567742,53.325419,...,1.777514,124.425979,611.464809,15.997626,177.751398,21.330168,108.428353,1486.001686,56.880447,1.777514
ERX541576,3515.165133,853.775186,2185.27713,1683.341246,183.98936,245.319146,253.388855,380.890253,551.968079,66.171612,...,1.613942,90.380738,745.641089,11.297592,130.729282,50.032194,95.222563,1273.400041,72.627379,1.613942


In [12]:
# Calculate median expression across all samples
pao1_median_all = pao1_compendium.median().to_frame("median expression")
pa14_median_all = pa14_compendium.median().to_frame("median expression")

In [13]:
pao1_median_all.head()

Unnamed: 0,median expression
PA0001,1846.069803
PA0002,1631.296497
PA0003,1042.829403
PA0004,2863.865717
PA0005,226.541717


In [14]:
# TO DO: Have Deb or Georgia select a study
# The following code blocks allow me to Select subset of samples and calculate the median
# expression across that subset of samples.
# An interesting selection would be what the clinical expression is, however
# it looks like we removed many of the clinical isolates from this compendium with our strain binning
# For now I will leave these blocks commented out
# selected_sample_ids = utils.get_sample_ids(
#   metadata_filename, experiment_colname="SRA_study", sample_colname="Experiment", experiment_id="SRP063289")

In [15]:
# Subset compendium
# subset_pao1_compendium = pao1_compendium.loc[selected_sample_ids]
# subset_pa14_compendium = pa14_compendium.loc[selected_sample_ids]

In [16]:
# print(subset_pao1_compendium.shape)
# print(subset_pa14_compendium.shape)

In [17]:
# pao1_median_subset = subset_pao1_compendium.median().to_frame("median subset expression")
# pa14_median_subset = subset_pa14_compendium.median().to_frame("median subset expression")

In [18]:
# Add median expression to gene ids
pao1_gene_annot = pao1_gene_module_labels.merge(
    pao1_median_all, left_index=True, right_index=True, how="left"
)
pa14_gene_annot = pa14_gene_module_labels.merge(
    pa14_median_all, left_index=True, right_index=True, how="left"
)

In [19]:
# Add median subset expression to gene ids
"""pao1_gene_annot = pao1_gene_annot.merge(
    pao1_median_subset, left_index=True, right_index=True, how="left"
)
pa14_gene_annot = pa14_gene_annot.merge(
    pa14_median_subset, left_index=True, right_index=True, how="left"
)"""

'pao1_gene_annot = pao1_gene_annot.merge(\n    pao1_median_subset, left_index=True, right_index=True, how="left"\n)\npa14_gene_annot = pa14_gene_annot.merge(\n    pa14_median_subset, left_index=True, right_index=True, how="left"\n)'

In [20]:
print(pao1_gene_annot.shape)
pao1_gene_annot.head()

(202, 3)


Unnamed: 0,module id,gene name,median expression
PA1224,8,,42.632516
PA0497,0,,57.609764
PA5149,8,,190.053787
PA1391,14,,60.082125
PA0188,1,,21.000574


In [21]:
print(pa14_gene_annot.shape)
pa14_gene_annot.head()

(530, 3)


Unnamed: 0,module id,gene name,median expression
PA14_59860,35,,9.022371
PA14_30880,17,,15.345832
PA14_35820,21,tnpS,840.90247
PA14_31280,0,,80.791518
PA14_39670,22,,3.942563


## Genome location information

How far are genes from other genes in the same module?

In [22]:
pao1_module_dist = modules.get_intra_module_dist(pao1_gene_annot, pa_prefix="PA")
pa14_module_dist = modules.get_intra_module_dist(pa14_gene_annot, pa_prefix="PA14_")

In [23]:
pao1_module_dist.head(10)

Unnamed: 0_level_0,median pairwise dist,min pairwise dist,max pairwise dist
gene id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PA0497,163.0,1.0,326.0
PA0499,163.0,1.0,326.0
PA0823,163.0,1.0,326.0
PA0498,163.0,1.0,326.0
PA0188,1861.5,1.0,5165.0
PA3066,1861.5,1.0,5165.0
PA2771,1861.5,1.0,5165.0
PA3067,1861.5,1.0,5165.0
PA2119,1861.5,1.0,5165.0
PA2192,1861.5,1.0,5165.0


In [24]:
pa14_module_dist.head(10)

Unnamed: 0_level_0,median pairwise dist,min pairwise dist,max pairwise dist
gene id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PA14_31280,285.0,10.0,29210.0
PA14_30980,285.0,10.0,29210.0
PA14_31270,285.0,10.0,29210.0
PA14_53570,285.0,10.0,29210.0
PA14_31240,285.0,10.0,29210.0
PA14_49520,285.0,10.0,29210.0
PA14_31180,285.0,10.0,29210.0
PA14_31160,285.0,10.0,29210.0
PA14_30990,285.0,10.0,29210.0
PA14_31000,285.0,10.0,29210.0


In [25]:
# Add module distance to gene names
pao1_gene_annot = pao1_gene_annot.merge(
    pao1_module_dist, left_index=True, right_index=True, how="left"
)
pa14_gene_annot = pa14_gene_annot.merge(
    pa14_module_dist, left_index=True, right_index=True, how="left"
)

In [26]:
pao1_gene_annot.head()

Unnamed: 0,module id,gene name,median expression,median pairwise dist,min pairwise dist,max pairwise dist
PA1224,8,,42.632516,1708.0,1.0,5475.0
PA0497,0,,57.609764,163.0,1.0,326.0
PA5149,8,,190.053787,1708.0,1.0,5475.0
PA1391,14,,60.082125,4.0,1.0,12.0
PA0188,1,,21.000574,1861.5,1.0,5165.0


## Add KEGG pathway enrichment analysis

For each pathway, find significant association of pathways in accessory-accessory modules. This information is only available for PAO1.

The [Fisher's exact test](https://en.wikipedia.org/wiki/Fisher%27s_exact_test) determines whether there is a significant association between two categorical variables in a contingency table (i.e two classifications of the data). Here we used use the Fisher’s exact test to determine if there is an association between the two classifications: in kegg pathway or not and in accessory-accessory module or not. In other words, we want to determine if there is a statistically significant association between genes found in a given accessory-accessory moudle and the genes involved in a given KEGG pathway. To do this we compare the ratio of genes found in the kegg pathway that are in the accessory-accessory module to the ratio of kegg pathway genes that are not found in the accessory-accessory module.

Since the numbers are large, we also applied the $\chi^2$ test as an alternative to the Fisher's exact test.

In [27]:
pao1_pathway_filename = "https://raw.githubusercontent.com/greenelab/adage/7a4eda39d360b224268921dc1f2c14b32788ab16/Node_interpretation/pseudomonas_KEGG_terms.txt"

# pao1_pathways = pd.read_csv(pao1_pathway_filename, sep="\t", index_col=0, header=None)

In [28]:
# pao1_pathways[2] = pao1_pathways[2].str.split(";").apply(set)
# pao1_pathways.index = pao1_pathways.index.str.split(" - ").str[0]
pao1_pathways = annotations.load_format_KEGG(pao1_pathway_filename)
pao1_pathways.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
KEGG-Pathway-pae00072: Synthesis and degradation of ketone bodies,10,"{PA4785, PA3589, PA1736, PA3925, PA2001, PA200..."
KEGG-Pathway-pae00071: Fatty acid degradation,32,"{PA4435, PA3014, PA1535, PA5349, PA3629, PA345..."
KEGG-Pathway-pae00903: Limonene and pinene degradation,9,"{PA1821, PA1027, PA2475, PA3331, PA1737, PA301..."
KEGG-Pathway-pae00380: Tryptophan metabolism,27,"{PA2579, PA3014, PA2147, PA2080, PA0421, PA182..."
KEGG-Pathway-pae00900: Terpenoid backbone biosynthesis,16,"{PA3627, PA3633, PA3803, PA4044, PA4785, PA456..."


In [29]:
pao1_gene_annot.head()

Unnamed: 0,module id,gene name,median expression,median pairwise dist,min pairwise dist,max pairwise dist
PA1224,8,,42.632516,1708.0,1.0,5475.0
PA0497,0,,57.609764,163.0,1.0,326.0
PA5149,8,,190.053787,1708.0,1.0,5475.0
PA1391,14,,60.082125,4.0,1.0,12.0
PA0188,1,,21.000574,1861.5,1.0,5165.0


In [30]:
# Given an accessory-accessory module, look for the array module with the most overlap/significant p-value
def KEGG_enrichment(acc_membership_df, kegg_df):
    all_genes = set(acc_membership_df.index)

    rows = []
    best_rows = []
    # For each accessory-accessory module
    for module_name, module_df_group in acc_membership_df.groupby("module id"):
        num_module_genes = module_df_group.shape[0]
        module_genes = set(module_df_group.index)
        not_module_genes = all_genes.difference(module_genes)

        # Find the KEGG pathway with the best overlap
        for kegg_name in kegg_df.index:
            num_kegg_genes = kegg_df.loc[kegg_name, 1]
            kegg_genes = set(kegg_df.loc[kegg_name, 2])
            not_kegg_genes = all_genes.difference(kegg_genes)

            # Make contingency table
            # -----------------|accessory module |not accessory module
            # kegg pathway     | # genes         | # genes
            # not kegg pathway | # genes         | # genes
            module_kegg_genes = module_genes.intersection(kegg_genes)
            not_module_kegg_genes = not_module_genes.intersection(kegg_genes)
            module_not_kegg_genes = module_genes.intersection(not_kegg_genes)
            not_module_not_kegg_genes = not_module_genes.intersection(not_kegg_genes)

            observed_contingency_table = np.array(
                [
                    [len(module_kegg_genes), len(not_module_kegg_genes)],
                    [len(module_not_kegg_genes), len(not_module_not_kegg_genes)],
                ]
            )

            # Fisher's exact test
            oddsr, pval = scipy.stats.fisher_exact(
                observed_contingency_table, alternative="greater"
            )
            # chi2 test will not accept 0 counts for the contingency table
            # chi2, pval, dof, expected_counts = scipy.stats.chi2_contingency(
            #    observed_contingency_table
            # )
            # print(oddsr, pval)

            rows.append(
                {
                    "module id": module_name,
                    "enriched KEGG pathway": kegg_name,
                    "p-value": pval,
                    "num shared genes": len(module_kegg_genes),
                    "size module": num_module_genes,
                    "size KEGG pathway": num_kegg_genes,
                }
            )

    enrichment_df = pd.DataFrame(rows)

    # Get corrected pvalues
    (
        reject_,
        pvals_corrected_,
        alphacSidak,
        alphacBonf,
    ) = statsmodels.stats.multitest.multipletests(
        enrichment_df["p-value"].values,
        alpha=0.05,
        method="fdr_bh",
        is_sorted=False,
    )

    enrichment_df["corrected p-value"] = pvals_corrected_

    # Select best module mapping
    for grp, grp_df in enrichment_df.groupby("module id"):
        # Find if any pathways is significant
        any_significant = (grp_df["corrected p-value"] < 0.05).any()
        if any_significant:
            best_kegg = grp_df[grp_df["corrected p-value"] < 0.05][
                "enriched KEGG pathway"
            ]
            best_pval = grp_df[grp_df["corrected p-value"] < 0.05]["p-value"].values[0]
            best_shared = grp_df[grp_df["corrected p-value"] < 0.05][
                "num shared genes"
            ].values[0]
            best_module_size = grp_df[grp_df["corrected p-value"] < 0.05][
                "size module"
            ].values[0]
            best_kegg_size = grp_df[grp_df["corrected p-value"] < 0.05][
                "size KEGG pathway"
            ].values[0]
            best_corrected_pval = grp_df[grp_df["corrected p-value"] < 0.05][
                "corrected p-value"
            ].values[0]
            best_rows.append(
                {
                    "module id": grp,
                    "enriched KEGG pathway": best_kegg,
                    "p-value": best_pval,
                    "num shared genes": best_shared,
                    "size module": best_module_size,
                    "size KEGG pathway": best_kegg_size,
                    "corrected p-value": best_corrected_pval,
                }
            )
        else:
            best_rows.append(
                {
                    "module id": grp,
                    "enriched KEGG pathway": "NA",
                    "p-value": "NA",
                    "num shared genes": "NA",
                    "size module": "NA",
                    "size KEGG pathway": "NA",
                    "corrected p-value": "NA",
                }
            )
    best_enrichment_df = pd.DataFrame(best_rows).set_index("module id")

    return best_enrichment_df

In [31]:
pao1_enrichment_df = KEGG_enrichment(pao1_membership, pao1_pathways)

In [32]:
pao1_enrichment_df.head(20)

Unnamed: 0_level_0,enriched KEGG pathway,p-value,num shared genes,size module,size KEGG pathway,corrected p-value
module id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,809 KEGG-Module-M00236: Putative polar amin...,1.03963e-06,4.0,8.0,24.0,0.00166913
5,952 KEGG-Module-M00299: Spermidine/putresci...,5.19815e-07,4.0,7.0,12.0,0.00166913
6,,,,,,
7,,,,,,
8,,,,,,
9,,,,,,


In [33]:
# Add pathway enrichment information
pao1_gene_annot = pao1_gene_annot.merge(
    pao1_enrichment_df, left_on="module id", right_index=True, how="left"
)

In [34]:
pao1_gene_annot.head()

Unnamed: 0,module id,gene name,median expression,median pairwise dist,min pairwise dist,max pairwise dist,enriched KEGG pathway,p-value,num shared genes,size module,size KEGG pathway,corrected p-value
PA1224,8,,42.632516,1708.0,1.0,5475.0,,,,,,
PA0497,0,,57.609764,163.0,1.0,326.0,,,,,,
PA5149,8,,190.053787,1708.0,1.0,5475.0,,,,,,
PA1391,14,,60.082125,4.0,1.0,12.0,,,,,,
PA0188,1,,21.000574,1861.5,1.0,5165.0,,,,,,


## Import and format operon

In [35]:
pao1_operon_filename = paths.PAO1_OPERON
pa14_operon_filename = paths.PA14_OPERON

In [36]:
# pao1_operon = pd.read_csv(pao1_operon_filename, index_col=0, header=0)
# pa14_operon = pd.read_csv(pa14_operon_filename, index_col=0, header=0)

In [37]:
# There are 247 PAO1 genes with multiple annotations
# This operon df contains annotations from predicted operons based on DOOR database
# predictions which make up the majority of the operons) as well as some that
# are curated (i.e. PseudoCAP)
# There are some that have multiple PseudoCAP annotations too

# Here we will keep the last PseudoCAP annotations
# To ensure that the PseudoCAP annotations are the last ones, we will sort the values
# pao1_operon = pao1_operon.sort_values(by=["locus_tag", "source_database"])
# pa14_operon = pa14_operon.sort_values(by=["locus_tag", "source_database"])

In [38]:
# pao1_operon = pao1_operon.set_index("locus_tag")
# pa14_operon = pa14_operon.set_index("locus_tag")

In [39]:
# print(pao1_operon.shape)
# pao1_operon.head()

(3816, 7)


Unnamed: 0_level_0,operon_name,start,end,strand,gene_name,source_database,pmid
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PA0001,dnaA-dnaN-recF-gyrB,483,2027,1,dnaA,DOOR,18988623
PA0002,dnaA-dnaN-recF-gyrB,2056,3159,1,dnaN,DOOR,18988623
PA0003,dnaA-dnaN-recF-gyrB,3169,4278,1,recF,DOOR,18988623
PA0004,dnaA-dnaN-recF-gyrB,4275,6695,1,gyrB,DOOR,18988623
PA0005,PA0006-lptA,7018,7791,-1,lptA,DOOR,18988623


In [40]:
# print(pa14_operon.shape)
# pa14_operon.head()

(3756, 7)


Unnamed: 0_level_0,operon_name,start,end,strand,gene_name,source_database,pmid
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PA14_00010,dnaA-dnaN-recF-gyrB,483,2027,1,dnaA,DOOR,18988623
PA14_00020,dnaA-dnaN-recF-gyrB,2056,3159,1,dnaN,DOOR,18988623
PA14_00030,dnaA-dnaN-recF-gyrB,3169,4278,1,recF,DOOR,18988623
PA14_00050,dnaA-dnaN-recF-gyrB,4275,6695,1,gyrB,DOOR,18988623
PA14_00060,PA14_00070-PA14_00060,7018,7791,-1,,DOOR,18988623


In [41]:
# pao1_operon = pao1_operon[~pao1_operon.index.duplicated(keep="last")]
# pa14_operon = pa14_operon[~pa14_operon.index.duplicated(keep="last")]

In [42]:
# Only include columns for gene id and operon_name
# pao1_operon = pao1_operon["operon_name"].to_frame()
# pa14_operon = pa14_operon["operon_name"].to_frame()

In [None]:
pao1_operon = annotations.load_format_operons(pao1_operon_filename)
pa14_operon = annotations.load_format_operons(pa14_operon_filename)

# TO DO:
# Check that the dim
# Check the saved output

In [43]:
pao1_operon.head()

Unnamed: 0_level_0,operon_name
locus_tag,Unnamed: 1_level_1
PA0001,dnaA-dnaN-recF-gyrB
PA0002,dnaA-dnaN-recF-gyrB
PA0003,dnaA-dnaN-recF-gyrB
PA0004,dnaA-dnaN-recF-gyrB
PA0005,PA0006-lptA


In [44]:
# Add operons to pathway annotations for PAO1
pao1_gene_annot = pao1_gene_annot.merge(
    pao1_operon, left_index=True, right_index=True, how="left"
)

In [45]:
print(pao1_gene_annot.shape)
pao1_gene_annot.head()

(202, 13)


Unnamed: 0,module id,gene name,median expression,median pairwise dist,min pairwise dist,max pairwise dist,enriched KEGG pathway,p-value,num shared genes,size module,size KEGG pathway,corrected p-value,operon_name
PA1224,8,,42.632516,1708.0,1.0,5475.0,,,,,,,
PA0497,0,,57.609764,163.0,1.0,326.0,,,,,,,
PA5149,8,,190.053787,1708.0,1.0,5475.0,,,,,,,PA5146-mutY-PA5148-PA5149
PA1391,14,,60.082125,4.0,1.0,12.0,,,,,,,PA1390-PA1391
PA0188,1,,21.000574,1861.5,1.0,5165.0,,,,,,,PA0187-PA0188


In [46]:
# For PA14 we only have operon annotations
pa14_gene_annot = pa14_gene_annot.merge(
    pa14_operon, left_index=True, right_index=True, how="left"
)

## Add regulon

For each regulon, what genes are contained in it. This information is only available for PAO1

In [47]:
pao1_regulon_filename = "https://raw.githubusercontent.com/greenelab/core-accessory-interactome/6635c0e357c0172c2cebd0368648030e0ee4beaf/data/metadata/regulons_format.csv"

pao1_regulons = pd.read_csv(pao1_regulon_filename, index_col=0, header=0)

In [48]:
pao1_regulons["Genes"] = pao1_regulons["Genes"].str.split(";").apply(set)

In [49]:
gene_to_regulons_df = pd.DataFrame(
    index=pao1_gene_module_labels.index, columns=list(pao1_regulons.index)
)

In [50]:
%%time
for gene in gene_to_regulons_df.index:
    gene_to_regulons_df.loc[gene] = [
        gene in pao1_regulons.loc[regulon, "Genes"] for regulon in pao1_regulons.index
    ]

CPU times: user 62 ms, sys: 198 µs, total: 62.2 ms
Wall time: 61.4 ms


In [51]:
# Add regulons to other annotations
pao1_gene_annot = pao1_gene_annot.merge(
    gene_to_regulons_df, left_index=True, right_index=True, how="left"
)

In [52]:
print(pao1_gene_annot.shape)
pao1_gene_annot.head()

(202, 30)


Unnamed: 0,module id,gene name,median expression,median pairwise dist,min pairwise dist,max pairwise dist,enriched KEGG pathway,p-value,num shared genes,size module,...,PqsR_regulon,QscR_regulon,VreI_regulon,Zur_regulon,Anr_short_list,PhoB_short_list,AlgU_short_list,LasR_short_list,RhlR_short_list,PqsR_short_list
PA1224,8,,42.632516,1708.0,1.0,5475.0,,,,,...,False,False,False,False,False,False,False,False,False,False
PA0497,0,,57.609764,163.0,1.0,326.0,,,,,...,False,False,False,False,False,False,False,False,False,False
PA5149,8,,190.053787,1708.0,1.0,5475.0,,,,,...,False,False,False,False,False,False,False,False,False,False
PA1391,14,,60.082125,4.0,1.0,12.0,,,,,...,False,False,False,False,False,False,False,False,False,False
PA0188,1,,21.000574,1861.5,1.0,5165.0,,,,,...,False,False,False,False,False,False,False,False,False,False


In [53]:
print(pa14_gene_annot.shape)
pa14_gene_annot.head()

(530, 7)


Unnamed: 0,module id,gene name,median expression,median pairwise dist,min pairwise dist,max pairwise dist,operon_name
PA14_59860,35,,9.022371,215.0,10.0,1040.0,PA14_59860-PA14_59870-PA14_59880-PA14_59890-PA...
PA14_30880,17,,15.345832,180.0,10.0,17610.0,PA14_30880-PA14_30870-PA14_30860-PA14_30850
PA14_35820,21,tnpS,840.90247,1830.0,10.0,25880.0,tnpS-PA14_35810-PA14_35800
PA14_31280,0,,80.791518,285.0,10.0,29210.0,PA14_31280-PA14_31270
PA14_39670,22,,3.942563,18030.0,10.0,61000.0,


In [54]:
# Save
pao1_gene_annot.to_csv(f"pao1_acc_gene_module_annotated_{method}.tsv", sep="\t")
pa14_gene_annot.to_csv(f"pa14_acc_gene_module_annotated_{method}.tsv", sep="\t")

These annotations will be used to help _P. aeruginosa_ experts, like our collaborators, to determine what accessory-accessory modules to focus on.


Note: Since genes can be in multiple KEGG pathways and regulons, each pathway and regulon are separate columns. Whereas operons are a single column since genes can belong to only a single operon.