# Find co-expressed accessory genes

From the [core-acc analysis](../core_acc_analysis/stable_gene_relationships.ipynb) we observed that the least stable core genes tended to be highly co-expressed with more accessory genes than the most stable core genes. Therefore, this notebook reports the accessory genes that the least stable core genes are most co-expressed with.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import random
import scipy
import pandas as pd
from core_acc_modules import utils, paths, gene_relationships

# Seed Python's built-in `random` module so any use of it is repeatable.
# NOTE(review): this does not seed numpy's random generator -- confirm that
# no downstream helper relies on numpy randomness.
random.seed(1)

In [2]:
# User params
# use_operon: when True, the operon annotation tables are passed to
# `find_related_acc_genes` below; when False, None is passed instead.
use_operon = True

### Load correlation matrix

In [3]:
# Load the correlation matrix for each strain (tab-separated file, first
# column used as the row index, first row used as the header)
pao1_corr_filename = paths.PAO1_CORR_LOG_SPELL
pao1_corr = pd.read_csv(pao1_corr_filename, sep="\t", index_col=0, header=0)

pa14_corr_filename = paths.PA14_CORR_LOG_SPELL
pa14_corr = pd.read_csv(pa14_corr_filename, sep="\t", index_col=0, header=0)

In [4]:
# Make an empty (zero-column) dataframe indexed by the PAO1 gene ids taken
# from the correlation matrix index; columns are added to it further down
pao1_membership = pd.DataFrame(data=[], index=pao1_corr.index)
print(pao1_membership.shape)
pao1_membership.head()

(5563, 0)


PA0001
PA0002
PA0003
PA0004
PA0005


In [5]:
# Make an empty (zero-column) dataframe indexed by the PA14 gene ids taken
# from the correlation matrix index; columns are added to it further down
pa14_membership = pd.DataFrame(data=[], index=pa14_corr.index)
print(pa14_membership.shape)
pa14_membership.head()

(5891, 0)


PA14_55610
PA14_55600
PA14_55590
PA14_55580
PA14_55570


### Load and get least stable core genes

In [6]:
# Load the transcriptional similarity tables, one per strain.
# These hold the subset of genes that we will consider.
pao1_similarity_scores_filename = "pao1_similarity_scores.tsv"
pao1_similarity_scores = pd.read_csv(
    pao1_similarity_scores_filename, sep="\t", header=0, index_col=0
)

pa14_similarity_scores_filename = "pa14_similarity_scores.tsv"
pa14_similarity_scores = pd.read_csv(
    pa14_similarity_scores_filename, sep="\t", header=0, index_col=0
)

In [7]:
# Collect the ids (index values) of the genes labeled "least stable"
# in each strain's similarity table
pao1_is_least_stable = pao1_similarity_scores["label"] == "least stable"
pao1_least_stable_genes = list(
    pao1_similarity_scores.loc[pao1_is_least_stable].index
)

pa14_is_least_stable = pa14_similarity_scores["label"] == "least stable"
pa14_least_stable_genes = list(
    pa14_similarity_scores.loc[pa14_is_least_stable].index
)

### Load core/accessory gene labels

In [8]:
# Read in the expression compendium for each strain (tab-separated file,
# first column used as the row index, first row used as the header)
pao1_expression_filename = paths.PAO1_COMPENDIUM
pao1_expression = pd.read_csv(pao1_expression_filename, sep="\t", index_col=0, header=0)

pa14_expression_filename = paths.PA14_COMPENDIUM
pa14_expression = pd.read_csv(pa14_expression_filename, sep="\t", index_col=0, header=0)

In [9]:
# Gene annotation files for each strain
pao1_annot_filename = paths.GENE_PAO1_ANNOT
pa14_annot_filename = paths.GENE_PA14_ANNOT

# Build the core/accessory gene groupings from the annotation files and the
# expression data. The result is indexed by string keys; the ones used later
# in this notebook are "core_pao1", "core_pa14", "acc_pao1" and "acc_pa14".
# The call also prints summary gene counts.
core_acc_dict = utils.get_my_core_acc_genes(
    pao1_annot_filename, pa14_annot_filename, pao1_expression, pa14_expression
)

Number of PAO1 core genes: 5366
Number of PA14 core genes: 5363
Number of PAO1 core genes in my dataset: 5361
Number of PA14 core genes in my dataset: 5357
Number of PAO1-specific genes: 202
Number of PA14-specific genes: 534


In [10]:
# Unpack the core and accessory gene id groups for each strain
pao1_core, pa14_core, pao1_acc, pa14_acc = (
    core_acc_dict[group_key]
    for group_key in ("core_pao1", "core_pa14", "acc_pao1", "acc_pa14")
)

In [11]:
# Label each PAO1 gene in a new "core/acc" column: first the core genes,
# then the accessory genes
pao1_membership.loc[pao1_core, "core/acc"] = "core"
pao1_membership.loc[pao1_acc, "core/acc"] = "acc"

In [12]:
# Label each PA14 gene in a new "core/acc" column: first the core genes,
# then the accessory genes
pa14_membership.loc[pa14_core, "core/acc"] = "core"
pa14_membership.loc[pa14_acc, "core/acc"] = "acc"

In [13]:
# Expose the membership dataframes under the names used downstream.
# Note: this is a plain assignment -- no column is dropped and no copy is made.
pao1_arr = pao1_membership
pa14_arr = pa14_membership

In [14]:
# Make sure the membership dataframes are sorted by gene id (the index)
# NOTE: PA14 gene ids don't increment by 1 but by 10 or 20 -- open question:
# are we missing some genes?
pao1_arr = pao1_arr.sort_index()
pa14_arr = pa14_arr.sort_index()

### Load operon annotations

In [15]:
# Paths to the operon annotation files for each strain
pao1_operon_filename = paths.PAO1_OPERON
pa14_operon_filename = paths.PA14_OPERON

In [16]:
# Read the operon annotation tables (default comma separator), using the
# first column as the index and the first row as the header
pao1_operon = pd.read_csv(pao1_operon_filename, index_col=0, header=0)
pa14_operon = pd.read_csv(pa14_operon_filename, index_col=0, header=0)

In [17]:
# Re-index the operon tables by the "locus_tag" column; this replaces the
# index that was taken from the first file column at load time
pao1_operon = pao1_operon.set_index("locus_tag")
pa14_operon = pa14_operon.set_index("locus_tag")

In [18]:
# There are 247 PAO1 genes with multiple annotations.
# This operon df contains annotations from predicted operons based on DOOR
# database predictions (which make up the majority of the operons) as well as
# some that are curated (i.e. PseudoCAP).
# Some genes have multiple PseudoCAP annotations too.

# For each duplicated locus tag, keep only the last row.
# NOTE(review): the intent stated by the original author is that this keeps
# the curated (PseudoCAP) annotation -- that depends on the row order of the
# input file; confirm.
# Open question: do we want to discard these duplicated annotations
# altogether, or do they need to be carefully curated to decide which to keep?
pao1_operon = pao1_operon[~pao1_operon.index.duplicated(keep="last")]
pa14_operon = pa14_operon[~pa14_operon.index.duplicated(keep="last")]

In [19]:
# Only include columns for gene id and operon_name
pao1_operon = pao1_operon["operon_name"].to_frame()
pa14_operon = pa14_operon["operon_name"].to_frame()

In [20]:
# Hand the operon tables downstream only when the user opted in via
# `use_operon`; otherwise pass None
pao1_operon_expression_to_use = pao1_operon if use_operon else None
pa14_operon_expression_to_use = pa14_operon if use_operon else None

### Find all accessory genes co-expressed with least stable core genes

In [21]:
%%time
# Find the accessory genes related to each least stable PAO1 core gene.
# Inputs: the PAO1 correlation matrix, the least stable gene ids, the
# core/acc membership table, and the optional operon table (None when
# `use_operon` is False).
pao1_least_acc_relationships = gene_relationships.find_related_acc_genes(
    pao1_corr,
    pao1_least_stable_genes,
    pao1_arr,
    # NOTE(review): presumably the number of top co-expressed genes examined
    # per core gene -- confirm against `find_related_acc_genes`
    10,
    pao1_operon_expression_to_use,
)

CPU times: user 2min 18s, sys: 3.4 ms, total: 2min 18s
Wall time: 2min 18s


In [22]:
# Preview the first rows of the PAO1 core gene -> related accessory genes table
pao1_least_acc_relationships.head()

Unnamed: 0_level_0,Related acc genes
gene id,Unnamed: 1_level_1
PA0850,No accessory genes
PA2283,[PA2336]
PA0346,No accessory genes
PA1633,No accessory genes
PA1195,No accessory genes


In [23]:
%%time
# Find the accessory genes related to each least stable PA14 core gene.
# Inputs: the PA14 correlation matrix, the least stable gene ids, the
# core/acc membership table, and the optional operon table (None when
# `use_operon` is False).
pa14_least_acc_relationships = gene_relationships.find_related_acc_genes(
    pa14_corr,
    pa14_least_stable_genes,
    pa14_arr,
    # NOTE(review): presumably the number of top co-expressed genes examined
    # per core gene -- confirm against `find_related_acc_genes`
    10,
    pa14_operon_expression_to_use,
)

CPU times: user 2min 29s, sys: 3.97 ms, total: 2min 29s
Wall time: 2min 29s


In [24]:
# Preview the first rows of the PA14 core gene -> related accessory genes table
pa14_least_acc_relationships.head()

Unnamed: 0_level_0,Related acc genes
gene id,Unnamed: 1_level_1
PA14_00600,No accessory genes
PA14_01600,"[PA14_30950, PA14_44230, PA14_30870]"
PA14_01910,No accessory genes
PA14_01980,No accessory genes
PA14_02300,No accessory genes


### Add accessory gene list to core-core annotation df

In [25]:
# Paths to the current core-core annotation tables.
# Note: the same paths are written to by the save step at the end of this
# notebook.
pao1_core_stable_similarity_filename = "pao1_core_stable_associations.tsv"
pa14_core_stable_similarity_filename = "pa14_core_stable_associations.tsv"

In [26]:
# Load the current annotation tables (tab-separated, first column used as
# the index, first row used as the header)
pao1_all_associations = pd.read_csv(
    pao1_core_stable_similarity_filename, sep="\t", header=0, index_col=0
)
pa14_all_associations = pd.read_csv(
    pa14_core_stable_similarity_filename, sep="\t", header=0, index_col=0
)

In [27]:
# Record the table dimensions before merging, for comparison afterwards
print(pao1_all_associations.shape)
print(pa14_all_associations.shape)

(682, 6)
(682, 6)


In [28]:
# Attach the related accessory genes to the existing annotation tables.
# This is a left join on the index, so every annotated gene is kept whether
# or not it has an entry in the relationships table.
#
# The annotation files are written back to the same paths by the save step
# of this notebook, so on a re-run they already carry the relationship
# column(s). Drop any such stale column first: otherwise the merge would emit
# suffixed "_x"/"_y" duplicates and the later column reordering would fail
# with a KeyError. On a first run nothing is dropped (errors="ignore").
pao1_all_associations = pao1_all_associations.drop(
    columns=pao1_least_acc_relationships.columns, errors="ignore"
).merge(pao1_least_acc_relationships, left_index=True, right_index=True, how="left")
pa14_all_associations = pa14_all_associations.drop(
    columns=pa14_least_acc_relationships.columns, errors="ignore"
).merge(pa14_least_acc_relationships, left_index=True, right_index=True, how="left")

In [29]:
# Print the dimensions after the merge so they can be compared by eye with
# the pre-merge shapes: the row count should be unchanged
print(pao1_all_associations.shape)
print(pa14_all_associations.shape)

(682, 7)
(682, 7)


In [32]:
# Put the columns in a fixed order for the output tables. The two strains
# share every column except the homolog id, which names the other strain.
leading_cols = ["Name"]
trailing_cols = [
    "pathways present",
    "Transcriptional similarity across strains",
    "P-value",
    "label",
    "Related acc genes",
]

pao1_all_associations = pao1_all_associations[
    leading_cols + ["PA14 homolog id"] + trailing_cols
]

pa14_all_associations = pa14_all_associations[
    leading_cols + ["PAO1 homolog id"] + trailing_cols
]

In [35]:
# Preview the PAO1 annotations, sorted by the "label" column
pao1_all_associations.sort_values(by="label").head()

Unnamed: 0_level_0,Name,PA14 homolog id,pathways present,Transcriptional similarity across strains,P-value,label,Related acc genes
gene id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PA1367,,PA14_46630,[],0.186353,5.331981e-43,least stable,No accessory genes
PA0320,carO,PA14_04180,[],0.176029,1.7502599999999998e-38,least stable,No accessory genes
PA4799,,PA14_63390,[],0.19457,8.656657e-47,least stable,No accessory genes
PA0978,,PA14_51630,[],0.113417,8.802946000000001e-17,least stable,"[PA1937, PA2225]"
PA2723,,PA14_28940,[],0.165376,4.163685e-34,least stable,"[PA5265, PA3066, PA5264, PA0258, PA2296]"


In [36]:
# Preview the PA14 annotations, sorted by the "label" column
pa14_all_associations.sort_values(by="label").head()

Unnamed: 0,Name,PAO1 homolog id,pathways present,Transcriptional similarity across strains,P-value,label,Related acc genes
PA14_35690,pslE,PA2235,[],0.175337,3.2773329999999997e-38,least stable,"[PA14_31070, PA14_10120, PA14_33970, PA14_20520]"
PA14_48770,,PA1201,[],0.182194,3.6027789999999996e-41,least stable,No accessory genes
PA14_11670,mucE,PA4033,[],0.179098,8.084179e-40,least stable,[PA14_58980]
PA14_60070,,PA1939,[],0.192158,1.104447e-45,least stable,"[PA14_60120, PA14_60110, PA14_59780, PA14_5917..."
PA14_59960,,PA0982,[],0.135936,1.7066030000000002e-23,least stable,"[PA14_59350, PA14_59830, PA14_59060, PA14_6002..."


In [37]:
# Save the annotated tables as tab-separated files.
# Note: these are the same paths the tables were loaded from, so the input
# files are overwritten with the version that includes "Related acc genes".
pao1_all_associations.to_csv(pao1_core_stable_similarity_filename, sep="\t")
pa14_all_associations.to_csv(pa14_core_stable_similarity_filename, sep="\t")