# Calculate conditional probability

Now that we have labels for which module is mostly core, mostly accessory or mixed, we can ask our original question: How is the expression of different gene groups coordinated? Specifically we can ask: Are accessory genes more likely to be co-expressed with other accessory genes?

To answer this we can calculate the following conditional probability:
$$
Pr(\text{gene x in an acc module | gene x is acc gene}) = \frac{Pr(\text{gene x in acc module}\cap \text{gene x is acc gene})}{Pr(\text{gene x is acc gene})}
$$

A similar probability can be calculated for core genes co-expressed with other core genes.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import pandas as pd
import numpy as np
from core_acc_modules import utils, paths

In [2]:
# User params
# `method` is interpolated into the names of the annotated membership files loaded
# below, so it selects which set of module assignments this notebook analyzes
# (presumably the name of the clustering method -- confirm against the notebook
# that wrote those files)
method = "affinity"

In [3]:
# Load the annotated gene-to-module membership table for each strain.
# The file name depends on `method`; the first column of each TSV becomes the index.

# PAO1
pao1_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR,
    f"pao1_gene_module_annotated_{method}.tsv",
)
pao1_membership = pd.read_csv(
    pao1_membership_filename, header=0, index_col=0, sep="\t"
)

# PA14
pa14_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR,
    f"pa14_gene_module_annotated_{method}.tsv",
)
pa14_membership = pd.read_csv(
    pa14_membership_filename, header=0, index_col=0, sep="\t"
)

In [4]:
# Preview the first rows of the PAO1 membership table to check that it loaded
pao1_membership.head()

Unnamed: 0,module id,num core,num acc,odds ratio,p-value,module label,num core in module,num acc in module,num core outside module,num acc outside module,...,PqsR_regulon,QscR_regulon,VreI_regulon,Zur_regulon,Anr_short_list,PhoB_short_list,AlgU_short_list,LasR_short_list,RhlR_short_list,PqsR_short_list
PA0001,415,,,0.493271,0.407427,mixed,12.0,0.0,5349.0,202.0,...,False,False,False,False,False,False,False,False,False,False
PA0006,415,,,0.493271,0.407427,mixed,12.0,0.0,5349.0,202.0,...,False,False,False,False,False,False,False,False,False,False
PA1440,415,,,0.493271,0.407427,mixed,12.0,0.0,5349.0,202.0,...,False,False,False,False,False,False,False,False,False,False
PA1544,415,,,0.493271,0.407427,mixed,12.0,0.0,5349.0,202.0,...,False,False,False,False,False,False,False,False,False,False
PA1741,415,,,0.493271,0.407427,mixed,12.0,0.0,5349.0,202.0,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# Preview the first rows of the PA14 membership table to check that it loaded
pa14_membership.head()

Unnamed: 0,module id,num core,num acc,odds ratio,p-value,module label,num core in module,num acc in module,num core outside module,num acc outside module,...,PqsR_regulon,QscR_regulon,VreI_regulon,Zur_regulon,Anr_short_list,PhoB_short_list,AlgU_short_list,LasR_short_list,RhlR_short_list,PqsR_short_list
PA14_00010,537,,,0.9,1.0,mixed,8.0,0.0,5349.0,534.0,...,False,False,False,False,False,False,False,False,False,False
PA14_00020,537,,,0.9,1.0,mixed,8.0,0.0,5349.0,534.0,...,False,False,False,False,False,False,False,False,False,False
PA14_00030,537,,,0.9,1.0,mixed,8.0,0.0,5349.0,534.0,...,False,False,False,False,False,False,False,False,False,False
PA14_00050,252,,,0.79985,0.576226,mixed,7.0,0.0,5350.0,534.0,...,False,False,False,False,False,False,False,False,False,False
PA14_00060,276,,,1.201125,1.0,mixed,23.0,1.0,5334.0,533.0,...,False,False,False,False,False,False,False,False,False,False


## Get core/accessory annotations

In [6]:
# Load the expression compendium of each strain (tab-separated, first column as index)

# PAO1
pao1_expression_filename = paths.PAO1_COMPENDIUM
pao1_expression = pd.read_csv(
    pao1_expression_filename, header=0, index_col=0, sep="\t"
)

# PA14
pa14_expression_filename = paths.PA14_COMPENDIUM
pa14_expression = pd.read_csv(
    pa14_expression_filename, header=0, index_col=0, sep="\t"
)

In [7]:
# Gene annotation files for each strain, passed to the helper together with the
# expression matrices loaded above
pao1_annot_filename = paths.GENE_PAO1_ANNOT
pa14_annot_filename = paths.GENE_PA14_ANNOT

# The helper prints a summary of core / strain-specific gene counts and returns a
# dict; the keys read from it later in this notebook are "core_pao1", "core_pa14",
# "acc_pao1" and "acc_pa14".
# NOTE(review): the printed "in my dataset" counts suggest core genes are restricted
# to those present in the expression data -- confirm in utils.get_my_core_acc_genes
core_acc_dict = utils.get_my_core_acc_genes(
    pao1_annot_filename, pa14_annot_filename, pao1_expression, pa14_expression
)

Number of PAO1 core genes: 5366
Number of PA14 core genes: 5363
Number of PAO1 core genes in my dataset: 5361
Number of PA14 core genes in my dataset: 5357
Number of PAO1-specific genes: 202
Number of PA14-specific genes: 534


In [8]:
# Unpack the per-strain core and accessory gene collections from the annotation dict
pao1_core, pa14_core, pao1_acc, pa14_acc = (
    core_acc_dict[key] for key in ("core_pao1", "core_pa14", "acc_pao1", "acc_pa14")
)

## Calculate likelihood
$Pr(\text{gene x in acc module}\cap \text{gene x is acc gene})$ is the number of accessory genes in accessory modules divided by the total number of genes

In [9]:
# Build a one-row-per-module table holding each module's label and accessory count.
# The membership table has one row per gene, so the module-level columns repeat for
# every gene in a module. Duplicates are dropped while "module id" is still a column:
# `drop_duplicates` ignores the index, so deduplicating after `set_index` would also
# collapse *different* modules that happen to share the same label and accessory
# count, making any sum computed from this table too small.
pao1_subset = (
    pao1_membership[["module id", "module label", "num acc in module"]]
    .drop_duplicates()
    .set_index("module id")
)

# Each module must appear exactly once, otherwise its genes would be counted repeatedly
assert (
    pao1_subset.index.is_unique
), "PAO1 module label / accessory count is not constant within a module"

In [10]:
# Build a one-row-per-module table holding each module's label and accessory count.
# The membership table has one row per gene, so the module-level columns repeat for
# every gene in a module. Duplicates are dropped while "module id" is still a column:
# `drop_duplicates` ignores the index, so deduplicating after `set_index` would also
# collapse *different* modules that happen to share the same label and accessory
# count, making any sum computed from this table too small.
pa14_subset = (
    pa14_membership[["module id", "module label", "num acc in module"]]
    .drop_duplicates()
    .set_index("module id")
)

# Each module must appear exactly once, otherwise its genes would be counted repeatedly
assert (
    pa14_subset.index.is_unique
), "PA14 module label / accessory count is not constant within a module"

In [11]:
# Total accessory genes that sit in modules labeled "mostly accessory", per strain:
# take the accessory-count column, keep only the rows of such modules, and add them up
num_acc_gene_in_acc_mod_pao1 = pao1_subset["num acc in module"][
    pao1_subset["module label"] == "mostly accessory"
].sum()
num_acc_gene_in_acc_mod_pa14 = pa14_subset["num acc in module"][
    pa14_subset["module label"] == "mostly accessory"
].sum()

In [None]:
# Currently there are no mostly core modules, so this is a placeholder
# (kept as `#` comments rather than a bare string literal, which the notebook would
# echo as the cell's output when run)
# NOTE: `pao1_subset` / `pa14_subset` only keep "num acc in module"; the
# "num core in module" column has to be added to them before enabling this.
# Get number of core genes in "mostly core" modules
# num_core_gene_in_core_mod_pao1 = pao1_subset.loc[
#     pao1_subset["module label"] == "mostly core", "num core in module"
# ].sum()
# num_core_gene_in_core_mod_pa14 = pa14_subset.loc[
#     pa14_subset["module label"] == "mostly core", "num core in module"
# ].sum()

In [12]:
# Joint probability Pr(gene in acc module AND gene is acc), per strain:
# accessory genes found in "mostly accessory" modules divided by the total number
# of annotated genes (core + accessory)
lik_pao1_acc = num_acc_gene_in_acc_mod_pao1 / (len(pao1_core) + len(pao1_acc))
lik_pa14_acc = num_acc_gene_in_acc_mod_pa14 / (len(pa14_core) + len(pa14_acc))

In [None]:
# Currently there are no mostly core modules, so this is a placeholder
# lik_pao1_core = num_core_gene_in_core_mod_pao1 / (len(pao1_core) + len(pao1_acc))
# lik_pa14_core = num_core_gene_in_core_mod_pa14 / (len(pa14_core) + len(pa14_acc))

In [13]:
# Show the raw count behind the PAO1 joint probability
num_acc_gene_in_acc_mod_pao1

99.0

In [14]:
# Show the raw count behind the PA14 joint probability
num_acc_gene_in_acc_mod_pa14

200.0

In [15]:
# Joint probabilities, one per line: PAO1 first, then PA14
print(lik_pao1_acc, lik_pa14_acc, sep="\n")

0.017796153154772603
0.03395009336275675


In [None]:
# Currently there are no mostly core modules, so this is a placeholder
# print(lik_pao1_core)
# print(lik_pa14_core)

### Calculate prior distribution
$Pr(\text{gene x is acc gene})$ is the number of accessory genes divided by the total number of genes

Similarly for core genes

In [16]:
# Prior Pr(gene is acc), per strain: fraction of all annotated genes
# (core + accessory) that are accessory
pr_pao1_acc = len(pao1_acc) / (len(pao1_core) + len(pao1_acc))
pr_pa14_acc = len(pa14_acc) / (len(pa14_core) + len(pa14_acc))

In [None]:
# Prior Pr(gene is core), per strain: fraction of all annotated genes
# (core + accessory) that are core
pr_pao1_core = len(pao1_core) / (len(pao1_core) + len(pao1_acc))
pr_pa14_core = len(pa14_core) / (len(pa14_core) + len(pa14_acc))

In [17]:
# Accessory priors, one per line: PAO1 first, then PA14
print(pr_pao1_acc, pr_pa14_acc, sep="\n")

0.036311342800647135
0.09064674927856052


In [None]:
# Core priors, one per line: PAO1 first, then PA14
print(pr_pao1_core, pr_pa14_core, sep="\n")

## Calculate conditional probability

In [18]:
# Conditional probability = joint probability / prior, per strain:
# Pr(gene in acc module | gene is acc) = Pr(gene in acc module AND acc) / Pr(gene is acc)
pr_acc2acc_pao1 = lik_pao1_acc / pr_pao1_acc
pr_acc2acc_pa14 = lik_pa14_acc / pr_pa14_acc

print(
    f"Probability of accessory gene being co-expressed with another accessory gene in PAO1 is {pr_acc2acc_pao1}"
)
print(
    f"Probability of accessory gene being co-expressed with another accessory gene in PA14 is {pr_acc2acc_pa14}"
)

Probability of accessory gene being co-expressed with another accessory gene in PAO1 is 0.49009900990099003
Probability of accessory gene being co-expressed with another accessory gene in PA14 is 0.3745318352059925


In [None]:
# Currently there are no mostly core modules, so this is a placeholder
# (kept as `#` comments rather than a bare string literal, which the notebook would
# echo as the cell's output when run)
# pr_core2core_pao1 = lik_pao1_core / pr_pao1_core
# pr_core2core_pa14 = lik_pa14_core / pr_pa14_core
#
# print(
#     f"Probability of core gene being co-expressed with another core gene in PAO1 is {pr_core2core_pao1}"
# )
# print(
#     f"Probability of core gene being co-expressed with another core gene in PA14 is {pr_core2core_pa14}"
# )