# PAO1 vs PA14 core modules

This notebook compares the core modules from PAO1 and PA14 and tries to determine if they are consistent (i.e. are the same genes grouped in PAO1 also grouped in PA14?)

In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
from core_acc_modules import paths, utils

In [2]:
# User params
cluster_method = "dbscan"

In [3]:
# Load module membership
pao1_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR, f"pao1_modules_{cluster_method}.tsv"
)
pa14_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR, f"pa14_modules_{cluster_method}.tsv"
)

In [10]:
pao1_membership = pd.read_csv(pao1_membership_filename, sep="\t", index_col=0, header=0)
pa14_membership = pd.read_csv(pa14_membership_filename, sep="\t", index_col=0, header=0)

In [11]:
print(pao1_membership.shape)
pao1_membership.head()

(5361, 1)


Unnamed: 0,module id
PA0052,-1
PA3378,0
PA0990,0
PA1590,0
PA5236,-1


In [13]:
print(pa14_membership.shape)
pa14_membership.head()

(5357, 1)


Unnamed: 0,module id
PA14_04860,0
PA14_72210,0
PA14_65050,0
PA14_42410,0
PA14_07680,0


## Get gene id mapping

Get mapping from PAO1 gene ids to PA14 gene ids and vice versa in order to map core modules from PAO1 to PA14 and vice versa

In [4]:
pao1_annot_filename = paths.GENE_PAO1_ANNOT
pa14_annot_filename = paths.GENE_PA14_ANNOT

In [5]:
# Map PA14 gene ids to PAO1 gene ids
pao1_to_pa14_map = utils.get_pao1_pa14_gene_map(pao1_annot_filename, "PAO1")
pa14_to_pao1_map = utils.get_pao1_pa14_gene_map(pao1_annot_filename, "PA14")

In [32]:
pao1_to_pa14_map_dict = pao1_to_pa14_map["PA14_ID"].to_dict()
pa14_to_pao1_map_dict = pa14_to_pao1_map["PAO1_ID"].to_dict()

In [36]:
# Test
# pa14_to_pao1_map.loc["PA14_72210"]
pao1_to_pa14_map.loc["PA0052"]

Name                                 NaN
Product.Name        hypothetical protein
GeneID.(PAO1)                     877739
PA14_ID                       PA14_00650
annotation                          core
num_mapped_genes                       1
Name: PA0052, dtype: object

In [33]:
# Assign genes in PAO1 modules to PA14 gene ids
pao1_membership["index mapped"] = (
    pao1_membership.reset_index()["index"].map(pao1_to_pa14_map_dict).values
)

In [34]:
# Assign genes in PA14 modules to PAO1 gene ids
pa14_membership["index mapped"] = (
    pa14_membership.reset_index()["index"].map(pa14_to_pao1_map_dict).values
)

In [31]:
print(pa14_membership.shape)
pa14_membership.head()

(5357, 2)


Unnamed: 0,module id,index mapped
PA14_04860,0,PA0370
PA14_72210,0,PA5471
PA14_65050,0,PA4926
PA14_42410,0,PA1711
PA14_07680,0,PA0588


In [35]:
print(pao1_membership.shape)
pao1_membership.head()

(5361, 2)


Unnamed: 0,module id,index mapped
PA0052,-1,PA14_00650
PA3378,0,PA14_20380
PA0990,0,PA14_51490
PA1590,0,PA14_43920
PA5236,-1,PA14_69140


## Find modules that best match

In [45]:
# PAO1 --> PA14
num_pao1_genes = len(pao1_core)

# For each module in PAO1
for pao1_group_name, pao1_df_group in pao1_membership.groupby("module id"):
    print(pao1_group_name)
    print(pao1_df_group)

    # Find the PA14 module with the best overlap
    for pa14_group_name, pa14_df_group in pa14_membership.groupby("module id"):
        print(pa14_group_name)
        print(pa14_df_group)

        # TO DO
        # Update based this analysis
        # Save p-values and best group_name
        num_generic_Crow_genes = shared_ranking.query(f"{ref_rank_col}>=80.0").shape[0]
        num_generic_SOPHIE_genes = shared_ranking[
            shared_ranking["Percentile (simulated)"] >= percentile_threshold
        ].shape[0]
        num_concordant_generic_genes = shared_ranking[
            (shared_ranking[ref_rank_col] >= percentile_threshold)
            & (shared_ranking["Percentile (simulated)"] >= percentile_threshold)
        ].shape[0]

        print(num_Crow_genes)
        print(num_generic_Crow_genes)
        print(num_generic_SOPHIE_genes)
        print(num_concordant_generic_genes)

        p = ss.hypergeom.sf(
            num_concordant_generic_genes,
            num_pao1_genes,
            num_generic_Crow_genes,
            num_generic_SOPHIE_genes,
        )
        print(p)
        break
    break
    # Store p-values

-1
        module id index mapped
PA0052         -1   PA14_00650
PA5236         -1   PA14_69140
PA0789         -1   PA14_54040
PA4119         -1   PA14_10670
PA3682         -1   PA14_16740
...           ...          ...
PA5350         -1   PA14_70630
PA3749         -1   PA14_15920
PA4617         -1   PA14_61090
PA1012         -1   PA14_51250
PA3312         -1   PA14_21180

[2041 rows x 2 columns]
-1
            module id index mapped
PA14_12620         -1       PA3962
PA14_27220         -1       PA2850
PA14_48800         -1       PA1198
PA14_26770         -1       PA2884
PA14_16700         -1       PA3686
...               ...          ...
PA14_41350         -1       PA1796
PA14_67050         -1       PA5076
PA14_02730         -1       PA0223
PA14_25530         -1       PA2979
PA14_12780         -1       PA3948

[2008 rows x 2 columns]


In [None]:
# TO DO
# Plot pvalue dist