In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
data_dir = os.path.join("..", "data")
pao1_data = os.path.join(data_dir, "pao1_data")
tcga_data = os.path.join(data_dir, "tcga_data")

In [3]:
pao1_kegg_edges_file = os.path.join(
    pao1_data, "eADAGE_analysis", "permutation_test_n=10000", "filtered_network.tsv")
tcga_pid_edges_file = os.path.join(
    tcga_data, "NMF_analysis", "permutation_test_n=10000", "filtered_network.tsv")

kegg_pathways = os.path.join(pao1_data, "pseudomonas_KEGG_terms.txt")
pid_pathways = os.path.join(tcga_data, "PID_pathway_definitions.txt")

In [4]:
def load_pathway_definitions_file(pathways_file, shorten_pathway_name):
    pathway_definitions_df = pd.read_table(
        pathways_file, header=None, names=["pw", "size", "genes"])
    pathway_definitions_df["genes"] = pathway_definitions_df["genes"].map(
        lambda x: x.split(";"))
    pathway_definitions_df.set_index("pw", inplace=True)

    pathway_definitions_map = {}
    for index, row in pathway_definitions_df.iterrows():
        pathway = shorten_pathway_name(index)
        pathway_definitions_map[pathway] = set(row["genes"])
    return pathway_definitions_map

In [5]:
pao1_kegg_network = pd.read_table(pao1_kegg_edges_file)
pao1_kegg_network.head()

Unnamed: 0,pw0,pw1,weight
0,Ribosome PAO1,"Ribosome, bacteria",26.08964
1,Phosphonate and phosphinate metabolism PAO1,Phosphonate transport system,25.455394
2,"Macrolide resistance, MacAB-TolC transporter",Zinc transport system,23.170151
3,Phosphonate and phosphinate metabolism PAO1,Type II general secretion pathway,21.16831
4,Glycine betaine/proline transport system,"Glycine, serine and threonine metabolism PAO1",19.221697


In [6]:
# shorten the PAO1 KEGG pathway names so that they match the ones
# we use in the publication
def shorten_pao1_kegg(pathway_name):
    REMOVE_SUFFIX = "- Pseudomonas aeruginosa PAO1"
    pathway_short = None
    split_label = pathway_name.split(" ", 1)
    if len(split_label) > 1:
        pathway_short = split_label[1]
    else:
        pathway_short = split_label[0]
    if REMOVE_SUFFIX in pathway_short:
        remove_from_index = pathway_short.index(REMOVE_SUFFIX)
        return "{0}PAO1".format(pathway_short[:remove_from_index])
    return pathway_short.strip()

kegg_pathways_dict = load_pathway_definitions_file(
    kegg_pathways, shorten_pao1_kegg)

In [7]:
pathways_in_kegg_network = set(
    pao1_kegg_network["pw0"].tolist() + pao1_kegg_network["pw1"].tolist())

n_overlapping = 0
edge_jaccards = []
for index, row in pao1_kegg_network.iterrows():
    pw0_genes = kegg_pathways_dict[row["pw0"]] 
    pw1_genes = kegg_pathways_dict[row["pw1"]]
    overlap = pw0_genes & pw1_genes
    if overlap:
        n_overlapping += 1
    jaccard = float(len(overlap)) / (len(pw0_genes) + len(pw1_genes) - len(overlap))
    edge_jaccards.append(jaccard)        
        
print("For the PAO1 KEGG pathway-pathway co-occurrence network:")
print("Number of distinct pathways in the network: {0}".format(
    len(pathways_in_kegg_network)))
print("Number of edges in the network: {0}".format(
    pao1_kegg_network.shape[0]))
print("Number of edges in the network where the 2 pathways "
      "share genes: {0} (average similarity by Jaccard Index: {1:.4})".format(
          n_overlapping, np.average(edge_jaccards)))

For the PAO1 KEGG pathway-pathway co-occurrence network:
Number of distinct pathways in the network: 89
Number of edges in the network: 203
Number of edges in the network where the 2 pathways share genes: 35 (average similarity by Jaccard Index: 0.03482)


In [8]:
tcga_pid_network = pd.read_table(tcga_pid_edges_file)
tcga_pid_network.head()

Unnamed: 0,pw0,pw1,weight,features
0,E2F,PLK1,6.543478,19.0 29.0 39.0 44.0 59.0 114.0 115.0 158.0
1,AURORA B,E2F,6.531936,19.0 29.0 39.0 44.0 59.0 114.0 115.0 158.0
2,AURORA B,PLK1,6.341347,19.0 29.0 39.0 44.0 59.0 114.0 115.0 158.0
3,E2F,FOXM1,5.974458,19.0 29.0 39.0 44.0 59.0 115.0 158.0
4,AURORA B,FOXM1,5.873026,19.0 29.0 39.0 44.0 59.0 115.0 158.0


In [9]:
# shorten the PID pathway names so that they match the ones
# we use in the publication
def shorten_tcga_pid(pathway_name):
    REMOVE_SUFFIX = "PATHWAY"
    pathway_short = None
    split_on_underscores = pathway_name.split("_")
    if split_on_underscores[-1] == REMOVE_SUFFIX:
        pathway_short = " ".join(split_on_underscores[:-1])
    else:
        pathway_short = " ".join(split_on_underscores)
    return pathway_short

pid_pathways_dict = load_pathway_definitions_file(
    pid_pathways, shorten_tcga_pid)

In [10]:
pathways_in_pid_network = set(
    tcga_pid_network["pw0"].tolist() + tcga_pid_network["pw1"].tolist())

n_overlapping = 0
edge_jaccards = []
for index, row in tcga_pid_network.iterrows():
    pw0_genes = pid_pathways_dict[row["pw0"]] 
    pw1_genes = pid_pathways_dict[row["pw1"]]
    overlap = pw0_genes & pw1_genes
    if overlap:
        n_overlapping += 1
    jaccard = float(len(overlap)) / (len(pw0_genes) + len(pw1_genes) - len(overlap))
    edge_jaccards.append(jaccard)

print("For the TCGA PID pathway-pathway co-occurrence network:")
print("Number of distinct pathways in the network: {0}".format(
    len(pathways_in_kegg_network)))
print("Number of edges in the network: {0}".format(
    pao1_kegg_network.shape[0]))
print("Number of edges in the network where the 2 pathways "
      "share genes: {0} (average similarity by Jaccard Index: {1:.4})".format(
          n_overlapping, np.average(edge_jaccards)))

For the TCGA PID pathway-pathway co-occurrence network:
Number of distinct pathways in the network: 89
Number of edges in the network: 203
Number of edges in the network where the 2 pathways share genes: 96 (average similarity by Jaccard Index: 0.05783)
