In [1]:
from Bio.KEGG import REST
import pandas as pd

In [2]:
# Download the KEGG pathway listing for each strain as one raw text blob
# (one "<pathway id>\t<description>" row per line).
# "pae" is the KEGG organism code for Pseudomonas aeruginosa PAO1;
# "pau" is presumably the code for strain PA14 (judging by the variable name) — confirm.
pao1_pathways = REST.kegg_list("pathway", "pae").read()
pa14_pathways = REST.kegg_list("pathway", "pau").read()

In [3]:
# Example: preview the first five rows of the raw PAO1 listing.
# Each row has the form "<pathway id>\t<pathway name> - <organism name>".
pao1_pathways.split("\n")[0:5]

['path:pae00010\tGlycolysis / Gluconeogenesis - Pseudomonas aeruginosa PAO1',
 'path:pae00020\tCitrate cycle (TCA cycle) - Pseudomonas aeruginosa PAO1',
 'path:pae00030\tPentose phosphate pathway - Pseudomonas aeruginosa PAO1',
 'path:pae00040\tPentose and glucuronate interconversions - Pseudomonas aeruginosa PAO1',
 'path:pae00051\tFructose and mannose metabolism - Pseudomonas aeruginosa PAO1']

### Clean up pathway names

In [4]:
# Keep only the KEGG pathway id (the field before the tab) from every listing row.
# Unpacking into exactly two names still fails loudly on a malformed row.
pao1_pathways_clean = [
    pathway_id
    for pathway_id, _pathway_title in (
        row.split("\t") for row in pao1_pathways.rstrip().split("\n")
    )
]

print(len(pao1_pathways_clean))
pao1_pathways_clean

123


['path:pae00010',
 'path:pae00020',
 'path:pae00030',
 'path:pae00040',
 'path:pae00051',
 'path:pae00052',
 'path:pae00053',
 'path:pae00061',
 'path:pae00071',
 'path:pae00130',
 'path:pae00190',
 'path:pae00220',
 'path:pae00230',
 'path:pae00240',
 'path:pae00250',
 'path:pae00260',
 'path:pae00261',
 'path:pae00270',
 'path:pae00280',
 'path:pae00281',
 'path:pae00290',
 'path:pae00300',
 'path:pae00310',
 'path:pae00330',
 'path:pae00332',
 'path:pae00340',
 'path:pae00350',
 'path:pae00360',
 'path:pae00361',
 'path:pae00362',
 'path:pae00364',
 'path:pae00380',
 'path:pae00400',
 'path:pae00401',
 'path:pae00405',
 'path:pae00410',
 'path:pae00430',
 'path:pae00440',
 'path:pae00450',
 'path:pae00460',
 'path:pae00470',
 'path:pae00480',
 'path:pae00500',
 'path:pae00520',
 'path:pae00521',
 'path:pae00523',
 'path:pae00525',
 'path:pae00540',
 'path:pae00541',
 'path:pae00550',
 'path:pae00561',
 'path:pae00562',
 'path:pae00564',
 'path:pae00565',
 'path:pae00590',
 'path:pae

In [5]:
# Keep only the KEGG pathway id (the field before the tab) from every listing row.
# Unpacking into exactly two names still fails loudly on a malformed row.
pa14_pathways_clean = [
    pathway_id
    for pathway_id, _pathway_title in (
        row.split("\t") for row in pa14_pathways.rstrip().split("\n")
    )
]

print(len(pa14_pathways_clean))
pa14_pathways_clean

123


['path:pau00010',
 'path:pau00020',
 'path:pau00030',
 'path:pau00040',
 'path:pau00051',
 'path:pau00052',
 'path:pau00053',
 'path:pau00061',
 'path:pau00071',
 'path:pau00130',
 'path:pau00190',
 'path:pau00220',
 'path:pau00230',
 'path:pau00240',
 'path:pau00250',
 'path:pau00260',
 'path:pau00261',
 'path:pau00270',
 'path:pau00280',
 'path:pau00281',
 'path:pau00290',
 'path:pau00300',
 'path:pau00310',
 'path:pau00330',
 'path:pau00332',
 'path:pau00340',
 'path:pau00350',
 'path:pau00360',
 'path:pau00361',
 'path:pau00362',
 'path:pau00364',
 'path:pau00380',
 'path:pau00400',
 'path:pau00401',
 'path:pau00405',
 'path:pau00410',
 'path:pau00430',
 'path:pau00440',
 'path:pau00450',
 'path:pau00460',
 'path:pau00470',
 'path:pau00480',
 'path:pau00500',
 'path:pau00520',
 'path:pau00521',
 'path:pau00523',
 'path:pau00525',
 'path:pau00540',
 'path:pau00541',
 'path:pau00550',
 'path:pau00561',
 'path:pau00562',
 'path:pau00564',
 'path:pau00565',
 'path:pau00590',
 'path:pau

In [6]:
# List of all pae pathways in KEGG.
# As a sanity check, the page should list 123 pathways, matching the count above.
# https://www.kegg.jp/kegg-bin/search_pathway_text?map=pae&keyword=&mode=1&viewImage=true

In [7]:
# Example of a KEGG pathway page that is being parsed
# https://www.genome.jp/dbget-bin/www_bget?pathway:pae00020
# For a given pathway there are multiple modules and multiple gene ids

### Get associated genes

In [8]:
# Get the genes for pathways and add them to a list


def get_geneset(pathway_id_list):
    """Download KEGG pathway records and extract each pathway's name and genes.

    Every id in ``pathway_id_list`` is fetched with ``REST.kegg_get`` (one
    request per pathway) and the returned flat-file text is scanned line by
    line. The first 12 columns of a line hold the section label (blank on
    continuation lines); the remainder holds the value.

    * ``NAME`` lines: the pathway name is the text before the first " - "
      separator (the organism name follows it). A name with no separator is
      kept whole.
    * ``GENE`` lines: the gene id is the first whitespace-delimited token of
      the value, whatever symbol/description text (and however many
      semicolons) follow it.

    Parameters
    ----------
    pathway_id_list : iterable of str
        KEGG pathway ids, e.g. ``"path:pae00010"``.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``pathway_id_to_genes`` maps pathway id -> list of unique gene ids in
        order of first appearance (empty list when the record has no GENE
        section); ``pathway_id_to_pathway_name`` maps pathway id -> pathway
        name (only for records that contain a NAME line);
        ``pathway_id_to_gene_len`` maps pathway id -> number of unique genes.
    """
    pathway_id_to_genes = {}
    pathway_id_to_pathway_name = {}
    pathway_id_to_gene_len = {}

    for pathway in pathway_id_list:
        pathway_file = REST.kegg_get(pathway).read()  # query and read each pathway

        # Ordered list of gene ids plus a set for constant-time duplicate checks
        pathway_gene_ids = []
        seen_gene_ids = set()

        # Iterate through the KEGG pathway file, keeping track of which section
        # of the file we're in; continuation lines have a blank label and so
        # inherit the most recent section.
        current_section = None
        for line in pathway_file.rstrip().split("\n"):
            section = line[:12].strip()  # section names are within 12 columns
            if section:
                current_section = section

            value = line[12:]

            if current_section == "NAME":
                # "<pathway name> - <organism>"; the pathway name is always the
                # first segment, regardless of how many " - " separators follow.
                if value.strip():
                    pathway_id_to_pathway_name[pathway] = value.split(" - ")[0]

            elif current_section == "GENE":
                # "<gene id>  [<symbol>; ]<description>": only the id is needed.
                tokens = value.split()
                if not tokens:
                    continue  # blank line inside the GENE section
                gene_id = tokens[0]

                if gene_id not in seen_gene_ids:
                    seen_gene_ids.add(gene_id)
                    pathway_gene_ids.append(gene_id)

        pathway_id_to_genes[pathway] = pathway_gene_ids
        pathway_id_to_gene_len[pathway] = len(pathway_gene_ids)

    return (pathway_id_to_genes, pathway_id_to_pathway_name, pathway_id_to_gene_len)

In [9]:
# Check that there are some pathways with no genes mapped, e.g. after running
# get_geneset in the cells below:
# pao1_genesets_dict["path:pae01110"]

In [10]:
# Fetch and parse every PAO1 pathway record (one KEGG request per pathway id).
# Returns, keyed by pathway id: the gene-id lists, the pathway names and the gene counts.
pao1_genesets_dict, pao1_pathway_names_dict, pao1_geneset_len_dict = get_geneset(
    pao1_pathways_clean
)

In [11]:
# Fetch and parse every PA14 pathway record (one KEGG request per pathway id).
# Returns, keyed by pathway id: the gene-id lists, the pathway names and the gene counts.
pa14_genesets_dict, pa14_pathway_names_dict, pa14_geneset_len_dict = get_geneset(
    pa14_pathways_clean
)

### Format

Make a GMT-style pathway file whose index is the pathway id and name, with a column containing the list of associated gene ids.

In [12]:
# Sanity check: every pathway that got a gene list also got a name.
# get_geneset always stores a gene list per pathway but only stores a name when
# the record has a NAME line, so differing sizes mean a name was not parsed.
assert len(pao1_pathway_names_dict) == len(pao1_genesets_dict)
assert len(pa14_pathway_names_dict) == len(pa14_genesets_dict)

In [13]:
# Turn each {pathway id: pathway name} dict into a one-column table
# indexed by pathway id.
name_columns = ["pathway_id", "pathway_name"]

pao1_pathway_name_df = pd.DataFrame(
    data=list(pao1_pathway_names_dict.items()), columns=name_columns
).set_index(keys="pathway_id")

pa14_pathway_name_df = pd.DataFrame(
    data=list(pa14_pathway_names_dict.items()), columns=name_columns
).set_index(keys="pathway_id")

pao1_pathway_name_df.head()

Unnamed: 0_level_0,pathway_name
pathway_id,Unnamed: 1_level_1
path:pae00010,Glycolysis / Gluconeogenesis
path:pae00020,Citrate cycle (TCA cycle)
path:pae00030,Pentose phosphate pathway
path:pae00040,Pentose and glucuronate interconversions
path:pae00051,Fructose and mannose metabolism


In [14]:
# Preview the PA14 pathway-name table for comparison with the PAO1 one.
pa14_pathway_name_df.head()

Unnamed: 0_level_0,pathway_name
pathway_id,Unnamed: 1_level_1
path:pau00010,Glycolysis / Gluconeogenesis
path:pau00020,Citrate cycle (TCA cycle)
path:pau00030,Pentose phosphate pathway
path:pau00040,Pentose and glucuronate interconversions
path:pau00051,Fructose and mannose metabolism


In [15]:
# Turn each {pathway id: [gene ids]} dict into a one-column table indexed by
# pathway id; every cell of "gene_ids" holds a Python list.
gene_columns = ["pathway_id", "gene_ids"]

pao1_pathway_gene_df = pd.DataFrame(
    data=list(pao1_genesets_dict.items()), columns=gene_columns
).set_index(keys="pathway_id")

pa14_pathway_gene_df = pd.DataFrame(
    data=list(pa14_genesets_dict.items()), columns=gene_columns
).set_index(keys="pathway_id")

pao1_pathway_gene_df.head()

Unnamed: 0_level_0,gene_ids
pathway_id,Unnamed: 1_level_1
path:pae00010,"[PA3193, PA4732, PA5110, PA0555, PA4748, PA319..."
path:pae00020,"[PA1580, PA1562, PA1787, PA2623, PA2624, PA158..."
path:pae00030,"[PA4732, PA5439, PA3183, PA3182, PA4204, PA060..."
path:pae00040,"[PA2022, PA3559, PA2023, PA0607, PA2343, PA125..."
path:pae00051,"[PA2344, PA3551, PA2232, PA5452, PA5322, PA545..."


In [16]:
# Preview the PA14 gene-list table; its gene ids use the PA14_ locus-tag prefix.
pa14_pathway_gene_df.head()

Unnamed: 0_level_0,gene_ids
pathway_id,Unnamed: 1_level_1
path:pau00010,"[PA14_22930, PA14_62620, PA14_67490, PA14_0723..."
path:pau00020,"[PA14_44070, PA14_44290, PA14_41470, PA14_3018..."
path:pau00030,"[PA14_62620, PA14_71800, PA14_23070, PA14_2308..."
path:pau00040,"[PA14_38360, PA14_18300, PA14_38350, PA14_0791..."
path:pau00051,"[PA14_34340, PA14_18380, PA14_71970, PA14_7027..."


In [17]:
# Turn each {pathway id: gene count} dict into a one-column table
# indexed by pathway id.
gene_len_columns = ["pathway_id", "num_genes"]

pao1_pathway_gene_len_df = pd.DataFrame(
    data=list(pao1_geneset_len_dict.items()), columns=gene_len_columns
).set_index(keys="pathway_id")

pa14_pathway_gene_len_df = pd.DataFrame(
    data=list(pa14_geneset_len_dict.items()), columns=gene_len_columns
).set_index(keys="pathway_id")

pao1_pathway_gene_len_df.head()

Unnamed: 0_level_0,num_genes
pathway_id,Unnamed: 1_level_1
path:pae00010,37
path:pae00020,28
path:pae00030,28
path:pae00040,8
path:pae00051,19


In [18]:
# Combine the name, gene-count and gene-list tables into a single annotation
# table per strain by joining on their shared pathway_id index.
index_join = {"left_index": True, "right_index": True}

pao1_tmp = pd.merge(pao1_pathway_name_df, pao1_pathway_gene_len_df, **index_join)
pao1_pathway_annot_df = pd.merge(pao1_tmp, pao1_pathway_gene_df, **index_join)

pa14_tmp = pd.merge(pa14_pathway_name_df, pa14_pathway_gene_len_df, **index_join)
pa14_pathway_annot_df = pd.merge(pa14_tmp, pa14_pathway_gene_df, **index_join)

print(pao1_pathway_annot_df.shape)
pao1_pathway_annot_df.head()

(123, 3)


Unnamed: 0_level_0,pathway_name,num_genes,gene_ids
pathway_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
path:pae00010,Glycolysis / Gluconeogenesis,37,"[PA3193, PA4732, PA5110, PA0555, PA4748, PA319..."
path:pae00020,Citrate cycle (TCA cycle),28,"[PA1580, PA1562, PA1787, PA2623, PA2624, PA158..."
path:pae00030,Pentose phosphate pathway,28,"[PA4732, PA5439, PA3183, PA3182, PA4204, PA060..."
path:pae00040,Pentose and glucuronate interconversions,8,"[PA2022, PA3559, PA2023, PA0607, PA2343, PA125..."
path:pae00051,Fructose and mannose metabolism,19,"[PA2344, PA3551, PA2232, PA5452, PA5322, PA545..."


In [19]:
# Manually check that KEGG downloaded data matches what is on the KEGG website
# and that this new data is updated from what is in https://raw.githubusercontent.com/greenelab/adage/7a4eda39d360b224268921dc1f2c14b32788ab16/Node_interpretation/pseudomonas_KEGG_terms.txt
# Verified pae00020 has 28 genes in our downloaded and online versions, while ADAGE has 34 genes
# https://www.genome.jp/dbget-bin/www_bget?pathway:pae00020
# Verified that pae00071 has 34 genes in our downloaded and online versions, while ADAGE has 32 genes
# https://www.genome.jp/entry/pathway+pae00071
# pao1_pathway_annot_df.loc["path:pae00071"]
# Verified pae00072 is not found in downloaded and online versions

In [20]:
# Show the size and first rows of the merged PA14 annotation table.
print(pa14_pathway_annot_df.shape)
pa14_pathway_annot_df.head()

(123, 3)


Unnamed: 0_level_0,pathway_name,num_genes,gene_ids
pathway_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
path:pau00010,Glycolysis / Gluconeogenesis,37,"[PA14_22930, PA14_62620, PA14_67490, PA14_0723..."
path:pau00020,Citrate cycle (TCA cycle),28,"[PA14_44070, PA14_44290, PA14_41470, PA14_3018..."
path:pau00030,Pentose phosphate pathway,28,"[PA14_62620, PA14_71800, PA14_23070, PA14_2308..."
path:pau00040,Pentose and glucuronate interconversions,8,"[PA14_38360, PA14_18300, PA14_38350, PA14_0791..."
path:pau00051,Fructose and mannose metabolism,18,"[PA14_34340, PA14_18380, PA14_71970, PA14_7027..."


In [21]:
# Add a combined "<pathway id> : <pathway name>" label column to both
# annotation tables (each table is modified in place).
for annot_df in (pao1_pathway_annot_df, pa14_pathway_annot_df):
    annot_df["pathway_id_name"] = annot_df.index + " : " + annot_df["pathway_name"]

In [22]:
# Set index
# Replace the pathway-id index with the combined "<pathway id> : <pathway name>" label.
# NOTE: this rebinds the same variable names and consumes the "pathway_id_name"
# column, so re-running this cell without first re-running the previous one
# raises a KeyError.
pao1_pathway_annot_df = pao1_pathway_annot_df.set_index("pathway_id_name")
pa14_pathway_annot_df = pa14_pathway_annot_df.set_index("pathway_id_name")

In [23]:
# Preview the final PAO1 annotation table, now indexed by "<pathway id> : <pathway name>".
pao1_pathway_annot_df.head()

Unnamed: 0_level_0,pathway_name,num_genes,gene_ids
pathway_id_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
path:pae00010 : Glycolysis / Gluconeogenesis,Glycolysis / Gluconeogenesis,37,"[PA3193, PA4732, PA5110, PA0555, PA4748, PA319..."
path:pae00020 : Citrate cycle (TCA cycle),Citrate cycle (TCA cycle),28,"[PA1580, PA1562, PA1787, PA2623, PA2624, PA158..."
path:pae00030 : Pentose phosphate pathway,Pentose phosphate pathway,28,"[PA4732, PA5439, PA3183, PA3182, PA4204, PA060..."
path:pae00040 : Pentose and glucuronate interconversions,Pentose and glucuronate interconversions,8,"[PA2022, PA3559, PA2023, PA0607, PA2343, PA125..."
path:pae00051 : Fructose and mannose metabolism,Fructose and mannose metabolism,19,"[PA2344, PA3551, PA2232, PA5452, PA5322, PA545..."


In [24]:
# Preview the final PA14 annotation table, now indexed by "<pathway id> : <pathway name>".
pa14_pathway_annot_df.head()

Unnamed: 0_level_0,pathway_name,num_genes,gene_ids
pathway_id_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
path:pau00010 : Glycolysis / Gluconeogenesis,Glycolysis / Gluconeogenesis,37,"[PA14_22930, PA14_62620, PA14_67490, PA14_0723..."
path:pau00020 : Citrate cycle (TCA cycle),Citrate cycle (TCA cycle),28,"[PA14_44070, PA14_44290, PA14_41470, PA14_3018..."
path:pau00030 : Pentose phosphate pathway,Pentose phosphate pathway,28,"[PA14_62620, PA14_71800, PA14_23070, PA14_2308..."
path:pau00040 : Pentose and glucuronate interconversions,Pentose and glucuronate interconversions,8,"[PA14_38360, PA14_18300, PA14_38350, PA14_0791..."
path:pau00051 : Fructose and mannose metabolism,Fructose and mannose metabolism,18,"[PA14_34340, PA14_18380, PA14_71970, PA14_7027..."


In [25]:
# Save both annotation tables as tab-separated files in the current working directory.
# NOTE: the gene_ids column holds Python lists, so each cell is written as the
# list's string representation (e.g. "['PA3193', 'PA4732', ...]") and has to be
# parsed back into a list when the file is loaded.
pao1_pathway_annot_df.to_csv("pao1_kegg_annot.tsv", sep="\t")
pa14_pathway_annot_df.to_csv("pa14_kegg_annot.tsv", sep="\t")