In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the organism list from the spreadsheet; the first spreadsheet column is
# used as the DataFrame index. The frame is left as the cell's last expression
# so the notebook renders it.
bacteria_codes_1 = pd.read_excel(
    io="bacteriai_list_402.xlsx",
    index_col=0,
)
bacteria_codes_1

Unnamed: 0,bacteria,strain
0,Acidovorax avenae subsp. avenae,
1,Acinetobacter baumannii,
2,Acinetobacter baumannii,AIIMS 7
3,Acinetobacter baumannii,ATCC -BAA 1605
4,Acinetobacter baumannii,ATCC 19606
...,...,...
398,Verticillium dahlia,MW830379
399,Vibrio cholerae,
400,Vibrio parahaemolyticus,
401,Vibrio vulnificus,MTCC 1146


In [None]:
import pandas as pd
import requests
from collections import defaultdict
from tqdm import tqdm
import difflib

# Build one "<species> <strain>" label per row of the uploaded bacteria list.
# Empty spreadsheet cells arrive as NaN; they are replaced with "" first so a
# row without a strain becomes just the species name instead of "<species> nan"
# (which skews the fuzzy match below and leaks into the matrix row labels).
species_names = bacteria_codes_1["bacteria"].fillna("").astype(str).str.strip()
strain_names = bacteria_codes_1["strain"].fillna("").astype(str).str.strip()
bacteria_strains = (species_names + " " + strain_names).str.strip().tolist()

# Step 1: Get all KEGG organism names and codes.
# A timeout keeps the notebook from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as organism data.
response = requests.get("https://rest.kegg.jp/list/organism", timeout=60)
response.raise_for_status()
organism_data = [
    line.split("\t") for line in response.text.splitlines() if line.strip()
]
# Each line is tab-separated; field 1 is used as the organism code and field 2
# as the organism name. Lines with fewer than three fields are skipped rather
# than raising IndexError. If two organisms share a name, the later line wins.
name_to_code = {org[2]: org[1] for org in organism_data if len(org) >= 3}
all_kegg_names = list(name_to_code.keys())
if not all_kegg_names:
    raise RuntimeError("KEGG organism list is empty or could not be parsed")

# Step 2: Map bacteria + strain to KEGG organism codes with fuzzy matching.
# Labels are de-duplicated and sorted so the mapping (and therefore the row
# order of every matrix built from it) is identical on every run.
# NOTE(review): a 0.7 similarity cutoff accepts the closest KEGG name even when
# the strain differs, so a label may be mapped to a related strain's genome --
# confirm this is acceptable for the analysis.
mapped_bacteria = {}
for strain in sorted(set(bacteria_strains)):
    if not strain:
        # Row with neither species nor strain: nothing to match.
        continue
    match = difflib.get_close_matches(strain, all_kegg_names, n=1, cutoff=0.7)
    if match:
        mapped_bacteria[strain] = name_to_code[match[0]]

def _fetch_kegg_links(target_db, organisms, desc, normalize=None, timeout=60):
    """Collect the KEGG entries linked to each organism's genes.

    Calls ``https://rest.kegg.jp/link/<target_db>/<code>`` once per organism.

    Parameters
    ----------
    target_db : str
        KEGG database to link to (``"ko"`` or ``"pathway"`` in this notebook).
    organisms : dict
        Maps a row label to a KEGG organism code.
    desc : str
        Progress-bar caption.
    normalize : callable, optional
        Applied to every entry identifier before it is stored.
    timeout : float
        Per-request timeout in seconds.

    Returns
    -------
    (presence, failed)
        ``presence`` is a ``defaultdict(set)`` mapping label -> identifiers.
        ``failed`` lists the labels whose request raised or returned a
        non-200 status; those labels have no entry in ``presence``.
    """
    presence = defaultdict(set)
    failed = []
    for name, code in tqdm(organisms.items(), desc=desc):
        url = f"https://rest.kegg.jp/link/{target_db}/{code}"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Best effort: one unreachable organism must not abort a run that
            # takes several minutes; it is reported to the caller instead.
            failed.append(name)
            continue
        if response.status_code != 200:
            failed.append(name)
            continue
        for line in response.text.splitlines():
            fields = line.split("\t")
            # An empty body or a malformed line does not have exactly two
            # tab-separated fields; skip it instead of failing on unpacking.
            if len(fields) != 2 or ":" not in fields[1]:
                continue
            entry = fields[1].split(":", 1)[1]
            presence[name].add(normalize(entry) if normalize else entry)
    return presence, failed


def _reference_pathway_id(pathway_id):
    """Turn an organism-prefixed pathway id into its reference id.

    ``"eco00010"`` -> ``"map00010"``. Identifiers that do not end in five
    digits are returned unchanged.
    """
    number = pathway_id[-5:]
    if len(pathway_id) > 5 and number.isdigit():
        return f"map{number}"
    return pathway_id


def _presence_matrix(presence, row_labels):
    """Build a 0/1 DataFrame with one row per label and one column per identifier.

    Columns are sorted so the layout of the saved CSV is reproducible; labels
    absent from ``presence`` keep an all-zero row.
    """
    columns = sorted(set().union(*presence.values()))
    matrix = pd.DataFrame(0, index=list(row_labels), columns=columns)
    for name, entries in presence.items():
        matrix.loc[name, list(entries)] = 1
    return matrix


# KEGG KO Mapping
ko_presence, ko_failed = _fetch_kegg_links("ko", mapped_bacteria, "Fetching KO data")
all_kos = set().union(*ko_presence.values())
if ko_failed:
    # These rows are all-zero because the fetch failed, not because KOs are absent.
    print(f"KO fetch failed for {len(ko_failed)} organism(s): {ko_failed}")

ko_matrix = _presence_matrix(ko_presence, mapped_bacteria.keys())
ko_matrix_path = "kegg_ko_matrix.csv"
ko_matrix.to_csv(ko_matrix_path)

# KEGG Pathway Mapping
# The link endpoint returns pathway ids prefixed with the organism code, so the
# same pathway would get a different column for every organism and the matrix
# could not be compared across rows. Ids are normalised to "map" + five digits.
pathway_presence, pathway_failed = _fetch_kegg_links(
    "pathway",
    mapped_bacteria,
    "Fetching Pathway data",
    normalize=_reference_pathway_id,
)
all_pathways = set().union(*pathway_presence.values())
if pathway_failed:
    # These rows are all-zero because the fetch failed, not because pathways are absent.
    print(f"Pathway fetch failed for {len(pathway_failed)} organism(s): {pathway_failed}")

pathway_matrix = _presence_matrix(pathway_presence, mapped_bacteria.keys())
pathway_matrix_path = "kegg_pathway_matrix.csv"
pathway_matrix.to_csv(pathway_matrix_path)




Fetching KO data: 100%|██████████| 369/369 [04:20<00:00,  1.42it/s]
Fetching Pathway data: 100%|██████████| 369/369 [05:11<00:00,  1.19it/s]


In [None]:
# Render the KO presence/absence matrix: one row per matched organism label,
# one 0/1 column per KO identifier.
ko_matrix

Unnamed: 0,K13212,K25031,K02495,K11773,K11557,K12479,K22710,K14429,K11443,K03131,...,K22510,K07097,K11038,K08741,K14753,K15122,K20297,K12232,K06653,K07184
Staphylococcus epidermidis ATCC 36983,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Staphylococcus aureus ATCC 43300,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Enterobacter cloacae nan,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Staphylococcus epidermidis ATCC 12228,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Klebsiella pneumoniae NCTC 9633,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Salmonella enterica subsp. enterica serovar Typhimurium PT 135,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
Aspergillus flavus MT550030,0,0,0,0,1,1,0,1,0,1,...,0,0,0,0,1,0,0,0,1,0
Klebsiella pneumoniae ATCC 13883,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Listeria monocytogenes ATCC 51772,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
