KEGG pathways overlap

In [7]:
#Part 1: Map genes to pathways by merging the GENE_ID/PATHWAY_ID table with the pathway names on PATHWAY_ID.

In [5]:
import pandas as pd

# Input file names (both are read below as tab-separated text with header=None).
gene_file = "gene.txt"    # loaded into columns GENE_ID, PATHWAY_ID
human_file = "human.txt"  # loaded into columns PATHWAY_ID, PATHWAY_NAME

# Load both tables; column names are supplied here because header=None tells
# pandas not to take them from the first line of the file.
gene_data = pd.read_csv(gene_file, sep="\t", header=None, names=["GENE_ID", "PATHWAY_ID"])
human_data = pd.read_csv(human_file, sep="\t", header=None, names=["PATHWAY_ID", "PATHWAY_NAME"])

# Preview each table on its own block of lines. With print's default
# single-space separator the header of the second frame is glued onto the
# last row of the first frame, which makes the preview hard to read.
print(gene_data.head(), human_data.head(), sep="\n\n")


     GENE_ID     PATHWAY_ID
0  hsa:10327  path:hsa00010
1    hsa:124  path:hsa00010
2    hsa:125  path:hsa00010
3    hsa:126  path:hsa00010
4    hsa:127  path:hsa00010   PATHWAY_ID                                       PATHWAY_NAME
0   hsa01100          Metabolic pathways - Homo sapiens (human)
1   hsa01200           Carbon metabolism - Homo sapiens (human)
2   hsa01210  2-Oxocarboxylic acid metabolism - Homo sapiens...
3   hsa01212       Fatty acid metabolism - Homo sapiens (human)
4   hsa01230  Biosynthesis of amino acids - Homo sapiens (hu...


In [7]:
# Remove the literal 'path:' marker from gene_data so that its PATHWAY_ID
# values have the same form as those in human_data
# (e.g. 'path:hsa00010' -> 'hsa00010').
# regex=False requests a plain substring replacement explicitly instead of
# relying on the default of `regex`, which is not the same in every pandas
# release.
gene_data["PATHWAY_ID"] = gene_data["PATHWAY_ID"].str.replace("path:", "", regex=False)

# Inner merge on PATHWAY_ID: only gene rows whose pathway also has an entry in
# human_data are kept, and each kept row gains the PATHWAY_NAME column.
merged_data = pd.merge(gene_data, human_data, on="PATHWAY_ID", how="inner")
print(merged_data.head())

# Report the size of the merged table.
rows, columns = merged_data.shape

print("Number of rows:", rows)
print("Number of columns:", columns)

     GENE_ID PATHWAY_ID                                       PATHWAY_NAME
0  hsa:10327   hsa00010  Glycolysis / Gluconeogenesis - Homo sapiens (h...
1    hsa:124   hsa00010  Glycolysis / Gluconeogenesis - Homo sapiens (h...
2    hsa:125   hsa00010  Glycolysis / Gluconeogenesis - Homo sapiens (h...
3    hsa:126   hsa00010  Glycolysis / Gluconeogenesis - Homo sapiens (h...
4    hsa:127   hsa00010  Glycolysis / Gluconeogenesis - Homo sapiens (h...
Number of rows: 37470
Number of columns: 3


In [54]:
#Part 2/3: Compute the number of overlapping genes between every 2 pathways & save as KEGG_crosstalk.csv
from itertools import combinations

# Column layout of the crosstalk table. It is passed to the DataFrame
# constructor so the frame has these columns even when no pathway pair shares
# a gene; without it an empty result has no columns and the sort below raises
# KeyError.
CROSSTALK_COLUMNS = [
    "PATHWAY_ID1",
    "PATHWAY_NAME1",
    "PATHWAY_ID2",
    "PATHWAY_NAME2",
    "NUMBER_OF_OVERLAPPING_GENES",
    "LIST_OF_OVERLAPPING_GENES",
]

# PATHWAY_ID -> set of GENE_IDs listed for that pathway in merged_data
pathway_to_genes = (
    merged_data.groupby("PATHWAY_ID")["GENE_ID"]
    .apply(set)
    .to_dict()
)

# PATHWAY_ID -> PATHWAY_NAME (first name seen for each ID)
pathway_to_name = (
    merged_data.groupby("PATHWAY_ID")["PATHWAY_NAME"]
    .first()
    .to_dict()
)

results = []

# Compare every unordered pair of pathways exactly once.
for (path1, genes1), (path2, genes2) in combinations(pathway_to_genes.items(), 2):
    overlap = genes1.intersection(genes2)
    if overlap:  # Only include pairs with overlaps
        results.append({
            "PATHWAY_ID1": path1,
            "PATHWAY_NAME1": pathway_to_name[path1],
            "PATHWAY_ID2": path2,
            "PATHWAY_NAME2": pathway_to_name[path2],
            "NUMBER_OF_OVERLAPPING_GENES": len(overlap),
            # Sorted so the gene list is written in a reproducible order.
            "LIST_OF_OVERLAPPING_GENES": ";".join(sorted(overlap))
        })

# Turn the records into a DataFrame with a fixed column layout.
results_df = pd.DataFrame(results, columns=CROSSTALK_COLUMNS)

# Largest overlaps first.
results_df = results_df.sort_values(by="NUMBER_OF_OVERLAPPING_GENES", ascending=False)

# Save CSV file (the row index is not written).
results_df.to_csv("KEGG_crosstalk.csv", index=False)

print(results_df.head())

      PATHWAY_ID1                                      PATHWAY_NAME1  \
19381    hsa05010           Alzheimer disease - Homo sapiens (human)   
19522    hsa05014  Amyotrophic lateral sclerosis - Homo sapiens (...   
19597    hsa05016          Huntington disease - Homo sapiens (human)   
19460    hsa05012           Parkinson disease - Homo sapiens (human)   
19519    hsa05014  Amyotrophic lateral sclerosis - Homo sapiens (...   

      PATHWAY_ID2                                      PATHWAY_NAME2  \
19381    hsa05022  Pathways of neurodegeneration - multiple disea...   
19522    hsa05022  Pathways of neurodegeneration - multiple disea...   
19597    hsa05022  Pathways of neurodegeneration - multiple disea...   
19460    hsa05022  Pathways of neurodegeneration - multiple disea...   
19519    hsa05016          Huntington disease - Homo sapiens (human)   

       NUMBER_OF_OVERLAPPING_GENES  \
19381                          339   
19522                          300   
19597               

In [None]:
# Part 4: Rank the genes by the number of pathways they appear in and save the ranking to a file

In [None]:
# Rank genes by how many distinct pathways list them.

# GENE_ID -> set of PATHWAY_IDs the gene is listed under in merged_data
pathways_by_gene = merged_data.groupby("GENE_ID")["PATHWAY_ID"]
gene_to_pathways = pathways_by_gene.apply(set).to_dict()

# One record per gene; the size of its pathway set is its pathway count.
gene_ranks = []
for gene_id, pathway_ids in gene_to_pathways.items():
    gene_ranks.append({"GENE_ID": gene_id, "NUMBER_OF_PATHWAYS": len(pathway_ids)})

# Build the table, then order it so the genes with the most pathways come first.
gene_ranks_unsorted = pd.DataFrame(gene_ranks)
gene_ranks_df = gene_ranks_unsorted.sort_values(by="NUMBER_OF_PATHWAYS", ascending=False)

# Write the ranking to disk without the row index.
gene_ranks_df.to_csv("KEGG_gene_ranks.csv", index=False)

print(gene_ranks_df.head())

In [None]:
# Part 5: Retrieve the set of pathways that each of the top 3 genes appears in

In [39]:
# Create a dictionary mapping PATHWAY_ID to PATHWAY_NAME so that pathway names
# can be looked up from the IDs (one row per PATHWAY_ID is kept first).
pathway_name_dict = (
    merged_data.drop_duplicates(subset="PATHWAY_ID")
    .set_index("PATHWAY_ID")["PATHWAY_NAME"]
    .to_dict()
)

# Top 3 genes by number of pathways, taken from the ranking in gene_ranks_df
# (already sorted in descending order) and indexed by GENE_ID so the loop
# below can iterate over `.index`.
# NOTE(review): no visible cell defines `top_genes`, so this cell raised
# NameError on a fresh kernel; confirm this definition matches the one that
# was used originally.
top_genes = gene_ranks_df.head(3).set_index("GENE_ID")

# Retrieve the pathways these top genes appear in
top_genes_pathways_ids = {}  # GENE_ID -> set of pathway IDs
top_genes_pathways_names = {}  # GENE_ID -> set of pathway names

for gene in top_genes.index:
    # Distinct pathway IDs of the rows that belong to this gene.
    pathways = merged_data.loc[merged_data["GENE_ID"] == gene, "PATHWAY_ID"].unique()
    top_genes_pathways_ids[gene] = set(pathways)

    # Map the pathway IDs to names; IDs missing from the lookup get a placeholder.
    pathway_names = [pathway_name_dict.get(pathway_id, "Unknown pathway") for pathway_id in pathways]
    top_genes_pathways_names[gene] = set(pathway_names)

# One row per gene with its set of pathway IDs
top_genes_pathways_ids_df = pd.DataFrame(list(top_genes_pathways_ids.items()), columns=["GENE_ID", "PATHWAY_IDS"])

# One row per gene with its set of pathway names
top_genes_pathways_names_df = pd.DataFrame(list(top_genes_pathways_names.items()), columns=["GENE_ID", "PATHWAY_NAMES"])




Unnamed: 0,GENE_ID,PATHWAY_IDS
0,hsa:5595,"{hsa05216, hsa05142, hsa05133, hsa04722, hsa05..."
1,hsa:5594,"{hsa05216, hsa05142, hsa05133, hsa04722, hsa05..."
2,hsa:5293,"{hsa04211, hsa05142, hsa04722, hsa05220, hsa05..."
