# PPI Gene Groups Enrichment Analysis

Andrew Chung, hc893, 2/9/2025

## Import Data, Preprocessing

In [1]:
import numpy as np
import pandas as pd
import gseapy as gp
import seaborn as sns
import tqdm
from matplotlib import pyplot as plt

In [2]:
data = pd.read_csv("data.csv").drop(columns = ['MF', 'label'])
data.shape

(110000, 5)

Since the scope of enrichment analysis in this context is not clear cut, I will attempt both aggregate gene set EA and co-enrichment analysis.

In [3]:
# remove duplicate pairs
# There were no duplicate pairs

data['set'] = data.apply(lambda row: np.array([row['gene1'], row['gene2']]), axis = 1)
data['tuple'] = data['set'].apply(lambda row: tuple(sorted(row)))
data = data.drop_duplicates(subset = 'tuple', keep = 'first').drop(columns = ['set', 'tuple'])

## Model Enrichment Analysis

In [19]:
gene_sets = {}
for model in tqdm.tqdm(data['gene1'].unique()):
  gene_sets[model] = data[data['gene1'] == model]['gene2'].unique().tolist()

100%|██████████| 14665/14665 [04:32<00:00, 53.84it/s] 


`GSEApy` Enrichr API (not working)

In [35]:
models = np.array(data['gene1'].unique())
enrichment_results = dict.fromkeys(models, None)

for model, gene_set in tqdm.tqdm(gene_sets.items()):
  enr = gp.enrichr(gene_list = gene_set, gene_sets = [
      'GO_Biological_Process_2023', 
      'GO_Cellular_Component_2023',
      'GO_Molecular_Function_2023',
      'KEGG_2021_Human',
      'Reactome_Pathways_2024'
    ], organism = 'Human', outdir = None,
    no_plot = True
  )
  results = enr.results
  results = results[results['Adjusted P-value'] < 0.05][['Term', '-log10(p-value)']].set_index('Term', inplace = True)
  enrichment_results[model] = results

  0%|          | 0/14665 [00:02<?, ?it/s]


Exception: Error sending gene list, try again later

Direct Enrichr API Access

In [None]:
import json
import requests

ADD_URL = "https://maayanlab.cloud/Enrichr/addList"
ENRICHR_URL = "https://maayanlab.cloud/Enrichr/enrich"
query_string = "?userListId=%s&backgroundType=%s"

# define enrichment results
models = np.array(data['gene1'].unique())
enrichment_results = dict.fromkeys(models, None)

# convert to string for Enrichr
for model, gset in gene_sets.items():
  gene_sets[model] = '\n'.join(gset)

In [None]:
for model in models:
  payload = {
    'list': (None, gene_sets[model]),
    'description': (None, model)
  }
  response = requests.post(ADD_URL, files = payload)
  if not response.ok:
    raise Exception("Error analyzing {}".format(model))
  d = json.loads(response.text)