# Summary

This notebook replicates the results of the gene enrichment analysis on the two RNA-seq datasets, BRCA and KIRP.

In [None]:
import sys
sys.path.append("..")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import itertools
import random
import scipy
from tqdm import tqdm
import pickle
import os
import gseapy as gp
from gseapy.plot import barplot, dotplot

# Switch matplotlib to interactive mode so figures are drawn without blocking.
plt.ion()
# No figure exists yet at this point, so this call has nothing to display.
plt.show()


# IPython autoreload extension: mode 2 re-imports all modules before each
# cell is executed, so edits to local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Ask gseapy for the gene-set library names it knows about; `names` is not
# used by the later cells shown here, it only serves as a reference.
names = gp.get_library_name() # default: Human
# NOTE(review): a bare expression is only echoed when it is the last statement
# of a cell; if more code follows in the same cell this line displays nothing.
names

# Gene Ontology libraries used for the enrichment queries: every listed
# release of each GO branch, ordered branch-first and then by release.
_go_branches = ('Biological_Process', 'Cellular_Component', 'Molecular_Function')
_go_releases = ('2013', '2015', '2017', '2017b', '2018')

gene_sets = [
    f'GO_{branch}_{release}'
    for branch in _go_branches
    for release in _go_releases
]

# BRCA analysis

In [None]:
# Dataset tag; it is interpolated into both pickle paths below.
filename = "BRCA"

# Expression table and the precomputed subspace solutions for this dataset.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
expression_path = f"../data/rna_data/{filename}.pkl"
solutions_path = f"../data/{filename}_gmm_adapted_ratkowsky_lance.pkl"

data = pd.read_pickle(expression_path)
solutions = pd.read_pickle(solutions_path)

In [None]:
# Column labels of the expression table, excluding the final column
# (presumably a non-gene column such as a label -- TODO confirm).
gene_columns = data.columns[:-1]

# Keep only the text before the first '|' of every label.
input_genes = np.array([label.partition('|')[0] for label in gene_columns])

In [None]:
# Enrichment analysis of every sufficiently large subspace found for BRCA.
# For each one: query Enrichr on its genes, plot the significant terms and
# print which ontologies / how many distinct terms were significant.
MIN_SUBSPACE_SIZE = 10        # a subspace must have MORE than this many features
SIGNIFICANCE_CUTOFF = 0.05    # threshold applied to the adjusted p-value

for i, subspace in enumerate(solutions["features"].values):
    if len(subspace) <= MIN_SUBSPACE_SIZE:
        continue

    # Report progress before the (slow) Enrichr request rather than after it.
    print(f"analyzing subspace {i} of size {len(subspace)}")

    # Each entry of the subspace is used as an index into `input_genes`.
    gene_list = [input_genes[s] for s in subspace]

    # NOTE(review): the `description` and `outdir` values look like leftovers
    # from the gseapy example; they are kept unchanged so that output files
    # still land in the same place. Confirm they match the installed gseapy.
    enr = gp.enrichr(
        gene_list=gene_list,
        description='test_name',
        gene_sets=gene_sets,
        outdir='test/enrichr_kegg',
        cutoff=SIGNIFICANCE_CUTOFF,
    )

    # Rows that pass the cutoff; this is the same selection `barplot` applies.
    significant = enr.res2d[enr.res2d["Adjusted P-value"] <= SIGNIFICANCE_CUTOFF]

    # `barplot` creates its own figure, so no `plt.figure()` is needed here.
    # It is skipped when nothing is significant: gseapy rejects an empty
    # selection (depending on the version it raises or returns a warning
    # string), which would otherwise abort the whole loop.
    if not significant.empty:
        barplot(enr.res2d, title='Barplot', cutoff=SIGNIFICANCE_CUTOFF)
        plt.show()

    dd = significant[["Gene_set", "Term"]].drop_duplicates()
    ontology = dd["Gene_set"].unique()
    print(f"ontology { ontology}, {dd.shape[0]} functions")


# KIRP analysis

In [None]:
# Dataset tag; it is interpolated into both pickle paths below so that the
# expression table and the solutions always refer to the same dataset.
filename = "KIRP"
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
data = pd.read_pickle(f"../data/rna_data/{filename}.pkl")
solutions = pd.read_pickle(f"../data/{filename}_gmm_adapted_ratkowsky_lance.pkl")

# Column labels without the final column (presumably a non-gene column such
# as a label -- TODO confirm), reduced to the text before the first '|'.
input_genes = data.columns[:-1]
input_genes = np.array([g.split('|')[0] for g in input_genes])

# Enrichment analysis of every sufficiently large subspace found for KIRP.
# For each one: query Enrichr on its genes, plot the significant terms and
# print which ontologies / how many distinct terms were significant.
MIN_SUBSPACE_SIZE = 10        # a subspace must have MORE than this many features
SIGNIFICANCE_CUTOFF = 0.05    # threshold applied to the adjusted p-value

for i, subspace in enumerate(solutions["features"].values):
    if len(subspace) <= MIN_SUBSPACE_SIZE:
        continue

    # Report progress before the (slow) Enrichr request rather than after it.
    print(f"analyzing subspace {i} of size {len(subspace)}")

    # Each entry of the subspace is used as an index into `input_genes`.
    gene_list = [input_genes[s] for s in subspace]

    # NOTE(review): the `description` and `outdir` values look like leftovers
    # from the gseapy example; they are kept unchanged so that output files
    # still land in the same place. Confirm they match the installed gseapy.
    enr = gp.enrichr(
        gene_list=gene_list,
        description='test_name',
        gene_sets=gene_sets,
        outdir='test/enrichr_kegg',
        cutoff=SIGNIFICANCE_CUTOFF,
    )

    # Rows that pass the cutoff; this is the same selection `barplot` applies.
    significant = enr.res2d[enr.res2d["Adjusted P-value"] <= SIGNIFICANCE_CUTOFF]

    # Skip plotting when nothing is significant: gseapy rejects an empty
    # selection (depending on the version it raises or returns a warning
    # string), which would otherwise abort the whole loop.
    if not significant.empty:
        barplot(enr.res2d, title='Barplot', cutoff=SIGNIFICANCE_CUTOFF)
        plt.show()

    dd = significant[["Gene_set", "Term"]].drop_duplicates()
    ontology = dd["Gene_set"].unique()
    print(f"ontology { ontology}, {dd.shape[0]} functions")
        
        