In [32]:
import scanpy as sc
import gseapy as gp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Optional
from scipy import stats, sparse
from tqdm import tqdm
import h5py

In [30]:
class CellTypeGSEA:
    """
    A class to perform GSEA analysis between different cell types in single-cell data.
    """
    def __init__(
        self, 
        adata: sc.AnnData,
        cell_type_key: str = 'cell_type',
        organism: str = 'Mouse',  # or 'Human'
        gene_sets: List[str] = ['KEGG_2021_Mouse']
    ):
        """
        Initialize the GSEA analysis object.
        
        Parameters:
        -----------
        adata : AnnData
            Annotated data matrix with cell type annotations
        cell_type_key : str
            Key in adata.obs containing cell type labels
        organism : str
            'Human' or 'Mouse'
        gene_sets : List[str]
            List of gene set databases to use
        """
        self.adata = adata
        self.cell_type_key = cell_type_key
        self.organism = organism
        self.gene_sets = gene_sets
        self.results = {}

    def compute_cell_type_rankings(
        self,
        cell_type: str
    ) -> pd.Series:
        """
        Compute differential expression rankings for one cell type vs all others.
        
        Parameters:
        -----------
        cell_type : str
            Cell type to analyze
            
        Returns:
        --------
        pd.Series
            Ranked gene list with ranking scores
        """
        # Create binary mask for cell type
        cell_mask = self.adata.obs[self.cell_type_key] == cell_type
        
        # Initialize results storage
        n_genes = self.adata.n_vars
        scores = np.zeros(n_genes)
        pvals = np.zeros(n_genes)
        
        # Get expression matrix
        if sparse.issparse(self.adata.X):
            X = self.adata.X.toarray()
        else:
            X = self.adata.X
        
        # Compute rankings for each gene
        for i in range(n_genes):
            gene_expr = X[:, i]
            
            # Perform Mann-Whitney U test
            stat, pval = stats.mannwhitneyu(
                gene_expr[cell_mask],
                gene_expr[~cell_mask],
                alternative='two-sided'
            )
            
            # Compute effect size (log2 fold change)
            mean_1 = np.mean(gene_expr[cell_mask])
            mean_2 = np.mean(gene_expr[~cell_mask])
            log2fc = np.log2((mean_1 + 1e-10) / (mean_2 + 1e-10))
            
            scores[i] = log2fc
            pvals[i] = pval
        
        # Create ranking metric
        ranking_metric = -np.log10(pvals) * np.sign(scores)
        
        # Create ranked gene list
        gene_names = [f'gene_{i}' for i in range(n_genes)] if self.adata.var_names.empty else self.adata.var_names
        rankings = pd.Series(
            ranking_metric,
            index=gene_names,
            name='ranking'
        ).sort_values(ascending=False)
        
        return rankings

    def run_gsea(
        self,
        min_size: int = 15,
        max_size: int = 500,
        permutations: int = 1000,
        processes: int = 4
    ) -> Dict:
        """
        Run GSEA analysis for all cell types.
        
        Parameters:
        -----------
        min_size : int
            Minimum gene set size
        max_size : int
            Maximum gene set size
        permutations : int
            Number of permutations
        processes : int
            Number of parallel processes
            
        Returns:
        --------
        Dict
            Dictionary containing GSEA results for each cell type
        """
        # Get unique cell types
        cell_types = self.adata.obs[self.cell_type_key].unique()
        
        print("Running GSEA analysis for each cell type...")
        for cell_type in tqdm(cell_types):
            # Get rankings for this cell type
            rankings = self.compute_cell_type_rankings(cell_type)
            
            # Run GSEA
            pre_res = gp.prerank(
                rnk=rankings,
                gene_sets=self.gene_sets,
                min_size=min_size,
                max_size=max_size,
                permutation_num=permutations,
                processes=processes,
                seed=42,
                no_plot=True
            )
            
            # Store results
            self.results[cell_type] = pre_res.res2d
            
        return self.results

    def plot_top_pathways(
        self,
        n_pathways: int = 10,
        fdr_cutoff: float = 0.05,
        figsize: tuple = (12, 8)
    ) -> None:
        """
        Plot top enriched pathways for each cell type.
        
        Parameters:
        -----------
        n_pathways : int
            Number of top pathways to show
        fdr_cutoff : float
            FDR cutoff for significance
        figsize : tuple
            Figure size
        """
        # Combine all results
        all_results = []
        for cell_type, res in self.results.items():
            df = res.copy()
            df['cell_type'] = cell_type
            all_results.append(df)
        
        combined_results = pd.concat(all_results)
        
        # Filter significant pathways and get top n
        top_pathways = (combined_results
            .query('fdr < @fdr_cutoff')
            .groupby('cell_type')
            .apply(lambda x: x.nlargest(n_pathways, 'nes'))
            .reset_index(drop=True)
        )
        
        # Create plot
        plt.figure(figsize=figsize)
        g = sns.FacetGrid(
            data=top_pathways,
            col='cell_type',
            col_wrap=3,
            height=4,
            aspect=1.5
        )
        
        g.map_dataframe(
            sns.barplot,
            x='nes',
            y='Term',
            hue='fdr',
            palette='RdBu_r'
        )
        
        g.set_axis_labels('Normalized Enrichment Score', 'Pathway')
        g.fig.suptitle('Top Enriched Pathways by Cell Type', y=1.02)
        plt.tight_layout()
        
        return g

In [60]:
# Example usage:

### Load your data
"run_20250121_150654_dataset_cell_type_generated_data.h5"
"run_20250121_150654_dataset_cell_type_generated_labels.csv"
"run_20250121_150654_dataset_cell_type_generated_data.csv"
# Load data
file_path = "/Users/guyshani/Documents/PHD/Aim_2/10x_data_mouse/20_1_2025__normalized/saved_models/"

# Load expression matrix
with h5py.File(file_path+"run_20250121_150654_dataset_cell_type_generated_data.h5", 'r') as f:
    matrix = f['matrix'][:]

gene_symbols = pd.read_csv("/Users/guyshani/Documents/PHD/Aim_2/10x_data_mouse/20_1_2025__normalized/combined_normalized_data.csv", nrows=1, sep=";", header=None, index_col=0).iloc[0]
gene_symbols = [gene.upper() for gene in gene_symbols]
# Load labels
labels = pd.read_csv(file_path+"run_20250121_150654_dataset_cell_type_generated_labels.csv")

# Create AnnData object
adata = sc.AnnData(matrix)
adata.var_names = gene_symbols
# Add cell type labels to adata
adata.obs['cell_type'] = labels.iloc[:, 2].values  # Assuming cell types are in the third column

# Initialize GSEA analyzer
gsea_analyzer = CellTypeGSEA(
    adata,
    cell_type_key='cell_type',
    organism='Mouse',
    gene_sets=['WikiPathways_2024_Mouse']
)

# Run GSEA
results = gsea_analyzer.run_gsea()

# Plot results
plot = gsea_analyzer.plot_top_pathways()
plt.savefig('gsea_results.pdf', bbox_inches='tight', dpi=300)

# Save results to CSV
for cell_type, res in results.items():
    res.to_csv(f'gsea_results_{cell_type}.csv')

Running GSEA analysis for each cell type...


  log2fc = np.log2((mean_1 + 1e-10) / (mean_2 + 1e-10))
  pre_res = gp.prerank(
  log2fc = np.log2((mean_1 + 1e-10) / (mean_2 + 1e-10))
  pre_res = gp.prerank(
  log2fc = np.log2((mean_1 + 1e-10) / (mean_2 + 1e-10))
  pre_res = gp.prerank(
  log2fc = np.log2((mean_1 + 1e-10) / (mean_2 + 1e-10))
  pre_res = gp.prerank(
  log2fc = np.log2((mean_1 + 1e-10) / (mean_2 + 1e-10))
  pre_res = gp.prerank(
  log2fc = np.log2((mean_1 + 1e-10) / (mean_2 + 1e-10))
  pre_res = gp.prerank(
  log2fc = np.log2((mean_1 + 1e-10) / (mean_2 + 1e-10))
  pre_res = gp.prerank(
  log2fc = np.log2((mean_1 + 1e-10) / (mean_2 + 1e-10))
  pre_res = gp.prerank(
  log2fc = np.log2((mean_1 + 1e-10) / (mean_2 + 1e-10))
  pre_res = gp.prerank(
  log2fc = np.log2((mean_1 + 1e-10) / (mean_2 + 1e-10))
  pre_res = gp.prerank(
The order of those genes will be arbitrary, which may produce unexpected results.
100%|██████████| 10/10 [00:42<00:00,  4.22s/it]


UndefinedVariableError: name 'fdr' is not defined

In [None]:
# Hallmark gene sets - foundational biological processes
'HALLMARK'

# Curated gene sets
'KEGG_2021_Human'      # KEGG pathways
'REACTOME_2022'        # Reactome pathways
'WIKIPATHWAYS_2021_HUMAN' # WikiPathways
'PID_2019'             # Pathway Interaction Database

# GO gene sets
'GO_Biological_Process_2021'
'GO_Cellular_Component_2021'
'GO_Molecular_Function_2021'

# Immunologic gene sets
'IMMUNESIGDB'          # Immunological signatures

In [None]:
"run_20250121_150654_dataset_cell_type_generated_data.h5"
"run_20250121_150654_dataset_cell_type_generated_labels.csv"
"run_20250121_150654_dataset_cell_type_generated_data.csv"
file_path = "~/Documents/PHD/Aim_2/10x_data_mouse/20_1_2025__normalized/saved_models/"
data = pd.read_hdf(file_path)
# Convert to AnnData object
# Assumes genes are columns and cells are rows
adata = sc.AnnData(data)
    
expression_data = pd.read_csv(expression_file, index_col=0)
# Create AnnData object
adata = sc.AnnData(expression_data)
    
# If metadata file is provided, add it to the object

metadata = pd.read_csv(metadata_file, index_col=0)
# Make sure metadata rows match expression data rows
metadata = metadata.loc[adata.obs_names]
adata.obs = metadata

In [57]:
file_path = "/Users/guyshani/Documents/PHD/Aim_2/10x_data_mouse/20_1_2025__normalized/saved_models/"
'''
with h5py.File(file_path+"run_20250121_150654_dataset_cell_type_generated_data.h5", 'r') as f:
    # Print available keys to check structure
    print("Available keys:", list(f.keys()))
    # Assuming data is stored in a 'matrix' or similar key
    # Modify this based on your file structure
    data = f['matrix'][:]
'''
#labels = pd.read_csv(file_path+"run_20250121_150654_dataset_cell_type_generated_labels.csv")
#labels
#data
available_gsets = gp.get_library_name()
print("Available gene sets:", available_gsets)
# Different categories of mouse gene sets
mgi_sets = [gs for gs in available_gsets if 'MGI' in gs]
mouse_go = [gs for gs in available_gsets if 'Mouse' in gs]
mousecyc = [gs for gs in available_gsets if 'MouseCyc' in gs]

print("\nMGI Gene Sets:")
for gs in sorted(mgi_sets):
    print(f"- {gs}")

print("\nMouse GO Sets:")
for gs in sorted(mouse_go):
    print(f"- {gs}")

print("\nMouseCyc Sets:")
for gs in sorted(mousecyc):
    print(f"- {gs}")

Available gene sets: ['ARCHS4_Cell-lines', 'ARCHS4_IDG_Coexp', 'ARCHS4_Kinases_Coexp', 'ARCHS4_TFs_Coexp', 'ARCHS4_Tissues', 'Achilles_fitness_decrease', 'Achilles_fitness_increase', 'Aging_Perturbations_from_GEO_down', 'Aging_Perturbations_from_GEO_up', 'Allen_Brain_Atlas_10x_scRNA_2021', 'Allen_Brain_Atlas_down', 'Allen_Brain_Atlas_up', 'Azimuth_2023', 'Azimuth_Cell_Types_2021', 'BioCarta_2013', 'BioCarta_2015', 'BioCarta_2016', 'BioPlanet_2019', 'BioPlex_2017', 'CCLE_Proteomics_2020', 'CORUM', 'COVID-19_Related_Gene_Sets', 'COVID-19_Related_Gene_Sets_2021', 'Cancer_Cell_Line_Encyclopedia', 'CellMarker_2024', 'CellMarker_Augmented_2021', 'ChEA_2013', 'ChEA_2015', 'ChEA_2016', 'ChEA_2022', 'Chromosome_Location', 'Chromosome_Location_hg19', 'ClinVar_2019', 'DGIdb_Drug_Targets_2024', 'DSigDB', 'Data_Acquisition_Method_Most_Popular_Genes', 'DepMap_CRISPR_GeneDependency_CellLines_2023', 'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019', 'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019', 'Des