# GSEA analysis pipeline

Pipeline gsea analysis using ranked gene files.

## Workbook setup

### Load libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import pandas as pd
import os, re, glob
import seaborn as sns

### Functions

## Seurat v4 cluster gsea

Have recently rerun single cell RNASeq analyses through Seurat v4.0.2. Re-run focuses on just CMP.  
Ran Seurat with quality qc:  
* nFeature_RNA between 200 and 4000 (inclusive)  
* Mitochondrial threshold <= 5%  
Used `Normalize()` and `ScaleData()` functions (as opposed to `SCTransform()` because dealing with a single library prep.  
Regression `seurat.object <- ScaleData(seurat.object, features = all.genes, vars.to.regress = c("nFeature_RNA", "nCount_RNA"))`

### GSEA command

In [3]:
os.chdir("/Users/heustonef/Desktop/10XGenomicsData/msAggr_scRNASeq/RankList_res2.5_findAll_hgnc/")

In [82]:
# Use `FindAllMarkers()` to generate ranked gene files per cluster. Max adj_p_val 0.05, log_2FC threshold >= 0.25
# Get the immediately important ones dones first, then can run everything when have a few hours...

gmt_list = ["/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c3.all.v7.4.symbols.gmt", 
            "/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c8.all.v7.4.symbols.gmt", 
#             "/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/h.all.v7.4.symbols.gmt"
           ]
rnk_list = ["res.2.5cluster7.rnk", "res.2.5cluster11.rnk", "res.2.5cluster14.rnk", "res.2.5cluster21.rnk", "res.2.5cluster27.rnk", "res.2.5cluster23.rnk", "res.2.5cluster2.rnk", "res.2.5cluster15.rnk", "res.2.5cluster5.rnk"]


gsea_cmds = []
for gmt in gmt_list:
    for rnk_file in rnk_list:
        outname = re.search('^(.+)(?=.rnk)', rnk_file).group(0)
        outgmt = re.search('(.+)(?=.v\d)', os.path.basename(gmt)).group(0)
        outfile = '_'.join((outname, outgmt))
        gsea_cmd = ''.join(('~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx ', 
                            gmt,
                            ' -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk ', 
                             rnk_file, 
                            ' -scoring_scheme weighted -rpt_label ',
                            outfile, 
                            ' -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false',
#                             datetime.datetime.now().strftime("%b%d").lower())
                          )
        gsea_cmds.append(gsea_cmd)

for cmd in gsea_cmds:
#     print(cmd)
    os.system(cmd)
    print("finished", re.search('(?:rpt_label\s)(res.+)(?=\s-create)', cmd).group(1))

finished res.2.5cluster7_c3.all
finished res.2.5cluster11_c3.all
finished res.2.5cluster14_c3.all
finished res.2.5cluster21_c3.all
finished res.2.5cluster27_c3.all
finished res.2.5cluster23_c3.all
finished res.2.5cluster2_c3.all
finished res.2.5cluster15_c3.all
finished res.2.5cluster5_c3.all
finished res.2.5cluster7_c8.all
finished res.2.5cluster11_c8.all
finished res.2.5cluster14_c8.all
finished res.2.5cluster21_c8.all
finished res.2.5cluster27_c8.all
finished res.2.5cluster23_c8.all
finished res.2.5cluster2_c8.all
finished res.2.5cluster15_c8.all
finished res.2.5cluster5_c8.all


In [47]:
# Use `FindAllMarkers()` to generate ranked gene files per cluster. Max adj_p_val 0.05, log_2FC threshold >= 0.25
gmt_list = ["/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c2.all.v7.4.symbols.gmt", "/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c3.all.v7.4.symbols.gmt", "/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c8.all.v7.4.symbols.gmt", "/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/h.all.v7.4.symbols.gmt"]
rnk_list = glob.glob('*.rnk')


gsea_cmds = []
for gmt in gmt_list:
    for rnk_file in rnk_list:
#         outname = re.split(pattern='\.rnk', string=rnk_file)[0]
        outname = re.search('^(.+)(?=.rnk)', rnk_file).group(0)
        outgmt = re.search('(.+)(?=.v\d)', os.path.basename(gmt)).group(0)
        outfile = '_'.join((outname, outgmt))
        gsea_cmd = '_'.join(('~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx ', 
                            gmt,
                            ' -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk ', 
                             rnk_file, 
                            ' -scoring_scheme weighted -rpt_label ',
                            outfile, 
                            ' -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false -out ./'))
        gsea_cmds.append(gsea_cmd)

for cmd in gsea_cmds:
#     print(cmd)
#     os.system(cmd)
    print("finished", re.search('(?:rpt_label\s)(res.+)(?=\s-create)', cmd).group(1))

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c2.all.v7.4.symbols.gmt_ -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk _res.2.5cluster27.rnk_ -scoring_scheme weighted -rpt_label _res.2.5cluster27_c2.all_ -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false -out ./
finished res.2.5cluster27.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c2.all.v7.4.symbols.gmt_ -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk _res.2.5cluster27.rnk_ -scoring_scheme weighted -rpt_label _res.2.5cluster27_c2.all_ -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false -out ./
finished res.2.5cluster7.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.

finished res.2.5cluster17.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c2.all.v7.4.symbols.gmt_ -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk _res.2.5cluster9.rnk_ -scoring_scheme weighted -rpt_label _res.2.5cluster9_c2.all_ -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false -out ./
finished res.2.5cluster16.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c2.all.v7.4.symbols.gmt_ -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk _res.2.5cluster29.rnk_ -scoring_scheme weighted -rpt_label _res.2.5cluster29_c2.all_ -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false -out ./
finished res.2.5cluster27.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Us

finished res.2.5cluster10.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c3.all.v7.4.symbols.gmt_ -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk _res.2.5cluster29.rnk_ -scoring_scheme weighted -rpt_label _res.2.5cluster29_c3.all_ -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false -out ./
finished res.2.5cluster14.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c3.all.v7.4.symbols.gmt_ -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk _res.2.5cluster29.rnk_ -scoring_scheme weighted -rpt_label _res.2.5cluster29_c3.all_ -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false -out ./
finished res.2.5cluster28.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/

finished res.2.5cluster5.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c8.all.v7.4.symbols.gmt_ -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk _res.2.5cluster15.rnk_ -scoring_scheme weighted -rpt_label _res.2.5cluster15_c8.all_ -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false -out ./
finished res.2.5cluster25.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/c8.all.v7.4.symbols.gmt_ -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk _res.2.5cluster15.rnk_ -scoring_scheme weighted -rpt_label _res.2.5cluster15_c8.all_ -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false -out ./
finished res.2.5cluster19.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/U

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/h.all.v7.4.symbols.gmt_ -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk _res.2.5cluster16.rnk_ -scoring_scheme weighted -rpt_label _res.2.5cluster16_h.all_ -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false -out ./
finished res.2.5cluster7.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.0/gmtFiles/h.all.v7.4.symbols.gmt_ -collapse No_Collapse -mode Max_probe -norm meandiv -nperm 20000 -rnk _res.2.5cluster16.rnk_ -scoring_scheme weighted -rpt_label _res.2.5cluster16_h.all_ -create_svgs false -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 5 -zip_report false -out ./
finished res.2.5cluster6.rnk 

~/Desktop/GSEA_4.1.0/gsea-cli.sh GSEAPreranked -gmx _/Users/heustonef/Desktop/GSEA_4.1.0/gmt

### Plotting

In [None]:
# Search through GSEA directories for tsv summary files
file_list = []
for gsea_dir in os.listdir('./'):
    if re.match('c\d.+', gsea_dir) and os.path.isdir(gsea_dir):
        print('Searching', gsea_dir)
        for gsea_file in os.listdir(os.path.join(gsea_dir)):
            if bool(re.match('gsea.+na_[pos|neg].+.tsv', gsea_file)) == True:
                file_list.append(os.path.join(gsea_dir, gsea_file))
                
goAll_list = []
go_writelist = []

# Load summary files into go_alldf table
for file in file_list:
    filename =os.path.basename(re.match('(\w+?)(?=_LSK)', file)[0])

    df = pd.DataFrame()
    goAll_file = pd.read_csv(file, sep = '\t')
    goAll_file['File'] = filename
    df['Pathway'] = goAll_file['NAME']
    df['File'] = filename
    df['size'] = pd.to_numeric(goAll_file['SIZE'])
    df['NES'] = goAll_file['NES']
    df['FDR'] = goAll_file['FDR q-val']
    goAll_list.append(df)
    go_writelist.append(goAll_file)
go_alldf = pd.concat(goAll_list, axis = 0, ignore_index = True)

# Limit table to only entries with FDR <= 0.25
sig_goalldf = go_alldf[go_alldf['FDR'] <= 0.25].sort_values('Pathway')
sig_goalldf['NES'] = pd.to_numeric(sig_goalldf['NES'])
sig_goalldf = sig_goalldf.drop('FDR', axis = 1)
sig_goalldf['File'] = sig_goalldf['File'].apply(lambda x:  pd.Series(str(x).split("vC")[0]))
sig_goalldf = pd.melt(sig_goalldf, id_vars=['Pathway', 'File'])
sig_size = sig_goalldf[sig_goalldf['variable'] == 'size'].copy()
sig_size.drop('variable', inplace = True, axis = 1)
sig_size = sig_size.rename(columns = {'value': 'size'})
sig_nes = sig_goalldf[sig_goalldf['variable'] == 'NES'].copy()
sig_nes.drop('variable', inplace = True, axis = 1)
sig_nes = sig_nes.rename(columns = {'value': 'NES'})
sigDF = pd.merge(left = sig_size, right = sig_nes, left_on=['File', 'Pathway'], right_on = ['File', 'Pathway'])

# Format order of x and y axes
pathwayorder = ['LSK', 'CMP', 'GMP', 'MK', 'ERY']
fileorder = ['c3s', 'c10', 'c11', 'c17']
spacefiller = pd.DataFrame({'Pathway':['LSK'], 'File': ['c3s'], 'size': [0], 'NES':[0]})
sigDF = sigDF.append(spacefiller)
sigDF['Pathway'] = pd.Categorical(sigDF['Pathway'], categories=['LSK', 'CMP', 'GMP', 'MK', 'ERY'], ordered=True)
sigDF['File'] = pd.Categorical(sigDF['File'], categories=fileorder, ordered=True)

# Normalize size and NES score for plotting
sigDF = sigDF.sort_values(['File', 'Pathway'])
size_norm = sigDF['size']/sigDF['size'].max()
nes_norm = sigDF['NES']/sigDF['NES'].max()


In [None]:
fig, ax = plt.subplots(figsize = (5,8))
scatter = ax.scatter(data = sigDF, x = 'File', y = 'Pathway', c = nes_norm, s = size_norm * 2000, cmap = 'coolwarm')
ax.margins(y = .1, x = .1)
ax.grid(True)
ax.set_axisbelow(True)
plt.box(on = None)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num = 5), loc="lower right", title="NES", bbox_to_anchor=(.5, 1), prop={'size': 20}) # note color is normalized
ax.add_artist(legend1)

# produce a legend with a cross section of sizes from the scatter
handles, labels = scatter.legend_elements(num =5, prop="sizes", alpha=0.6)
legend2 = ax.legend(handles, labels, loc="upper left", title="size", bbox_to_anchor=(.5,1.44), prop={'size': 23}) # note size is normalized

plt.show()
