In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import os
import warnings
import functools
import seaborn as sns
import scipy.stats
import anndata
import matplotlib
# Font type 42 for both the PDF and the PS backend, set in one call.
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# All relative paths used below ('out/...', 'dataset/...') resolve against this folder.
os.chdir(os.path.expanduser('/home/jovyan/Prostate_analysis/scanpy'))
sc.settings.verbosity = 3
sc.logging.print_versions()

# AnnData file this notebook works on.
results_file = 'out/lymphoid.h5ad'

adata = sc.read_h5ad(results_file)
adata

  data = yaml.load(f.read()) or {}


scanpy==1.4.5.post2 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.2 scipy==1.4.1 pandas==0.25.1 scikit-learn==0.22.1 statsmodels==0.11.0rc1 python-igraph==0.7.1 louvain==0.6.1


AnnData object with n_obs × n_vars = 1694 × 2178 
    obs: 'age', 'barcode', 'batch', 'group', 'highest_GLEASON_score', 'mri_grading', 'name', 'patient', 'pool', 'psa', 'sample', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'is_doublet', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'leiden', 'celltype', 'Treg_score', 'CD4 NV/CM rest_score', 'CD4/CD8 rest_score', 'IFN Response_score', 'Proliferation_score', 'CD8 Cytotoxic_score', 'CD8 Cytokine_score', 'CD69+Trm_score', 'CD69-Trm_score', 'Trm_score', 'Tcm/Tem_score', 'GSE41867 Chronic-acute CD8+ Up_score', 'GSE41870 CD8+ LCMV Chronic-Acute UP_score', 'GSE9650 Chronic vs acute LCMV-specific CD8 T cells_score', 'celltype-immune', 'celltype-immune_group', 'K562_score', 'ADCC_score', 'cytokine_score', 'common_score', '__is_in_cluster__'
    var: 'gene_ids', 'feature_types', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'celltype-immune_colors', 'celltype_colors', 'dendr

In [6]:
### run preranked GSEA
def rankList(deg_file, remove_ribo_mito = False):
    """Turn a differential-expression table into a ranked gene list.

    The rank of a gene is ``-log10(pvals) * sign(logfoldchanges)``.
    Infinite ``-log10`` values (p-value of exactly 0) are capped at
    ``-log10(1e-308)`` so that the ranks stay finite.

    Parameters
    ----------
    deg_file
        Path or file-like object of a tab-separated table. The first column
        is used as the gene index; the table must contain the columns
        'pvals' and 'logfoldchanges'.
    remove_ribo_mito
        If True, drop every gene whose name contains 'RPS', 'RPL' or 'MT-'.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'index' (gene names) and 'rank', with a fresh 0..n-1
        row index. Rows keep the order they have in the file.

    Raises
    ------
    KeyError
        If 'pvals' or 'logfoldchanges' is missing from the table.
    """
    # read in the file
    deg = pd.read_csv(deg_file, sep = '\t', index_col = 0)
    if remove_ribo_mito:
        # remove ribosomal and mitochondrial genes
        # NOTE(review): the pattern is not anchored, so it also removes genes
        # that merely contain these substrings (e.g. 'MRPS..', 'TRPS1').
        # Confirm whether '^RPS|^RPL|^MT-' was intended.
        # na = False keeps rows whose gene name was parsed as missing
        # instead of failing on the boolean negation.
        is_ribo_mito = np.asarray(deg.index.str.contains('RPS|RPL|MT-', na = False), dtype = bool)
        deg = deg[~is_ribo_mito]

    pvals = deg['pvals'].to_numpy(dtype = float)
    lfc = deg['logfoldchanges'].to_numpy(dtype = float)

    # convert to negative log10 pval; p == 0 legitimately yields +inf here,
    # which is capped right below, so the divide-by-zero warning is muted
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        neglog10pval = -np.log10(pvals)

    # convert inf values to (roughly) the largest exponent a float can carry
    cap = -np.log10(1e-308)
    neglog10pval = np.where(np.isposinf(neglog10pval), cap, neglog10pval)
    neglog10pval = np.where(np.isneginf(neglog10pval), -cap, neglog10pval)

    # signed rank: magnitude from the p-value, direction from the fold change
    rank = neglog10pval * np.sign(lfc)

    # Build the result explicitly so the gene column is always called 'index',
    # whether or not the first column of the file carried a header.
    return pd.DataFrame({'index': deg.index, 'rank': rank})
# Build one ranked gene list per 'celltype-immune' category, for each of the
# two DEG result folders; genes matching the ribo/mito filter are dropped.
rnklist1, rnklist2 = {}, {}

# rnklist1 <- out/DEG/normal celltype markers/full/<celltype>_normal.txt
for x in list(adata.obs['celltype-immune'].cat.categories):
    rl = rankList('out/DEG/normal celltype markers/full/'+x+'_normal.txt', remove_ribo_mito = True)
    rnklist1[x] = rl

# rnklist2 <- out/DEG/tumor vs normal/full/<celltype>_tumor_vs_normal.txt
for x in list(adata.obs['celltype-immune'].cat.categories):
    rl = rankList('out/DEG/tumor vs normal/full/'+x+'_tumor_vs_normal.txt', remove_ribo_mito = True)
    rnklist2[x] = rl

In [None]:
# run preranked gsea
import gseapy as gp

# Create the output folder up front: each prerank call runs for minutes, and
# without the folder the result would be lost at the final to_csv().
os.makedirs('out/GSEA', exist_ok = True)

# NOTE(review): no `seed` is passed to gp.prerank, so the permutation-based
# statistics may differ between runs - consider fixing one once the installed
# gseapy version is confirmed to support it.

# Same GSEA settings for both comparisons; only the ranked lists and the
# suffix of the output file differ. Order of the runs is unchanged: all
# 'normal' lists first, then all 'tumor vs normal' lists.
for rnklists, suffix in ((rnklist1, '_go_prgsea_normal.txt'),
                         (rnklist2, '_go_prgsea_tumor_vs_normal.txt')):
    for x in list(adata.obs['celltype-immune'].cat.categories):
        pre_res = gp.prerank(rnk=rnklists[x],
                             gene_sets='dataset/c5.bp.v7.0.symbols.gmt',
                             processes=10,
                             min_size=10,
                             max_size=500,
                             permutation_num=1000, # reduce number to speed up testing
                             no_plot=True,
                             verbose=True)
        # one tab-separated result table per cell type and comparison
        pre_res.res2d.to_csv('out/GSEA/'+str(x)+suffix, sep ='\t')

2020-04-06 21:05:48,137 Parsing data files for GSEA.............................
2020-04-06 21:09:04,570 2527 gene_sets have been filtered out when max_size=500 and min_size=10
2020-04-06 21:09:04,572 4823 gene_sets used for further statistical testing.....
2020-04-06 21:09:04,573 Start to run GSEA...Might take a while..................
