# CROP-seq KO15 - Enrichment analysis of DEA results
- goal: enrichments of DEA_condition & DEA_KO results (also mixscape analyses)
- input: DEA results
- output: enrichment results, summary files & plots

In [1]:
cd ../

/home/sreichl/projects/bmdm-stim


In [2]:
# libraries
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import gseapy as gp

In [5]:
# import util functions
import sys
sys.path.insert(1, os.path.join('src'))

import utils_GSEApy

In [4]:
# for relaoding a module
# importlib.reload(utils_GSEApy)

In [7]:
# configs
dir_base = os.path.join('results','KO15')

mixscape = True

if mixscape:
    dir_data = os.path.join(dir_base, 'unsupervised_mixscape')
else:
    dir_data = dir_base
    
dir_results = os.path.join(dir_base, 'enrichment_analysis')

databases = ["BioPlanet_2019", 
             "KEGG_2019_Mouse", 
             "WikiPathways_2019_Mouse",
             "GO_Biological_Process_2021"]

# more stringent adj.p-value cut-off
# databases_strict=["BioPlanet_2019",
#            "GO_Biological_Process_2021"]
databases_strict=[]

adj_pvalue=0.05
adj_pvalue_strict=0.01

In [8]:
if not os.path.exists(dir_results):
        os.mkdir(dir_results)

# load data

In [9]:
annot = pd.read_csv(os.path.join(dir_base, 'KO15_raw_metadata.csv'), index_col=0)
print(annot.shape)
annot.head()

(20049, 15)


Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,nCount_AB,nFeature_AB,nCount_gRNA,nFeature_gRNA,nCount_HTO,nFeature_HTO,hto_demux,pass_QC,batch,percent.mt,guide_call,KO_call
1_AAACCTGAGAAGAAGC-1,PT141_1,9229,2626,277,8,186,14,184,3,HTO-RAW-SZ_LO28_2h,True,1,0.736808,mStat1_3,mStat1
1_AAACCTGAGACTGTAA-1,PT141_1,10672,3572,389,10,1312,15,167,2,HTO-RAW-SZ_untreated,True,1,2.398801,mIrf8_4,mIrf8
1_AAACCTGAGAGTGAGA-1,PT141_1,799,633,82,8,12,9,25,3,Negative,False,1,0.500626,mJak1_1,mJak1
1_AAACCTGAGCCATCGC-1,PT141_1,1153,845,285,8,20,17,81,3,HTO-RAW-SZ_untreated,False,1,12.142238,Negative,Negative
1_AAACCTGAGGTGTTAA-1,PT141_1,9867,3564,135,9,1069,11,19,2,HTO-RAW-SZ_untreated,True,1,1.530354,mKdm1b_3,mKdm1b


In [10]:
SCT_counts = pd.read_csv(os.path.join(dir_data, 'KO15_SCT_counts.csv'), index_col=0)
print(SCT_counts.shape)
SCT_counts.head()

(13395, 4978)


Unnamed: 0,X1_AAACCTGAGAAGAAGC.1,X1_AAACCTGCATTCTCAT.1,X1_AAACGGGAGAGGGCTT.1,X1_AAACGGGCAAAGGTGC.1,X1_AAACGGGCAAGCGAGT.1,X1_AAACGGGGTACCGTAT.1,X1_AAAGATGAGCCTCGTG.1,X1_AAAGATGCAATGGAGC.1,X1_AAAGATGTCTAACTCT.1,X1_AAAGCAATCGTCCAGG.1,...,X2_TTTACTGTCGGCTTGG.1,X2_TTTATGCAGGAGTACC.1,X2_TTTATGCCAGTTCATG.1,X2_TTTATGCTCAGTCAGT.1,X2_TTTATGCTCATGCATG.1,X2_TTTCCTCCAGACGCCT.1,X2_TTTGCGCCACCGATAT.1,X2_TTTGCGCGTTCATGGT.1,X2_TTTGGTTCATGCATGT.1,X2_TTTGGTTTCCGGGTGT.1
Mrpl15,2,1,0,2,1,1,0,1,1,1,...,3,1,0,1,1,0,2,2,0,0
Lypla1,0,0,1,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Tcea1,1,1,0,1,0,1,0,2,0,1,...,0,2,0,0,0,0,1,1,1,2
Rgs20,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Atp6v1h,1,0,0,3,1,1,3,0,0,1,...,2,1,0,0,0,3,1,1,3,0


In [11]:
# prepare metadata

# conditions
annot['condition'] = annot['hto_demux'].str.split(pat='_',n=1, expand=True)[1]
conditions = list(annot['condition'].unique())
conditions.remove(None)
conditions = [conditions[i] for i in [1,0,2]]
print(conditions)

# KO target genes
KO_genes = list(annot['KO_call'].unique())
KO_genes.remove('Negative')
KO_genes.remove('Multiplet')
KO_genes.remove('mNon-targeting')
print(KO_genes)

['untreated', 'LO28_2h', 'LO28_6h']
['mStat1', 'mIrf8', 'mJak1', 'mKdm1b', 'mHdac6', 'mIrf9', 'mRela', 'mSpi1', 'mCsf1r', 'mTyk2', 'mcJun', 'mStat2', 'mKdm6b', 'mCreb1', 'mEp300']


In [12]:
# define list of background genes
background = SCT_counts.index
background = list(set(background))
background=[str(x).upper() for x in list(background)]
print(len(background))
background[:5]

13395


['NEDD1', 'TXN2', 'TAF15', 'LIPC', 'SLC35G3']

# Enrichment analysis

In [13]:
stat_sign = 0.05
pval = 'p_val_adj'

## DEA analysis within condition between KOs

In [14]:
if mixscape:
    dea_folder = 'DEA_condition_mixscape'
else:
    dea_folder = 'DEA_condition'

In [16]:
# get significant genes per treatment and time point ie group
gene_lists = dict()

for cond in conditions:
    for KO_gene in KO_genes:
        tmp_path = os.path.join(dir_base, dea_folder, 'KO15_DEA_RNA_{}_{}.csv'.format(cond, KO_gene))
        
        if not os.path.isfile(tmp_path):
            continue
        
        # load DEA analysis results
        tmp_results = pd.read_csv(tmp_path, index_col=0)
        
        for direction in ['up','down']:
            if direction=='up':
                tmp_genes = tmp_results.loc[(tmp_results[pval]<0.05) & (tmp_results['avg_log2FC']>0), ].index.unique()
            else:
                tmp_genes = tmp_results.loc[(tmp_results[pval]<0.05) & (tmp_results['avg_log2FC']<0), ].index.unique()
        
            if len(tmp_genes)==0:
                continue
            
            gene_lists[cond+"_"+KO_gene+"_"+direction] = tmp_genes

print(len(gene_lists.keys()))        
gene_lists.keys()

45


dict_keys(['untreated_mStat1_up', 'untreated_mStat1_down', 'untreated_mJak1_up', 'untreated_mJak1_down', 'untreated_mIrf9_up', 'untreated_mIrf9_down', 'untreated_mSpi1_up', 'untreated_mSpi1_down', 'untreated_mTyk2_up', 'untreated_mTyk2_down', 'untreated_mStat2_up', 'untreated_mStat2_down', 'untreated_mEp300_up', 'untreated_mEp300_down', 'LO28_2h_mStat1_up', 'LO28_2h_mJak1_up', 'LO28_2h_mJak1_down', 'LO28_2h_mIrf9_up', 'LO28_2h_mIrf9_down', 'LO28_2h_mRela_up', 'LO28_2h_mRela_down', 'LO28_2h_mSpi1_up', 'LO28_2h_mSpi1_down', 'LO28_2h_mTyk2_down', 'LO28_2h_mStat2_up', 'LO28_2h_mStat2_down', 'LO28_2h_mEp300_up', 'LO28_2h_mEp300_down', 'LO28_6h_mStat1_up', 'LO28_6h_mStat1_down', 'LO28_6h_mIrf8_up', 'LO28_6h_mIrf8_down', 'LO28_6h_mJak1_up', 'LO28_6h_mJak1_down', 'LO28_6h_mIrf9_up', 'LO28_6h_mIrf9_down', 'LO28_6h_mRela_down', 'LO28_6h_mSpi1_up', 'LO28_6h_mSpi1_down', 'LO28_6h_mTyk2_up', 'LO28_6h_mTyk2_down', 'LO28_6h_mStat2_up', 'LO28_6h_mStat2_down', 'LO28_6h_mEp300_up', 'LO28_6h_mEp300_down'

In [18]:
# convert gene IDs to symbols, and upper-case
for key in gene_lists.keys():
    gene_lists[key] = list(gene_lists[key])
    gene_lists[key] = [str(x).upper() for x in list(gene_lists[key])]

In [20]:
util_GSEApy.do_enrichment_all(gene_lists=gene_lists, 
                  background=background, 
                  databases=databases, 
                  databases_strict=databases_strict, 
                  adj_pvalue=adj_pvalue, 
                  adj_pvalue_strict=adj_pvalue_strict, 
                  dir_results=os.path.join(dir_results,dea_folder),
                  top_n=3
                 )



BioPlanet_2019




KEGG_2019_Mouse
WikiPathways_2019_Mouse




GO_Biological_Process_2021
BioPlanet_2019 396
KEGG_2019_Mouse 108
WikiPathways_2019_Mouse 69
GO_Biological_Process_2021 1403


## DEA analysis within KOs between conditions

In [21]:
if mixscape:
    dea_folder = 'DEA_KO_mixscape'
else:
    dea_folder = 'DEA_KO'

In [22]:
# get significant genes per treatment and time point ie group
gene_lists = dict()

for KO_gene in KO_genes+['mNon-targeting']:
    for cond1 in conditions[:len(conditions)-1]:
        for cond2 in conditions[conditions.index(cond1)+1:len(conditions)]:

            tmp_path = os.path.join(dir_base, dea_folder, 'KO15_DEA_RNA_{}_{}_{}.csv'.format(KO_gene, cond2, cond1))

            if not os.path.isfile(tmp_path):
                continue
            
            # load DEA analysis results
            tmp_results = pd.read_csv(tmp_path, index_col=0)

            for direction in ['up','down']:
                if direction=='up':
                    tmp_genes = tmp_results.loc[(tmp_results[pval]<0.05) & (tmp_results['avg_log2FC']>0), ].index.unique()
                else:
                    tmp_genes = tmp_results.loc[(tmp_results[pval]<0.05) & (tmp_results['avg_log2FC']<0), ].index.unique()

                if len(tmp_genes)==0:
                    continue

                gene_lists[KO_gene+"_"+cond2+"_"+cond1+"_"+direction] = tmp_genes

print(len(gene_lists.keys()))        
gene_lists.keys()

50


dict_keys(['mStat1_LO28_2h_untreated_up', 'mStat1_LO28_2h_untreated_down', 'mStat1_LO28_6h_untreated_up', 'mStat1_LO28_6h_untreated_down', 'mStat1_LO28_6h_LO28_2h_up', 'mStat1_LO28_6h_LO28_2h_down', 'mJak1_LO28_2h_untreated_up', 'mJak1_LO28_2h_untreated_down', 'mJak1_LO28_6h_untreated_up', 'mJak1_LO28_6h_untreated_down', 'mJak1_LO28_6h_LO28_2h_up', 'mJak1_LO28_6h_LO28_2h_down', 'mIrf9_LO28_2h_untreated_up', 'mIrf9_LO28_2h_untreated_down', 'mIrf9_LO28_6h_untreated_up', 'mIrf9_LO28_6h_untreated_down', 'mIrf9_LO28_6h_LO28_2h_up', 'mIrf9_LO28_6h_LO28_2h_down', 'mRela_LO28_6h_LO28_2h_up', 'mRela_LO28_6h_LO28_2h_down', 'mSpi1_LO28_2h_untreated_up', 'mSpi1_LO28_2h_untreated_down', 'mSpi1_LO28_6h_untreated_up', 'mSpi1_LO28_6h_untreated_down', 'mSpi1_LO28_6h_LO28_2h_up', 'mSpi1_LO28_6h_LO28_2h_down', 'mTyk2_LO28_2h_untreated_up', 'mTyk2_LO28_2h_untreated_down', 'mTyk2_LO28_6h_untreated_up', 'mTyk2_LO28_6h_untreated_down', 'mTyk2_LO28_6h_LO28_2h_up', 'mTyk2_LO28_6h_LO28_2h_down', 'mStat2_LO28_2h

In [23]:
# convert gene IDs to symbols, and upper-case
for key in gene_lists.keys():
    gene_lists[key] = list(gene_lists[key])
    gene_lists[key] = [str(x).upper() for x in list(gene_lists[key])]

In [24]:
util_GSEApy.do_enrichment_all(gene_lists=gene_lists, 
                              background=background, 
                              databases=databases, 
                              databases_strict=databases_strict, 
                              adj_pvalue=adj_pvalue, 
                              adj_pvalue_strict=adj_pvalue_strict, 
                              dir_results=os.path.join(dir_results,dea_folder),
                              top_n=3
                             )



BioPlanet_2019




KEGG_2019_Mouse




WikiPathways_2019_Mouse
GO_Biological_Process_2021




BioPlanet_2019 689
KEGG_2019_Mouse 185
WikiPathways_2019_Mouse 105
GO_Biological_Process_2021 1511
