# CROP-seq KO150 - Enrichment analysis of DEA results
- goal: enrichments of DEA_condition & DEA_KO results (also mixscape versions)
- input: DEA results
- output: enrichment results, summary files & plots

In [1]:
cd ../

/home/sreichl/projects/bmdm-stim


In [2]:
# libraries
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import gseapy as gp

In [5]:
# make the project's helper modules under src/ importable, then load the
# GSEApy utility module
import sys
sys.path.insert(1, 'src')

import utils_GSEApy

In [4]:
# for reloading a module
# importlib.reload(utils_GSEApy)

In [5]:
# --- configuration ---

# root folder of the KO150 results
dir_base = os.path.join('results', 'KO150')

# switch between the mixscape variant of the inputs and the plain one
mixscape = True

# the mixscape inputs live in their own sub-folder of dir_base
dir_data = os.path.join(dir_base, 'unsupervised_mixscape') if mixscape else dir_base

# base folder for the enrichment results
dir_results = os.path.join(dir_base, 'enrichment_analysis')

# gene-set databases handed to the enrichment helper
databases = [
    "BioPlanet_2019",
    "KEGG_2019_Mouse",
    "WikiPathways_2019_Mouse",
    "GO_Biological_Process_2021",
]

# databases that get the more stringent adj. p-value cut-off; currently none
# (previously considered: ["BioPlanet_2019", "GO_Biological_Process_2021"])
databases_strict = []

# adjusted p-value cut-offs: default and strict
adj_pvalue = 0.05
adj_pvalue_strict = 0.01

In [6]:
# Create the results folder, including any missing parent folders.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(dir_results, exist_ok=True)

# load data

In [7]:
# metadata table from the base results folder; first CSV column becomes the index
annot = pd.read_csv(
    os.path.join(dir_base, 'KO150_raw_metadata.csv'),
    index_col=0,
)
print(annot.shape)
annot.head()

(76344, 16)


Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,nCount_AB,nFeature_AB,nCount_gRNA,nFeature_gRNA,nCount_HTO,nFeature_HTO,hto_demux,pass_QC,batch,percent.mt,guide_call,KO_call,condition
A1_AAACCTGAGAATCTCC-1,PT149_5_A1_transcriptome,17884,4442,987,9,358,51,379,3,HTO-RAW264.7-b_untreated,True,A1,1.481771,Chd7-2,Chd7,untreated
A1_AAACCTGAGCGATGAC-1,PT149_5_A1_transcriptome,12849,3787,1378,9,1103,56,288,3,HTO-RAW264.7-b_untreated,True,A1,1.299712,Babam1-1,Babam1,untreated
A1_AAACCTGAGGATCGCA-1,PT149_5_A1_transcriptome,21634,4842,1097,11,74,54,1062,3,HTO-RAW264.7-a_untreated,True,A1,1.127854,Negative,Negative,untreated
A1_AAACCTGAGTGCCAGA-1,PT149_5_A1_transcriptome,14479,3851,458,10,838,72,292,3,HTO-RAW264.7-b_untreated,True,A1,1.20174,Mov10-4,Mov10,untreated
A1_AAACCTGAGTGGAGAA-1,PT149_5_A1_transcriptome,13765,3832,1437,9,270,117,963,3,HTO-RAW264.7-b_untreated,True,A1,1.031602,Spi1-1,Spi1,untreated


In [8]:
# SCT count table from the data folder (mixscape sub-folder when enabled);
# first CSV column becomes the index
SCT_counts = pd.read_csv(
    os.path.join(dir_data, 'KO150_SCT_counts.csv'),
    index_col=0,
)
print(SCT_counts.shape)
SCT_counts.head()

(14845, 6187)


Unnamed: 0,A1_AAACCTGCATGCTGGC.1,A1_AAACGGGAGAAACGCC.1,A1_AAACGGGCACCTCGGA.1,A1_AAACGGGGTCCAAGTT.1,A1_AAACGGGGTCTGGAGA.1,A1_AAACGGGGTGCTCTTC.1,A1_AAAGATGGTAGCCTAT.1,A1_AAAGATGGTGGTCTCG.1,A1_AAAGATGGTTGATTGC.1,A1_AAAGATGTCATCGATG.1,...,C3_TTCCCAGTCTCGATGA.1,C3_TTGACTTTCCAAGTAC.1,C3_TTGCCGTAGTTCCACA.1,C3_TTGTAGGTCCGTAGGC.1,C3_TTTACTGAGTCGATAA.1,C3_TTTATGCGTGTTTGGT.1,C3_TTTCCTCGTAAAGTCA.1,C3_TTTGCGCCACATGTGT.1,C3_TTTGCGCCATTGGGCC.1,C3_TTTGGTTGTCAGATAA.1
Mrpl15,2,0,1,3,1,1,9,2,0,1,...,0,1,6,2,2,3,2,1,5,0
Lypla1,1,0,0,0,0,2,0,1,0,0,...,1,0,0,0,0,1,0,0,1,0
Tcea1,1,2,4,0,1,3,5,3,3,1,...,5,1,4,0,3,0,5,0,1,1
Rgs20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Atp6v1h,1,3,4,3,0,1,3,3,2,3,...,5,3,8,3,3,1,2,1,3,0


In [9]:
# prepare metadata

# conditions: the part of the HTO label after the first underscore
# (this overwrites the 'condition' column loaded from the metadata file)
annot['condition'] = annot['hto_demux'].str.split(pat='_', n=1, expand=True)[1]
# Labels without an underscore (or missing labels) yield None/NaN. Drop them by
# filtering: list.remove(None) raises ValueError when no None is present and
# would leave NaN entries in the list.
conditions = list(annot['condition'].dropna().unique())
print(conditions)

# KO target genes: all KO calls except 'Negative', 'Multiplet' and 'NonTargeting'
# (filtering keeps the order of first appearance and never raises if a call is absent)
KO_genes = [
    gene for gene in annot['KO_call'].dropna().unique()
    if gene not in ('Negative', 'Multiplet', 'NonTargeting')
]
print(KO_genes)

['untreated', 'LO28-6h', 'LO28-24h']
['Chd7', 'Babam1', 'Mov10', 'Spi1', 'Jmjd1c', 'Ezh2', 'Dnmt1', 'Smc1a', 'Arid1a', 'Ifi203', 'Elp2', 'Taf7', 'Prmt2', 'Morf4l2', 'Sfpq', 'Irf1', 'Ifnar1', 'Ube2d3', 'Ep300', 'Arid4a', 'Ppm1g', 'Sp100', 'Brd1', 'Kdm5a', 'Tyk2', 'Myc', 'Apobec3', 'Ywhaz', 'Ezh1', 'Fcgr1', 'Dtx3l', 'Phf11b', 'Irf9', 'Smarca5', 'Padi2', 'Chd1', 'Gadd45b', 'Prdm1', 'Yeats4', 'Phip', 'Pcgf5', 'Usp16', 'Pbrm1', 'Phf23', 'Rlim', 'Sirt2', 'Elp4', 'Kmt2a', 'Senp3', 'Irf3', 'Ubn1', 'Tlk2', 'Rela', 'Daxx', 'Chtop', 'Hdac2', 'Setdb2', 'Prmt1', 'Ep400', 'Set', 'Mllt6', 'Apobec1', 'Smyd3', 'Ppp4r2', 'Nap1l1', 'Eid1', 'Smarcc2', 'Anp32b', 'Ikzf1', 'Dnttip2', 'Brd7', 'Brd2', 'Stat2', 'Myd88', 'Dot1l', 'Yy1', 'Pkm', 'Hp1bp3', 'Pml', 'Csf1r', 'Anp32e', 'Jun', 'Sp110', 'Aebp2', 'Runx1', 'Gatad1', 'Chd2', 'Chd4', 'Baz1a', 'Aak1', 'Spen', 'Dek', 'Stat1', 'Smyd2', 'Brd3', 'Smyd5', 'Sap30', 'Tet2', 'Mbd2', 'Atad2', 'Ddx21', 'Elp5', 'Syncrip', 'Cited2', 'Irf8', 'Dnajc1', 'Supt16', 'Brwd3', '

In [10]:
# background gene list: unique row labels of the SCT count table, upper-cased
background = [str(gene).upper() for gene in set(SCT_counts.index)]
print(len(background))
background[:5]

14845


['LURAP1', 'PSMG1', 'A430105J06RIK', 'DDX11', 'GSTA4']

# Enrichment analysis

In [11]:
# significance threshold and the DEA result column holding the adjusted p-values
stat_sign, pval = 0.05, 'p_val_adj'

## DEA analysis within condition between KOs

In [12]:
# DEA input folder and top_n setting depend on whether the mixscape variant is used
dea_folder, top_n = ('DEA_condition_mixscape', 3) if mixscape else ('DEA_condition', 1)

In [13]:
# Collect the significant DEA genes per condition, KO and direction of change.
# Keys have the form "<condition>_<KO>_<up|down>"; combinations without a result
# file or without significant genes are left out.
gene_lists = dict()

for cond in conditions:
    for KO_gene in KO_genes:
        tmp_path = os.path.join(dir_base, dea_folder, 'KO150_DEA_RNA_{}_{}.csv'.format(cond, KO_gene))

        # not every condition/KO combination has a DEA result file
        if not os.path.isfile(tmp_path):
            continue

        # load DEA analysis results
        tmp_results = pd.read_csv(tmp_path, index_col=0)

        # use the configured threshold (stat_sign) rather than a hard-coded 0.05,
        # so that changing the config actually changes the selection
        significant = tmp_results[pval] < stat_sign
        direction_masks = {
            'up': tmp_results['avg_log2FC'] > 0,
            'down': tmp_results['avg_log2FC'] < 0,
        }

        for direction, mask in direction_masks.items():
            tmp_genes = tmp_results.loc[significant & mask].index.unique()

            if len(tmp_genes) == 0:
                continue

            gene_lists[cond+"_"+KO_gene+"_"+direction] = tmp_genes

print(len(gene_lists))
gene_lists.keys()

374


dict_keys(['untreated_Babam1_down', 'untreated_Spi1_up', 'untreated_Spi1_down', 'untreated_Jmjd1c_up', 'untreated_Dnmt1_up', 'untreated_Dnmt1_down', 'untreated_Smc1a_up', 'untreated_Smc1a_down', 'untreated_Arid1a_up', 'untreated_Arid1a_down', 'untreated_Elp2_down', 'untreated_Prmt2_up', 'untreated_Sfpq_up', 'untreated_Sfpq_down', 'untreated_Ifnar1_up', 'untreated_Ifnar1_down', 'untreated_Ube2d3_down', 'untreated_Ep300_up', 'untreated_Ep300_down', 'untreated_Arid4a_up', 'untreated_Ppm1g_down', 'untreated_Brd1_up', 'untreated_Brd1_down', 'untreated_Kdm5a_up', 'untreated_Tyk2_down', 'untreated_Ywhaz_down', 'untreated_Irf9_up', 'untreated_Irf9_down', 'untreated_Smarca5_up', 'untreated_Smarca5_down', 'untreated_Chd1_down', 'untreated_Yeats4_up', 'untreated_Yeats4_down', 'untreated_Pbrm1_down', 'untreated_Sirt2_down', 'untreated_Senp3_down', 'untreated_Tlk2_up', 'untreated_Chtop_down', 'untreated_Hdac2_down', 'untreated_Prmt1_down', 'untreated_Ep400_up', 'untreated_Ep400_down', 'untreated_Se

In [14]:
# turn every gene collection into a plain list of upper-case strings
# (only the case is changed; no ID conversion is performed here)
for key, genes in gene_lists.items():
    gene_lists[key] = [str(gene).upper() for gene in genes]

In [15]:
# Run the enrichment helper on all gene lists for the configured databases;
# results go to a sub-folder of dir_results named after the DEA folder.
# The helper module is imported as `utils_GSEApy`; the former `util_GSEApy`
# spelling raises NameError on a fresh kernel.
utils_GSEApy.do_enrichment_all(gene_lists=gene_lists,
                               background=background,
                               databases=databases,
                               databases_strict=databases_strict,
                               adj_pvalue=adj_pvalue,
                               adj_pvalue_strict=adj_pvalue_strict,
                               dir_results=os.path.join(dir_results, dea_folder),
                               top_n=top_n
                              )



BioPlanet_2019




KEGG_2019_Mouse




WikiPathways_2019_Mouse




GO_Biological_Process_2021




BioPlanet_2019 926
KEGG_2019_Mouse 233
WikiPathways_2019_Mouse 128
GO_Biological_Process_2021 3216


## DEA analysis within KOs between conditions

In [16]:
# DEA input folder and top_n setting depend on whether the mixscape variant is used
dea_folder, top_n = ('DEA_KO_mixscape', 3) if mixscape else ('DEA_KO', 1)

In [19]:
# Collect the significant DEA genes per KO (plus 'NonTargeting'), pair of
# conditions and direction of change.
# Keys have the form "<KO>_<cond2>_<cond1>_<up|down>"; comparisons without a
# result file or without significant genes are left out.
gene_lists = dict()

for KO_gene in KO_genes+['NonTargeting']:
    # every pair of conditions once; cond1 comes before cond2 in `conditions`
    # (position taken from enumerate instead of a list.index lookup)
    for idx, cond1 in enumerate(conditions[:-1]):
        for cond2 in conditions[idx+1:]:

            tmp_path = os.path.join(dir_base, dea_folder, 'KO150_DEA_RNA_{}_{}_{}.csv'.format(KO_gene, cond2, cond1))

            # not every KO/condition-pair combination has a DEA result file
            if not os.path.isfile(tmp_path):
                continue

            # load DEA analysis results
            tmp_results = pd.read_csv(tmp_path, index_col=0)

            # use the configured threshold (stat_sign) rather than a hard-coded 0.05,
            # so that changing the config actually changes the selection
            significant = tmp_results[pval] < stat_sign
            direction_masks = {
                'up': tmp_results['avg_log2FC'] > 0,
                'down': tmp_results['avg_log2FC'] < 0,
            }

            for direction, mask in direction_masks.items():
                tmp_genes = tmp_results.loc[significant & mask].index.unique()

                if len(tmp_genes) == 0:
                    continue

                gene_lists[KO_gene+"_"+cond2+"_"+cond1+"_"+direction] = tmp_genes

print(len(gene_lists))
gene_lists.keys()

86


dict_keys(['Spi1_LO28-6h_untreated_up', 'Spi1_LO28-6h_untreated_down', 'Spi1_LO28-24h_untreated_up', 'Spi1_LO28-24h_untreated_down', 'Spi1_LO28-24h_LO28-6h_up', 'Spi1_LO28-24h_LO28-6h_down', 'Dnmt1_LO28-6h_untreated_up', 'Dnmt1_LO28-6h_untreated_down', 'Smc1a_LO28-6h_untreated_up', 'Smc1a_LO28-6h_untreated_down', 'Smc1a_LO28-24h_untreated_up', 'Smc1a_LO28-24h_untreated_down', 'Smc1a_LO28-24h_LO28-6h_up', 'Smc1a_LO28-24h_LO28-6h_down', 'Arid1a_LO28-6h_untreated_up', 'Arid1a_LO28-6h_untreated_down', 'Sfpq_LO28-6h_untreated_up', 'Sfpq_LO28-6h_untreated_down', 'Sfpq_LO28-24h_untreated_up', 'Sfpq_LO28-24h_untreated_down', 'Sfpq_LO28-24h_LO28-6h_up', 'Sfpq_LO28-24h_LO28-6h_down', 'Ifnar1_LO28-6h_untreated_up', 'Ifnar1_LO28-6h_untreated_down', 'Ifnar1_LO28-24h_untreated_up', 'Ifnar1_LO28-24h_untreated_down', 'Ifnar1_LO28-24h_LO28-6h_up', 'Ifnar1_LO28-24h_LO28-6h_down', 'Ep300_LO28-6h_untreated_up', 'Ep300_LO28-6h_untreated_down', 'Tyk2_LO28-24h_untreated_up', 'Tyk2_LO28-24h_untreated_down', '

In [20]:
# turn every gene collection into a plain list of upper-case strings
# (only the case is changed; no ID conversion is performed here)
for key, genes in gene_lists.items():
    gene_lists[key] = [str(gene).upper() for gene in genes]

In [21]:
# Run the enrichment helper on all gene lists for the configured databases;
# results go to a sub-folder of dir_results named after the DEA folder.
# The helper module is imported as `utils_GSEApy`; the former `util_GSEApy`
# spelling raises NameError on a fresh kernel.
utils_GSEApy.do_enrichment_all(gene_lists=gene_lists,
                               background=background,
                               databases=databases,
                               databases_strict=databases_strict,
                               adj_pvalue=adj_pvalue,
                               adj_pvalue_strict=adj_pvalue_strict,
                               dir_results=os.path.join(dir_results, dea_folder),
                               top_n=top_n
                              )



BioPlanet_2019




KEGG_2019_Mouse




WikiPathways_2019_Mouse




GO_Biological_Process_2021




BioPlanet_2019 815
KEGG_2019_Mouse 226
WikiPathways_2019_Mouse 110
GO_Biological_Process_2021 2129
