In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats as st
import os

In [2]:
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import hypergeom

# Hypergeometric test:
# compute the probability of drawing at least x red marbles in N draws (without
# replacement) from a jar that contains n red marbles out of M marbles in total.
def hyper_test(x, M, n, N):
    """Return the upper-tail hypergeometric p-value P(X >= x).

    Arguments are forwarded to ``scipy.stats.hypergeom.sf`` in the order
    (M, n, N): ``M`` is the population size, ``n`` the number of successes
    in the population and ``N`` the number of draws.
    """
    # sf(k) is P(X > k); evaluating it at x - 1 makes the tail include x itself.
    return hypergeom.sf(x - 1, M, n, N)

def get_BH_correct_pval(df, pval_col):
    """Return FDR-adjusted p-values for column ``pval_col`` of ``df``.

    The column is passed to statsmodels' ``fdrcorrection`` with
    ``alpha=0.05``, ``method='indep'`` and ``is_sorted=False``; the adjusted
    p-values come back in the same order as the input rows.
    """
    raw_pvals = df[pval_col]
    # fdrcorrection returns (reject flags, adjusted p-values); only the second is used.
    _, adjusted_pvals = fdrcorrection(raw_pvals, alpha=0.05, method='indep', is_sorted=False)
    return adjusted_pvals

In [3]:
# 07142023 - enrichment test on each condition-specific hit set
# Load the GO term table: tab-separated, first column used as the index.
# The functions below read its 'Name' and 'Genes' columns ('Genes' is a ';'-separated list).
df_gobp_standard_5_200 = pd.read_csv('../data/input/go_bp_standard_Calb_5_200terms.txt', sep='\t', index_col=0)

In [4]:
df_gobp_standard_5_200

Unnamed: 0_level_0,Name,Genes,Length
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,GO:0000001 mitochondrion inheritance,orf19.6987;orf19.6071;orf19.4715;orf19.2987;or...,18
2,GO:0000002 mitochondrial genome maintenance,orf19.7538;orf19.5704;orf19.439;orf19.6672;orf...,31
3,GO:0000003 reproduction,orf19.7247;orf19.971;orf19.5292;orf19.796;orf1...,100
4,GO:0000011 vacuole inheritance,orf19.2245;orf19.7611;orf19.6364;orf19.1513;or...,20
5,GO:0000018 regulation of DNA recombination,orf19.4206;orf19.1755;orf19.2873;orf19.2983;or...,31
...,...,...,...
5462,GO:2001173 regulation of histone H2B conserved...,orf19.2983;orf19.7047;orf19.7067;orf19.7116;or...,5
5465,GO:2001207 regulation of transcription elongat...,orf19.2983;orf19.7047;orf19.4123;orf19.7067;or...,8
5467,GO:2001209 positive regulation of transcriptio...,orf19.2983;orf19.7047;orf19.4123;orf19.7067;or...,8
5475,GO:2001251 negative regulation of chromosome o...,orf19.2402;orf19.3761;orf19.643;orf19.1185;orf...,33


In [5]:
# Load the score summary table; the first CSV column becomes the index.
# The enrichment functions below read its '<condition>_mean' and '<condition>_fdr' columns.
df_scores = pd.read_csv('../data/output/summary_mod_t_test_compare_median_normvar.csv', index_col=0)

In [6]:
df_scores.columns

Index(['plate', 'FBS_mean', 'FBS_fdr', 'Iron_mean', 'Iron_fdr', 'Temp37_mean',
       'Temp37_fdr', 'YPD_mean', 'YPD_fdr', 'NaCl_mean', 'NaCl_fdr',
       'SDS_mean', 'SDS_fdr', 'Sorbitol_mean', 'Sorbitol_fdr', 'YNB_mean',
       'YNB_fdr'],
      dtype='object')

In [9]:
# Write an enrichment function for each condition:
# population = 2k+ non-nan strains 
# success in population = # hits
# sample = # strains annotated to each GO term 
# success in sample: # hits in sample
def go_enrich_condition_hits(df_score, df_go_standard, 
                             condition_list=['FBS', 'Iron', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                            mean_cutoff=2, fdr_cutoff=0.05, output_direc='./'):
    """Run GO enrichment on the hits of each condition and on the hits shared by all.

    For every condition in ``condition_list`` the background is the set of
    rows whose ``<cond>_mean`` is not null, and a hit is a background row
    with ``<cond>_mean >= mean_cutoff`` and ``<cond>_fdr <= fdr_cutoff``.
    Each hit set is passed to ``enrich_go``. The intersection of the hit sets
    over all listed conditions is then enriched against every row of
    ``df_score`` under the condition label ``'pan_essential'``.

    The background and hit counts of each condition are printed. The
    enrichment tables are not returned; ``enrich_go`` writes them to disk
    when ``output_direc`` is not None.

    Returns
    -------
    numpy.ndarray
        Sorted labels of the rows that are hits in every listed condition.
    """
    shared_hits = df_score.index
    for cond in condition_list:
        print(cond)
        mean_scores = df_score[cond + '_mean']
        fdr_scores = df_score[cond + '_fdr']

        # Background: rows that have a mean score in this condition.
        mean_tested = mean_scores[mean_scores.notnull()]
        fdr_tested = fdr_scores.loc[mean_tested.index]

        passes_mean = mean_tested[mean_tested >= mean_cutoff].index
        passes_fdr = fdr_tested[fdr_tested <= fdr_cutoff].index
        cond_hits = np.intersect1d(passes_mean, passes_fdr)

        # Keep only the rows that have been hits in every condition so far.
        shared_hits = np.intersect1d(shared_hits, cond_hits)

        print(len(mean_tested), len(cond_hits))

        enrich_go(df_go_standard, strains_total=mean_tested.index,
                  strains_hit=cond_hits, output_direc=output_direc, condition=cond)
        print()

    print('#pan-essential:', len(shared_hits))
    enrich_go(df_go_standard, strains_total=df_score.index,
              strains_hit=shared_hits, output_direc=output_direc, condition='pan_essential')
    return shared_hits
    
    
def enrich_go(df_go_standard, strains_total, strains_hit, output_direc, condition,
              min_hits_in_term=2):
    """Hypergeometric GO-term enrichment of a hit set against a background set.

    Parameters
    ----------
    df_go_standard : pandas.DataFrame
        GO term table with a 'Name' column and a 'Genes' column holding a
        ';'-separated list of gene identifiers.
    strains_total : sequence
        Background identifiers; its length is used as the population size.
    strains_hit : sequence
        Hit identifiers; its length is used as the number of successes.
    output_direc : str or None
        Path prefix for the result file. The file name is built by plain
        string concatenation (``output_direc + condition + '_go_enrich_results.tsv'``),
        so a directory must end with a path separator. Nothing is written
        when None.
    condition : str
        Label stored in the 'condition' column and used in the file name.
    min_hits_in_term : int, default 2
        A term is tested only when at least this many of its genes are hits.
        The default reproduces the previously hard-coded "more than one hit" rule.

    Returns
    -------
    pandas.DataFrame
        One row per tested term, indexed like ``df_go_standard`` and sorted by
        'cond_enrich_fdr', with columns GO_term, GO_term_genes,
        num_strains_total, num_strains_total_GOannotated, num_strain_hits,
        num_strains_hits_GOannotated, cond_enrich_pval, cond_enrich_fdr and
        condition. The frame is empty when no term reaches ``min_hits_in_term``.
    """
    num_population = len(strains_total)
    num_hits = len(strains_hit)
    population = set(strains_total)
    hits = set(strains_hit)

    # Work on an explicit copy so the new columns never touch (or warn about)
    # the caller's GO table.
    df_enrich = df_go_standard[['Name', 'Genes']].copy()
    df_enrich.columns = ['GO_term', 'GO_term_genes']

    # Per-term overlap with the background and with the hits.
    term_gene_sets = [set(genes.split(';')) for genes in df_enrich['GO_term_genes']]
    n_term_in_pop = np.array([len(term & population) for term in term_gene_sets], dtype=int)
    n_term_in_hits = np.array([len(term & hits) for term in term_gene_sets], dtype=int)

    tested = n_term_in_hits >= min_hits_in_term
    df_enrich = df_enrich.loc[tested].copy()
    n_term_in_pop = n_term_in_pop[tested]
    n_term_in_hits = n_term_in_hits[tested]

    # Count columns are stored as floats, as they were when they were
    # initialised with NaN, so written TSVs keep the same formatting.
    df_enrich['num_strains_total'] = float(num_population)
    df_enrich['num_strains_total_GOannotated'] = n_term_in_pop.astype(float)
    df_enrich['num_strain_hits'] = float(num_hits)
    df_enrich['num_strains_hits_GOannotated'] = n_term_in_hits.astype(float)

    if len(df_enrich) > 0:
        # P(at least this many hits among the term's background genes), with the
        # population size as M, the hit count as n and the term size as N.
        df_enrich['cond_enrich_pval'] = hyper_test(x=n_term_in_hits, M=num_population,
                                                   n=num_hits, N=n_term_in_pop)
        df_enrich['cond_enrich_fdr'] = get_BH_correct_pval(df_enrich, pval_col='cond_enrich_pval')
    else:
        # No term qualified: keep the full column layout without calling the
        # statistics helpers on empty input.
        df_enrich['cond_enrich_pval'] = np.array([], dtype=float)
        df_enrich['cond_enrich_fdr'] = np.array([], dtype=float)

    df_enrich = df_enrich.sort_values(by='cond_enrich_fdr', ascending=True)
    df_enrich['condition'] = condition

    if output_direc is not None:
        out_path = output_direc + condition + '_go_enrich_results.tsv'
        out_dir = os.path.dirname(out_path)
        if out_dir:
            # Create the target directory instead of failing on a fresh checkout.
            os.makedirs(out_dir, exist_ok=True)
        # The GO ID index is intentionally not written, as before.
        df_enrich.to_csv(out_path, sep='\t', index=False)
    return df_enrich

In [None]:
go_enrich_condition_hits(df_scores, df_gobp_standard_5_200, output_direc='../data/output/go_bp_enrich/')

In [None]:
go_bp_enrich_direc = '../data/output/go_bp_enrich/'

In [None]:
# 08132023 - GO enrichment on condition-specific hits excluding pan-essential genes
# Note: YPD should be dropped before this step.
# = Exclude 171 pan-essentials + 36 genes essential in every condition except YPD

In [58]:
# Copy of the score table without the two YPD columns; df_scores itself is left unchanged.
df_scores_noYPD = df_scores.drop(['YPD_mean', 'YPD_fdr'], axis=1)

In [60]:
# Collect the rows that are hits in all seven non-YPD conditions.
# output_direc=None means no enrichment tables are written; only the returned hit set is kept.
pan_essential_hits_noYPD = go_enrich_condition_hits(df_scores_noYPD, df_gobp_standard_5_200, 
                                                    condition_list=['FBS', 'Iron', 'Temp37', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                                                    output_direc=None)

FBS
2219 308

Iron
2223 242

Temp37
2202 313

NaCl
2187 296

SDS
2227 273

Sorbitol
2224 262

YNB
2229 278

#pan-essential: 207


In [61]:
len(pan_essential_hits_noYPD) # 171+36

207

In [63]:
def go_enrich_condition_hits_no_pan(df_score, df_go_standard, pan_essential_hits,
                             condition_list=['FBS', 'Iron', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                            mean_cutoff=2, fdr_cutoff=0.05, output_direc='./'):
    """Per-condition GO enrichment after removing a given set of hits.

    For each condition, hits are the rows with a non-null ``<cond>_mean``
    that satisfy ``<cond>_mean >= mean_cutoff`` and
    ``<cond>_fdr <= fdr_cutoff``. The labels in ``pan_essential_hits`` are
    removed from both the hit set and the background (rows with a non-null
    mean) before calling ``enrich_go``.

    For each condition this prints the reduced background size, the number
    of hits before removal and the number of hits after removal.

    Returns
    -------
    pandas.DataFrame
        The per-condition ``enrich_go`` tables concatenated in
        ``condition_list`` order.
    """
    per_condition_tables = []
    for cond in condition_list:
        print(cond)
        mean_scores = df_score[cond + '_mean']
        fdr_scores = df_score[cond + '_fdr']
        mean_tested = mean_scores[mean_scores.notnull()]
        fdr_tested = fdr_scores.loc[mean_tested.index]

        passes_mean = mean_tested[mean_tested >= mean_cutoff].index
        passes_fdr = fdr_tested[fdr_tested <= fdr_cutoff].index
        cond_hits = np.intersect1d(passes_mean, passes_fdr)

        # Remove the excluded labels from both the hits and the background.
        excluded = set(pan_essential_hits)
        hits_kept = list(set(cond_hits).difference(excluded))
        background = list(set(mean_tested.index).difference(excluded))

        print(len(background), len(cond_hits), len(hits_kept))

        per_condition_tables.append(
            enrich_go(df_go_standard, strains_total=background,
                      strains_hit=hits_kept, output_direc=output_direc, condition=cond)
        )
        print()
    return pd.concat(per_condition_tables)

In [64]:
df_enrich_no_pan_noYPD = go_enrich_condition_hits_no_pan(df_scores_noYPD, df_gobp_standard_5_200, 
                                                   pan_essential_hits=pan_essential_hits_noYPD,
                                                   condition_list=['FBS', 'Iron', 'Temp37', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                                output_direc=go_bp_enrich_direc+'go_bp_enrich_exclude_pan&YPD/')

FBS
2012 308 101

Iron
2016 242 35

Temp37
1995 313 106

NaCl
1980 296 89

SDS
2020 273 66

Sorbitol
2017 262 55

YNB
2022 278 71



In [65]:
df_enrich_no_pan_noYPD

Unnamed: 0_level_0,GO_term,GO_term_genes,num_strains_total,num_strains_total_GOannotated,num_strain_hits,num_strains_hits_GOannotated,cond_enrich_pval,cond_enrich_fdr,condition
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
156,GO:0000727 double-strand break repair via brea...,orf19.4206;orf19.4545;orf19.1988;orf19.2740;or...,2012.0,13.0,101.0,5.0,0.000270,0.029198,FBS
575,GO:0006490 oligosaccharide-lipid intermediate ...,orf19.4410;orf19.7391;orf19.1092;orf19.1203.1;...,2012.0,4.0,101.0,3.0,0.000474,0.029198,FBS
476,GO:0006310 DNA recombination,orf19.4206;orf19.5873;orf19.967;orf19.5844;orf...,2012.0,48.0,101.0,9.0,0.000462,0.029198,FBS
1936,GO:0031123 RNA 3'-end processing,orf19.2402;orf19.4538;orf19.684;orf19.3333;orf...,2012.0,14.0,101.0,5.0,0.000403,0.029198,FBS
3548,GO:0051640 organelle localization,orf19.6479;orf19.3465;orf19.3248;orf19.6346;or...,2012.0,46.0,101.0,9.0,0.000330,0.029198,FBS
...,...,...,...,...,...,...,...,...,...
3059,GO:0045927 positive regulation of growth,orf19.7247;orf19.971;orf19.5672;orf19.3256;orf...,2022.0,64.0,71.0,2.0,0.666793,0.670062,YNB
4248,GO:0090033 positive regulation of filamentous ...,orf19.7247;orf19.971;orf19.5672;orf19.3256;orf...,2022.0,64.0,71.0,2.0,0.666793,0.670062,YNB
4634,GO:1900430 positive regulation of filamentous ...,orf19.7247;orf19.971;orf19.5672;orf19.3256;orf...,2022.0,64.0,71.0,2.0,0.666793,0.670062,YNB
934,GO:0008202 steroid metabolic process,orf19.5535;orf19.3240;orf19.3642;orf19.1478;or...,2022.0,65.0,71.0,2.0,0.675209,0.676860,YNB


In [74]:
df_enrich_no_pan_noYPD.to_csv(go_bp_enrich_direc+'go_bp_enrich_exclude_pan&YPD/all_conditions_go_enrich_results.tsv', sep='\t', index=None)

In [7]:
# 08152023 - Do enrichment analysis on the subgroup of genes considering all tested mutants as background
# Build a function that consider genes exclusively essential in the provided conditions
def go_enrich_condition_hits_exclusive(df_score_all, df_go_standard, 
                             all_condition_list=['FBS', 'Iron', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                                       subset_condition_list=['YPD'],
                            mean_cutoff=2, fdr_cutoff=0.05, output_direc='./'):
    """GO enrichment of rows that are hits only in ``subset_condition_list``.

    A row is a hit in a condition when its ``<cond>_mean`` is not null and is
    ``>= mean_cutoff``, and its ``<cond>_fdr`` is ``<= fdr_cutoff``.

    1. Hits of every condition in ``all_condition_list`` that is not in
       ``subset_condition_list`` are pooled and dropped from the score table.
    2. On the remaining rows, hits are called for each subset condition and
       intersected across the subset.
    3. That intersection is passed to ``enrich_go``. The background is the
       union, over the subset conditions, of the pooled "other" hits and the
       remaining rows with a non-null mean. The condition label is the subset
       names joined with ';'.

    Only the ``_mean``/``_fdr`` columns of ``all_condition_list`` are read
    from ``df_score_all``. Progress counts are printed along the way.

    Returns
    -------
    pandas.DataFrame
        The table returned by ``enrich_go`` for the exclusive hit set.
    """
    def _tested_and_hits(frame, cond):
        # Labels with a non-null mean in `cond`, and the subset of them passing both cutoffs.
        mean_scores = frame[cond + '_mean']
        fdr_scores = frame[cond + '_fdr']
        mean_tested = mean_scores[mean_scores.notnull()]
        fdr_tested = fdr_scores.loc[mean_tested.index]
        passes_mean = mean_tested[mean_tested >= mean_cutoff].index
        passes_fdr = fdr_tested[fdr_tested <= fdr_cutoff].index
        return mean_tested.index, np.intersect1d(passes_mean, passes_fdr)

    # Restrict to the requested conditions, so a condition left out of
    # all_condition_list is ignored entirely.
    score_columns = ([c + '_mean' for c in all_condition_list]
                     + [c + '_fdr' for c in all_condition_list])
    df_score = df_score_all[score_columns]

    # Step 1: pool the hits of every condition outside the subset.
    any_other_hits = []
    for cond in all_condition_list:
        if cond in subset_condition_list:
            continue
        _, other_hits = _tested_and_hits(df_score, cond)
        any_other_hits = np.union1d(any_other_hits, other_hits)

    print("All other hits to exclude:", len(any_other_hits))

    # Step 2: call hits for the subset conditions on the rows that are not hits elsewhere.
    df_score_no_other_hits = df_score.drop(any_other_hits)
    exclusive_hits = df_score.index
    background_labels = []
    for cond in subset_condition_list:
        print(cond)
        tested_index, cond_hits = _tested_and_hits(df_score_no_other_hits, cond)

        # The excluded "other" hits stay in the background of the test.
        cond_background = np.union1d(any_other_hits, tested_index)
        background_labels.extend(cond_background.tolist())

        print(len(cond_background), len(cond_hits))

        exclusive_hits = np.intersect1d(exclusive_hits, cond_hits)
        print()

    background_labels = np.unique(background_labels)
    print('#pan-essential:', len(exclusive_hits))
    return enrich_go(df_go_standard, strains_total=background_labels,
                     strains_hit=exclusive_hits, output_direc=output_direc,
                     condition=';'.join(subset_condition_list))
        

In [98]:
# 1. 39 genes only essential to YPD but not to others
df_enrich_YPDonly = go_enrich_condition_hits_exclusive(df_scores, df_gobp_standard_5_200,
                             all_condition_list=['FBS', 'Iron', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                                       subset_condition_list=['YPD'],
                            mean_cutoff=2, fdr_cutoff=0.05, output_direc=None)

All other hits to exclude: 389
YPD
2228 39

#pan-essential: 39


In [99]:
df_enrich_YPDonly.to_csv(go_bp_enrich_direc+'go_bp_enrich_subsets/YPD_only_39genes_go_enrich_results.tsv', sep='\t', index=None)

In [100]:
df_enrich_YPDonly

Unnamed: 0_level_0,GO_term,GO_term_genes,num_strains_total,num_strains_total_GOannotated,num_strain_hits,num_strains_hits_GOannotated,cond_enrich_pval,cond_enrich_fdr,condition
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1126,GO:0009303 rRNA transcription,orf19.2873;orf19.274;orf19.6921;orf19.519;orf1...,2228.0,6.0,39.0,3.0,0.000096,0.011774,YPD
2684,GO:0042790 nucleolar large rRNA transcription ...,orf19.6921;orf19.519;orf19.4896;orf19.1589;orf...,2228.0,2.0,39.0,2.0,0.000299,0.018369,YPD
4430,GO:0098781 ncRNA transcription,orf19.4537;orf19.7538;orf19.2847;orf19.2831;or...,2228.0,12.0,39.0,3.0,0.000979,0.040130,YPD
1947,GO:0031145 anaphase-promoting complex-dependen...,orf19.5954;orf19.3471;orf19.5056;orf19.2084;or...,2228.0,5.0,39.0,2.0,0.002889,0.048075,YPD
3987,GO:0071043 CUT metabolic process,orf19.4582;orf19.1304;orf19.58;orf19.5229;orf1...,2228.0,7.0,39.0,2.0,0.005933,0.048075,YPD
...,...,...,...,...,...,...,...,...,...
3041,GO:0045892 negative regulation of DNA-template...,orf19.7247;orf19.1755;orf19.1343;orf19.1059;or...,2228.0,88.0,39.0,2.0,0.460687,0.472204,YPD
5042,GO:1903507 negative regulation of nucleic acid...,orf19.7247;orf19.1755;orf19.1343;orf19.1059;or...,2228.0,88.0,39.0,2.0,0.460687,0.472204,YPD
4547,GO:0140352 export from cell,orf19.5604;orf19.2078;orf19.136;orf19.6479;orf...,2228.0,91.0,39.0,2.0,0.478394,0.482316,YPD
3453,GO:0051253 negative regulation of RNA metaboli...,orf19.7247;orf19.1755;orf19.1343;orf19.1059;or...,2228.0,91.0,39.0,2.0,0.478394,0.482316,YPD


In [101]:
# 2. 18 genes only essential to YNB37 but not to others
df_enrich_Temp37only = go_enrich_condition_hits_exclusive(df_scores, df_gobp_standard_5_200,
                             all_condition_list=['FBS', 'Iron', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                                       subset_condition_list=['Temp37'],
                            mean_cutoff=2, fdr_cutoff=0.05, output_direc=None)

All other hits to exclude: 410
Temp37
2218 18

#pan-essential: 18


In [102]:
df_enrich_Temp37only.to_csv(go_bp_enrich_direc+'go_bp_enrich_subsets/Temp37_only_18genes_go_enrich_results.tsv', sep='\t', index=None)

In [103]:
df_enrich_Temp37only

Unnamed: 0_level_0,GO_term,GO_term_genes,num_strains_total,num_strains_total_GOannotated,num_strain_hits,num_strains_hits_GOannotated,cond_enrich_pval,cond_enrich_fdr,condition
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1999,GO:0031505 fungal-type cell wall organization,orf19.5994;orf19.1711;orf19.136;orf19.7417;orf...,2218.0,90.0,18.0,4.0,0.005012,0.036344,Temp37
4222,GO:0075259 spore-bearing structure development,orf19.4015;orf19.3013;orf19.2886;orf19.3166;or...,2218.0,14.0,18.0,2.0,0.005345,0.036344,Temp37
4086,GO:0071555 cell wall organization,orf19.5994;orf19.1711;orf19.136;orf19.7417;orf...,2218.0,91.0,18.0,4.0,0.005216,0.036344,Temp37
2977,GO:0045229 external encapsulating structure or...,orf19.5994;orf19.1711;orf19.136;orf19.7417;orf...,2218.0,91.0,18.0,4.0,0.005216,0.036344,Temp37
3324,GO:0048608 reproductive structure development,orf19.4015;orf19.3013;orf19.2886;orf19.3166;or...,2218.0,14.0,18.0,2.0,0.005345,0.036344,Temp37
483,GO:0006338 chromatin remodeling,orf19.5501;orf19.4904;orf19.3593;orf19.1059;or...,2218.0,75.0,18.0,3.0,0.021027,0.089364,Temp37
3569,GO:0051701 biological process involved in inte...,orf19.6745;orf19.7247;orf19.7417;orf19.2646;or...,2218.0,72.0,18.0,3.0,0.018857,0.089364,Temp37
2909,GO:0044403 biological process involved in symb...,orf19.6745;orf19.7247;orf19.7417;orf19.2646;or...,2218.0,72.0,18.0,3.0,0.018857,0.089364,Temp37
4648,GO:1900445 positive regulation of filamentous ...,orf19.971;orf19.5672;orf19.3256;orf19.5908;orf...,2218.0,39.0,18.0,2.0,0.038608,0.119333,Temp37
285,GO:0002833 positive regulation of response to ...,orf19.971;orf19.5672;orf19.3256;orf19.5908;orf...,2218.0,39.0,18.0,2.0,0.038608,0.119333,Temp37


In [105]:
# 3. 12 genes only essential to YPD & SERUM (FBS) but not to others
df_enrich_YPD_FBS_only = go_enrich_condition_hits_exclusive(df_scores, df_gobp_standard_5_200,
                             all_condition_list=['FBS', 'Iron', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                                       subset_condition_list=['YPD', 'FBS'],
                            mean_cutoff=2, fdr_cutoff=0.05, output_direc=None)

All other hits to exclude: 370
YPD
2226 51

FBS
2229 19

#pan-essential: 12


In [107]:
df_enrich_YPD_FBS_only.to_csv(go_bp_enrich_direc+'go_bp_enrich_subsets/YPD_FBS_only_12genes_go_enrich_results.tsv', sep='\t', index=None)

In [106]:
df_enrich_YPD_FBS_only

Unnamed: 0_level_0,GO_term,GO_term_genes,num_strains_total,num_strains_total_GOannotated,num_strain_hits,num_strains_hits_GOannotated,cond_enrich_pval,cond_enrich_fdr,condition
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
514,GO:0006397 mRNA processing,orf19.1261;orf19.2402;orf19.4538;orf19.684;orf...,2231.0,34.0,12.0,3.0,0.000648,0.017501,YPD;FBS
251,GO:0001932 regulation of protein phosphorylation,orf19.5166;orf19.3256;orf19.7186;orf19.3013;or...,2231.0,35.0,12.0,2.0,0.014302,0.036552,YPD;FBS
3561,GO:0051656 establishment of organelle localiza...,orf19.3465;orf19.3248;orf19.6346;orf19.6987;or...,2231.0,34.0,12.0,2.0,0.013525,0.036552,YPD;FBS
3485,GO:0051338 regulation of transferase activity,orf19.4616;orf19.5166;orf19.5954;orf19.3256;or...,2231.0,30.0,12.0,2.0,0.010613,0.036552,YPD;FBS
3347,GO:0050790 regulation of catalytic activity,orf19.4206;orf19.7250;orf19.5671;orf19.4616;or...,2231.0,74.0,12.0,3.0,0.006214,0.036552,YPD;FBS
3034,GO:0045859 regulation of protein kinase activity,orf19.5166;orf19.3256;orf19.7186;orf19.3013;or...,2231.0,26.0,12.0,2.0,0.008025,0.036552,YPD;FBS
3016,GO:0045787 positive regulation of cell cycle,orf19.7247;orf19.3551;orf19.5166;orf19.2402;or...,2231.0,36.0,12.0,2.0,0.015098,0.036552,YPD;FBS
3822,GO:0065009 regulation of molecular function,orf19.4206;orf19.7250;orf19.7417;orf19.3593;or...,2231.0,86.0,12.0,3.0,0.009455,0.036552,YPD;FBS
2795,GO:0043549 regulation of kinase activity,orf19.5166;orf19.3256;orf19.7186;orf19.3013;or...,2231.0,26.0,12.0,2.0,0.008025,0.036552,YPD;FBS
2722,GO:0043085 positive regulation of catalytic ac...,orf19.4616;orf19.5166;orf19.5954;orf19.3013;or...,2231.0,39.0,12.0,2.0,0.017599,0.036552,YPD;FBS


In [108]:
# 4. 11 genes only essential to YPD & SERUM (FBS) & NaCl but not to others
df_enrich_YPD_FBS_NaCl_only = go_enrich_condition_hits_exclusive(df_scores, df_gobp_standard_5_200,
                             all_condition_list=['FBS', 'Iron', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                                       subset_condition_list=['YPD', 'FBS', 'NaCl'],
                            mean_cutoff=2, fdr_cutoff=0.05, output_direc=None)

All other hits to exclude: 342
YPD
2226 66

FBS
2229 33

NaCl
2206 28

#pan-essential: 11


In [111]:
df_enrich_YPD_FBS_NaCl_only.to_csv(go_bp_enrich_direc+'go_bp_enrich_subsets/YPD_FBS_NaCl_only_11genes_go_enrich_results.tsv', sep='\t', index=None)

In [109]:
df_enrich_YPD_FBS_NaCl_only

Unnamed: 0_level_0,GO_term,GO_term_genes,num_strains_total,num_strains_total_GOannotated,num_strain_hits,num_strains_hits_GOannotated,cond_enrich_pval,cond_enrich_fdr,condition
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3890,GO:0070525 tRNA threonylcarbamoyladenosine met...,orf19.4160;orf19.1305;orf19.3787;orf19.7081;or...,2232.0,6.0,11.0,2.0,0.000328,0.006556,YPD;FBS;NaCl
500,GO:0006369 termination of RNA polymerase II tr...,orf19.2402;orf19.684;orf19.6010.1;orf19.6230;o...,2232.0,10.0,11.0,2.0,0.000973,0.009729,YPD;FBS;NaCl
152,GO:0000722 telomere maintenance via recombination,orf19.2267;orf19.3787;orf19.3752;orf19.4208;or...,2232.0,14.0,11.0,2.0,0.001946,0.009731,YPD;FBS;NaCl
486,GO:0006353 DNA-templated transcription termina...,orf19.1755;orf19.2402;orf19.684;orf19.6010.1;o...,2232.0,14.0,11.0,2.0,0.001946,0.009731,YPD;FBS;NaCl
478,GO:0006312 mitotic recombination,orf19.5873;orf19.2267;orf19.5318;orf19.3787;or...,2232.0,18.0,11.0,2.0,0.003237,0.011969,YPD;FBS;NaCl
1518,GO:0016071 mRNA metabolic process,orf19.4010;orf19.4045;orf19.1261;orf19.2402;or...,2232.0,67.0,11.0,3.0,0.003591,0.011969,YPD;FBS;NaCl
153,GO:0000723 telomere maintenance,orf19.7538;orf19.4045;orf19.1972;orf19.926;orf...,2232.0,32.0,11.0,2.0,0.010106,0.025264,YPD;FBS;NaCl
514,GO:0006397 mRNA processing,orf19.1261;orf19.2402;orf19.4538;orf19.684;orf...,2232.0,34.0,11.0,2.0,0.011369,0.025264,YPD;FBS;NaCl
2084,GO:0032200 telomere organization,orf19.7538;orf19.4045;orf19.5053;orf19.1972;or...,2232.0,33.0,11.0,2.0,0.010729,0.025264,YPD;FBS;NaCl
925,GO:0008033 tRNA processing,orf19.4160;orf19.1305;orf19.1261;orf19.4168;or...,2232.0,39.0,11.0,2.0,0.014816,0.029632,YPD;FBS;NaCl


In [110]:
# 5. 10 genes only essential to NaCl but not to others
df_enrich_NaCl_only = go_enrich_condition_hits_exclusive(df_scores, df_gobp_standard_5_200,
                             all_condition_list=['FBS', 'Iron', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                                       subset_condition_list=['NaCl'],
                            mean_cutoff=2, fdr_cutoff=0.05, output_direc=None)

All other hits to exclude: 418
NaCl
2210 10

#pan-essential: 10


In [113]:
df_enrich_NaCl_only.to_csv(go_bp_enrich_direc+'go_bp_enrich_subsets/NaCl_only_10genes_go_enrich_results.tsv', sep='\t', index=None)

In [112]:
df_enrich_NaCl_only

Unnamed: 0_level_0,GO_term,GO_term_genes,num_strains_total,num_strains_total_GOannotated,num_strain_hits,num_strains_hits_GOannotated,cond_enrich_pval,cond_enrich_fdr,condition
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
403,GO:0006164 purine nucleotide biosynthetic process,orf19.1233;orf19.3260;orf19.3870;orf19.7357;or...,2210.0,19.0,10.0,3.0,6.2e-05,0.000877,NaCl
1111,GO:0009260 ribonucleotide biosynthetic process,orf19.1233;orf19.3260;orf19.3870;orf19.7357;or...,2210.0,20.0,10.0,3.0,7.3e-05,0.000877,NaCl
1044,GO:0009152 purine ribonucleotide biosynthetic ...,orf19.1233;orf19.3260;orf19.3870;orf19.7357;or...,2210.0,17.0,10.0,3.0,4.4e-05,0.000877,NaCl
3172,GO:0046390 ribose phosphate biosynthetic process,orf19.1233;orf19.3260;orf19.3870;orf19.7357;or...,2210.0,22.0,10.0,3.0,9.8e-05,0.000885,NaCl
1054,GO:0009165 nucleotide biosynthetic process,orf19.1233;orf19.7176;orf19.3260;orf19.3870;or...,2210.0,30.0,10.0,3.0,0.000254,0.000919,NaCl
1110,GO:0009259 ribonucleotide metabolic process,orf19.6745;orf19.5293;orf19.1233;orf19.3260;or...,2210.0,29.0,10.0,3.0,0.000229,0.000919,NaCl
4187,GO:0072522 purine-containing compound biosynth...,orf19.1233;orf19.3260;orf19.3870;orf19.7357;or...,2210.0,26.0,10.0,3.0,0.000164,0.000919,NaCl
1042,GO:0009150 purine ribonucleotide metabolic pro...,orf19.6745;orf19.5293;orf19.1233;orf19.3260;or...,2210.0,26.0,10.0,3.0,0.000164,0.000919,NaCl
4703,GO:1901293 nucleoside phosphate biosynthetic p...,orf19.1233;orf19.7176;orf19.3260;orf19.3870;or...,2210.0,31.0,10.0,3.0,0.000281,0.000919,NaCl
402,GO:0006163 purine nucleotide metabolic process,orf19.6745;orf19.5293;orf19.1233;orf19.3260;or...,2210.0,28.0,10.0,3.0,0.000206,0.000919,NaCl


In [114]:
# 6. 9 genes only essential to all conditions except for FeFree (Iron)
df_enrich_all_but_iron = go_enrich_condition_hits_exclusive(df_scores, df_gobp_standard_5_200,
                             all_condition_list=['FBS', 'Iron', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                                       subset_condition_list=['FBS', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                            mean_cutoff=2, fdr_cutoff=0.05, output_direc=None)

All other hits to exclude: 242
FBS
2222 82

Temp37
2204 79

YPD
2217 110

NaCl
2192 75

SDS
2227 41

Sorbitol
2227 41

YNB
2230 44

#pan-essential: 9


In [117]:
df_enrich_all_but_iron.to_csv(go_bp_enrich_direc+'go_bp_enrich_subsets/all_but_iron_9genes_go_enrich_results.tsv', sep='\t', index=None)

In [118]:
df_enrich_all_but_iron

Unnamed: 0_level_0,GO_term,GO_term_genes,num_strains_total,num_strains_total_GOannotated,num_strain_hits,num_strains_hits_GOannotated,cond_enrich_pval,cond_enrich_fdr,condition
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
440,GO:0006261 DNA-templated DNA replication,orf19.5873;orf19.5166;orf19.2740;orf19.3373;or...,2236.0,11.0,9.0,2.0,0.000778,0.012442,FBS;Temp37;YPD;NaCl;SDS;Sorbitol;YNB
439,GO:0006260 DNA replication,orf19.5873;orf19.7538;orf19.5053;orf19.5166;or...,2236.0,18.0,9.0,2.0,0.002132,0.01268,FBS;Temp37;YPD;NaCl;SDS;Sorbitol;YNB
3926,GO:0070786 positive regulation of growth of un...,orf19.5908;orf19.1526;orf19.6010;orf19.5251;or...,2236.0,19.0,9.0,2.0,0.002378,0.01268,FBS;Temp37;YPD;NaCl;SDS;Sorbitol;YNB
470,GO:0006302 double-strand break repair,orf19.4206;orf19.5873;orf19.5053;orf19.4545;or...,2236.0,38.0,9.0,2.0,0.009393,0.029034,FBS;Temp37;YPD;NaCl;SDS;Sorbitol;YNB
3561,GO:0051656 establishment of organelle localiza...,orf19.3465;orf19.3248;orf19.6346;orf19.6987;or...,2236.0,34.0,9.0,2.0,0.007559,0.029034,FBS;Temp37;YPD;NaCl;SDS;Sorbitol;YNB
3924,GO:0070784 regulation of growth of unicellular...,orf19.1666;orf19.2646;orf19.4044;orf19.5908;or...,2236.0,41.0,9.0,2.0,0.010888,0.029034,FBS;Temp37;YPD;NaCl;SDS;Sorbitol;YNB
3417,GO:0051168 nuclear export,orf19.3465;orf19.6346;orf19.3865;orf19.200;orf...,2236.0,48.0,9.0,2.0,0.014759,0.033734,FBS;Temp37;YPD;NaCl;SDS;Sorbitol;YNB
820,GO:0006913 nucleocytoplasmic transport,orf19.5994;orf19.3465;orf19.6346;orf19.6400;or...,2236.0,71.0,9.0,2.0,0.030982,0.038131,FBS;Temp37;YPD;NaCl;SDS;Sorbitol;YNB
3059,GO:0045927 positive regulation of growth,orf19.7247;orf19.971;orf19.5672;orf19.3256;orf...,2236.0,68.0,9.0,2.0,0.02858,0.038131,FBS;Temp37;YPD;NaCl;SDS;Sorbitol;YNB
3418,GO:0051169 nuclear transport,orf19.5994;orf19.3465;orf19.6346;orf19.6400;or...,2236.0,71.0,9.0,2.0,0.030982,0.038131,FBS;Temp37;YPD;NaCl;SDS;Sorbitol;YNB


In [119]:
# 7. 7 genes only essential to SERUM (FBS) but not to others
df_enrich_FBS_only = go_enrich_condition_hits_exclusive(df_scores, df_gobp_standard_5_200,
                             all_condition_list=['FBS', 'Iron', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                                       subset_condition_list=['FBS'],
                            mean_cutoff=2, fdr_cutoff=0.05, output_direc=None)

All other hits to exclude: 421
FBS
2229 7

#pan-essential: 7


In [120]:
df_enrich_FBS_only

Unnamed: 0_level_0,GO_term,GO_term_genes,num_strains_total,num_strains_total_GOannotated,num_strain_hits,num_strains_hits_GOannotated,cond_enrich_pval,cond_enrich_fdr,condition
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [10]:
# 10042023: 
# 36 genes essential to all but YPD
df_enrich_allButYPD = go_enrich_condition_hits_exclusive(df_scores, df_gobp_standard_5_200,
                             all_condition_list=['FBS', 'Iron', 'Temp37', 'YPD', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                                       subset_condition_list=['FBS', 'Iron', 'Temp37', 'NaCl', 'SDS', 'Sorbitol', 'YNB'],
                            mean_cutoff=2, fdr_cutoff=0.05, output_direc=None)

All other hits to exclude: 300
FBS
2222 64

Iron
2225 52

Temp37
2207 91

NaCl
2193 73

SDS
2227 65

Sorbitol
2225 61

YNB
2231 70

#pan-essential: 36


In [13]:
# Export the "all but YPD" enrichment table without its index.
# NOTE(review): every other subset export in this notebook writes to
# go_bp_enrich_direc + 'go_bp_enrich_subsets/...'; this path has an extra
# 'go_bp_enrich_results/' segment. Confirm the intended output directory.
df_enrich_allButYPD.to_csv(go_bp_enrich_direc+'go_bp_enrich_results/go_bp_enrich_subsets/all_but_YPD_36genes_go_enrich_results.tsv', sep='\t', index=None)