In [1]:
%matplotlib inline
from __future__ import division
import numpy as np
import os
import sys
import datetime
from subprocess import call
import subprocess
import glob
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import djPyi2 as DJ
from djPyi2 import Common as CM
from djPyi2 import mpltools

import pandas as pd
pd.options.mode.chained_assignment = None
import csv
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import copy 
import pybedtools as pbt
import ciepy
import cardipspy as cpy
import itertools
import tempfile
import six
import networkx as nx
import scipy.stats as stats
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
from mpl_toolkits.axes_grid1 import  make_axes_locatable
import datetime
import vapeplot
from scipy.stats import mode
# Name of this analysis; used as the sub-directory under <DJ.root>/private_output.
# NOTE(review): `dy_name` is re-assigned in a later cell to the QTL result-set
# name, so its meaning depends on which cells have been run.
dy_name = 'eqtl_enrichments'

# Root output directory for everything this notebook writes; created on first run.
private_out = os.path.join(DJ.root, 'private_output', dy_name)
if not os.path.exists(private_out):
    DJ.makedir(private_out)

import gc
from IPython import display


In [38]:
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector

stats_r = importr('stats')

# from rpy2.robjects.packages import importr
utils = importr('utils')

def add_bh_fdr(top, col):
    """Return a copy of ``top`` with an extra ``fdr_corrected_p`` column.

    The p-values in ``top[col]`` are adjusted with R's ``stats::p.adjust``
    (``method='fdr'``, called through the module-level rpy2 handle
    ``stats_r``). Adjustment is done over all rows of ``top`` at once, so
    the result depends on which rows are passed in.

    Parameters
    ----------
    top : pandas.DataFrame
        Table holding one p-value per row.
    col : str
        Name of the p-value column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``top``; the input frame is not modified.
    """
    adjusted = stats_r.p_adjust(FloatVector(top[col].tolist()), method = 'fdr')
    result = top.copy()
    result['fdr_corrected_p'] = list(adjusted)
    return result

In [3]:
def per_variant_vc_unique(df, col1, col2, id_col, overlapping_sets = True):
    """Build a 2x2 table of unique ``id_col`` counts.

    Rows of the table are ``col1 == True`` / ``col1 == False``; columns are
    "has at least one row with ``col2 == True``" / "only has rows with
    ``col2 == False``". Within each row of the table an id is counted once,
    and an id with any ``col2 == True`` record is counted as True.

    Parameters
    ----------
    df : pandas.DataFrame
    col1, col2 : str
        Boolean columns (category membership and outcome respectively).
    id_col : str
        Column holding the identifier that is de-duplicated.
    overlapping_sets : bool
        When False, ids that appear in the ``col1 == True`` group are
        removed from the ``col1 == False`` group, so the two groups are
        disjoint. When True (default) an id may contribute to both groups.

    Returns
    -------
    list
        ``[[in_true, in_false], [out_true, out_false]]``.
    """
    def _sig_ns_counts(frame):
        # ids with any col2-True row, then ids that only ever have col2-False rows
        sig_ids = frame[(frame[col2] == True)][id_col].unique()
        ns_ids = set(frame[(frame[col2] == False)][id_col].unique()).difference(sig_ids)
        return [len(sig_ids), len(ns_ids)]

    inside = df[(df[col1] == True)]
    outside = df[(df[col1] == False)]

    if not overlapping_sets:
        # drop from the "outside" group every id that is also seen inside
        exclusive_ids = set(outside[id_col].unique()).difference(inside[id_col].unique())
        outside = outside[outside[id_col].isin(exclusive_ids)]

    return [_sig_ns_counts(inside), _sig_ns_counts(outside)]

In [4]:
def vc_to_or(vc, v = False):
    """Run Fisher's exact test on a 2x2 table.

    Parameters
    ----------
    vc : pandas object or 2x2 nested sequence
        With ``v`` falsy: something indexable as ``vc.loc[flag, outcome]``
        with boolean labels (e.g. a value_counts Series); combinations that
        cannot be looked up count as 0. With ``v`` truthy: the finished
        ``[[a, b], [c, d]]`` table, used as is.
    v : bool
        Whether ``vc`` is already a contingency table.

    Returns
    -------
    tuple
        ``(table, odds_ratio, p_fisher)``. ``odds_ratio`` and ``p_fisher``
        are NaN when ``scipy.stats.fisher_exact`` rejects the table.
    """
    def default_loc(df, a, b, default = 0):
        # Best-effort lookup: an absent (a, b) combination means "zero observations".
        # `except Exception` (not a bare except) so Ctrl-C still interrupts the kernel.
        try:
            return df.loc[a, b]
        except Exception:
            return default

    if not v:
        table = [[default_loc(vc, True, True), default_loc(vc, True, False)],
                 [default_loc(vc, False, True), default_loc(vc, False, False)]]
    else:
        table = vc

    try:
        odds_ratio, p_fisher = stats.fisher_exact(table)
    except Exception:
        # np.nan rather than np.NaN: the latter alias no longer exists in NumPy 2.x
        odds_ratio, p_fisher = (np.nan, np.nan)

    return table, odds_ratio, p_fisher

def gather_odds_ratio_data(df, gb1, gb2, bool_col, gb2_bool = True, unique_col = False,
                           overlapping_sets = False):
    """Fisher enrichment of ``bool_col`` against ``gb2`` within each ``gb1`` stratum.

    For every value of ``gb1`` a 2x2 table is built whose rows are
    "flag True / flag False" and whose columns are ``bool_col`` True / False,
    and ``vc_to_or`` is run on it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    gb1 : str
        Stratifying column; one test (per ``gb2`` category) for each value.
    gb2 : str
        The flag column. If ``gb2_bool`` it is used directly as a boolean.
        Otherwise it is treated as categorical and each category is tested
        against all other rows via a temporary ``in_cat`` flag.
    bool_col : str
        Boolean outcome column.
    gb2_bool : bool
        See ``gb2``.
    unique_col : str or False
        If given, tables count unique values of this column (see
        ``per_variant_vc_unique``) instead of rows.
    overlapping_sets : bool
        Forwarded to ``per_variant_vc_unique``; only used with ``unique_col``.

    Returns
    -------
    pandas.DataFrame
        Columns ``[gb1, gb2, 'contingency', 'odds_ratio', 'p_fisher',
        'fdr_corrected_p']``. With ``gb2_bool`` the ``gb2`` column holds the
        column name itself; otherwise it holds the tested category.
        ``fdr_corrected_p`` is computed by ``add_bh_fdr`` over all rows
        returned.
    """
    out_columns = [gb1, gb2, 'contingency', 'odds_ratio', 'p_fisher']
    gb1_cats = df[gb1].unique()

    def _tests_for_flag(frame, flag_col, label):
        # One result row per gb1 stratum for the boolean column `flag_col`.
        rows = []
        if not unique_col:
            # A list (not a tuple) of keys: recent pandas reads a tuple as one single key.
            vc = frame.groupby([gb1, flag_col])[bool_col].value_counts()
            for c1 in gb1_cats:
                v, odds_ratio, p_fisher = vc_to_or(vc.loc[c1])
                rows.append([c1, label, v, odds_ratio, p_fisher])
        else:
            for c1 in gb1_cats:
                stratum = frame[frame[gb1] == c1]
                if stratum.shape[0] > 0:
                    counts = per_variant_vc_unique(stratum, flag_col, bool_col, unique_col,
                                                   overlapping_sets = overlapping_sets)
                    v, odds_ratio, p_fisher = vc_to_or(counts, v = True)
                    rows.append([c1, label, v, odds_ratio, p_fisher])
        return rows

    if gb2_bool:  # gb2 is already a boolean flag
        data = _tests_for_flag(df, gb2, gb2)
    else:
        # gb2 is categorical: test each category against the rest. The helper flag
        # lives on a private copy so the caller's frame does not gain (or lose the
        # previous contents of) an 'in_cat' column.
        data = []
        work = df.copy()
        for c2 in df[gb2].unique():
            work['in_cat'] = (df[gb2] == c2)
            data.extend(_tests_for_flag(work, 'in_cat', c2))

    df_out = pd.DataFrame(data, columns = out_columns).pipe(add_bh_fdr, 'p_fisher')
    return df_out

In [5]:
def annotate_tests_data(df, col = "significant"):
    """Add log-transformed columns to a table of Fisher test results.

    Works on a copy of ``df``. Adds, when the source column is usable:

    * ``-log10p_fisher``  = -log10(``p_fisher``)
    * ``log_odds_ratio``  = log10(``odds_ratio``)
    * ``log2_odds_ratio`` = log2(``odds_ratio``), with infinite values replaced
      (see below)
    * ``log2_odds_ratio_raw`` = log2(``odds_ratio``) before replacement

    The index is reset (the old index becomes a column) before the infinite
    values are handled.

    Infinite ``log2_odds_ratio`` values are replaced by the most extreme finite
    value in the table: -inf by the finite minimum, +inf by the finite maximum.
    If that minimum is >= -0.5 (or there is no finite value) -1 is used instead;
    if the maximum is < 0 (or there is no finite value) 2 is used instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``odds_ratio``; ``p_fisher`` is optional.
    col : str
        Currently unused; kept for backward compatibility.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If ``log2_odds_ratio`` could not be computed (e.g. no ``odds_ratio``
        column).
    """
    # placeholders for -inf / +inf when the finite values give no usable bound
    neg_inf_fallback = -1
    pos_inf_fallback = 2

    df = df.copy()

    def _add_log(frame, dst, src, transform):
        # Best effort: a missing or non-numeric source column simply leaves `dst` out.
        try:
            frame[dst] = transform(frame[src])
        except (KeyError, TypeError, ValueError, AttributeError):
            pass

    _add_log(df, '-log10p_fisher', 'p_fisher', lambda s: np.log10(s) * -1)
    _add_log(df, 'log_odds_ratio', 'odds_ratio', np.log10)
    _add_log(df, 'log2_odds_ratio', 'odds_ratio', np.log2)

    df = df.reset_index()
    df['log2_odds_ratio_raw'] = df['log2_odds_ratio']

    is_neg_inf = df['log2_odds_ratio'] == (-np.inf)
    is_pos_inf = df['log2_odds_ratio'] == (np.inf)

    # Boolean mask instead of indexing with a set of labels: pandas >= 2.0 raises
    # TypeError for set indexers, which previously got swallowed and silently
    # forced the fallback value.
    finite = df.loc[~(is_neg_inf | is_pos_inf), 'log2_odds_ratio']

    if is_neg_inf.any():
        m = finite.min()
        if pd.isnull(m) or m >= -0.5:
            m = neg_inf_fallback
        df.loc[is_neg_inf, 'log2_odds_ratio'] = m

    if is_pos_inf.any():
        m = finite.max()
        if pd.isnull(m) or m < 0:
            m = pos_inf_fallback
        df.loc[is_pos_inf, 'log2_odds_ratio'] = m

    return df

In [6]:
def vc_unique_add_proportion(df, gb, col_unique):
    """Count unique ``col_unique`` values per group and their share of the group total.

    Parameters
    ----------
    df : pandas.DataFrame
    gb : sequence of two column names
        ``gb[0]`` is the grouping column, ``gb[1]`` a boolean flag column.
    col_unique : str
        Column whose distinct values are counted.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``gb[0]``; only the ``gb[1] == True`` rows are kept.
        ``count`` is the number of distinct values with the flag set,
        ``total`` the number of distinct values in the whole ``gb[0]`` group
        and ``fraction`` = ``count / total``.
    """
    def _n_unique(values):
        return len(set(values))

    # Keys are passed as a list: recent pandas treats a tuple as one single key.
    vc = (df.groupby([gb[0], gb[1]])[col_unique].apply(_n_unique)
            .to_frame('count').reset_index())
    vc_tot = df.groupby(gb[0])[col_unique].apply(_n_unique).to_frame('total')
    vc = vc[vc[gb[1]] == True].set_index(gb[0]).join(vc_tot)
    vc['fraction'] = vc['count'] / vc['total']
    return vc

def add_fraction(vc, col, col_bool, count_col = 'count'):
    """Turn a long table of counts into per-group fractions of the True rows.

    Parameters
    ----------
    vc : pandas.DataFrame
        One row per (``col``, ``col_bool``) combination with a count column.
    col : str
        Grouping column; becomes the index of the result.
    col_bool : str
        Boolean column; only rows where it is True are returned.
    count_col : str
        Name of the count column (default ``'count'``).

    Returns
    -------
    pandas.DataFrame
        The True rows, indexed by ``col``, with ``total`` (sum of
        ``count_col`` over the whole group) and ``fraction`` =
        ``count_col / total``.
    """
    vc_tot = vc.groupby(col)[count_col].sum().to_frame('total')
    vc = vc[vc[col_bool] == True].set_index(col).join(vc_tot)
    # use count_col here too; a hard-coded 'count' breaks any non-default count_col
    vc['fraction'] = vc[count_col] / vc['total']
    return vc

def vc_w_prop(df, gb_col, bool_col):
    """Per-group count and proportion of rows where ``bool_col`` is True.

    Returns a frame with one row per ``gb_col`` group that has at least one
    True row, and the columns ``gb_col``, ``bool_col``, ``count`` (number of
    True rows), ``fraction`` (share of True rows in the group) and ``total``
    (all counted rows in the group).
    """
    grouped = df.groupby(gb_col)[bool_col]
    counts = grouped.value_counts().to_frame('count')
    fractions = grouped.value_counts(normalize = True).to_frame('fraction')

    table = counts.join(fractions).reset_index()
    group_totals = table.groupby(gb_col)['count'].sum().to_frame('total')

    true_rows = table[table[bool_col] == True]
    return true_rows.set_index(gb_col).join(group_totals).reset_index()

In [9]:
def add_coding_annot_per_var(per_var, var_egene_df = None):
    """Add per-variant counts of coding / non-coding genes to ``per_var``.

    The counts are taken from the rows of the variant-gene table that are
    flagged ``most_significant_nr`` and are grouped by ``NR_ID``; each count
    is the number of distinct ``feature_id`` values:

    * ``num_coding_egenes_lead`` / ``num_noncoding_egenes_lead`` - rows with
      ``top_hit_final_sig``
    * ``num_coding_egenes`` / ``num_noncoding_egenes`` - rows with
      ``significant``
    * ``num_coding_tested`` / ``num_noncoding_tested`` - all rows

    Parameters
    ----------
    per_var : pandas.DataFrame
        Per-variant table. Counts are aligned on its index, so it is
        presumably indexed by ``NR_ID`` - confirm against the caller. The
        input is not modified.
    var_egene_df : pandas.DataFrame, optional
        Variant-gene table. Defaults to the notebook-level ``var_egene``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``per_var`` with the six columns added. Note that *every*
        missing value in the frame (not only in the new columns) is filled
        with 0.
    """
    source = var_egene if var_egene_df is None else var_egene_df

    # Derive the non-redundant subset up front. Assigning `var_egene_nr` in the
    # middle of the function (after it had already been read) made the name local
    # and the function failed with UnboundLocalError on its first statement.
    nr_rows = source[(source.most_significant_nr)]

    def _n_features(mask):
        # number of distinct feature_id per NR_ID among the selected rows
        return nr_rows[mask].groupby('NR_ID').feature_id.unique().apply(len)

    is_coding = (nr_rows.coding == True)
    is_noncoding = (nr_rows.coding == False)

    per_var = per_var.copy()

    per_var['num_coding_egenes_lead'] = _n_features(is_coding & (nr_rows.top_hit_final_sig))
    per_var['num_noncoding_egenes_lead'] = _n_features(is_noncoding & (nr_rows.top_hit_final_sig))

    per_var['num_coding_egenes'] = _n_features(is_coding & (nr_rows.significant))
    per_var['num_noncoding_egenes'] = _n_features(is_noncoding & (nr_rows.significant))

    per_var['num_coding_tested'] = _n_features(is_coding)
    per_var['num_noncoding_tested'] = _n_features(is_noncoding)

    return per_var.fillna(0)

In [25]:
def compute_enrichment_vs_cat(df, comp_cat, comp_col, gb1, gb2, gb3, order_comp = False, **kwargs):
    """Enrichment of every category of ``comp_col`` against a reference category.

    NOTE: this notebook defines ``compute_enrichment_vs_cat`` again in a later
    cell; the later definition is the one in effect from that cell onward.

    For each category ``b`` the rows belonging to ``comp_cat`` or ``b`` are
    selected, flagged with ``in_cat = (comp_col == b)`` and passed to
    ``gather_odds_ratio_data``. Finally ``comp_cat`` itself is tested against
    all other rows (``comp_category == "others"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    comp_cat : object
        Reference category of ``comp_col``.
    comp_col : str
        Categorical column.
    gb1, gb2, gb3 : str
        Forwarded to ``gather_odds_ratio_data`` as ``gb1``, ``gb2`` and
        ``bool_col``. Pass ``gb2='in_cat'`` to test the flag created here.
    order_comp : list or False
        Explicit list of categories to compare; by default every category
        of ``comp_col`` except ``comp_cat``.
    **kwargs
        Forwarded to ``gather_odds_ratio_data``.

    Returns
    -------
    pandas.DataFrame
        Concatenated, annotated results with ``category`` and
        ``comp_category`` columns. For the pairwise rows ``fdr_corrected_p``
        is recomputed over all pairwise tests together.
    """
    if order_comp:
        cats = order_comp
    else:
        cats = df[comp_col].unique().tolist()
        if comp_cat in cats:
            cats.remove(comp_cat)

    pairwise = []
    for b in cats:
        pair = df[df[comp_col].isin([comp_cat, b])]
        # assign() returns a new frame, so neither the slice nor `df` is written to
        pair = pair.assign(in_cat = (pair[comp_col] == b))
        enr = gather_odds_ratio_data(pair, gb1, gb2, gb3, **kwargs)
        enr['category'] = b
        enr['comp_category'] = comp_cat
        pairwise.append(enr)

    frames = []
    if pairwise:  # pd.concat raises on an empty list
        frames.append(pd.concat(pairwise)
                        .pipe(annotate_tests_data)
                        .pipe(add_bh_fdr, 'p_fisher')
                        .pipe(add_fraction_contingency))

    # reference category against everything else, again without touching the caller's frame
    flagged = df.assign(in_cat = (df[comp_col] == comp_cat))
    vs_others = gather_odds_ratio_data(flagged, gb1, gb2, gb3, **kwargs)
    vs_others['category'] = comp_cat
    vs_others['comp_category'] = "others"
    vs_others = vs_others.pipe(annotate_tests_data).pipe(add_fraction_contingency)
    frames.append(vs_others)

    return pd.concat(frames)

# Data Prep

In [10]:
def add_svtype_cats(all_qtls_filt):
    """Add four collapsed variant-class columns derived from ``SVTYPE_NR``.

    * ``SVTYPE_SUPER``    - STR / INDEL / SNV, everything else ``'SV'``
    * ``SVTYPE_COLLAPSE`` - STR / INDEL / SNV / MEI / CNV, everything else
      ``'other SV'``
    * ``SVTYPE_NR_C``     - LINE1, SVA and ALU become ``'MEI'``, other values
      unchanged
    * ``SVTYPE_NR_C2``    - as ``SVTYPE_NR_C`` and additionally DUP, DEL and
      mCNV become ``'CNV'``

    The columns are written into ``all_qtls_filt`` itself, which is also
    returned so the function can be used with ``.pipe``.
    """
    small_variant_map = {'STR': 'STR', 'SNP': 'SNV', 'SNV': 'SNV',
                         'INDEL': 'INDEL', 'INDEL_DEL': 'INDEL', 'INDEL_INS': 'INDEL'}
    mei_map = {'LINE1': 'MEI', 'SVA': 'MEI', 'ALU': 'MEI'}
    cnv_map = {'DUP': 'CNV', 'DEL': 'CNV', 'mCNV': 'CNV'}

    collapse_map = dict(small_variant_map)
    collapse_map.update(mei_map)
    collapse_map['rMEI'] = 'MEI'
    collapse_map.update(cnv_map)

    mei_cnv_map = dict(mei_map)
    mei_cnv_map.update(cnv_map)

    all_qtls_filt['SVTYPE_SUPER'] = all_qtls_filt.SVTYPE_NR.apply(
        lambda svt: small_variant_map.get(svt, 'SV'))
    all_qtls_filt['SVTYPE_COLLAPSE'] = all_qtls_filt.SVTYPE_NR.apply(
        lambda svt: collapse_map.get(svt, 'other SV'))
    all_qtls_filt['SVTYPE_NR_C'] = all_qtls_filt.SVTYPE_NR.apply(
        lambda svt: mei_map.get(svt, svt))
    all_qtls_filt['SVTYPE_NR_C2'] = all_qtls_filt.SVTYPE_NR.apply(
        lambda svt: mei_cnv_map.get(svt, svt))

    return all_qtls_filt

In [11]:
def fix_svtypes(info_all_rna, suff = '_all'):
    """Relabel mCNV records whose subtype is DEL as plain ``'DEL'``, in place.

    Rows where ``SVTYPE<suff> == 'mCNV'`` and ``SUBTYPE<suff> == 'DEL'`` get
    ``SVTYPE<suff>`` set to ``'DEL'``. Among those rows, the ones with
    ``ID == NR_ID`` additionally get ``SVTYPE_NR`` set to ``'DEL'``.

    The frame is modified in place and nothing is returned.
    """
    svtype_column = "SVTYPE{}".format(suff)
    subtype_column = "SUBTYPE{}".format(suff)

    # both selections are made before anything is written
    mcnv_del = (info_all_rna[svtype_column] == 'mCNV') & (info_all_rna[subtype_column] == 'DEL')
    all_labels = info_all_rna[mcnv_del].index.tolist()
    nr_labels = info_all_rna[mcnv_del & (info_all_rna.ID == info_all_rna.NR_ID)].index.tolist()

    info_all_rna.loc[nr_labels, "SVTYPE_NR"] = 'DEL'
    info_all_rna.loc[all_labels, svtype_column] = 'DEL'

In [12]:
def add_fraction_contingency(df):
    """Add ``fraction`` = first cell of the first contingency row / sum of that row.

    ``df['contingency']`` must hold 2x2 tables ``[[a, b], [c, d]]``; the new
    column is ``a / (a + b)``, or NaN when ``a + b`` is 0.

    The column is written into ``df`` itself, which is also returned so the
    function can be used with ``.pipe``.
    """
    def _first_row_fraction(table):
        first_row = table[0]
        total = sum(first_row)
        if total == 0:
            # an empty first row used to raise ZeroDivisionError for plain ints
            return np.nan
        return first_row[0] / float(total)

    df['fraction'] = df['contingency'].apply(_first_row_fraction)
    return df

In [32]:
# Result-set specific output directory: <private_out>/qtl_results_01_17_v4.
# NOTE(review): this re-uses the name `dy_name` from the import cell with a
# different meaning; `private_out` was already built from the earlier value.
dy_name = 'qtl_results_01_17_v4'
outdir = os.path.join(private_out, dy_name)

if not os.path.exists(outdir):
    DJ.makedir(outdir)

In [11]:
# Load the variant/eGene table from a hard-coded absolute path.
# NOTE(review): the path is machine-specific, and pickles should only be read from a trusted location.
var_egene = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_processing/qtl_results_01_17_v4/var_egene_annot_maf5_sv_only.pkl')

In [113]:
# Add the SVTYPE_SUPER / SVTYPE_COLLAPSE / SVTYPE_NR_C / SVTYPE_NR_C2 columns
# (add_svtype_cats writes them into the frame in place and returns it).
var_egene = var_egene.pipe(add_svtype_cats)

In [114]:
# Keep only the rows flagged `most_significant_nr`.
var_egene_nr = var_egene[var_egene.most_significant_nr]

In [115]:
# NOTE(review): the frames below are boolean-mask subsets that are then assigned
# into; this only runs quietly because the import cell sets
# `pd.options.mode.chained_assignment = None`. Consider explicit `.copy()` calls.

# snp_ids that have at least one significant association with coding == True
inds = var_egene_nr[(var_egene_nr.coding == True) & (var_egene_nr.significant)].snp_id.unique()
var_egene_nr['has_coding_assoc'] = var_egene_nr.snp_id.isin(inds) # any coding egene for this variant

# num signif per gene (is there a bunch of significant in LD?)
# i.e. number of significant rows per feature_id, left-joined back on feature_id
t = var_egene_nr[var_egene_nr.significant].groupby("feature_id").size().to_frame('num_signif_w_egene')
var_egene_nr = var_egene_nr.merge(t, right_index=True, left_on= 'feature_id', how = 'left')


# rows with coding == False whose variant category is not 'intersects_promoter'
var_egene_nr_nc = var_egene_nr[(var_egene_nr.coding == False) & (var_egene_nr.genic_category_variant != 'intersects_promoter')]

# same filter, additionally restricted to near_distal_loop == True
var_egene_nr_nc_nd = var_egene_nr[(var_egene_nr.coding == False) & (var_egene_nr.genic_category_variant != 'intersects_promoter') & (var_egene_nr.near_distal_loop == True)]

# signed copy of min_dist_pc_anchor: the value is negated for rows whose
# category_min is 'inside_distal'
inds = var_egene_nr_nc_nd[var_egene_nr_nc_nd.category_min == 'inside_distal'].index.tolist()
var_egene_nr_nc_nd['min_dist_pc_anchor_mod'] = var_egene_nr_nc_nd.min_dist_pc_anchor
var_egene_nr_nc_nd.loc[inds, 'min_dist_pc_anchor_mod'] = var_egene_nr_nc_nd.loc[inds, 'min_dist_pc_anchor_mod'] * -1

In [116]:
# Persist the non-coding, near-distal-loop subset under `outdir`.
CM.save_dataframe('var_egene_nr_nc_nd', var_egene_nr_nc_nd, outdir, print_vars_recorded_loc=False)

# NOTE(review): the two assignments below look like the reload snippets echoed by
# CM.save_dataframe rather than hand-written code - confirm. If they are executed
# as written, the read_csv result replaces the read_pickle result.
var_egene_nr_nc_nd = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/var_egene_nr_nc_nd.pkl')
var_egene_nr_nc_nd = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/var_egene_nr_nc_nd.tsv', sep='\t')


In [117]:
# Persist the `most_significant_nr` table (with the columns added above) under `outdir`.
CM.save_dataframe('var_egene_nr', var_egene_nr, outdir, print_vars_recorded_loc=False)

# NOTE(review): the two assignments below look like the reload snippets echoed by
# CM.save_dataframe rather than hand-written code - confirm. If they are executed
# as written, the read_csv result replaces the read_pickle result.
var_egene_nr = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/var_egene_nr.pkl')
var_egene_nr = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/var_egene_nr.tsv', sep='\t')


In [14]:
# Load the per-variant annotation table from a hard-coded absolute path.
# NOTE(review): this comes from a different result directory ('qtl_results_v4')
# than the other inputs ('qtl_results_01_17_v4') - confirm this is intended.
per_variant_info = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/evariant_loop_analysis/qtl_results_v4/per_variant_info_annot_update.pkl')

In [20]:
# Add the collapsed SVTYPE_* columns (written into the frame in place by add_svtype_cats).
per_variant_info = per_variant_info.pipe(add_svtype_cats)

# Enrichments Class

In [22]:
### likelihood to be eQTL of variants in each class versus variants from all others
# gb2_bool=False: every SVTYPE_NR_C category is tested against the remaining rows,
# separately within each value of the 'ALL_VARS' column.
enr_svtype_nr_c = gather_odds_ratio_data(per_variant_info, 'ALL_VARS', 'SVTYPE_NR_C', 'significant_association', gb2_bool=False).pipe(annotate_tests_data)

In [39]:
### likelihood to be eQTL of variants in each class versus STRs
# outcome column: 'significant_association'; reference category: 'STR'
enr_svt_vs_str_nr_c = compute_enrichment_vs_cat(per_variant_info, 'STR', 'SVTYPE_NR_C', 
                                           'ALL_VARS', 'in_cat', 'significant_association')


In [28]:
# Same comparison against 'STR', with 'lead_association' as the outcome column.
enr_svt_vs_str_nr_c_th = compute_enrichment_vs_cat(per_variant_info, 'STR', 'SVTYPE_NR_C', 
                                           'ALL_VARS', 'in_cat', 'lead_association')


In [33]:

# Persist both class-vs-STR enrichment tables under `outdir`.
CM.save_dataframe('enr_svt_vs_str_nr_c', enr_svt_vs_str_nr_c, outdir, print_only_pickle=True, 
                 print_vars_recorded_loc=False)
CM.save_dataframe('enr_svt_vs_str_nr_c_th',enr_svt_vs_str_nr_c_th, 
                  outdir, print_only_pickle=True,print_vars_recorded_loc=False)

# NOTE(review): the assignments below look like the reload snippets echoed by
# CM.save_dataframe rather than hand-written code - confirm.
enr_svt_vs_str_nr_c = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/enr_svt_vs_str_nr_c.pkl')

enr_svt_vs_str_nr_c_th = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/enr_svt_vs_str_nr_c_th.pkl')



# Enrichment Genic Categories

In [None]:
# Among rows with top_hit_final_sig == True: for every (genic category, variant
# class) pair, test whether rows of that class fall into the category more often
# than rows of the other classes.
# NOTE(review): `order_genic_cats_full` and `order_vars_c` are not defined in the
# visible part of this notebook and the cell has no execution count - it fails on
# a fresh kernel unless they are defined elsewhere.
# NOTE(review): `tdf` is a boolean-mask subset that is assigned into below.
tdf = var_egene_nr[(var_egene_nr.top_hit_final_sig == True)]
# print tdf.shape
enr_dfs = []
for cat in order_genic_cats_full:
    for svt in order_vars_c:
        
        tdf['in_cat'] = (tdf['genic_category_collapsed'] == cat)
        tdf['is_svt'] = (tdf['SVTYPE_NR_C'] == svt)
    
        # flag = is_svt, outcome = in_cat, stratified by 'ALL_VARS'
        enr = gather_odds_ratio_data(tdf,'ALL_VARS', 'is_svt', 
                                     'in_cat', gb2_bool=True).pipe(annotate_tests_data)
        enr['category'] = cat
        enr['SVTYPE'] = svt
        enr_dfs.append(enr)

# FDR is recomputed over all (category, class) tests together.
# NOTE(review): annotate_tests_data is applied a second time here, so the index is
# reset again and the infinite-value replacement is redone on the combined table.
enr_svt_genic_cat_lead_prop = pd.concat(enr_dfs).pipe(add_bh_fdr, 'p_fisher').pipe(annotate_tests_data)




# Enrichments Looping

In [25]:
# Bin edges and labels for the loop-count binning done with pd.cut in the next cell.
# With right-closed intervals these are (-0.5, 0.5], (0.5, 1.5], (1.5, 3], (3, 6], (6, 100].
# NOTE(review): for integer counts the '1-3' bin therefore holds 2-3 and '3-6' holds 4-6.
bins = [-0.5,0.5, 1.5, 3, 6, 100]
bin_labels = ['0', '1', '1-3', '3-6', '6+']

In [26]:
# Bin num_genes_tested_loop with the edges/labels above and store the label as a string.
# NOTE(review): values that are missing or fall outside the bins (e.g. > 100)
# presumably become the string 'nan' after astype(str) and would then be treated
# as a category of their own downstream - confirm.
per_variant_info['bin_num_loops_to_unique_genes']  = pd.cut(per_variant_info.num_genes_tested_loop, 
                                                             bins = bins, labels= bin_labels,
                                                             include_lowest=True).astype(str)

In [118]:
# Per SVTYPE_NR_C class: enrichment of top_hit_final_sig for each value of
# int_distal_pc_10kb against the remaining rows (gb2_bool=False).
enr_int_pc_pqtl_svt_lead = gather_odds_ratio_data(var_egene_nr_nc_nd, 'SVTYPE_NR_C', 
                                            'int_distal_pc_10kb', 'top_hit_final_sig', 
                       gb2_bool=False).pipe(annotate_tests_data).pipe(add_fraction_contingency)

# The result column `int_distal_pc_10kb` holds the tested category value; keep the
# rows where it is truthy and recompute the FDR over those rows only.
inds = enr_int_pc_pqtl_svt_lead[enr_int_pc_pqtl_svt_lead.int_distal_pc_10kb].index.tolist()
enr_int_pc_pqtl_svt_lead.loc[inds] = enr_int_pc_pqtl_svt_lead.loc[inds].pipe(add_bh_fdr, 'p_fisher')

In [126]:
# Same analysis as the previous cell, with `significant` as the outcome column.
enr_int_pc_pqtl_svt = gather_odds_ratio_data(var_egene_nr_nc_nd, 'SVTYPE_NR_C', 
                                            'int_distal_pc_10kb', 'significant', 
                       gb2_bool=False).pipe(annotate_tests_data).pipe(add_fraction_contingency)

# FDR recomputed over the rows whose tested int_distal_pc_10kb value is truthy.
inds = enr_int_pc_pqtl_svt[enr_int_pc_pqtl_svt.int_distal_pc_10kb].index.tolist()
enr_int_pc_pqtl_svt.loc[inds] = enr_int_pc_pqtl_svt.loc[inds].pipe(add_bh_fdr, 'p_fisher')



In [122]:
# Persist the lead-association table under `outdir`.
CM.save_dataframe('enr_int_pc_pqtl_svt_lead', enr_int_pc_pqtl_svt_lead, outdir, 
                  print_vars_recorded_loc=False)

# NOTE(review): the two assignments below look like the reload snippets echoed by
# CM.save_dataframe rather than hand-written code - confirm. If executed, the
# read_csv result replaces the read_pickle result.
enr_int_pc_pqtl_svt_lead = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/enr_int_pc_pqtl_svt_lead.pkl')
enr_int_pc_pqtl_svt_lead = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/enr_int_pc_pqtl_svt_lead.tsv', sep='\t')


In [127]:
# Persist the significant-association table under `outdir`.
CM.save_dataframe('enr_int_pc_pqtl_svt', enr_int_pc_pqtl_svt, outdir, 
                  print_vars_recorded_loc=False)

# NOTE(review): the two assignments below look like the reload snippets echoed by
# CM.save_dataframe rather than hand-written code - confirm. If executed, the
# read_csv result replaces the read_pickle result.
enr_int_pc_pqtl_svt = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/enr_int_pc_pqtl_svt.pkl')
enr_int_pc_pqtl_svt = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/enr_int_pc_pqtl_svt.tsv', sep='\t')


In [41]:
# Each loop-count bin against all other rows; outcome column: significant_association.
# NOTE(review): this variable is re-assigned from a pickle further down, and that
# pickle is written from `enr_vs_none_all`, not from this result.
enr_n_loops_nearby_all = gather_odds_ratio_data(per_variant_info, 'ALL_VARS', 
                                            'bin_num_loops_to_unique_genes', 'significant_association', 
                       gb2_bool=False).pipe(annotate_tests_data).pipe(add_fraction_contingency)



In [42]:
# Each loop-count bin against all other rows; outcome column: lead_association.
# NOTE(review): this variable is re-assigned from a pickle further down, and that
# pickle is written from `enr_vs_none_all_th`, not from this result.
enr_n_loops_nearby_all_th = gather_odds_ratio_data(per_variant_info, 'ALL_VARS', 
                                            'bin_num_loops_to_unique_genes', 'lead_association', 
                       gb2_bool=False).pipe(annotate_tests_data).pipe(add_fraction_contingency)

In [95]:
def compute_enrichment_vs_cat(df, comp_cat, comp_col, gb1, gb2, gb3, order_comp = False, **kwargs):
    """Enrichment of every category of ``comp_col`` against a reference category.

    NOTE: ``compute_enrichment_vs_cat`` is also defined in an earlier cell of
    this notebook; this later definition replaces it from here onward.

    For each category ``b`` the rows belonging to ``comp_cat`` or ``b`` are
    selected, flagged with ``in_cat = (comp_col == b)`` and passed to
    ``gather_odds_ratio_data``. Finally ``comp_cat`` itself is tested against
    all other rows (``comp_category == "others"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    comp_cat : object
        Reference category of ``comp_col``.
    comp_col : str
        Categorical column.
    gb1, gb2, gb3 : str
        Forwarded to ``gather_odds_ratio_data`` as ``gb1``, ``gb2`` and
        ``bool_col``. Pass ``gb2='in_cat'`` to test the flag created here.
    order_comp : list or False
        Explicit list of categories to compare; by default every category
        of ``comp_col`` except ``comp_cat``.
    **kwargs
        Forwarded to ``gather_odds_ratio_data``.

    Returns
    -------
    pandas.DataFrame
        Concatenated, annotated results with ``category`` and
        ``comp_category`` columns. For the pairwise rows ``fdr_corrected_p``
        is recomputed over all pairwise tests together.
    """
    if order_comp:
        cats = order_comp
    else:
        cats = df[comp_col].unique().tolist()
        if comp_cat in cats:
            cats.remove(comp_cat)

    pairwise = []
    for b in cats:
        pair = df[df[comp_col].isin([comp_cat, b])]
        # assign() returns a new frame, so neither the slice nor `df` is written to
        pair = pair.assign(in_cat = (pair[comp_col] == b))
        enr = gather_odds_ratio_data(pair, gb1, gb2, gb3, **kwargs)
        enr['category'] = b
        enr['comp_category'] = comp_cat
        pairwise.append(enr)

    frames = []
    if pairwise:  # pd.concat raises on an empty list
        frames.append(pd.concat(pairwise)
                        .pipe(annotate_tests_data)
                        .pipe(add_bh_fdr, 'p_fisher')
                        .pipe(add_fraction_contingency))

    # reference category against everything else, again without touching the caller's frame
    flagged = df.assign(in_cat = (df[comp_col] == comp_cat))
    vs_others = gather_odds_ratio_data(flagged, gb1, gb2, gb3, **kwargs)
    vs_others['category'] = comp_cat
    vs_others['comp_category'] = "others"
    vs_others = vs_others.pipe(annotate_tests_data).pipe(add_fraction_contingency)
    frames.append(vs_others)

    return pd.concat(frames)


In [96]:
# Every loop-count bin against the '0' bin; outcome column: lead_association.
enr_vs_none_all_th = compute_enrichment_vs_cat(per_variant_info, '0', 'bin_num_loops_to_unique_genes', 
                                           'ALL_VARS', 'in_cat', 'lead_association')

In [98]:
# Every loop-count bin against the '0' bin; outcome column: significant_association.
enr_vs_none_all = compute_enrichment_vs_cat(per_variant_info, '0', 'bin_num_loops_to_unique_genes', 
                                           'ALL_VARS', 'in_cat', 'significant_association')



In [99]:
# NOTE(review): the versus-'0' tables (`enr_vs_none_all*`) are saved under the
# names 'enr_n_loops_nearby_all*', i.e. under the variable names used earlier for
# the bin-versus-all-others results - confirm this naming is intended.
CM.save_dataframe('enr_n_loops_nearby_all_th', enr_vs_none_all_th, 
                  outdir, print_only_pickle=True,print_vars_recorded_loc=False)
CM.save_dataframe('enr_n_loops_nearby_all', enr_vs_none_all, outdir, print_only_pickle=True, 
                 print_vars_recorded_loc=False)


# NOTE(review): the assignments below look like the reload snippets echoed by
# CM.save_dataframe rather than hand-written code - confirm. If executed, they
# replace the earlier `enr_n_loops_nearby_all*` variables with the versus-'0' tables.
enr_n_loops_nearby_all_th = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/enr_n_loops_nearby_all_th.pkl')

enr_n_loops_nearby_all = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_enrichments/qtl_results_01_17_v4/enr_n_loops_nearby_all.pkl')

