In [2]:
%matplotlib inline
from __future__ import division
import numpy as np
import os
import sys
import datetime
from subprocess import call
import subprocess
import glob
import djPyi2 as DJ
from djPyi2 import Common as CM


import pandas as pd
pd.options.mode.chained_assignment = None
import csv
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import copy 
import pybedtools as pbt
import ciepy
import cardipspy as cpy
import itertools
import tempfile
import six
import networkx as nx
import scipy.stats as stats
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
from mpl_toolkits.axes_grid1 import  make_axes_locatable
import datetime

from scipy.stats import mode
dy_name = 'distance_to_loop_anchors'

private_out = os.path.join(DJ.root, 'private_output', dy_name)
if not os.path.exists(private_out):
    cpy.makedir(private_out)

import gc

In [3]:
from djPyi2 import mpltools

from djPyi2 import pandas_methods as pm

In [4]:
def prep_tx_info(df):
    """Return a copy of the transcript table indexed by transcript ID.

    A ``gene_id_mod`` column is added holding ``gene_id`` truncated at its
    first '.'; ``tx_id`` becomes the index and is also kept as a column.
    The input frame is left untouched.
    """
    annotated = df.copy()
    annotated['gene_id_mod'] = annotated.gene_id.map(
        lambda gene_id: gene_id.partition('.')[0])
    return annotated.set_index('tx_id', drop=False)

def prep_promoter_info(df, tx_info):
    """Return a copy of the promoter table annotated with gene IDs.

    ``tx_id`` is truncated at its first '_' and used as the index (the column
    is kept as well); ``gene_id`` and ``gene_id_mod`` are then joined in from
    ``tx_info`` by index. The input frame is left untouched.
    """
    promoters = df.copy()
    promoters['tx_id'] = promoters.tx_id.map(lambda name: name.partition('_')[0])
    gene_columns = tx_info[['gene_id', 'gene_id_mod']]
    return promoters.set_index('tx_id', drop=False).join(gene_columns)

def prep_gene_info(df):
    """Return a copy of the gene table indexed by ``gene_id_mod``.

    ``gene_id_mod`` is ``gene_id`` truncated at its first '.'; it is added as
    a column and also used as the index. The input frame is left untouched.
    """
    genes = df.copy()
    genes['gene_id_mod'] = genes.gene_id.map(
        lambda gene_id: gene_id.partition('.')[0])
    return genes.set_index('gene_id_mod', drop=False)

In [5]:
def prep_var_egene(df):
    """Return a copy of ``df`` with 'chr'-prefixed chromosome columns.

    ``feature_chrom`` is ``feature_chromosome`` formatted as ``chr<value>``;
    ``chrom_variant`` is set to the same values. The input frame is left
    untouched.
    """
    annotated = df.copy()
    annotated['feature_chrom'] = annotated['feature_chromosome'].map("chr{}".format)
    annotated['chrom_variant'] = annotated['feature_chrom']
    return annotated

In [5]:
tx_info =  pd.read_table('/publicdata/gencode_v19_20151104/transcript_to_gene.tsv', names=['tx_id', 'gene_id']).pipe(prep_tx_info)
promoter_info = pd.read_table('/publicdata/gencode_v19_20151104/promoters.bed', names=['chrom', 'start', 'end', 'tx_id', 'strand', 'other']).pipe(prep_promoter_info, tx_info)

In [None]:
# Write the annotated promoter table as a headerless, tab-separated BED-like file.
# NOTE(review): `outdir` is only assigned in a later cell of this notebook, so on a
# fresh kernel this cell raises NameError. The sorted version of this file is later
# read from the pc_intersections/qtl_results_12_4 directory, so that is presumably
# the intended destination -- TODO confirm and define the path explicitly here.
fn_promoter_bed = outdir + '/promoters_annot.bed'
promoter_info.to_csv(fn_promoter_bed, sep = '\t', header = None, index = False)

In [6]:
# Make Loop IDs unique for loops from Montefiori paper

def add_anchor_sizes(df):
    """Return a copy of ``df`` with the width of each loop anchor added.

    ``anchor_A_size`` is ``endA - startA`` and ``anchor_B_size`` is
    ``endB - startB``. The input frame is left untouched.
    """
    return df.assign(
        anchor_A_size=df['endA'] - df['startA'],
        anchor_B_size=df['endB'] - df['startB'],
    )

# Load the first loop table (headerless .pgl): both anchors' coordinates, a score and a loop ID.
# Presumably the promoter-capture Hi-C loops, going by the file name -- TODO confirm.
pc_df = pd.read_table('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/pc_intersections/qtl_results_12_4/supp1.iPSC.pHiC.pgl', names = ['chromA', 'startA', 'endA','chromB', 'startB', 'endB', 'score', 'loop_id'])

# Build `unique_loop_id` from the loop ID and both anchors' coordinates.
# pm.concat_cols is a project helper not shown here; presumably it joins the listed
# columns with ':' into the column given by `name` -- TODO confirm.
pc_df = pc_df.pipe(pm.concat_cols, 'loop_id chromA	startA	endA	chromB	startB	endB'.split(), sep = ':', name = 'unique_loop_id')

# Add anchor widths, index by the unique loop ID (kept as a column too), and keep a
# small lookup of anchor sizes per loop.
pc_df = pc_df.pipe(add_anchor_sizes)
pc_df.set_index('unique_loop_id', drop = False, inplace = True)
pc_loop_anchor_sizes = pc_df[['anchor_A_size', 'anchor_B_size']]

# Write the annotated table (no header, no index); the shell cells below run
# pgltools/sort-bed on this file name.
fn = '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/pc_intersections/qtl_results_12_4/supp1.iPSC.pHiC.annot_uniq.pgl'
pc_df.to_csv(fn, sep = '\t', index = False, header = None)

# Load the second loop table (no score column), which already carries a loop ID.
fn = '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/pc_intersections/qtl_results_12_4/iPSC.reference.withIndex.pgl'
hic_df = pd.read_table(fn, names = ['chromA', 'startA', 'endA','chromB', 'startB', 'endB', 'loop_id'])

# Same anchor-size annotation and lookup for this table, indexed by `loop_id`.
hic_df = hic_df.pipe(add_anchor_sizes)
hic_df.set_index('loop_id', drop = False, inplace = True)
hic_loop_anchor_sizes = hic_df[['anchor_A_size', 'anchor_B_size']]

# Commands

In [None]:
pgltools condense supp1.iPSC.pHiC.pgl > supp1.iPSC.pHiC.condense.bed
pgltools condense iPSC.reference.withIndex.pgl > iPSC.reference.withIndex.condense.bed

sort-bed promoters_annot.bed > promoters_annot.sorted.bed
sort-bed supp1.iPSC.pHiC.condense.bed > supp1.iPSC.pHiC.condense.sorted.bed
sort-bed  iPSC.reference.withIndex.condense.bed > iPSC.reference.withIndex.condense.sorted.bed

In [None]:
pgltools condense supp1.iPSC.pHiC.annot_uniq.pgl > supp1.iPSC.pHiC.annot_uniq.condense.bed
sort-bed supp1.iPSC.pHiC.annot_uniq.condense.bed > supp1.iPSC.pHiC.annot_uniq.condense.sorted.bed

In [7]:
fn = '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/pc_intersections/qtl_results_12_4/supp1.iPSC.pHiC.annot_uniq.condense.sorted.bed'
pc_bt = pbt.BedTool(fn).sort()
# add 5kb slop to promoter capture loops
pc_bt = pc_bt.slop(b = 5000, g = '/frazer01/publicdata/gatk_bundle_2.8/b37/human_g1k_v37_decoy_Sendai.genome')

In [8]:
fn = '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/pc_intersections/qtl_results_12_4/promoters_annot.sorted.bed'
prom_bt = pbt.BedTool(fn).sort()

In [9]:
def process_intersect(intersect):
    """Turn a promoter/loop-anchor intersection into an annotated DataFrame.

    The intersection is read with fixed column names (promoter fields first,
    then anchor fields and the overlap), reduced to one row per
    (gene_id_mod, loop_id_unique) pair, and ``loop_id_unique`` is passed to
    the project helper ``pm.expand_col_split_join`` with ':' as separator and
    the anchor coordinate column names below.
    """
    promoter_cols = ['chromA', 'startA', 'endA']
    promoter_cols = promoter_cols + 'tx_id	strand	other	gene_id	gene_id_mod'.split()
    anchor_cols = ['chromB', 'startB', 'endB', 'anchor_id', 'score',
                   'loop_id', 'loop_id_unique', 'overlap']

    # Names for the ':'-separated pieces of loop_id_unique, and the subset of
    # them handed to the helper as `subset`.
    loop_id_fields = ['loop_id2', 'chrom_anchorA', 'start_anchorA', 'end_anchorA',
                      'chrom_anchorB', 'start_anchorB', 'end_anchorB']
    coordinate_fields = ['start_anchorA', 'end_anchorA', 'start_anchorB', 'end_anchorB']

    hits = intersect.to_dataframe(names=promoter_cols + anchor_cols)
    hits = hits.drop_duplicates(['gene_id_mod', 'loop_id_unique'])
    return pm.expand_col_split_join(hits, 'loop_id_unique', sep=':',
                                    col_names=loop_id_fields,
                                    subset=coordinate_fields,
                                    overwrite=True)

In [10]:
# get loops that intersect a promoter
intersect = prom_bt.intersect(pc_bt, wo = True)

In [11]:
intersect = process_intersect(intersect)

In [None]:
/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/loop_locus_examples/iPSC.merged_nodups.filtered.intra.5kb.cool

# Pull in variants from eQTL testing and figure out which variants are closest to these loops

In [13]:
intersect = intersect.drop_duplicates(['gene_id_mod', 'loop_id_unique'])

In [14]:
var_egene = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/eqtl_processing/qtl_results_01_17_v2/all_qtls_combined_filt5.pkl')

In [15]:
genes_tested = var_egene.feature_id.unique().tolist()

In [16]:
intersect['in_tested_genes'] = intersect.gene_id_mod.isin(genes_tested)

In [17]:
intersects_tested_genes = intersect[intersect.in_tested_genes == True].drop_duplicates(['gene_id_mod'])[['loop_id_unique',
                                                                              'gene_id_mod', 'in_tested_genes']]

In [18]:
CM.save_dataframe('pc_promoter_intersect_tested_genes', intersects_tested_genes, private_out)

pc_promoter_intersect_tested_genes = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/pc_promoter_intersect_tested_genes.pkl')
pc_promoter_intersect_tested_genes = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/pc_promoter_intersect_tested_genes.tsv', sep='\t')
# all vars recorded: /frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/load_saved_nb_variables.py
# pickled vars recorded:/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/load_pickled_nb_variables.py


In [19]:
intersect[intersect.in_tested_genes].gene_id_mod.unique().shape

(13529,)

In [20]:
var_egene.feature_id.unique().shape

(16018,)

We have a promoter loop to 13529 of the 16018 genes tested.

In [21]:
intersect.gene_id_mod.unique().shape

(30148,)

A total of 30148 genes have a promoter loop to them, but not all of them are included because the tested gene list is filtered to expressed genes.

In [22]:
var_egene['has_pc_at_prom'] = var_egene.feature_id.isin(intersect.gene_id_mod.unique())

In [23]:
# sanity check
var_egene[var_egene.has_pc_at_prom].feature_id.unique().shape

(13529,)

In [24]:
def dist_lambda(df, col_start1, col_end1, col_start2, col_end2):
    """Signed distance from interval 1 to interval 2 for every row of ``df``.

    Parameters
    ----------
    df : DataFrame holding the four coordinate columns.
    col_start1, col_end1 : column names of interval 1 (e.g. the variant).
    col_start2, col_end2 : column names of interval 2 (e.g. the anchor).

    Returns
    -------
    list of int, one entry per row, in row order:
      * ``end1 - start2`` (<= 0) when interval 1 starts before interval 2 and
        ends at or before its start,
      * ``start1 - end2`` (> 0) when interval 1 starts after interval 2 ends,
      * ``0`` when the intervals overlap in any way.

    Raises
    ------
    ValueError
        If a coordinate cannot be converted to int (e.g. NaN), or if a row
        matches none of the cases above (which happens for malformed
        intervals such as end < start). Previously the latter case printed the
        coordinates and returned ``None``, silently discarding every distance
        already computed.
    """
    data = []
    for row in df[[col_start1, col_end1, col_start2, col_end2]].values:
        # Explicit unpacking instead of indexing the result of map(), which
        # is an iterator (not a list) on Python 3.
        start1, end1, start2, end2 = [int(value) for value in row]

        overlaps = (
            # interval 1 overlaps the left edge of interval 2
            (start1 < start2 and end1 > start2 and end1 < end2)
            # interval 1 starts inside interval 2 and runs to/past its end
            or (start1 >= start2 and start1 <= end2 and end1 >= end2)
            # interval 1 covers interval 2 entirely
            or (start1 <= start2 and end1 >= end2)
            # interval 1 lies entirely within interval 2
            or (start1 >= start2 and start1 <= end2
                and end1 >= start2 and end1 <= end2)
        )

        if start1 < start2 and end1 <= start2:
            # interval 1 is before interval 2: negative (or zero) distance
            dist = end1 - start2
        elif start1 > end2:
            # interval 1 is after interval 2: positive distance
            dist = start1 - end2
        elif overlaps:
            dist = 0
        else:
            raise ValueError(
                'Unhandled interval configuration: '
                'start1={}, end1={}, start2={}, end2={}'.format(
                    start1, end1, start2, end2))

        data.append(dist)
    return data

In [46]:
def add_anchor_locations(tdf):
    """Annotate where the loop anchors sit relative to the promoter and the variant.

    Adds columns to ``tdf`` in place and returns the same object. Reads the
    columns ``dist_prom_anchor_A``, ``dist_prom_anchor_B``,
    ``dist_var_anchor_A``, ``dist_var_anchor_B``, ``start_variant`` and
    ``startA`` (in this notebook the ``dist_*`` columns are filled by
    process_closest_info before this function is called).

    Columns written:
      * min_dist_prom_anchor / min_dist_var_anchor: smallest absolute distance
        to either anchor; nearest_anchor_prom / nearest_anchor_var: 'A' or 'B'
        ('A' when the two are tied).
      * proximal_anchor (anchor nearest the promoter) and distal_anchor (the
        other one).
      * var_wrt_prom: 'downstream' when start_variant >= startA, else 'upstream'.
      * nearest_anchor_type_var: 'proximal' when the variant's nearest anchor
        is also the promoter's nearest anchor, else 'distal'.
      * var_inside_loop / prom_inside_loop: 'inside' when the distance to
        anchor A is >= 0 and the distance to anchor B is <= 0, else 'outside'.
      * dist_{var,prom}_{proximal,distal}_anchor: the A/B distances relabelled
        as proximal/distal, sign unchanged.
      * var_prom_location: the four categorical labels joined with '_'.
      * dist_var_distal_norm, dist_var_proximal_norm, dist_prom_proximal_norm:
        magnitudes of the distances above with a sign that depends only on the
        inside/outside label (see the comments further down).
    """

    # Promoter: absolute distance to the closer anchor, and which anchor that is.
    tdf['min_dist_prom_anchor'] = tdf['dist_prom_anchor_A	dist_prom_anchor_B'.split()].abs().min(axis = 1)
    convert = {'True': 'A', 'False': 'B'}
    tdf['nearest_anchor_prom'] = (tdf.dist_prom_anchor_A.abs() == tdf.min_dist_prom_anchor).apply(lambda x: convert[str(x)])

    # Same for the variant.
    tdf['min_dist_var_anchor'] = tdf['dist_var_anchor_A	dist_var_anchor_B'.split()].abs().min(axis = 1)
    tdf['nearest_anchor_var'] = (tdf.dist_var_anchor_A.abs() == tdf.min_dist_var_anchor).apply(lambda x: convert[str(x)])
    
    tdf['proximal_anchor'] = tdf.nearest_anchor_prom
    convert = {'A':'B', 'B': 'A'}
    tdf['distal_anchor'] = tdf.nearest_anchor_prom.apply(lambda x: convert[x])
    
    # annotate whether the variant is upstream or downstream of the promoter
    # (purely by coordinate: start_variant >= startA counts as downstream)
    convert = {'True': 'downstream', 'False': 'upstream'}
    tdf['var_wrt_prom'] = (tdf['start_variant'] >= tdf['startA']).apply(lambda x: convert[str(x)])
    
    # the anchor nearest the promoter will be called the proximal anchor, the other distal
    convert = {'True': 'proximal', 'False': 'distal'}
    tdf['nearest_anchor_type_var'] = (tdf['nearest_anchor_prom'] == tdf['nearest_anchor_var']).apply(lambda x: convert[str(x)])
    
    # annotate whether variant is inside or outside the loop
    # downstream ones can be measured from anchor B
    convert = {'True': 'inside', 'False': 'outside'}
    tdf['var_inside_loop'] = ((tdf['dist_var_anchor_B'] <= 0) & (tdf.dist_var_anchor_A >=0)).apply(lambda x: convert[str(x)])
    tdf['prom_inside_loop'] = ((tdf['dist_prom_anchor_A'] >= 0) & (tdf.dist_prom_anchor_B <=0)).apply(lambda x: convert[str(x)])

    # handle the cases where A is distal
    # NOTE(review): both right-hand sides below are the same boolean expressions as
    # the two assignments just above (operands only swapped around `&`), so as written
    # these overrides do not change any value -- confirm whether a different condition
    # was intended for the A-is-distal rows.
    inds = tdf[tdf.distal_anchor == 'A'].index.tolist()
    tdf.loc[inds, 'var_inside_loop'] = ((tdf['dist_var_anchor_A'] >= 0) & (tdf.dist_var_anchor_B <=0)).apply(lambda x: convert[str(x)])
    tdf.loc[inds, 'prom_inside_loop'] = ((tdf['dist_prom_anchor_B'] <= 0) & (tdf.dist_prom_anchor_A >=0)).apply(lambda x: convert[str(x)])
    
    
    # distances to prox/distal_anchors- pull out the distances to anchor that is proximal- (on promoter side) 
    # or distal (on the variant side)- no adjustment of sign (- is upstream + is downstream of whatever anchor)
   
    # Default assumes A is proximal / B is distal ...
    tdf['dist_var_distal_anchor'] = tdf.dist_var_anchor_B
    tdf['dist_var_proximal_anchor'] = tdf.dist_var_anchor_A
    tdf['dist_prom_proximal_anchor'] = tdf.dist_prom_anchor_A
    tdf['dist_prom_distal_anchor'] = tdf.dist_prom_anchor_B

    # ... and is swapped for the rows where A is the distal anchor.
    inds = tdf[tdf.distal_anchor == 'A'].index.tolist()
    tdf.loc[inds,'dist_var_distal_anchor'] = tdf.loc[inds, 'dist_var_anchor_A']
    tdf.loc[inds, 'dist_var_proximal_anchor'] = tdf.loc[inds, 'dist_var_anchor_B']
    tdf.loc[inds,'dist_prom_distal_anchor'] = tdf.loc[inds, 'dist_prom_anchor_A']
    tdf.loc[inds, 'dist_prom_proximal_anchor'] = tdf.loc[inds, 'dist_prom_anchor_B']


    # Single combined label, e.g. 'upstream_inside_inside_distal'.
    tdf['var_prom_location'] = ((tdf['var_wrt_prom'] + '_' + tdf['var_inside_loop'] + '_' + 
                                 tdf['prom_inside_loop'] + '_' + tdf.nearest_anchor_type_var))
    
    
    # add on distances where inside loop from proximal to promoter distance is positive,
    # and inside loop from var to distal loop anchor is negative-
    # this is normalizing for changes in sign based on var-prom orientation
    tdf['dist_var_distal_norm'] = tdf.dist_var_distal_anchor
    tdf['dist_var_proximal_norm'] = tdf.dist_var_proximal_anchor
    
    # Variant inside the loop: distal distance <= 0, proximal distance >= 0.
    inds = tdf[tdf.var_inside_loop == 'inside'].index.tolist()
    tdf.loc[inds, 'dist_var_distal_norm'] = (tdf.loc[inds, 'dist_var_distal_norm'].abs() * -1)
    tdf.loc[inds, 'dist_var_proximal_norm'] = (tdf.loc[inds, 'dist_var_proximal_norm'].abs())
    
    # Variant outside the loop: the opposite signs.
    inds = tdf[tdf.var_inside_loop == 'outside'].index.tolist()
    tdf.loc[inds, 'dist_var_distal_norm'] = (tdf.loc[inds, 'dist_var_distal_norm'].abs())
    tdf.loc[inds, 'dist_var_proximal_norm'] = (tdf.loc[inds, 'dist_var_proximal_norm'].abs() * -1)

    # Promoter to proximal anchor: >= 0 when the promoter is inside the loop, <= 0 when outside.
    tdf['dist_prom_proximal_norm'] = tdf.dist_prom_proximal_anchor

    inds = tdf[tdf.prom_inside_loop == 'inside'].index.tolist()
    tdf.loc[inds, 'dist_prom_proximal_norm'] = (tdf.loc[inds, 'dist_prom_proximal_norm'].abs())

    inds = tdf[tdf.prom_inside_loop == 'outside'].index.tolist()
    tdf.loc[inds, 'dist_prom_proximal_norm'] = (tdf.loc[inds, 'dist_prom_proximal_norm'].abs() * -1)
    return tdf

In [26]:
def process_closest_info(tdf):
    """Add all variant/promoter/anchor distance columns to ``tdf``.

    Uses dist_lambda to fill, in this order, ``dist_var_anchor_A``,
    ``dist_var_anchor_B``, ``dist_prom_anchor_A``, ``dist_prom_anchor_B`` and
    ``dist_var_prom`` on ``tdf`` itself, then returns the result of
    add_anchor_locations on it.
    """
    variant = ('start_variant', 'end_variant')
    promoter = ('startA', 'endA')
    anchor_a = ('start_anchorA', 'end_anchorA')
    anchor_b = ('start_anchorB', 'end_anchorB')

    # (output column, interval measured from, interval measured to)
    distance_specs = [
        ('dist_var_anchor_A', variant, anchor_a),
        ('dist_var_anchor_B', variant, anchor_b),
        ('dist_prom_anchor_A', promoter, anchor_a),
        ('dist_prom_anchor_B', promoter, anchor_b),
        ('dist_var_prom', variant, promoter),
    ]
    for out_col, source, target in distance_specs:
        tdf[out_col] = dist_lambda(tdf, source[0], source[1], target[0], target[1])

    return add_anchor_locations(tdf)

In [27]:
prom_loop_vars = intersect.merge(var_egene[['feature_id','start_variant', 'end_variant', 'snp_id']], left_on='gene_id_mod', right_on='feature_id', how = 'inner')

In [121]:
prom_loop_vars.shape

(27940796, 25)

In [None]:
# This table is really big. To save RAM, process it in chunks and save each chunk's output, then delete the input table and reload from the saved outputs, processing them one at a time.

In [16]:
# outdir = private_out + '/distance_anchors_maf5'
# DJ.makedir(outdir)

In [28]:
outdir = private_out + '/distance_anchors_maf5_v2'
DJ.makedir(outdir)

In [30]:
# Process one chromosome at a time and write each chunk to disk to keep memory use down.
# process_closest_info() already finishes by calling add_anchor_locations(), and that
# function recomputes every derived column from the unchanged dist_* inputs, so applying
# it a second time per chunk only repeated the work without changing the output.
for chrom, chrom_df in prom_loop_vars.groupby('chromA'):
    fn_out = outdir + '/closest_annot_{}.tsv'.format(chrom)
    tdf = process_closest_info(chrom_df)
    tdf.to_csv(fn_out, sep = '\t')

In [124]:
del prom_loop_vars

In [125]:
gc.collect()

396

# Post-process the loop distance information
Retrieve, for each variant, the information about the loop anchors it is closest to: whether the variant is inside or outside the loop, and whether the anchor is the proximal or the distal one. This way we can test the influence of several distances as predictors.

In [31]:
def _closest_loop(cdf, dist_col):
    """Return (distance, inner loop size, loop ID) of the row of ``cdf`` with the smallest ``dist_col``."""
    best = cdf.sort_values([dist_col]).iloc[0]
    return best[dist_col], best['loop_size_inner'], best['loop_id_unique']


def retrieve_loop_info_per_var_gene(tdf):
    """Summarise, per (gene, variant) pair, the closest loop anchors.

    Reads ``gene_id_mod``, ``snp_id``, ``var_inside_loop``,
    ``dist_var_distal_norm``, ``dist_var_proximal_norm``, ``start_anchorB``,
    ``end_anchorA`` and ``loop_id_unique`` from ``tdf``. As a side effect the
    helper columns ``dist_var_distal_abs``, ``dist_var_proximal_abs`` and
    ``loop_size_inner`` (``start_anchorB - end_anchorA``) are added to ``tdf``.

    For every (gene_id_mod, snp_id) group, and separately for the loops the
    variant is 'inside' and 'outside' of, the smallest absolute distance to
    the distal and to the proximal anchor is recorded together with the ID of
    the loop that achieves it. A category with no loops gets NaN for the
    distances and ``False`` for the loop ID / loop size placeholders.

    Returns
    -------
    DataFrame with one row per (gene_id_mod, snp_id) and the columns
    min_inside_distal, min_inside_proximal, min_outside_distal,
    min_outside_proximal, the four matching loop_id_* columns,
    loop_size_inside_distal_min, min_overall_dist, category_min, gene_id_mod,
    snp_id, loop_id_overall_min and loop_size_overall_min.
    """
    tdf['dist_var_distal_abs'] = tdf['dist_var_distal_norm'].abs()
    tdf['dist_var_proximal_abs'] = tdf['dist_var_proximal_norm'].abs()
    tdf['loop_size_inner'] = tdf['start_anchorB'] - tdf.end_anchorA

    cats = ['inside', 'outside']
    # Position in the per-group lists below -> category name.
    coding = {0: 'inside_distal', 1: 'inside_proximal',
              2: 'outside_distal', 3: 'outside_proximal'}

    data = []
    for x, df in tdf.groupby(['gene_id_mod', 'snp_id']):
        out = []      # minimum distances, ordered as in `coding`
        annot = []    # loop IDs achieving those minima
        lengths = []  # inner sizes of those loops
        for c in cats:
            cdf = df[df['var_inside_loop'] == c]
            if cdf.shape[0] > 0:
                min_var_to_distal, ls_min_distal, id_min_distal = _closest_loop(
                    cdf, 'dist_var_distal_abs')
                min_var_to_proximal, ls_min_prox, id_min_prox = _closest_loop(
                    cdf, 'dist_var_proximal_abs')
            else:
                # np.nan rather than np.NaN: the capitalised alias was removed
                # in NumPy 2.0.
                min_var_to_distal = np.nan
                min_var_to_proximal = np.nan
                ls_min_distal = False
                ls_min_prox = False
                id_min_distal = False
                id_min_prox = False
            lengths = lengths + [ls_min_distal, ls_min_prox]
            annot = annot + [id_min_distal, id_min_prox]
            out = out + [min_var_to_distal, min_var_to_proximal]

        # Overall minimum across the four categories (NaNs ignored); on ties
        # the first category in `coding` order wins.
        min_overall_dist = np.nanmin(out)
        min_position = out.index(min_overall_dist)
        data.append(out + annot + [lengths[0]] +
                    [min_overall_dist, coding[min_position], x[0], x[1],
                     annot[min_position], lengths[min_position]])

    df_out = pd.DataFrame(data, columns=['min_inside_distal', 'min_inside_proximal', 'min_outside_distal', 'min_outside_proximal', 'loop_id_inside_distal', 'loop_id_inside_proximal', 'loop_id_outside_distal', 'loop_id_outside_proximal', 'loop_size_inside_distal_min', 'min_overall_dist', 'category_min', 'gene_id_mod', 'snp_id', 'loop_id_overall_min', 'loop_size_overall_min'])

    return df_out

In [38]:
def post_process_distance_info(fns):
    """Concatenate per-chromosome distance summaries and add derived columns.

    Parameters
    ----------
    fns : iterable of paths to tab-separated summary files (as written from
        the output of retrieve_loop_info_per_var_gene). These are the files
        that get read; previously the argument was ignored and the global
        ``fns_out`` was read instead.

    Returns
    -------
    DataFrame with all input rows (fresh 0..n-1 index) plus:
      * var_inside_and_outside / var_inside_only / var_outside_only: whether
        min_inside_distal and/or min_outside_distal are non-null,
      * bin_dist_inside_distal, bin_dist_outside_distal, bin_dist_overall_loop:
        the corresponding distances cut into the unsigned bins below,
      * min_overall_dist_norm: min_overall_dist, negated where category_min
        starts with 'inside',
      * bin_dist_up_and_down: min_overall_dist_norm cut into the signed bins,
      * closer_to_inside_proximal: min_inside_proximal < min_inside_distal.
    """
    dfs = [pd.read_table(f) for f in fns]
    distance_summary = pd.concat(dfs, ignore_index=True)

    has_inside = distance_summary.min_inside_distal.notnull()
    has_outside = distance_summary.min_outside_distal.notnull()
    distance_summary['var_inside_and_outside'] = has_inside & has_outside
    distance_summary['var_inside_only'] = has_inside & ~has_outside
    distance_summary['var_outside_only'] = ~has_inside & has_outside

    # Unsigned distance bins, shared by the three unsigned distance columns.
    # The first bin [0, 0.5] captures exactly 0 bp (include_lowest=True).
    bins = [0, 0.5, 5000, 10000, 20000, 50000, 100000, 200000, 300000, 1000000000]
    bin_names = ['0bp', '1-5kb', '5kb-10kb', '10kb-20kb', '20kb-50kb', '50kb-100kb',
                 '100kb-200kb', '200kb-300kb', '300kb+']
    unsigned_targets = [
        ('bin_dist_inside_distal', 'min_inside_distal'),
        ('bin_dist_outside_distal', 'min_outside_distal'),
        ('bin_dist_overall_loop', 'min_overall_dist'),
    ]
    for bin_col, dist_col in unsigned_targets:
        distance_summary[bin_col] = pd.cut(distance_summary[dist_col], bins=bins,
                                           labels=bin_names, include_lowest=True)

    # Signed version of the overall distance: negative when the closest anchor
    # was found with the variant inside the loop, positive when outside.
    distance_summary['min_overall_dist_norm'] = distance_summary.min_overall_dist
    is_inside = distance_summary.category_min.apply(lambda x: x.split("_")[0] == 'inside')
    inds = distance_summary[is_inside].index.tolist()
    distance_summary.loc[inds, 'min_overall_dist_norm'] = distance_summary.loc[inds, 'min_overall_dist_norm'] * -1

    # Signed bins; labels are kept exactly as before because downstream code
    # may match on them.
    signed_bins = [-1000000000, -300000, -200000, -100000, -50000, -20000,
                   -10000, -5000, -0.5, 0.5, 5000, 10000, 20000, 50000, 100000,
                   200000, 300000, 1000000000]
    signed_bin_names = ['-300kb+', '-200kb-300kb', '-100kb-200kb', '-50kb-100kb', '-20kb-50kb',
                        '-10kb-20kb', '-10kb-5kb', '-5kb-1bp', '0bp', '1bp-5kb', '5kb-10kb', '10kb-20kb',
                        '20kb-50kb', '50kb-100kb', '100kb-200kb', '200kb-300kb', '300kb+']
    distance_summary['bin_dist_up_and_down'] = pd.cut(distance_summary.min_overall_dist_norm,
                                                      bins=signed_bins, labels=signed_bin_names,
                                                      include_lowest=True)

    distance_summary['closer_to_inside_proximal'] = (distance_summary.min_inside_proximal <
                                                     distance_summary.min_inside_distal)
    return distance_summary

In [48]:
tdf = pd.read_table('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/distance_anchors_maf5/closest_annot_chr5.tsv')

In [44]:
(tdf.var_inside_loop + '_' + tdf.nearest_anchor_type_var).value_counts()

outside_proximal    407176
outside_distal      315793
inside_distal        41957
inside_proximal      40088
dtype: int64

In [49]:
tdf = tdf.pipe(add_anchor_locations)

In [50]:
(tdf.var_inside_loop + '_' + tdf.nearest_anchor_type_var).value_counts()

outside_proximal    409769
outside_distal      317235
inside_distal        42165
inside_proximal      40325
dtype: int64

In [342]:
(tdf.var_inside_loop + '_' + tdf.nearest_anchor_type_var).value_counts()

outside_proximal    850758
outside_distal      710462
inside_distal        87884
inside_proximal      85880
dtype: int64

In [32]:
fns_out = glob.glob(outdir + '/*')

In [33]:
fns_out

['/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/distance_anchors_maf5_v2/closest_annot_chr21.tsv',
 '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/distance_anchors_maf5_v2/closest_annot_chr13.tsv',
 '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/distance_anchors_maf5_v2/closest_annot_chr18.tsv',
 '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/distance_anchors_maf5_v2/closest_annot_chr9.tsv',
 '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/distance_anchors_maf5_v2/closest_annot_chr7.tsv',
 '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/distance_anchors_maf5_v2/closest_annot_chr15.tsv',
 '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/distance_anchors_maf5_v2/closes

In [34]:
# Only pick up the per-chromosome inputs. The summaries written below go into the same
# directory, so globbing everything ('/*') would feed previously written
# distances_summary_*.tsv files back in as inputs when this cell is re-run.
fns_out = glob.glob(outdir + '/closest_annot_*.tsv')
for f in fns_out:
    # '.../closest_annot_chr21.tsv' -> 'chr21'
    chrom = f.split('/')[-1].split('_')[-1].replace('.tsv', '')
    # Progress log; a single formatted string prints the same on Python 2 and 3.
    print('{} {}'.format(chrom, datetime.datetime.now()))
    fn_out = outdir + '/distances_summary_{}.tsv'.format(chrom)
    df_out = pd.read_table(f).pipe(retrieve_loop_info_per_var_gene)
    df_out.to_csv(fn_out, sep = '\t')

chr21 2019-03-21 14:59:35.420696
chr13 2019-03-21 14:59:47.863982
chr18 2019-03-21 15:00:09.898572
chr9 2019-03-21 15:00:28.301052
chr7 2019-03-21 15:01:31.418458
chr15 2019-03-21 15:03:05.177692
chr5 2019-03-21 15:03:54.805145
chr10 2019-03-21 15:05:04.483335
chr11 2019-03-21 15:06:01.107195
chr16 2019-03-21 15:07:49.920386
chr17 2019-03-21 15:09:11.993206
chr2 2019-03-21 15:11:40.814362
chr20 2019-03-21 15:13:15.004091
chr8 2019-03-21 15:13:58.056727
chr19 2019-03-21 15:14:45.807808
chr3 2019-03-21 15:18:11.785747
chr12 2019-03-21 15:19:45.663409
chr4 2019-03-21 15:21:32.976882
chr6 2019-03-21 15:22:25.037134
chr14 2019-03-21 15:24:48.148446
chr22 2019-03-21 15:25:54.284917
chr1 2019-03-21 15:26:45.798731


In [40]:
fns_out = glob.glob(outdir + '/distances*')
distance_summary = post_process_distance_info(fns_out)

In [52]:
CM.save_dataframe('pc_distance_summary_sv_maf5_v2', distance_summary, private_out)

pc_distance_summary_sv_maf5_v2 = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/pc_distance_summary_sv_maf5_v2.pkl')
pc_distance_summary_sv_maf5_v2 = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/pc_distance_summary_sv_maf5_v2.tsv', sep='\t')
# all vars recorded: /frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/load_saved_nb_variables.py
# pickled vars recorded:/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/distance_to_loop_anchors/load_pickled_nb_variables.py
