In [1]:
from __future__ import division
import numpy as np
import os
import sys
import datetime
from subprocess import call
import subprocess
import glob
import djPyBio as DJ
from djPyBio import Common as CM
import argparse

import pandas as pd
import csv
import copy 
import pybedtools as pbt
import ciepy
import cardipspy as cpy
import itertools
import tempfile
import six
import networkx as nx
from scipy.stats import mode
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
from mpl_toolkits.axes_grid1 import  make_axes_locatable
import datetime

from scipy.stats import mode
import scipy.stats as stats

from collections import Counter


In [6]:
def combine_cnvs(cnvs, info):
    """Build the ID of the single CNV that spans every CNV listed in ``cnvs``.

    The chromosome is read from the ``Chr`` entry of the first CNV; the span
    runs from the smallest ``Start`` to the largest ``End`` found in ``info``
    for the listed CNVs. The result has the form CNV_{chrom}_{start}_{end}.
    """
    chrom = info.loc[cnvs[0], 'Chr']
    span_start = info.loc[cnvs, 'Start'].min()
    span_end = info.loc[cnvs, 'End'].max()
    return 'CNV_{}_{}_{}'.format(chrom, span_start, span_end)

def merge_cnvs(a, b, info, cns):
    """Decide whether CNVs ``a`` and ``b`` should be merged.

    They qualify when their ``order`` values in ``info`` differ by exactly one
    and the mean absolute difference between their rows of ``cns`` is below 0.5.
    """
    adjacent = abs(info.loc[a, 'order'] - info.loc[b, 'order']) == 1
    if not adjacent:
        # Mirror ``x and y``: hand back the falsy first operand untouched.
        return adjacent
    return (cns.loc[a] - cns.loc[b]).abs().mean() < 0.5


# In[38]:

def compare_lists(l1, l2):
    """Count the positions at which the paired elements of ``l1`` and ``l2`` differ.

    Elements are paired with ``zip``, so anything beyond the shorter input is ignored.
    """
    return sum(1 for left, right in zip(l1, l2) if left != right)
def calculate_absolute_mean_diff(a1, a2):
    """Return the mean of the element-wise absolute differences of two equal-length arrays."""
    return np.absolute(np.array(a1) - np.array(a2)).mean()


# In[76]:

def compare_sites(ind1, ind2, cns_t, samples,samples_nmode1, samples_nmode2, cnmode1, cnmode2, samples_lq1, samples_lq2, subtract_lq = True):
    """Compare the copy-number calls of two sites across ``samples``.

    Parameters
    ----------
    ind1, ind2 : column labels of ``cns_t`` identifying the two sites.
    cns_t : DataFrame whose columns are sites and whose index contains ``samples``.
    samples : list of sample IDs to consider.
    samples_nmode1, samples_nmode2 : accepted for interface compatibility only;
        the non-mode samples are always recomputed here from ``cns_t`` and the
        mode copy numbers, so the values passed in are ignored.
    cnmode1, cnmode2 : mode copy number of each site.
    samples_lq1, samples_lq2 : lists of sample IDs to exclude for each site
        (only used when ``subtract_lq`` is true).
    subtract_lq : when true, drop the union of ``samples_lq1`` and
        ``samples_lq2`` from both the non-mode and the correlation sample sets.

    Returns
    -------
    list, in this order:
        ind1, ind2, Pearson r over the correlation samples, number of differing
        calls among non-mode samples, number of non-mode samples, fraction of
        non-mode samples that differ (0 when there are none), non-mode samples
        compared, correlation samples compared, excluded samples, number of
        correlation samples, whether the two call lists match exactly, mean
        absolute difference over correlation samples, mean absolute difference
        over non-mode samples, allele set of site 1, allele set of site 2,
        their sizes, and the allele count dictionaries of site 1 and site 2.
    """
    cns_1 = cns_t[ind1].to_dict()
    cns_2 = cns_t[ind2].to_dict()

    # Samples whose call differs from the site's mode at either site.
    nmode1 = [i for i in samples if cns_1[i] != cnmode1]
    nmode2 = [i for i in samples if cns_2[i] != cnmode2]
    non_mode = set(nmode1 + nmode2)

    if subtract_lq:
        # Exclude samples flagged at either site.
        samples_to_exclude = set(samples_lq1 + samples_lq2)
        samples_to_compare_nmode = list(non_mode.difference(samples_to_exclude))
        samples_to_compare_corr = list(set(samples).difference(samples_to_exclude))
    else:
        samples_to_exclude = []
        samples_to_compare_nmode = list(non_mode)
        samples_to_compare_corr = samples

    cns_nmode_1 = [int(cns_1[i]) for i in samples_to_compare_nmode]
    cns_nmode_2 = [int(cns_2[i]) for i in samples_to_compare_nmode]

    cns_corr_1 = [int(cns_1[i]) for i in samples_to_compare_corr]
    cns_corr_2 = [int(cns_2[i]) for i in samples_to_compare_corr]

    allele_dist1 = dict(Counter(cns_corr_1))
    allele_dist2 = dict(Counter(cns_corr_2))

    corr_coef = stats.pearsonr(cns_corr_1, cns_corr_2)[0]

    nsamp = len(samples_to_compare_nmode)
    nsamp_pass = len(samples_to_compare_corr)

    num_diff = compare_lists(cns_nmode_1, cns_nmode_2)

    alleles1 = set(cns_corr_1)
    num_alleles1 = len(alleles1)
    alleles2 = set(cns_corr_2)
    num_alleles2 = len(alleles2)

    mean_diff_all = calculate_absolute_mean_diff(cns_corr_1, cns_corr_2)
    # With no non-mode samples this is the mean of an empty array (nan).
    mean_diff_nmode = calculate_absolute_mean_diff(cns_nmode_1, cns_nmode_2)

    exact_match = (cns_corr_1 == cns_corr_2)

    # The only expected failure of the division is "no non-mode samples";
    # handle it explicitly instead of swallowing every exception.
    # (True division: the file enables ``from __future__ import division``.)
    perc_diff = num_diff / nsamp if nsamp else 0

    return [ind1, ind2, corr_coef, num_diff, nsamp,
            perc_diff, samples_to_compare_nmode, samples_to_compare_corr,
            list(samples_to_exclude), nsamp_pass, exact_match, mean_diff_all, mean_diff_nmode,
            alleles1, alleles2, num_alleles1, num_alleles2, allele_dist1, allele_dist2]


def split_id_to_coord(ID):
    """Parse an underscore-separated site ID into (chrom, start, end).

    Field 1 is returned as the chromosome string; fields 2 and 3 are
    converted to ``int`` and returned as start and end.
    """
    fields = ID.split('_')
    return fields[1], int(fields[2]), int(fields[3])


# In[41]:

def compute_dist(id1, id2):
    """Return start(id2) - end(id1), using the coordinates encoded in the two IDs.

    The value is negative when the second site starts before the first one ends.
    """
    end_of_first = split_id_to_coord(id1)[2]
    start_of_second = split_id_to_coord(id2)[1]
    return start_of_second - end_of_first
    


# In[65]:

def collect_data_adjacent_sites(info, cns_t, samples, cn_mode_col = 'cn_mode', subtract_lq=True):
    """Compare every pair of neighbouring sites on each chromosome.

    Parameters
    ----------
    info : DataFrame indexed by site ID. Reads the columns ``Chr``, ``order``,
        ``diff_mode_uuids``, ``lq_samps``, ``cnv_class`` and ``cn_mode_col``.
    cns_t : DataFrame of copy numbers with sites as columns (passed to compare_sites).
    samples : list of sample IDs to compare over.
    cn_mode_col : name of the ``info`` column holding the mode copy number to use.
    subtract_lq : forwarded to compare_sites.

    Returns
    -------
    DataFrame with one row per adjacent pair (sites consecutive in ``order``
    within a chromosome), indexed by ``cat_pair`` ("<ID1>-<ID2>"). The first 19
    columns are the values returned by compare_sites; the rest describe the
    pair (mode copy numbers, classes, distance, per-site non-mode samples,
    chromosome). ``distance_between_mod`` is the distance clipped at 0.
    """
    # These lookups do not depend on the chromosome, so build them once.
    diff_mode_uuids = info.diff_mode_uuids.to_dict()
    mode_cn_all = info[cn_mode_col].to_dict()
    lq_uuids = info.lq_samps.to_dict()
    cnv_classes = info.cnv_class.to_dict()

    data = []
    for chrom, chrom_info in info.groupby('Chr'):
        inds = chrom_info.sort_values('order').index.tolist()

        # Walk consecutive (site, next site) pairs.
        for ind1, ind2 in zip(inds[:-1], inds[1:]):
            pair_ind = '-'.join([ind1, ind2])

            distance_between = compute_dist(ind1, ind2)
            # Overlapping sites give a negative distance; treat that as 0.
            absolute_dist = 0 if distance_between < 0 else distance_between

            cnv_class1 = cnv_classes[ind1]
            cnv_class2 = cnv_classes[ind2]

            diff_uuids1 = diff_mode_uuids[ind1]
            diff_uuids2 = diff_mode_uuids[ind2]
            mode_cn1 = mode_cn_all[ind1]
            mode_cn2 = mode_cn_all[ind2]
            lq_uuids1 = lq_uuids[ind1]
            lq_uuids2 = lq_uuids[ind2]
            pair = [ind1, ind2]

            num_diff1 = len(diff_uuids1)
            num_diff2 = len(diff_uuids2)

            comp = compare_sites(ind1, ind2, cns_t, samples,
                                 diff_uuids1, diff_uuids2, mode_cn1, mode_cn2,
                                 lq_uuids1, lq_uuids2, subtract_lq=subtract_lq)

            comp = comp + [pair, mode_cn1, mode_cn2, cnv_class1, cnv_class2, distance_between,
                           absolute_dist, pair_ind, num_diff1, num_diff2, diff_uuids1, diff_uuids2,
                           chrom]
            data.append(comp)

    df = pd.DataFrame(data, columns=['ID1', 'ID2', 'corr_coef',
                                     'num_diff', 'num_non_mode', 'percent_non_mode_diff',
                                     'samps_to_compare_nmode', 'samps_to_compare_corr',
                                     'samps_to_exclude', 'num_pass', 'exact_cn_match',
                                     'mean_cn_diff_all', 'mean_cn_diff_nmode','alleles1', 'alleles2',
                                     'num_alleles1', 'num_alleles2','allele_dist1', 'allele_dist2','pair',
                                     'mode_cn1', 'mode_cn2', 'cnv_class1', 'cnv_class2',
                                     'distance_between', 'distance_between_mod', 'cat_pair',
                                     'num_diff1', 'num_diff2', 'diff_uuids1', 'diff_uuids2', 'chrom'])

    df.index = df.cat_pair
    return df


def prep_info(df):
    """Return a copy of ``df`` sorted by Chr, Start, End with an ``order`` column.

    ``order`` is the 0-based position of each row in that sort. The input
    frame is left untouched.
    """
    ordered = df.copy().sort_values(['Chr', 'Start', "End"])
    ordered['order'] = range(0, ordered.shape[0])
    return ordered



def prep_cns(df, info):
    """Return a copy of ``df`` whose rows are reindexed to match ``info.index``."""
    return df.copy().reindex(info.index)



def annotate_passing(df, thresh):
    """Return a copy of ``df`` with a boolean ``passing_criteria`` column.

    A row passes when corr_coef > 0.9, percent_non_mode_diff <= 0.2, both
    mean_cn_diff_all and mean_cn_diff_nmode are < 0.5, and
    distance_between_mod < ``thresh``.
    """
    annotated = df.copy()

    is_correlated = annotated.corr_coef > 0.9
    few_nmode_diffs = annotated.percent_non_mode_diff <= 0.2
    close_cn_all = annotated.mean_cn_diff_all < 0.5
    close_cn_nmode = annotated.mean_cn_diff_nmode < 0.5
    is_near = annotated.distance_between_mod < thresh

    keep = is_correlated & few_nmode_diffs & close_cn_all & close_cn_nmode & is_near
    # Flag by index label, exactly as the selection above was made.
    passing_labels = annotated[keep].index.tolist()

    annotated['passing_criteria'] = False
    annotated.loc[passing_labels, 'passing_criteria'] = True
    return annotated



def prep_adj_sites(adj_sites_prestich, thresh = 5000):
    """Annotate a table of adjacent-site comparisons prior to stitching.

    Works on a copy and adds:
      * ``matching_mode`` - whether mode_cn1 equals mode_cn2,
      * ``log_dist`` - log10(distance_between_mod + 1),
      * ``passing_criteria`` - from annotate_passing with distance cutoff ``thresh``.
    """
    prepped = adj_sites_prestich.copy()
    prepped['matching_mode'] = (prepped.mode_cn1 == prepped.mode_cn2)
    # An earlier revision restricted the table to matching-mode pairs with more
    # than one allele at both sites; that filter is disabled.
    prepped['log_dist'] = np.log10(prepped.distance_between_mod + 1)
    return annotate_passing(prepped, thresh)


In [7]:
def get_corr_at_samples(ind1, ind2, samples, cns_t):
    """Return the Pearson correlation of two sites' copy numbers over ``samples``.

    Parameters
    ----------
    ind1, ind2 : column labels of ``cns_t`` for the two sites.
    samples : sample IDs (index labels of ``cns_t``) to correlate over.
    cns_t : DataFrame with sites as columns.

    Raises
    ------
    Whatever the lookup, integer conversion or ``pearsonr`` raises (for
    example ``KeyError`` for a sample missing from ``cns_t``). The offending
    sample list is printed first to help debugging.
    """
    cns_1 = cns_t[ind1].to_dict()
    cns_2 = cns_t[ind2].to_dict()
    try:
        cns_corr1 = [int(cns_1[i]) for i in samples]
        cns_corr2 = [int(cns_2[i]) for i in samples]
        return stats.pearsonr(cns_corr1, cns_corr2)[0]
    except Exception:
        # Show which samples were involved, then surface the real error
        # instead of failing later on an unbound result variable.
        print(samples)
        raise


def gather_consensus_X(adj_sites_all_males, adj_sites_all_females, adj_sites, cns_t):
    """Combine male and female pass/fail calls for adjacent-site pairs on chromosome X.

    Parameters
    ----------
    adj_sites_all_males, adj_sites_all_females : DataFrames with identical
        indexes of the form "<ID1>-<ID2>", each providing ``num_non_mode`` and
        ``passing_criteria``.
    adj_sites, cns_t : unused; kept so existing callers keep working.

    Returns
    -------
    DataFrame indexed by ``cat_pair`` with columns ``cat_pair`` and
    ``passing_consensus``, containing only pairs whose chromosome field
    (second underscore-separated field of the index label) is 'X'.

    Consensus rule per pair:
      * no non-mode samples in either sex -> False
      * non-mode samples in one sex only  -> that sex's ``passing_criteria``
      * non-mode samples in both sexes    -> True only if both sexes pass
    """
    inds = adj_sites_all_males.index.tolist()
    assert (inds == adj_sites_all_females.index.tolist())

    rows = zip(inds,
               adj_sites_all_males.num_non_mode.tolist(),
               adj_sites_all_females.num_non_mode.tolist(),
               adj_sites_all_males.passing_criteria.tolist(),
               adj_sites_all_females.passing_criteria.tolist())

    out = []
    for ind, nm_male, nm_female, pass_b_male, pass_b_female in rows:
        if ind.split('_')[1] != 'X':
            continue

        if nm_male == 0 and nm_female == 0:
            out.append([ind, False])
        elif nm_male == 0 and nm_female > 0:
            out.append([ind, pass_b_female])
        elif nm_male > 0 and nm_female == 0:
            out.append([ind, pass_b_male])
        elif nm_male > 0 and nm_female > 0:
            out.append([ind, all([pass_b_male, pass_b_female])])
        else:
            # Reached only for counts that are neither 0 nor positive
            # (e.g. negative or NaN); processing stops at that pair.
            print("didn't account for all scenarios")
            break

    df = pd.DataFrame(out, columns=['cat_pair', 'passing_consensus'])
    df.index = df.cat_pair
    return df


In [8]:
def get_x_chrom(df):
    """Return the rows of a copy of ``df`` whose ``chrom`` value is 'X'."""
    snapshot = df.copy()
    return snapshot.loc[snapshot.chrom == 'X']

In [9]:
def compile_stitch_clusters(adj_sites, info):
    """Group runs of consecutive passing pairs into stitch clusters.

    For each chromosome the pairs are walked in their current row order; every
    maximal run of pairs whose ``passing_criteria_adjusted`` is truthy chains
    the sites involved into one cluster.

    Side effect: an ``order`` column (current row position) is written onto
    the caller's ``adj_sites`` frame.

    Returns a dict mapping the combined ID from combine_cnvs to the list of
    site IDs in that cluster. Sites that are not merged with a neighbour do
    not appear.
    """
    adj_sites['order'] = range(0, adj_sites.shape[0])

    clusters = dict()
    for chrom, pairs in adj_sites.groupby('chrom'):
        # Keep the pairs in their original row order.
        pairs = pairs.sort_values('order')
        should_merge = pairs.passing_criteria_adjusted.tolist()
        # Every ID1 plus the final ID2 gives the full chain of sites.
        chain = pairs.ID1.tolist() + [pairs.ID2.tolist()[-1]]

        run = [chain[0]]
        for merge_next, next_cnv in zip(should_merge, chain[1:]):
            if merge_next:
                run.append(next_cnv)
                continue
            if len(run) > 1:
                clusters[combine_cnvs(run, info)] = run
            run = [next_cnv]

        # A run still open at the end of the chromosome is a cluster too.
        if len(run) > 1:
            clusters[combine_cnvs(run, info)] = run

    return clusters
    

def characterize_cnv_classes(x):
    """Summarise which of 'DUP', 'DEL' and 'mCNV' occur in ``x`` as a label.

    Returns 'none' when none of the three is present.
    """
    # Keyed by (has DUP, has DEL, has mCNV).
    labels = {
        (True, False, True): 'DUP,mCNV',
        (True, False, False): 'DUP,DUP',
        (False, True, False): 'DEL,DEL',
        (False, True, True): 'DEL,mCNV',
        (True, True, True): 'DEL,DUP,mCNV',
        (True, True, False): 'DUP,DEL',
        (False, False, True): 'mCNV,mCNV',
    }
    present = ('DUP' in x, 'DEL' in x, 'mCNV' in x)
    return labels.get(present, 'none')

def lambda_stitch_site_in_cnvs_merged(x):
    """Check whether a stitched site coincides with one of its constituent CNVs.

    ``x`` provides ``cnvs_merged`` (constituent IDs) and
    ``stitched_cnv_site_ID``. A constituent matches when its ID, with the
    final underscore-separated field removed, equals the stitched ID.

    Returns [True, matching constituent ID, stitched ID] for the first match,
    otherwise [False, False, stitched ID].
    """
    merged_cnvs = x['cnvs_merged']
    stitched_cnv = x['stitched_cnv_site_ID']

    first_match = next(
        (c for c in merged_cnvs if c.rpartition('_')[0] == stitched_cnv),
        None)
    if first_match is None:
        return [False, False, stitched_cnv]
    return [True, first_match, stitched_cnv]



def lambda_add_stitched_tag(x):
    """Append '_Stitched' to an ID unless it already carries a source tag.

    IDs whose last underscore-separated field is 'iPSCORE', 'HipSci' or
    'Stitched' are returned unchanged, so applying the function twice never
    produces a doubled '_Stitched_Stitched' suffix.
    """
    if x.split('_')[-1] in ['iPSCORE', 'HipSci', 'Stitched']:
        return x
    return x + '_Stitched'

def get_stitch_cluster_info(combined, info):
    """Summarise stitch clusters and annotate ``info`` with stitching metadata.

    Parameters
    ----------
    combined : dict
        Maps a stitched site ID of the form CNV_{chrom}_{start}_{end} to the
        list of constituent site IDs (index labels of ``info``).
    info : DataFrame
        Per-site table looked up by site ID; reads ``cnv_class``, ``ID``,
        ``Chr``, ``Start`` and ``End``.
        NOTE: modified in place - the columns ``stitch_cluster_ID``,
        ``stitched_cnv_site_ID``, ``stitched_cnv_site_ID_mod``,
        ``stitch_cluster``, ``stitch_site_already_gt``, ``stitch_constituent``
        and ``stitch_breakpoint`` are added/overwritten.
        (Presumably ``info.ID`` holds the same labels as ``info.index`` -
        confirm against the data.)

    Returns
    -------
    merged : DataFrame, one row per stitched site, indexed by its ID.
    to_remove : flat list of every constituent site ID.
    dist_frame : DataFrame, one row per consecutive constituent pair with the
        gap between them.
    gs_combined_info_unannotated : the new stitched-site rows concatenated
        with the stitching columns of ``info``, sorted by Chr, Start, End.
    tdf : the new stitched-site rows only (stitched sites that do not
        coincide with one of their own constituents).
    """
    def lambda_add_stitched_tag(x):
        # Local variant: IDs already tagged with a source or 'Stitched' are kept.
        if x.split('_')[-1] in ['iPSCORE', 'HipSci', 'Stitched']:
            return x
        else:
            return x + '_Stitched'

    # Every constituent of every cluster.
    to_remove = []
    for k in combined.keys():
        to_remove += combined[k]
    print('{} CNVs combined into {} CNVs.'.format(len(to_remove), len(combined)))

    # --- one row per stitched site -------------------------------------
    data = []
    for i in combined.keys():
        spl = i.split('_')
        chrom = spl[1]
        start, end = int(spl[2]), int(spl[3])
        merged_calls = combined[i]
        len_merge = len(merged_calls)
        cluster_str = ",".join(merged_calls)
        classes_merged = [info.loc[z].cnv_class for z in merged_calls]

        data.append([chrom, start, end, i, len_merge, merged_calls, classes_merged, cluster_str])

    merged = pd.DataFrame(data, columns=['chrom', 'start', 'end', 'stitched_cnv_site_ID','num_cnvs_merged', 'cnvs_merged', 'cnv_classes', 'stitch_cluster'])

    merged.sort_values(['chrom', 'start', 'end'], inplace=True)

    merged['merge_class'] = merged.cnv_classes.apply(lambda x: characterize_cnv_classes(x))
    merged['Length'] = merged.end - merged.start
    merged.index = merged.stitched_cnv_site_ID

    # --- gaps between consecutive constituents of each cluster ---------
    distances = []
    for i in combined.keys():
        list_sites = combined[i]
        num_combined = len(list_sites)

        for p1, p2 in zip(list_sites[:-1], list_sites[1:]):
            end_1 = int(p1.split('_')[3])
            start_2 = int(p2.split('_')[2])

            pair = p1 + ',' + p2

            dist = start_2 - end_1
            CNV_Types = str(info.loc[p1].cnv_class) + ',' + str(info.loc[p2].cnv_class)

            distances.append([pair, i, CNV_Types, dist, num_combined])

    dist_frame = pd.DataFrame(distances, columns=['pair', 'stitched_cnv_site_ID' ,'cnv_types', 'distance', 'num_combined'])
    mean_dist = dist_frame.groupby('stitched_cnv_site_ID').distance.mean().to_frame()
    mean_dist['mean_distance_between'] = mean_dist.distance
    merged = merged.join(mean_dist['mean_distance_between'])

    # --- numeric cluster IDs keyed by constituent and by stitched ID ----
    cluster_ID_dict = {}
    stitched_ID_dict = {}
    count = 1
    for x1, x2 in zip(merged.cnvs_merged.tolist(), merged['stitched_cnv_site_ID'].tolist()):
        for ID in x1:
            cluster_ID_dict[ID] = count
            stitched_ID_dict[ID] = x2

        cluster_ID_dict[x2] = count
        count += 1

    merged['stitch_cluster_ID'] = merged.stitched_cnv_site_ID.apply(lambda x: cluster_ID_dict[x])

    # Does the stitched span coincide with a site that already exists?
    data = merged.apply(lambda x: lambda_stitch_site_in_cnvs_merged(x), axis = 1).tolist()
    tdf = pd.DataFrame(data, columns=['stitch_site_already_gt', 'stitch_site_original_data', 'stitched_cnv'])
    tdf.index = tdf.stitched_cnv
    merged = merged.join(tdf)

    # New stitched sites get a '_Stitched' tag; pre-existing ones keep their original ID.
    merged['stitched_cnv_site_ID_mod'] = merged.stitched_cnv_site_ID.apply(lambda x: lambda_add_stitched_tag(x))
    ind = merged[merged.stitch_site_already_gt == True].index.tolist()
    merged.loc[ind, 'stitched_cnv_site_ID_mod'] = merged.loc[ind, 'stitch_site_original_data']

    # Rebuild the lookups against the final (tagged) stitched IDs.
    cluster_ID_dict = {}
    stitched_ID_dict_mod = {}
    cluster_str_dict = {}
    count = 1
    for x1, x2, clust in zip(merged.cnvs_merged.tolist(), merged['stitched_cnv_site_ID_mod'].tolist(), merged.stitch_cluster.tolist()):
        for ID in x1:
            cluster_ID_dict[ID] = count
            stitched_ID_dict_mod[ID] = x2
            cluster_str_dict[ID] = clust

        # Must run once per cluster: registers the stitched ID itself and
        # advances the counter so each cluster gets its own number, matching
        # merged.stitch_cluster_ID (same iteration order, same start value).
        cluster_ID_dict[x2] = count
        cluster_str_dict[x2] = clust
        count += 1

    # build the info data for all sites
    tdf = pd.DataFrame([x.split('_') for x in combined.keys()], columns=['cnv', 'Chr', 'Start', 'End'],
                       index=list(combined.keys())).drop('cnv', axis=1)

    tdf['ID'] = tdf.index
    tdf = tdf.join(merged[['stitch_cluster_ID', 'stitch_cluster', 'stitched_cnv_site_ID', 'stitched_cnv_site_ID_mod', 'stitch_site_original_data', 'stitch_site_already_gt']])

    tdf.index = tdf['stitched_cnv_site_ID_mod']

    tdf.ID = tdf.stitched_cnv_site_ID_mod
    cols = ['Chr','Start', 'End', 'ID','stitch_breakpoint',
            'stitch_constituent', 'stitch_cluster_ID', 'stitch_cluster',
            'stitched_cnv_site_ID', 'stitched_cnv_site_ID_mod', 'stitch_site_already_gt']

    tdf.Start = tdf.Start.astype(int)
    tdf.End = tdf.End.astype(int)
    # site generated by stitching?
    tdf['stitch_breakpoint'] = True
    # is this a site that is a constituent of a stitching cluster?
    tdf['stitch_constituent'] = False

    # Keep only stitched sites that are genuinely new.
    tdf = tdf[tdf.stitch_site_original_data == False]
    tdf = tdf[cols]

    # what cluster of stitched site does this correspond to - stitched site ID also gets this numeric ID
    info['stitch_cluster_ID'] = info.ID.apply(lambda x: cluster_ID_dict.get(x, 0))

    # what stitch site is this site a constituent of, if any?
    info['stitched_cnv_site_ID'] = info.ID.apply(lambda x: stitched_ID_dict.get(x, False))
    info['stitched_cnv_site_ID_mod'] = info.ID.apply(lambda x: stitched_ID_dict_mod.get(x, False))
    info['stitch_cluster'] = info.ID.apply(lambda x: cluster_str_dict.get(x, False))

    # mark sites in the original info that are stitch sites, but aren't new break points (didn't need new genotyping)
    # these are mostly sites contained within one another that didn't pass redundancy checking thresholds
    inds = list(set(merged[merged.stitch_site_original_data != False].stitch_site_original_data.tolist()))
    info['stitch_site_already_gt'] = False
    info.loc[inds, 'stitch_site_already_gt'] = True

    # mark constituent sites of stitching sites
    info['stitch_constituent'] = False
    to_remove_mod = [i for i in to_remove if i not in inds]
    info.loc[to_remove_mod, 'stitch_constituent'] = True

    # mark these sites as stitch_breakpoint so we can still identify them, but allow them to keep their original IDs
    # original IDs still carry info about the origin of the variant call (iPSCORE or HipSci discovery)
    info['stitch_breakpoint'] = False
    info.loc[inds, 'stitch_breakpoint'] = True

    # carry over original info col from the original sites that are unstitched
    info_trunc = info[['Chr','Start', 'End', 'ID','stitch_breakpoint',
                       'stitch_constituent', 'stitch_cluster_ID', 'stitch_cluster','stitched_cnv_site_ID', 'stitched_cnv_site_ID_mod', 'stitch_site_already_gt']]

    gs_combined_info_unannotated = pd.concat([tdf, info_trunc])
    gs_combined_info_unannotated = gs_combined_info_unannotated.sort_values(by=['Chr', 'Start', 'End'])

    return merged, to_remove, dist_frame, gs_combined_info_unannotated, tdf

In [10]:
def get_info_adj_sites(info, cns_t, sample_info):
    """Score adjacent site pairs and derive the final per-pair stitching decision.

    The pair table is built over all non-iPSC samples. For pairs on X and Y the
    autosomal-style decision is replaced: X pairs take the male/female
    consensus from gather_consensus_X, Y pairs take the decision computed on
    iPSCORE males only. The result is returned with the extra column
    ``passing_criteria_adjusted``.
    """
    not_ipsc = sample_info.CELL_TYPE != 'iPSC'
    samples_discovery = sample_info[not_ipsc].WGS_ID.tolist()
    samples_ipscore_males = sample_info[(sample_info.STUDY == 'iPSCORE') & (sample_info.SEX == 'M')].WGS_ID.tolist()
    samples_females = sample_info[not_ipsc & (sample_info.SEX=='F')].WGS_ID.tolist()
    samples_males = sample_info[not_ipsc & (sample_info.SEX=='M')].WGS_ID.tolist()

    def adjacent_pairs(sample_ids, cn_mode_col='cn_mode'):
        # Same recipe for every sample subset: compare, then annotate with a 30 kb cutoff.
        compared = collect_data_adjacent_sites(info, cns_t, sample_ids,
                                               cn_mode_col=cn_mode_col, subtract_lq=True)
        return prep_adj_sites(compared, thresh=30000)

    adj_sites = adjacent_pairs(samples_discovery)
    adj_sites_all_males = adjacent_pairs(samples_males, 'cn_mode_male')
    adj_sites_all_females = adjacent_pairs(samples_females, 'cn_mode_female')
    adj_sites_male_ipscore = adjacent_pairs(samples_ipscore_males, 'cn_mode_male_ipscore_fb')

    x_chrom_consensus = gather_consensus_X(adj_sites_all_males, adj_sites_all_females, adj_sites, cns_t)

    adjusted = 'passing_criteria_adjusted'
    adj_sites[adjusted] = adj_sites.passing_criteria
    inds_x = adj_sites[adj_sites.chrom=='X'].index.tolist()
    inds_y = adj_sites[adj_sites.chrom=='Y'].index.tolist()
    adj_sites_y_males = adj_sites_male_ipscore[adj_sites_male_ipscore.chrom=='Y'].copy()

    # Reset the sex-chromosome pairs first ...
    adj_sites.loc[inds_x, adjusted] = False
    adj_sites.loc[inds_y, adjusted] = False

    # ... then fill in the sex-aware decisions (aligned on the pair index).
    adj_sites.loc[inds_x, adjusted] = x_chrom_consensus.passing_consensus
    adj_sites.loc[inds_y, adjusted] = adj_sites_y_males.passing_criteria

    return adj_sites

In [19]:
# Load the site table and add the genomic sort rank (`order` column) via prep_info.
info = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/info_all_sites_rmdup_filt.pkl').pipe(prep_info)

# Load the copy-number table and reindex its rows to match `info` (prep_cns).
cns = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/cns_all.pkl').pipe(prep_cns, info)
cns.drop('old_index', axis =1, inplace=True)

sample_info = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/sample_info_combined/sample_info.pkl')

# WGS IDs of every sample whose CELL_TYPE is not 'iPSC'.
samples_discovery = sample_info[sample_info.CELL_TYPE != 'iPSC'].WGS_ID.tolist()

# Transpose so that sites become columns (compare_sites looks sites up as
# columns of cns_t), then keep only the discovery samples as rows.
cns_t = cns.T.copy()
cns_t = cns_t.loc[samples_discovery]

# Score every pair of adjacent sites, with sex-aware handling of X and Y.
adj_sites = get_info_adj_sites(info, cns_t, sample_info)

# Chain consecutive passing pairs into stitch clusters.
# NOTE: compile_stitch_clusters also adds an `order` column to adj_sites in place.
to_combine  = compile_stitch_clusters(adj_sites, info)



In [20]:
# Summarise the clusters. NOTE: get_stitch_cluster_info also writes stitch_* columns onto `info` in place.
merged, to_remove, dist_frame, gs_combined_info_unannotated, t = get_stitch_cluster_info(to_combine, info)

3913 CNVs combined into 1252 CNVs.


In [29]:
# Same pipeline as the earlier load cell, but starting from gs_info_filt_rmdup.pkl.
# NOTE: `samples_discovery` is not defined here (its definition is commented
# out below), so this cell only works after the earlier cell has been run.
info = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/gs_info_filt_rmdup.pkl').pipe(prep_info)

# Load the copy-number table and reindex its rows to match `info` (prep_cns).
cns = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/cns_all.pkl').pipe(prep_cns, info)
cns.drop('old_index', axis =1, inplace=True)

sample_info = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/sample_info_combined/sample_info.pkl')

# samples_discovery = sample_info[sample_info.CELL_TYPE != 'iPSC'].WGS_ID.tolist()

# Transpose so that sites become columns, then keep only the discovery samples as rows.
cns_t = cns.T.copy()
cns_t = cns_t.loc[samples_discovery]

# Score every pair of adjacent sites, with sex-aware handling of X and Y.
adj_sites = get_info_adj_sites(info, cns_t, sample_info)

# Chain consecutive passing pairs into stitch clusters.
# NOTE: compile_stitch_clusters also adds an `order` column to adj_sites in place.
to_combine  = compile_stitch_clusters(adj_sites, info)



In [30]:
# Summarise the clusters built from the reloaded `info`. NOTE: this call also writes stitch_* columns onto `info` in place.
merged, to_remove, dist_frame, gs_combined_info_unannotated, t = get_stitch_cluster_info(to_combine, info)

3913 CNVs combined into 1252 CNVs.


In [31]:
# Directory passed to CM.save_dataframe by the save cells that follow.
out_dir = '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined'

In [32]:
# Save the per-stitched-site table with the project helper CM.save_dataframe
# (djPyBio.Common). The recorded output suggests it writes a .pkl and a .tsv
# under out_dir and prints the matching reload commands - confirm.
CM.save_dataframe('stitch_site_info_v3', merged, out_dir)

stitch_site_info_v3 = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/stitch_site_info_v3.pkl')
stitch_site_info_v3 = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/stitch_site_info_v3.tsv', sep='\t')
# all vars recorded: /frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_saved_nb_variables.py
# pickled vars recorded:/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_pickled_nb_variables.py


In [33]:
# Save the combined (stitched + original) site table under the name 'gs_info_stitched_ua' in out_dir.
CM.save_dataframe('gs_info_stitched_ua', gs_combined_info_unannotated, out_dir)

gs_info_stitched_ua = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/gs_info_stitched_ua.pkl')
gs_info_stitched_ua = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/gs_info_stitched_ua.tsv', sep='\t')
# all vars recorded: /frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_saved_nb_variables.py
# pickled vars recorded:/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_pickled_nb_variables.py


In [34]:
# Save the adjacent-pair comparison table under the name 'adj_site_info' in out_dir.
CM.save_dataframe('adj_site_info', adj_sites, out_dir)

adj_site_info = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/adj_site_info.pkl')
adj_site_info = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/adj_site_info.tsv', sep='\t')
# all vars recorded: /frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_saved_nb_variables.py
# pickled vars recorded:/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_pickled_nb_variables.py


In [21]:
# Reload the saved combined site table from its pickle.
gs_info_stitched_ua = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/gs_info_stitched_ua.pkl')

In [77]:
# Repeat of the cluster summary on whatever `to_combine` and `info` currently hold.
# NOTE(review): this cell's execution count (77) is out of sequence with its
# neighbours, so the inputs depend on which earlier cells were run - confirm.
merged, to_remove, dist_frame, gs_combined_info_unannotated, t = get_stitch_cluster_info(to_combine, info)

3913 CNVs combined into 1252 CNVs.


In [81]:
# merged, to_remove, dist_frame, gs_combined_info_unannotated, t = get_stitch_cluster_info(to_combine, info)

In [39]:
# stitch_site_info_v2.stitch_site_already_gt.value_counts()

False    922
True     397
Name: stitch_site_already_gt, dtype: int64

In [83]:
# How many stitched sites coincide with an existing site (True) versus are new break points (False).
merged.stitch_site_already_gt.value_counts()

False    897
True     355
Name: stitch_site_already_gt, dtype: int64

In [85]:
# Save `merged` under the name 'stitch_site_info_v2'.
# NOTE(review): another cell saves `merged` as 'stitch_site_info_v3'; confirm which name is current.
CM.save_dataframe('stitch_site_info_v2', merged, out_dir)

stitch_site_info_v2 = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/stitch_site_info_v2.pkl')
stitch_site_info_v2 = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/stitch_site_info_v2.tsv', sep='\t')
# all vars recorded: /frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_saved_nb_variables.py
# pickled vars recorded:/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_pickled_nb_variables.py


In [86]:
# Same save as the earlier 'gs_info_stitched_ua' cell; writes under the same name in out_dir.
CM.save_dataframe('gs_info_stitched_ua', gs_combined_info_unannotated, out_dir)

gs_info_stitched_ua = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/gs_info_stitched_ua.pkl')
gs_info_stitched_ua = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/gs_info_stitched_ua.tsv', sep='\t')
# all vars recorded: /frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_saved_nb_variables.py
# pickled vars recorded:/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_pickled_nb_variables.py


In [87]:
# Same save as the earlier 'adj_site_info' cell; writes under the same name in out_dir.
CM.save_dataframe('adj_site_info', adj_sites, out_dir)

adj_site_info = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/adj_site_info.pkl')
adj_site_info = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/adj_site_info.tsv', sep='\t')
# all vars recorded: /frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_saved_nb_variables.py
# pickled vars recorded:/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_pickled_nb_variables.py


In [6]:
# Load a saved adjacent-pair table for inspection.
# NOTE(review): this reads from gs_processing_V3, whereas out_dir used by the
# save cells points at gs_processing_V4 - confirm which version is intended.
adj_site_info = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V3/i2QTL_combined/adj_site_info.pkl')

In [7]:
# Show the pair row(s) whose first site is this specific HipSci CNV.
adj_site_info[adj_site_info.ID1 == 'CNV_11_114477669_114482169_HipSci']

Unnamed: 0_level_0,ID1,ID2,corr_coef,num_diff,num_non_mode,percent_non_mode_diff,samps_to_compare_nmode,samps_to_compare_corr,samps_to_exclude,num_pass,exact_cn_match,mean_cn_diff_all,mean_cn_diff_nmode,alleles1,alleles2,num_alleles1,num_alleles2,allele_dist1,allele_dist2,pair,mode_cn1,mode_cn2,cnv_class1,cnv_class2,distance_between,distance_between_mod,cat_pair,num_diff1,num_diff2,diff_uuids1,diff_uuids2,chrom,matching_mode,log_dist,passing_criteria,passing_criteria_adjusted,order
cat_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
CNV_11_114477669_114482169_HipSci-CNV_11_114478169_114481969_iPSCORE,CNV_11_114477669_114482169_HipSci,CNV_11_114478169_114481969_iPSCORE,1.0,0,10,0.0,"[SAMEA2632917, SAMEA2555011, 0f518720-9d4b-498...","[fea585f4-3cae-48a1-8e2c-e89d88930b99, SAMEA27...",[],478,True,0.0,0.0,"{1, 2, 3}","{1, 2, 3}",3,3,"{1: 7, 2: 468, 3: 3}","{1: 7, 2: 468, 3: 3}","[CNV_11_114477669_114482169_HipSci, CNV_11_114...",2,2,mCNV,mCNV,-4000,0,CNV_11_114477669_114482169_HipSci-CNV_11_11447...,10,10,"[ae757c28-7757-4a5e-9016-9d89dbf56499, 0f51872...","[ae757c28-7757-4a5e-9016-9d89dbf56499, 0f51872...",11,True,0.0,True,True,2286


In [5]:
# Preview the first rows of the adjacent-site table.  The previous expression,
# adj_site_info[''], looked up a column named '' which the table does not have
# and would raise KeyError when the cell is re-run.
adj_site_info.head()

Unnamed: 0_level_0,ID1,ID2,corr_coef,num_diff,num_non_mode,percent_non_mode_diff,samps_to_compare_nmode,samps_to_compare_corr,samps_to_exclude,num_pass,exact_cn_match,mean_cn_diff_all,mean_cn_diff_nmode,alleles1,alleles2,num_alleles1,num_alleles2,allele_dist1,allele_dist2,pair,mode_cn1,mode_cn2,cnv_class1,cnv_class2,distance_between,distance_between_mod,cat_pair,num_diff1,num_diff2,diff_uuids1,diff_uuids2,chrom,matching_mode,log_dist,passing_criteria,passing_criteria_adjusted,order
cat_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
CNV_1_55175_56686_iPSCORE-CNV_1_85391_86406_iPSCORE,CNV_1_55175_56686_iPSCORE,CNV_1_85391_86406_iPSCORE,0.846991,107,295,0.362712,"[fea585f4-3cae-48a1-8e2c-e89d88930b99, SAMEA32...","[fea585f4-3cae-48a1-8e2c-e89d88930b99, SAMEA27...","[ee146263-82f5-4ed7-bb49-4300837acc85, d941dce...",438,False,0.273973,0.40678,"{0, 1, 2, 3, 4, 5}","{0, 1, 2, 3, 4, 5}",6,6,"{0: 116, 1: 194, 2: 97, 3: 25, 4: 4, 5: 2}","{0: 102, 1: 167, 2: 116, 3: 41, 4: 11, 5: 1}","[CNV_1_55175_56686_iPSCORE, CNV_1_85391_86406_...",1,1,mCNV,mCNV,28705,28705,CNV_1_55175_56686_iPSCORE-CNV_1_85391_86406_iP...,259,282,"[61d77efd-eec4-421b-a717-4486512668da, 8316c9a...","[41b56a79-65a4-44d0-8f3e-86c7d6621fcb, c7a966f...",1,True,4.457973,False,False,0
CNV_1_85391_86406_iPSCORE-CNV_1_86406_91299_iPSCORE,CNV_1_85391_86406_iPSCORE,CNV_1_86406_91299_iPSCORE,0.906808,84,295,0.284746,"[fea585f4-3cae-48a1-8e2c-e89d88930b99, 3ba3d4d...","[fea585f4-3cae-48a1-8e2c-e89d88930b99, SAMEA27...","[ee146263-82f5-4ed7-bb49-4300837acc85, d941dce...",450,False,0.202222,0.308475,"{0, 1, 2, 3, 4, 5}","{0, 1, 2, 3, 4}",6,5,"{0: 102, 1: 169, 2: 119, 3: 44, 4: 15, 5: 1}","{0: 116, 1: 196, 2: 99, 3: 33, 4: 6}","[CNV_1_85391_86406_iPSCORE, CNV_1_86406_91299_...",1,1,mCNV,mCNV,0,0,CNV_1_85391_86406_iPSCORE-CNV_1_86406_91299_iP...,282,272,"[41b56a79-65a4-44d0-8f3e-86c7d6621fcb, c7a966f...","[32ee31cb-156d-4a95-a708-6271f7e2cbab, 8316c9a...",1,True,0.0,False,False,1
CNV_1_86406_91299_iPSCORE-CNV_1_564439_567804_iPSCORE,CNV_1_86406_91299_iPSCORE,CNV_1_564439_567804_iPSCORE,0.049197,238,345,0.689855,"[fea585f4-3cae-48a1-8e2c-e89d88930b99, SAMEA27...","[fea585f4-3cae-48a1-8e2c-e89d88930b99, SAMEA27...","[4a1becf2-f540-4a14-99a2-41aa3205bd81, SAMEA31...",435,False,1.372414,1.469565,"{0, 1, 2, 3, 4}","{0, 1, 2, 3, 4, 5, 6, 7, 9, 12, 13, 15, 20}",5,13,"{0: 103, 1: 187, 2: 101, 3: 36, 4: 8}","{0: 12, 1: 101, 2: 219, 3: 70, 4: 14, 5: 9, 6:...","[CNV_1_86406_91299_iPSCORE, CNV_1_564439_56780...",1,2,mCNV,mCNV,473140,473140,CNV_1_86406_91299_iPSCORE-CNV_1_564439_567804_...,272,216,"[32ee31cb-156d-4a95-a708-6271f7e2cbab, 8316c9a...","[5a150d58-df23-4b02-b708-c6594b180d14, 41b56a7...",1,False,5.674991,False,False,2
CNV_1_564439_567804_iPSCORE-CNV_1_565396_567497_iPSCORE,CNV_1_564439_567804_iPSCORE,CNV_1_565396_567497_iPSCORE,0.699246,37,210,0.17619,"[3ba3d4d5-9589-4f74-aa4c-28a96f8c222b, SAMEA32...","[SAMEA2784668, 2914ab72-1d02-470a-ba53-7868982...","[fea585f4-3cae-48a1-8e2c-e89d88930b99, SAMEA23...",420,False,0.245238,0.490476,"{0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 15, 20}","{0, 1, 2, 3, 4, 5, 8, 14}",12,8,"{0: 12, 1: 101, 2: 216, 3: 65, 4: 11, 5: 7, 6:...","{0: 13, 1: 106, 2: 217, 3: 71, 4: 10, 5: 1, 8:...","[CNV_1_564439_567804_iPSCORE, CNV_1_565396_567...",2,2,mCNV,mCNV,-2408,0,CNV_1_564439_567804_iPSCORE-CNV_1_565396_56749...,216,233,"[5a150d58-df23-4b02-b708-c6594b180d14, 41b56a7...","[5a150d58-df23-4b02-b708-c6594b180d14, dd70912...",1,True,0.0,False,False,3
CNV_1_565396_567497_iPSCORE-CNV_1_565396_567904_HipSci,CNV_1_565396_567497_iPSCORE,CNV_1_565396_567904_HipSci,0.558123,8,222,0.036036,"[SAMEA3257661, SAMEA2438671, 4df98c9a-93f7-473...","[SAMEA2784668, 2914ab72-1d02-470a-ba53-7868982...","[fea585f4-3cae-48a1-8e2c-e89d88930b99, 3ba3d4d...",440,False,0.143182,0.283784,"{0, 1, 2, 3, 4, 8, 14}","{0, 1, 2, 3, 4, 5, 7, 13, 16, 18}",7,10,"{0: 15, 1: 111, 2: 221, 3: 78, 4: 13, 8: 1, 14...","{0: 14, 1: 109, 2: 218, 3: 78, 4: 13, 5: 1, 7:...","[CNV_1_565396_567497_iPSCORE, CNV_1_565396_567...",2,2,mCNV,mCNV,-2101,0,CNV_1_565396_567497_iPSCORE-CNV_1_565396_56790...,233,227,"[5a150d58-df23-4b02-b708-c6594b180d14, dd70912...","[5a150d58-df23-4b02-b708-c6594b180d14, dd70912...",1,True,0.0,False,False,4


In [None]:
# Display the adjacent-site table (pandas truncates it to the configured
# display.max_rows / display.max_columns limits).
adj_site_info

In [64]:
# Site-level info table, passed through prep_info right after loading.
info = prep_info(pd.read_pickle(
    '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/'
    'gs_combine_ipscore_hipsci/i2QTL_final/info_all_sites_rmdup_filt.pkl'
))

# Copy-number table, prepared against the info table; the helper column
# 'old_index' is dropped afterwards.
cns = prep_cns(pd.read_pickle(
    '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/'
    'gs_processing_V3/i2QTL_combined/cns_all.pkl'
), info)
cns.drop('old_index', axis =1, inplace=True)

sample_info = pd.read_pickle(
    '/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/'
    'sample_info_combined/sample_info.pkl'
)

# Transposed copy of the copy-number table, restricted to the rows labelled by
# samples_discovery (defined outside this cell).
cns_t = cns.T.copy().loc[samples_discovery]



In [65]:
# Build the adjacent-site table from the site info, the transposed copy-number
# frame and the sample metadata.
# NOTE(review): get_info_adj_sites is not defined in this section -- confirm
# its expected inputs against its definition.
adj_sites = get_info_adj_sites(info, cns_t, sample_info)

# Derive the groups of sites to stitch from the adjacent-site table.
# NOTE(review): presumably clusters of adjacent sites that passed the merge
# criteria; compile_stitch_clusters is not defined in this section -- verify.
to_combine  = compile_stitch_clusters(adj_sites, info)



In [35]:
# Unpack the five results returned by get_stitch_cluster_info for the
# clusters in to_combine.
(merged,
 to_remove,
 dist_frame,
 gs_combined_info_unannotated,
 t) = get_stitch_cluster_info(to_combine, info)

3913 CNVs combined into 1252 CNVs.


In [36]:
# Output directory used by the save calls further down.
out_dir = ('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/'
           'private_output/gs_processing_V4/i2QTL_combined')

In [38]:
# Count the values of the stitch_site_already_gt flag in the merged table.
merged['stitch_site_already_gt'].value_counts()

False    897
True     355
Name: stitch_site_already_gt, dtype: int64

In [37]:
# Write the three result tables to out_dir through the project helper, in the
# same order as before: stitched-site info, unannotated combined info, and the
# adjacent-site table.
for _label, _frame in (('stitch_site_info_v2', merged),
                       ('gs_info_stitched_ua', gs_combined_info_unannotated),
                       ('adj_site_info', adj_sites)):
    CM.save_dataframe(_label, _frame, out_dir)

stitch_site_info_v2 = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/stitch_site_info_v2.pkl')
stitch_site_info_v2 = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/stitch_site_info_v2.tsv', sep='\t')
# all vars recorded: /frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_saved_nb_variables.py
# pickled vars recorded:/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/load_pickled_nb_variables.py
gs_info_stitched_ua = pd.read_pickle('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/gs_info_stitched_ua.pkl')
gs_info_stitched_ua = pd.read_csv('/frazer01/projects/hipsci/analysis/i2QTL-sv-analysis/private_output/gs_processing_V4/i2QTL_combined/gs_info_stitched_ua.tsv', sep='\t')
# all vars recorded: /frazer0

# Deprecated/Development

In [168]:
# Fix the info table for stitched CNV sites that are already present in the
# data set: the original site's row receives the stitch metadata and stays,
# while the row filed under the stitched ID is dropped.
#
# NOTE: this cell rebinds gs_combined_info_unannotated to a filtered copy, so
# it is meant to be run once on the freshly computed table.

# IDs under which the already-genotyped stitched sites appear in the info table.
alread_gt_ind = merged[merged.stitch_site_already_gt == True].stitched_cnv_site_ID_mod.tolist()

# Flag ONLY the 'stitch_site_already_gt' column for those rows.  The earlier
# form, `.loc[alread_gt_ind] = True`, had no column selector and therefore
# overwrote every column of the selected rows with True.
gs_combined_info_unannotated['stitch_site_already_gt'] = False
gs_combined_info_unannotated.loc[alread_gt_ind, 'stitch_site_already_gt'] = True
gs_combined_info_unannotated['stitched_cnv_site_ID_mod'] = False

# Stitch metadata for the already-genotyped sites, indexed by the ID of the
# original site each one corresponds to.
tdf = merged[merged.stitch_site_already_gt == True].copy()
tdf.index = tdf.stitch_site_original_data

tdf['stitch_breakpoint'] = True
tdf['stitch_constituent'] = False
# Kept False here so that the original rows survive the filter below; the flag
# is switched on for them in the final statement.
tdf['stitch_site_already_gt'] = False

inds_original_in_stitch = merged[merged.stitch_site_already_gt == True].stitch_site_original_data.tolist()

# Columns copied from the stitch table onto the original sites' rows.
cols = ['cluster_ID',
        'cluster_str',
        'stitch_breakpoint',
        'stitch_constituent',
        'stitched_cnv_site_ID',
        'stitch_site_already_gt',
        'stitched_cnv_site_ID_mod']

# Single .loc call (rows and columns together) instead of chained indexing.
test = tdf.loc[inds_original_in_stitch, cols].values

gs_combined_info_unannotated.loc[inds_original_in_stitch, cols] = test

# Drop the rows still flagged True (the stitched-ID rows).  The explicit copy
# makes the following .loc assignment operate on an independent frame rather
# than on a slice of the unfiltered table.
gs_combined_info_unannotated = gs_combined_info_unannotated[gs_combined_info_unannotated.stitch_site_already_gt == False].copy()

# Finally mark the surviving original sites as already genotyped.
gs_combined_info_unannotated.loc[inds_original_in_stitch, 'stitch_site_already_gt'] = True

In [221]:
# NOTE(review): everything below is a commented-out draft of gather_consensus_X,
# so running this cell defines nothing.  The draft uses Python 2 print
# statements and calls get_corr_at_samples, which must be in scope before the
# function can be re-enabled.
# def gather_consensus_X(adj_sites_all_males, adj_sites_all_females, adj_sites, cns_t):
#     inds = adj_sites_all_males.index.tolist()
#     assert (inds == adj_sites_all_females.index.tolist())
    
#     num_non_mode_males = adj_sites_all_males.num_non_mode.tolist()
#     num_non_mode_females = adj_sites_all_females.num_non_mode.tolist()
    
#     ndiff_male = adj_sites_all_males.num_diff.tolist()
#     ndiff_female = adj_sites_all_females.num_diff.tolist()
    
#     passing_b_males = adj_sites_all_males.passing_criteria.tolist()
#     passing_b_females = adj_sites_all_females.passing_criteria.tolist()
    
#     samples_nm_males = adj_sites_all_males.samps_to_compare_corr.tolist()
#     samples_nm_females = adj_sites_all_females.samps_to_compare_corr.tolist()
#     union_samples = [list(set(i1 + i2)) for i1, i2 in zip(samples_nm_males, samples_nm_females)]
   
#     corr_males = adj_sites_all_males.corr_coef.tolist()
#     corr_females = adj_sites_all_females.corr_coef.tolist()
#     dist = adj_sites_all_males.distance_between_mod.tolist()
    
#     out = []
#     for ind, nm_male, nm_female, pass_b_male, pass_b_female, cm, cf, ndm, ndf, d, us in zip(inds,
#                                                                                             num_non_mode_males,
#                                                                            num_non_mode_females,
#                                                                            passing_b_males, passing_b_females, 
#                                                                            corr_males, corr_females,
#                                                                            ndiff_male, ndiff_female, 
#                                                                            dist, union_samples):
        
        
#         ind1, ind2 = ind.split('-')
#         chrom = ind.split('_')[1]
        
#         if chrom == 'X':

#             if [nm_male, nm_female] == [0, 0]:
#                 out.append([ind, False])

#             elif (nm_male == 0) & (nm_female > 0):
#                 out.append([ind, pass_b_female])

#             elif (nm_male > 0) & (nm_female == 0):
#                 out.append([ind, pass_b_male])

#             elif (nm_male > 0) & (nm_female > 0):
#                 if all([pass_b_male, pass_b_female]):
#                     out.append([ind, True])
#                 else:
#                     perc_diff = (ndm + ndf)/(nm_male + nm_female)
#                     corr = get_corr_at_samples(ind1, ind2, us, cns_t)
#                     if all([(cm > 0.9), (cf > 0.9), (perc_diff <=0.2), (d < 30000), (corr > 0.9)]):
#                         print ind
#                         out.append([ind, True])
#                     else:
#                         out.append([ind,False])

                    
#             else:
#                 print "didn't account for all scenarios"
#                 break
                
#     df = pd.DataFrame(out, columns=['cat_pair', 'passing_consensus'])
#     df.index = df.cat_pair
#     return df
