## Import dependecies

In [31]:
import gc
import sys
import time
import warnings
import logging
import random
import json
from scipy import stats
from os.path import join, basename, splitext, isfile
from os import listdir
from collections import OrderedDict

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import cooler
import cooltools

import hichew
from hichew.calling import boundaries, domains, clusters
from hichew.compute import normalize, d_scores, insulation_scores, silhouette
from hichew.loader import cool_files
from hichew.plot import clusters_dynamics, viz_opt_curves, viz_tads, _pca, _tsne


In [32]:
import logging
import time
import operator
import os
import sys
import warnings

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, AffinityPropagation, MeanShift

from hichew.lib import utils

In [33]:
sns.set(context='paper', style='whitegrid')
warnings.filterwarnings("ignore")

In [34]:
%matplotlib inline

## Specify parameters

In [35]:
resolution = 10000

fountains_path = '../arcuda/fountains/{}.bed'
boundaries_path = '../arcuda/boundaries/filtered/{}.csv'
coolers_path = '../arcuda/coolers/This2022/{}.mcool'

stages_fountains = ['Wild-Type_2.75.danrer11-reduced.mapq_30.1000', 
                    'WT.danrer11-reduced.mapq_30.1000', 
                    'TR.danrer11-reduced.mapq_30.1000', 
                    'Wild-Type_11.danrer11-reduced.mapq_30.1000', 
                    'Wild-Type_25.danrer11-reduced.mapq_30.1000']

stages_boundaries = ['sperm.danrer11-reduced.mapq_30.1000', 
                     'Wild-Type_2.75.danrer11-reduced.mapq_30.1000', 
                     'WT.danrer11-reduced.mapq_30.1000', 
                     'TR.danrer11-reduced.mapq_30.1000', 
                     'Wild-Type_5.3.danrer11-reduced.mapq_30.1000', 
                     'Wild-Type_11.danrer11-reduced.mapq_30.1000', 
                     'Wild-Type_25.danrer11-reduced.mapq_30.1000']

stages_all = ['sperm.danrer11-reduced.mapq_30.1000', 
              'Wild-Type_2.75.danrer11-reduced.mapq_30.1000', 
              'WT.danrer11-reduced.mapq_30.1000', 
              'TR.danrer11-reduced.mapq_30.1000', 
              'Wild-Type_5.3.danrer11-reduced.mapq_30.1000', 
              'Wild-Type_11.danrer11-reduced.mapq_30.1000', 
              'Wild-Type_25.danrer11-reduced.mapq_30.1000']


## Get data

#### Chromosomes

In [36]:
# chromosome armsizes

chromosome_armsizes_path = '../arcuda/genome/danRer11.armsizes.txt'
chromosome_armsizes_data = pd.read_csv(chromosome_armsizes_path)
chromosome_armsizes_data = chromosome_armsizes_data.iloc[:-1]
chromosomes = list(chromosome_armsizes_data.chrom.drop_duplicates())

#### Bad bins

In [37]:
# bad bins

bad_bins_path = '../arcuda/genome/blacklist.danrer11-reduced.tsv'
bad_bins_data = pd.read_csv(bad_bins_path, sep='\t')
bad_bins_data.columns = ['ch', 'bgn', 'end']

#### Good bins

In [38]:
# good bins

good_bins_path = '../arcuda/genome/selected.50000-safe.danrer11-reduced.tsv'
good_bins_data = pd.read_csv(good_bins_path, sep='\t')

#### Fountains

In [39]:
fountains_dict = dict.fromkeys(stages_fountains, None)
for stage in stages_fountains:
    fountains_data = pd.read_csv(fountains_path.format(stage), sep='\t', header=None)
    fountains_data.columns = ['ch', 'bgn', 'end', 'unknown', 'sim']
    fountains_dict[stage] = fountains_data

#### Boundaries

In [40]:
boundaries_dict = dict.fromkeys(stages_boundaries, None)
for stage in stages_boundaries:
    boundaries_data = pd.read_csv(boundaries_path.format(stage))
    boundaries_dict[stage] = boundaries_data

## Calculate distance to the nearest boundaries from each fountain

To the left and to the right of the **fountain** (centered in fountains).

If there would be a **fountain or bad bin** between the fountain and nearest boundary -- then put None. Else -- distance to the nearest boundary (with "–" sign in case of left, with "+" sign in case of right).

Visualized in two modes: without `None`'s independently (left and right) and without `None`'s in both directions.

In [50]:
ddd = pd.DataFrame()

In [47]:
def calculate_distance(fountains, boundaries, bad_bins, centered='fountains', mode='independent'):
    
    def check_left(x, fountains, boundaries, bad_bins):
        print('!!!')
        # boundaries
        sub_df_boundaries = boundaries[(boundaries.ch == x.ch) & (boundaries.bgn <= x.bgn) & (boundaries.end <= x.end)]
        sub_df_boundaries = sub_df_boundaries.sort_values('bgn', ascending=True)
        try:
            closest_boundary = (sub_df_boundaries.iloc[-1]['bgn'], sub_df_boundaries.iloc[-1]['end'])
        except:
            closest_boundary = None
        
        # fountains
        sub_df_fountains = fountains[(fountains.ch == x.ch) & (fountains.bgn <= x.bgn) & (fountains.end <= x.end)]
        sub_df_fountains = sub_df_fountains.sort_values('bgn', ascending=True)
        try:
            closest_fountain = (sub_df_fountains.iloc[-1]['bgn'], sub_df_fountains.iloc[-1]['end'])
        except:
            closest_fountain = None
        
        # bad bins
        sub_df_bad_bins = bad_bins[(bad_bins.ch == x.ch) & (bad_bins.bgn <= x.bgn) & (bad_bins.end <= x.end)]
        sub_df_bad_bins = sub_df_bad_bins.sort_values('bgn', ascending=True)
        try:
            closest_bad_bin = (sub_df_bad_bins.iloc[-1]['bgn'], sub_df_bad_bins.iloc[-1]['end'])
        except:
            closest_bad_bin = None
        
        # locus of interest
        locus = (x.bgn, x.end)
        
        if (closest_fountain < locus and closest_fountain > closest_boundary) or (closest_bad_bin < locus and closest_bad_bin > closest_boundary):
            return None
        else:
            return closest_boundary[0] - locus[0]
    
    def check_right(x, fountains, boundaries, bad_bins):
        # boundaries
        sub_df_boundaries = boundaries[(boundaries.ch == x.ch) & (boundaries.bgn >= x.bgn) & (boundaries.end >= x.end)]
        sub_df_boundaries = sub_df_boundaries.sort_values('bgn', ascending=False)
        closest_boundary = (sub_df_boundaries.iloc[-1]['bgn'], sub_df_boundaries.iloc[-1]['end'])
        
        # fountains
        sub_df_fountains = fountains[(fountains.ch == x.ch) & (fountains.bgn >= x.bgn) & (fountains.end >= x.end)]
        sub_df_fountains = sub_df_fountains.sort_values('bgn', ascending=False)
        closest_fountain = (sub_df_fountains.iloc[-1]['bgn'], sub_df_fountains.iloc[-1]['end'])
        
        # bad bins
        sub_df_bad_bins = bad_bins[(bad_bins.ch == x.ch) & (bad_bins.bgn >= x.bgn) & (bad_bins.end >= x.end)]
        sub_df_bad_bins = sub_df_bad_bins.sort_values('bgn', ascending=False)
        closest_bad_bin = (sub_df_bad_bins.iloc[-1]['bgn'], sub_df_bad_bins.iloc[-1]['end'])
        
        # locus of interest
        locus = (x.bgn, x.end)
        
        if (closest_fountain > locus and closest_fountain < closest_boundary) or (closest_bad_bin > locus and closest_bad_bin < closest_boundary):
            return None
        else:
            return closest_boundary[0] - locus[0]
        
        
    if centered == 'fountains':
        fountains['left'] = fountains.apply(check_left, axis=1, fountains=fountains, boundaries=boundaries, bad_bins=bad_bins)
        fountains['right'] = fountains.apply(check_right, axis=1, fountains=fountains, boundaries=boundaries, bad_bins=bad_bins)
        if mode == 'independent':
            return list(fountains[fountains['left'].notnull()]['left']), list(fountains[fountains['right'].notnull()]['right'])
        elif mode == 'overlaped':
            sub = fountains[(fountains['left'].notnull()) & (fountains['right'].notnull())]
            return list(sub['left']), list(sub['right'])
    elif centered == 'boundaries':
        boundaries['left'] = boundaries.apply(check_left, axis=1, fountains=fountains, boundaries=boundaries, bad_bins=bad_bins)
        boundaries['right'] = boundaries.apply(check_right, axis=1, fountains=fountains, boundaries=boundaries, bad_bins=bad_bins)
        if mode == 'independent':
            return list(boundaries[boundaries['left'].notnull()]['left']), list(boundaries[boundaries['right'].notnull()]['right'])
        elif mode == 'overlaped':
            sub = boundaries[(boundaries['left'].notnull()) & (boundaries['right'].notnull())]
            return list(sub['left']), list(sub['right']) 
        

### Independent mode

In [48]:
fountains_centered_independent_mode = pd.DataFrame(index=stages_fountains, columns=stages_boundaries)

In [49]:

for f_stage in stages_fountains:
    for b_stage in stages_boundaries:
        fountains_centered_independent_mode.loc[f_stage, b_stage] = calculate_distance(fountains_dict[f_stage], 
                                                                                       boundaries_dict[b_stage], 
                                                                                       bad_bins_data, 
                                                                                       centered='fountains', 
                                                                                       mode='independent')
    

!!!
!!!
!!!
!!!


IndexError: single positional indexer is out-of-bounds