## Import dependencies

In [81]:
import gc
import sys
import time
import warnings
import logging
import random
import json
from scipy import stats
from os.path import join, basename, splitext, isfile
from os import listdir
from collections import OrderedDict

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import cooler
import cooltools

import hichew
from hichew.calling import boundaries, domains, clusters
from hichew.compute import normalize, d_scores, insulation_scores, silhouette
from hichew.loader import cool_files
from hichew.plot import clusters_dynamics, viz_opt_curves, viz_tads, _pca, _tsne


In [82]:
import logging
import time
import operator
import os
import sys
import warnings

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, AffinityPropagation, MeanShift

from hichew.lib import utils

In [83]:
# Notebook-wide plot styling: seaborn "paper" context on a white grid.
sns.set(style='whitegrid', context='paper')
# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

In [84]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [85]:
def _insulation_scores_(df_b, df, coolers, stages, chromnames=None, ignore_diags=2):
    """
    Function to compute insulation scores to perform clustering.
    :param df: dataframe with TAD boundaries annotation
    :param coolers: :param coolers: python dictionary with cooler files that correspond to selected stages of development.
    :param stages: list of developmental stages.
    :param chromnames: list of chromosomes of interest. If None -- all chromosomes will be considered.
    :param ignore_diags: parameter for cooltools calculate_insulation_score method to ignore first K diagonals while computing insulation diamond.
    :return: adjusted dataframe with insulation scores computed for each stage.
    """

    logging.info("COMPUTE|INSULATION_SCORES| Start computing insulation scores...")
    in_time = time.time()
    df = df.copy()
    df.index = df[['chrom', 'start', 'end']]
    
    if chromnames:
        chrms = chromnames
    else:
        chrms = list(coolers.values())[0].chromnames

    for stage in stages:
        ins_scores = pd.DataFrame(
            columns=['chrom', 'start', 'end', 'is_bad_bin', 'log2_insulation_score', 'n_valid_pixels'])
        for ch in chrms:
            opt_window_ch = df_b.query("ch=='{}'".format(ch))['window'].iloc[0]
            sub_df = cooltools.insulation.calculate_insulation_score(coolers[stage], int(opt_window_ch),
                                                                     ignore_diags=ignore_diags, chromosomes=[ch])
            sub_df.rename(columns={'log2_insulation_score_{}'.format(int(opt_window_ch)): 'log2_insulation_score',
                                   'n_valid_pixels_{}'.format(int(opt_window_ch)): 'n_valid_pixels'}, inplace=True)
            ins_scores = pd.concat([ins_scores, sub_df])
        ins_scores.reset_index(drop=True, inplace=True)
        ins_scores.index = ins_scores[['chrom', 'start', 'end']]
        
        df['ins_score_{}'.format(stage)] = ins_scores['log2_insulation_score']

    df.index = list(range(df.shape[0]))
    segmentation = df.dropna(axis=0, subset=['ins_score_{}'.format(x) for x in stages]).reset_index(drop=True)
    time_elapsed = time.time() - in_time
    logging.info(
        "COMPUTE|INSULATION_SCORES| Complete computing insulation scores in {:.0f}m {:.0f}s".format(time_elapsed // 60,
                                                                                               time_elapsed % 60))
    return segmentation

## Specify parameters

In [86]:
# Resolution handed to the cooler loader (presumably the bin size in bp -- confirm).
resolution = 10000

# Path templates; '{}' is filled with a sample name from the lists below.
fountains_path = '../arcuda/fountains/{}.bed'
boundaries_path = '../arcuda/boundaries/filtered/{}.csv'
coolers_path = '../arcuda/coolers/This2022/{}.mcool'

# Samples listed for fountains.
stages_fountains = [
    'Wild-Type_2.75.danrer11-reduced.mapq_30.1000',
    'WT.danrer11-reduced.mapq_30.1000',
    'TR.danrer11-reduced.mapq_30.1000',
    'Wild-Type_11.danrer11-reduced.mapq_30.1000',
    'Wild-Type_25.danrer11-reduced.mapq_30.1000',
]

# Samples listed for boundaries.
stages_boundaries = [
    'sperm.danrer11-reduced.mapq_30.1000',
    'Wild-Type_2.75.danrer11-reduced.mapq_30.1000',
    'WT.danrer11-reduced.mapq_30.1000',
    'TR.danrer11-reduced.mapq_30.1000',
    'Wild-Type_5.3.danrer11-reduced.mapq_30.1000',
    'Wild-Type_11.danrer11-reduced.mapq_30.1000',
    'Wild-Type_25.danrer11-reduced.mapq_30.1000',
]

# The full sample list has the same members as the boundary list; kept as a separate list object.
stages_all = list(stages_boundaries)



## Get data

#### Chromosomes

In [87]:
# Chromosome arm sizes table; the final row of the file is discarded.

chromosome_armsizes_path = '../arcuda/genome/danRer11.armsizes.txt'
chromosome_armsizes_data = pd.read_csv(chromosome_armsizes_path).iloc[:-1]
# Unique chromosome names, in order of first appearance in the table.
chromosomes = chromosome_armsizes_data['chrom'].drop_duplicates().tolist()

#### Good bins

In [88]:
# Tab-separated table of selected bins (chrom / start / end).
good_bins_path = '../arcuda/genome/selected.50000-safe.danrer11-reduced.tsv'
good_bins_data = pd.read_csv(
    good_bins_path,
    sep='\t',
)

In [89]:
# Preview the table of good bins (the rendered output elides the middle rows).
good_bins_data

Unnamed: 0,chrom,start,end
0,chr1,0,10000
1,chr1,10000,20000
2,chr1,20000,30000
3,chr1,30000,40000
4,chr1,40000,50000
...,...,...,...
84678,chr25,37400000,37410000
84679,chr25,37410000,37420000
84680,chr25,37420000,37430000
84681,chr25,37430000,37440000


#### Boundaries

In [90]:
# Boundary tables keyed by stage; every slot starts as None and is filled from that stage's CSV.
boundaries_dict = {name: None for name in stages_boundaries}
for stage in stages_boundaries:
    boundaries_data = pd.read_csv(boundaries_path.format(stage))
    boundaries_dict.update({stage: boundaries_data})

In [91]:
# Each stage starts from its own independent copy of the good-bins table.
boundaries_dict_all = dict.fromkeys(stages_boundaries, None)
for stage in boundaries_dict_all:
    boundaries_dict_all[stage] = good_bins_data.copy()

## Calculate insulation score for each set of boundaries for all stages

For example, for the boundaries called at the 25 hrs stage, we calculate the insulation score at the corresponding loci for every stage listed in the variable `stages_boundaries`.

In [94]:
# Output path template for the result tables; '{}' is filled with a stage (or table) name.
fname = '../arcuda/clustering/insulation_all/{}.csv'

In [93]:
# Outer loop: load the Hi-C map of one stage at a time.
# Inner loop: score the bins of every boundary set against that map, using the
# windows stored in that boundary set, and keep the accumulated table.
for stage in stages_boundaries:
    print(stage)
    matrices, coolers = cool_files(coolers_path.format(stage), resolution=resolution, chromnames=chromosomes)
    for st in stages_boundaries:
        boundaries_dict_all[st] = _insulation_scores_(
            boundaries_dict[st].copy(),
            boundaries_dict_all[st].copy(),
            coolers,
            stages=[stage],
            chromnames=chromosomes,
        ).copy()
    # Release the loaded maps before moving on to the next stage.
    del matrices, coolers
    gc.collect()

INFO:root:LOADER|COOL_FILES| List of coolfiles of interest: ['../arcuda/coolers/This2022/sperm.danrer11-reduced.mapq_30.1000.mcool']
INFO:root:LOADER|COOL_FILES| Start loading coolfiles...


sperm.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 27s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 48s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 52s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 30s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 28s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 35s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m

Wild-Type_2.75.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 20s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 16s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 51s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 29s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 26s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 34s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m

WT.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 22s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 51s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 55s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 31s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 30s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 52s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m

TR.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 31s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 60s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 4s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 38s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 35s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 44s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 

Wild-Type_5.3.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 18s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 42s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 52s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 33s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 26s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 31s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m

Wild-Type_11.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 19s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 50s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 60s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 31s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 27s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 37s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m

Wild-Type_25.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 20s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 37s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 41s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 24s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 23s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 28s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m

In [95]:
# Write one CSV per stage, without the row index, to the location given by `fname`.
for stage in stages_boundaries:
    out_path = fname.format(stage)
    boundaries_dict_all[stage].to_csv(out_path, index=None)

In [99]:
# Empty chromosome-by-stage table to hold the window value of each pair.
opt_windows = pd.DataFrame(columns=stages_boundaries, index=chromosomes)

In [100]:
# For each stage and chromosome, record the 'window' of the first boundary row of that chromosome.
for stage in stages_boundaries:
    stage_boundaries_table = boundaries_dict[stage]
    for ch in chromosomes:
        ch_rows = stage_boundaries_table.query("ch=='{}'".format(ch))
        opt_window_ch = ch_rows['window'].iloc[0]
        opt_windows.loc[ch, stage] = opt_window_ch

In [102]:
# Save the window table next to the per-stage tables, as 'insulation_window_values.csv' (row index kept).
opt_windows.to_csv(fname.format('insulation_window_values'))

In [107]:
# Export, for every boundary set, one headerless tab-separated file per scored stage
# with the columns chrom, start, end, ins_score_<st>.
# NOTE(review): the tables are read from '../data/insulation_all/' whereas this notebook
# writes them through `fname` to '../arcuda/clustering/insulation_all/' -- confirm that
# both locations hold the same files.
for stage in stages_boundaries:
    stage_df = pd.read_csv('../data/insulation_all/{}.csv'.format(stage))
    # to_csv does not create missing directories, so make sure the per-stage folder exists.
    os.makedirs('../data/insulation_all/bw/{}'.format(stage), exist_ok=True)
    for st in stages_boundaries:
        stage_df[['chrom', 'start', 'end', 'ins_score_{}'.format(st)]].to_csv('../data/insulation_all/bw/{}/{}.bed'.format(stage, st), sep='\t', header=None, index=None)
        
        