## Import dependencies

In [43]:
import gc
import sys
import time
import warnings
import logging
import random
import json
from scipy import stats
from os.path import join, basename, splitext, isfile
from os import listdir
from collections import OrderedDict

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import cooler
import cooltools

import hichew
from hichew.calling import boundaries, domains, clusters
from hichew.compute import normalize, d_scores, insulation_scores, silhouette
from hichew.loader import cool_files
from hichew.plot import clusters_dynamics, viz_opt_curves, viz_tads, _pca, _tsne


In [44]:
import logging
import time
import operator
import os
import sys
import warnings

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, AffinityPropagation, MeanShift

from hichew.lib import utils

In [45]:
# Apply the seaborn 'paper' context with the 'whitegrid' style to all later plots.
sns.set(context='paper', style='whitegrid')
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation warnings from pandas and other libraries.
warnings.filterwarnings("ignore")

In [46]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Specify parameters

In [47]:
# Resolution handed to the cooler loader when the .mcool files are opened.
resolution = 10000

# Path templates; `{}` is replaced with a stage name via str.format.
fountains_path = '../arcuda/fountains/{}.bed'
boundaries_path = '../arcuda/boundaries/filtered/{}.csv'
coolers_path = '../arcuda/coolers/This2022/{}.mcool'

# Every stage name ends with the same suffix, so only the distinguishing
# prefixes are spelled out below.
_stage_suffix = '.danrer11-reduced.mapq_30.1000'

# Stages that have fountain calls.
stages_fountains = [
    prefix + _stage_suffix
    for prefix in ('Wild-Type_2.75', 'WT', 'TR', 'Wild-Type_11', 'Wild-Type_25')
]

# Stages that have boundary calls.
stages_boundaries = [
    prefix + _stage_suffix
    for prefix in ('sperm', 'Wild-Type_2.75', 'WT', 'TR',
                   'Wild-Type_5.3', 'Wild-Type_11', 'Wild-Type_25')
]

# Full stage list: same entries and order as the boundary stages, kept as an
# independent list object.
stages_all = list(stages_boundaries)


## Get data

#### Chromosomes

In [48]:
# Chromosome arm sizes: read the table and derive the list of chromosome names.

chromosome_armsizes_path = '../arcuda/genome/danRer11.armsizes.txt'
# The last row of the table is discarded
# (presumably a non-chromosome entry such as chrM -- TODO confirm).
chromosome_armsizes_data = pd.read_csv(chromosome_armsizes_path).iloc[:-1]
# Unique chromosome names, in order of first appearance.
chromosomes = chromosome_armsizes_data['chrom'].drop_duplicates().tolist()

#### Boundaries

In [49]:
# Map every boundary stage to the table read from its CSV file.
boundaries_dict = {
    stage: pd.read_csv(boundaries_path.format(stage))
    for stage in stages_boundaries
}

#### Fountains

In [55]:
# For every fountain stage: read the headerless, tab-separated BED table, name
# its five columns, and copy the per-chromosome `bs_threshold` / `window`
# values over from the boundary table of the same stage.
fountains_dict = {}
for stage in stages_fountains:
    fountains_data = pd.read_csv(fountains_path.format(stage), sep='\t', header=None)
    fountains_data.columns = ['ch', 'bgn', 'end', 'unknown', 'sim']
    stage_boundaries = boundaries_dict[stage]  # only read from, never modified

    # Start with empty columns; rows on chromosomes outside `chromosomes` stay None.
    for column in ('bs_threshold', 'window'):
        fountains_data[column] = None

    for ch in chromosomes:
        ch_boundaries = stage_boundaries[stage_boundaries.ch == ch]
        in_chromosome = fountains_data.ch == ch
        # The value is taken from the first boundary row of the chromosome
        # (assumed constant within a chromosome -- TODO confirm).
        fountains_data.loc[in_chromosome, 'bs_threshold'] = ch_boundaries['bs_threshold'].iloc[0]
        fountains_data.loc[in_chromosome, 'window'] = ch_boundaries['window'].iloc[0]

    fountains_dict[stage] = fountains_data

## Calculate insulation score for each set of boundaries for all stages

E.g., for boundaries called at the 25 hrs stage, we calculate the insulation score at the corresponding loci for every stage listed in the variable `stages_boundaries`.

In [28]:
# Path template for the per-stage boundary CSV files; `{}` stands for the stage name.
fname = '../arcuda/clustering/insulation/boundaries/{}.csv'

In [29]:
# Outer loop: the stage whose cooler data is loaded (one .mcool file at a time).
# Inner loop: the stage whose boundary table is scored against that data.
for stage in stages_boundaries:
    print(stage)
    matrices, coolers = cool_files(coolers_path.format(stage), resolution=resolution, chromnames=chromosomes)
    for called_on in stages_boundaries:
        # The helper works on a copy, and its result is copied again before it
        # replaces the stored table.
        scored = insulation_scores(
            boundaries_dict[called_on].copy(),
            coolers,
            stages=[stage],
            chromnames=chromosomes,
        )
        boundaries_dict[called_on] = scored.copy()
    # Drop the loaded data before the next stage is read.
    del matrices, coolers
    gc.collect()

INFO:root:LOADER|COOL_FILES| List of coolfiles of interest: ['../arcuda/coolers/This2022/sperm.danrer11-reduced.mapq_30.1000.mcool']
INFO:root:LOADER|COOL_FILES| Start loading coolfiles...


sperm.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 29s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 2s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 4s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 55s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 52s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 0s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 47

Wild-Type_2.75.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 32s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 32s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 23s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 10s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 52s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 17s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m

WT.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 22s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 6s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 5s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 55s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 57s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 3s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 47

TR.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 29s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 6s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 11s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 3s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 18s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 39s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 5

Wild-Type_5.3.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 21s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 60s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 7s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 57s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 55s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 2s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 1

Wild-Type_11.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 21s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 1s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 11s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 58s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 55s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 11s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 

Wild-Type_25.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 25s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 59s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 56s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 51s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 50s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 59s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m

In [30]:
# The destination is pinned inside this cell instead of being read from the
# shared `fname` global: `fname` is rebound to the fountains template further
# down, so re-running this cell out of order would write boundary tables over
# the fountains CSVs (the fountain stages are a subset of the boundary stages).
boundaries_insulation_path = '../arcuda/clustering/insulation/boundaries/{}.csv'

# Make sure the output directory exists so to_csv does not fail on a fresh checkout.
os.makedirs(os.path.dirname(boundaries_insulation_path), exist_ok=True)

# One CSV per stage; the row index is not written.
for stage in stages_boundaries:
    boundaries_dict[stage].to_csv(boundaries_insulation_path.format(stage), index=False)

## Calculate insulation score for each set of fountains for all stages

E.g., for fountains called at the 25 hrs stage, we calculate the insulation score at the corresponding loci for every stage listed in the variable `stages_fountains`. For fountains we use the same window size as for boundaries on each chromosome. (NB: is another approach required, e.g. calculating the Scharr score for each stage?)

In [58]:
# Path template for the per-stage fountain CSV files; `{}` stands for the stage name.
fname = '../arcuda/clustering/insulation/fountains/{}.csv'

In [59]:
# Outer loop: the stage whose cooler data is loaded (one .mcool file at a time).
# Inner loop: the stage whose fountain table is scored against that data.
for stage in stages_fountains:
    print(stage)
    matrices, coolers = cool_files(coolers_path.format(stage), resolution=resolution, chromnames=chromosomes)
    for called_on in stages_fountains:
        # The helper works on a copy, and its result is copied again before it
        # replaces the stored table.
        scored = insulation_scores(
            fountains_dict[called_on].copy(),
            coolers,
            stages=[stage],
            chromnames=chromosomes,
        )
        fountains_dict[called_on] = scored.copy()
    # Drop the loaded data before the next stage is read.
    del matrices, coolers
    gc.collect()

INFO:root:LOADER|COOL_FILES| List of coolfiles of interest: ['../arcuda/coolers/This2022/Wild-Type_2.75.danrer11-reduced.mapq_30.1000.mcool']
INFO:root:LOADER|COOL_FILES| Start loading coolfiles...


Wild-Type_2.75.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 26s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 35s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 6s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 3s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 34s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 10s
INFO:root:LOADER|COOL_FILES| List of coolfiles of interest: ['../arcuda/coolers/This2022/WT.danrer11-reduced.mapq_30.1000.mcool']
INFO:root:LOADER|COOL_FIL

WT.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 26s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 28s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 17s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 5s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 24s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 6s
INFO:root:LOADER|COOL_FILES| List of coolfiles of interest: ['../arcuda/coolers/This2022/TR.danrer11-reduced.mapq_30.1000.mcool']
INFO:root:LOADER|COOL_FIL

TR.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 32s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 26s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 14s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 24s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 15s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 20s
INFO:root:LOADER|COOL_FILES| List of coolfiles of interest: ['../arcuda/coolers/This2022/Wild-Type_11.danrer11-reduced.mapq_30.1000.mcool']
INFO:root:LOA

Wild-Type_11.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 26s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 20s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 8s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 3s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 13s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 11s
INFO:root:LOADER|COOL_FILES| List of coolfiles of interest: ['../arcuda/coolers/This2022/Wild-Type_25.danrer11-reduced.mapq_30.1000.mcool']
INFO:root:LOADE

Wild-Type_25.danrer11-reduced.mapq_30.1000


INFO:root:LOADER|COOL_FILES| Loading completed in 0m 26s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 14s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 57s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 0m 54s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 8s
INFO:root:COMPUTE|INSULATION_SCORES| Start computing insulation scores...
INFO:root:COMPUTE|INSULATION_SCORES| Complete computing insulation scores in 1m 5s


In [60]:
# The destination is pinned inside this cell instead of being read from the
# shared `fname` global: `fname` also holds the boundaries template earlier in
# the notebook, so running this cell while that value is still bound would
# write fountain tables over the boundary CSVs of the same stages.
fountains_insulation_path = '../arcuda/clustering/insulation/fountains/{}.csv'

# Make sure the output directory exists so to_csv does not fail on a fresh checkout.
os.makedirs(os.path.dirname(fountains_insulation_path), exist_ok=True)

# One CSV per stage; the row index is not written.
for stage in stages_fountains:
    fountains_dict[stage].to_csv(fountains_insulation_path.format(stage), index=False)