## Import dependecies

In [1]:
import sys
import time
import warnings
import logging
import random
import json
from os.path import join, basename, splitext, isfile
from os import listdir

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import cooler
import cooltools

import hichew
from hichew.calling import boundaries, domains, clusters
from hichew.compute import normalize, d_scores, insulation_scores, silhouette
from hichew.loader import cool_files
from hichew.plot import clusters_dynamics, viz_opt_curves, viz_tads, _pca, _tsne


  import pandas.util.testing as tm


In [2]:
import logging
import time
import operator
import os
import sys
import warnings

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, AffinityPropagation, MeanShift

from hichew.lib import utils

In [3]:
sns.set(context='paper', style='whitegrid')
warnings.filterwarnings("ignore")

In [4]:
%matplotlib inline

## Specify parameters

In [113]:
resolution = 10000

fountains_path = '../arcuda/fountains/{}.bed'
boundaries_path = '../arcuda/boundaries/{}.csv'
coolers_path = '../arcuda/coolers/This2022/{}.mcool'

stages_fountains = ['Wild-Type_2.75.danrer11-reduced.mapq_30.1000', 
                    'WT.danrer11-reduced.mapq_30.1000', 
                    'TR.danrer11-reduced.mapq_30.1000', 
                    'Wild-Type_11.danrer11-reduced.mapq_30.1000', 
                    'Wild-Type_25.danrer11-reduced.mapq_30.1000']

stages_boundaries = ['sperm.danrer11-reduced.mapq_30.1000', 
                     'Wild-Type_2.75.danrer11-reduced.mapq_30.1000', 
                     'WT.danrer11-reduced.mapq_30.1000', 
                     'TR.danrer11-reduced.mapq_30.1000', 
                     'Wild-Type_5.3.danrer11-reduced.mapq_30.1000', 
                     'Wild-Type_11.danrer11-reduced.mapq_30.1000', 
                     'Wild-Type_25.danrer11-reduced.mapq_30.1000']

stages_all = ['sperm.danrer11-reduced.mapq_30.1000', 
              'Wild-Type_2.75.danrer11-reduced.mapq_30.1000', 
              'WT.danrer11-reduced.mapq_30.1000', 
              'TR.danrer11-reduced.mapq_30.1000', 
              'Wild-Type_5.3.danrer11-reduced.mapq_30.1000', 
              'Wild-Type_11.danrer11-reduced.mapq_30.1000', 
              'Wild-Type_25.danrer11-reduced.mapq_30.1000']

################
i = 6 ##########
################

stage_to_call = coolers_path.format(stages_all[i])

In [114]:
stages_all[i]

'Wild-Type_25.danrer11-reduced.mapq_30.1000'

## Get data

#### Chromosomes

In [115]:
# chromosome armsizes

chromosome_armsizes_path = '../arcuda/genome/danRer11.armsizes.txt'
chromosome_armsizes_data = pd.read_csv(chromosome_armsizes_path)
chromosome_armsizes_data = chromosome_armsizes_data.iloc[:-1]
chromosomes = list(chromosome_armsizes_data.chrom.drop_duplicates())

#### Bad bins

In [116]:
# bad bins

bad_bins_path = '../arcuda/genome/blacklist.danrer11-reduced.tsv'
bad_bins_data = pd.read_csv(bad_bins_path, sep='\t')

#### Good bins

In [117]:
# good bins

good_bins_path = '../arcuda/genome/selected.50000-safe.danrer11-reduced.tsv'
good_bins_data = pd.read_csv(good_bins_path, sep='\t')

In [118]:
good_bins_data.head()

Unnamed: 0,chrom,start,end
0,chr1,0,10000
1,chr1,10000,20000
2,chr1,20000,30000
3,chr1,30000,40000
4,chr1,40000,50000


#### Coolers

In [119]:
# matrices, coolers = cool_files(stage_to_call, resolution=resolution, chromnames=chromosomes)

#### Fountains

In [120]:
if isfile(fountains_path.format(stages_all[i])):
    fountains_data = pd.read_csv(fountains_path.format(stages_all[i]), sep='\t', header=None)
    fountains_data.columns = ['chrom', 'start', 'end', 'unknown', 'sim']
else:
    print('No fountains for stage {}'.format(stages_all[i]))

In [121]:
fountains_data.head()

Unnamed: 0,chrom,start,end,unknown,sim
0,chr7,31550000,31560000,42430,0.409056
1,chr2,34030000,34040000,9361,0.392029
2,chr23,23080000,23090000,124242,0.392027
3,chr7,21720000,21730000,41447,0.390719
4,chr4,14560000,14570000,19642,0.374675


#### Boundaries

In [122]:
if isfile(boundaries_path.format(stages_all[i])):
    boundaries_data = pd.read_csv(boundaries_path.format(stages_all[i]))
else:
    print('No boundaries for stage {}'.format(stages_all[i]))

In [123]:
boundaries_data.head()

Unnamed: 0,bgn,end,bs_threshold,window,ch,insulation_score,boundary_strength
0,110000,120000,0.9,160000,chr1,-0.541533,0.750213
1,620000,630000,0.9,160000,chr1,-0.449879,0.651884
2,1080000,1090000,0.9,160000,chr1,-0.71708,1.004747
3,1340000,1350000,0.9,160000,chr1,-0.809942,1.241479
4,1630000,1640000,0.9,160000,chr1,-0.717207,0.968238


## Fountains filtration (sanity check)

In [124]:
def check_fountains_filtration(fountains_data, good_bins_data):
    good_bins_data_array = good_bins_data.values
    fountains_data_array = fountains_data[['chrom', 'start', 'end']].values
    bool_mask = [True if x in good_bins_data_array else False for x in fountains_data_array]
    if len(bool_mask) == sum(bool_mask):
        print('OK')
    else:
        print('NOT OK')

In [125]:
if isfile(fountains_path.format(stages_all[i])):
    check_fountains_filtration(fountains_data, good_bins_data)
else:
    print('No fountains for stage {}'.format(stages_all[i]))

OK


## Boundaries filtration

In [126]:
def filtrate_boundaries(boundaries_data, good_bins_data):
    def is_good(x, gbda):
        return [x.ch, x.bgn, x.end] in gbda
    
    good_bins_data_array = good_bins_data.values
    boundaries_data_filtered = boundaries_data.loc[boundaries_data.apply(is_good, axis=1, gbda=good_bins_data_array)].copy()
    boundaries_data_filtered = boundaries_data_filtered.reset_index(drop=True)
    return boundaries_data_filtered

In [127]:
if isfile(boundaries_path.format(stages_all[i])):
    boundaries_data_filtered = filtrate_boundaries(boundaries_data, good_bins_data)
else:
    print('No boundaries for stage {}'.format(stages_all[i]))

## Boundaries filtration (sanity check)

In [128]:
def check_boundaries_filtration(boundaries_data_filtered, good_bins_data):
    good_bins_data_array = good_bins_data.values
    boundaries_data_array = boundaries_data_filtered[['ch', 'bgn', 'end']].values
    bool_mask = [True if x in good_bins_data_array else False for x in boundaries_data_array]
    if len(bool_mask) == sum(bool_mask):
        print('OK')
        boundaries_data_filtered.to_csv('../arcuda/boundaries/filtered/{}.csv'.format(stages_all[i]), index=None)
    else:
        print('NOT OK')

In [129]:
if isfile(boundaries_path.format(stages_all[i])):
    check_boundaries_filtration(boundaries_data_filtered, good_bins_data)
else:
    print('No boundaries for stage {}'.format(stages_all[i]))

OK


In [130]:
boundaries_data_filtered.shape

(1614, 7)