In [None]:
# # set the number of threads for many common libraries
# from os import environ
# N_THREADS = '1'
# environ['OMP_NUM_THREADS'] = N_THREADS
# environ['OPENBLAS_NUM_THREADS'] = N_THREADS
# environ['MKL_NUM_THREADS'] = N_THREADS
# environ['VECLIB_MAXIMUM_THREADS'] = N_THREADS
# environ['NUMEXPR_NUM_THREADS'] = N_THREADS
# # https://superfastpython.com/numpy-number-blas-threads/

In [None]:
import pandas as pd
import numpy as np
from itertools import chain

# Hi-C utilities imports:
import cooler
import bioframe
import cooltools
from cooltools.lib.numutils import fill_diag

# Visualization imports:
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import matplotlib.patches as patches
from matplotlib.ticker import EngFormatter

from itertools import cycle

# from ipywidgets import interact, fixed

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

### Import modified "guts" of the dotfinder submodule from `helper_func` file

In [None]:
from helper_func import draw_kernel

# turns out still need some of the dotfinder guts in here
from cooltools.api.dotfinder import bp_to_bins, generate_tiles_diag_band
from cooltools.lib.numutils import LazyToeplitz
from cooltools.lib.common import assign_regions

from datashader.mpl_ext import dsshow, alpha_colormap
import datashader as ds
import datashader.transfer_functions as tf
from functools import partial
from data_catalog import bws, bws_vlim, telo_dict


### pick a dataset and binsize to work on ...

In [None]:
# # ! pip install --upgrade --no-cache --no-deps --ignore-install cooler
# # ls /home/dekkerlab/dots-test
# # import higlass as hg
# import jscatter
import scipy
import logging
import multiprocess as mp
# import mpire for nested multi-processing
from mpire import WorkerPool

In [None]:
# ! ls /data/proj_sync/data_ranger/finalcoolers_mega/ranGAP1-aux-G1s-MEGA.hg38.mapq_30.1000.mcool

# "Dots" are clearly visible at 10 kb, so this bin size is used throughout:
binsize = 10_000
# open every sample listed in `telo_dict` at that resolution, keyed by sample name:
telo_clrs = dict(
    (sample, cooler.Cooler(f"{uri}::/resolutions/{binsize}"))
    for sample, uri in telo_dict.items()
)


### pick a region to work on ...

In [None]:
# Use bioframe to fetch the genomic features from the UCSC.
hg38_chromsizes = bioframe.fetch_chromsizes('hg38')
hg38_cens = bioframe.fetch_centromeres('hg38')
hg38_arms_full = bioframe.make_chromarms(hg38_chromsizes, hg38_cens)
# # remove "bad" chromosomes and near-empty arms ...
# excluded_arms = ["chr13_p", "chr14_p", "chr15_p", "chr21_p", "chr22_p", "chrM_p", "chrY_p", "chrY_q", "chrX_p", "chrX_q"]
# hg38_arms = hg38_arms_full[~hg38_arms_full["name"].isin(excluded_arms)].reset_index(drop=True)

# can do 1 chromosome (or a few arms) as well, e.g.:
# included_arms = ["chr1_q", "chr2_p", "chr4_q", "chr6_q"]
# (this used to be a live assignment that was immediately overwritten below)

# take the first 44 arm names of the table - all autosomal ones ...
# NOTE(review): relies on the arms table listing chr1..chr22 p/q arms first - confirm
included_arms = hg38_arms_full["name"].to_list()[:44]
hg38_arms = hg38_arms_full[hg38_arms_full["name"].isin(included_arms)].reset_index(drop=True)

### pre-calculate expected for the cooler ...

In [None]:
def _job(packed_data, sample):
    # packed data -> exp_kwargs and a dict with coolers for each sample
    exp_kwargs, clr_dict = packed_data
    _clr = clr_dict[sample]
    # in order to use spawn/forkserver we have to import for worker
    from cooltools import expected_cis
    _exp = expected_cis( _clr, **exp_kwargs)
    return (sample, _exp)

# keyword arguments forwarded to `expected_cis` for every sample:
exp_kwargs = {
    "view_df": hg38_arms,
    "intra_only": False,
    "nproc": 12,
}

# daemon=False is required, because _job is multiprocessing-based already
# (each of the 8 workers calls expected_cis with nproc=12) ...
with WorkerPool(
    n_jobs=8,
    daemon=False,
    shared_objects=(exp_kwargs, telo_clrs),
    start_method="forkserver",  # little faster than spawn, fork is the fastest
    use_dill=True,
) as wpool:
    # iterating over `telo_clrs` hands one sample name to each task
    results = wpool.map(_job, telo_clrs, progress_bar=True)

# every result is a (sample, expected) pair - collect them into a dict ...
telo_exps_cis = dict(results)
# # old way of doing it
# telo_exps_cis = {k: cooltools.expected_cis( _clr, **exp_kwargs) for k, _clr in telo_clrs.items()}

# generate custom kernels - similar to original from hiccups

In [None]:
# define stripy kernels for small compartment detection ...
def get_stripy_kernels_new(halfwidth):
    """
    Build square boolean "stripy" kernels: a central block and the two pairs
    of blocks flanking it horizontally and vertically.

    Parameters
    ----------
    halfwidth : int
        half width of the kernel; the kernel side is `2*halfwidth + 1`,
        i.e. always an odd number in both dimensions.

    Returns
    -------
    dict
        dictionary with boolean 2D kernels of shape (kwidth, kwidth):
        - 'mid'            : central block (middle rows and middle columns)
        - f'v{halfwidth}'  : left and right blocks within the middle rows
        - f'h{halfwidth}'  : top and bottom blocks within the middle columns
    """
    # kernel width defined - odd dimensions ...
    kwidth = 2*halfwidth + 1
    # define stripe width - a third of the kernel
    stripe_width = kwidth // 3
    # pixels with |coordinate| <= inner belong to the middle band
    inner = halfwidth - stripe_width

    # integer grid of coordinates from -h to +h (x along columns, y along rows)
    coords = np.arange(-halfwidth, halfwidth + 1)
    x, y = np.meshgrid(coords, coords)

    x_mid = np.abs(x) <= inner
    y_mid = np.abs(y) <= inner

    return {
        'mid': x_mid & y_mid,
        f'v{halfwidth}': ~x_mid & y_mid,
        f'h{halfwidth}': ~y_mid & x_mid,
    }

In [None]:
# draw_kernel??

In [None]:
# stripy kernels of three different sizes (half widths of 3, 7 and 10 pixels)
k4 = get_stripy_kernels_new(halfwidth=3)
k7 = get_stripy_kernels_new(halfwidth=7)
kl = get_stripy_kernels_new(halfwidth=10)


# draw them on a grid: one row per kernel size, one column per kernel type
_kernel_sets = [k4, k7, kl]
_n_cols, _n_rows = len(k4), len(_kernel_sets)
fig, axs = plt.subplots(
    ncols=_n_cols,
    nrows=_n_rows,
    figsize=(_n_cols*2.5, _n_rows*2.5),
    squeeze=False,
)
for ax_row, ks in zip(axs, _kernel_sets):
    for ax, (ktype, kernel) in zip(ax_row, ks.items()):
        imk = draw_kernel(kernel, ax, kernel_name=ktype,cmap="plasma")

## Work on a particular clr/exp pair - mostly the 5hr sample ...

In [None]:
# Sample used for the figure: its cooler and the matching cis-expected table.
clr = telo_clrs["p5hR1R2"]
exp = telo_exps_cis["p5hR1R2"]
# index expected by (region1, region2) so that per-region expected can be pulled out with .loc
exp_indexed = exp.set_index(["region1", "region2"])

# Working on a Figure here - just load the calls:
 - enriched pixels
 - clustered pixels
 - existing anchors ...
...

In [None]:
# id_anchor_fnames = {
#     "mega_2X_enrichment": "ID_anchors/mega_2X_enrichment.fourth_mega.max_size.bed",
#     "5hr_2X_enrichment_old": "ID_anchors/5hr_2X_enrichment.second_bulk.max_size.bed",
#     "5hr_2X_enrichment": "ID_anchors/5hr_2X_enrichment.pixel_derived.bed",
#     "5hr_2X_enrichment_nosing": "ID_anchors/5hr_2X_enrichment.pixel_derived.no_singletons.bed",
#     "5hr_notinCyto_2X_enrichment_signal": "ID_anchors/p5notin_pCyto_anchors_2X_enrichment.pixel_derived.signal_peaks.bed",
#     "5hr_2X_enrichment_signal": "ID_anchors/5hr_2X_enrichment.pixel_derived.signal_peaks.bed",
#     "10hr_2X_enrichment_signal": "ID_anchors/10hrs_2X_enrichment.pixel_derived.signal_peaks.bed",
#     "N93p5_2X_enrichment_signal": "ID_anchors/N93p5_2X_enrichment.pixel_derived.signal_peaks.bed",
#     "pCyto_2X_enrichment_signal": "ID_anchors/pCyto_2X_enrichment.pixel_derived.signal_peaks.bed",
#     "mCyto_2X_enrichment_signal": "ID_anchors/mCyto_2X_enrichment.pixel_derived.signal_peaks.bed",
#     "mega_3X_enrichment": "ID_anchors/mega_3X_enrichment.fifth_mega3x.max_size.bed",
#     "MEGA_2X_enrichment": "ID_anchors/MEGAp5_2X_enrichment.pixel_derived.signal_peaks.bed",
#     "MEGA_weaker_2X_enrichment": "ID_anchors/MEGA_plus_weak_anchors_2X_enrichment.pixel_derived.signal_peaks.bed",
#     "MEGAN93_2X_enrichment": "ID_anchors/MEGAN93p5_2X_enrichment.pixel_derived.signal_peaks.bed",
#     "MEGAminus_2X_enrichment": "ID_anchors/MEGA_minus_ctrl_2X_enrichment.pixel_derived.signal_peaks.bed",
#     "cyto_2x_enrichment": "ID_anchors/cyto_2x_enrichment.third_mCyto.max_size.bed",
# }

# id_anchors_dict = {}
# for id_name, fname in id_anchor_fnames.items():
#     id_anchors_dict[id_name] = pd.read_csv(fname, sep="\t")
#     # ...
#     print(f"loaded {len(id_anchors_dict[id_name]):5d} ID anchors {id_name:>20} in BED format ...")


# # _anchors5 = id_anchors_dict["MEGA_2X_enrichment"]
# _anchors5 = id_anchors_dict["5hr_2X_enrichment_signal"]
# _anchorsCyto = id_anchors_dict["pCyto_2X_enrichment_signal"]

In [None]:
! ls native_comps_10kb

In [None]:
! ls enriched_pixels_10kb

In [None]:
def _load_tables(fnames, description, fmt):
    """
    Read tab-separated tables into a dictionary and report their sizes.

    Parameters
    ----------
    fnames : dict
        table name -> path of a tab-separated file.
    description : str
        what the rows are, used only in the printed message.
    fmt : str
        format label (e.g. "BEDPE", "BED"), used only in the printed message.

    Returns
    -------
    dict
        table name -> pandas DataFrame, in the order of `fnames`.
    """
    tables = {}
    for name, fname in fnames.items():
        tables[name] = pd.read_csv(fname, sep="\t")
        print(f"loaded {len(tables[name]):5d} {description} {name:>20} in {fmt} format ...")
    return tables


# loading enriched pixels ...
enrich_fnames = {
    "5hr_2X_enrichment": "enriched_pixels_10kb/5hr_2X_aug24.binpe",
}
enriched_dict = _load_tables(enrich_fnames, "enriched pixels", "BEDPE")


# clustered pixels loading
clust_fnames = {
    "5hr_2X_enrichment": "clustered_pixels_10kb/5hr_2X_aug24.binpe",
}
clustered_dict = _load_tables(clust_fnames, "clustered ID interactions", "BEDPE")


# filtered (no singletons) pixels loading
# NOTE(review): this file is named "5hr_2X_second", unlike the "aug24" files above - confirm it is the intended one
filt_fnames = {
    "5hr_2X_enrichment": "enriched_pixels_10kb/5hr_2X_second.no_singletons.binpe",
}
filt_dict = _load_tables(filt_fnames, "filtered (no singletons) ID interactions", "BEDPE")

# loading anchors here ...
print("...")
id_anchor_fnames = {
    "mega_2X_enrichment": "ID_anchors/mega_2X_enrichment.fourth_mega.max_size.bed",
    "5hr_2X_enrichment_old": "ID_anchors/5hr_2X_enrichment.second_bulk.max_size.bed",
    "5hr_2X_enrichment": "ID_anchors/5hr_2X_enrichment.pixel_derived.bed",
    "5hr_2X_enrichment_nosing": "ID_anchors/5hr_2X_enrichment.pixel_derived.no_singletons.bed",
    "5hr_notinCyto_2X_enrichment_signal": "ID_anchors/p5notin_pCyto_anchors_2X_enrichment.pixel_derived.signal_peaks.bed",
    "5hr_2X_enrichment_signal": "ID_anchors/5hr_2X_enrichment.pixel_derived.signal_peaks.bed",
    "10hr_2X_enrichment_signal": "ID_anchors/10hrs_2X_enrichment.pixel_derived.signal_peaks.bed",
    "N93p5_2X_enrichment_signal": "ID_anchors/N93p5_2X_enrichment.pixel_derived.signal_peaks.bed",
    "pCyto_2X_enrichment_signal": "ID_anchors/pCyto_2X_enrichment.pixel_derived.signal_peaks.bed",
    "mCyto_2X_enrichment_signal": "ID_anchors/mCyto_2X_enrichment.pixel_derived.signal_peaks.bed",
    "mega_3X_enrichment": "ID_anchors/mega_3X_enrichment.fifth_mega3x.max_size.bed",
    "MEGA_2X_enrichment": "ID_anchors/MEGAp5_2X_enrichment.pixel_derived.signal_peaks.bed",
    "MEGA_weaker_2X_enrichment": "ID_anchors/MEGA_plus_weak_anchors_2X_enrichment.pixel_derived.signal_peaks.bed",
    "MEGAN93_2X_enrichment": "ID_anchors/MEGAN93p5_2X_enrichment.pixel_derived.signal_peaks.bed",
    "MEGAminus_2X_enrichment": "ID_anchors/MEGA_minus_ctrl_2X_enrichment.pixel_derived.signal_peaks.bed",
    "cyto_2x_enrichment": "ID_anchors/cyto_2x_enrichment.third_mCyto.max_size.bed",
}

id_anchors_dict = _load_tables(id_anchor_fnames, "ID anchors", "BED")


# Pick one condition to explore `5hr_2X_enrichment` ...

In [None]:
# key shared by the enriched / clustered / filtered pixel tables loaded above
_condition = "5hr_2X_enrichment"

enriched_pixels = enriched_dict[_condition]
clustered_pixels = clustered_dict[_condition]
filtered_pixels = filt_dict[_condition]

# the anchors table is stored under the condition name with a "_signal" suffix
final_mcds = id_anchors_dict[f"{_condition}_signal"]

In [None]:

# Main tile: rows come from _region1, columns from _region2 (both start at chr6:129 Mb).
_region1 = ('chr6', 129_000_000, 129_000_000+1_500_000)
_region2 = ('chr6', 129_000_000, 129_000_000+3_700_000)

# Zoomed-in tile, used for the small panels on the right.
_region1_zoom = ('chr6', 129_550_000, 129_550_000 + 280_000)
_region2_zoom = ('chr6', 130_290_000, 130_290_000 + 310_000)

# take the last column of the first arm overlapping each region
# (presumably the arm "name" column - confirm); both regions must fall on the same arm
region1_name = bioframe.select(hg38_arms, _region1).iat[0,-1]
region2_name = bioframe.select(hg38_arms, _region2).iat[0,-1]
assert region1_name == region2_name
region_name = region2_name

# bin-id spans (start, end) of both regions, packed as a (region_name, span_i, span_j) tile
tile_span_i = clr.extent(_region1)
tile_span_j = clr.extent(_region2)
_the_tile = (region_name, tile_span_i, tile_span_j )

tile_start_ij = (tile_span_i[0], tile_span_j[0])
# expected of the arm, wrapped so it can be sliced like a 2D matrix
lazy_exp = LazyToeplitz(
    exp_indexed.loc[region_name, region_name]["balanced.avg"].to_numpy()
)
# observed matrix slice:
# NOTE(review): the original comment called this "RAW", but clr.matrix() is called with its
# defaults here (no explicit balance=False) - confirm which one is intended
observed = clr.matrix()[slice(*tile_span_i), slice(*tile_span_j)]
expected = lazy_exp[slice(*tile_span_i), slice(*tile_span_j)]

# let's figure out slices' coordinates ....
_bins_i = clr.bins()[slice(*tile_span_i)]
_bins_j = clr.bins()[slice(*tile_span_j)]
_chrom_i, _start_i, _end_i = _bins_i.iloc[0]["chrom"], _bins_i.iloc[0]["start"], _bins_i.iloc[-1]["end"]
_chrom_j, _start_j, _end_j = _bins_j.iloc[0]["chrom"], _bins_j.iloc[0]["start"], _bins_j.iloc[-1]["end"]


# observed/expected passed through a gaussian filter
# NOTE(review): with sigma=0.4 and truncate=1.0 scipy's kernel radius is int(truncate*sigma + 0.5) = 0,
# so this filter looks like a no-op - confirm whether any smoothing is intended;
# `cval` is only used by mode='constant'
gOE = scipy.ndimage.gaussian_filter(
    (observed/expected),
    sigma=0.4,
    order=0,
    mode='reflect',
    cval=0.0,
    # radius=3,
    truncate=1.0,
)

# Same preparation as for the main tile, repeated for the zoomed-in regions.
region1_name_zoom = bioframe.select(hg38_arms, _region1_zoom).iat[0,-1]
region2_name_zoom = bioframe.select(hg38_arms, _region2_zoom).iat[0,-1]
assert region1_name_zoom == region2_name_zoom
region_name_zoom = region2_name_zoom

# bin-id spans of the zoomed regions, packed as a (region_name, span_i, span_j) tile
tile_span_zoom_i = clr.extent(_region1_zoom)
tile_span_zoom_j = clr.extent(_region2_zoom)
_the_tile_zoom = (region_name_zoom, tile_span_zoom_i, tile_span_zoom_j )

tile_start_zoom_ij = (tile_span_zoom_i[0], tile_span_zoom_j[0])
# NOTE: this rebinds `lazy_exp`, which was first built for the main tile
lazy_exp = LazyToeplitz(
    exp_indexed.loc[region_name_zoom, region_name_zoom]["balanced.avg"].to_numpy()
)
# observed matrix slice:
# NOTE(review): the original comment called this "RAW", but clr.matrix() is called with its
# defaults here (no explicit balance=False) - confirm which one is intended
observed_zoom = clr.matrix()[slice(*tile_span_zoom_i), slice(*tile_span_zoom_j)]
expected_zoom = lazy_exp[slice(*tile_span_zoom_i), slice(*tile_span_zoom_j)]

# let's figure out slices' coordinates ....
_bins_zoom_i = clr.bins()[slice(*tile_span_zoom_i)]
_bins_zoom_j = clr.bins()[slice(*tile_span_zoom_j)]
_chrom_zoom_i, _start_zoom_i, _end_zoom_i = _bins_zoom_i.iloc[0]["chrom"], _bins_zoom_i.iloc[0]["start"], _bins_zoom_i.iloc[-1]["end"]
_chrom_zoom_j, _start_zoom_j, _end_zoom_j = _bins_zoom_j.iloc[0]["chrom"], _bins_zoom_j.iloc[0]["start"], _bins_zoom_j.iloc[-1]["end"]

# plain observed/expected for the zoom - unlike gOE, no gaussian_filter call is applied here
gOE_zoom = observed_zoom/expected_zoom

# select MCDs from the regions here ...
# use the peak coordinates of every anchor as its start/end
_anchors = final_mcds[["chrom","peak_start","peak_end","cluster"]].rename(columns={"peak_start":"start","peak_end":"end"})
_anchors_reg = bioframe.select(_anchors, _region2).reset_index(drop=True)
# renumber clusters so that the smallest cluster id within the region becomes 0
_anchors_reg["cluster"] = _anchors_reg["cluster"] - _anchors_reg["cluster"].min()
# NOTE(review): `calculate_valencies` is neither defined nor imported in the visible part of
# this notebook (only `draw_kernel` is imported from helper_func) - confirm where it comes from
_anchors_reg = calculate_valencies(
    _anchors_reg,   # must be output of bedpe_to_anchors, which in turn is a clustering inside
    clustered_pixels,
    cluster_colname = "cluster",
    valency_colname = "valency",
    bed_cols = ["chrom", "start", "end"],
    bedpe_cols1 = ["chrom1", "start1", "end1"],
    bedpe_cols2 = ["chrom2", "start2", "end2"],
)

# pair up the anchors of the region; max_sep is the region length plus 100 Mb
_bedpe_region = bioframe.pair_by_distance(
    _anchors_reg,
    min_sep=0,
    max_sep=(_region2[2] - _region2[1])+100_000_000,
    suffixes=("1","2"),
    keep_order=True,
)
# translate both sides of every pair into bin coordinates:
# bin*_id comes from clr.offset, bin*_width is the difference of the two values returned by clr.extent
_bedpe_region["bin1_id"] = _bedpe_region[["chrom1","start1","end1"]].apply(clr.offset,axis=1,result_type="expand")
_bedpe_region["bin1_width"] = _bedpe_region[["chrom1","start1","end1"]].apply(clr.extent,axis=1,result_type="expand").apply(np.diff,axis=1,result_type="expand")[0]

_bedpe_region["bin2_id"] = _bedpe_region[["chrom2","start2","end2"]].apply(clr.offset,axis=1,result_type="expand")
_bedpe_region["bin2_width"] = _bedpe_region[["chrom2","start2","end2"]].apply(clr.extent,axis=1,result_type="expand").apply(np.diff,axis=1,result_type="expand")[0]


def get_bin_coverage(df):
    """
    Count how many rows of `df` reference each bin.

    A bin is counted once for every row where it appears as `bin1_id` and
    once for every row where it appears as `bin2_id`; the two counts are
    summed.

    Parameters
    ----------
    df : pandas.DataFrame
        table with `bin1_id` and `bin2_id` columns.

    Returns
    -------
    pandas.Series
        coverage indexed by bin id (index named "bin"); only bins present
        in at least one of the two columns are included.
    """
    side1_cov, side2_cov = (
        df.groupby(side).size().rename_axis("bin")
        for side in ("bin1_id", "bin2_id")
    )
    # a bin missing on one side contributes 0 from that side
    return side1_cov.add(side2_cov, fill_value=0)

# redefine kernel drawing for some tweaks ...
def draw_kernel(kernel, axis=None, kernel_name="default", cmap='viridis'):
    """
    Draw a 2D kernel as an image, with the central pixel highlighted in red
    and a faint checkerboard overlaid to make individual pixels visible.

    NOTE: this definition shadows the `draw_kernel` imported from
    `helper_func` earlier in the notebook.

    Parameters
    ----------
    kernel : 2D array
        kernel to draw; it is shown flipped along both axes, as in convolution.
    axis : matplotlib axes, optional
        axes to draw on; a new figure is created when omitted.
    kernel_name : str
        used as the axes title.
    cmap : str or Colormap
        colormap for the kernel image.

    Returns
    -------
    the image object returned by `axis.imshow` for the kernel.
    """
    if axis is None:
        _, axis = plt.subplots()
    n_rows, n_cols = kernel.shape[0], kernel.shape[1]
    # kernel:
    imk = axis.imshow(
                    kernel[::-1,::-1],  # flip it, as in convolution
                    alpha=0.85,
                    cmap=cmap,
                    interpolation='nearest')
    # draw a square around the target pixel:
    # x runs along columns and y along rows, so the corner of the central pixel
    # has to be derived from n_cols and n_rows respectively
    x0 = n_cols // 2 - 0.5
    y0 = n_rows // 2 - 0.5
    rect = patches.Rectangle((x0, y0), 1, 1, lw=1, ec='r', fc='r')
    axis.add_patch(rect)

    # clean axis:
    axis.set_xticks([])
    axis.set_yticks([])
    axis.set_xticklabels('',visible=False)
    axis.set_yticklabels('',visible=False)
    axis.set_title(f"{kernel_name}", fontsize=12)
    # add a checkerboard to highlight pixels:
    checkerboard = np.add.outer(range(n_rows), range(n_cols)) % 2
    # show it:
    axis.imshow(checkerboard,
            cmap='gray',
            interpolation='nearest',
            alpha=0.3)

    return imk


# more drawing functions ...
def rectangles_around_dots(dots_bins_df, the_tile, loc="upper", lw=1, ec="cyan", fc="none"):
    """
    Yield one 1x1 rectangle patch for every pixel of `dots_bins_df` that
    falls inside the tile.

    Parameters
    ----------
    dots_bins_df : pandas.DataFrame
        pixels, with `bin1_id` and `bin2_id` columns.
    the_tile : tuple
        `(name, span1, span2)`, where span1/span2 are `(start, end)` bin ids
        of the tile rows and columns; spans are treated as half-open, i.e.
        `start <= bin < end`, so pixels in the first row/column are included.
    loc : {"upper", "lower"}
        "upper" places bin2 on x and bin1 on y; "lower" swaps them.
    lw, ec, fc :
        line width, edge color and face color forwarded to the patches.

    Yields
    ------
    matplotlib.patches.Rectangle
        in tile-relative coordinates, shifted by -0.5 so that a rectangle
        outlines the pixel drawn by imshow.

    Raises
    ------
    ValueError
        if `loc` is neither "upper" nor "lower" (raised on first iteration).
    """
    if loc not in ("upper", "lower"):
        raise ValueError("loc has to be uppper or lower")
    rectangle_kwargs = dict(lw=lw, ec=ec, fc=fc)
    # parse the tile
    _, tspan1, tspan2 = the_tile
    bin1 = dots_bins_df["bin1_id"]
    bin2 = dots_bins_df["bin2_id"]
    # select only visible pixels (lower bounds are inclusive):
    visible = (
        (bin1 >= tspan1[0]) & (bin1 < tspan1[1])
        & (bin2 >= tspan2[0]) & (bin2 < tspan2[1])
    )
    # tile-relative coordinates of the lower-left pixel corners
    b1s = bin1[visible] - tspan1[0] - 0.5
    b2s = bin2[visible] - tspan2[0] - 0.5
    print(f"{int(visible.sum())} pixels are visible out of {len(dots_bins_df)} ...")
    # iterate over visible pixels...
    for b1, b2 in zip(b1s, b2s):
        if loc == "upper":
            yield patches.Rectangle((b2, b1), 1, 1, **rectangle_kwargs)
        else:
            yield patches.Rectangle((b1, b2), 1, 1, **rectangle_kwargs)

# in a specific region, and exposing importnat plotting parameters
def rectangles_around_dots_ww(dots_bins_df, the_tile, loc="upper", lw=1, ec="cyan", fc="none", halo=30_000, ext_width=0):
    """
    Yield one rectangle patch per "box" of `dots_bins_df` that starts inside
    the tile (optionally padded by `halo`), using the width of every box.

    Parameters
    ----------
    dots_bins_df : pandas.DataFrame
        boxes, with `bin1_id`, `bin2_id`, `bin1_width`, `bin2_width` columns.
    the_tile : tuple
        `(name, span1, span2)`, where span1/span2 are `(start, end)` bin ids.
    loc : {"upper", "lower"}
        "upper" places bin2 on x and bin1 on y; "lower" swaps them.
    lw, ec, fc :
        line width, edge color and face color forwarded to the patches.
    halo : number
        margin added on both sides of each span when selecting boxes; it is
        compared directly against bin ids.
        NOTE(review): the default of 30_000 looks like a base-pair value - confirm.
    ext_width : number
        every box is extended by this many bins on each side.

    Yields
    ------
    matplotlib.patches.Rectangle
        in tile-relative coordinates; each side is `width + 2*ext_width + 1` long.

    Raises
    ------
    ValueError
        if `loc` is neither "upper" nor "lower" (raised on first iteration).
    """
    if loc not in ("upper", "lower"):
        raise ValueError("loc has to be uppper or lower")
    rectangle_kwargs = dict(lw=lw, ec=ec, fc=fc)
    # parse the tile
    _, tspan1, tspan2 = the_tile
    bin1 = dots_bins_df["bin1_id"]
    bin2 = dots_bins_df["bin2_id"]
    # select only visible "boxes" - lower bounds are inclusive, so that a box
    # starting at the very first bin of the tile is kept with halo=0:
    visible = (
        (bin1 >= tspan1[0] - halo) & (bin1 < tspan1[1] + halo)
        & (bin2 >= tspan2[0] - halo) & (bin2 < tspan2[1] + halo)
    )
    # tile-relative corners and extended widths of the visible boxes
    b1s = bin1[visible] - tspan1[0] - ext_width - 0.5
    b2s = bin2[visible] - tspan2[0] - ext_width - 0.5
    w1s = dots_bins_df["bin1_width"][visible] + 2*ext_width
    w2s = dots_bins_df["bin2_width"][visible] + 2*ext_width
    print(f"{int(visible.sum())} pixels are visible out of {len(dots_bins_df)} ...")
    for b1, b2, w1, w2 in zip(b1s, b2s, w1s, w2s):
        if loc == "upper":
            yield patches.Rectangle((b2, b1), w2+1, w1+1, **rectangle_kwargs)
        else:
            yield patches.Rectangle((b1, b2), w1+1, w2+1, **rectangle_kwargs)

from scipy.signal import find_peaks
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.colors import Normalize, TwoSlopeNorm

# https://stackoverflow.com/questions/48625475/python-shifted-logarithmic-colorbar-white-color-offset-to-center
class MidPointLogNorm(LogNorm):
    """
    Logarithmic norm with an adjustable midpoint: `vmin`, `midpoint` and
    `vmax` are mapped to 0, 0.5 and 1, interpolating linearly in log-space
    in between.

    Masked values and other edge cases are deliberately not handled, and the
    `clip` argument of `__call__` is ignored.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        super().__init__(vmin=vmin, vmax=vmax, clip=clip)
        self.midpoint = midpoint

    def _log_anchors(self):
        # log-space positions of vmin, midpoint and vmax
        return [np.log(self.vmin), np.log(self.midpoint), np.log(self.vmax)]

    def __call__(self, value, clip=None):
        # piecewise-linear map of log(value) onto [0, 0.5, 1]
        anchors = self._log_anchors()
        normalized = np.interp(np.log(value), anchors, [0, 0.5, 1])
        return np.ma.masked_array(normalized)

    def inverse(self, value):
        # the reverse mapping: from [0, 0.5, 1] back to data values
        if not self.scaled():
            raise ValueError("Not invertible until scaled")
        anchors = self._log_anchors()
        return np.exp(np.interp(value, [0, 0.5, 1], anchors))

def rectangles_around_dots_pileup(matrix, dots_bins_df, the_tile, halo=30_000, half_width=10):
    """
    Cut square snippets out of `matrix`, one per box of `dots_bins_df`,
    centered on the middle of the box.

    Parameters
    ----------
    matrix : 2D array
        matrix of the tile; its [0, 0] element corresponds to the first bins
        of the tile spans.
    dots_bins_df : pandas.DataFrame
        boxes, with `bin1_id`, `bin2_id`, `bin1_width`, `bin2_width` columns.
    the_tile : tuple
        `(name, span1, span2)`, where span1/span2 are `(start, end)` bin ids.
    halo : number
        margin added on both sides of each span when selecting boxes; it is
        compared directly against bin ids.
    half_width : int
        half size of a snippet; snippets are `2*half_width + 1` on a side.

    Returns
    -------
    numpy.ndarray
        stack of snippets of shape (n, 2*half_width+1, 2*half_width+1).
        Windows that do not fit entirely inside `matrix` are skipped: slicing
        them would give snippets of a different (or empty) shape and the
        result could no longer be stacked into a regular array.
    """
    # parse the tile
    _, tspan1, tspan2 = the_tile
    size = 2*half_width + 1
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    bin1 = dots_bins_df["bin1_id"]
    bin2 = dots_bins_df["bin2_id"]
    # select only visible "boxes" (lower bounds inclusive):
    visible = (
        (bin1 >= tspan1[0] - halo) & (bin1 < tspan1[1] + halo)
        & (bin2 >= tspan2[0] - halo) & (bin2 < tspan2[1] + halo)
    )
    # tile-relative start of the window centered on the middle of every box
    b1s = bin1[visible] + dots_bins_df["bin1_width"][visible]/2 - tspan1[0] - half_width
    b2s = bin2[visible] + dots_bins_df["bin2_width"][visible]/2 - tspan2[0] - half_width
    snippets = []
    for b1, b2 in zip(b1s, b2s):
        i0, i1 = int(b1), int(b1 + size)
        j0, j1 = int(b2), int(b2 + size)
        # a negative start would be read as "count from the end" by numpy,
        # and an end past the edge would be silently truncated - skip both
        if b1 < 0 or b2 < 0 or i1 > n_rows or j1 > n_cols:
            continue
        snippets.append(matrix[i0:i1, j0:j1])
    if not snippets:
        # keep the result 3-dimensional even when nothing fits
        return np.empty((0, int(size), int(size)), dtype=matrix.dtype)
    return np.asarray(snippets)


# some common parameters, kwargs and things of that nature ...
# colors for values above / below the range of the norm;
# they have to be defined before they are applied to the colormap
_over_color = "crimson"
_under_color = "steelblue"

# some common parameters, kwargs and things of that nature ...
# work on a copy, so that the shared `coolwarm` colormap object is left untouched
mycmap = plt.cm.coolwarm.copy()
mycmap.set_over(_over_color)
mycmap.set_under(_under_color)

# shared by every heatmap of the figure: observed/expected centered on 1
imshow_kwargs = dict(
    # norm=LogNorm(vmin=1/5, vmax=5),
    norm=TwoSlopeNorm(vcenter=1, vmin=0.0025, vmax=2.5),
    # norm=MidPointLogNorm(vmin=1/10, vmax=3, midpoint=1),
    # cmap="RdBu_r",
    cmap=mycmap,
    interpolation="antialiased",
    interpolation_stage="rgba",
)

# placement of the panel letters ("a.", "b.", ...) in the top-left corner of a subfigure
suptitle_kwargs = dict(
    x=0.05,
    y=.975,
    horizontalalignment='left',
    verticalalignment='top',
    fontsize = 12,
)

#101820FF
# style of the boxes drawn around individual pixels (filled black squares)
_pixel_boxes_kwargs = dict(loc="upper", lw=1, ec="black", fc="black")
# _pixel_boxes_kwargs = dict(loc="upper", lw=1, ec="#EDFF00FF", fc="#EDFF00FF")
# style of the boxes drawn around clusters: unfilled, extended by 4 bins on each side
_big_boxes_kwargs = dict(loc="upper", lw=1.5, ec="#EDFF00FF", fc="none", halo=0, ext_width=4)



# Run pileups themselves for the little stackup cartoon overhere ...
# NOTE(review): `_pstack` is not referenced again in the visible part of the notebook -
# the stackup panel reads `ttt` instead; confirm which one is intended
_pstack = rectangles_around_dots_pileup(
    gOE,
    _bedpe_region,
    _the_tile,
    halo=0,
    half_width=10,
)

# color of the code-like text snippets in the figure
_code_color="tab:blue"


In [None]:
# Start with a tall Figure (7.25 x 11) with manual layout and transparent background.
fig = plt.figure(figsize=(7.25, 11), layout="none", facecolor="none")
# a 6 x 3 grid of subfigures: one row per panel,
# columns are [description, main heatmap/track, zoom/legend]
subfigs = fig.subfigures(6, 3, hspace=0.025, wspace=0.05, width_ratios=[1,1.6,0.7])

#################################################
# panel a: description on the left ...
f1 = subfigs[0,0]
f1.suptitle("$\\bf{a.}$ ", **suptitle_kwargs)
ax = f1.add_subplot(1, 1, 1)
ax.set_axis_off()
ax.text(
    .0,
    .5,
    """
observed/expected
contact map used
for quantification
of pixel enrichments
    """,
    transform=ax.transAxes,
    horizontalalignment="left",
    verticalalignment="center"
)

# nice megabase formatting: whole megabases are printed without decimals,
# anything else is rounded to one decimal
_mb = lambda coord: f"{coord/1_000_000:.1f}" if coord%1_000_000 else f"{coord//1_000_000}"

# ... and the observed/expected heatmap of the main tile in the middle
f1 = subfigs[0, 1]
ax = f1.add_axes([0,0,1,1])
hm = ax.imshow(gOE, **imshow_kwargs)
ax.set_xticks([])
ax.set_yticks([])
# rows of the heatmap span _region1 (y label), columns span _region2 (title)
_region1_title = f"{_region1[0]}:{_mb(_region1[1])}—{_mb(_region1[2])}MB"
_region2_title = f"{_region2[0]}:{_mb(_region2[1])}—{_mb(_region2[2])}MB"
ax.set_ylabel(_region1_title, fontsize=9, labelpad=1)
ax.set_title(_region2_title, fontsize=9, pad=2)
####################################################
####################################################


####################################################
# second step (panel b): kernels and enriched pixels ...
f2 = subfigs[1,0]
f2.suptitle("$\\bf{b.}$ ", **suptitle_kwargs)
# f2.set_facecolor("blue")
# kernels shown in the panel, keyed by the label used in the thresholding formula below
# NOTE(review): "V" is mapped to k7["h7"] and "H" to k7["v7"] - confirm the swap is intentional
k_fig = {"M": k7["mid"], "V": k7["h7"], "H": k7["v7"]}
# ax = f2.add_subplot(1, 1, 1)
# top row holds one axes per kernel, bottom row holds the text
spec = f2.add_gridspec(
    ncols=len(k_fig),
    nrows=2,
    height_ratios=[1,0.7],
    wspace=0.07,
    hspace=0.07,
)
# plot rounded kernels
for ii, (ktype, kernel) in enumerate(k_fig.items()):
    ax = f2.add_subplot(spec[0, ii])
    imk = draw_kernel(kernel, ax, kernel_name=ktype, cmap="plasma")
# just below - write some stuff ...
ax = f2.add_subplot(spec[1, :])
ax.set_axis_off()
ax.text(
    .0,
    .7,
    """
use simple thresholding
to detect enriched pixels
    """,
    transform=ax.transAxes,
    horizontalalignment="left",
    verticalalignment="center"
);
ax.text(
    .0,
    .1,
    " M > 2*(V | H)",
    transform=ax.transAxes,
    horizontalalignment="left",
    verticalalignment="center",
    fontname='Monospace',
    color=_code_color,
    fontsize=12
);
# main tile with every enriched pixel marked ...
f2 = subfigs[1, 1]
ax = f2.add_axes([0,0,1,1])
hm = ax.imshow(gOE, **imshow_kwargs)
ax.set_xticks([])
ax.set_yticks([])
for box in rectangles_around_dots(
    enriched_pixels,
    _the_tile,
    **_pixel_boxes_kwargs,
):
    ax.add_patch(box)
# zoomed-in tile: pixels are outlined (fc="none") rather than filled ...
f2 = subfigs[1,2]
ax = f2.add_axes([0,0,0.9,1])
hm = ax.imshow(gOE_zoom, **imshow_kwargs)
ax.set_xticks([])
ax.set_yticks([])
for box in rectangles_around_dots(
    enriched_pixels,
    _the_tile_zoom,
    **dict(loc="upper", lw=1, ec="black", fc="none"),
):
    ax.add_patch(box)
####################################################
####################################################


####################################################
# third step (panel c): clustering / filtering of enriched pixels ...
f3 = subfigs[2,0]
f3.suptitle("$\\bf{c.}$ ", **suptitle_kwargs)
ax = f3.add_subplot(1, 1, 1)
ax.set_axis_off()
ax.text(
    .0,
    .7,
    """
filter 'singletons' and
noisy clusters using
density based clustering
of enriched pixels:
    """,
    transform=ax.transAxes,
    horizontalalignment="left",
    verticalalignment="center"
)
# code-like snippet shown in the figure (text only - nothing is clustered here)
ax.text(
    .0,
    .25,
    """
sklearn.cluster.OPTICS(
 min_samples = 5,
 max_eps = 33_000,
).fit(enriched_pixels)
    """,
    transform=ax.transAxes,
    horizontalalignment="left",
    verticalalignment="center",
    fontname='Monospace',
    fontsize=9,
    color=_code_color,
);

# main tile ...
f3 = subfigs[2, 1]
ax = f3.add_axes([0,0,1,1])
hm = ax.imshow(gOE, **imshow_kwargs)
ax.set_xticks([])
ax.set_yticks([])
# draw boxes around clustered pixels ...
for box in rectangles_around_dots_ww(
    clustered_pixels,
    _the_tile,
    **_big_boxes_kwargs,
):
    ax.add_patch(box)
# draw the filtered (no singletons) pixels themselves; added after the cluster boxes
for box in rectangles_around_dots(
    filtered_pixels,
    _the_tile,
    **_pixel_boxes_kwargs,
):
    ax.add_patch(box)

# zoomed-in tile: outlined pixels first, thicker cluster boxes afterwards ...
f3 = subfigs[2,2]
ax = f3.add_axes([0,0,0.9,1])
hm = ax.imshow(gOE_zoom, **imshow_kwargs)
ax.set_xticks([])
ax.set_yticks([])
for box in rectangles_around_dots(
    filtered_pixels,
    _the_tile_zoom,
    **dict(loc="upper", lw=1, ec="black", fc="none"),
):
    ax.add_patch(box)
# draw boxes around clustered pixels ...
for box in rectangles_around_dots_ww(
    clustered_pixels,
    _the_tile_zoom,
    **dict(loc="upper", lw=3, ec="#EDFF00FF", fc="none", halo=0, ext_width=2),
):
    ax.add_patch(box)
#################################################
#################################################


#################################################
# fourth step (panel d): coverage of filtered pixels ...
f4 = subfigs[3,0]
f4.suptitle("$\\bf{d.}$ ", **suptitle_kwargs)
ax = f4.add_subplot(1, 1, 1)
ax.set_axis_off()
ax.text(
    .0,
    .5,
    """
use 'coverage' of
filtered pixels to
detect MCD-anchors,
as microcompartments
keep contributing
to the same anchors
    """,
    transform=ax.transAxes,
    horizontalalignment="left",
    verticalalignment="center"
)

f4 = subfigs[3, 1]
ax = f4.add_axes([0,0.1,1,0.7])
# calculate pixels coverage ...
rich_pix_vals = get_bin_coverage(filtered_pixels)
_from, _to = clr.extent(_region2)
# coverage for every bin of the region [_from, _to): bins without any pixel get 0;
# reindex keeps the dtype explicit and does not pull in the bin at `_to`,
# which label-based slicing `.loc[_from: _to]` would include
rich_pix_vals_plot = rich_pix_vals.reindex(np.arange(_from, _to), fill_value=0)
# ax.plot(rich_pix_vals_plot.fillna(0), marker=",", markersize=2, lw=0.75, label="all enriched pixels", color="black")
ax.plot(rich_pix_vals_plot, marker=".", markersize=1, lw=0.5, color="black")
ax.set_xlim(_from, _to)
# coverage track params (shared with the peak-calling panel)
cov_ylim = (-3, 77)
cov_yticks = [0,70]
ax.set_ylim(cov_ylim)
ax.set_yticks(cov_yticks)
ax.set_xticks([])
ax.set_xlabel(_region2_title, fontsize=9, labelpad=2)
####################################################


#################################################
# panel e: peak calling on the coverage track ...
f5 = subfigs[4,0]
f5.suptitle("$\\bf{e.}$ ", **suptitle_kwargs)
ax = f5.add_subplot(1, 1, 1)
ax.set_axis_off()
ax.text(
    .0,
    .65,
    """
detect coverage peaks,
i.e. MCD-anchors, using
simple 1D peak detection:
    """,
    transform=ax.transAxes,
    horizontalalignment="left",
    verticalalignment="center"
)
# code-like snippet shown in the figure; the values match _val_thresh / _distance_thresh below
ax.text(
    .0,
    .2,
    """
scipy.signal.find_peaks(
 coverage,
 height=7,
 distance=5,
)
    """,
    transform=ax.transAxes,
    horizontalalignment="left",
    verticalalignment="center",
    fontname='Monospace',
    fontsize=9,
    color=_code_color,
);

f5 = subfigs[4, 1]
ax = f5.add_axes([0,0.1,1,0.7])
# # take empty bins and fill non-zero coverage with rich_clust_pix_vals ...
# # use value threshold to define the "floor" of the pixel coverage ...
# full bin table of the cooler with a coverage column: aligned on the bin id, 0 where there is none
_bins = clr.bins()[:]
_bins.index.name = "bin"
_bins["cov"] = rich_pix_vals
_bins["cov"] = _bins["cov"].fillna(0)
_arr = _bins["cov"].to_numpy()

# detect peaks on the coverage track (the whole array, not just the plotted region) ...
_val_thresh = 7
_distance_thresh = 5
# prominence=(None,None) is passed so that `left_bases`/`right_bases` show up in the returned properties
_peaks, _props = find_peaks(
    _arr,
    height=_val_thresh,
    prominence=(None,None),
    distance=_distance_thresh,
)
# extract left/right boundaries of every peak ...
_lefts = _props["left_bases"]
_rights = _props["right_bases"]
# coverage with everything below the threshold raised to it - used to place the markers
_arr_clipped = np.clip(_arr, _val_thresh, None)

# ax.plot(_arr_clipped[_from: _to], color="black", lw=0.75, marker=None)
# x coordinates in this panel are relative to the first bin of the region (`_from`)
ax.plot(_arr[_from: _to], color="black", lw=0.75, marker=None)
ax.axhline(_val_thresh, lw=0.5, color="dimgray", linestyle="--")
# summits (orange dots) and left/right bases (gray carets); markers outside of xlim are not visible
ax.plot(_peaks - _from, _arr_clipped[_peaks], marker=".", lw=0, markersize=7, color="orange")
ax.plot(_lefts - _from, _arr_clipped[_lefts], marker=8, lw=0, markersize=7, color="dimgray" )
ax.plot(_rights - _from, _arr_clipped[_rights], marker=9, lw=0, markersize=7, color="dimgray" )
# ax.plot(_lefts - _from, _arr_clipped[_lefts], marker="|", lw=0, markersize=7, color="tab:blue" )
# ax.plot(_rights - _from, _arr_clipped[_rights], marker="|", lw=0, markersize=7, color="tab:red" )
# ax.plot(_lefts - _from, _arr_clipped[_lefts], marker="|", lw=0, markersize=12, color="dimgray" )
# ax.plot(_rights - _from, _arr_clipped[_rights], marker="|", lw=0, markersize=12, color="dimgray" )
ax.set_xlim(0, _to-_from )
ax.set_ylim(cov_ylim)
# ax.set_yticks(cov_yticks)
ax.set_yticks([0,_val_thresh,70])
ax.set_xticks([])
ax.set_xlabel(_region2_title, fontsize=9, labelpad=2)

# hand-made legend for the markers used above ...
f5 = subfigs[4, 2]
ax = f5.add_subplot(1, 1, 1)
ax.set_axis_off()

# legend layout, in data coordinates of a unit square (see set_xlim/set_ylim below)
x_foot = 0.03
x_summit = 0.1
y_foot = 0.37
y_summit = 0.63
_radius = 0.05
_dist = 0.17
x_left = x_foot
# ax.add_patch(patches.Arrow(x_foot+_radius, y_foot, _dist-2*_radius, 0, color="grey", width=0.1))
# "footprint": the pair of caret markers used for the left/right bases
ax.scatter(x=[x_foot+0.05], y=[y_foot], s=80, marker=8, linewidths=1.5, color="dimgray")
ax.scatter(x=[x_foot+_dist-0.05], y=[y_foot], s=80, marker=9, linewidths=1.5, color="dimgray")
ax.text(
    x=x_left+0.27,
    y=y_foot,
    s="footprint",
    horizontalalignment="left",
    verticalalignment="center"
)
# "summit": the orange dot used for the peaks
ax.scatter(x=[x_summit], y=[y_summit], s=80, marker=".", linewidths=1.5, color="orange")
ax.text(
    x=x_left+0.27,
    y=y_summit,
    s="summit",
    horizontalalignment="left",
    verticalalignment="center"
)
ax.set_aspect(1)
ax.set_xlim(0,1)
ax.set_ylim(0,1)
####################################################


####################################################
# last step (panel f): all-by-all grid of the detected anchors ...
f6 = subfigs[5,0]
f6.suptitle("$\\bf{f.}$ ", **suptitle_kwargs)
ax = f6.add_subplot(1, 1, 1)
ax.set_axis_off()
ax.text(
    .0,
    .5,
    """
explore all-by-all grid
of detected anchors,
i.e. microcompartment
domains.

use their 'summits' for
centering stackups and
pileups.
    """,
    transform=ax.transAxes,
    horizontalalignment="left",
    verticalalignment="center"
)

f6 = subfigs[5, 1]
ax = f6.add_axes([0,0,1,1])
hm = ax.imshow(gOE, **imshow_kwargs)
# bounds yields (x0, y0, width, height)
x0, y0, dx, dy = ax.get_position().bounds
# get figure bounds ...
# NOTE(review): bounds are (x0, y0, width, height), so `_h` receives the width and `_w` the
# height here - confirm that the `_h/_w` ratio used below is the intended one
_, _, _h, _w = f6.figbbox.bounds
awidth = 0.06
aoffset = 0.02

# thin tracks below and to the left of the heatmap, to mark the anchors
ax_x = f6.add_axes([x0, y0-awidth-aoffset, dx, awidth])
ax_y = f6.add_axes([x0-(awidth+aoffset)*(_h/_w), y0, awidth*(_h/_w), dy])

# both tracks use bin units relative to the start of their regions
_reg2w = np.diff(clr.extent(_region2)).item()
_reg1w = np.diff(clr.extent(_region1)).item()
ax_x.set_xlim(0, _reg2w)
ax_y.set_ylim(0, _reg1w)

ax.set_xticks([])
ax.set_yticks([])
ax_x.set_xticks([])
ax_x.set_yticks([])
ax_y.set_xticks([])
ax_y.set_yticks([])

# draw boxes around every pair of anchors ...
# derive the style from the cluster-box style without modifying `_big_boxes_kwargs` itself:
# other panels read it as well, and changing it in place made re-running this cell
# draw them in black
_grid_boxes_kwargs = {**_big_boxes_kwargs, "ec": "black", "lw": 1}
for box in rectangles_around_dots_ww(
    _bedpe_region,
    _the_tile,
    **_grid_boxes_kwargs,
):
    ax.add_patch(box)
    x0, y0, dx, dy = box.get_bbox().bounds
    # both the x and the y extent of a box are marked on the horizontal track
    # (both regions start at the same coordinate, so the bin offsets are shared)
    ax_x.axvspan(x0, x0+dx, facecolor='black', alpha=0.7)
    ax_x.axvspan(y0, y0+dy, facecolor='black', alpha=0.7)
    ax_y.axhspan(y0, y0+dy, facecolor='black', alpha=0.7)

# flip the vertical track so that it runs top-to-bottom like the heatmap rows
ax_y.invert_yaxis()

f6 = subfigs[5,2]
stack = ttt
ooe_norm = imshow_kwargs["norm"]
# create a 21 x 21 vertex mesh
*_, dimens = stack.shape
X, Y = np.meshgrid(np.linspace(0,1,dimens), np.linspace(0,1,dimens))
# create vertices for a rotated mesh (3D rotation matrix)
Z = np.zeros_like(X)
# show the 3D rotated projection
ax = f6.add_subplot(1,1,1,projection='3d')
num_to_show = 6
for i, _matrix in enumerate( stack ):
    if i > num_to_show:
        break
    is_top_element = i!=num_to_show
    # get the heatmap for the region
    ax.plot_wireframe(X, Y, Z+i/num_to_show, linewidths=4, rstride=1, cstride=1,color="black")
    ax.plot_surface(
        X,
        Y,
        Z+i/num_to_show,
        rstride=1,
        cstride=1,
        facecolors=mycmap(ooe_norm(_matrix)),
        shade=is_top_element,
    )
ax.set_axis_off()
ax.set_position([0,0,1,1])

fig.savefig("FigureSupp2-mcd_detection.pdf", dpi=300)

# Legacy stuff abandoned ...

In [None]:
def rectangles_around_dots(dots_bins_df, the_tile, loc="upper", lw=1, ec="cyan", fc="none"):
    """
    Yield a 1x1 `patches.Rectangle` for every pixel of `dots_bins_df` visible in a tile.

    Parameters
    ----------
    dots_bins_df : DataFrame
        Must provide `bin1_id` and `bin2_id` columns (bin indices of each pixel).
    the_tile : tuple
        `(name, span1, span2)`; only `span1` and `span2` are used, each a
        `(start_bin, end_bin)` pair along the first/second matrix axis.
    loc : {"upper", "lower"}
        "upper" places a rectangle at (x=b2, y=b1); "lower" mirrors it to (x=b1, y=b2),
        where b1/b2 are bin offsets relative to the tile start.
    lw, ec, fc :
        Forwarded to `patches.Rectangle`.

    Yields
    ------
    matplotlib.patches.Rectangle

    Raises
    ------
    ValueError
        If `loc` is neither "upper" nor "lower". The check runs before any pixel is
        looked at, so a bad `loc` is reported even when no pixel is visible in the tile.

    Notes
    -----
    Prints how many pixels fall inside the tile (strictly between the span borders).
    """
    # validate up front - otherwise a wrong `loc` goes unnoticed whenever the tile is empty
    if loc not in ("upper", "lower"):
        raise ValueError("loc has to be upper or lower")
    rectangle_kwargs = dict(lw=lw, ec=ec, fc=fc)
    # parse the tile
    _, tspan1, tspan2 = the_tile
    # select only visible pixels and express them in tile-relative bin coordinates:
    _the_dots = dots_bins_df \
        .query("""(@tspan1[0] < bin1_id < @tspan1[1]) & \
                  (@tspan2[0] < bin2_id < @tspan2[1]) """) \
        .eval("""b1 = bin1_id - @tspan1[0]
                 b2 = bin2_id - @tspan2[0] """)
    print(f"{len(_the_dots)} pixels are visible out of {len(dots_bins_df)} ...")
    # iterate over visible pixels - every one of them is a single-bin square ...
    for b1, b2 in _the_dots[["b1", "b2"]].itertuples(index=False):
        xy = (b2, b1) if loc == "upper" else (b1, b2)
        yield patches.Rectangle(xy, 1, 1, **rectangle_kwargs)

def rectangles_around_dots_ww(dots_bins_df, the_tile, loc="upper", lw=1, ec="cyan", fc="none", halo=30_000, ext_width=0):
    """
    Yield `patches.Rectangle` boxes ("with width") for features visible in a specific
    tile, exposing the important plotting parameters.

    Parameters
    ----------
    dots_bins_df : DataFrame
        Must provide `bin1_id`, `bin2_id`, `bin1_width` and `bin2_width` columns.
    the_tile : tuple
        `(name, span1, span2)`; only `span1` and `span2` are used, each a
        `(start_bin, end_bin)` pair along the first/second matrix axis.
    loc : {"upper", "lower"}
        "upper" places a box at (x=b2, y=b1); "lower" mirrors it to (x=b1, y=b2).
    lw, ec, fc :
        Forwarded to `patches.Rectangle`.
    halo : int
        Margin added on both sides of each span when selecting boxes; it is compared
        directly against `bin*_id`, i.e. it is expressed in bins.
    ext_width : int
        Boxes are shifted towards the tile origin by `ext_width` bins and their widths
        grow by the same amount.
        NOTE(review): this enlarges a box on its low-coordinate side only - confirm
        that a one-sided extension is what is wanted.

    Yields
    ------
    matplotlib.patches.Rectangle
        Sized `bin*_width + ext_width + 1` bins along each axis.

    Raises
    ------
    ValueError
        If `loc` is neither "upper" nor "lower". The check runs before any box is
        looked at, so a bad `loc` is reported even when nothing is visible in the tile.

    Notes
    -----
    Prints how many rows of `dots_bins_df` were selected for drawing.
    """
    # validate up front - otherwise a wrong `loc` goes unnoticed whenever the tile is empty
    if loc not in ("upper", "lower"):
        raise ValueError("loc has to be upper or lower")
    rectangle_kwargs = dict(lw=lw, ec=ec, fc=fc)
    # parse the tile
    _, tspan1, tspan2 = the_tile
    # select only visible "boxes" and express them in tile-relative bin coordinates:
    _the_dots = dots_bins_df \
        .query("""(@tspan1[0] - @halo < bin1_id < @tspan1[1] + @halo) & \
                  (@tspan2[0] - @halo < bin2_id < @tspan2[1] + @halo) """) \
        .eval("""
                b1 = bin1_id - @tspan1[0] - @ext_width
                b2 = bin2_id - @tspan2[0] - @ext_width
                bin1_width = bin1_width + @ext_width
                bin2_width = bin2_width + @ext_width
            """)
    print(f"{len(_the_dots)} pixels are visible out of {len(dots_bins_df)} ...")
    for b1, b2, w1, w2 in _the_dots[["b1", "b2", "bin1_width", "bin2_width"]].itertuples(index=False):
        # +1 so that a zero-width feature still covers its own bin
        if loc == "upper":
            yield patches.Rectangle((b2, b1), w2+1, w1+1, **rectangle_kwargs)
        else:
            yield patches.Rectangle((b1, b2), w1+1, w2+1, **rectangle_kwargs)

def bedpe_to_anchors(
    bedpe_df,
    view_df,  # for sorting !
    cols1 = ["chrom1", "start1", "end1"],
    cols2 = ["chrom2", "start2", "end2"],
    mode="cluster",
    min_dist=None,
):
    """
    turning bedpe interactions to a bed of anchors - the simple way

    Left and right anchors of `bedpe_df` are stacked into a single BED-like table,
    sorted according to `view_df` and clustered with `bioframe.cluster`.

    Parameters
    ----------
    bedpe_df : DataFrame
        Interactions; must contain the `cols1` and `cols2` columns.
    view_df : DataFrame
        View passed to `bioframe.sort_bedframe` to define the sorting order.
    cols1, cols2 : list of str
        Column names of the left and right anchors (chrom, start, end).
    mode : {"cluster", "max_size", "median"}
        How anchors sharing a cluster are merged:
        - "cluster": one row per cluster spanning its total footprint;
        - "max_size": the largest individual anchor of every cluster;
        - "median": median start and median end of the anchors in every cluster.
    min_dist : int, optional
        `min_dist` handed to `bioframe.cluster`. When None (default) the notebook-level
        global `binsize` is used as `binsize + 1`, which is what this function always
        did before the parameter existed.

    Returns
    -------
    DataFrame
        Anchors with `chrom`, `start`, `end`, `cluster` and `size` columns
        ("max_size" additionally keeps whatever else the clustering returned).

    Raises
    ------
    ValueError
        If `mode` is not one of the supported values; raised before any work is done.
    """
    _supported_modes = ("cluster", "max_size", "median")
    if mode not in _supported_modes:
        raise ValueError(f"unknown mode {mode!r}, expected one of {_supported_modes}")
    if min_dist is None:
        # legacy behavior: rely on the global `binsize` defined elsewhere in the notebook
        min_dist = binsize+1

    _cols = ["chrom", "start", "end"]
    _cluster_cols = ["chrom", "cluster_start", "cluster_end"]
    # concat left and right anchors ...
    _bed = pd.concat(
        [
            bedpe_df[cols1].rename(columns=dict(zip(cols1, _cols))),
            bedpe_df[cols2].rename(columns=dict(zip(cols2, _cols))),
        ],
        ignore_index=True,
    )
    # clustering anchors - define clusters of overlapping anchors ...
    _anchors = bioframe.cluster(
        bioframe.sort_bedframe(_bed, view_df=view_df),
        min_dist=min_dist,
        return_input=True,
    ).reset_index(drop=True)

    if mode == "cluster":
        # simply return resulting clusters - i.e. total footprint of clustered anchors ...
        _anchors = _anchors.drop_duplicates(subset=_cluster_cols).reset_index(drop=True)
        # calculate size just in case
        _anchors["size"] = _anchors[_cluster_cols[2]] - _anchors[_cluster_cols[1]]
        # return _anchors with coordinates renamed as needed !
        return _anchors.drop(columns=["start","end"]).rename(columns={"cluster_start":"start", "cluster_end":"end"})

    if mode == "max_size":
        # return the largest anchor per cluster
        # size of anchors, not clusters !!!
        _anchors["size"] = _anchors[_cols[2]] - _anchors[_cols[1]]
        _largest_anchor_idx = _anchors.groupby("cluster")["size"].idxmax()
        _anchors = _anchors.loc[_largest_anchor_idx]
        # i.e. the largest anchor per cluster of overlapping anchors
        return _anchors.drop(columns=["cluster_start","cluster_end"]).reset_index(drop=True)

    # mode == "median":
    # median of start and of end coords per cluster of overlapping anchors ...
    _anchors = _anchors.groupby("cluster").agg({"chrom":"first", "start":"median", "end":"median"})
    _anchors = _anchors.reset_index().astype({"start":int, "end":int})
    _anchors["size"] = _anchors[_cols[2]] - _anchors[_cols[1]]
    return _anchors.reset_index(drop=True)


def calculate_valencies(
    bed_df,  # must be output of bedpe_to_anchors, which in turn is a clustering inside
    bedpe_df,
    cluster_colname = "cluster",
    valency_colname = "valency",
    bed_cols = ["chrom", "cluster_start", "cluster_end"],
    bedpe_cols1 = ["chrom1", "start1", "end1"],
    bedpe_cols2 = ["chrom2", "start2", "end2"],
):
    """
    Annotate anchors with their valency: the number of `bedpe_df` anchors they overlap.

    Every row of `bed_df` is overlapped with the left (`bedpe_cols1`) and with the right
    (`bedpe_cols2`) anchors of `bedpe_df`; the valency of a cluster is the total number
    of overlaps found on both sides.

    `bed_df` must carry a `cluster_colname` column whose values are exactly
    0 .. len(bed_df)-1 (in any order) - this is asserted.

    Returns a copy of `bed_df` with an integer `valency_colname` column added and
    `cluster_colname` moved to the first position.

    Raises ValueError when `cluster_colname` is missing from `bed_df`.
    """
    if cluster_colname not in bed_df.columns:
        raise ValueError("bed_df does not seem to be the result of bedpe_to_anchors/clustering ...")

    def _overlapped_cluster_ids(bedpe_cols):
        # cluster ids of bed_df rows, repeated once per overlapped bedpe anchor;
        # rows without any overlap (NaNs in the suffixed bedpe columns) are dropped
        _hits = bioframe.overlap(
            bed_df,
            bedpe_df,
            how='left',
            cols1=bed_cols,
            cols2=bedpe_cols,
        )
        return _hits.dropna(subset=[f"{c}_" for c in bedpe_cols])[cluster_colname]

    _left_ids = _overlapped_cluster_ids(bedpe_cols1)
    _right_ids = _overlapped_cluster_ids(bedpe_cols2)

    # sanity check - cluster ids have to enumerate all of the anchors, without gaps ...
    _n = len(bed_df)
    _all_ids = np.arange(_n)
    assert ( bed_df[cluster_colname].sort_values() == _all_ids ).all()

    # zero-filled template, so that clusters without overlaps end up with a count of 0
    _zeros = pd.Series(
        data=np.zeros(_n),
        index=pd.Index(data=_all_ids, name=cluster_colname),
        name="count"
    )
    _per_side = [
        (_zeros + _ids.value_counts()).fillna(0)
        for _ids in (_left_ids, _right_ids)
    ]
    _total = _per_side[0] + _per_side[1]

    # assign back by cluster id (index alignment), not by position
    _annotated = bed_df.set_index(cluster_colname)
    _annotated[valency_colname] = _total.astype(int)
    return _annotated.reset_index()


# Let's draw an all-by-all grid of IDs ...
... to demonstrate what's being called and what's not ...

In [None]:
# display the anchors table that the cells below subset to a region
# (`_anchors5` is not defined in this part of the notebook)
_anchors5

### turn anchors into an all-by-all grid that fits in the region ...

In [None]:

# Two windows on chr6 starting at 129 Mb: `_region1` (1.5 Mb) is used for the rows
# and `_region2` (2.7 Mb) for the columns of the heatmaps drawn further down.
_region1 = ('chr6', 129_000_000, 129_000_000+1_500_000)
_region2 = ('chr6', 129_000_000, 129_000_000+2_700_000)


# use the peak coordinates of every anchor as its interval ...
_anchors = _anchors5[["chrom","peak_start","peak_end","cluster"]].rename(columns={"peak_start":"start","peak_end":"end"})
# ... and keep only the anchors of the wider window
_anchors_reg = bioframe.select(_anchors, _region2).reset_index(drop=True)
# re-base cluster ids so they start at 0 - `calculate_valencies` asserts that the ids
# are exactly 0..n-1, so this presumes the selected anchors have consecutive ids
_anchors_reg["cluster"] = _anchors_reg["cluster"] - _anchors_reg["cluster"].min()
_anchors_reg = calculate_valencies(
    _anchors_reg,   # must be output of bedpe_to_anchors, which in turn is a clustering inside
    clustered_pixels,
    cluster_colname = "cluster",
    valency_colname = "valency",
    bed_cols = ["chrom", "start", "end"],
    bedpe_cols1 = ["chrom1", "start1", "end1"],
    bedpe_cols2 = ["chrom2", "start2", "end2"],
)

# # print(_anchors_reg)
# _anchors_reg = _anchors_reg.query("valency > 1").reset_index(drop=True)
# # _anchors_reg = _anchors_reg.sample(n=10, weights="valency").sort_index()

# pair up the anchors of the window - the all-by-all "grid", columns suffixed "1"/"2".
# NOTE(review): `_start` and `_end` are not defined in this cell - they must come from
# an earlier one; with +100 Mb added, `max_sep` is presumably meant to be "no limit".
_bedpe_region = bioframe.pair_by_distance(
    _anchors_reg,
    min_sep=0,
    # max_sep=(_end - _start)+100_000,
    max_sep=(_end - _start)+100_000_000,
    suffixes=("1","2"),
    keep_order=True,
)

# now find those all-by-allers that were actually called in the screening procedure ...

# grid pairs whose FIRST anchor overlaps the first anchor of a called (clustered) pixel;
# columns coming from `clustered_pixels` get the "_" suffix (e.g. "labels_")
_overlap1 = bioframe.overlap(
    _bedpe_region,
    clustered_pixels,
    how='left',
    cols1=["chrom1","start1","end1"],
    cols2=["chrom1","start1","end1"],
    suffixes=('', '_'),
).dropna().set_index(["cluster1","cluster2"])


# same for the SECOND anchor of every grid pair
_overlap2 = bioframe.overlap(
    _bedpe_region,
    clustered_pixels,
    how='left',
    cols1=["chrom2","start2","end2"],
    cols2=["chrom2","start2","end2"],
    suffixes=('', '_'),
).dropna().set_index(["cluster1","cluster2"])


# Per (cluster1, cluster2) pair: the set of labels of called pixels hit on each side.
# Grouping once avoids the per-pair `.loc[(c1, c2), "labels_"]` lookup, which returns a
# scalar (not a Series) when a pair matches exactly one row, so `.to_list()` on it
# raises an AttributeError that `except KeyError` does not catch.
_pair_levels = ["cluster1","cluster2"]
_labels1 = {_pair: set(_labs) for _pair, _labs in _overlap1.groupby(level=_pair_levels)["labels_"]}
_labels2 = {_pair: set(_labs) for _pair, _labs in _overlap2.groupby(level=_pair_levels)["labels_"]}
_no_labels = frozenset()

# a grid pair counts as "called" when the same called pixel (label) is hit by both of
# its anchors; pairs missing from either overlap table are not called
_2d_mask = [
    bool(_labels1.get((_c1, _c2), _no_labels) & _labels2.get((_c1, _c2), _no_labels))
    for _c1, _c2 in _bedpe_region[["cluster1","cluster2"]].itertuples(index=False)
]

print(_2d_mask)

# annotate bedpe region with bins and bin widths ...

In [None]:
# For both anchors of every pair: first bin of the anchor (`clr.offset`) and the
# number of bins it spans (difference of the two ends returned by `clr.extent`).
for _side in ("1", "2"):
    _side_regions = _bedpe_region[[f"chrom{_side}", f"start{_side}", f"end{_side}"]]
    _bedpe_region[f"bin{_side}_id"] = _side_regions.apply(clr.offset, axis=1, result_type="expand")
    _side_extents = _side_regions.apply(clr.extent, axis=1, result_type="expand")
    _bedpe_region[f"bin{_side}_width"] = _side_extents.apply(np.diff, axis=1, result_type="expand")[0]

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.colors import Normalize, TwoSlopeNorm

# # https://stackoverflow.com/questions/48625475/python-shifted-logarithmic-colorbar-white-color-offset-to-center
# class MidPointLogNorm(LogNorm):
#     def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
#         LogNorm.__init__(self,vmin=vmin, vmax=vmax, clip=clip)
#         self.midpoint=midpoint
#     def __call__(self, value, clip=None):
#         # I'm ignoring masked values and all kinds of edge cases to make a
#         # simple example...
#         x, y = [np.log(self.vmin), np.log(self.midpoint), np.log(self.vmax)], [0, 0.5, 1]
#         return np.ma.masked_array(np.interp(np.log(value), x, y))



# https://stackoverflow.com/questions/48625475/python-shifted-logarithmic-colorbar-white-color-offset-to-center
class MidPointLogNorm(LogNorm):
    """
    Logarithmic color normalization with an adjustable midpoint.

    Values are mapped piecewise-linearly in log-space:
    `vmin` -> 0, `midpoint` -> 0.5, `vmax` -> 1.
    Values outside of [vmin, vmax] are clamped to 0 or 1 by `np.interp`.

    `vmin`, `midpoint` and `vmax` must all be set (and positive) before the norm is
    called - no autoscaling is attempted here. The `clip` argument is accepted for
    interface compatibility but does not change the result.
    """
    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        super().__init__(vmin=vmin, vmax=vmax, clip=clip)
        self.midpoint = midpoint

    def _log_knots(self):
        # log-space positions of the three anchor points of the mapping
        return [np.log(self.vmin), np.log(self.midpoint), np.log(self.vmax)]

    def __call__(self, value, clip=None):
        """
        Map `value` (scalar or array, possibly masked) to [0, 1].

        Entries masked in the input stay masked in the output, and entries whose
        logarithm is undefined (negative or NaN input) are masked as well, so that
        colormaps draw them with their "bad" color instead of an arbitrary one.
        Zeros have log = -inf and end up at 0, like any value below `vmin`.
        """
        # work on plain data: np.interp ignores masks anyway; silence log(0)/log(<0)
        with np.errstate(divide="ignore", invalid="ignore"):
            _log_value = np.log(np.ma.getdata(value))
        _normed = np.interp(_log_value, self._log_knots(), [0, 0.5, 1])
        _hidden = np.ma.getmaskarray(value) | np.isnan(_normed)
        return np.ma.masked_array(_normed, mask=_hidden)

    def inverse(self, value):
        """
        Map normalized `value` from [0, 1] back to data space (inverse of `__call__`).

        Raises ValueError when `vmin`/`vmax` are not set yet.
        """
        if not self.scaled():
            raise ValueError("Not invertible until scaled")
        # interpolate in log-space, then return to linear scale
        return np.exp(np.interp(value, [0, 0.5, 1], self._log_knots()))



# Shared `imshow` settings for observed/expected heatmaps: diverging colormap centered
# on 1 (i.e. observed == expected), linear on each side of the center.
imshow_kwargs = {
    # alternatives that were tried:
    #   norm=LogNorm(vmin=1/5, vmax=5)
    #   norm=MidPointLogNorm(vmin=1/10, vmax=3, midpoint=1)
    "norm": TwoSlopeNorm(vcenter=1, vmin=-0.1, vmax=3),
    # cmap="RdBu_r" was the other candidate
    "cmap": "coolwarm",
    # "none" (rather than "nearest") interpolation
    "interpolation": "none",
}


In [None]:
# Both windows have to sit in the same view region (the last column of the
# selected `hg38_arms` row is taken as the region name) - a single expected
# curve is used for the whole tile below.
region1_name = bioframe.select(hg38_arms, _region1).iat[0,-1]
region2_name = bioframe.select(hg38_arms, _region2).iat[0,-1]
assert region1_name == region2_name
region_name = region2_name
# bin spans of the two windows: rows come from `_region1`, columns from `_region2`
tile_span_i = clr.extent(_region1)
tile_span_j = clr.extent(_region2)
_the_tile = (region_name, tile_span_i, tile_span_j )

tile_start_ij = (tile_span_i[0], tile_span_j[0])
# expected as a lazily evaluated Toeplitz matrix built from the "balanced.avg"
# values of this region
lazy_exp = LazyToeplitz(
    exp_indexed.loc[region_name, region_name]["balanced.avg"].to_numpy()
)
# observed matrix slice
# NOTE(review): `clr.matrix()` is called with its defaults, which presumably means
# balanced rather than RAW values - confirm against the cooler documentation.
observed = clr.matrix()[slice(*tile_span_i), slice(*tile_span_j)]
# NOTE(review): the spans come from `clr.extent`, i.e. the same indices used to slice
# `clr.matrix()`; confirm that `LazyToeplitz` expects those rather than
# region-relative offsets.
expected = lazy_exp[slice(*tile_span_i), slice(*tile_span_j)]

# let's figure out slices' coordinates ....
# (chrom and start of the first bin, end of the last bin of every span)
_bins_i = clr.bins()[slice(*tile_span_i)]
_bins_j = clr.bins()[slice(*tile_span_j)]
_chrom_i, _start_i, _end_i = _bins_i.iloc[0]["chrom"], _bins_i.iloc[0]["start"], _bins_i.iloc[-1]["end"]
_chrom_j, _start_j, _end_j = _bins_j.iloc[0]["chrom"], _bins_j.iloc[0]["start"], _bins_j.iloc[-1]["end"]


# A wide (18 x 8) figure holding four heatmap panels stacked in a single column,
# with no spacing between the rows.
fig = plt.figure(figsize=(18, 8))
outer_grid = fig.add_gridspec(4, 1, wspace=0, hspace=0)

# heatmap axes first, top to bottom ...
ax, axb, axc, axd = [fig.add_subplot(outer_grid[_row]) for _row in range(4)]

# ... then every heatmap gets two thin strips glued to it: one below it sharing
# its x-axis and one on its left sharing its y-axis
_strip_kwargs = dict(size=0.12, pad=0.00)
_strips = []
for _heatmap_ax in (ax, axb, axc, axd):
    divider = make_axes_locatable(_heatmap_ax)
    _strips.append((
        divider.append_axes("bottom", sharex=_heatmap_ax, **_strip_kwargs),
        divider.append_axes("left", sharey=_heatmap_ax, **_strip_kwargs),
    ))
(ax_x, ax_y), (axb_x, axb_y), (axc_x, axc_y), (axd_x, axd_y) = _strips


# Lightly smoothed observed/expected map, shared by all four panels.
# NOTE(review): `cval` should have no effect with mode='reflect' - confirm against the
# scipy docs. Also presumably any NaN/inf in observed/expected spreads to the
# neighbouring pixels through the filter - verify.
gOE = scipy.ndimage.gaussian_filter(
    (observed/expected),
    sigma=0.4,
    order=0,
    mode='reflect',
    cval=0.0,
    # radius=3,
    truncate=1.0,
)


# One entry per panel, top to bottom: heatmap axes, bottom strip, left strip,
# what to outline on top of the heatmap, how to outline it and strip opacity.
hm_axes = [ax, axb, axc, axd]
x_axes = [ax_x, axb_x, axc_x, axd_x]
y_axes = [ax_y, axb_y, axc_y, axd_y]
_boxes_to_draw = [clustered_pixels_old, enriched_pixels, clustered_pixels, _bedpe_region]
# "pixel" -> single-bin squares (rectangles_around_dots),
# "box"   -> boxes with their own widths (rectangles_around_dots_ww)
_boxes_type = ["box", "pixel", "box", "box"]
_alphas = [0.7, 0.2, 0.7, 0.7]

for _ax, _axx, _axy, _bt, _boxes, _alpha in zip(
    hm_axes,
    x_axes,
    y_axes,
    _boxes_type,
    _boxes_to_draw,
    _alphas,
):
    # same smoothed obs/exp heatmap in every panel
    _ax.imshow(
        # (observed/expected),
        gOE,
        **imshow_kwargs,
    )
    # pick the drawing function and its style according to the feature type
    if _bt == "pixel":
        _draw_boxes = rectangles_around_dots
        _boxes_kwargs = dict(loc="upper", lw=1, ec="green", fc="green")
    else:
        _draw_boxes = rectangles_around_dots_ww
        _boxes_kwargs = dict(loc="upper", lw=1.5, ec="springgreen", fc="none", halo=0, ext_width=3)
    # outline the features and project them onto the strips
    for box in _draw_boxes(
        _boxes,
        _the_tile,
        **_boxes_kwargs,
    ):
        _ax.add_patch(box)
        x0, y0, dx, dy = box.get_bbox().bounds
        # the bottom strip is marked with BOTH the x- and the y-extent of every box,
        # the left strip with the y-extent only (x-extent variant is commented out)
        _axx.axvspan(x0, x0+dx, facecolor='black', alpha=_alpha)
        _axx.axvspan(y0, y0+dy, facecolor='black', alpha=_alpha)
        _axy.axhspan(y0, y0+dy, facecolor='black', alpha=_alpha)
        # _axy.axhspan(x0, x0+dx, facecolor='black', alpha=_alpha)
        ########################################################
    # no ticks anywhere - neither on the heatmap nor on the strips
    _ax.set_xticks([])
    _ax.set_yticks([])
    _axx.set_xticks([])
    _axx.set_yticks([])
    _axy.set_xticks([])
    _axy.set_yticks([])


# `plt.savefig` writes the current pyplot figure
plt.savefig("grid_csh_coolwarm_600.pdf",dpi=600)

In [None]:
# Square tile: one and the same region along both axes.
# (`_chrom`, `_start`, `_end` and `yyy` have to be defined by earlier cells)
_region = (_chrom, _start, _end)
region_name = bioframe.select(hg38_arms, _region).iat[0, -1]
tile_span_i, tile_span_j = clr.extent(_region), clr.extent(_region)
_the_tile = (region_name, tile_span_i, tile_span_j)
tile_start_ij = (tile_span_i[0], tile_span_j[0])

# observed and expected slices of the tile
_rows, _cols = slice(*tile_span_i), slice(*tile_span_j)
_expected_values = exp_indexed.loc[region_name, region_name]["balanced.avg"].to_numpy()
lazy_exp = LazyToeplitz(_expected_values)
observed = clr.matrix()[_rows, _cols]
expected = lazy_exp[_rows, _cols]

# genomic coordinates covered by the slices: first bin's chrom/start, last bin's end
_bins_i = clr.bins()[_rows]
_bins_j = clr.bins()[_cols]
_chrom_i, _start_i, _end_i = _bins_i.iloc[0]["chrom"], _bins_i.iloc[0]["start"], _bins_i.iloc[-1]["end"]
_chrom_j, _start_j, _end_j = _bins_j.iloc[0]["chrom"], _bins_j.iloc[0]["start"], _bins_j.iloc[-1]["end"]

# left panel: observed, right panel: observed over expected (both log-scaled)
f, (axleft, axright) = plt.subplots(nrows=1, ncols=2, figsize=(24, 11), sharex=True, sharey=True)
axleft.imshow(observed, cmap="YlOrBr", interpolation="none", norm=LogNorm(0.0001, 0.01))
axright.imshow((observed/expected), cmap="coolwarm", interpolation="none", norm=LogNorm(0.25, 4))

# outline two sets of boxes on the observed panel only:
# the all-by-all grid in dark green, then `yyy` in crimson
_outline_kwargs = dict(loc="upper", lw=2, fc="none", halo=100)
for _boxes, _edge_color in ((_bedpe_region, "darkgreen"), (yyy, "crimson")):
    for box in rectangles_around_dots_ww(_boxes, _the_tile, ec=_edge_color, **_outline_kwargs):
        axleft.add_patch(box)



In [None]:
# Number of anchors in the grid: distinct `cluster1` values plus one (presumably
# because the last anchor never shows up as the first member of a pair - confirm).
_nnn = 1 + len(_bedpe_region["cluster1"].unique())

# a complete all-by-all grid of n anchors has n*(n-1)/2 unordered pairs
_n_pairs_expected = _nnn * (_nnn - 1) / 2
assert len(_bedpe_region) == _n_pairs_expected

In [None]:
# Flank (in basepairs) added on each side of every pair when cutting out snippets
_flank = 75_000

# stack of snippets, one per pair of the all-by-all grid
_pileup_kwargs = dict(
    view_df=hg38_arms,
    expected_df=exp,
    flank=_flank,
    nproc=12,
)
_region_stack = cooltools.pileup(clr, _bedpe_region, **_pileup_kwargs)

In [None]:
# All-by-all grid of snippets: panel (i, j) above the diagonal shows the snippet of
# the pair of anchors i and j; the diagonal and the lower triangle are left blank.
_number = len(_bedpe_region["cluster1"].unique())+1
_width_unit = 2
_height_unit = 2
nrows, ncols = _number, _number
f,axs = plt.subplots(
    nrows=nrows,
    ncols=ncols,
    # figsize is (width, height): columns set the width, rows set the height
    figsize = (ncols*_width_unit, nrows*_height_unit),
    sharex=True,
    sharey=True,
    # always get a 2D array of axes back, even for a 1x1 grid
    squeeze=False,
)

# NOTE: this re-binds the notebook-level `imshow_kwargs` used by earlier cells
imshow_kwargs = dict(
    norm=LogNorm(vmin=1/4,vmax=4),
    # norm = mpl.colors.Normalize(vmin=1-0.9999,vmax=1+0.9999),
    interpolation="none",
    # extent=[-_flank//1000, _flank//1000, -_flank//1000, _flank//1000],
    cmap='RdBu_r',
)

# Walk the grid row by row; `_counter` indexes `_region_stack` and `_2d_mask`, both of
# which are presumably in the row order of `_bedpe_region` - verify that this order
# matches the row-major upper triangle.
_counter = 0
for (i, j), _ax in np.ndenumerate(axs):
    if j <= i:
        # diagonal and lower triangle stay empty
        _ax.set_axis_off()
        continue
    _ax.set_xticks([])
    _ax.set_yticks([])
    _ax.imshow(_region_stack[_counter], **imshow_kwargs)
    if _2d_mask[_counter]:
        # thick green frame around the pairs that were actually called
        for _spine in _ax.spines.values():
            _spine.set_color("limegreen")
            _spine.set_linewidth(5)
    _counter += 1
