In [None]:
%load_ext autoreload
%autoreload 2
# from saddle import saddleplot


# Limit the thread pools of common numerical backends by exporting their
# environment variables to a single shared value; this runs ahead of the
# numerical imports in the following cell.
# Background: https://superfastpython.com/numpy-number-blas-threads/
from os import environ

N_THREADS = '1'
environ.update(
    {
        thread_var: N_THREADS
        for thread_var in (
            'OMP_NUM_THREADS',
            'OPENBLAS_NUM_THREADS',
            'MKL_NUM_THREADS',
            'VECLIB_MAXIMUM_THREADS',
            'NUMEXPR_NUM_THREADS',
        )
    }
)

In [None]:
import pandas as pd
import numpy as np
from itertools import chain

# Hi-C utilities imports:
import cooler
import bioframe
import cooltools
from cooltools.lib.numutils import fill_diag

# Visualization imports:
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import matplotlib.patches as patches
from matplotlib.ticker import EngFormatter


from data_catalog import telo_dict, pubclr_dict, mega_telo_dict

# combine both sample catalogs into a single sample-name -> cooler-path lookup
# (the `|` union assumes both objects are plain dicts, Python >= 3.9 — TODO confirm;
#  for dicts, entries of `pubclr_dict` win on duplicated keys)
clr_fname_dict = mega_telo_dict | pubclr_dict

# helper functions for plotting
bp_formatter = EngFormatter('b')
def format_ticks(ax, x=True, y=True, rotate=True):
    """
    Render genomic tick coordinates in engineering notation (shared `bp_formatter`).

    ax : matplotlib Axes to modify
    x, y : apply the formatter to the respective axis; formatting the x axis
        also moves its ticks to the bottom
    rotate : rotate x tick labels by 45 degrees
    """
    if y:
        ax.yaxis.set_major_formatter(bp_formatter)
    if x:
        x_axis = ax.xaxis
        x_axis.set_major_formatter(bp_formatter)
        x_axis.tick_bottom()
    if rotate:
        ax.tick_params(axis='x', rotation=45)


# function to draw kernels:
def draw_kernel(kernel, axis=None, cmap='viridis', ktype=None):
    """
    Draw a convolution kernel as a small image with the central (target) pixel highlighted.

    Parameters
    ----------
    kernel : 2D numpy array
        kernel mask/weights; displayed flipped along both axes, as in convolution.
    axis : matplotlib Axes, optional
        axes to draw on; a new figure/axes pair is created when omitted.
    cmap : str
        colormap used for the kernel image.
    ktype : str, optional
        kernel name used in the title ("<ktype> kernel"). When omitted, a
        notebook-level variable called `ktype` is used if one exists (this is
        what earlier callers relied on); otherwise the title is just "kernel".

    Returns
    -------
    the image object returned by `imshow` for the kernel.
    """
    if axis is None:
        _, axis = plt.subplots()
    if ktype is None:
        # backward compatibility: the title used to be taken from a global loop variable
        ktype = globals().get("ktype")
    # kernel:
    imk = axis.imshow(
        kernel[::-1, ::-1],  # flip it, as in convolution
        alpha=0.85,
        cmap=cmap,
        interpolation='nearest',
    )
    # draw a square around the target pixel:
    # x runs along columns (shape[1]) and y along rows (shape[0])
    n_rows, n_cols = kernel.shape[:2]
    x0 = n_cols // 2 - 0.5
    y0 = n_rows // 2 - 0.5
    rect = patches.Rectangle((x0, y0), 1, 1, lw=1, ec='r', fc='r')
    axis.add_patch(rect)

    # clean axis:
    axis.set_xticks([])
    axis.set_yticks([])
    axis.set_xticklabels([], visible=False)
    axis.set_yticklabels([], visible=False)
    title = "kernel" if ktype is None else "{} kernel".format(ktype)
    axis.set_title(title, fontsize=16)
    # add a checkerboard to highlight pixels:
    checkerboard = np.add.outer(range(n_rows), range(n_cols)) % 2
    # show it:
    axis.imshow(
        checkerboard,
        cmap='gray',
        interpolation='nearest',
        alpha=0.3,
    )

    return imk

In [None]:
from tqdm import tqdm
from tqdm.notebook import trange, tqdm
# import mpire for nested multi-processing
from mpire import WorkerPool
from helper_func import (
    get_stack,
    show_stacks,
    plot_stackups_lite,
    plot_stackups_sets,
    to_bigbed3,
    merge_nested,
)

In [None]:
import matplotlib as mpl
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
# create a function that yields a series of rectangles around called dots
# in a specific region, exposing the important plotting parameters
def rectangles_around_dots(dots_df, region, loc="upper", lw=1, ec="cyan", fc="none"):
    """
    Yield a series of rectangles around called dots in a given region.

    Parameters
    ----------
    dots_df : DataFrame
        BEDPE-like table with chrom1/start1/end1 and chrom2/start2/end2 columns.
    region : genomic region accepted by `bioframe.select`
        only dots with both anchors selected by this region are drawn.
    loc : {"upper", "lower"}
        "upper" puts anchor-2 coordinates on x and anchor-1 on y,
        "lower" puts anchor-1 on x and anchor-2 on y.
    lw, ec, fc :
        line width, edge color and face color forwarded to `patches.Rectangle`.

    Raises
    ------
    ValueError
        if `loc` is neither "upper" nor "lower"; raised when the generator is
        first advanced, regardless of whether any dot falls into the region.
    """
    # validate up front, so that a wrong `loc` is not silently accepted for empty regions
    if loc not in ("upper", "lower"):
        raise ValueError("loc has to be upper or lower")
    # select dots from the region (both anchors have to be selected):
    df_reg = bioframe.select(
        bioframe.select(dots_df, region, cols=("chrom1","start1","end1")),
        region,
        cols=("chrom2","start2","end2"),
    )
    rectangle_kwargs = dict(lw=lw, ec=ec, fc=fc)
    # draw rectangular "boxes" around pixels called as dots in the "region":
    for s1, s2, e1, e2 in df_reg[["start1", "start2", "end1", "end2"]].itertuples(index=False):
        width1 = e1 - s1
        width2 = e2 - s2
        if loc == "upper":
            yield patches.Rectangle((s2, s1), width2, width1, **rectangle_kwargs)
        else:
            yield patches.Rectangle((s1, s2), width1, width2, **rectangle_kwargs)

# yield square boxes for on-diagonal domains within a tile, exposing the important plotting parameters
def draw_ondiag_domains(bed_df, the_tile, lw=1, ec="cyan", fc="none", halo=30_000, ext_width=0):
    """
    Yield square on-diagonal boxes, one per row of `bed_df` visible in a tile.

    bed_df : table with `bin1_id` and `bin2_id` columns
    the_tile : (name, span1, span2); only the first two entries of each span are
        used, as lower/upper bounds for `bin1_id` / `bin2_id` respectively
    halo : margin added on both sides of each span when selecting rows
    ext_width : extra offset subtracted from the tile-relative coordinates
    lw, ec, fc : forwarded to `patches.Rectangle`
    """
    # the tile name is not needed here, only the two spans
    _, tspan1, tspan2 = the_tile
    # keep rows falling inside the (halo-padded) spans ...
    _visible = bed_df.query("""(@tspan1[0] - @halo < bin1_id < @tspan1[1] + @halo) & \
                  (@tspan2[0] - @halo < bin2_id < @tspan2[1] + @halo) """)
    # ... and express their bin ids relative to the tile origin
    _shifted = _visible.eval("""
                b1 = bin1_id - @tspan1[0] - @ext_width
                b2 = bin2_id - @tspan2[0] - @ext_width
            """)
    box_style = {"lw": lw, "ec": ec, "fc": fc}
    for lo, hi in zip(_shifted["b1"], _shifted["b2"]):
        # square anchored on the diagonal at (lo, lo), covering lo..hi inclusive
        side = hi - lo + 1
        yield patches.Rectangle((lo, lo), side, side, **box_style)


# yield rectangles around dots (in bin coordinates) within a tile, exposing the important plotting parameters
def rectangles_around_dots_ww(dots_bins_df, the_tile, loc="upper", lw=1, ec="cyan", fc="none", halo=30_000, ext_width=0):
    """
    Yield rectangles around dots given in bin coordinates, relative to a tile.

    Parameters
    ----------
    dots_bins_df : DataFrame
        table with `bin1_id`, `bin2_id`, `bin1_width` and `bin2_width` columns.
    the_tile : (name, span1, span2)
        the first two entries of each span bound `bin1_id` / `bin2_id`.
    loc : {"upper", "lower"}
        "upper" puts anchor-2 bins on x and anchor-1 bins on y, "lower" the opposite.
    lw, ec, fc :
        forwarded to `patches.Rectangle`.
    halo :
        margin added on both sides of each span when selecting visible dots.
    ext_width :
        subtracted from the tile-relative bin coordinates and added to the widths.

    Raises
    ------
    ValueError
        if `loc` is neither "upper" nor "lower"; raised when the generator is
        first advanced, even if no dot is visible in the tile.

    Side effect: prints how many dots are visible out of the total.
    """
    # validate up front, so that a wrong `loc` is not silently accepted for empty tiles
    if loc not in ("upper", "lower"):
        raise ValueError("loc has to be upper or lower")
    rectangle_kwargs = dict(lw=lw, ec=ec, fc=fc)
    # parse the tile
    _, tspan1, tspan2 = the_tile
    # select only visible "boxes" :
    _the_dots = dots_bins_df \
        .query("""(@tspan1[0] - @halo < bin1_id < @tspan1[1] + @halo) & \
                  (@tspan2[0] - @halo < bin2_id < @tspan2[1] + @halo) """) \
        .eval("""
                b1 = bin1_id - @tspan1[0] - @ext_width
                b2 = bin2_id - @tspan2[0] - @ext_width
                bin1_width = bin1_width + @ext_width
                bin2_width = bin2_width + @ext_width
            """)
    print(f"{len(_the_dots)} pixels are visible out of {len(dots_bins_df)} ...")
    for b1, b2, w1, w2 in _the_dots[["b1", "b2", "bin1_width", "bin2_width"]].itertuples(index=False):
        if loc == "upper":
            yield patches.Rectangle((b2, b1), w2+1, w1+1, **rectangle_kwargs)
        else:
            yield patches.Rectangle((b1, b2), w1+1, w2+1, **rectangle_kwargs)



def annotate_dots_wmotifs(dots_df, motif_df, extend=0):
    """
    Annotate dots with the strands of motifs overlapping each anchor and with
    a convergence status.

    Parameters
    ----------
    dots_df : DataFrame
        BEDPE-like table with chrom1/start1/end1 and chrom2/start2/end2 columns.
    motif_df : DataFrame
        motif intervals with a `strand` column.
    extend : int
        number of bases each anchor is extended by (on both sides) before
        overlapping it with the motifs. Applied to BOTH anchors.

    Returns
    -------
    DataFrame: rows of `dots_df` with extra columns
        `strand1` / `strand2` - unique strands of motifs overlapping anchor 1 / 2,
        `ctcf_status` - "conv" when "+" is among `strand1` and "-" is among
        `strand2`, "nonconv" otherwise.

    Side effect: prints the number of dots being classified.
    """
    # ......................................
    # check ctcf convergence for the dots ...
    def get_converge(_row):
        _1, _2 = _row["strand1"], _row["strand2"]
        if ("+" in _1) and ("-" in _2):
            return "conv"
        else:
            return "nonconv"

    # ......................................
    # collect unique motif strands overlapping one of the anchors (side is "1" or "2")
    def _anchor_strands(side):
        chrom_col, start_col, end_col = f"chrom{side}", f"start{side}", f"end{side}"
        # extend the anchor that is actually being overlapped
        # (the right anchor used to extend start1/end1, leaving anchor 2 unextended)
        _extended = dots_df.assign(**{
            start_col: dots_df[start_col] - extend,
            end_col: dots_df[end_col] + extend,
        })
        _ovlp = bioframe.overlap(
            _extended,
            motif_df,
            return_input=False,
            return_index=True,
            cols1=(chrom_col, start_col, end_col),
            keep_order=True,
        )
        # "index" refers to rows of dots_df, "index_" to rows of motif_df
        return _ovlp.merge(
            motif_df["strand"],
            how="left",
            left_on="index_",
            right_index=True,
        ) \
        .groupby("index")["strand"] \
        .unique() \
        .rename(f"strand{side}")

    _left_motif = _anchor_strands("1")
    _right_motif = _anchor_strands("2")
    # attach per-anchor strands back to the dots by their index ...
    return_dots_df = dots_df.merge(
        _left_motif,
        left_index=True,
        right_index=True,
    ).merge(
        _right_motif,
        left_index=True,
        right_index=True,
    )
    # ...
    print(f"calculating convergency status for {len(return_dots_df)} ...")
    return_dots_df["ctcf_status"] = return_dots_df.apply(get_converge, axis=1)
    return return_dots_df


### Chrom arms as a view

In [None]:
# Use bioframe to fetch the genomic features from the UCSC.
# chromosome sizes + centromere positions -> table of chromosome arms (p/q)
hg38_chromsizes = bioframe.fetch_chromsizes('hg38')
hg38_cens = bioframe.fetch_centromeres('hg38')
hg38_arms_full = bioframe.make_chromarms(hg38_chromsizes, hg38_cens)
# # remove "bad" chromosomes and near-empty arms ...
# excluded_arms = ["chr13_p", "chr14_p", "chr15_p", "chr21_p", "chr22_p", "chrM_p", "chrY_p", "chrY_q", "chrX_p", "chrX_q"]
# hg38_arms = hg38_arms_full[~hg38_arms_full["name"].isin(excluded_arms)].reset_index(drop=True)

# can do 1 chromosome (or arm) as well ..
# NOTE: the short list below is immediately overridden by the next line -
# comment out the next line to restrict the analysis to these few arms
included_arms = ["chr1_q", "chr2_p", "chr4_q", "chr6_q"]
# first 44 arm names; assumes arms are ordered chr1..chr22 (p, q) before X/Y/M — TODO confirm
included_arms = hg38_arms_full["name"].to_list()[:44] # all autosomal ones ...
# the view used for expected/dot-calling/pileups below: selected arms only, re-indexed from 0
hg38_arms = hg38_arms_full[hg38_arms_full["name"].isin(included_arms)].reset_index(drop=True)

# Load CTCF motifs for dot annotation ...

In [None]:
# load motifs ...
# headerless tab-separated table; column names are assigned here
# (presumably "be"/"ba" are motif scores of some kind — verify against the file's origin)
df_motif = pd.read_csv(
    "ctcf_motifs_MA0139.1.tsv",
    sep="\t",
    names=["chrom", "start", "end", "motif", "be", "ba", "strand"]
)
# quick look at the first rows to make sure the columns were parsed as expected
display(df_motif.head())

In [None]:
def adjust_arm_view(
    view_df,
    binsize,
):
    """
    adjust arm-based view of the genome to fix slightly overlapping p and q arms ...

    Parameters
    ----------
    view_df : DataFrame
        view with exactly four columns, in order: chrom, start, end, name.
    binsize : int
        number of bases the start of every "q" arm is shifted by.

    Returns
    -------
    DataFrame with the same columns as `view_df` (and a fresh 0..n-1 index),
    where `start` of regions whose name contains "q" is increased by `binsize`.
    """
    _iter_view = view_df.itertuples(index=False)
    return pd.DataFrame(
        [(c,s+binsize,e,n) if ("q" in n) else (c,s,e,n) for c,s,e,n in _iter_view],
        # take column names from the input itself, not from a notebook-level global
        columns=view_df.columns
    )


## Now let's get to pileups! First - calculate expected for all samples ...

In [None]:
# inspect the combined sample -> cooler-path lookup
clr_fname_dict

In [None]:
# 10 kb is a resolution at which one can clearly see "dots":
binsize = 10_000
# cooler files that we'll work on :
# every path is opened at the "/resolutions/<binsize>" group of a multi-resolution file,
# giving a sample-name -> Cooler lookup
sample_clrs = { _k: cooler.Cooler(f"{_path}::/resolutions/{binsize}") for _k, _path in clr_fname_dict.items() }


In [None]:
# inspect the opened coolers
sample_clrs

# cis-expected first @10kb ...

In [None]:
# samples to call dots on; the full list is immediately overridden by the
# single-sample list below - comment out the second line to process all of them
key_samples = ['pG1s_MEGA', 'Ms_MEGA', 'mG1s_MEGA', 'dldmicroc']
key_samples = ['mG1s_MEGA', ]

In [None]:
def _job(packed_data, sample):
    """
    Worker task: compute cis-expected for a single sample.

    packed_data : (kwargs for `expected_cis`, dict of coolers keyed by sample name)
    sample : key of the cooler to process

    Returns a `(sample, expected)` pair so results can be matched back to samples.
    """
    kwargs_for_expected, coolers_by_sample = packed_data
    cooler_obj = coolers_by_sample[sample]
    # imported inside the worker so that spawn/forkserver start methods work
    from cooltools import expected_cis
    return (sample, expected_cis(cooler_obj, **kwargs_for_expected))

# define expected parameters in the form of kwargs-dict:
exp_kwargs = dict(
    view_df=hg38_arms,  #  adjust_arm_view(hg38_arms, binsize),
    intra_only=False,
    nproc=12
)

# have to use daemon=False, because _job is multiprocessing-based already ...
# NOTE: up to n_jobs workers, each asking expected_cis for nproc=12 processes
# `shared_objects` is what `_job` receives as its first argument (`packed_data`)
with WorkerPool(
    n_jobs=8,
    daemon=False,
    shared_objects=( exp_kwargs, sample_clrs ),
    start_method="forkserver",  # little faster than spawn, fork is the fastest
    use_dill=True,
) as wpool:
    # iterating over the `sample_clrs` dict yields sample names - one task per sample
    results = wpool.map(_job, sample_clrs, progress_bar=True)

# sort out the results ...
# `_job` returns (sample, expected) pairs -> sample-name -> expected lookup
sample_exp_cis = {sample: _exp for sample, _exp in results}
# # old way of doing it
# telo_exps_cis = {k: cooltools.expected_cis( _clr, **exp_kwargs) for k, _clr in sample_clrs.items()}
# # old way of doing it
# telo_exps_cis = {k: cooltools.expected_cis( _clr, **exp_kwargs) for k, _clr in sample_clrs.items()}

## dot calling ...

In [None]:
# create a grid of coordinates from -half to half, to define round kernels
# see https://numpy.org/doc/stable/reference/generated/numpy.meshgrid.html for details
half = 5  # half width of the kernel
x, y = np.meshgrid(
    np.linspace(-half, half, 2*half + 1),
    np.linspace(-half, half, 2*half + 1),
)
# define inner and outer radii (squared) ...
inner_raius_squared = 7
outer_radius_squared = 38

# now define a donut-like mask as pixels between 2 radii: sqrt(7) and sqrt(38):
mask = (x**2+y**2 > inner_raius_squared) \
    & (x**2+y**2 <= outer_radius_squared)
# exclude the central column and the central row from the donut
mask[:,half] = 0
mask[half,:] = 0

# lowleft mask - zero out necessary parts
# (drop the first `half` columns and the rows from `half` onward of the donut)
mask_ll = mask.copy()
mask_ll[:,:half] = 0
mask_ll[half:,:] = 0

# vertical mask: the 3 central columns, outside of the inner disk
mask_vv = (x>-2) & (x<2) & (x**2+y**2 > inner_raius_squared-1)
# horizontal mask: the 3 central rows, outside of the inner disk
mask_hh = (y>-2) & (y<2) & (x**2+y**2 > inner_raius_squared-1)

# new kernels with more round donut and lowleft masks:
kernels_round = {'donut': mask,
 'vertical': mask_vv,
 'horizontal': mask_hh,
 'lowleft': mask_ll}

# plot rounded kernels
fig, axs = plt.subplots(ncols=4, figsize=(12,2.5))
# NOTE: keep the loop variable named `ktype` - draw_kernel uses it for the panel title
for ax, (ktype, kernel) in zip(axs, kernels_round.items()):
    imk = draw_kernel(kernel, ax)

### dots on ...

In [None]:
# call dots for every key sample and organize the results by sample name:
dots_dict = {}
# parameters shared by all dot-calling runs
dots_kwargs = dict(
    view_df=hg38_arms,
    kernels=kernels_round,
    # expected_value_col="balanced.avg.smoothed",
    expected_value_col="balanced.avg",
    # how far from the main diagonal to call dots:
    max_loci_separation=15_000_000,
    # lambda_bin_fdr=0.1,
    max_nans_tolerated=6,
    clustering_radius=21_000,
    # cluster filtering is switched off here; it is applied separately further below
    cluster_filtering=False,
    tile_size=10_000_000,
    nproc=32
)
# ...
# each sample is paired with its own cis-expected
for sample in key_samples:
    clr = sample_clrs[sample]
    exp = sample_exp_cis[sample]
    print(f"calling dots for {sample} ...")
    dots_dict[sample] = cooltools.dots( clr, expected=exp, **dots_kwargs)
# ...
# report the number of called dots per sample
for sample, dots_df in dots_dict.items():
    print(f"{sample} has {len(dots_df)} dots ")

#### number of dots - i.e. centroids of the clusters of enriched pixels
```
N93mR1 has 16405 dots 
N93pR1 has 763 dots 
RGmR1 has 16808 dots 
RGmR1R2 has 22232 dots 
RGmR2 has 14469 dots 
RGpR1 has 1799 dots 
RGpR1R2 has 4769 dots 
RGpR2 has 976 dots 
RGmR1-10h has 7779 dots 
RGpR1-10h has 575 dots 
RGmpR1-10h has 6837 dots 
```
#### before clustering ... - just number of enriched pixels
```
 N93mR1 has 40795 dots 
 N93pR1 has 1004 dots 
 RGmR1 has 44001 dots 
 RGmR1R2 has 64080 dots 
 RGmR2 has 34171 dots 
 RGpR1 has 2537 dots 
 RGpR1R2 has 6616 dots 
 RGpR2 has 1304 dots 
 RGmR1-10h has 15468 dots 
 RGpR1-10h has 706 dots 
 RGmpR1-10h has 11951 dots
```

#### ...
```
    mMito has 48 dots 
    mTelo has 2667 dots 
    mCyto has 8170 dots 
    m5hR1R2 has 22192 dots 
    m10hR1R2 has 22487 dots 
    pMito has 451 dots 
    pTelo has 1314 dots 
    pCyto has 3187 dots 
    p5hR1R2 has 4757 dots 
    p10hR1R2 has 7395 dots 
```


#### ...
```
pG1s_MEGA has 18077 dots 
Ms_MEGA has 713 dots 
mG1s_MEGA has 27373 dots 
dldmicroc has 20430 dots
```


### Make sure they look reasonable

In [None]:
# define a region to look into as an example
# NOTE: the first start/end pair (5 Mb window) is overridden by the second one (3.5 Mb window)
start = 27_275_000 - 2_500_000
end = start + 5_000_000
start = 27_275_000 - 1_600_000 - 500_000
end = start + 2_500_000 + 1_000_000
region = ('chr7', start, end)

# heatmap kwargs
# `extent` puts genomic coordinates on both axes, with the y axis running from start (top) to end (bottom)
matshow_kwargs = dict(
    cmap='YlOrBr',
    norm=LogNorm(vmax=0.05),
    extent=(start, end, end, start)
)

# colorbar kwargs
colorbar_kwargs = dict(fraction=0.046, label='corrected frequencies')

In [None]:
# testing filtration ...
_sample = "mG1s_MEGA"
_clr = sample_clrs[_sample]
_dots = dots_dict[_sample]

# a way to check how many dots are singletons and vice versa
# (explicit display: a bare expression in the middle of a cell shows nothing)
display(_dots["c_size"].value_counts().sort_index().iloc[:2])

# compute heatmap for the region
region_matrix = _clr.matrix(balance=True).fetch(region)
# blank out the main diagonal and its two nearest neighbours
for diag in [-1,0,1]:
    region_matrix = fill_diag(region_matrix, np.nan, i=diag)

# see viz.ipynb for details of heatmap visualization
f, ax = plt.subplots(figsize=(7,7))
im = ax.matshow( region_matrix, **matshow_kwargs)
format_ticks(ax, rotate=False)
plt.colorbar(im, ax=ax, **colorbar_kwargs)

# outline every called dot that falls into the region
for box in rectangles_around_dots(_dots, region, lw=1.0, ec="black"):
    ax.add_patch(box)
#
ax.set_title(_sample)

In [None]:
# thresholds for the HiCCUPS-style filtering of clustered dots
dot_filter_kwargs = dict(
    obs_raw_name='count',
    enrichment_factor_vh=1.2,
    enrichment_factor_d_and_ll=1.5,
    enrichment_factor_d_or_ll=1.5,
    FDR_orphan_threshold=0.0079,
)


def _plot_region_dots(dots_df, title):
    """
    Heatmap of the example `region` (uses the notebook-level `region_matrix`,
    `matshow_kwargs` and `colorbar_kwargs`) with black boxes around `dots_df`.
    Returns the axes that were drawn on.
    """
    # see viz.ipynb for details of heatmap visualization
    _fig, _ax = plt.subplots(figsize=(7,7))
    _im = _ax.matshow(region_matrix, **matshow_kwargs)
    format_ticks(_ax, rotate=False)
    plt.colorbar(_im, ax=_ax, **colorbar_kwargs)
    for _box in rectangles_around_dots(dots_df, region, lw=1.0, ec="black"):
        _ax.add_patch(_box)
    _ax.set_title(title)
    return _ax


# testing filtration ...
_sample = "mG1s_MEGA"
_clr = sample_clrs[_sample]
_dots = dots_dict[_sample]
print(f"number of dots BEFORE filtration {len(_dots)}")
print(_dots["c_size"].value_counts().sort_index().iloc[:5])
_dots = cooltools.api.dotfinder.cluster_filtering_hiccups( dots_dict[_sample], **dot_filter_kwargs)
print(f"number of dots AFTER filtration {len(_dots)}")
print(_dots["c_size"].value_counts().sort_index().iloc[:5])

# annotate filtered dots with CTCF motif strands at both anchors ...
_dots_ctcf = annotate_dots_wmotifs(_dots, df_motif, extend=1_000)
print("dots by ctcf status ...")
display(_dots_ctcf["ctcf_status"].value_counts())
# ... and keep only the convergent ones, sorted according to the full arm view
_dots_conv = bioframe.sort_bedframe(
    _dots_ctcf.query("ctcf_status == 'conv'").reset_index(drop=True),
    view_df=hg38_arms_full,
    cols=("chrom1","start1","end1")
).reset_index(drop=True).drop(columns=["strand1","strand2","ctcf_status"])
print(_dots_conv["c_size"].value_counts().sort_index().iloc[:5])


# decide which dots are final ...
_final_dots = _dots_conv

# compute heatmap for the region
region_matrix = _clr.matrix(balance=True).fetch(region)
# blank out the main diagonal and its two nearest neighbours
for diag in [-1,0,1]:
    region_matrix = fill_diag(region_matrix, np.nan, i=diag)

# same region, boxes before and after the ctcf-convergence filter
_plot_region_dots(_dots, f"{_sample} - before ctcf filtering")
_plot_region_dots(_final_dots, f"{_sample} - conv ctcf only")


# dots by distance here ...
# alternative binning: [0, 200_000, 1_000_000, 5_000_000, 10_000_000, 30_000_000]
dist_bins = [0, 500_000, 2_000_000, 7_000_000, 30_000_000]
#
# dedicated loop names, so that `_dots` / `sample` defined above are not overwritten
for _set_name, _set_dots in {"final": _final_dots, "ctcf_agnostic": _dots_ctcf}.items():
    print(_set_name)
    print(_set_dots.groupby(pd.cut(_set_dots.eval("start2 - start1"),bins=dist_bins)).size())
    print()


# Save lists of dots after processing, filtering etc etc ...

In [None]:
# for _sample, _dots in dots_dict.items():
#     _dots = dots_dict[_sample]
#     # print(f"number of dots BEFORE filtration {len(_dots)}")
#     # print(_dots["c_size"].value_counts().sort_index().iloc[:5])
#     _dots = cooltools.api.dotfinder.cluster_filtering_hiccups(_dots, **dot_filter_kwargs)
#     # print(f"number of dots AFTER filtration {len(_dots)}")
#     # print(_dots["c_size"].value_counts().sort_index().iloc[:5])
#     _dots\
#     .drop(columns=["region1","region2","region"]) \
#     .to_csv(f"./dots_10kb_MEGA_filtered_samples/{_sample}_10kb_wheader.bedpe",sep="\t",index=False)



# save such double filtered dots separately to share ...
# (flip the guard to False to skip writing files on a re-run)
if True:
    from pathlib import Path

    # make sure the output folder exists - `to_csv` does not create it
    _dots_out_dir = Path("dots_10kb_MEGA_final")
    _dots_out_dir.mkdir(parents=True, exist_ok=True)
    # columns shared by both output tables
    _dots_out_cols = [
        'chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2', 'count',
        'la_exp.donut.value', 'la_exp.vertical.value',
        'la_exp.horizontal.value', 'la_exp.lowleft.value', 'la_exp.donut.qval',
        'la_exp.vertical.qval', 'la_exp.horizontal.qval', 'la_exp.lowleft.qval',
        'cstart1', 'cstart2', 'c_label', 'c_size',
    ]
    # ...
    # all filtered dots, together with their ctcf status
    #(...)# "dots_10kb_MEGA_filtered_samples/mG1s_MEGA_10kb_wheader.bedpe"
    _dots_ctcf[_dots_out_cols + ['ctcf_status']].to_csv(
        _dots_out_dir / "mG1s_MEGA_10kb_wheader_CTCF_annotation.bedpe",
        sep="\t",
        index=False,
    )
    # ...
    # convergent-only dots
    #(...)# "dots_10kb_MEGA_filtered_samples/mG1s_MEGA_10kb_wheader.bedpe"
    _dots_conv[_dots_out_cols].to_csv(
        _dots_out_dir / "mG1s_MEGA_10kb_wheader_convergent.bedpe",
        sep="\t",
        index=False,
    )
# ...

# Generate the domains using the final dots
## (filtered hiccups-style and by ctcf-convergence) ...

In [None]:
# # domains using middle of the dot coordinate ...
# domains using outer dot-coordinate ...
# pre-filter super long range dots, as they include translocations ...
_the_dots_filtered = _final_dots.query("end2 - end1 <= 3_500_000").reset_index(drop=True)
print(f"number of dots before filtering {len(_final_dots)} ... and after {len(_the_dots_filtered)} ...")

# the NEW way of getting domains ...
# for a given cluster - get min start1/end1 coord, and max start2/end2 coord:
# (dots are clustered by their upstream anchor)
_clust = bioframe.cluster(
    _the_dots_filtered,
    min_dist=21_000,
    cols=("chrom1", "start1", "end1"),
    return_input=True,
    return_cluster_ids=True,
)
_index2 = _clust.groupby("cluster")["start2"].idxmax()
_index1 = _clust.groupby("cluster")["start1"].idxmin()
# now construct a BedFrame for the nested merge ...
_pre_domains = pd.DataFrame({
    "chrom": _the_dots_filtered.loc[_index1, "chrom1"].to_numpy(),
    "start": _the_dots_filtered.loc[_index1, "start1"].to_numpy(),
    "end": _the_dots_filtered.loc[_index2, "end2"].to_numpy(),
})
# _domains = _pre_domains
_domains = merge_nested(
    _pre_domains,
    # merge "touching" domains or not ...
    overlap_frac=0.735,
)

# clustering and merging operations do not preserve chrom order ...
_domains = bioframe.sort_bedframe(
    _domains,
    view_df=hg38_arms_full,
    # cols=("chrom1","start1","end1")
)
# print(_domains)

f, axs = plt.subplots(ncols=2, figsize=(9,4))
# _bins = np.r_[0,np.geomspace(100_000, 30_000_000, 100)]
_bins = np.linspace(0, 6_000_000, 25)

# check out domain size distribution
_domains.eval("end - start").hist(bins=_bins, ax=axs[0], log=True)
axs[0].set_title("domain size distribution")
# axs[0].set_xscale("log")

# inter-domain distances ...
# only between consecutive domains located on the same chromosome
_inter_domain_cis = (_domains.shift(-1)["chrom"] == _domains["chrom"])
(_domains.shift(-1)["start"] - _domains["end"])[_inter_domain_cis].hist(bins=_bins, ax=axs[1], log=True)
axs[1].set_title("inter-domain distance distribution")
# axs[1].set_xscale("log")

# distribution of dot sizes ...
_final_dots.eval("end2-end1").hist(bins=_bins, histtype="step", linewidth=3, ax=axs[0], label="dot sizes")
axs[0].legend(frameon=False)

# plotted values (and `_bins`) are in base pairs, not megabases
axs[0].set_xlabel("size, bp")
axs[1].set_xlabel("size, bp")

# _domains
print(f"number of detected domains {len(_domains)} ...")

if True:
    from pathlib import Path

    # save domains to a file !!!!
    # make sure the output folder exists - `to_csv` does not create it
    _domains_out_dir = Path("extrusion_domains")
    _domains_out_dir.mkdir(parents=True, exist_ok=True)
    # NOTE: despite the .bedpe extension, the table holds one interval per domain
    _domains.to_csv(
        _domains_out_dir / "mG1s_MEGA_10kb_double_filtered.bedpe",
        sep="\t",
        index=False,
    )

In [None]:
import scipy

In [None]:
# plain `import scipy` does not guarantee that the `ndimage` submodule is loaded
import scipy.ndimage


def _add_bin_coords(bedpe_df, clr):
    """
    Annotate both anchors of `bedpe_df` (in place) with `bin{1,2}_id` - bin
    offset reported by `clr.offset`, and `bin{1,2}_width` - difference of the
    bin extent reported by `clr.extent`. Returns the same frame.
    """
    for _side in ("1", "2"):
        _anchor = bedpe_df[[f"chrom{_side}", f"start{_side}", f"end{_side}"]]
        bedpe_df[f"bin{_side}_id"] = _anchor.apply(clr.offset, axis=1, result_type="expand")
        bedpe_df[f"bin{_side}_width"] = _anchor \
            .apply(clr.extent, axis=1, result_type="expand") \
            .apply(np.diff, axis=1, result_type="expand")[0]
    return bedpe_df


# Calculate domain defining dots ...
_region1 = ('chr7', 27_275_000-3_750_000, 27_275_000+3_750_000)
# _region1 = ('chr7', 38_000_000, 38_000_000+12_500_000)
# _region1 = ('chr7', 20_500_000, 20_500_000+12_500_000)
# _region1 = ('chr1', 28_500_000-3_500_000, 28_910_000+3_500_000)
_region2 = _region1

# domains within selected region - turn it back to bedpe ...
# anchors are the first and the last bin-sized (`binsize`) stretch of every domain
_domains_region = \
_domains.eval("""
    chrom1 = chrom
    chrom2 = chrom
    start1 = start
    end1 = start + @binsize
    start2 = end - @binsize
    end2 = end
""")[['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2']]#, 'n_intervals']]

# select domains in the region ...
_domains_region = bioframe.select(
    bioframe.select(_domains_region, _region1, cols=("chrom1","start1","end1")),
    _region2, cols=("chrom2","start2","end2"),
).reset_index(drop=True)
_domains_region = _add_bin_coords(_domains_region, _clr)

# select dots in the region ...
_the_dots_region = bioframe.select(
    bioframe.select(_final_dots, _region1, cols=("chrom1","start1","end1")),
    _region2, cols=("chrom2","start2","end2"),
).reset_index(drop=True)
_the_dots_region = _add_bin_coords(_the_dots_region, _clr)


# both regions have to belong to the same arm of the view
region1_name = bioframe.select(hg38_arms, _region1).iat[0,-1]
region2_name = bioframe.select(hg38_arms, _region2).iat[0,-1]
assert region1_name == region2_name
region_name = region2_name

# the tile: (arm name, bin span along rows, bin span along columns)
tile_span_i = _clr.extent(_region1)
tile_span_j = _clr.extent(_region2)
_the_tile = (region_name, tile_span_i, tile_span_j )
_reg1w = np.diff(tile_span_i).item()
_reg2w = np.diff(tile_span_j).item()

# observed matrix slice, lightly smoothed ...
_mat = scipy.ndimage.gaussian_filter(
    _clr.matrix()[slice(*tile_span_i), slice(*tile_span_j)],
    sigma=0.4,
    order=0,
    mode='reflect',
    cval=0.0,
    # radius=3,
    truncate=1.0,
)
imshow_kwargs = dict(
        norm=LogNorm(vmin=0.0001, vmax=0.01),
        cmap="YlOrBr",
        interpolation="nearest",
        # interpolation="none",
)

fig, ax = plt.subplots(1, 1, figsize=(8,8) )
ax.imshow(_mat, **imshow_kwargs)
ax.set_xlim(0, _reg2w)
ax.set_ylim(_reg1w, 0)
ax.set_xticks([])
ax.set_yticks([])

# black squares: domains drawn on the diagonal ...
_big_boxes_kwargs = dict(lw=1.5, ec="k", fc="none", halo=0, ext_width=0)
for box in draw_ondiag_domains(_domains_region, _the_tile, **_big_boxes_kwargs):
    ax.add_patch(box)
# blue boxes: final dots in the region ...
_big_boxes_kwargs = dict(loc="upper", lw=1.5, ec="blue", fc="none", halo=0, ext_width=0)
for box in rectangles_around_dots_ww( _the_dots_region, _the_tile, **_big_boxes_kwargs ):
    ax.add_patch(box)
# red boxes: domain corners (first/last bin of every domain) ...
_big_boxes_kwargs = dict(loc="upper", lw=1.5, ec="red", fc="none", halo=0, ext_width=0)
for box in rectangles_around_dots_ww( _domains_region, _the_tile, **_big_boxes_kwargs ):
    ax.add_patch(box)


# # draw boxes around clustered pixels ...
# _big_boxes_kwargs = dict(loc="upper", lw=1.5, ec="green", fc="none", halo=0, ext_width=1)
# for box in rectangles_around_dots_ww( _select_df_region, _the_tile, **_big_boxes_kwargs ):
#     ax.add_patch(box)

In [None]:
# ### Saving lists of dots ...
# # ! mkdir dots_10kb_MEGA_filtered_samples
# for _sample, _dots in dots_dict.items():
#     _dots = dots_dict[_sample]
#     # print(f"number of dots BEFORE filtration {len(_dots)}")
#     # print(_dots["c_size"].value_counts().sort_index().iloc[:5])
#     _dots = cooltools.api.dotfinder.cluster_filtering_hiccups(
#         _dots,
#         obs_raw_name='count',
#         enrichment_factor_vh=1.0,
#         enrichment_factor_d_and_ll=1.0,
#         enrichment_factor_d_or_ll=1.0,
#         # FDR_orphan_threshold=0.00785,
#         FDR_orphan_threshold=0.0075,
#     )
#     # print(f"number of dots AFTER filtration {len(_dots)}")
#     # print(_dots["c_size"].value_counts().sort_index().iloc[:5])
#     _dots\
#     .drop(columns=["region1","region2","region"]) \
#     .to_csv(f"./dots_10kb_MEGA_filtered_samples/{_sample}_10kb_wheader.bedpe",sep="\t",index=False)

# Legacy dot exploration is below ...

### Pileups ...

In [None]:
# NOTE: this replaces the per-sample `dots_dict` built during dot calling with a
# single "final" entry - cells above that look up dots by sample name will fail
# if re-run after this point
dots_dict = {"final": _final_dots}

In [None]:
for sample, dots_df in dots_dict.items():
    print(f"{sample} has {len(dots_df)} dots ")
# note to myself - cooltools pileup didn't like my dots because they were already annotated ...
# strip the view-annotation columns where present; errors="ignore" keeps this
# working for dot tables that were never annotated (or were stripped already)
_annotation_cols = ["region1", "region2", "region"]
dots_filter_dict = {}
for sample, _dots in dots_dict.items():
    dots_filter_dict[sample] = _dots.drop(columns=_annotation_cols, errors="ignore")

In [None]:
# reminder of which coolers are available for the pileups
sample_clrs

In [None]:
# samples for which cis-expected has been computed
sample_exp_cis.keys()

In [None]:
# dist_bins = [0, 100_000, 250_000, 500_000, 1_000_000, 2_500_000, 5_000_000, 10_000_000, 20_000_000]

# pile up the final dots in every cooler:
flank=100_000
# assumes the expected tables contain a "balanced.avg.smoothed" column — TODO confirm
pileup_kwargs = dict(
    view_df=hg38_arms,
    expected_value_col="balanced.avg.smoothed",
    flank=flank,
    nproc=32
)

# nested lookup: pups_dist[<dot-set sample>][<cooler sample>] -> stack of snippets
pups_dist = {}
for _sample_dots in key_samples:
    print(f"piling up dots called in {_sample_dots} ...")
    # NOTE: the same "final" dot list is used for every entry of `key_samples`
    _dots = dots_filter_dict["final"]
    pups_dist[_sample_dots] = {}
    for _sample_clr in sample_clrs:
        print(f"... using {_sample_clr} cooler ...")
        _clr = sample_clrs[_sample_clr]
        # placeholder, replaced by the actual stack at the end of the iteration
        pups_dist[_sample_dots][_sample_clr] = {}
        _exp = sample_exp_cis[_sample_clr]
        if len(_dots) > 1:
            _stack = cooltools.pileup( _clr, _dots, expected_df=_exp, **pileup_kwargs)
        else:
            # NOTE(review): the fallback has shape (k, k, 2), while the plotting cells
            # below index the stack per dot along axis 0 - confirm the intended layout
            _stack = np.zeros((2*int(flank/binsize)+1,2*int(flank/binsize)+1,2))
        pups_dist[_sample_dots][_sample_clr] = _stack



In [None]:
# ...
# group the dots by genomic separation of their anchors (one row of panels per group)
# grp = _dots.groupby(pd.cut(_dots.eval("start2 - start1"), bins=[0,100_000,250_000,500_000,2_500_000,15_000_000]))
grp = _dots.groupby(pd.cut(_dots.eval("start2 - start1"), bins=[0,15_000_000]))

# squeeze=False -> `axs` is always a 2D array, whatever the number of groups/coolers
fig, axs = plt.subplots(
    nrows=len(grp),
    ncols=len(sample_clrs),
    figsize=(14,12),
    sharex=True,
    sharey=True,
    squeeze=False,
)

imshow_kwargs = dict(
    norm=mpl.colors.LogNorm(1/5,5),
    cmap='RdBu_r',
)

sample_dot = "mG1s_MEGA"

# rows - distance groups, columns - coolers
for i, (_name, _grp_df) in enumerate(grp):
    # index labels of the dots in the group - used to pick their snippets from the stack
    _idx = _grp_df.index
    for j, sample_clr in enumerate(sample_clrs):
        ax = axs[i, j]
        # average snippet over the dots of the group
        _pup = np.nanmean(pups_dist[sample_dot][sample_clr][_idx], axis=0)
        im = ax.imshow( _pup, **imshow_kwargs)
        if i == 0:
            ax.set_title(f"clr:{sample_clr}", fontsize=8)
        if j == 0:
            ax.set_ylabel(f"set:{sample_dot}", fontsize=8)

# ticks: pixel positions labelled with the distance from the center, in kb
ticks_pixels = np.linspace(0, flank*2//binsize,5)
ticks_kbp = ((ticks_pixels-ticks_pixels[-1]/2)*binsize//1000).astype(int)

for ax in axs.ravel():
    ax.set_xticks(ticks_pixels)
    ax.set_yticks(ticks_pixels)
    ax.set_xticklabels(ticks_kbp)
    ax.set_yticklabels(ticks_kbp)

fig.colorbar(
    im,
    ax = axs.ravel().tolist(),
    label="obs/exp",
    ticks=[0.25,1,4],
)

In [None]:
# pileups of the long-range (2-40 Mb) final dots, one panel per key sample cooler

# squeeze=False -> `axs` is always a 2D (1 x n) array, also for a single key sample
fig, axs = plt.subplots(
    nrows=1,
    ncols=len(key_samples),
    figsize=(14,12),
    sharex=True,
    sharey=True,
    squeeze=False,
)

imshow_kwargs = dict(
    norm=mpl.colors.LogNorm(1/20,20),
    cmap='RdBu_r',
)

for i, sample_dot in enumerate(key_samples):
    _dots = dots_filter_dict["final"]
    # print(len(_dots))
    # index labels of the long-range dots - used to pick their snippets from the stack
    _idx = np.asarray(_dots.query(" 2_000_000 < start2 - start1 < 40_000_000").index)
    for j, sample_clr in enumerate(key_samples):
        # single row of panels: one column per cooler
        ax = axs[0, j]
        # print(pups_dist[sample_dot][sample_clr].shape)
        _pup = np.nanmean(pups_dist[sample_dot][sample_clr][_idx], axis=0)
        im = ax.imshow( _pup, **imshow_kwargs)
        if i == 0:
            ax.set_title(f"clr:{sample_clr}", fontsize=8)
        if j == 0:
            ax.set_ylabel(f"set:{sample_dot}", fontsize=8)

# ticks: pixel positions labelled with the distance from the center, in kb
ticks_pixels = np.linspace(0, flank*2//binsize,5)
ticks_kbp = ((ticks_pixels-ticks_pixels[-1]/2)*binsize//1000).astype(int)

for ax in axs.ravel():
    ax.set_xticks(ticks_pixels)
    ax.set_yticks(ticks_pixels)
    ax.set_xticklabels(ticks_kbp)
    ax.set_yticklabels(ticks_kbp)

fig.colorbar(
    im,
    ax = axs.ravel().tolist(),
    label="obs/exp",
    ticks=[0.25,1,4],
)

In [None]:
# pairwise pileups: rows - dot sets (key samples), columns - key sample coolers

# squeeze=False -> `axs` is always a 2D array, also for a single key sample
fig, axs = plt.subplots(
    nrows=len(key_samples),
    ncols=len(key_samples),
    figsize=(14,12),
    sharex=True,
    sharey=True,
    squeeze=False,
)

imshow_kwargs = dict(
    norm=mpl.colors.LogNorm(1/5,5),
    cmap='RdBu_r',
)

for i, sample_dot in enumerate(key_samples):
    # every stack in `pups_dist` was piled up from the "final" dots, and
    # `dots_filter_dict` has no per-sample keys - index with the same list
    _dots = dots_filter_dict["final"]
    # print(len(_dots))
    # index labels of the long-range dots - used to pick their snippets from the stack
    _idx = np.asarray(_dots.query(" 2_000_000 < start2 - start1 < 40_000_000").index)
    for j, sample_clr in enumerate(key_samples):
        ax = axs[i,j]
        # print(pups_dist[sample_dot][sample_clr].shape)
        _pup = np.nanmean(pups_dist[sample_dot][sample_clr][_idx], axis=0)
        im = ax.imshow( _pup, **imshow_kwargs)
        if i == 0:
            ax.set_title(f"clr:{sample_clr}", fontsize=8)
        if j == 0:
            ax.set_ylabel(f"set:{sample_dot}", fontsize=8)

# ticks: pixel positions labelled with the distance from the center, in kb
ticks_pixels = np.linspace(0, flank*2//binsize,5)
ticks_kbp = ((ticks_pixels-ticks_pixels[-1]/2)*binsize//1000).astype(int)

for ax in axs.ravel():
    ax.set_xticks(ticks_pixels)
    ax.set_yticks(ticks_pixels)
    ax.set_xticklabels(ticks_kbp)
    ax.set_yticklabels(ticks_kbp)

fig.colorbar(
    im,
    ax = axs.ravel().tolist(),
    label="obs/exp",
    ticks=[0.25,1,4],
)

In [None]:
# the six coordinate columns of a BEDPE record: anchor 1 first, then anchor 2
bedpe_cols = [
    f"{field}{anchor}"
    for anchor in (1, 2)
    for field in ("chrom", "start", "end")
]

# every positional (non-chrom) column is an integer
bedpe_dtype = {
    col: int
    for col in bedpe_cols
    if not col.startswith("chrom")
}


In [None]:
# reminder of the currently selected key samples
key_samples

In [None]:
# `dots_filter_dict` is built from `dots_dict = {"final": ...}` above, so the
# only available key is "final" (a per-sample key raises KeyError here)
dots_filter_dict["final"]

In [None]:
def bedpe_to_anchors(
    bedpe_df,
    view_df,  # for sorting !
    cols1 = ["chrom1", "start1", "end1"],
    cols2 = ["chrom2", "start2", "end2"],
    mode="cluster"
):
    """
    turning bedpe interactions to a bed of anchors - the simple way

    Left (cols1) and right (cols2) anchors are pooled into a single table,
    sorted with bioframe.sort_bedframe using view_df, and grouped with
    bioframe.cluster(min_dist=None). `mode` selects what is reported per cluster.

    Parameters
    ----------
    bedpe_df : pd.DataFrame
        interactions; must contain the columns named in cols1 and cols2
    view_df :
        handed to bioframe.sort_bedframe to define the sorting order
    cols1, cols2 : list of str
        (chrom, start, end) column names of the two anchors; only read, never modified
    mode : str
        "cluster"  - one row per cluster spanning cluster_start..cluster_end
        "max_size" - the largest input anchor of every cluster
        "median"   - median start and median end of the anchors of every cluster,
                     cast to int
        (a "max_valency" mode is not implemented)

    Returns
    -------
    pd.DataFrame
        one row per cluster with "chrom", "start", "end", "cluster" and
        "size" (= end - start) columns

    Raises
    ------
    ValueError
        if mode is not one of the supported values; checked before any work is done
    """
    _supported_modes = ("cluster", "max_size", "median")
    if mode not in _supported_modes:
        # fail early and say why, instead of after sorting/clustering everything
        raise ValueError(
            f"unknown mode {mode!r}: expected one of {_supported_modes}"
        )

    _cols = ["chrom", "start", "end"]
    _cluster_cols = ["chrom", "cluster_start", "cluster_end"]
    # concat left and right anchors ...
    _bed = pd.concat(
        [
            bedpe_df[cols1].rename(columns=dict(zip(cols1, _cols))),
            bedpe_df[cols2].rename(columns=dict(zip(cols2, _cols))),
        ],
        ignore_index=True,
    )
    # clustering anchors - define clusters of overlaping anchors ...
    _anchors = bioframe.cluster(
        bioframe.sort_bedframe(_bed, view_df=view_df),
        min_dist=None,
        return_input=True,
    ).reset_index(drop=True)

    if mode == "cluster":
        # simply return resulting clusters - i.e. total footprint of clustered anchors ...
        _anchors = _anchors.drop_duplicates(subset=_cluster_cols).reset_index(drop=True)
        # calculate size just in case
        _anchors["size"] = _anchors[_cluster_cols[2]] - _anchors[_cluster_cols[1]]
        # return _anchors with coordinates renamed as needed !
        return _anchors.drop(columns=["start","end"]).rename(columns={"cluster_start":"start", "cluster_end":"end"})

    if mode == "max_size":
        # return the largest anchor per cluster - size of anchors, not clusters !!!
        _anchors["size"] = _anchors[_cols[2]] - _anchors[_cols[1]]
        _largest_anchor_idx = _anchors.groupby("cluster")["size"].idxmax()
        _anchors = _anchors.loc[_largest_anchor_idx]
        # return _anchors - i.e. the largest anchor per cluster of overlaping anchors
        return _anchors.drop(columns=["cluster_start","cluster_end"]).reset_index(drop=True)

    # mode == "median":
    # return the median of start and end coords per cluster of overlaping anchors ...
    _anchors = _anchors.groupby("cluster").agg({"chrom":"first", "start":"median", "end":"median"})
    _anchors = _anchors.reset_index().astype({"start":int, "end":int})
    _anchors["size"] = _anchors[_cols[2]] - _anchors[_cols[1]]
    # return _anchors - i.e. one median-positioned anchor per cluster
    return _anchors.reset_index(drop=True)



In [None]:
dots_filter_dict["mG1s_MEGA"].query("chrom1 == 'chr1'")

In [None]:
dots_filter_dict.keys()

In [None]:
# only the last expression of a cell is rendered, so the first table used to be
# computed and silently dropped - show both explicitly
display(bedpe_to_anchors(dots_filter_dict["mG1s_MEGA"], hg38_arms))
display(bedpe_to_anchors(dots_filter_dict["dldmicroc"], hg38_arms))

In [None]:
! mkdir dot_anchors_10kb_MEGA

In [None]:
dots_filter_dict.keys()

In [None]:
# export the anchors of the mG1s_MEGA dots (default mode="cluster") as a tab-separated
# table: chrom/start/end/size columns, header row kept, index dropped
bedpe_to_anchors(dots_filter_dict["mG1s_MEGA"], hg38_arms)[["chrom", "start", "end", "size"]] \
    .to_csv("./dot_anchors_10kb_MEGA/mG1s_MEGA.bed",index=False,sep="\t")

In [None]:
! ls dot*

# What about other dots and their anchors ...

In [None]:
! ls dots_10kb_samples

In [None]:
# tab-separated BEDPE dot calls to load, keyed by a short sample id
# NOTE(review): later cells index dots_dict with "mCyto" / "mTelo" and with the entries
# of pup_samples - none of those literal keys are defined here; confirm which
# dot_fnames those cells were actually run against
dot_fnames = {
    "mega_ctrl": "dots_10kb_MEGA_samples/mG1s_MEGA_10kb_wheader.bedpe",
    "mega_depl": "dots_10kb_MEGA_samples/pG1s_MEGA_10kb_wheader.bedpe",
    "mega_mito": "dots_10kb_MEGA_samples/Ms_MEGA_10kb_wheader.bedpe",
    "cyto": "dots_10kb_samples/mCyto_10kb_wheader.bedpe",
}

# let's load them all into a dictionary ...
dots_dict = {}
for id_name, fname in dot_fnames.items():
    # first line of each file is used as the header (read_csv default)
    dots_dict[id_name] = pd.read_csv(fname, sep="\t")
    # report how many dots were read for this sample
    print(f"loaded {len(dots_dict[id_name]):5d} dots {id_name:>20} in BEDPE format ...")

In [None]:
def crude_2d_overlap(df1, df2, pad=2_500):
    """
    Count the dots of df1 that have a (crude) 2D match among the dots of df2.

    Both anchors of every df1 dot are widened by `pad` on each side, then a dot
    is kept when its first anchor overlaps the first anchor of *some* df2 dot
    and its second anchor overlaps the second anchor of *some* df2 dot.
    The two matches are not required to come from the same df2 dot - hence "crude".
    Only df1 is padded, so the result is not symmetric in (df1, df2).

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        dot calls containing the module-level bedpe_cols columns
    pad : int
        number of bases added to both sides of both df1 anchors (default 2_500)

    Returns
    -------
    int
        number of distinct (by bedpe_cols) padded df1 dots that passed both steps
    """
    anchor1 = ("chrom1", "start1", "end1")
    anchor2 = ("chrom2", "start2", "end2")

    def _keep_overlapping(query, anchor_cols):
        # left-join query dots against df2 on one anchor; df2 columns get a "_" suffix
        hits = bioframe.overlap(
            query[bedpe_cols],
            df2[bedpe_cols],
            how='left',
            return_input=True,
            return_index=False,
            return_overlap=False,
            suffixes=('', '_'),
            keep_order=None,
            cols1=anchor_cols,
            cols2=anchor_cols,
            on=None,
        )
        # rows with any missing value had no partner in df2 - drop them
        hits = hits[~hits.isna().any(axis=1)].reset_index(drop=True).astype(bedpe_dtype)
        # a query dot matching several df2 dots appears several times - count it once
        return hits.drop_duplicates(subset=bedpe_cols).reset_index(drop=True)

    # widen both anchors of the query dots
    padded = bioframe.expand(df1, pad=pad, side='both', cols=anchor1)
    padded = bioframe.expand(padded, pad=pad, side='both', cols=anchor2)

    # first anchors must overlap, then second anchors of the survivors
    matched_first = _keep_overlapping(padded, anchor1)
    matched_both = _keep_overlapping(matched_first, anchor2)

    return len(matched_both)


In [None]:
# df1 = dots_dict["mMito"]
# df2 = dots_dict["m5hR1R2"]

# xxx = bioframe.overlap(
#     df1[bedpe_cols],
#     df2[bedpe_cols],
#     how='left',
#     return_input=True,
#     return_index=False,
#     return_overlap=False,
#     suffixes=('', '_'),
#     keep_order=None,
#     cols1=("chrom1","start1","end1"),
#     cols2=("chrom1","start1","end1"),
#     on=None,
# )

# xxx = xxx[~xxx.isna().any(axis=1)].reset_index(drop=True).astype(bedpe_dtype)

# xxx = xxx.drop_duplicates(subset=bedpe_cols).reset_index(drop=True)


# yyy = bioframe.overlap(
#     xxx[bedpe_cols],
#     df2[bedpe_cols],
#     how='left',
#     return_input=True,
#     return_index=False,
#     return_overlap=False,
#     suffixes=('', '_'),
#     keep_order=None,
#     cols1=("chrom2","start2","end2"),
#     cols2=("chrom2","start2","end2"),
#     on=None,
# )

# yyy = yyy[~yyy.isna().any(axis=1)].reset_index(drop=True).astype(bedpe_dtype)
# yyy = yyy.drop_duplicates(subset=bedpe_cols).reset_index(drop=True)

# yyy

In [None]:
# pairwise overlap counts: ccc[i,j] = number of dots of sample s1 (row i) that
# crude_2d_overlap matches against sample s2 (column j)
ccc = np.zeros((len(pup_samples),len(pup_samples)))

for i,s1 in enumerate(pup_samples):
    for j,s2 in enumerate(pup_samples):
        # argument order matters: (s1, s2) and (s2, s1) are computed separately
        _num = crude_2d_overlap(dots_dict[s1],dots_dict[s2])
        print(f"{s1}-{s2}: {_num}")
        ccc[i,j] = _num


In [None]:
# heatmap of the pairwise overlap counts, rows and columns in pup_samples order
plt.imshow(ccc)
ax = plt.gca()
# one tick per sample on both axes
ax.set_xticks(np.arange(len(pup_samples)))
ax.set_yticks(np.arange(len(pup_samples)))

# label the ticks with the sample names (vertical text on the x axis)
ax.set_xticklabels(pup_samples,rotation=90)
ax.set_yticklabels(pup_samples)

In [None]:
# same matrix on a natural-log scale; any zero count becomes -inf
plt.imshow(np.log(ccc))

In [None]:
# sanity check on one pair of samples: total dot counts first ...
# NOTE(review): "mCyto" / "mTelo" are not keys of the dot_fnames dict defined in this
# notebook section - presumably from an earlier version of dots_dict; confirm
print(len(dots_dict["mCyto"]))
print(len(dots_dict["mTelo"]))


# ... then the crude 2D overlap count in both directions
print(crude_2d_overlap(dots_dict["mCyto"],dots_dict["mTelo"]))
print(crude_2d_overlap(dots_dict["mTelo"],dots_dict["mCyto"]))

In [None]:
# manual walk-through of the 2D overlap, step 1: match on the first anchor
# (no padding is applied here, unlike in crude_2d_overlap)
print(len(dots_dict["mCyto"]))
print(len(dots_dict["mTelo"]))


# left join: every mCyto dot is kept, mTelo columns get a "_" suffix
xxx = bioframe.overlap(
    dots_dict["mCyto"][bedpe_cols],
    dots_dict["mTelo"][bedpe_cols],
    how='left',
    return_input=True,
    return_index=False,
    return_overlap=False,
    suffixes=('', '_'),
    keep_order=None,
    cols1=("chrom1","start1","end1"),
    cols2=("chrom1","start1","end1"),
    on=None,
)


In [None]:
# number of step-1 rows that have no missing values, i.e. found a first-anchor partner
# (rows are not de-duplicated here, unlike in crude_2d_overlap)
print(len(xxx[~xxx.isna().any(axis=1)].astype(bedpe_dtype)))



# step 2: take those rows and match their second anchor against mTelo second anchors
yyy = bioframe.overlap(
    xxx[~xxx.isna().any(axis=1)].astype(bedpe_dtype)[bedpe_cols],
    dots_dict["mTelo"][bedpe_cols],
    how='left',
    return_input=True,
    return_index=False,
    return_overlap=False,
    suffixes=('', '_'),
    keep_order=None,
    cols1=("chrom2","start2","end2"),
    cols2=("chrom2","start2","end2"),
    on=None,
)


In [None]:
yyy[~yyy.isna().any(axis=1)]

In [None]:

dots_dict["mTelo"]

In [None]:
dots_dict["mCyto"]

In [None]:
"chrom1",
"start1",
"end1",
"chrom2",
"start2",
"end2",