# Let's run insulation on all of the samples here - just in case ...

In [None]:

# Pin the thread count of common numerical backends to a single thread.
# NOTE(review): presumably this has to run before numpy & co. are imported
# for the settings to take effect - see the link below.
from os import environ
N_THREADS = '1'
for _thread_var in (
    'OMP_NUM_THREADS',
    'OPENBLAS_NUM_THREADS',
    'MKL_NUM_THREADS',
    'VECLIB_MAXIMUM_THREADS',
    'NUMEXPR_NUM_THREADS',
):
    environ[_thread_var] = N_THREADS

# https://superfastpython.com/numpy-number-blas-threads/

In [None]:
# import standard python libraries
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import os, subprocess
import seaborn as sns
import multiprocess as mp

In [None]:
# Import python package for working with cooler files and tools for analysis
import cooler
import cooltools.lib.plotting

In [None]:
%load_ext autoreload
%autoreload 2
# from saddle import saddleplot

In [None]:
# import cooltools, bioframe and the project-local helpers / data catalog
# (nothing is downloaded in this cell)
import cooltools
from cooltools import insulation

import bioframe
from matplotlib.colors import LogNorm
from helper_func import saddleplot
from data_catalog import bws, bws_vlim, telo_dict, telo_reps_dict
from helper_func import get_stack, show_stacks


In [None]:
from mpire import WorkerPool
import warnings

In [None]:
from tqdm import tqdm
from tqdm.notebook import trange, tqdm
import warnings

In [None]:
# Build the genomic view (chromosome arms) that is later passed to the
# insulation calls as `view_df`.

# hg38 chromosome sizes and centromeres, fetched through bioframe,
# are combined into a table of chromosome arms.
hg38_chromsizes = bioframe.fetch_chromsizes('hg38')
hg38_cens = bioframe.fetch_centromeres('hg38')
hg38_arms_full = bioframe.make_chromarms(hg38_chromsizes, hg38_cens)

# Alternative selections, kept for reference:
# # remove "bad" chromosomes and near-empty arms ...
# excluded_arms = ["chr13_p", "chr14_p", "chr15_p", "chr21_p", "chr22_p", "chrM_p", "chrY_p", "chrY_q", "chrX_p", "chrX_q"]
# hg38_arms = hg38_arms_full[~hg38_arms_full["name"].isin(excluded_arms)].reset_index(drop=True)
# # can do 1 chromosome (or arm) as well ..
# included_arms = ["chr1_q", "chr2_p", "chr4_q", "chr6_q"]

# keep the first 44 arms of the table - all autosomal ones ...
_arm_names = hg38_arms_full["name"]
included_arms = _arm_names.to_list()[:44]
hg38_arms = hg38_arms_full.loc[_arm_names.isin(included_arms)].reset_index(drop=True)

# We'll use 10 kb bins with 50/100 kb diamond sizes, and 25 kb bins with 100/150 kb diamond sizes

In [None]:
# binsize = 10_000
# windows = [5*binsize, 10*binsize]

# Two resolutions; each binsize maps to the list of diamond (window) sizes,
# in bp, that insulation is computed with at that resolution.
binsize10 = 10_000
binsize25 = 25_000

bins_diamonds_dict = {
    binsize10: [n_bins * binsize10 for n_bins in (5, 10)],
    binsize25: [n_bins * binsize25 for n_bins in (4, 6)],
}

## run the analysis on pooled replicates ...

In [None]:
# Insulation for the pooled samples: telo_ins[binsize][sample] -> insulation table
telo_ins = {}
for binsize, windows in bins_diamonds_dict.items():
    print(f"working on {binsize} ...")
    # open every pooled-replicate cooler at the current resolution
    telo_clrs = {
        _k: cooler.Cooler(f"{_path}::/resolutions/{binsize}")
        for _k, _path in telo_dict.items()
    }

    def _job(packed_data, sample):
        """Compute insulation for one sample and return ``(sample, table)``.

        ``packed_data`` is the pool's ``shared_objects`` tuple:
        (dict of coolers keyed by sample, window sizes in bp, view dataframe).
        """
        # imports live inside the job so that they are available in the worker
        from cooltools import insulation
        import warnings

        coolers_by_sample, window_sizes, view = packed_data
        with warnings.catch_warnings():
            # silence Future/Runtime warnings for the duration of the call
            warnings.filterwarnings("ignore", category=FutureWarning)
            warnings.filterwarnings("ignore", category=RuntimeWarning)
            ins_table = insulation(
                coolers_by_sample[sample],
                window_bp=window_sizes,
                view_df=view,
                chunksize=20000000,
                nproc=12,
            )
        return (sample, ins_table)

    # have to use daemon=False, because _job is multiprocessing-based already ...
    with WorkerPool(
        n_jobs=8,
        daemon=False,
        shared_objects=(telo_clrs, windows, hg38_arms),
        start_method="forkserver",
        use_dill=True,
    ) as wpool:
        results = wpool.map(_job, list(telo_clrs), progress_bar=True)

    # results is a sequence of (sample, table) pairs
    telo_ins[binsize] = dict(results)

In [None]:
# telo_ins[25_000]["mMito"]

In [None]:
! mkdir ins_bedgraph

### store bedGraphs ...

In [None]:
# Write every pooled insulation table as a tab-separated text file
# (all columns, with a header row) - one file per sample and binsize.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("ins_bedgraph", exist_ok=True)
for binsize, _ins_by_sample in telo_ins.items():
    for k in telo_dict:
        # derive output name from the sample key and the binsize in kb
        out_fname = f"{k}.{binsize//1_000}kb.bed"
        _ins_by_sample[k].to_csv(
            f"ins_bedgraph/{out_fname}",
            sep="\t",
            index=False,
        )
        print(f"generated {out_fname} ...")

## run analysis on replicates - separately ...

In [None]:
# Insulation for the individual replicates:
# telo_reps_ins[binsize][sample] -> insulation table
telo_reps_ins = {}
for binsize, windows in bins_diamonds_dict.items():
    print(f"working on {binsize} ...")
    # open every replicate-level cooler at the current resolution
    telo_reps_clrs = {
        _k: cooler.Cooler(f"{_path}::/resolutions/{binsize}")
        for _k, _path in telo_reps_dict.items()
    }

    def _job(packed_data, sample):
        """Compute insulation for one sample and return ``(sample, table)``.

        ``packed_data`` is the pool's ``shared_objects`` tuple:
        (dict of coolers keyed by sample, window sizes in bp, view dataframe).
        """
        # imports live inside the job so that they are available in the worker
        from cooltools import insulation
        import warnings

        coolers_by_sample, window_sizes, view = packed_data
        with warnings.catch_warnings():
            # silence Future/Runtime warnings for the duration of the call
            warnings.filterwarnings("ignore", category=FutureWarning)
            warnings.filterwarnings("ignore", category=RuntimeWarning)
            ins_table = insulation(
                coolers_by_sample[sample],
                window_bp=window_sizes,
                view_df=view,
                chunksize=20000000,
                nproc=12,
            )
        return (sample, ins_table)

    # have to use daemon=False, because _job is multiprocessing-based already ...
    with WorkerPool(
        n_jobs=8,
        daemon=False,
        shared_objects=(telo_reps_clrs, windows, hg38_arms),
        start_method="forkserver",
        use_dill=True,
    ) as wpool:
        results = wpool.map(_job, list(telo_reps_clrs), progress_bar=True)

    # results is a sequence of (sample, table) pairs
    telo_reps_ins[binsize] = dict(results)

### store R1/R2 bedGraphs ...

In [None]:
# ! mkdir ins_r1r2_bedgraph

In [None]:
# Write every per-replicate insulation table as a tab-separated text file
# (all columns, with a header row) - one file per sample and binsize.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("ins_r1r2_bedgraph", exist_ok=True)
for binsize, _ins_by_sample in telo_reps_ins.items():
    for k in telo_reps_dict:
        # derive output name from the sample key and the binsize in kb
        out_fname = f"{k}.{binsize//1_000}kb.bed"
        _ins_by_sample[k].to_csv(
            f"ins_r1r2_bedgraph/{out_fname}",
            sep="\t",
            index=False,
        )
        print(f"generated {out_fname} ...")

# TODO - save insulation tracks in different ways and work on them a bit ...

In [None]:
! ls *_bedgraph

In [None]:
! ls *_bedraph

In [None]:
# Plot 25 kb insulation tracks of the pooled samples over one region:
# one row per time point, "m" samples on the left, "p" samples on the right.
sub_samples_m = [
    "mMito",
    "mTelo",
    "mCyto",
    "m5hR1R2",
    "m10hR1R2",
]
sub_samples_p = [
    "pMito",
    "pTelo",
    "pCyto",
    "p5hR1R2",
    "p10hR1R2",
]

fig, axs = plt.subplots(
    nrows=len(sub_samples_m),
    ncols=2,
    figsize=(16,10),
    sharex=True,
    sharey=True,
)

# a contiguous region ... (chrom, start, end)
_start = 120_000_000
_width = 55_000_000
the_region = ("chr6", _start, _start + _width)

value_col = "log2_insulation_score_100000"

# each row of `axs` holds the (m, p) pair of axes for one time point
for i, (sample_m, sample_p, (axm, axp)) in enumerate(zip(sub_samples_m, sub_samples_p, axs)):
    for _ax, _sample in ((axm, sample_m), (axp, sample_p)):
        # select for a region
        _ins_region = bioframe.select(telo_ins[25_000][_sample], the_region)
        _n_bins = len(_ins_region)
        # thin zero line, then the insulation profile plotted against bin number
        _ax.plot([0, _n_bins], [0, 0], 'k', lw=0.25)
        _ax.plot(np.arange(0, _n_bins), _ins_region[value_col], label=value_col, linewidth=0.5, color="gray")
        _ax.set_xlim(0, _n_bins)
        _ax.set_ylim(-1.1, 1.1)
        _ax.set_xticks([])
        _ax.set_yticks([])
    if i == 0:
        axm.set_title("m")
        axp.set_title("p")
    # row label: sample name without its leading "m"
    axm.set_ylabel(sample_m.lstrip("m"))

In [None]:
# Plot 25 kb insulation tracks of the individual replicates over one region:
# one row per sample, "m" samples on the left, "p" samples on the right.
sub_samples_m = [
    "mMito_R1",
    "mMito_R2",
    "mTelo",
    "mCyto",
    "m5h_R1",
    "m5h_R2",
    "m10h_R1",
    "m10h_R2",
]
sub_samples_p = [
    "pMito_R1",
    "pMito_R2",
    "pTelo",
    "pCyto",
    "p5h_R1",
    "p5h_R2",
    "p10h_R1",
    "p10h_R2",
]


fig, axs = plt.subplots(
    nrows=len(sub_samples_m),
    ncols=2,
    figsize=(16,10),
    sharex=True,
    sharey=True,
)

# a contiguous region ... (chrom, start, end)
_start = 120_000_000
_width = 55_000_000
the_region = ("chr6", _start, _start + _width)

value_col = "log2_insulation_score_100000"

# each row of `axs` holds the (m, p) pair of axes for one sample
for i, (sample_m, sample_p, (axm, axp)) in enumerate(zip(sub_samples_m, sub_samples_p, axs)):
    for _ax, _sample in ((axm, sample_m), (axp, sample_p)):
        # select for a region
        _ins_region = bioframe.select(telo_reps_ins[25_000][_sample], the_region)
        _n_bins = len(_ins_region)
        # thin zero line, then the insulation profile plotted against bin number
        _ax.plot([0, _n_bins], [0, 0], 'k', lw=0.25)
        _ax.plot(np.arange(0, _n_bins), _ins_region[value_col], label=value_col, linewidth=0.5, color="gray")
        _ax.set_xlim(0, _n_bins)
        _ax.set_ylim(-1.1, 1.1)
        _ax.set_xticks([])
        _ax.set_yticks([])
    if i == 0:
        axm.set_title("m")
        axp.set_title("p")
    # row label: sample name without its leading "m"
    axm.set_ylabel(sample_m.lstrip("m"))

In [None]:
# Pearson correlation of insulation profiles (within `the_region`) between
# the samples currently listed in sub_samples_m / sub_samples_p.
value_col = "log2_insulation_score_100000"

# telo_ins / telo_reps_ins are keyed by binsize first and by sample second,
# so indexing them directly with a sample name raises KeyError.
# Look samples up among both the pooled and the per-replicate 25 kb tables,
# whichever of the two sample lists is currently defined.
_corr_binsize = 25_000
_ins_lookup = {**telo_ins[_corr_binsize], **telo_reps_ins[_corr_binsize]}

ddd = {
    sample: bioframe.select(_ins_lookup[sample], the_region)[value_col]
    for sample in sub_samples_m + sub_samples_p
}
# to correlate genome-wide instead: ddd[sample] = _ins_lookup[sample][value_col]

sns.heatmap(pd.DataFrame(ddd).corr(method='pearson'), annot=True, cmap="Reds",vmin=0,vmax=1)


In [None]:
# telo_ins is keyed by binsize first, then by sample name
telo_ins[25_000]["pCyto"]["log2_insulation_score_100000"]

In [None]:
from sklearn import decomposition
# unused but required import for doing 3d projections with matplotlib < 3.2
import mpl_toolkits.mplot3d  # noqa: F401

# PCA of insulation profiles: fit on a subset of samples (_pca_samples) and
# project the remaining samples (_rest_samples) onto the same components.
_key = "log2_insulation_score_100000"

# telo_ins is keyed by binsize first, then by sample name - iterating over it
# directly yields binsizes, not samples. Work on the 25 kb tables.
_ins_25kb = telo_ins[25_000]

# one column of scores per sample
_all_scores = pd.concat(
    [_df[_key].rename(_sample) for _sample, _df in _ins_25kb.items()],
    axis=1,
)
# bins where at least one sample has no score - excluded for every sample
_common_mask = _all_scores.isna().any(axis=1).to_numpy()

_pca_samples = [
    # 'mTelo',
    # 'mCyto',
    'm5hR1R2',
    'm10hR1R2',
    # 'pTelo',
    # 'pCyto',
    'p5hR1R2',
    'p10hR1R2',
    # 'mp10hR1R2',
    'N93m5',
    'N93m10',
    'N93p5',
    'N93p10',
    # 'N93mp10',
]
_rest_samples = [_sample for _sample in _ins_25kb if (_sample not in _pca_samples)]

# rows are samples, columns are the bins that are valid in all samples
X = _all_scores[_pca_samples].to_numpy()[~_common_mask].T
X_rest = _all_scores[_rest_samples].to_numpy()[~_common_mask].T

pca = decomposition.PCA(n_components=5)
print("running PCA ...")
pca.fit(X)
X_trans = pca.transform(X)
X_rest_trans = pca.transform(X_rest)

print(pca.explained_variance_ratio_)

In [None]:

# Scatter of the first two principal components:
# samples used to fit the PCA in red, projected-only samples in blue.
ax = plt.gca()
for _coords, _labels, _color in (
    (X_trans, _pca_samples, "red"),
    (X_rest_trans, _rest_samples, "blue"),
):
    _x, _y = _coords[:, 0], _coords[:, 1]
    ax.scatter(_x, _y, s=50, color=_color)
    # label every point with its sample name
    for _txt, _xi, _yi in zip(_labels, _x, _y):
        ax.annotate(_txt, (_xi, _yi))

# axis labels carry the explained variance ratio of each component
_1, _2 = pca.explained_variance_ratio_[:2]
ax.set_xlabel(f"pc1 {_1:.2f}")
ax.set_ylabel(f"pc2 {_2:.2f}")


In [None]:
# # Write the insulation track as a bigwig:
# for k, _fname in telo_dict.items():
#     # # derive output name based on cooler's name
#     # kkk = clr_fnames[k]
#     _cname = _fname.split("/")[-1]
#     out_fname = ".".join( _cname.split(".")[:-1] )
#     # apparently insulation sometimes reports the same bin
#     df = telo_ins[k].drop_duplicates(subset=["chrom","start","end"])
#     # let's save only 50_000 bp window derived insulation ...
#     w = windows[0]
#     # store in bigwig ...
#     bioframe.to_bigwig(
#         df,
#         hg38_chromsizes,
#         f"{out_fname}.b{binsize}.insul.w{w}.bw",
#         value_field=f"log2_insulation_score_{w}",
#         path_to_binary="./bedGraphToBigWig"
#     )


In [None]:
from skimage.filters import threshold_li, threshold_otsu

In [None]:
# Histograms of boundary strength for two samples, one panel per window size.
histkwargs = dict(
    bins=10**np.linspace(-4,1,200),
    histtype='step',
    lw=2,
)

# telo_ins is keyed by binsize first, then by sample name; pick the binsize
# explicitly and take its window sizes from the config instead of relying on
# whatever `windows` was left over from an earlier loop.
_hist_binsize = 25_000
windows = bins_diamonds_dict[_hist_binsize]

f, axs = plt.subplots(len(windows), 1, sharex=True, figsize=(6,5))
# placeholders - not filled in this cell
thresholds_li = {}
thresholds_otsu = {}
for w, ax in zip(windows, axs):
    # overlay both samples on the same panel
    for _sample in ("m5hR1R2", "p5hR1R2"):
        _ins = telo_ins[_hist_binsize][_sample]
        ax.hist(
            _ins[f'boundary_strength_{w}'],
            label=_sample,
            **histkwargs
        )
    ax.text(0.01, 0.9,
             f'Window {w//1000}kb',
             ha='left',
             va='top',
             transform=ax.transAxes)
    ax.set(
        xscale='log',
        ylabel='# boundaries'
    )
    ax.legend(frameon=False)

axs[-1].set(xlabel='Boundary strength')


In [None]:
# Per-sample histograms of boundary strength with the Li and Otsu thresholds
# marked, and the number of boundaries passing each threshold annotated.
histkwargs = dict(
    bins=10**np.linspace(-4,1,200),
    histtype='step',
    lw=2,
)

# telo_ins is keyed by binsize first, then by sample name; pick the binsize
# explicitly and take its window sizes from the config instead of relying on
# whatever `windows` was left over from an earlier loop.
_hist_binsize = 25_000
windows = bins_diamonds_dict[_hist_binsize]

# one figure per sample, one panel per window size
for _sample in ("m5hR1R2", "p5hR1R2"):
    _ins = telo_ins[_hist_binsize][_sample]
    f, axs = plt.subplots(len(windows), 1, sharex=True, figsize=(6,4))
    f.suptitle(_sample)
    for w, ax in zip(windows, axs):
        _ins_values = _ins[f'boundary_strength_{w}']
        ax.hist(
            _ins_values,
            label=_sample,
            **histkwargs
        )
        # thresholds are computed on the non-NaN strengths only
        _valid_values = _ins_values.dropna()
        _li = threshold_li(_valid_values.values)
        _otsu = threshold_otsu(_valid_values.values)
        n_boundaries_li = (_valid_values >= _li).sum()
        n_boundaries_otsu = (_valid_values >= _otsu).sum()
        ax.axvline(_li, c='green')
        ax.axvline(_otsu, c='magenta')
        ax.text(0.01, 0.9,
                 f'Window {w//1000}kb',
                 ha='left',
                 va='top',
                 transform=ax.transAxes)
        ax.text(0.01, 0.7,
                f'{n_boundaries_otsu} boundaries (Otsu)',
                c='magenta',
                ha='left',
                va='top',
                transform=ax.transAxes)
        ax.text(0.01, 0.5,
                f'{n_boundaries_li} boundaries (Li)',
                c='green',
                ha='left',
                va='top',
                transform=ax.transAxes)
        ax.set(
            xscale='log',
            ylabel='# boundaries'
        )
    axs[-1].set(xlabel='Boundary strength')

# Let's save some of those boundaries ...

In [None]:
# Extract the boundaries of one sample that pass the Otsu threshold
# (50 kb window) and save their coordinates.
_sample = "m5hR1R2"
# telo_ins is keyed by binsize first, then by sample name; the 50 kb window
# is only computed at 10 kb binsize (see bins_diamonds_dict).
_ins = telo_ins[10_000][_sample]
_ins_values = _ins['boundary_strength_50000']
_valid_values = _ins_values.dropna().values
_li = threshold_li(_valid_values)
_otsu = threshold_otsu(_valid_values)
# pick one threshold and extract the boundaries ...
# (NaN strengths compare as False and are therefore dropped)
_df = _ins.loc[_ins_values >= _otsu, ["chrom","start","end"]].reset_index(drop=True)
_df.to_csv(f"{_sample}_otsu_ins_boundaries_w50kb.bed",index=False,sep="\t")

In [None]:
! ls *.bed