In [1]:
%%javascript
// Use the Sublime Text key bindings in every CodeMirror editor of this notebook.
require(
    ["codemirror/keymap/sublime", "notebook/js/cell", "base/js/namespace"],
    function (sublimeKeymap, cellModule, Jupyter) {
        // Default keymap for cells created from now on.
        cellModule.Cell.options_default.cm_config.keyMap = 'sublime';
        // Also apply the keymap to the cells that already exist.
        Jupyter.notebook.get_cells().forEach(function (existingCell) {
            existingCell.code_mirror.setOption('keyMap', 'sublime');
        });
    }
);

<IPython.core.display.Javascript object>

In [2]:
# Widen the notebook container to 90% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width: 90% !important; }</style>"))

In [3]:
# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
%matplotlib inline
# Render inline figures on a white background.
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
import warnings
# NOTE(review): this hides every warning, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')
from more_itertools import chunked
from pathlib import Path
# import all samples as dictionary ...
# NOTE(review): the star import hides where names come from; `autosomal_chroms`,
# used in later cells, is presumably defined in `samples` - confirm.
from samples import *

In [4]:
import os
# Request a single thread from MKL, numexpr and OpenMP.
for _thread_var in ("MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS", "OMP_NUM_THREADS"):
    os.environ[_thread_var] = "1"

In [5]:
# %matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and the
# old names were removed later, so use whichever spelling this installation has.
_white_style = 'seaborn-v0_8-white'
if _white_style not in mpl.style.available:
    _white_style = 'seaborn-white'
mpl.style.use(_white_style)
import seaborn as sns
import multiprocess as mp
import numpy as np
import pandas as pd
import bioframe
import cooltools
import cooler
from matplotlib.gridspec import GridSpec
# import bbi
from cooltools import snipping

In [6]:
# ins_samples = {
#     "Hap1-WT-combined.hg19" : "Hap1-WT-combined.mcool",
# }

# Sample name -> path of the multi-resolution cooler (.mcool) for that sample.
# The same sample names are used as keys of `exp_samples` and in `key_samples`.
ins_samples = {
    # cooler locations; the matching expected tables are looked up via `exp_samples`
    # CTCF degron
    "CkoCT442_NT_pool.hg19" : "/data/alv/CTCF_degron/data/CkoCT442-NT-pool.mcool",
    "CkoCT442_IAA_pool.hg19" : "/data/alv/CTCF_degron/data/CkoCT442-IAA-pool.mcool",

    # polII degron
    "PolII-NT.hg19" : "/data/alv/polII_degron/data/20200417_remap_polIIdegron/coolers_library_group/PTB2539-NT.hg19.mapq_30.1000.mcool",
    "PolII-IAA.hg19" : "/data/alv/polII_degron/data/20200417_remap_polIIdegron/coolers_library_group/PTB2539-IAA.hg19.mapq_30.1000.mcool",

    # AAVS1 clone
    "mutControl-NT.hg19" : "/data/alv/CTCF_degron/data/DDX55-TAF5L-ctrl-pool/CkoCT442-AAVS1-NT-pool.mcool",
    "mutControl-IAA.hg19" : "/data/alv/CTCF_degron/data/DDX55-TAF5L-ctrl-pool/CkoCT442-AAVS1-IAA-pool.mcool",

    # DDX55 clones
    "mutDDX55-NT.hg19" : "/data/alv/CTCF_degron/data/DDX55-TAF5L-ctrl-pool/DDX55-clones-NT.hg19.mapq_30.1000.mcool",
    "mutDDX55-IAA.hg19" : "/data/alv/CTCF_degron/data/DDX55-TAF5L-ctrl-pool/DDX55-clones-IAA.hg19.mapq_30.1000.mcool",

    # TAF5L clones
    "mutTAF5L-NT.hg19" : "/data/alv/CTCF_degron/data/DDX55-TAF5L-ctrl-pool/TAF5L-clones-NT.hg19.mapq_30.1000.mcool",
    "mutTAF5L-IAA.hg19" : "/data/alv/CTCF_degron/data/DDX55-TAF5L-ctrl-pool/TAF5L-clones-IAA.hg19.mapq_30.1000.mcool",

    # siCONTROL
    "siControl-NT.hg19" : "/data/alv/CTCF_degron/data/siCTRL-NT.hg19.mapq_30.1000.mcool",
    "siControl-IAA.hg19" : "/data/alv/CTCF_degron/data/siCTRL-IAA.hg19.mapq_30.1000.mcool",

    # siDDX55
    "siDDX55-NT.hg19" : "/data/alv/CTCF_degron/data/siDDX55-NT.hg19.mapq_30.1000.mcool",
    "siDDX55-IAA.hg19" : "/data/alv/CTCF_degron/data/siDDX55-IAA.hg19.mapq_30.1000.mcool",

    # siTAF5L
    "siTAF5L-NT.hg19" : "/data/alv/CTCF_degron/data/siTAF5L-NT.hg19.mapq_30.1000.mcool",
    "siTAF5L-IAA.hg19" : "/data/alv/CTCF_degron/data/siTAF5L-IAA.hg19.mapq_30.1000.mcool",

    # RAD21 degron
    "RAD21-NT.hg19" : "/data/alv/CTCF_degron/data/RAD21-AID-NT.hg19.mapq_30.1000.mcool",
    "RAD21-IAA.hg19" : "/data/alv/CTCF_degron/data/RAD21-AID-IAA-6H.hg19.mapq_30.1000.mcool",

    # PlaB splicing inhibition
    "CtrlPlaB-NT.hg19" : "/data/alv/CTCF_degron/data/NT-hg19-combined-90000000.mcool",
    "CtrlPlaB-IAA.hg19" : "/data/alv/CTCF_degron/data/IAA-hg19-combined-90000000.mcool",
    "PlaB-NT.hg19" : "/data/alv/CTCF_degron/data/NT-PlaB-hg19-combined-90000000.mcool",
    "PlaB-IAA.hg19" : "/data/alv/CTCF_degron/data/IAA-PlaB-hg19-combined-90000000.mcool",

    # compare with WT
    # NOTE(review): the next two keys look swapped relative to their file names
    # ("noTIR1" -> Hap1-WT-combined..., "wtHAP1" -> CkoC44-NO-TIR1-pool...).
    # `exp_samples` uses the same pairing, so confirm and, if wrong, fix both together.
    "Ctrl500M-noTIR1.hg19" : "/data/alv/CTCF_degron/data/WT-44-442-pool/Hap1-WT-combined-500000000.mcool",
    "Ctrl500M-wtHAP1.hg19" : "/data/alv/CTCF_degron/data/WT-44-442-pool/CkoC44-NO-TIR1-pool.mcool",
    "Ctrl500M-CT442-NT.hg19" : "/data/alv/CTCF_degron/data/WT-44-442-pool/CkoCT442-NT-pool-500000000.mcool",
    "Ctrl500M-CT442-IAA.hg19" : "/data/alv/CTCF_degron/data/WT-44-442-pool/CkoCT442-IAA-pool-500000000.mcool",
}

In [7]:
# ins_samples = {
#     "Hap1-WT-combined.hg19" : f"Hap1-WT-combined.mcool",
# }

# Resolution (bp) used for the pile-ups, and its short label used in file names.
binsize = 5_000
binsize_human = f"{binsize // 1_000}kb"

_MCOOL_SUFFIX = ".mcool"


def _expected_path(mcool_path):
    """Return the cis-expected table path that belongs to a cooler path.

    The expected tables sit next to the coolers and are named
    "<cooler path without '.mcool'>.<binsize_human>.cis.exp.tsv".

    Raises
    ------
    ValueError
        If `mcool_path` does not end with ".mcool".
    """
    if not mcool_path.endswith(_MCOOL_SUFFIX):
        raise ValueError(f"not an .mcool path: {mcool_path!r}")
    return f"{mcool_path[:-len(_MCOOL_SUFFIX)]}.{binsize_human}.cis.exp.tsv"


# Sample name -> path of the cis-expected table at `binsize` resolution.
# Derived from `ins_samples` so that the two tables always share the same keys
# and cannot drift apart when a sample is added or a path is changed.
exp_samples = {
    sample: _expected_path(mcool_path)
    for sample, mcool_path in ins_samples.items()
}

In [8]:
from matplotlib.colors import LogNorm
# https://stackoverflow.com/questions/48625475/python-shifted-logarithmic-colorbar-white-color-offset-to-center
class MidPointLogNorm(LogNorm):
    """Logarithmic colour normalisation with an adjustable midpoint.

    Values are mapped piecewise-linearly in log space so that `vmin` -> 0,
    `midpoint` -> 0.5 and `vmax` -> 1.

    Parameters
    ----------
    vmin, vmax : float, optional
        Data limits. When left as None they are taken from the first data
        passed to the norm (via `autoscale_None`).
    midpoint : float, optional
        Value mapped to 0.5. When None, the geometric mean of `vmin` and
        `vmax` is used, i.e. the norm has no kink at the centre.
    clip : bool
        Forwarded to `LogNorm`.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        LogNorm.__init__(self, vmin=vmin, vmax=vmax, clip=clip)
        self.midpoint = midpoint

    def __call__(self, value, clip=None):
        # `process_value` supplies the mask of the input; the scalar flag is not used.
        result, _is_scalar = self.process_value(value)
        # Without limits np.log(None) would raise, so derive them from the data.
        if self.vmin is None or self.vmax is None:
            self.autoscale_None(result)
        log_vmin, log_vmax = np.log(self.vmin), np.log(self.vmax)
        if self.midpoint is None:
            # Centre of the log range == log of the geometric mean.
            log_mid = 0.5 * (log_vmin + log_vmax)
        else:
            log_mid = np.log(self.midpoint)
        x, y = [log_vmin, log_mid, log_vmax], [0, 0.5, 1]
        return np.ma.array(np.interp(np.log(value), x, y), mask=result.mask, copy=False)

In [9]:
# Use bioframe to fetch the hg19 chromosome sizes and centromeres from UCSC,
# then split the chromosomes at the centromere "mid" coordinate.
hg19_chromsizes = bioframe.fetch_chromsizes('hg19', as_bed=True)
hg19_cens = bioframe.fetch_centromeres('hg19')
hg19_arms = bioframe.split(hg19_chromsizes, hg19_cens, cols_points=['chrom', 'mid'])

# Select only the chromosomes listed in `autosomal_chroms`, in that order.
# `autosomal_chroms` is not defined in this notebook - presumably it comes from
# the star import of `samples`; confirm.
# (Original note: this step is typically not required; it was added because the test data are reduced.)
hg19_chromsizes = hg19_chromsizes.set_index("chrom").loc[autosomal_chroms].reset_index() 
hg19_arms = hg19_arms.set_index("chrom").loc[autosomal_chroms].reset_index()
# Name the arms "<chrom>p" / "<chrom>q".
# NOTE(review): the names are assigned by position, which assumes exactly two
# rows per chromosome ordered p then q - confirm for the bioframe version in use.
hg19_arms["name"] = [f"{chrom}{arm}" for chrom in autosomal_chroms for arm in list('pq')]
# call this to automatically assign names to chromosomal arms
# (the names were already set explicitly on the previous line):
hg19_arms = bioframe.make_viewframe(hg19_arms)
# # hg19_arms.to_csv("hg19_arms.bed",sep="\t",index=False,header=False)

In [10]:
# conditions = ['442-NT','442-IAA']
# binsize = 2000
# binsize_human = f"{int(binsize/1_000)}kb"

# cooler_paths = {    
#     '442-NT' : f'/data/alv/CTCF_degron/data/CkoCT442-NT-pool.mcool::resolutions/{binsize}',
#     '442-IAA' : f'/data/alv/CTCF_degron/data/CkoCT442-IAA-pool.mcool::resolutions/{binsize}',
# }
# long_names = {
#     '442-NT' : 'CTCFdegron-442-nontreated-pool',
#     '442-IAA': 'CTCFdegron-442-auxin-pool',
# }

# pal = sns.color_palette('colorblind')
# colors = {
#     '442-NT': "#e74c3c",
#     '442-IAA': pal[2],
# }

# clrs = {
#     cond: cooler.Cooler(cooler_paths[cond]) for cond in conditions
# }

In [11]:
# # iterate over samples to calculate expected on:
# for cond in conditions:
#     lname,*_ = cooler_paths[cond].split(".mcool")
#     target_exp_file = Path(f"{lname}.{binsize_human}.cis.exp.tsv")
#     if target_exp_file.is_file():
#         print("already exist !")
#         print(target_exp_file)
#         continue
#     else:
#         print("working on ...")
#         print(cond,lname)
#         ######################################
#         !cooltools compute-expected \
#             --nproc 4 \
#             --output {target_exp_file} \
#             --regions hg19_arms.bed \
#             --contact-type cis \
#             --balance \
#             --weight-name weight \
#             --ignore-diags 2 \
#             {cooler_paths[cond]}
#         ######################################


In [12]:
# expected_paths = {    
#     '442-NT' : f'/data/alv/CTCF_degron/data/CkoCT442-NT-pool.{binsize_human}.cis.exp.tsv',
#     '442-IAA' : f'/data/alv/CTCF_degron/data/CkoCT442-IAA-pool.{binsize_human}.cis.exp.tsv',
# }
# expecteds = {
#     cond: pd.read_csv(expected_paths[cond], sep="\t") for cond in conditions
# }

### Let's read in the "boundaries", i.e. the CTCF sites

In [13]:
# Load the CTCF sites (BED3), sort them by chrom/start, keep only the
# chromosomes in `autosomal_chroms`, and renumber the rows from zero.
ctcf_df = (
    bioframe.read_table(
        "intersect-all-NT-CTCF-NO-G4-centered-RAD21",
        schema="bed3",
        index_col=False,
    )
    .sort_values(["chrom", "start"])
    .loc[lambda sites: sites["chrom"].isin(autosomal_chroms)]
    .reset_index(drop=True)
)
display(ctcf_df.head(3))

# The CTCF sites are the "boundaries" around which the windows are built below.
boundaries = ctcf_df

Unnamed: 0,chrom,start,end
0,chr1,10151,10508
1,chr1,804937,805691
2,chr1,875596,875897


### Address some inconsistencies with the current state of bioframe/cooltools interoperability ...

In [14]:
def assign_regions(features, supports):
    """
    For each feature in features dataframe assign the genomic region (support)
    that overlaps with it. In case if feature overlaps multiple supports, the
    region with largest overlap will be reported.

    Parameters
    ----------
    features : pandas.DataFrame
        Either 1D features with "chrom", "start", "end" columns, or 2D
        features with "chrom1", "start1", "end1", "chrom2", "start2", "end2"
        columns. The input is copied, not modified.
    supports : pandas.DataFrame
        Regions with "chrom", "start", "end" columns. The region label is read
        back from the overlap table as "name_2", so a "name" column is
        presumably required - confirm against the bioframe version in use.

    Returns
    -------
    pandas.DataFrame
        Copy of `features` with an added "region" column, carrying the
        original index and index name. For 2D features "region" is set only
        when both sides were assigned the same region, and is NaN otherwise.
    """

    index_name = features.index.name  # Store the name of index
    features = (
        features.copy().reset_index()
    )  # Store the original features' order as a column with original index

    # 1D features: a single chrom/start/end interval per row.
    if "chrom" in features.columns:
        overlap = bioframe.overlap(
            features,
            supports,
            how="left",
            cols1=["chrom", "start", "end"],
            cols2=["chrom", "start", "end"],
            suffixes=('_1', '_2'),
            keep_order=True,
            return_overlap=True,
        )
        overlap_columns = overlap.columns  # To filter out duplicates later
        overlap["overlap_length"] = overlap["overlap_end"] - overlap["overlap_start"]
        # Filter out overlaps with multiple regions:
        # NOTE(review): `overlap_columns` was captured before "overlap_length" was
        # added and holds every column of the overlap table, so rows count as
        # duplicates only when all of those columns match - confirm that this
        # really removes the extra hits of a feature that overlaps several supports.
        overlap = (
            overlap.sort_values("overlap_length", ascending=False)
            .drop_duplicates(overlap_columns, keep="first")
            .sort_index()
        )
        # Copy single column with overlapping region name:
        # (the assignment aligns on the row index - assumes `overlap` has one row
        # per feature, indexed like `features`; TODO confirm)
        features["region"] = overlap["name_2"]

    # 2D features: each side (1 and 2) is assigned a region separately.
    if "chrom1" in features.columns:
        for idx in ("1", "2"):
            overlap = bioframe.overlap(
                features,
                supports,
                how="left",
                cols1=[f"chrom{idx}", f"start{idx}", f"end{idx}"],
                cols2=[f"chrom", f"start", f"end"],
                keep_order=True,
                return_overlap=True,
            )
            overlap_columns = overlap.columns  # To filter out duplicates later
            overlap[f"overlap_length{idx}"] = (
                overlap[f"overlap_end{idx}"] - overlap[f"overlap_start{idx}"]
            )
            # Filter out overlaps with multiple regions:
            overlap = (
                overlap.sort_values(f"overlap_length{idx}", ascending=False)
                .drop_duplicates(overlap_columns, keep="first")
                .sort_index()
            )
            # Copy single column with overlapping region name:
            # NOTE(review): no `suffixes` are passed in this branch, so "name_2"
            # relies on bioframe's default suffixes - confirm for the installed version.
            features[f"region{idx}"] = overlap["name_2"]

        # Form a single column with region names where region1 == region2, and np.nan in other cases:
        features["region"] = np.where(
            features["region1"] == features["region2"], features["region1"], np.nan
        )
        features = features.drop(
            ["region1", "region2"], axis=1
        )  # Remove unnecessary columns

    # `reset_index()` above stored an unnamed index in a column called "index".
    features = features.set_index(
        index_name if not index_name is None else "index"
    )  # Restore the original index
    features.index.name = index_name  # Restore original index title
    return features

In [15]:
# Flank (bp) passed to make_bin_aligned_windows as `flank_bp`.
flank = 200_000
# Bin-aligned windows centred on the midpoint of every boundary.
windows = snipping.make_bin_aligned_windows(
    binsize,
    boundaries["chrom"],
    (boundaries["start"] + boundaries["end"]) // 2,
    flank_bp=flank,
)
# Annotate each window with its chromosome arm and drop the rows that contain
# NaN (e.g. windows that were not assigned a region).
windows = assign_regions(windows, hg19_arms).dropna()
# Boundaries that survived the filtering, in the same order as `windows`.
arm_sites = boundaries.loc[windows.index, :].reset_index(drop=True)
windows = windows.reset_index(drop=True)
print(len(windows), 'windows, after assigning supports')
windows.head()

9347 windows, after assigning supports


Unnamed: 0,chrom,start,end,lo,hi,region
0,chr1,-190000,215000,-38,43,chr1p
1,chr1,605000,1010000,121,202,chr1p
2,chr1,675000,1080000,135,216,chr1p
3,chr1,715000,1120000,143,224,chr1p
4,chr1,855000,1260000,171,252,chr1p


In [17]:
# IPython help lookup for snipping.pileup (interactive aid only; nothing below depends on it).
snipping.pileup?

In [18]:
# key_samples = ["Ctrl500M-CT442-NT.hg19"]#, "Ctrl500M-CT442-IAA.hg19"]

# Samples to pile up; every name must be a key of both `ins_samples` and `exp_samples`.
key_samples = ["CkoCT442_NT_pool.hg19",
                "CkoCT442_IAA_pool.hg19",
                "PolII-NT.hg19",
                "PolII-IAA.hg19",
                "mutControl-NT.hg19",
                "mutControl-IAA.hg19",
                "mutDDX55-NT.hg19",
                "mutDDX55-IAA.hg19",
                "mutTAF5L-NT.hg19",
                "mutTAF5L-IAA.hg19",
                "siControl-NT.hg19",
                "siControl-IAA.hg19",
                "siDDX55-NT.hg19",
                "siDDX55-IAA.hg19",
                "siTAF5L-NT.hg19",
                "siTAF5L-IAA.hg19",
                "RAD21-NT.hg19",
                "RAD21-IAA.hg19",
                "CtrlPlaB-NT.hg19",
                "CtrlPlaB-IAA.hg19",
                "PlaB-NT.hg19",
                "PlaB-IAA.hg19",
                "Ctrl500M-noTIR1.hg19",
                "Ctrl500M-wtHAP1.hg19",
                "Ctrl500M-CT442-NT.hg19",
                "Ctrl500M-CT442-IAA.hg19"]

# sample name -> pile-up: the stack of snippets averaged (NaN-aware) over axis 2
ctcf_pup_dict = {}

# Number of worker processes; a value of 1 (or less) runs everything serially.
nproc = 12
pool = mp.Pool(nproc) if nproc > 1 else None
map_ = pool.map if pool is not None else map

try:
    for sample in key_samples:
        print(f"working on {sample} ...")
        cname = ins_samples[sample]
        exp_name = exp_samples[sample]
        # Open the cooler at the working resolution and load its expected table.
        clr = cooler.Cooler(f"{cname}::/resolutions/{binsize}")
        exp_df = pd.read_csv(exp_name, sep="\t")
        # snipping ...
        snipper = snipping.ObsExpSnipper(clr, exp_df, regions=hg19_arms)
        stack = snipping.pileup(windows, snipper.select, snipper.snip, map=map_)
        # stacks[cond] = stack
        ctcf_pup_dict[sample] = np.nanmean(stack, axis=2)
finally:
    # Always release the worker processes, even when a sample fails; the
    # original cell never closed the pool. `map_` must not be reused afterwards.
    if pool is not None:
        pool.terminate()
        pool.join()


working on CkoCT442_NT_pool.hg19 ...
working on CkoCT442_IAA_pool.hg19 ...
working on PolII-NT.hg19 ...
working on PolII-IAA.hg19 ...
working on mutControl-NT.hg19 ...
working on mutControl-IAA.hg19 ...
working on mutDDX55-NT.hg19 ...
working on mutDDX55-IAA.hg19 ...
working on mutTAF5L-NT.hg19 ...
working on mutTAF5L-IAA.hg19 ...
working on siControl-NT.hg19 ...
working on siControl-IAA.hg19 ...
working on siDDX55-NT.hg19 ...
working on siDDX55-IAA.hg19 ...
working on siTAF5L-NT.hg19 ...
working on siTAF5L-IAA.hg19 ...
working on RAD21-NT.hg19 ...
working on RAD21-IAA.hg19 ...
working on CtrlPlaB-NT.hg19 ...
working on CtrlPlaB-IAA.hg19 ...
working on PlaB-NT.hg19 ...
working on PlaB-IAA.hg19 ...
working on Ctrl500M-noTIR1.hg19 ...
working on Ctrl500M-wtHAP1.hg19 ...
working on Ctrl500M-CT442-NT.hg19 ...
working on Ctrl500M-CT442-IAA.hg19 ...


#### Save all of the results to disk using pickle, to avoid re-running the pile-up computation

In [21]:
import pickle
# Save the pile-up dictionary to disk. The `with` block closes the file even
# if pickling raises, which the manual open()/close() pair did not guarantee.
with open("PUP_DICT_5kb_200kb_CTCF_Fig6.pickle", "wb") as file_to_store:
    pickle.dump(ctcf_pup_dict, file_to_store)

In [19]:
# # the most time consuming step !!!
# stacks = {}
# piles = {}
# for cond in conditions:
#     snipper = snipping.ObsExpSnipper(clrs[cond], expecteds[cond], regions=hg19_arms)
#     stack = snipping.pileup(windows, snipper.select, snipper.snip)
#     stacks[cond] = stack
#     piles[cond] = np.nanmean(stack, axis=2)