In [None]:
# Create the conda environment for this script from cooltools_read_norm_env.yml.
# This is an example script for calling dots/loops on deep, published Hi-C datasets (downloaded from the 4DN website).
# It is run from within the 'scripts' subdirectory, using the following directory structure:
#Analysis_Dir
#├── data
#├── figures
#├── scripts
#├── lsf_jobs


In [None]:
# Call dots at 5 kb and 10 kb bin sizes on deep 4DN datasets; the calls are used for aggregate dot pileups in the Topo II experiments.

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import h5py
import bioframe
from matplotlib.gridspec import GridSpec
from matplotlib.gridspec import GridSpecFromSubplotSpec
import matplotlib.colors as colors
import cooler
import seaborn as sns
from matplotlib.colors import ListedColormap
import os
import matplotlib as mpl
import cooltools
import cooltools.expected
import cooltools.saddle
import cooler
import re
import itertools
import peaktools

In [4]:
# Root directory holding the .mcool files listed in `clr_names`. Derived files
# (arms BED, expected tables, dot calls) are written under `<dataDir>/data`.
# TODO: replace this placeholder with the real location before running.
dataDir = "path/to/mcool/files"

# Labels of the datasets to process; each one must be a key of `clr_names`.
conditions = [
    '4DN_HeLaS3',
    '4DN_HCT116mAC'
]

# Path of each dataset's .mcool file, relative to `dataDir`.
clr_names = {
    '4DN_HeLaS3' : 'HeLaS3_4DN_Deep/4DNFIBM9QCFG.mcool', #https://data.4dnucleome.org/files-processed/4DNFIBM9QCFG/#details
    '4DN_HCT116mAC' : 'HCT116_Rad21mAC_Untreated_Rao2017/4DNFIFLDVASC.mcool' #https://data.4dnucleome.org/files-processed/4DNFIFLDVASC/
}

In [5]:
# Bin sizes to analyse: 5 kb and 10 kb (they select the `resolutions/<size>` groups of the .mcool files).
binsizes = [5_000, 10_000]

In [6]:
# Cooler URI for every (condition, bin size) pair, addressed as clrPaths[cond][size]:
# "<dataDir>/<mcool file>::resolutions/<bin size>".
clrPaths = {
    cond: {
        size: f'{dataDir}/{clr_names[cond]}::resolutions/{size}'
        for size in binsizes
    }
    for cond in conditions
}

In [7]:
# Open every URI as a cooler.Cooler handle, mirroring the clrPaths[cond][size] layout.
coolers = {
    cond: {size: cooler.Cooler(clrPaths[cond][size]) for size in binsizes}
    for cond in conditions
}

In [8]:
# Fetch hg38 chromosome sizes and centromere positions from UCSC via bioframe.
hg38_chromsizes_all = bioframe.fetch_chromsizes('hg38', as_bed=True)
hg38_cens = bioframe.fetch_centromeres('hg38')

# Split the chromosomes at the centromere 'mid' points to obtain chromosome arms.
hg38_arms_all = bioframe.split(hg38_chromsizes_all, hg38_cens, cols_points=['chrom', 'mid'])

# Keep only the chromosomes present in the cooler (the HeLaS3 5 kb cooler is
# used as the reference list), in the cooler's chromosome order.
cooler_chromnames = coolers['4DN_HeLaS3'][5000].chromnames
hg38_chromsizes = hg38_chromsizes_all.set_index("chrom").loc[cooler_chromnames].reset_index()
hg38_arms = bioframe.parse_regions(
    hg38_arms_all.set_index("chrom").loc[cooler_chromnames].reset_index()
)

In [9]:
# Preview the last rows of the chromosome-arm table as a sanity check.
hg38_arms.tail()

Unnamed: 0,chrom,start,end,name
43,chr22,14004553,50818468,chr22:14004553-50818468
44,chrX,0,60509060,chrX:0-60509060
45,chrX,60509060,156040895,chrX:60509060-156040895
46,chrY,0,10430491,chrY:0-10430491
47,chrY,10430491,57227415,chrY:10430491-57227415


In [43]:
# Save the arms as a headerless, tab-separated file; the cluster jobs submitted
# further down pass this file to the cooltools `--regions` option.
arms_bed_path = f'{dataDir}/data/hg38_arms.bed'
arms_table = pd.DataFrame(hg38_arms)
arms_table.to_csv(arms_bed_path, sep='\t', index=False, header=False)

In [44]:
# Record the installed cooltools version alongside the results.
print(cooltools.__version__)

0.4.0


In [45]:
# Show the top-level help of the cooltools command-line tool (lists its subcommands).
!cooltools -h

Usage: cooltools [OPTIONS] COMMAND [ARGS]...

  Type -h or --help after any subcommand for more information.

Options:
  -v, --verbose  Verbose logging
  -d, --debug    Post mortem debugging
  -V, --version  Show the version and exit.
  -h, --help     Show this message and exit.

Commands:
  call-compartments   Perform eigen value decomposition on a cooler...
  call-dots           Call dots on a Hi-C heatmap that are not larger...
  compute-expected    Calculate expected Hi-C signal either for cis or...
  compute-saddle      Calculate saddle statistics and generate saddle...
  diamond-insulation  Calculate the diamond insulation scores and call...
  dump-cworld         Convert a cooler or a group of coolers into the...
  genome              Utilities for binned genome assemblies.
  logbin-expected     Logarithmically bin expected values generated using...
  random-sample       Pick a random sample of contacts from a Hi-C map,...


In [21]:
for cond in conditions:
    for size in binsizes:
        in_fname = clrPaths[cond][size]
        region_fname = f'{dataDir}/data/hg38_arms.bed'
        out_fname = f'{dataDir}/data/{cond}.{size//1000}kb.mapq30.expected.cis.cli.tsv'
        !bsub -q short -W 01:00 -e /home/eh37w/lsf_jobs/LSB_%J.err -o /home/eh37w/lsf_jobs/LSB_%J.log \
            -n 8 -R span[hosts=1] -R select[ib] -R rusage[mem=8000] -N -u erica.hildebrand@umassmed.edu \
            "cooltools compute-expected -p 8 -o $out_fname -t cis --balance --ignore-diags 2 $in_fname --regions $region_fname"


INFO: Total memory requested is 64000 MB (8 cores x 8000 MB)
WARN: Job does not specify node OS version.  Setting 'bsub -R "select[rh=6]"'
Job <2006600> is submitted to queue <short>.
INFO: Total memory requested is 64000 MB (8 cores x 8000 MB)
WARN: Job does not specify node OS version.  Setting 'bsub -R "select[rh=6]"'
Job <2006601> is submitted to queue <short>.
INFO: Total memory requested is 64000 MB (8 cores x 8000 MB)
WARN: Job does not specify node OS version.  Setting 'bsub -R "select[rh=6]"'
Job <2006602> is submitted to queue <short>.
INFO: Total memory requested is 64000 MB (8 cores x 8000 MB)
WARN: Job does not specify node OS version.  Setting 'bsub -R "select[rh=6]"'
Job <2006603> is submitted to queue <short>.


In [10]:
# Location of the cis-expected table for every (condition, bin size) pair,
# addressed as expectedPaths[cond][size].
expectedPaths = {
    cond: {
        size: f'{dataDir}/data/{cond}.{size//1000}kb.mapq30.expected.cis.cli.tsv'
        for size in binsizes
    }
    for cond in conditions
}

In [48]:
#Call dots on each condition, on each binsize, default settings
for cond in conditions:
    for size in binsizes:
        out_fname = f'{dataDir}/data/{cond}_{size//1000}kbbins_dots.txt'
        expectedPath = expectedPaths[cond][size]
        coolerPath = clrPaths[cond][size]
        region_fname = f'{dataDir}/data/hg38_arms.bed'
        !bsub -q short -W 04:00 -e /home/eh37w/lsf_jobs/LSB_%J.err -o /home/eh37w/lsf_jobs/LSB_%J.log \
            -n 8 -R span[hosts=1] -R select[ib] -R rusage[mem=8000] -N -u erica.hildebrand@umassmed.edu \
            "cooltools call-dots --nproc 8 -o $out_fname --regions $region_fname $coolerPath $expectedPath"

INFO: Total memory requested is 64000 MB (8 cores x 8000 MB)
WARN: Job does not specify node OS version.  Setting 'bsub -R "select[rh=6]"'
Job <2006861> is submitted to queue <short>.
INFO: Total memory requested is 64000 MB (8 cores x 8000 MB)
WARN: Job does not specify node OS version.  Setting 'bsub -R "select[rh=6]"'
Job <2006862> is submitted to queue <short>.
INFO: Total memory requested is 64000 MB (8 cores x 8000 MB)
WARN: Job does not specify node OS version.  Setting 'bsub -R "select[rh=6]"'
Job <2006863> is submitted to queue <short>.
INFO: Total memory requested is 64000 MB (8 cores x 8000 MB)
WARN: Job does not specify node OS version.  Setting 'bsub -R "select[rh=6]"'
Job <2006864> is submitted to queue <short>.


In [None]:
# Combine the dots called at the different bin sizes into one list per condition,
# keeping only the location of the dot called at the smallest bin size.

In [14]:
# For each condition, submit a single-core LSF job that merges the 5 kb and 10 kb
# dot lists into one BEDPE file with `peaktools merge-dot-lists-kdtree`.
for cond in conditions:
    # Inputs follow the `{cond}_{size//1000}kbbins_dots.txt` naming used for the call-dots
    # output, plus a `.postproc.bedpe` suffix (presumably written by call-dots next to
    # its -o file; confirm against the files on disk).
    loops_5kb = f'{dataDir}/data/{cond}_{5000//1000}kbbins_dots.txt.postproc.bedpe'
    loops_10kb = f'{dataDir}/data/{cond}_{10000//1000}kbbins_dots.txt.postproc.bedpe'
    outputname = f'{dataDir}/data/{cond}_merged5and10kbbins_dots.txt.postproc.bedpe'
    # IPython expands $outputname, $loops_5kb and $loops_10kb from the variables above.
    # Unlike the earlier submissions, this one also passes `-R select[rh=8]`.
    # NOTE(review): the log directory and e-mail address are hard-coded for the original author's account.
    !bsub -q short -W 01:00 -e /home/eh37w/lsf_jobs/LSB_%J.err -o /home/eh37w/lsf_jobs/LSB_%J.log -n 1 -R span[hosts=1] -R select[ib] -R rusage[mem=4000] -R select[rh=8] -N -u erica.hildebrand@umassmed.edu \
        "peaktools merge-dot-lists-kdtree --verbose --output $outputname $loops_5kb $loops_10kb"

Job <2010322> is submitted to queue <short>.
Job <2010323> is submitted to queue <short>.
