In [1]:
# https://cooltools.readthedocs.io/en/latest/notebooks/viz.html

# Import the packages we will use
#Utilities
import os
import re
import itertools
import glob
import pickle
import argparse

#Data Management
import numpy as np
from numpy import diff
import pandas as pd
import h5py
import scipy
from scipy.stats import linregress
from scipy import ndimage
from functools import partial
from scipy.linalg import toeplitz

#Plotting
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib import cm
from matplotlib.gridspec import GridSpec
from matplotlib.gridspec import GridSpecFromSubplotSpec
import matplotlib.colors as colors
from matplotlib.colors import ListedColormap
import seaborn as sns
import upsetplot
from upsetplot import UpSet

#Genomics
import pairtools
import cooler
import cooltools
import bioframe
import cooltools.expected
import cooltools.saddle
from cooltools import snipping
from bioframe import overlap
import cooltools.sample

In [2]:
#Using the new hg38-aligned files; downsampling the coolers rather than the pairs files
#Only G1-sorted data is used for the main figure

In [3]:
# Input: distiller results for Figure 3
dataDir = (
    '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/Manuscript_Organized_August2021'
    '/distiller_runs/Figure3/results'
)
# Output: everything this notebook writes goes under this directory
outDataDir = (
    '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/Manuscript_Organized_August2021'
    '/Figure3_HeLa_ICRF_HiC'
)

# Short condition label -> library name used in the cooler file names.
# Insertion order matters: `conditions` below is derived from it.
long_names = {
    # replicate 1
    'AS_DMSO_R1': 'TI-HiC-Dpn-HeLa-G1Sort-DMSO-2hr-4-29-R1-T1',
    'AS_ICRF_R1': 'TI-HiC-Dpn-HeLa-G1Sort-ICRF-2hr-4-29-R1-T1',
    'MR_t4DMSO_R1': 'TI-HiC-Dpn-HeLa-MitoticRelease-t4hr-DMSO-2hr-G1Sort-4-44-R1-T1',
    'MR_t4ICRF_R1': 'TI-HiC-Dpn-HeLa-MitoticRelease-t4hr-ICRF-2hr-G1Sort-4-44-R1-T1',
    'MR_t8DMSO_R1': 'TI-HiC-Dpn-HeLa-MitoticRelease-t8hr-DMSO-6hr-G1Sort-4-44-R1-T1',
    'MR_t8ICRF_R1': 'TI-HiC-Dpn-HeLa-MitoticRelease-t8hr-ICRF-6hr-G1Sort-4-44-R1-T1',
    # replicate 2
    'AS_DMSO_R2': 'TI-HiC-Dpn-HeLa-G1Sort-DMSO-2hr-4-48-R2-T1',
    'AS_ICRF_R2': 'TI-HiC-Dpn-HeLa-G1Sort-ICRF-2hr-4-48-R2-T1',
    'MR_t4DMSO_R2': 'TI-HiC-Dpn-HeLa-MitoticRelease-t4hr-DMSO-2hr-G1Sort-4-49-R2-T1',
    'MR_t4ICRF_R2': 'TI-HiC-Dpn-HeLa-MitoticRelease-t4hr-ICRF-2hr-G1Sort-4-49-R2-T1',
    'MR_t8DMSO_R2': 'TI-HiC-Dpn-HeLa-MitoticRelease-t8hr-DMSO-6hr-G1Sort-4-49-R2-T1',
    'MR_t8ICRF_R2': 'TI-HiC-Dpn-HeLa-MitoticRelease-t8hr-ICRF-6hr-G1Sort-4-49-R2-T1',
    # replicates 1 and 2 combined
    'AS_DMSO_R1R2': 'TI-HiC-Dpn-HeLa-G1Sort-DMSO-2hr-R1R2',
    'AS_ICRF_R1R2': 'TI-HiC-Dpn-HeLa-G1Sort-ICRF-2hr-R1R2',
    'MR_t4DMSO_R1R2': 'TI-HiC-Dpn-HeLa-MitoticRelease-t4hr-DMSO-2hr-G1Sort-R1R2',
    'MR_t4ICRF_R1R2': 'TI-HiC-Dpn-HeLa-MitoticRelease-t4hr-ICRF-2hr-G1Sort-R1R2',
    'MR_t8DMSO_R1R2': 'TI-HiC-Dpn-HeLa-MitoticRelease-t8hr-DMSO-6hr-G1Sort-R1R2',
    'MR_t8ICRF_R1R2': 'TI-HiC-Dpn-HeLa-MitoticRelease-t8hr-ICRF-6hr-G1Sort-R1R2',
}

# All condition labels, in order: R1 samples, R2 samples, then the combined R1R2 sets
conditions = list(long_names)

In [4]:
# Combined (R1R2) conditions: every sample type paired with control (DMSO) and treatment (ICRF)
ComboConds = [
    f'{sample}{drug}_R1R2'
    for sample in ('AS_', 'MR_t4', 'MR_t8')
    for drug in ('DMSO', 'ICRF')
]

# Separate replicates: all R1 conditions first, then all R2 conditions
SepConds = [
    combo.replace('R1R2', rep)
    for rep in ('R1', 'R2')
    for combo in ComboConds
]

# Control (DMSO) / treatment (ICRF) subsets, keeping the order of the parent lists
ComboCtrlConds = [combo for combo in ComboConds if 'DMSO' in combo]

ComboTreatConds = [combo for combo in ComboConds if 'ICRF' in combo]

SepCtrlConds = [sep for sep in SepConds if 'DMSO' in sep]

SepTreatConds = [sep for sep in SepConds if 'ICRF' in sep]

In [5]:
#run multiQC on mapped data
#cd /nl/umw_job_dekker/users/eh37w/Topo-Inhib/Manuscript_Organized_August2021/Figure3_HeLa_ICRF_HiC/figures
#interactive_shell
#conda activate multiQC-env
#multiqc -m pairtools /nl/umw_job_dekker/users/eh37w/Topo-Inhib/Manuscript_Organized_August2021/distiller_runs/Figure3

In [6]:
#coolers - 10kb bins
binsize = 10000

#Single-replicate libraries are read from coolers_library
clr_paths_10kb = {
    cond: f'{dataDir}/coolers_library/{long_names[cond]}.hg38.mapq_30.1000.mcool::resolutions/{binsize}'
    for cond in SepConds
}
#Combined R1R2 sets are read from coolers_library_group
clr_paths_10kb.update(
    (cond, f'{dataDir}/coolers_library_group/{long_names[cond]}.hg38.mapq_30.1000.mcool::resolutions/{binsize}')
    for cond in ComboConds
)

#Open one Cooler per condition at the 10kb resolution
clrs10kb = {}
for cond in conditions:
    clrs10kb[cond] = cooler.Cooler(clr_paths_10kb[cond])

In [8]:
# Use bioframe to fetch the genomic features from the UCSC.
hg38_chromsizes = bioframe.fetch_chromsizes('hg38', as_bed=True)
hg38_cens = bioframe.fetch_centromeres('hg38')
# Arms are built from the full (unfiltered) chromsizes, split at the centromere 'mid' column.
hg38_arms = bioframe.split(
    hg38_chromsizes,
    hg38_cens,
    cols_points=['chrom', 'mid'],
)

# Select only chromosomes that are present in the cooler
# (AS_DMSO_R1 is used as the reference for the chromosome names).
hg38_chromsizes = (
    hg38_chromsizes
    .set_index("chrom")
    .loc[clrs10kb['AS_DMSO_R1'].chromnames]
    .reset_index()
)
hg38_arms = (
    hg38_arms
    .set_index("chrom")
    .loc[clrs10kb['AS_DMSO_R1'].chromnames]
    .reset_index()
)
hg38_arms = bioframe.parse_regions(hg38_arms)

In [9]:
#Read depth check: show the metadata of the MR_t8ICRF_R1 cooler
clrs10kb['MR_t8ICRF_R1'].info #'sum' should be the same as the number of reads actually used in making the cooler
#This is the file with the lowest read depth - sample the rest down to this level

{'bin-size': 10000,
 'bin-type': 'fixed',
 'creation-date': '2021-08-30T23:16:26.799393',
 'format': 'HDF5::Cooler',
 'format-url': 'https://github.com/mirnylab/cooler',
 'format-version': 3,
 'generated-by': 'cooler-0.8.5',
 'genome-assembly': 'unknown',
 'metadata': {},
 'nbins': 308839,
 'nchroms': 25,
 'nnz': 12164811,
 'storage-mode': 'symmetric-upper',
 'sum': 16812306}

In [10]:
#Sample only the 1000bp cooler (the smallest bin size) - then combine replicates and zoomify
sample_binsize = 1000

In [11]:
#URI of the sample_binsize resolution inside each single-replicate .mcool (input to sampling)
clrPaths = {
    cond: f'{dataDir}/coolers_library/{long_names[cond]}.hg38.mapq_30.1000.mcool::resolutions/{sample_binsize}'
    for cond in SepConds
}


In [12]:
#Sample each single-replicate cooler down to approx 16812305 contacts
#(one less than the 'sum' of 16812306 reported for MR_t8ICRF_R1 above)
#NOTE: bsub only submits the jobs to the cluster queue; the sampled .cool files are
#available only once those jobs have finished, so wait before running the merge step
for cond in SepConds:
    in_fname = clrPaths[cond]
    out_fname = f'{outDataDir}/data/{long_names[cond]}.sampled.hg38.mapq_30.1000.cool'
    !bsub -q short -W 01:00 -e /home/eh37w/lsf_jobs/LSB_%J.err -o /home/eh37w/lsf_jobs/LSB_%J.log \
        -n 1 -R span[hosts=1] -R select[ib] -R rusage[mem=8000] -R select[rh=8] -N -u erica.hildebrand@umassmed.edu \
        "cooltools random-sample -c 16812305 $in_fname $out_fname"

#For some reason this command requires the newer Linux OS - segmentation fault otherwise
#(presumably this is what `-R select[rh=8]` requests - TODO confirm)
#The count also had to be 1 less than the sum of the smallest cooler - otherwise that one fails

Job <2010374> is submitted to queue <short>.
Job <2010375> is submitted to queue <short>.
Job <2010376> is submitted to queue <short>.
Job <2010377> is submitted to queue <short>.
Job <2010378> is submitted to queue <short>.
Job <2010379> is submitted to queue <short>.
Job <2010380> is submitted to queue <short>.
Job <2010381> is submitted to queue <short>.
Job <2010382> is submitted to queue <short>.
Job <2010383> is submitted to queue <short>.
Job <2010384> is submitted to queue <short>.
Job <2010385> is submitted to queue <short>.


In [13]:
#Path of the sampled 1000bp .cool file for every condition: separate replicates first,
#then the combined R1R2 sets (those files are written by the replicate merge step)
sampledPaths = {}
for cond in [*SepConds, *ComboConds]:
    sampledPaths[cond] = f'{outDataDir}/data/{long_names[cond]}.sampled.hg38.mapq_30.1000.cool'

In [16]:
#Merge replicates for combined files
#For every combined (R1R2) condition, merge the two sampled replicate coolers into one
#.cool file at sampledPaths[combinedCond].
#NOTE: the inputs are written by the asynchronous bsub sampling jobs - make sure those
#jobs have finished before running this cell.

#Value passed to cooler.merge_coolers as `mergebuf`
MERGE_BUF = 20000000

for combinedCond in ComboConds:
    #Pair replicates by name (e.g. AS_DMSO_R1R2 <- AS_DMSO_R1 + AS_DMSO_R2) rather than
    #by position in `conditions`, so reordering that list cannot mis-pair samples
    cond1 = combinedCond.replace('R1R2', 'R1')
    cond2 = combinedCond.replace('R1R2', 'R2')

    cond1Sampled = sampledPaths[cond1]
    cond2Sampled = sampledPaths[cond2]

    #Fail early with a clear message if a sampling job has not written its output yet
    #(an existing file may still be incomplete if its job is still running)
    missingInputs = [path for path in (cond1Sampled, cond2Sampled) if not os.path.exists(path)]
    if missingInputs:
        raise FileNotFoundError(
            f'Sampled cooler(s) missing for {combinedCond}: {missingInputs} - have the sampling jobs finished?'
        )

    combinedfile = sampledPaths[combinedCond]

    cooler.merge_coolers(combinedfile, [cond1Sampled, cond2Sampled], mergebuf=MERGE_BUF)

    #would be more efficient to submit a bsub job for each...but not working right now - os issues I think
    #!bsub -q short -W 04:00 -e /home/eh37w/lsf_jobs/LSB_%J.err -o /home/eh37w/lsf_jobs/LSB_%J.log \
    #-n 2 -R select[ib] -R span[hosts=1] -R rusage[mem=4000] -R select[rh=8] -N -u erica.hildebrand@umassmed.edu \
    #"cooler merge $combinedfile $cond1Sampled $cond2Sampled"

In [5]:
#Sanity check: the first six entries of `conditions` are the R1 replicates
conditions[0:6]

['AS_DMSO_R1',
 'AS_ICRF_R1',
 'MR_t4DMSO_R1',
 'MR_t4ICRF_R1',
 'MR_t8DMSO_R1',
 'MR_t8ICRF_R1']

In [6]:
#Sanity check: entries 6-11 of `conditions` are the R2 replicates, in the same order as R1
conditions[6:12]

['AS_DMSO_R2',
 'AS_ICRF_R2',
 'MR_t4DMSO_R2',
 'MR_t4ICRF_R2',
 'MR_t8DMSO_R2',
 'MR_t8ICRF_R2']

In [7]:
#Sanity check: the combined (R1R2) conditions, in the same order as the replicate lists
ComboConds

['AS_DMSO_R1R2',
 'AS_ICRF_R1R2',
 'MR_t4DMSO_R1R2',
 'MR_t4ICRF_R1R2',
 'MR_t8DMSO_R1R2',
 'MR_t8ICRF_R1R2']

In [17]:
#Zoomify and balance sampled cooler files
#One cluster job per condition (separate replicates and combined sets): reads the sampled
#1000bp .cool file and writes a balanced multi-resolution .mcool next to it
#`--resolutions 4DN` is a preset resolution list - see the cooler zoomify docs
#NOTE: the combined R1R2 .cool files must already exist (replicate merge step) before submitting
for cond in conditions:
    coolfile = sampledPaths[cond]
    mcoolfile = f'{outDataDir}/data/{long_names[cond]}.sampled.hg38.mapq_30.1000.mcool'
    !bsub -q short -W 04:00 -e /home/eh37w/lsf_jobs/LSB_%J.err -o /home/eh37w/lsf_jobs/LSB_%J.log \
        -n 8 -R select[ib] -R span[hosts=1] -R rusage[mem=4000] -R select[rh=8] -N -u erica.hildebrand@umassmed.edu \
        "cooler zoomify -p 8 --balance --resolutions 4DN $coolfile -o $mcoolfile"

Job <2011322> is submitted to queue <short>.
Job <2011323> is submitted to queue <short>.
Job <2011324> is submitted to queue <short>.
Job <2011325> is submitted to queue <short>.
Job <2011326> is submitted to queue <short>.
Job <2011327> is submitted to queue <short>.
Job <2011328> is submitted to queue <short>.
Job <2011329> is submitted to queue <short>.
Job <2011330> is submitted to queue <short>.
Job <2011331> is submitted to queue <short>.
Job <2011332> is submitted to queue <short>.
Job <2011333> is submitted to queue <short>.
Job <2011334> is submitted to queue <short>.
Job <2011335> is submitted to queue <short>.
Job <2011336> is submitted to queue <short>.
Job <2011337> is submitted to queue <short>.
Job <2011338> is submitted to queue <short>.
Job <2011339> is submitted to queue <short>.
