In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import multiprocess as mp
import numpy as np
import pandas as pd
import bioframe
import cooltools
import cooler
import re
import bbi
from pybedtools import bedtool #didn't actually use
import pybedtools #didn't actually use
#new python library - probably need to add to your conda env
import pyranges

In [2]:
#exploratory cell: show the signature of PyRanges.intersect (defines nothing used later in the cells shown here)
pyranges.PyRanges.intersect

<function pyranges.pyranges.PyRanges.intersect(self, other, **kwargs)>

In [3]:
#hg19 chromosome sizes, fetched once; `hg19` and `chromsizes` hold the same values
#(previously the same table was fetched twice, once per name)
hg19 = bioframe.fetch_chromsizes('hg19')
#independent copy so that modifying one name never affects the other
chromsizes = hg19.copy()
#chromosome names, in the order returned by bioframe
chromosomes = list(chromsizes.index)

In [4]:
#input locations: compartment eigenvector files and insulation boundary calls
compDataDir = "/nl/umw_job_dekker/users/eh37w/Topo-Inhib/compartments_cooler"
boundaryDir = '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/insulation_cooler/data'

#bin sizes (bp) used for the insulation and the compartment analysis
binsize_ins = 40000
binsize_comp = 200000

#short condition label -> library name used in the data file names
long_names = dict(
    Async_R1 = 'TI-HiC-Dpn-HeLa-Async-Ctrl-1-30-R1-T1',
    Async_R2 = 'TI-HiC-Dpn-HeLa-Async-Ctrl-1-36-R2-T1',
)

#conditions to process, in the same order as the labels above
conditions = list(long_names)


In [5]:
#insulation boundaries called on the combined Async data (R1/R2 consensus file)
#window_bp is only used here to pick the file; the file name also encodes the bin size in kb
window_bp = 480000

boundaryFile = '{}/Async_R1R2Consensus_TADBoundaries.{}kb.peaks_{}.tsv'.format(boundaryDir, binsize_ins//1000, window_bp)
AsyncBounds = pd.read_csv(boundaryFile, sep = '\t')

In [6]:
#load the cis eigenvector table (gene-density version, ".gd." files) of each Async replicate
#eigs maps condition label -> DataFrame read from the tab-separated file
eigs = {}
for cond in conditions:
    eigFile = '{}/data/{}.{}kb.gd.eigs.cis.vecs.txt'.format(compDataDir, long_names[cond], binsize_comp//1000)
    eigs[cond] = pd.read_csv(eigFile, sep='\t')

In [7]:
#calling A vs B compartments in each sample
#A = bins with E1 > 0, B = bins with E1 < 0; bins whose E1 is 0 or NaN end up in neither table
eigsA = {}
eigsB = {}
for cond in conditions:
    vecs = eigs[cond]
    e1 = vecs['E1']
    eigsA[cond] = vecs[e1 > 0]
    eigsB[cond] = vecs[e1 < 0]

In [40]:
#calling compartment boundaries in each sample
#A boundary is a pair of consecutive rows (bin1 = row i, bin2 = row i+1) of the eigenvector
#table that are on the same chromosome and whose E1 values have opposite signs (product < 0).
#Pairs where either E1 is NaN or 0 give a product that is not < 0, so they are never called.
#signsAll[cond]   : one row per consecutive bin pair, with both bins' coordinates, E1 and the product
#compBounds[cond] : the subset of signsAll[cond] that qualifies as a compartment boundary

compBounds = {}
signsAll = {}

for cond in conditions:
    vecs = eigs[cond]
    #positional neighbours: bin1 = all rows but the last, bin2 = all rows but the first;
    #both are re-indexed from 0 so that row i of bin1 lines up with row i of bin2
    bin1 = vecs.iloc[:-1].reset_index(drop = True)
    bin2 = vecs.iloc[1:].reset_index(drop = True)

    signsAll[cond] = pd.DataFrame(data = {
        'Chrom1' : bin1['chrom'],
        'Bin1Start' : bin1['start'],
        'Bin1End' : bin1['end'],
        'Bin1E1' : bin1['E1'],
        'Chrom2' : bin2['chrom'],
        'Bin2Start' : bin2['start'],
        'Bin2End' : bin2['end'],
        'Bin2E1' : bin2['E1'],
        'sign' : bin1['E1'] * bin2['E1']
        })
    #NOTE: a former `signsAll[cond].apply(pd.to_numeric, errors='ignore')` call was removed here:
    #its return value was discarded, so it never modified the table, and errors='ignore' is deprecated in pandas
    signsAll[cond] = signsAll[cond].astype({"Bin1Start": int, "Bin1End": int, "Bin2Start": int, "Bin2End": int})

    #keep sign flips only; the chromosome check drops the pair that straddles two chromosomes
    signFlip = signsAll[cond]['sign'] < 0
    sameChrom = signsAll[cond]['Chrom1'] == signsAll[cond]['Chrom2']
    compBounds[cond] = signsAll[cond][signFlip & sameChrom]

In [55]:
#reshape each replicate's compartment boundaries into a BED6-like table
#(Chromosome/Start/End/Name/Score/Strand) for PyRanges;
#each interval runs from the start of bin1 to the end of bin2, Score is the E1 product
compBoundsRanges = {}
for cond in conditions:
    bounds = compBounds[cond]
    bedColumns = {
        'Chromosome' : bounds['Chrom1'],
        'Start' : bounds['Bin1Start'],
        'End' : bounds['Bin2End'],
        'Name' : '.',
        'Score' : bounds['sign'],
        'Strand' : '+'
    }
    compBoundsRanges[cond] = pd.DataFrame(data = bedColumns)

In [56]:
#Goal of the following cells: combine R1 and R2 compartment boundaries (union and intersect),
#since we want all potential compartment boundaries and all TAD-only boundaries.
#This cell only wraps each replicate's boundary table in a PyRanges object.
cbPyRanges = {}
for cond in conditions:
    boundsDF = compBoundsRanges[cond]
    cbPyRanges[cond] = pyranges.PyRanges(boundsDF)

In [57]:
#display the compartment-boundary PyRanges of replicate Async_R1
cbPyRanges['Async_R1']

+--------------+-----------+-----------+------------+------------------------+----------+
| Chromosome   | Start     | End       | Name       | Score                  | Strand   |
| (int8)       | (int32)   | (int32)   | (object)   | (float64)              | (int8)   |
|--------------+-----------+-----------+------------+------------------------+----------|
| chr1         | 5800000   | 6200000   | .          | -0.15167043751209502   | +        |
| chr1         | 12200000  | 12600000  | .          | -0.004674341648969561  | +        |
| chr1         | 15400000  | 15800000  | .          | -0.0019638650922139773 | +        |
| ...          | ...       | ...       | ...        | ...                    | ...      |
| chrX         | 146600000 | 147000000 | .          | -0.05526781380954693   | +        |
| chrX         | 146800000 | 147200000 | .          | -0.03971167319250798   | +        |
| chrX         | 148200000 | 148600000 | .          | -0.007573588529989534  | +        |
+---------

In [58]:
#display the compartment-boundary PyRanges of replicate Async_R2
cbPyRanges['Async_R2']

+--------------+-----------+-----------+------------+-----------------------+----------+
| Chromosome   | Start     | End       | Name       | Score                 | Strand   |
| (int8)       | (int32)   | (int32)   | (object)   | (float64)             | (int8)   |
|--------------+-----------+-----------+------------+-----------------------+----------|
| chr1         | 5800000   | 6200000   | .          | -0.09365252747388518  | +        |
| chr1         | 12400000  | 12800000  | .          | -0.026303802271402707 | +        |
| chr1         | 15400000  | 15800000  | .          | -0.024962326012173724 | +        |
| ...          | ...       | ...       | ...        | ...                   | ...      |
| chrX         | 138800000 | 139200000 | .          | -0.04316394980118141  | +        |
| chrX         | 142600000 | 143000000 | .          | -0.016177262345875968 | +        |
| chrX         | 148200000 | 148600000 | .          | -0.027113844624152998 | +        |
+--------------+-----

In [59]:
#exploratory check: Async_R1 compartment boundaries returned by no_overlap against Async_R2
#(presumably the R1 intervals with no overlapping R2 interval - confirm against the pyranges docs)
x = cbPyRanges['Async_R1'].no_overlap(cbPyRanges['Async_R2'])
print(x)

+--------------+-----------+-----------+------------+-----------------------+----------+
| Chromosome   | Start     | End       | Name       | Score                 | Strand   |
| (int8)       | (int32)   | (int32)   | (object)   | (float64)             | (int8)   |
|--------------+-----------+-----------+------------+-----------------------+----------|
| chr1         | 59400000  | 59800000  | .          | -0.006143678773713582 | +        |
| chr1         | 65400000  | 65800000  | .          | -0.007079082977595713 | +        |
| chr1         | 65600000  | 66000000  | .          | -0.01058980936266671  | +        |
| ...          | ...       | ...       | ...        | ...                   | ...      |
| chrX         | 110800000 | 111200000 | .          | -0.02509479520420424  | +        |
| chrX         | 146600000 | 147000000 | .          | -0.05526781380954693  | +        |
| chrX         | 146800000 | 147200000 | .          | -0.03971167319250798  | +        |
+--------------+-----

In [70]:
#pool the compartment boundaries of both replicates into one PyRanges (R1 concatenated with R2)
#NOTE(review): the intervals look appended rather than merged, so a boundary called in both replicates would appear twice - confirm
cbCat = cbPyRanges['Async_R1'].concat(cbPyRanges['Async_R2'])
cbCat

+--------------+-----------+-----------+------------+------------------------+----------+
| Chromosome   | Start     | End       | Name       | Score                  | Strand   |
| (int8)       | (int32)   | (int32)   | (object)   | (float64)              | (int8)   |
|--------------+-----------+-----------+------------+------------------------+----------|
| chr1         | 5800000   | 6200000   | .          | -0.15167043751209502   | +        |
| chr1         | 12200000  | 12600000  | .          | -0.004674341648969561  | +        |
| chr1         | 15400000  | 15800000  | .          | -0.0019638650922139773 | +        |
| ...          | ...       | ...       | ...        | ...                    | ...      |
| chrX         | 138800000 | 139200000 | .          | -0.04316394980118141   | +        |
| chrX         | 142600000 | 143000000 | .          | -0.016177262345875968  | +        |
| chrX         | 148200000 | 148600000 | .          | -0.027113844624152998  | +        |
+---------

In [64]:
#Overlap = compartment boundaries
#first make PyRanges from AsyncBounds: same BED6-like layout as the compartment boundaries,
#with '.' placeholders for Name and Score and '+' for Strand
insDF = pd.DataFrame(data = dict(
    Chromosome = AsyncBounds['Chromosome'],
    Start = AsyncBounds['Start'],
    End = AsyncBounds['End'],
    Name = '.',
    Score = '.',
    Strand = '+',
))

insRanges = pyranges.PyRanges(insDF)

insRanges

+--------------+-----------+-----------+------------+------------+----------+
| Chromosome   | Start     | End       | Name       | Score      | Strand   |
| (int8)       | (int32)   | (int32)   | (object)   | (object)   | (int8)   |
|--------------+-----------+-----------+------------+------------+----------|
| chr1         | 1000000   | 1040000   | .          | .          | +        |
| chr1         | 1280000   | 1320000   | .          | .          | +        |
| chr1         | 1400000   | 1440000   | .          | .          | +        |
| ...          | ...       | ...       | ...        | ...        | ...      |
| chrX         | 154080000 | 154120000 | .          | .          | +        |
| chrX         | 154400000 | 154440000 | .          | .          | +        |
| chrX         | 154480000 | 154520000 | .          | .          | +        |
+--------------+-----------+-----------+------------+------------+----------+
PyRanges object has 6974 sequences from 23 chromosomes.

In [72]:
#insulation boundaries that overlap a pooled (R1 + R2) compartment boundary
#the displayed result contains repeated rows, so it still has to be reduced to unique rows before use
compInsulationBounds = insRanges.overlap(cbCat)
compInsulationBounds

+--------------+-----------+-----------+------------+------------+----------+
| Chromosome   | Start     | End       | Name       | Score      | Strand   |
| (int8)       | (int32)   | (int32)   | (object)   | (object)   | (int8)   |
|--------------+-----------+-----------+------------+------------+----------|
| chr1         | 6040000   | 6080000   | .          | .          | +        |
| chr1         | 6040000   | 6080000   | .          | .          | +        |
| chr1         | 12280000  | 12320000  | .          | .          | +        |
| ...          | ...       | ...       | ...        | ...        | ...      |
| chrX         | 148320000 | 148360000 | .          | .          | +        |
| chrX         | 148520000 | 148560000 | .          | .          | +        |
| chrX         | 148520000 | 148560000 | .          | .          | +        |
+--------------+-----------+-----------+------------+------------+----------+
PyRanges object has 3372 sequences from 23 chromosomes.

In [89]:
#the overlap result lists some insulation boundaries more than once: keep one copy of each row
overlapDF = compInsulationBounds.df
compInsulationBoundsUnique = overlapDF.drop_duplicates()

#number of distinct insulation boundaries at compartment boundaries, then a preview
print(compInsulationBoundsUnique.shape[0])
compInsulationBoundsUnique.head()

1893


Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,6040000,6080000,.,.,+
2,chr1,12280000,12320000,.,.,+
3,chr1,12720000,12760000,.,.,+
4,chr1,15400000,15440000,.,.,+
6,chr1,18480000,18520000,.,.,+


In [97]:
#non overlapping insulation = TAD boundaries
#i.e. the insulation boundaries returned by no_overlap against the pooled compartment boundaries (cbCat)
tadInsulationBounds = insRanges.no_overlap(cbCat)
print(tadInsulationBounds)

+--------------+-----------+-----------+------------+------------+----------+
| Chromosome   | Start     | End       | Name       | Score      | Strand   |
| (int8)       | (int32)   | (int32)   | (object)   | (object)   | (int8)   |
|--------------+-----------+-----------+------------+------------+----------|
| chr1         | 1000000   | 1040000   | .          | .          | +        |
| chr1         | 1280000   | 1320000   | .          | .          | +        |
| chr1         | 1400000   | 1440000   | .          | .          | +        |
| ...          | ...       | ...       | ...        | ...        | ...      |
| chrX         | 154080000 | 154120000 | .          | .          | +        |
| chrX         | 154400000 | 154440000 | .          | .          | +        |
| chrX         | 154480000 | 154520000 | .          | .          | +        |
+--------------+-----------+-----------+------------+------------+----------+
PyRanges object has 5081 sequences from 23 chromosomes.


In [90]:
#same de-duplication as for the compartment-overlapping set, applied to the TAD-only boundaries
tadDF = tadInsulationBounds.df
tadInsulationBoundsUnique = tadDF.drop_duplicates()

#number of distinct TAD-only insulation boundaries, then a preview
print(tadInsulationBoundsUnique.shape[0])
tadInsulationBoundsUnique.head()

5081


Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,1000000,1040000,.,.,+
1,chr1,1280000,1320000,.,.,+
2,chr1,1400000,1440000,.,.,+
3,chr1,1480000,1520000,.,.,+
4,chr1,1720000,1760000,.,.,+


In [103]:
#save the comp vs tad insulation boundaries
#both tables go to boundaryDir as tab-separated text without the DataFrame index
for frame, fileTemplate in (
    (compInsulationBoundsUnique, '{}/compartmentOverlap_insulation_bounds_AsyncR1R2.txt'),
    (tadInsulationBoundsUnique, '{}/TADonly_insulation_bounds_AsyncR1R2.txt'),
):
    frame.to_csv(fileTemplate.format(boundaryDir), sep = "\t", index = False)