# Cooler - Manage Hi-C Datasets

In [None]:
import os
import cooler
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt

# Project root on the shared filesystem (absolute, site-specific path).
project_dir = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/"

# Location handed to cooler: the .mcool file under project_dir followed by
# "::resolutions/10000".
# NOTE(review): presumably the "::" suffix selects the 10 kb resolution stored
# inside the multi-resolution file — confirm against the cooler URI docs.
file_path = os.path.join(project_dir, "data/Hi-C_data/4DNFITHTURR9.mcool::resolutions/10000")

# Handle that the cells below use to query the dataset.
clr = cooler.Cooler(file_path)

In [None]:
clr.info

We can get a view of the chromosome table as a pandas DataFrame

In [None]:
clr.chroms()[1:5]

We can also access the chromosome names

In [None]:
clr.chromnames

In the bin table, the **weight** column contains the *matrix balancing weights* computed for each genomic bin.

In [None]:
clr.chromsizes

In [None]:
clr.bins()[:10]

The pixel table contains the non-zero upper triangle entries of the contact map.

In [None]:
clr.pixels()[:10]

Use the `join=True` option if you would like to expand the bin IDs into genomic bin coordinates by joining the output with the bin table.

In [None]:
clr.pixels(join=True)[:10]

The `matrix` method provides a 2D-sliceable view of the data. It allows you to query the data on file as a full rectangular contact matrix.

In [None]:
clr.matrix(balance=False)[1000:1200, 1000:1200]

Use `sparse=True` to return `scipy.sparse.coo_matrix` objects instead.

In [None]:
mat = clr.matrix(balance=False, sparse=True)[1000:1200, 1000:1200]
mat

It is straightforward to convert to a dense 2D numpy array.

In [None]:
arr = mat.toarray()
arr

Notice that the lower triangle has been automatically filled in.

In [None]:
# Heatmap of the log10-scaled raw counts held in `arr`.
fig, ax = plt.subplots(figsize=(10, 10))
# Zero entries become -inf under log10 and trigger a divide-by-zero
# RuntimeWarning. Silence only that warning, and only for this call;
# the values handed to matshow are unchanged.
with np.errstate(divide='ignore'):
    im = ax.matshow(np.log10(arr), cmap='YlOrRd')
fig.colorbar(im)

Notice the light and dark "banded" appearance? That's because you are looking at the unnormalized counts.

### Balancing your selection

We usually normalize or "correct" Hi-C using a technique called matrix balancing. This involves finding a set of weights or biases $b_{i}$ for each bin $i$ such that

$$\text{Normalized}[i,j] = \text{Observed}[i,j] \times b_{i} \times b_{j}$$

so that the marginals (i.e., row/column sums) of the global contact matrix are flat and equal.

Cooler can store the pre-computed balancing weights in the bin table.

Here's one way to manually apply them to balance your selection.

In [None]:
# Balancing weights for bins 1000-1199: take the 'weight' column of the
# bin table, slice the window, and unwrap it to a plain numpy array.
weights = clr.bins()['weight']
bias = weights[1000:1200].values

# Unbalanced counts for the same bin window, kept in sparse form
mat = clr.matrix(balance=False, sparse=True)[1000:1200, 1000:1200]

# Scale each stored entry by the weight of its row bin and of its column bin.
# `bias` only covers the window, so it is indexed by mat.row / mat.col.
mat.data = (bias[mat.row] * bias[mat.col]) * mat.data

# Dense numpy array of the balanced window
arr = mat.toarray()

As a shortcut, we get the same result by passing `balance=True` to the matrix view constructor.

In [None]:
# Same bin window, but with balance=True so cooler applies the weights itself
arr2 = clr.matrix(balance=True, sparse=True)[1000:1200, 1000:1200]
arr2 = arr2.toarray()
# Compare with the manually balanced `arr`; equal_nan=True makes NaNs in
# matching positions count as equal.
np.allclose(arr, arr2, equal_nan=True)

In [None]:
# Heatmap of the log10-scaled balanced matrix held in `arr`.
fig, ax = plt.subplots(figsize=(10, 10))
# Zero entries become -inf under log10 and trigger a divide-by-zero
# RuntimeWarning. Silence only that warning, and only for this call;
# the values handed to matshow are unchanged.
with np.errstate(divide='ignore'):
    im = ax.matshow(np.log10(arr), cmap='YlOrRd')
fig.colorbar(im)

### Genomic coordinate range selection

The bin table, pixel table, and matrix views also accept UCSC-style genomic range strings or (chrom, start, end) triplets.

In [None]:
clr.bins().fetch('chr2:10,000,000-20,000,000')

In [None]:
# Fetch the matrix for the single region 'chr19' in sparse form and show its shape.
# NOTE(review): `balance` is not passed here, so the library default applies —
# confirm whether balanced or raw values are intended.
cis = clr.matrix(sparse=True).fetch('chr19')
cis.shape

In [None]:
# Fetch the matrix for a pair of regions in sparse form and show its shape.
# NOTE(review): presumably rows correspond to 'chr18' and columns to 'chr19' —
# confirm against the cooler docs. `balance` is not passed, so the library
# default applies.
trans = clr.matrix(sparse=True).fetch('chr18', 'chr19')
trans.shape