In [8]:
import numpy as np
import pandas as pd
from alabtools.analysis import HssFile
from matplotlib import pyplot as plt

In [4]:
# Read in HSS file
hss = HssFile('igm-model_mcrb_2.5MB.hss', 'r')
print(type(hss))

<class 'alabtools.analysis.HssFile'>


In [None]:
# The HSS has some attributes, that are essentially integers or strings
print(hss.nstruct)  # number of structures in the model
print(hss.nbead)  # number of beads in the model

In [3]:
coords = hss.coordinates #read coordinates of all structures
print(type(coords))
print(coords.shape)  # ndomain, nstruct, 3

<class 'numpy.ndarray'>
(2094, 100, 3)


In [4]:
coords_1 = coords[:, 0, :]  # coordinates of struct 1
print(type(coords_1))
print(coords_1.shape)

<class 'numpy.ndarray'>
(2094, 3)


In [5]:
# Compute distance of bead 1 to center
ctr = np.array([0., 0., 0.])
bead1_coords = coords_1[0, :]  # coordinates of bead 1
dst_bead1_ctr = np.sqrt((bead1_coords[0] - ctr[0])**2 + (bead1_coords[1] - ctr[1])**2 + (bead1_coords[2] - ctr[2])**2)
print("Distance of bead 1 to center: {:.2f} nm".format(dst_bead1_ctr))

Distance of bead 1 to center: 2294.61 nm


# Genome data
The first dataset in the HSS file is the genome.

It gives you information regarding the species, the assembly, the chromosomes, their lengths.

It's a very basic information, so we don't use it much.

In [4]:
# Load the genome
genome = hss.genome

# Print the genome assembly
genome_assembly= genome.assembly

# The assembly is just a string
print(type(genome_assembly))
print(genome_assembly)

mm10


In [7]:
# The genome contains the list of chromosomes present in the model
genome_chroms = genome.chroms
print(type(genome_chroms))
print(genome_chroms.shape)
print(genome_chroms)

<class 'numpy.ndarray'>
(21,)
['chr1' 'chr2' 'chr3' 'chr4' 'chr5' 'chr6' 'chr7' 'chr8' 'chr9' 'chr10'
 'chr11' 'chr12' 'chr13' 'chr14' 'chr15' 'chr16' 'chr17' 'chr18' 'chr19'
 'chrX' 'chrY']


In [8]:
# The genome.lengths array contains the length of each chromosome
genome_lengths = genome.lengths
print(type(genome_lengths))
print(genome_lengths.shape)
print(genome_lengths)
print("length of chromosome 1: {} bp".format(genome_lengths[0]))

<class 'numpy.ndarray'>
(21,)
[195471971 182113224 160039680 156508116 151834684 149736546 145441459
 129401213 124595110 130694993 122082543 120129022 120421639 124902244
 104043685  98207768  94987271  90702639  61431566 171031299  91744698]
length of chromosome 1: 195471971 bp


In [8]:
# If you are unsure about the contents of a class in python, you can always print the __dict__ attribute
# It is a dictionary containing all attributes (and datasets) of the class

print(hss.genome.__dict__)
print(hss.genome.keys())

{'chroms': array(['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
       'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
       'chr16', 'chr17', 'chr18', 'chr19', 'chrX', 'chrY'], dtype='<U10'), 'origins': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32), 'lengths': array([195471971, 182113224, 160039680, 156508116, 151834684, 149736546,
       145441459, 129401213, 124595110, 130694993, 122082543, 120129022,
       120421639, 124902244, 104043685,  98207768,  94987271,  90702639,
        61431566, 171031299,  91744698], dtype=int32), 'assembly': 'mm10'}


# Index data
The second dataset in the HSS file is the index.

It gives you the segmentation of the genome into beads, so it's essential to map the coordinates to the actual genomic regions they are associated with.

In [5]:
# Load the index
index = hss.index

print(type(index))

print(index.__dict__.keys())  # as you can see, it's much more complicated than the genome!

<class 'alabtools.utils.Index'>
dict_keys(['custom_tracks', 'chrom_sizes', 'genome', 'chromstr', 'chrom', 'start', 'end', 'copy', 'label', 'copy_index', 'num_chrom', 'offset', 'refs', 'loctree', '_map_chrom_id', '_map_id_chrom'])


In [6]:
# For now we can only focus on four arrays: chromstr, start, end, and copy
chromstr = index.chromstr
start = index.start
end = index.end
copy = index.copy

print(type(chromstr))
print(chromstr.shape)
print(chromstr)

<class 'numpy.ndarray'>
(2094,)
['chr1' 'chr1' 'chr1' ... 'chr19' 'chr19' 'chr19']


The chromstr array contains again the chromosome names, but they are repeated. It represents the genome segmentation: all elements with 'chr1' are beads that belong to the first chromosome, all elements with 'chr2' are beads that belong to the second chromosome, etc.

In [16]:
# We can visualize the full genome segmentation using the zip command
for chrom, start, end, copy in zip(hss.index.chromstr, hss.index.start, hss.index.end, hss.index.copy):
    print(chrom, start, end, copy)

chr1 0 2500000 0
chr1 2500000 5000000 0
chr1 5000000 7500000 0
chr1 7500000 10000000 0
chr1 10000000 12500000 0
chr1 12500000 15000000 0
chr1 15000000 17500000 0
chr1 17500000 20000000 0
chr1 20000000 22500000 0
chr1 22500000 25000000 0
chr1 25000000 27500000 0
chr1 27500000 30000000 0
chr1 30000000 32500000 0
chr1 32500000 35000000 0
chr1 35000000 37500000 0
chr1 37500000 40000000 0
chr1 40000000 42500000 0
chr1 42500000 45000000 0
chr1 45000000 47500000 0
chr1 47500000 50000000 0
chr1 50000000 52500000 0
chr1 52500000 55000000 0
chr1 55000000 57500000 0
chr1 57500000 60000000 0
chr1 60000000 62500000 0
chr1 62500000 65000000 0
chr1 65000000 67500000 0
chr1 67500000 70000000 0
chr1 70000000 72500000 0
chr1 72500000 75000000 0
chr1 75000000 77500000 0
chr1 77500000 80000000 0
chr1 80000000 82500000 0
chr1 82500000 85000000 0
chr1 85000000 87500000 0
chr1 87500000 90000000 0
chr1 90000000 92500000 0
chr1 92500000 95000000 0
chr1 95000000 97500000 0
chr1 97500000 100000000 0
chr1 1000000

In [7]:
# Simple check: let's check that the resolution of all beads is the same
resolution = hss.index.end - hss.index.start
for res in resolution:
    print(res)
# Let's use some numpy operations to check if all resolutions are the same
is_resolution_constant = np.all(resolution == resolution[0])  # this will return True if all resolutions are the same
print("Is resolution constant? {}".format(is_resolution_constant))

2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000
2500000


# Coordinates data
The third dataset in the HSS file is the coordinates.

This is the core of our lab: it contains the coordinates of the beads in each structure.

Remember that you always need the index to map the coordinates to the actual genomic regions they are associated with!

In [9]:
print(type(hss.coordinates))
print(hss.coordinates.shape)

# coordinates is a 3D array with dimension (nbead, nstruct, 3):
#       1) the first dimension spans through all beads of the model
#       2) the second dimension spans through all structures
#       3) the third dimension spans through the x, y, and z coordinates

<class 'numpy.ndarray'>
(2094, 100, 3)


In [None]:
# Simple exercise: isolate the data for a particular bead, i.e. bead 6

bead_coord = hss.coordinates[5, :, :]
print(type(bead_coord))
print(bead_coord.shape)  # it has dimension (nstruct, 3): that's because we are looking at a single bead
print('\n')

# As we said, without the index object we don't know where the bead is located in the genome
# So let's find out!
print(hss.index.chromstr[5], hss.index.start[5], hss.index.end[5], hss.index.copy[5])
print('\n')

# Let's isolate the z coordinates of bead 6 in all structures
bead_zcoord = bead_coord[:, 2]  # it has dimension (nstruct, ): that's because we are looking at a single coordinate
print(type(bead_zcoord))
print(bead_zcoord.shape)

# Let's print the z coordinates of bead 6 in some structures
print('z coordinates of bead 6 in struct 1: {} nm'.format(bead_zcoord[0]))
print('z coordinates of bead 6 in struct 2: {} nm'.format(bead_zcoord[1]))
print('z coordinates of bead 6 in struct 3: {} nm'.format(bead_zcoord[2]))

# Let's plot the z coordinates of bead 6 in all structures as a histogram
plt.hist(bead_zcoord, bins=10)
plt.savefig('bead6_zcoord.png')
plt.close()

## Built-in functions in HSS files
We can use some built-in functions to get information about the HSS file.

Some of them are not adding much to the methods we have used so far, but others can really speed up our work.

In [10]:
# The get_bead_crd() method allows to extract the coordinates of a list of beads
bead_crd = hss.get_bead_crd(0)  # get coordinates of bead 1

print(type(bead_crd))
print(bead_crd.shape)  # it has dimension (nstruct, 3): that's because we are looking at a single bead

# We already saw how to extract the coordinates of a single bead, so let's check that the two methods are equivalent
bead_crd_otherway = hss.coordinates[0, :, :]

are_methods_the_same = np.array_equal(bead_crd, bead_crd_otherway)  # this will return True if the two methods are equivalent
print("Are the two methods equivalent? {}".format(are_methods_the_same))

<class 'numpy.ndarray'>
(100, 3)
Are the two methods equivalent? True


get_struct_crd(models): <br>Gets struct ids as an input and prints out coordinates for all beads in the structure as an output. The length of the array should be equal to nbead.


In [None]:
# The get_bead_crd() method allows more flexibility if we want to extract data for multiple beads

# Let's say that we are interested in beads 1, 100, 712, 994
beads = [0, 99, 711, 993]  # it's a simple python list
beads_crd = hss.get_bead_crd(beads)
print(type(beads_crd))
print(beads_crd.shape)  # it has dimension (4, nstruct, 3): that's because we are looking at 4 beads

# Take the Index object (chromstr, start, end) for the beads we are interested in
beads_chromstr = hss.index.chromstr[beads]
beads_start = hss.index.start[beads]
beads_end = hss.index.end[beads]
beads_copy = hss.index.copy[beads]
for bead in beads:
    print(bead, hss.index.chromstr[bead], hss.index.start[bead], hss.index.end[bead], hss.index.copy[bead])

In [11]:
# The get_struct_crd() method allows to extract the coordinates of a list of structures
struct_crd = hss.get_struct_crd(0)
print(type(struct_crd))
print(struct_crd.shape)  # it has dimension (nbead, 3): that's because we are looking at a single structure

# Again, we saw how to extract the coordinates of a single structure, so let's check that the two methods are equivalent
struct_crd_otherway = hss.coordinates[:, 0, :]
print(np.array_equal(struct_crd, struct_crd_otherway))

<class 'numpy.ndarray'>
(2094, 3)
True


In [None]:
# And again, the get_struct_crd() method allows more flexibility if we want to extract data for multiple structures
# Let's say that we are interested in structures 2, 7, 19
structs = [1, 6, 18]
structs_crd = hss.get_struct_crd(structs)
print(type(structs_crd))
print(structs_crd.shape)

In [12]:
# A very useful function allows us to extract the radial position of a list of beads
# The radial position is the distance of the bead from the center of the nucleus, and it's computed as:
#   r = sqrt(x^2 + y^2 + z^2)  (the center has coordinates (0, 0, 0))

# Again, we have to use as input a list of beads
beads = [0, 99, 711, 993]

# We have to give as input also the radius of the nucleus
# If the nucleus is a sphere, we can give a single value
# If the nucleus is an ellipsoid, we have to give three values, one for each axis
# For these models, the nucleus is an ellipsoid with semi-axes (3050, 2350, 2350) nm
radials = hss.getBeadRadialPositions(beads, nucleusRadius=(3050, 2350, 2350))
print(type(radials))
print(radials.shape)  # it has dimension (4, nstruct): that's because we are looking at 4 bead
                      # and we have computed the radial value, so we also lose the x,y,z indices

<class 'numpy.ndarray'>
(4, 100)


In [14]:
# If we want to take the radial profile for all beads, we can use np.arange()
beads = np.arange(hss.nbead)
print(beads)
radials = hss.getBeadRadialPositions(beads)
print(type(radials))
print(radials.shape)

# Plot radial positions of all beads in struct 1
plt.plot(radials[:, 0])
plt.savefig('radial_pos_struct1.png')
plt.close()

[   0    1    2 ... 2091 2092 2093]
<class 'numpy.ndarray'>
(2094, 100)


In [53]:
# Let's compute the average radial profile, by averaging over all structures
# we can use the numpy mean function, which allows to average over a specific axis
# we specify axis=1, because we want to average over the second axis (the structures)
avg_radials = np.mean(radials, axis=1)
print(type(avg_radials))
print(avg_radials.shape)

# Plot average radial profile
plt.plot(avg_radials)
plt.savefig('avg_radial_pos.png')
plt.close()


<class 'numpy.ndarray'>
(2094,)


In [56]:
# Let's visualize the average radial profile for chr1
hss.index.chromstr == 'chr1'

avg_radials_chr1 = avg_radials[hss.index.chromstr == 'chr1']
print(type(avg_radials_chr1))
print(avg_radials_chr1.shape)

plt.plot(avg_radials_chr1)
plt.savefig('avg_radial_pos_chr1.png')
plt.close()

<class 'numpy.ndarray'>
(158,)


# Exercise
The previous code displayed the radial profile of both copies (0 and 1) one alongside the other. This is true also when we displayed the average radial profile of chr1.

Can you modify the code so that it performs an average over the two copies?

Try it first for chr1 and then for the whole genome.

Hint1: use the hss.index.copy array.

Hint2: the number of beads of the two copies - when you study chr1 - are the same, but when you look at the entire genome they are different (chrX and chrY are only present in copy 0).

getChromBeadRadialPosition(chrnum, nucleusRadius=5000): <br>Gets the chrom id (0-indexed) as an input and prints out the radial positions of beads in that chromosome for each structure.


In [20]:
hss.getChromBeadRadialPosition(0)

array([[0.45892224, 0.27499694, 0.44485614, ..., 0.5090492 , 0.5389437 ,
        0.4549859 ],
       [0.41760775, 0.16299318, 0.4394453 , ..., 0.4108551 , 0.424862  ,
        0.440687  ],
       [0.4223119 , 0.17060232, 0.3592254 , ..., 0.32171032, 0.476618  ,
        0.39111   ],
       ...,
       [0.38942602, 0.12283513, 0.15309018, ..., 0.27479854, 0.35586652,
        0.31527287],
       [0.35884947, 0.09306116, 0.13190234, ..., 0.3186278 , 0.3956942 ,
        0.41596237],
       [0.34170452, 0.15958302, 0.20046648, ..., 0.2690601 , 0.45816317,
        0.4397109 ]], dtype=float32)

A collaborative lab has reasons to believe that chr2 and chr4 have these structural differences:
1. On average chr2 is more internal than chr4, but
2. chr2 has some "outer regions" that are much more external than chr4.
How would you test these hypotheses?