In [1]:
import numpy as np
import pandas as pd
from alabtools.analysis import HssFile

In [2]:
# Open the structure population file with mode 'r'; the cells below pull
# coordinates, genome and index information out of this `hss` object.
hss_path = 'igm-model_mcrb_2.5MB.hss'
hss = HssFile(hss_path, 'r')

In [27]:
# Read the coordinates of all structures in the population.
coords = hss.coordinates
# Show the container type, then the shape, laid out as (ndomain, nstruct, 3).
print(type(coords), coords.shape, sep='\n')

<class 'numpy.ndarray'>
(2094, 100, 3)


In [28]:
# Coordinates of struct 1: take index 0 along the structure axis (axis 1),
# keeping every bead and all three spatial components.
coords_1 = coords[:, 0]
print(type(coords_1), coords_1.shape, sep='\n')

<class 'numpy.ndarray'>
(2094, 3)


In [29]:
# Compute the distance of bead 1 (first row of coords_1) to the center.
# The center is taken to be the coordinate origin.
ctr = np.array([0., 0., 0.])
bead1_coords = coords_1[0, :]  # coordinates of bead 1, one (x, y, z) triple
# Euclidean norm of the offset vector; this replaces the hand-expanded
# sqrt(dx**2 + dy**2 + dz**2) expression with the vectorised equivalent.
dst_bead1_ctr = np.linalg.norm(bead1_coords - ctr)
print("Distance of bead 1 to center: {:.2f} nm".format(dst_bead1_ctr))

Distance of bead 1 to center: 2294.61 nm


### Recap of the last tutorial

1. Can you read this file into an hss object: HCT116_Rao.test.hss?
2. Can you see 
    1. How many structures are stored in this hss object? 
    2. How many beads are in each structure?
3. Can you extract 
    1. The coordinates of all beads in the second structure? 
    2. The coordinates of the first bead in all structures?
    3. The coordinates of the first bead in the second structure?


In [4]:
# Explore the Genome class, starting with the name of the genome assembly.
genome = hss.genome
genome_assembly = genome.assembly
print(genome_assembly)

mm10


In [5]:
# Check the chromosome names stored on the genome object:
# print the container type, its shape, and then the names themselves.
genome_chroms = genome.chroms
print(type(genome_chroms), genome_chroms.shape, genome_chroms, sep='\n')

<class 'numpy.ndarray'>
(21,)
['chr1' 'chr2' 'chr3' 'chr4' 'chr5' 'chr6' 'chr7' 'chr8' 'chr9' 'chr10'
 'chr11' 'chr12' 'chr13' 'chr14' 'chr15' 'chr16' 'chr17' 'chr18' 'chr19'
 'chrX' 'chrY']


In [6]:
# Check the chromosome lengths: type, shape, and the full array of lengths.
genome_lengths = genome.lengths
print(type(genome_lengths), genome_lengths.shape, genome_lengths, sep='\n')
# The first entry is reported as the length of chromosome 1.
print("length of chromosome 1: {} bp".format(genome_lengths[0]))

<class 'numpy.ndarray'>
(21,)
[195471971 182113224 160039680 156508116 151834684 149736546 145441459
 129401213 124595110 130694993 122082543 120129022 120421639 124902244
 104043685  98207768  94987271  90702639  61431566 171031299  91744698]
length of chromosome 1: 195471971 bp


In [8]:
# Exercise: before looking, guess what genome.origins will look like.
genome_origins = genome.origins
# print(genome.origins)  # uncomment to check your guess

In [9]:
# Explore the Index class.
index = hss.index
# Question: what is the difference between index.chromstr and genome.chroms?
index_chroms = index.chromstr
print(type(index_chroms), index_chroms.shape, index_chroms, sep='\n')

<class 'numpy.ndarray'>
(2094,)
['chr1' 'chr1' 'chr1' ... 'chr19' 'chr19' 'chr19']


In [21]:
# Positions in index_chroms whose entry equals 'chr1'.
# Single-argument np.where(cond) is shorthand for np.nonzero(cond): both
# return a tuple holding one index array per dimension of the condition.
index_chr1 = np.nonzero(index_chroms == 'chr1')
# To inspect the result, display index_chr1 or index_chroms[index_chr1].

In [16]:
# View the start position of each bead (the end positions follow in the next cell).
index_start = index.start
print(type(index_start), index_start.shape, index_start, sep='\n')

<class 'numpy.ndarray'>
(2094,)
[       0  2500000  5000000 ... 55000000 57500000 60000000]


In [18]:
# View the end position of each bead.
index_end = index.end
print(type(index_end), index_end.shape, index_end, sep='\n')

<class 'numpy.ndarray'>
(2094,)
[ 2500000  5000000  7500000 ... 57500000 60000000 62500000]


In [20]:
resolution = index_end - index_start
#what will resolution look like?
print("Is resolution constant? {}".format(np.all(resolution == resolution[0])))

Is resolution constant? True


### Practice
How do you find the index of the bead with chr='chr1', start=2500000, end=5000000? (Hint: split it into three smaller problems first, then use np.intersect1d to find the intersection of two arrays.)

In [25]:
# Find the bead indices with chr == 'chr1', start == 2500000 and end == 5000000
# by solving the three sub-problems separately and intersecting the results.
index_chr1 = np.where(index_chroms == 'chr1')
index_start_2500000 = np.where(index_start == 2500000)
index_end_5000000 = np.where(index_end == 5000000)
# np.where returns a tuple of index arrays; np.intersect1d flattens its
# inputs, so the tuples can be passed directly.
intersect = np.intersect1d(index_chr1, index_start_2500000)
# Apply the end-position condition as well. This step was previously commented
# out (and referred to an undefined name), so the end filter was never applied.
intersect = np.intersect1d(intersect, index_end_5000000)
intersect

array([   1, 1101])

In [39]:
# Use pandas to view the index object as a table: one row per bead, with the
# chromosome name, start and end arrays as the columns.
struc_info = pd.DataFrame({'chr': index_chroms, 'start': index_start, 'end': index_end})
# Show the dtype of the 'start' column.
struc_info['start'].dtype

dtype('int32')

### Practice
For the previous question, find the bead index whose chr='chr1', start=2500000, and end=5000000 in the table struc_info. <br>(Hint: you can use struc_info[struc_info['chr']=='xxx'] to select the rows that match a particular condition 'xxx'.)

In [42]:
# Keep only the rows of struc_info that satisfy all three conditions at once:
# chromosome 'chr1', start 2500000 and end 5000000.
target_beads = struc_info[
    struc_info['chr'].eq('chr1')
    & struc_info['start'].eq(2500000)
    & struc_info['end'].eq(5000000)
]
target_beads

Unnamed: 0,chr,start,end
1,chr1,2500000,5000000
1101,chr1,2500000,5000000
