Skip to content

Commit

Permalink
Fixing bug for writing cooler files and applying nan_bins. Added opti…
Browse files Browse the repository at this point in the history
…on: nan_bins will only be applied if the source file was h5. Added test cases and removal of possible zero values when writing h5 files.
  • Loading branch information
joachimwolff committed Nov 9, 2018
1 parent df671e1 commit b8c2749
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 27 deletions.
52 changes: 27 additions & 25 deletions hicmatrix/lib/cool.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def __init__(self, pMatrixFile=None):
self.correctionOperator = '*'
self.enforceInteger = False
self.appendData = False
self.fileWasH5 = False

def getInformationCoolerBinNames(self):
return cooler.Cooler(self.matrixFileName).bins().columns.values
Expand Down Expand Up @@ -66,10 +67,12 @@ def load(self, pApplyCorrection=None, pMatrixOnly=None):
start_pos += len(_features)
i += size
log.debug('sum of data: {}'.format(np.sum(data)))
log.debug('len of data: {}'.format(len(data)))

matrix = csr_matrix((data, (instances, features)), shape=(cooler_file.info['nbins'], cooler_file.info['nbins']), dtype=count_dtype)
del data
del instances
del features
# del data
# del instances
# del features
else:
if len(self.chrnameList) == 1:
try:
Expand Down Expand Up @@ -108,6 +111,8 @@ def load(self, pApplyCorrection=None, pMatrixOnly=None):
correction_factors = convertNansToOnes(np.array(correction_factors_data_frame.values).flatten())
# apply only if there are not only 1's
if np.sum(correction_factors) != len(correction_factors):
self.matrix.sort_indices()

instances, features = matrix.nonzero()
instances_factors = correction_factors[instances]
features_factors = correction_factors[features]
Expand All @@ -126,15 +131,8 @@ def load(self, pApplyCorrection=None, pMatrixOnly=None):
# try to restore nan_bins.
try:
shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1]
nan_bins = np.array(range(shape))
nan_bins = np.setxor1d(nan_bins, matrix.indices)

i = 0
while i < len(nan_bins):
if nan_bins[i] >= shape:
break
i += 1
nan_bins = nan_bins[:i]
nan_bins = np.arange(shape)
nan_bins = np.setdiff1d(nan_bins, matrix.indices[:-1])

except Exception:
nan_bins = None
Expand All @@ -145,24 +143,31 @@ def load(self, pApplyCorrection=None, pMatrixOnly=None):

def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
log.debug('Save in cool format')
log.debug('start save!!!! sum of data column csr matrix {}'.format(self.matrix.data.sum()))

self.matrix.eliminate_zeros()
if self.nan_bins is not None and len(self.nan_bins) > 0:
# remove nan_bins by multiplying them by 0 to set them to 0.

if self.nan_bins is not None and len(self.nan_bins) > 0 and self.fileWasH5:
# remove nan_bins
correction_factors = np.ones(self.matrix.shape[0])
correction_factors[self.nan_bins] = 0
self.matrix.sort_indices()
_instances, _features = self.matrix.nonzero()

instances_factors = correction_factors[_instances]
features_factors = correction_factors[_features]
instances_factors *= features_factors
self.matrix.data = self.matrix.data.astype(float)
self.matrix.data *= instances_factors

instances_factors = np.logical_not(np.logical_or(instances_factors, features_factors))
# self.matrix.data = self.matrix.data.astype(float)
self.matrix.data[instances_factors] = 0

self.matrix.eliminate_zeros()

# set possible nans in data to 0
self.matrix.data[np.argwhere(np.isnan(self.matrix.data))] = 0
self.matrix.eliminate_zeros()
mask = np.isnan(self.matrix.data)

self.matrix.data[mask] = 0
# self.matrix.data[np.argwhere(np.isnan(self.matrix.data))] = 0
self.matrix.eliminate_zeros()
# save only the upper triangle of the
if pSymmetric:
# symmetric matrix
Expand All @@ -178,7 +183,7 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
# instead of handling this before.
bins_data_frame = pd.DataFrame(self.cut_intervals, columns=['chrom', 'start', 'end', 'interactions']).drop('interactions', axis=1)

dtype_pixel = {'bin1_id': np.int32, 'bin2_id':np.int32, 'count':np.int32}
dtype_pixel = {'bin1_id': np.int32, 'bin2_id': np.int32, 'count': np.int32}

if self.correction_factors is not None and pApplyCorrection:
dtype_pixel['weight'] = np.float32
Expand All @@ -190,8 +195,6 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
# create a tuple list and use it to create a data frame

# save correction factors and original matrix
log.debug('self.correction_factors {}'.format(self.correction_factors))
log.debug('pApplyCorrection {}'.format(pApplyCorrection))

# revert correction to store original matrix
if self.correction_factors is not None and pApplyCorrection:
Expand Down Expand Up @@ -227,7 +230,6 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
matrix_data_frame = matrix_data_frame.assign(bin2_id=features)
del features

log.debug('self.enforceInteger {}'.format(self.enforceInteger))
if self.enforceInteger:
dtype_pixel['count'] = np.int32
data = np.rint(self.matrix.data)
Expand All @@ -247,4 +249,4 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
bins=bins_data_frame,
pixels=matrix_data_frame,
append=self.appendData,
dtype=dtype_pixel)
dtype=dtype_pixel)
2 changes: 2 additions & 0 deletions hicmatrix/lib/h5.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ def save(self, filename, pSymmetric=True, pApplyCorrection=None):
matrix = triu(self.matrix, k=0, format='csr')
else:
matrix = self.matrix
matrix.eliminate_zeros()

filters = tables.Filters(complevel=5, complib='blosc')
with tables.open_file(filename, mode="w", title="HiCExplorer matrix") as h5file:
matrix_group = h5file.create_group("/", "matrix", )
Expand Down
4 changes: 3 additions & 1 deletion hicmatrix/lib/matrixFileHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class MatrixFileHandler():

def __init__(self, pFileType='cool', pMatrixFile=None, pChrnameList=None,
pApplyCorrectionCooler=None, pBedFileHicPro=None, pCorrectionFactorTable=None,
pCorrectionOperator=None, pEnforceInteger=None, pAppend=None):
pCorrectionOperator=None, pEnforceInteger=None, pAppend=None, pFileWasH5=None):

self.class_ = getattr(importlib.import_module('.' + pFileType.lower(), package='hicmatrix.lib'), pFileType.title())

Expand All @@ -29,6 +29,8 @@ def __init__(self, pFileType='cool', pMatrixFile=None, pChrnameList=None,
self.matrixFile.enforceInteger = pEnforceInteger
if pAppend is not None:
self.matrixFile.appendData = pAppend
if pFileWasH5 is not None:
self.matrixFile.fileWasH5 = pFileWasH5

def load(self):

Expand Down
5 changes: 4 additions & 1 deletion hicmatrix/test/test_HiCMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_data/")


def test_load_h5_save_and_load_cool():
hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.h5')

Expand All @@ -41,6 +42,7 @@ def test_load_h5_save_and_load_cool():
nt.assert_equal(start_cool, start)
nt.assert_equal(end_cool, end)


def test_load_h5_save_and_load_cool_2():
hic = hm.hiCMatrix(ROOT + 'small_test_matrix.h5')

Expand All @@ -50,7 +52,6 @@ def test_load_h5_save_and_load_cool_2():

hic_cool = hm.hiCMatrix(outfile.name)


nt.assert_equal(hic_cool.matrix.data, hic.matrix.data)
chrom_cool, start_cool, end_cool, _ = zip(*hic_cool.cut_intervals)
chrom, start, end, _ = zip(*hic_cool.cut_intervals)
Expand All @@ -59,6 +60,7 @@ def test_load_h5_save_and_load_cool_2():
nt.assert_equal(start_cool, start)
nt.assert_equal(end_cool, end)


def test_load_cool_save_and_load_h5():
hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.cool')

Expand All @@ -76,6 +78,7 @@ def test_load_cool_save_and_load_h5():
nt.assert_equal(start_cool, start)
nt.assert_equal(end_cool, end)


def test_save_load_cool():
outfile = '/tmp/matrix.cool'
cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
Expand Down

0 comments on commit b8c2749

Please sign in to comment.