Skip to content

Commit

Permalink
Merge pull request #306 from deeptools/normalize
Browse files Browse the repository at this point in the history
Normalize
  • Loading branch information
joachimwolff committed Nov 9, 2018
2 parents a2db0ff + ecce499 commit aabb72b
Show file tree
Hide file tree
Showing 19 changed files with 332 additions and 2 deletions.
7 changes: 7 additions & 0 deletions bin/hicNormalize
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from hicexplorer.hicNormalize import main

if __name__ == "__main__":
main()
98 changes: 98 additions & 0 deletions hicexplorer/hicNormalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from __future__ import division
import argparse
from hicmatrix import HiCMatrix as hm
from hicexplorer._version import __version__
from hicexplorer.utilities import toString
from hicmatrix.HiCMatrix import check_cooler
import logging
log = logging.getLogger(__name__)

import numpy as np


def parse_arguments(args=None):

parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
add_help=False,
description="""
Normalizes given matrices either to the smallest given read number of all matrices or to 0 - 1 range.
""")

parserRequired = parser.add_argument_group('Required arguments')

parserRequired.add_argument('--matrices', '-m',
help='The matrix (or multiple matrices) to get information about. '
'HiCExplorer supports the following file formats: h5 (native HiCExplorer format) '
'and cool.',
nargs='+',
required=True)

parserRequired.add_argument('--normalize', '-n',
help='Normalize to a) 0 to 1 range, b) all matrices to the lowest read count of the given matrices.',
choices=['norm_range', 'smallest'],
default='smallest',
required=True)
parserRequired.add_argument('--outFileName', '-o',
help='Output file name for the Hi-C matrix.',
metavar='FILENAME',
nargs='+',
required=True)
parserOpt = parser.add_argument_group('Optional arguments')

parserOpt.add_argument('--help', '-h', action='help', help='show this help message and exit')

parserOpt.add_argument('--version', action='version',
version='%(prog)s {}'.format(__version__))

return parser


def main(args=None):

args = parse_arguments().parse_args(args)
hic_matrix_list = []
sum_list = []
for matrix in args.matrices:
hic_ma = hm.hiCMatrix(matrix)
if args.normalize == 'smallest':
sum_list.append(hic_ma.matrix.sum())
hic_matrix_list.append(hic_ma)

if args.normalize == 'norm_range':
for i, hic_matrix in enumerate(hic_matrix_list):
hic_matrix.matrix.data = hic_matrix.matrix.data.astype(np.float32)

min_value = np.min(hic_matrix.matrix.data)
max_value = np.max(hic_matrix.matrix.data)
min_max_difference = np.float64(max_value - min_value)

hic_matrix.matrix.data -= min_value
hic_matrix.matrix.data /= min_max_difference

mask = np.isnan(hic_matrix.matrix.data)
hic_matrix.matrix.data[mask] = 0

mask = np.isinf(hic_matrix.matrix.data)
hic_matrix.matrix.data[mask] = 0
hic_matrix.matrix.eliminate_zeros()

hic_matrix.save(args.outFileName[i], pApplyCorrection=False)
elif args.normalize == 'smallest':
argmin = np.argmin(sum_list)

for i, hic_matrix in enumerate(hic_matrix_list):
hic_matrix.matrix.data = hic_matrix.matrix.data.astype(np.float32)
if i != argmin:
adjust_factor = sum_list[i] / sum_list[argmin]
hic_matrix.matrix.data /= adjust_factor
mask = np.isnan(hic_matrix.matrix.data)

mask = np.isnan(hic_matrix.matrix.data)
hic_matrix.matrix.data[mask] = 0

mask = np.isinf(hic_matrix.matrix.data)
hic_matrix.matrix.data[mask] = 0
hic_matrix.matrix.eliminate_zeros()

hic_matrix.save(args.outFileName[i], pApplyCorrection=False)
224 changes: 224 additions & 0 deletions hicexplorer/test/general/test_hicNormalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
from hicmatrix import HiCMatrix as hm
from hicexplorer import hicNormalize

from tempfile import NamedTemporaryFile, mkdtemp
import shutil
import os
import numpy.testing as nt


ROOT = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "test_data/hicNormalize")

matrix_one_h5 = ROOT + '/small_test_matrix.h5'
matrix_two_h5 = ROOT + '/small_test_matrix_scaled_up.h5'

matrix_one_cool = ROOT + '/small_test_matrix.cool'
matrix_two_cool = ROOT + '/small_test_matrix_scaled_up.cool'


def test_normalize_smallest(capsys):
outfile_one = NamedTemporaryFile(suffix='.h5', delete=False)
outfile_one.close()

outfile_two = NamedTemporaryFile(suffix='.h5', delete=False)
outfile_two.close()

args = "--matrices {} {} --normalize smallest -o {} {}".format(matrix_one_h5, matrix_two_h5,
outfile_one.name, outfile_two.name).split()
hicNormalize.main(args)

test_one = hm.hiCMatrix(ROOT + "/smallest_one.h5")
test_two = hm.hiCMatrix(ROOT + "/smallest_two.h5")

new_one = hm.hiCMatrix(outfile_one.name)
new_two = hm.hiCMatrix(outfile_two.name)

nt.assert_equal(test_one.matrix.data, new_one.matrix.data)
nt.assert_equal(test_one.cut_intervals, new_one.cut_intervals)

nt.assert_equal(test_two.matrix.data, new_two.matrix.data)
nt.assert_equal(test_two.cut_intervals, new_two.cut_intervals)

os.unlink(outfile_one.name)
os.unlink(outfile_two.name)


def test_normalize_smallest_h5(capsys):
outfile_one = NamedTemporaryFile(suffix='.h5', delete=False)
outfile_one.close()

outfile_two = NamedTemporaryFile(suffix='.h5', delete=False)
outfile_two.close()

args = "--matrices {} {} --normalize smallest -o {} {}".format(matrix_one_h5, matrix_two_h5,
outfile_one.name, outfile_two.name).split()
hicNormalize.main(args)

test_one = hm.hiCMatrix(ROOT + "/smallest_one.h5")
test_two = hm.hiCMatrix(ROOT + "/smallest_two.h5")

new_one = hm.hiCMatrix(outfile_one.name)
new_two = hm.hiCMatrix(outfile_two.name)

nt.assert_equal(test_one.matrix.data, new_one.matrix.data)
nt.assert_equal(test_one.cut_intervals, new_one.cut_intervals)

nt.assert_equal(test_two.matrix.data, new_two.matrix.data)
nt.assert_equal(test_two.cut_intervals, new_two.cut_intervals)

os.unlink(outfile_one.name)
os.unlink(outfile_two.name)


def test_normalize_smallest_cool(capsys):
outfile_one = NamedTemporaryFile(suffix='.cool', delete=False)
outfile_one.close()

outfile_two = NamedTemporaryFile(suffix='.cool', delete=False)
outfile_two.close()

args = "--matrices {} {} --normalize smallest -o {} {}".format(matrix_one_cool, matrix_two_cool,
outfile_one.name, outfile_two.name).split()
hicNormalize.main(args)

test_one = hm.hiCMatrix(ROOT + "/smallest_one.cool")
test_two = hm.hiCMatrix(ROOT + "/smallest_two.cool")

new_one = hm.hiCMatrix(outfile_one.name)
new_two = hm.hiCMatrix(outfile_two.name)

nt.assert_equal(test_one.matrix.data, new_one.matrix.data)
nt.assert_equal(test_one.cut_intervals, new_one.cut_intervals)

nt.assert_equal(test_two.matrix.data, new_two.matrix.data)
nt.assert_equal(test_two.cut_intervals, new_two.cut_intervals)

os.unlink(outfile_one.name)
os.unlink(outfile_two.name)


def test_normalize_norm_range(capsys):
outfile_one = NamedTemporaryFile(suffix='.h5', delete=False)
outfile_one.close()

outfile_two = NamedTemporaryFile(suffix='.h5', delete=False)
outfile_two.close()

args = "--matrices {} {} --normalize norm_range -o {} {}".format(matrix_one_h5, matrix_two_h5,
outfile_one.name, outfile_two.name).split()
hicNormalize.main(args)

test_one = hm.hiCMatrix(ROOT + "/norm_range_one.h5")
test_two = hm.hiCMatrix(ROOT + "/norm_range_two.h5")

new_one = hm.hiCMatrix(outfile_one.name)
new_two = hm.hiCMatrix(outfile_two.name)

nt.assert_equal(test_one.matrix.data, new_one.matrix.data)
nt.assert_equal(test_one.cut_intervals, new_one.cut_intervals)

nt.assert_equal(test_two.matrix.data, new_two.matrix.data)
nt.assert_equal(test_two.cut_intervals, new_two.cut_intervals)

os.unlink(outfile_one.name)
os.unlink(outfile_two.name)


def test_normalize_norm_range_cool(capsys):
outfile_one = NamedTemporaryFile(suffix='.cool', delete=False)
outfile_one.close()

outfile_two = NamedTemporaryFile(suffix='.cool', delete=False)
outfile_two.close()

args = "--matrices {} {} --normalize norm_range -o {} {}".format(matrix_one_cool, matrix_two_cool,
outfile_one.name, outfile_two.name).split()
hicNormalize.main(args)

test_one = hm.hiCMatrix(ROOT + "/norm_range_one.cool")
test_two = hm.hiCMatrix(ROOT + "/norm_range_two.cool")

new_one = hm.hiCMatrix(outfile_one.name)
new_two = hm.hiCMatrix(outfile_two.name)

nt.assert_equal(test_one.matrix.data, new_one.matrix.data)
nt.assert_equal(test_one.cut_intervals, new_one.cut_intervals)

nt.assert_equal(test_two.matrix.data, new_two.matrix.data)
nt.assert_equal(test_two.cut_intervals, new_two.cut_intervals)

os.unlink(outfile_one.name)
os.unlink(outfile_two.name)


def test_normalize_norm_range_h5_cool_equal(capsys):
outfile_one = NamedTemporaryFile(suffix='.cool', delete=False)
outfile_one.close()

outfile_two = NamedTemporaryFile(suffix='.h5', delete=False)
outfile_two.close()

args = "--matrices {} --normalize norm_range -o {}".format(matrix_one_cool,
outfile_one.name).split()
hicNormalize.main(args)

args = "--matrices {} --normalize norm_range -o {}".format(matrix_one_h5,
outfile_two.name).split()
hicNormalize.main(args)

test_one = hm.hiCMatrix(ROOT + "/norm_range_one.cool")
test_two = hm.hiCMatrix(ROOT + "/norm_range_one.h5")

new_one = hm.hiCMatrix(outfile_one.name)
new_two = hm.hiCMatrix(outfile_two.name)

nt.assert_equal(test_one.matrix.data, new_one.matrix.data)
nt.assert_equal(test_one.cut_intervals, new_one.cut_intervals)

nt.assert_equal(test_two.matrix.data, new_two.matrix.data)
nt.assert_equal(test_two.cut_intervals, new_two.cut_intervals)

nt.assert_equal(new_one.matrix.data, new_two.matrix.data)
nt.assert_equal(len(new_one.cut_intervals), len(new_two.cut_intervals))

os.unlink(outfile_one.name)
os.unlink(outfile_two.name)


def test_normalize_smallest_h5_cool_equal(capsys):
outfile_one = NamedTemporaryFile(suffix='.cool', delete=False)
outfile_one.close()

outfile_one_cool = NamedTemporaryFile(suffix='.cool', delete=False)
outfile_one.close()

outfile_two = NamedTemporaryFile(suffix='.h5', delete=False)
outfile_two.close()
outfile_two_h5 = NamedTemporaryFile(suffix='.h5', delete=False)
outfile_two.close()

args = "--matrices {} {} --normalize smallest -o {} {}".format(matrix_one_cool, matrix_two_cool,
outfile_one.name, outfile_one_cool.name).split()
hicNormalize.main(args)

args = "--matrices {} {} --normalize smallest -o {} {}".format(matrix_one_h5, matrix_two_h5,
outfile_two.name, outfile_two_h5.name).split()
hicNormalize.main(args)

test_one = hm.hiCMatrix(ROOT + "/smallest_one.cool")
test_two = hm.hiCMatrix(ROOT + "/smallest_one.h5")

new_one = hm.hiCMatrix(outfile_one_cool.name)
new_two = hm.hiCMatrix(outfile_two_h5.name)

nt.assert_equal(test_one.matrix.data, new_one.matrix.data)
nt.assert_equal(test_one.cut_intervals, new_one.cut_intervals)

nt.assert_equal(test_two.matrix.data, new_two.matrix.data)
nt.assert_equal(test_two.cut_intervals, new_two.cut_intervals)

nt.assert_equal(new_one.matrix.data, new_two.matrix.data)
nt.assert_equal(len(new_one.cut_intervals), len(new_two.cut_intervals))

os.unlink(outfile_one.name)
os.unlink(outfile_two.name)
2 changes: 1 addition & 1 deletion hicexplorer/test/long_run/test_hicBuildMatrix.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from hicexplorer import hicBuildMatrix as hicBuildMatrix
from hicexplorer import hicBuildMatrix
from hicmatrix import HiCMatrix as hm
from tempfile import NamedTemporaryFile, mkdtemp
import shutil
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ def checkProgramIsInstalled(self, program, args, where_to_download,
'bin/hicMergeMatrixBins', 'bin/hicPlotMatrix', 'bin/hicPlotDistVsCounts',
'bin/hicPlotTADs', 'bin/hicSumMatrices', 'bin/hicExport', 'bin/hicInfo', 'bin/hicexplorer',
'bin/hicQC', 'bin/hicCompareMatrices', 'bin/hicPCA', 'bin/hicTransform', 'bin/hicPlotViewpoint',
'bin/hicConvertFormat', 'bin/hicAdjustMatrix'],
'bin/hicConvertFormat', 'bin/hicAdjustMatrix', 'bin/hicNormalize'
],
include_package_data=True,
package_dir={'hicexplorer': 'hicexplorer'},
package_data={'hicexplorer': ['qc_template.html']},
Expand Down

0 comments on commit aabb72b

Please sign in to comment.