Skip to content

Commit

Permalink
Merge pull request #271 from deeptools/pca
Browse files Browse the repository at this point in the history
Pca
  • Loading branch information
joachimwolff committed Jul 27, 2018
2 parents f815383 + 2e8a8a0 commit 7cdfcb0
Show file tree
Hide file tree
Showing 10 changed files with 9,269 additions and 58 deletions.
12 changes: 10 additions & 2 deletions hicexplorer/HiCMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,12 +261,16 @@ def getRegionBinRange(self, chrname, startpos, endpos):
elif type(next(iter(self.interval_trees))) is np.bytes_:
chrname = toBytes(chrname)
# chr_end_pos = chromosome_size[chrname]
self.interval_trees[chrname]
# self.interval_trees[chrname]
if chrname not in self.interval_trees:
log.exception("chromosome: {} name not found in matrix".format(chrname))
log.exception("valid names are:")
exit(1)
except KeyError:

log.exception("chromosome: {} name not found in matrix".format(chrname))
log.exception("valid names are:")
log.exception(self.interval_trees.keys())
# log.exception(list(self.interval_trees))
exit(1)
try:
startpos = int(startpos)
Expand All @@ -277,9 +281,13 @@ def getRegionBinRange(self, chrname, startpos, endpos):
exit(1)

try:

startbin = sorted(self.interval_trees[chrname][startpos:startpos + 1])[0].data
endbin = sorted(self.interval_trees[chrname][endpos:endpos + 1])[0].data
except IndexError:
# log.exception("chrname: " + chrname)
# log.exception("len intervaltree: "+len(self.interval_trees[chrname]))
# log.exception("start and end pos:" + startpos + ":::" + endpos )
log.exception("Index error")
return None

Expand Down
76 changes: 63 additions & 13 deletions hicexplorer/hicPCA.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from scipy.sparse import csr_matrix
from scipy import linalg

from scipy.stats import pearsonr
import numpy as np
import pyBigWig

Expand All @@ -14,7 +14,9 @@
from hicexplorer.utilities import convertNansToZeros, convertInfsToZeros
from hicexplorer.parserCommon import CustomFormatter
from hicexplorer.utilities import toString
from hicexplorer.utilities import opener

from .readBed import ReadBed
import logging
log = logging.getLogger(__name__)

Expand Down Expand Up @@ -68,7 +70,9 @@ def parse_arguments():
'correlation.',
default=None,
nargs='+')

parserOpt.add_argument('--geneTrack',
help='The gene track is needed to decide if the values of the eigenvector need a sign flip or not.',
default=None)
parserOpt.add_argument('--help', '-h', action='help', help='show this help message and exit')

parserOpt.add_argument('--version', action='version',
Expand All @@ -77,6 +81,50 @@ def parse_arguments():
return parser


def correlateEigenvectorWithGeneTrack(pMatrix, pEigenvector, pGeneTrack):
'''
This function correlates the eigenvectors per chromosome with the gene density.
If the correlation is negative, the eigenvector values are multiplied with -1.
'''

file_h = opener(pGeneTrack)
bed = ReadBed(file_h)

gene_occurrence = np.zeros(len(pMatrix.cut_intervals))
gene_occurrence_per_chr = {}

chromosome_list = pMatrix.getChrNames()

for interval in bed:
chromosome_name = interval.chromosome
if chromosome_name not in chromosome_list:
continue
# in which bin of the Hi-C matrix is the given gene?
bin_id = pMatrix.getRegionBinRange(interval.chromosome, interval.start, interval.end)

# add +1 for one gene occurrence in this bin
gene_occurrence[bin_id[1]] += 1

for chromosome in chromosome_list:
# where is the start and the end bin of a chromosome?
bin_id = pMatrix.getChrBinRange(chromosome)
gene_occurrence_per_chr[chromosome] = gene_occurrence[bin_id[0]: bin_id[1]]

# change from [[1,2], [3,4], [5,6]] to [[1,3,5],[2,4,6]]
pEigenvector = np.array(pEigenvector).real.transpose()

# correlate gene density and eigenvector values.
# if positive correlation, do nothing, if negative, flip the values.
# computed per chromosome
for chromosome in chromosome_list:
bin_id = pMatrix.getChrBinRange(chromosome)
for i, eigenvector in enumerate(pEigenvector):
_correlation = pearsonr(eigenvector[bin_id[0]:bin_id[1]].real, gene_occurrence_per_chr[chromosome])
if _correlation[0] < 0:
eigenvector[bin_id[0]:bin_id[1]] = np.negative(eigenvector[bin_id[0]:bin_id[1]])
return np.array(pEigenvector).transpose()


def main(args=None):
args = parse_arguments().parse_args(args)
if int(args.numberOfEigenvectors) != len(args.outputFileName):
Expand All @@ -103,7 +151,6 @@ def main(args=None):
length_chromosome += chr_range[1] - chr_range[0]
for chrname in ma.getChrNames():
chr_range = ma.getChrBinRange(chrname)
log.debug("Computing pca for chromosome: {}".format(chrname))

submatrix = ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]

Expand All @@ -127,6 +174,9 @@ def main(args=None):
start_list += start
end_list += end

if args.geneTrack:
vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list, args.geneTrack)

if args.format == 'bedgraph':
for idx, outfile in enumerate(args.outputFileName):
assert(len(vecs_list) == len(chrom_list))
Expand All @@ -144,20 +194,20 @@ def main(args=None):
exit(1)
old_chrom = chrom_list[0]
header = []
for i, chrom_ in enumerate(chrom_list):
if old_chrom != chrom_:
for i, _chrom in enumerate(chrom_list):
if old_chrom != _chrom:
header.append((toString(old_chrom), end_list[i - 1]))
old_chrom = chrom_
old_chrom = _chrom

header.append((toString(chrom_list[-1]), end_list[-1]))
for idx, outfile in enumerate(args.outputFileName):
log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

assert(len(vecs_list) == len(chrom_list))
chrom_list_ = []
start_list_ = []
end_list_ = []
_chrom_list = []
_start_list = []
_end_list = []
values = []

bw = pyBigWig.open(outfile, 'w')
Expand All @@ -170,12 +220,12 @@ def main(args=None):
if isinstance(value[idx], np.complex):
value[idx] = value[idx].real
values.append(value[idx])
chrom_list_.append(toString(chrom_list[i]))
start_list_.append(start_list[i])
end_list_.append(end_list[i])
_chrom_list.append(toString(chrom_list[i]))
_start_list.append(start_list[i])
_end_list.append(end_list[i])

# write entries
bw.addEntries(chrom_list_, start_list_, ends=end_list_, values=values)
bw.addEntries(_chrom_list, _start_list, ends=_end_list, values=values)
bw.close()
else:
log.error("Output format not known: {}".format(args.format))
Expand Down
2 changes: 1 addition & 1 deletion hicexplorer/lib/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .matrixFileHandler import MatrixFileHandler
from .matrixFileHandler import MatrixFileHandler # noqa: F401
66 changes: 53 additions & 13 deletions hicexplorer/test/general/test_hicPCA.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def are_files_equal(file1, file2):
split_x = x.split('\t')
split_y = y.split('\t')
if split_x[0] == split_y[0] and split_x[1] == split_y[1] and split_x[2] == split_y[2]:
# to ignore rounding errors after 4th digit
if 0 <= abs(abs(float(split_x[3].strip())) - abs(float(split_y[3].strip()))) <= 0.0001:
# to ignore rounding errors after 2th digit
if 0 <= abs(abs(float(split_x[3].strip())) - abs(float(split_y[3].strip()))) <= 0.01:
continue
else:
log.debug('wtf: {}'.format(abs(abs(float(split_x[3].strip())) - abs(float(split_y[3].strip())))))
Expand All @@ -34,13 +34,12 @@ def are_files_equal(file1, file2):
return equal


def are_files_equal_bigwig(file1, file2):
chrom_list = ['chr2L', 'chr2R', 'chr3L', 'chr3R', 'chr2RHet', 'chr3RHet', 'chr2LHet', 'chr4',
'chrU', 'chrX', 'chrXHet', 'chr3LHet']
bw_file1 = pyBigWig.open(file1)
bw_file2 = pyBigWig.open(file2)
def are_files_equal_bigwig(pFile1, pFile2, pChromosomeList):

for chrom in chrom_list:
bw_file1 = pyBigWig.open(pFile1)
bw_file2 = pyBigWig.open(pFile2)

for chrom in pChromosomeList:
try:
bins_list_file1 = bw_file1.intervals(chrom)
except Exception:
Expand All @@ -57,7 +56,7 @@ def are_files_equal_bigwig(file1, file2):
bins_list_file1[:][2] *= -1
if bins_list_file1 is None and bins_list_file2 is None:
return True
nt.assert_array_almost_equal(np.absolute(bins_list_file1), np.absolute(bins_list_file2))
nt.assert_array_almost_equal(np.absolute(bins_list_file1), np.absolute(bins_list_file2), decimal=2)
return True


Expand All @@ -74,8 +73,8 @@ def test_pca_bedgraph():
assert are_files_equal(ROOT + "hicPCA/pca1.bedgraph", pca1.name)
assert are_files_equal(ROOT + "hicPCA/pca2.bedgraph", pca2.name)

# os.unlink(pca1.name)
# os.unlink(pca2.name)
os.unlink(pca1.name)
os.unlink(pca2.name)


def test_pca_bigwig():
Expand All @@ -88,8 +87,49 @@ def test_pca_bigwig():
args = "--matrix {} --outputFileName {} {} -f bigwig -noe 2".format(matrix, pca1.name, pca2.name).split()
hicPCA.main(args)

assert are_files_equal_bigwig(ROOT + "hicPCA/pca1.bw", pca1.name)
assert are_files_equal_bigwig(ROOT + "hicPCA/pca2.bw", pca2.name)
chrom_list = ['chr2L', 'chr2R', 'chr3L', 'chr3R', 'chr2RHet', 'chr3RHet', 'chr2LHet', 'chr4',
'chrU', 'chrX', 'chrXHet', 'chr3LHet']
assert are_files_equal_bigwig(ROOT + "hicPCA/pca1.bw", pca1.name, chrom_list)
assert are_files_equal_bigwig(ROOT + "hicPCA/pca2.bw", pca2.name, chrom_list)

os.unlink(pca1.name)
os.unlink(pca2.name)


def test_pca_bedgraph_gene_density():
pca1 = NamedTemporaryFile(suffix='.bedgraph', delete=False)
pca2 = NamedTemporaryFile(suffix='.bedgraph', delete=False)

pca1.close()
pca2.close()
matrix = ROOT + "small_test_matrix.h5"
gene_track = ROOT + 'dm3_genes.bed.gz'
chromosomes = 'chrX chrXHet'
args = "--matrix {} --outputFileName {} {} -f bedgraph -noe 2 --geneTrack {} --chromosomes {}".format(matrix, pca1.name, pca2.name, gene_track, chromosomes).split()
hicPCA.main(args)

assert are_files_equal(ROOT + "hicPCA/pca1_gene_track.bedgraph", pca1.name)
assert are_files_equal(ROOT + "hicPCA/pca2_gene_track.bedgraph", pca2.name)

os.unlink(pca1.name)
os.unlink(pca2.name)


def test_pca_bigwig_gene_density():
pca1 = NamedTemporaryFile(suffix='.bw', delete=False)
pca2 = NamedTemporaryFile(suffix='.bw', delete=False)

pca1.close()
pca2.close()
matrix = ROOT + "small_test_matrix.h5"
gene_track = ROOT + 'dm3_genes.bed.gz'
chromosomes = 'chrX chrXHet'
args = "--matrix {} --outputFileName {} {} -f bigwig -noe 2 --geneTrack {} --chromosomes {}".format(matrix, pca1.name, pca2.name, gene_track, chromosomes).split()
hicPCA.main(args)

chrom_list = ['chrX', 'chrXHet']
assert are_files_equal_bigwig(ROOT + "hicPCA/pca1_gene_track.bw", pca1.name, chrom_list)
assert are_files_equal_bigwig(ROOT + "hicPCA/pca2_gene_track.bw", pca2.name, chrom_list)

os.unlink(pca1.name)
os.unlink(pca2.name)

0 comments on commit 7cdfcb0

Please sign in to comment.