Merge pull request #271 from deeptools/pca

Pca
deeptools · Jul 27, 2018 · 7cdfcb0 · 7cdfcb0
2 parents f815383 + 2e8a8a0
commit 7cdfcb0
Show file tree

Hide file tree

Showing 10 changed files with 9,269 additions and 58 deletions.
diff --git a/hicexplorer/HiCMatrix.py b/hicexplorer/HiCMatrix.py
@@ -261,12 +261,16 @@ def getRegionBinRange(self, chrname, startpos, endpos):
                 elif type(next(iter(self.interval_trees))) is np.bytes_:
                     chrname = toBytes(chrname)
             # chr_end_pos = chromosome_size[chrname]
-            self.interval_trees[chrname]
+            # self.interval_trees[chrname]
+            if chrname not in self.interval_trees:
+                log.exception("chromosome: {} name not found in matrix".format(chrname))
+                log.exception("valid names are:")
+                exit(1)
         except KeyError:
 
             log.exception("chromosome: {} name not found in matrix".format(chrname))
             log.exception("valid names are:")
-            log.exception(self.interval_trees.keys())
+            # log.exception(list(self.interval_trees))
             exit(1)
         try:
             startpos = int(startpos)
@@ -277,9 +281,13 @@ def getRegionBinRange(self, chrname, startpos, endpos):
             exit(1)
 
         try:
+
             startbin = sorted(self.interval_trees[chrname][startpos:startpos + 1])[0].data
             endbin = sorted(self.interval_trees[chrname][endpos:endpos + 1])[0].data
         except IndexError:
+            # log.exception("chrname: " + chrname)
+            # log.exception("len intervaltree: "+len(self.interval_trees[chrname]))
+            # log.exception("start and end pos:" + startpos + ":::" + endpos )
             log.exception("Index error")
             return None
 

diff --git a/hicexplorer/hicPCA.py b/hicexplorer/hicPCA.py
@@ -4,7 +4,7 @@
 
 from scipy.sparse import csr_matrix
 from scipy import linalg
-
+from scipy.stats import pearsonr
 import numpy as np
 import pyBigWig
 
@@ -14,7 +14,9 @@
 from hicexplorer.utilities import convertNansToZeros, convertInfsToZeros
 from hicexplorer.parserCommon import CustomFormatter
 from hicexplorer.utilities import toString
+from hicexplorer.utilities import opener
 
+from .readBed import ReadBed
 import logging
 log = logging.getLogger(__name__)
 
@@ -68,7 +70,9 @@ def parse_arguments():
                            'correlation.',
                            default=None,
                            nargs='+')
-
+    parserOpt.add_argument('--geneTrack',
+                           help='The gene track is needed to decide if the values of the eigenvector need a sign flip or not.',
+                           default=None)
     parserOpt.add_argument('--help', '-h', action='help', help='show this help message and exit')
 
     parserOpt.add_argument('--version', action='version',
@@ -77,6 +81,50 @@ def parse_arguments():
     return parser
 
 
+def correlateEigenvectorWithGeneTrack(pMatrix, pEigenvector, pGeneTrack):
+    '''
+    This function correlates the eigenvectors per chromosome with the gene density.
+    If the correlation is negative, the eigenvector values are multiplied with -1.
+    '''
+
+    file_h = opener(pGeneTrack)
+    bed = ReadBed(file_h)
+
+    gene_occurrence = np.zeros(len(pMatrix.cut_intervals))
+    gene_occurrence_per_chr = {}
+
+    chromosome_list = pMatrix.getChrNames()
+
+    for interval in bed:
+        chromosome_name = interval.chromosome
+        if chromosome_name not in chromosome_list:
+            continue
+        # in which bin of the Hi-C matrix is the given gene?
+        bin_id = pMatrix.getRegionBinRange(interval.chromosome, interval.start, interval.end)
+
+        # add +1 for one gene occurrence in this bin
+        gene_occurrence[bin_id[1]] += 1
+
+    for chromosome in chromosome_list:
+        # where is the start and the end bin of a chromosome?
+        bin_id = pMatrix.getChrBinRange(chromosome)
+        gene_occurrence_per_chr[chromosome] = gene_occurrence[bin_id[0]: bin_id[1]]
+
+    # change from [[1,2], [3,4], [5,6]] to [[1,3,5],[2,4,6]]
+    pEigenvector = np.array(pEigenvector).real.transpose()
+
+    # correlate gene density and eigenvector values.
+    # if positive correlation, do nothing, if negative, flip the values.
+    # computed per chromosome
+    for chromosome in chromosome_list:
+        bin_id = pMatrix.getChrBinRange(chromosome)
+        for i, eigenvector in enumerate(pEigenvector):
+            _correlation = pearsonr(eigenvector[bin_id[0]:bin_id[1]].real, gene_occurrence_per_chr[chromosome])
+            if _correlation[0] < 0:
+                eigenvector[bin_id[0]:bin_id[1]] = np.negative(eigenvector[bin_id[0]:bin_id[1]])
+    return np.array(pEigenvector).transpose()
+
+
 def main(args=None):
     args = parse_arguments().parse_args(args)
     if int(args.numberOfEigenvectors) != len(args.outputFileName):
@@ -103,7 +151,6 @@ def main(args=None):
         length_chromosome += chr_range[1] - chr_range[0]
     for chrname in ma.getChrNames():
         chr_range = ma.getChrBinRange(chrname)
-        log.debug("Computing pca for chromosome: {}".format(chrname))
 
         submatrix = ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]
 
@@ -127,6 +174,9 @@ def main(args=None):
         start_list += start
         end_list += end
 
+    if args.geneTrack:
+        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list, args.geneTrack)
+
     if args.format == 'bedgraph':
         for idx, outfile in enumerate(args.outputFileName):
             assert(len(vecs_list) == len(chrom_list))
@@ -144,20 +194,20 @@ def main(args=None):
             exit(1)
         old_chrom = chrom_list[0]
         header = []
-        for i, chrom_ in enumerate(chrom_list):
-            if old_chrom != chrom_:
+        for i, _chrom in enumerate(chrom_list):
+            if old_chrom != _chrom:
                 header.append((toString(old_chrom), end_list[i - 1]))
-            old_chrom = chrom_
+            old_chrom = _chrom
 
         header.append((toString(chrom_list[-1]), end_list[-1]))
         for idx, outfile in enumerate(args.outputFileName):
             log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
             log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))
 
             assert(len(vecs_list) == len(chrom_list))
-            chrom_list_ = []
-            start_list_ = []
-            end_list_ = []
+            _chrom_list = []
+            _start_list = []
+            _end_list = []
             values = []
 
             bw = pyBigWig.open(outfile, 'w')
@@ -170,12 +220,12 @@ def main(args=None):
                     if isinstance(value[idx], np.complex):
                         value[idx] = value[idx].real
                     values.append(value[idx])
-                    chrom_list_.append(toString(chrom_list[i]))
-                    start_list_.append(start_list[i])
-                    end_list_.append(end_list[i])
+                    _chrom_list.append(toString(chrom_list[i]))
+                    _start_list.append(start_list[i])
+                    _end_list.append(end_list[i])
 
             # write entries
-            bw.addEntries(chrom_list_, start_list_, ends=end_list_, values=values)
+            bw.addEntries(_chrom_list, _start_list, ends=_end_list, values=values)
             bw.close()
     else:
         log.error("Output format not known: {}".format(args.format))

diff --git a/hicexplorer/lib/__init__.py b/hicexplorer/lib/__init__.py
@@ -1 +1 @@
-from .matrixFileHandler import MatrixFileHandler
+from .matrixFileHandler import MatrixFileHandler  # noqa: F401
diff --git a/hicexplorer/test/general/test_hicPCA.py b/hicexplorer/test/general/test_hicPCA.py
@@ -22,8 +22,8 @@ def are_files_equal(file1, file2):
                 split_x = x.split('\t')
                 split_y = y.split('\t')
                 if split_x[0] == split_y[0] and split_x[1] == split_y[1] and split_x[2] == split_y[2]:
-                    # to ignore rounding errors after 4th digit
-                    if 0 <= abs(abs(float(split_x[3].strip())) - abs(float(split_y[3].strip()))) <= 0.0001:
+                    # to ignore rounding errors after 2th digit
+                    if 0 <= abs(abs(float(split_x[3].strip())) - abs(float(split_y[3].strip()))) <= 0.01:
                         continue
                     else:
                         log.debug('wtf: {}'.format(abs(abs(float(split_x[3].strip())) - abs(float(split_y[3].strip())))))
@@ -34,13 +34,12 @@ def are_files_equal(file1, file2):
     return equal
 
 
-def are_files_equal_bigwig(file1, file2):
-    chrom_list = ['chr2L', 'chr2R', 'chr3L', 'chr3R', 'chr2RHet', 'chr3RHet', 'chr2LHet', 'chr4',
-                  'chrU', 'chrX', 'chrXHet', 'chr3LHet']
-    bw_file1 = pyBigWig.open(file1)
-    bw_file2 = pyBigWig.open(file2)
+def are_files_equal_bigwig(pFile1, pFile2, pChromosomeList):
 
-    for chrom in chrom_list:
+    bw_file1 = pyBigWig.open(pFile1)
+    bw_file2 = pyBigWig.open(pFile2)
+
+    for chrom in pChromosomeList:
         try:
             bins_list_file1 = bw_file1.intervals(chrom)
         except Exception:
@@ -57,7 +56,7 @@ def are_files_equal_bigwig(file1, file2):
             bins_list_file1[:][2] *= -1
         if bins_list_file1 is None and bins_list_file2 is None:
             return True
-        nt.assert_array_almost_equal(np.absolute(bins_list_file1), np.absolute(bins_list_file2))
+        nt.assert_array_almost_equal(np.absolute(bins_list_file1), np.absolute(bins_list_file2), decimal=2)
     return True
 
 
@@ -74,8 +73,8 @@ def test_pca_bedgraph():
     assert are_files_equal(ROOT + "hicPCA/pca1.bedgraph", pca1.name)
     assert are_files_equal(ROOT + "hicPCA/pca2.bedgraph", pca2.name)
 
-    # os.unlink(pca1.name)
-    # os.unlink(pca2.name)
+    os.unlink(pca1.name)
+    os.unlink(pca2.name)
 
 
 def test_pca_bigwig():
@@ -88,8 +87,49 @@ def test_pca_bigwig():
     args = "--matrix {} --outputFileName {} {} -f bigwig -noe 2".format(matrix, pca1.name, pca2.name).split()
     hicPCA.main(args)
 
-    assert are_files_equal_bigwig(ROOT + "hicPCA/pca1.bw", pca1.name)
-    assert are_files_equal_bigwig(ROOT + "hicPCA/pca2.bw", pca2.name)
+    chrom_list = ['chr2L', 'chr2R', 'chr3L', 'chr3R', 'chr2RHet', 'chr3RHet', 'chr2LHet', 'chr4',
+                  'chrU', 'chrX', 'chrXHet', 'chr3LHet']
+    assert are_files_equal_bigwig(ROOT + "hicPCA/pca1.bw", pca1.name, chrom_list)
+    assert are_files_equal_bigwig(ROOT + "hicPCA/pca2.bw", pca2.name, chrom_list)
+
+    os.unlink(pca1.name)
+    os.unlink(pca2.name)
+
+
+def test_pca_bedgraph_gene_density():
+    pca1 = NamedTemporaryFile(suffix='.bedgraph', delete=False)
+    pca2 = NamedTemporaryFile(suffix='.bedgraph', delete=False)
+
+    pca1.close()
+    pca2.close()
+    matrix = ROOT + "small_test_matrix.h5"
+    gene_track = ROOT + 'dm3_genes.bed.gz'
+    chromosomes = 'chrX chrXHet'
+    args = "--matrix {} --outputFileName {} {} -f bedgraph -noe 2 --geneTrack {} --chromosomes {}".format(matrix, pca1.name, pca2.name, gene_track, chromosomes).split()
+    hicPCA.main(args)
+
+    assert are_files_equal(ROOT + "hicPCA/pca1_gene_track.bedgraph", pca1.name)
+    assert are_files_equal(ROOT + "hicPCA/pca2_gene_track.bedgraph", pca2.name)
+
+    os.unlink(pca1.name)
+    os.unlink(pca2.name)
+
+
+def test_pca_bigwig_gene_density():
+    pca1 = NamedTemporaryFile(suffix='.bw', delete=False)
+    pca2 = NamedTemporaryFile(suffix='.bw', delete=False)
+
+    pca1.close()
+    pca2.close()
+    matrix = ROOT + "small_test_matrix.h5"
+    gene_track = ROOT + 'dm3_genes.bed.gz'
+    chromosomes = 'chrX chrXHet'
+    args = "--matrix {} --outputFileName {} {} -f bigwig -noe 2 --geneTrack {} --chromosomes {}".format(matrix, pca1.name, pca2.name, gene_track, chromosomes).split()
+    hicPCA.main(args)
+
+    chrom_list = ['chrX', 'chrXHet']
+    assert are_files_equal_bigwig(ROOT + "hicPCA/pca1_gene_track.bw", pca1.name, chrom_list)
+    assert are_files_equal_bigwig(ROOT + "hicPCA/pca2_gene_track.bw", pca2.name, chrom_list)
 
     os.unlink(pca1.name)
     os.unlink(pca2.name)