
Commit

Merge branch 'segment_matrix' of git://github.com/lomereiter/component_segmentation into master2
josiahseaman committed Apr 7, 2020
2 parents 97ca2a5 + 9375a2f commit bd82d7b
Showing 7 changed files with 156 additions and 53 deletions.

28 changes: 14 additions & 14 deletions data/run1.B1phi1.i1.seqwish.w100/chunk01_bin100.schematic.json

Large diffs are not rendered by default.

37 changes: 36 additions & 1 deletion matrixcomponent/tests.py
@@ -1,9 +1,12 @@
import numpy as np
import pandas as pd

from matrixcomponent.utils import (
find_groups,
path_boundaries,
path_dividers,
self_loops,
path_dividers
sort_and_drop_duplicates,
)


@@ -38,3 +41,35 @@ def test_path_dividers():
])
assert np.array_equal(links[path_dividers(links, bins)],
[[3, 1], [3, 5], [5, 9], [4, 1]])


def test_find_groups():
data = np.array([
[1, 2], [1, 2], [1, 3],
[2, 1], [2, 1], [2, 1], [2, 2],
[3, 3], [3, 3], [3, 4], [3, 4], [3, 5]
])
assert np.array_equal(find_groups(data),
[(0, 2), (2, 3),
(3, 6), (6, 7),
(7, 9), (9, 11), (11, 12)])

assert np.array_equal(find_groups(np.array([[]])), [])
assert np.array_equal(find_groups(np.array([[1, 2]])), [(0, 1)])


def test_sort_and_drop_duplicates():
df = pd.DataFrame.from_dict({
"from": [1, 3, 2, 3, 2, 0, 5, 4, 1, 2],
"to": [2, 2, 4, 2, 3, 1, 4, 3, 2, 3],
"path_index": [0, 3, 1, 2, 2, 3, 2, 1, 3, 2],
})

expected = pd.DataFrame.from_dict({
"from": [0, 1, 1, 2, 2, 3, 3, 4, 5],
"to": [1, 2, 2, 3, 4, 2, 2, 3, 4],
"path_index": [3, 0, 3, 2, 1, 2, 3, 1, 2],
}) # only one duplicate (2, 3, 2)

assert np.array_equal(sort_and_drop_duplicates(df).to_numpy(),
expected.to_numpy())
65 changes: 65 additions & 0 deletions matrixcomponent/utils.py
@@ -1,6 +1,7 @@
""" Utility functions """

import numpy as np
import pandas as pd


def path_boundaries(links: np.array) -> np.array:
@@ -34,3 +35,67 @@ def path_dividers(links: np.array, bin_ids: np.array) -> np.array:
mask[~mask] = indices[:, 1] > indices[:, 0]

return mask


# warning: this function is intended to be numba-compatible
def _split_numpy_arr(arr):
groups = []
src, dst = arr[0]
group_start = 0

for i in range(arr.shape[0]):
item = arr[i]
if item[0] != src or item[1] != dst:
groups.append((group_start, i))
group_start = i
src, dst = item

# add last group
groups.append((group_start, arr.shape[0]))
return groups

try:
# provides ~7x speedup on large tables
from numba import jit
_split_numpy_arr = jit(_split_numpy_arr)
except ImportError:
pass


def find_groups(data: np.array) -> 'List[(int, int)]':
'''
Returns:
list of [start, end) indices such that for each item data[start:end] has constant value
Args:
data(np.array): [N x 2] array of data; in context of segmentation each row is (upstream, downstream)
'''
if data.size == 0:
return []

return _split_numpy_arr(data)


def sort_and_drop_duplicates(connections: 'pd.DataFrame', shift=21) -> 'pd.DataFrame':
'''
returns connections sorted by ["from", "to", "path_index"] without duplicate entries;
see find_dividers in segmentation.py
'''
mask = (1 << shift) - 1

if np.any(connections.max() > mask):
# nigh impossible with the default limit: (1 << 21) = 2M bins / paths;
# as such, this line of code is mostly for illustration purposes
return connections.drop_duplicates().sort_values(by=["from", "to", "path_index"])

# the columns are assumed to be (from, to, path_index)
array = connections.to_numpy()

# compress all columns into a single 64-bit integer
compressed = (array[:, 0] << (2 * shift)) + (array[:, 1] << shift) + array[:, 2]
compressed_no_dups = np.unique(compressed)

return pd.DataFrame.from_dict({
'from': compressed_no_dups >> (2 * shift),
'to': (compressed_no_dups >> shift) & mask,
'path_index': compressed_no_dups & mask,
})
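
The bit-packing in sort_and_drop_duplicates is easiest to see with toy numbers: with the default shift=21, every value in the table must stay below 2**21 (roughly 2M bins/paths), so each (from, to, path_index) row fits losslessly into one 64-bit key and np.unique can sort and de-duplicate in a single pass. A minimal sketch with made-up values (not data from this repository):

    import numpy as np
    import pandas as pd

    shift = 21
    mask = (1 << shift) - 1

    # toy connection table with one duplicate row: (3, 2, 0)
    df = pd.DataFrame.from_dict({
        "from":       [3, 1, 3],
        "to":         [2, 2, 2],
        "path_index": [0, 1, 0],
    })

    # pack the three columns of each row into a single 64-bit key
    arr = df.to_numpy()
    keys = (arr[:, 0] << (2 * shift)) + (arr[:, 1] << shift) + arr[:, 2]

    # np.unique sorts and removes duplicates at the same time
    unique_keys = np.unique(keys)

    # unpack the keys back into the original columns
    result = pd.DataFrame.from_dict({
        "from": unique_keys >> (2 * shift),
        "to": (unique_keys >> shift) & mask,
        "path_index": unique_keys & mask,
    })
    print(result)   # rows (1, 2, 1) and (3, 2, 0): sorted, duplicate dropped

The guard at the top of sort_and_drop_duplicates falls back to plain pandas sorting and de-duplication if any value exceeds that limit.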
1 change: 0 additions & 1 deletion requirements.txt
@@ -1,7 +1,6 @@
# gfapy==1.0.0
nose==1.3.7
rdflib==4.2.2
nested-dict==1.61
DNASkittleUtils==1.0.13
numpy==1.18.2
pandas==1.0.3
64 changes: 34 additions & 30 deletions segmentation.py
@@ -9,7 +9,6 @@
"""
from typing import List, Tuple, Set, Dict
from pathlib import Path as osPath, PurePath
from nested_dict import nested_dict
from datetime import datetime
from sortedcontainers import SortedDict
from DNASkittleUtils.Contigs import Contig, read_contigs, write_contigs_to_file
@@ -71,43 +70,48 @@ def segment_matrix(matrix: List[Path], bin_width, cells_per_file, pangenome_leng
1,
[], [p.name for p in matrix], 1, pangenome_length)
connections, dividers = dividers_with_max_size(matrix, cells_per_file)

component_by_first_bin = {}
component_by_last_bin = {}
start_pos = 0
for valid_start in dividers:
if valid_start != 0:
current = Component(start_pos, valid_start - 1)
# current.active_members = 1
schematic.components.append(current)
component_by_first_bin[start_pos] = current
component_by_last_bin[valid_start - 1] = current
start_pos = valid_start
print(f"Created {len(schematic.components)} components")

# populate Component occupancy per Path
populate_component_matrix(matrix, schematic)

incoming = nested_dict(2, set)
outgoing = nested_dict(2, set)
for f, t, p in connections.itertuples(index=False):
incoming[t][f].add(p)
outgoing[f][t].add(p)
connections_array = connections.to_numpy()
groups = utils.find_groups(connections_array[:, :2])
path_indices = connections.path_index.to_numpy()

participants_mask = np.zeros(len(schematic.path_names), dtype=bool)

# populate all link columns onto schematic
nLinkColumns = 0
for component in schematic.components:
# TODO: order columns based on traversal patterns,
# TODO: insert additional columns for higher copy number
for origin_pos, participants in incoming[component.first_bin].items():
phase_dots = [indiv in participants for indiv in schematic.path_names]
entering = LinkColumn(origin_pos,
component.first_bin,
participants=phase_dots)
component.arrivals.append(entering)
for (start, end) in groups:
row = connections_array[start]
src, dst = int(row[0]), int(row[1])

participants_mask[:] = False
participants_mask[path_indices[start:end]] = True
phase_dots = participants_mask.tolist()
link_column = LinkColumn(src, dst, participants=phase_dots)

src_component = component_by_last_bin.get(src)
dst_component = component_by_first_bin.get(dst)

if src_component:
src_component.departures.append(link_column)
nLinkColumns += 1
for arriving_pos, participants in outgoing[component.last_bin].items():
# phase_dots depends on row ordering of path names, optimized for display
phase_dots = [indiv in participants for indiv in schematic.path_names]
leaving = LinkColumn(component.last_bin,
arriving_pos,
participants=phase_dots)
component.departures.append(leaving)

if dst_component:
dst_component.arrivals.append(link_column)
nLinkColumns += 1

for i in range(len(schematic.components)-1):
@@ -126,7 +130,7 @@ def dividers_with_max_size(matrix: List[Path], cells_per_file: int):
# estimate number of paths, x10 because most paths are empty
dividers_extended = []
prev = 0
for div in sorted(list(dividers)):
for div in dividers:
gap_size = div - prev
if gap_size > MAX_COMPONENT_SIZE:
for i in range(prev + MAX_COMPONENT_SIZE, div, MAX_COMPONENT_SIZE):
@@ -188,7 +192,7 @@ def find_dividers(matrix: List[Path]) -> Tuple[pd.DataFrame, Set[int]]:
df = pd.DataFrame.from_dict({
'from': path_dividers[:, 0], # aka upstream
'to': path_dividers[:, 1], # aka downstream
'path': path.name
'path_index': i
})

connection_dfs.append(df)
@@ -204,22 +208,22 @@ def find_dividers(matrix: List[Path]) -> Tuple[pd.DataFrame, Set[int]]:
# Stack up others using the same LinkColumn

df = pd.concat(connection_dfs)
df.drop_duplicates(inplace=True)
df = utils.sort_and_drop_duplicates(df)

# all start positions of components
dividers = set(df["from"] + 1) | set(df["to"])
dividers.add(1)
# (max_bin + 1) is end of pangenome
dividers = np.concatenate([[1, max_bin + 1], df["from"] + 1, df["to"]])
dividers = np.unique(dividers).tolist()

print(f"Largest bin_id was {max_bin}\n"
f"Found {len(dividers)} dividers.")
dividers.add(max_bin + 1) # end of pangenome

if self_loops:
n_self_loops = np.unique(np.concatenate(self_loops), axis=0).shape[0]
print(f"Eliminated {n_self_loops} self-loops")

n_uniq_links = df[df["to"] >= df["from"]]\
.drop(columns=["path"]).drop_duplicates().shape[0]
.drop(columns=["path_index"]).drop_duplicates().shape[0]

n_links = sum([len(p.links) for p in matrix])
print(f"Input has {n_links} listed Links. "
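The segment_matrix changes above replace the nested_dict bookkeeping with a single pass over the sorted connection table: find_groups yields one [start, end) run per distinct (from, to) pair, and each run becomes one LinkColumn whose participants are marked in a reusable boolean mask, so the participant flags come straight from path indices instead of name-by-name set membership tests. A rough, self-contained sketch of that pattern (toy data, a simplified inline stand-in for matrixcomponent.utils.find_groups, and the Component/LinkColumn wiring omitted):

    import numpy as np
    import pandas as pd

    # toy connection table, already sorted by (from, to, path_index)
    connections = pd.DataFrame.from_dict({
        "from":       [1, 1, 3, 3, 3],
        "to":         [2, 2, 5, 5, 7],
        "path_index": [0, 2, 1, 2, 0],
    })
    n_paths = 3  # stand-in for len(schematic.path_names)

    def find_groups(data):
        # simplified stand-in for matrixcomponent.utils.find_groups:
        # runs of identical (from, to) rows as [start, end) index pairs
        groups, group_start = [], 0
        src, dst = data[0]
        for i in range(data.shape[0]):
            if data[i][0] != src or data[i][1] != dst:
                groups.append((group_start, i))
                group_start = i
                src, dst = data[i]
        groups.append((group_start, data.shape[0]))
        return groups

    connections_array = connections.to_numpy()
    path_indices = connections.path_index.to_numpy()
    participants_mask = np.zeros(n_paths, dtype=bool)

    for start, end in find_groups(connections_array[:, :2]):
        src, dst = int(connections_array[start, 0]), int(connections_array[start, 1])
        participants_mask[:] = False
        participants_mask[path_indices[start:end]] = True
        print(src, dst, participants_mask.tolist())
    # 1 2 [True, False, True]
    # 3 5 [False, True, True]
    # 3 7 [True, False, False]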
