
Commit

Merge branch 'segment_matrix' of git://github.com/lomereiter/component_segmentation into master2
josiahseaman committed Apr 7, 2020
2 parents 97ca2a5 + 9375a2f commit bd82d7b
Showing 7 changed files with 156 additions and 53 deletions.

28 changes: 14 additions & 14 deletions data/run1.B1phi1.i1.seqwish.w100/chunk01_bin100.schematic.json

Large diffs are not rendered by default.

37 changes: 36 additions & 1 deletion matrixcomponent/tests.py
@@ -1,9 +1,12 @@
import numpy as np
import pandas as pd

from matrixcomponent.utils import (
find_groups,
path_boundaries,
path_dividers,
self_loops,
path_dividers
sort_and_drop_duplicates,
)


@@ -38,3 +41,35 @@ def test_path_dividers():
])
assert np.array_equal(links[path_dividers(links, bins)],
[[3, 1], [3, 5], [5, 9], [4, 1]])


def test_find_groups():
data = np.array([
[1, 2], [1, 2], [1, 3],
[2, 1], [2, 1], [2, 1], [2, 2],
[3, 3], [3, 3], [3, 4], [3, 4], [3, 5]
])
assert np.array_equal(find_groups(data),
[(0, 2), (2, 3),
(3, 6), (6, 7),
(7, 9), (9, 11), (11, 12)])

assert np.array_equal(find_groups(np.array([[]])), [])
assert np.array_equal(find_groups(np.array([[1, 2]])), [(0, 1)])


def test_sort_and_drop_duplicates():
df = pd.DataFrame.from_dict({
"from": [1, 3, 2, 3, 2, 0, 5, 4, 1, 2],
"to": [2, 2, 4, 2, 3, 1, 4, 3, 2, 3],
"path_index": [0, 3, 1, 2, 2, 3, 2, 1, 3, 2],
})

expected = pd.DataFrame.from_dict({
"from": [0, 1, 1, 2, 2, 3, 3, 4, 5],
"to": [1, 2, 2, 3, 4, 2, 2, 3, 4],
"path_index": [3, 0, 3, 2, 1, 2, 3, 1, 2],
}) # only one duplicate (2, 3, 2)

assert np.array_equal(sort_and_drop_duplicates(df).to_numpy(),
expected.to_numpy())
65 changes: 65 additions & 0 deletions matrixcomponent/utils.py
@@ -1,6 +1,7 @@
""" Utility functions """

import numpy as np
import pandas as pd


def path_boundaries(links: np.array) -> np.array:
@@ -34,3 +35,67 @@ def path_dividers(links: np.array, bin_ids: np.array) -> np.array:
mask[~mask] = indices[:, 1] > indices[:, 0]

return mask


# warning: this function is intended to be numba-compatible
def _split_numpy_arr(arr):
groups = []
src, dst = arr[0]
group_start = 0

for i in range(arr.shape[0]):
item = arr[i]
if item[0] != src or item[1] != dst:
groups.append((group_start, i))
group_start = i
src, dst = item

# add last group
groups.append((group_start, arr.shape[0]))
return groups

try:
# provides ~7x speedup on large tables
from numba import jit
_split_numpy_arr = jit(_split_numpy_arr)
except ImportError:
pass


def find_groups(data: np.array) -> 'List[(int, int)]':
'''
Returns:
list of [start, end) indices such that for each item data[start:end] has constant value
Args:
data(np.array): [N x 2] array of data; in context of segmentation each row is (upstream, downstream)
'''
if data.size == 0:
return []

return _split_numpy_arr(data)


def sort_and_drop_duplicates(connections: 'pd.DataFrame', shift=21) -> 'pd.DataFrame':
'''
returns connections sorted by ["from", "to", "path_index"] without duplicate entries;
see find_dividers in segmentation.py
'''
mask = (1 << shift) - 1

if np.any(connections.max() > mask):
# nigh impossible with the default limit: (1 << 21) = 2M bins / paths;
# as such, this line of code is mostly for illustration purposes
return connections.drop_duplicates().sort_values(by=["from", "to", "path_index"])

# the columns are assumed to be (from, to, path_index)
array = connections.to_numpy()

# compress all columns into a single 64-bit integer
compressed = (array[:, 0] << (2 * shift)) + (array[:, 1] << shift) + array[:, 2]
compressed_no_dups = np.unique(compressed)

return pd.DataFrame.from_dict({
'from': compressed_no_dups >> (2 * shift),
'to': (compressed_no_dups >> shift) & mask,
'path_index': compressed_no_dups & mask,
})
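
The bit-packing in sort_and_drop_duplicates is easiest to see with toy numbers: with the default shift=21, every value in the table must stay below 2**21 (roughly 2M bins/paths), so each (from, to, path_index) row fits losslessly into one 64-bit key and np.unique can sort and de-duplicate in a single pass. A minimal sketch with made-up values (not data from this repository):

    import numpy as np
    import pandas as pd

    shift = 21
    mask = (1 << shift) - 1

    # toy connection table with one duplicate row: (3, 2, 0)
    df = pd.DataFrame.from_dict({
        "from":       [3, 1, 3],
        "to":         [2, 2, 2],
        "path_index": [0, 1, 0],
    })

    # pack the three columns of each row into a single 64-bit key
    arr = df.to_numpy()
    keys = (arr[:, 0] << (2 * shift)) + (arr[:, 1] << shift) + arr[:, 2]

    # np.unique sorts and removes duplicates at the same time
    unique_keys = np.unique(keys)

    # unpack the keys back into the original columns
    result = pd.DataFrame.from_dict({
        "from": unique_keys >> (2 * shift),
        "to": (unique_keys >> shift) & mask,
        "path_index": unique_keys & mask,
    })
    print(result)   # rows (1, 2, 1) and (3, 2, 0): sorted, duplicate dropped

The guard at the top of sort_and_drop_duplicates falls back to plain pandas sorting and de-duplication if any value exceeds that limit.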
1 change: 0 additions & 1 deletion requirements.txt
@@ -1,7 +1,6 @@
# gfapy==1.0.0
nose==1.3.7
rdflib==4.2.2
nested-dict==1.61
DNASkittleUtils==1.0.13
numpy==1.18.2
pandas==1.0.3
64 changes: 34 additions & 30 deletions segmentation.py
@@ -9,7 +9,6 @@
"""
from typing import List, Tuple, Set, Dict
from pathlib import Path as osPath, PurePath
from nested_dict import nested_dict
from datetime import datetime
from sortedcontainers import SortedDict
from DNASkittleUtils.Contigs import Contig, read_contigs, write_contigs_to_file
@@ -71,43 +70,48 @@ def segment_matrix(matrix: List[Path], bin_width, cells_per_file, pangenome_leng
1,
[], [p.name for p in matrix], 1, pangenome_length)
connections, dividers = dividers_with_max_size(matrix, cells_per_file)

component_by_first_bin = {}
component_by_last_bin = {}
start_pos = 0
for valid_start in dividers:
if valid_start != 0:
current = Component(start_pos, valid_start - 1)
# current.active_members = 1
schematic.components.append(current)
component_by_first_bin[start_pos] = current
component_by_last_bin[valid_start - 1] = current
start_pos = valid_start
print(f"Created {len(schematic.components)} components")

# populate Component occupancy per Path
populate_component_matrix(matrix, schematic)

incoming = nested_dict(2, set)
outgoing = nested_dict(2, set)
for f, t, p in connections.itertuples(index=False):
incoming[t][f].add(p)
outgoing[f][t].add(p)
connections_array = connections.to_numpy()
groups = utils.find_groups(connections_array[:, :2])
path_indices = connections.path_index.to_numpy()

participants_mask = np.zeros(len(schematic.path_names), dtype=bool)

# populate all link columns onto schematic
nLinkColumns = 0
for component in schematic.components:
# TODO: order columns based on traversal patterns,
# TODO: insert additional columns for higher copy number
for origin_pos, participants in incoming[component.first_bin].items():
phase_dots = [indiv in participants for indiv in schematic.path_names]
entering = LinkColumn(origin_pos,
component.first_bin,
participants=phase_dots)
component.arrivals.append(entering)
for (start, end) in groups:
row = connections_array[start]
src, dst = int(row[0]), int(row[1])

participants_mask[:] = False
participants_mask[path_indices[start:end]] = True
phase_dots = participants_mask.tolist()
link_column = LinkColumn(src, dst, participants=phase_dots)

src_component = component_by_last_bin.get(src)
dst_component = component_by_first_bin.get(dst)

if src_component:
src_component.departures.append(link_column)
nLinkColumns += 1
for arriving_pos, participants in outgoing[component.last_bin].items():
# phase_dots depends on row ordering of path names, optimized for display
phase_dots = [indiv in participants for indiv in schematic.path_names]
leaving = LinkColumn(component.last_bin,
arriving_pos,
participants=phase_dots)
component.departures.append(leaving)

if dst_component:
dst_component.arrivals.append(link_column)
nLinkColumns += 1

for i in range(len(schematic.components)-1):
@@ -126,7 +130,7 @@ def dividers_with_max_size(matrix: List[Path], cells_per_file: int):
# estimate number of paths, x10 because most paths are empty
dividers_extended = []
prev = 0
for div in sorted(list(dividers)):
for div in dividers:
gap_size = div - prev
if gap_size > MAX_COMPONENT_SIZE:
for i in range(prev + MAX_COMPONENT_SIZE, div, MAX_COMPONENT_SIZE):
@@ -188,7 +192,7 @@ def find_dividers(matrix: List[Path]) -> Tuple[pd.DataFrame, Set[int]]:
df = pd.DataFrame.from_dict({
'from': path_dividers[:, 0], # aka upstream
'to': path_dividers[:, 1], # aka downstream
'path': path.name
'path_index': i
})

connection_dfs.append(df)
@@ -204,22 +208,22 @@ def find_dividers(matrix: List[Path]) -> Tuple[pd.DataFrame, Set[int]]:
# Stack up others using the same LinkColumn

df = pd.concat(connection_dfs)
df.drop_duplicates(inplace=True)
df = utils.sort_and_drop_duplicates(df)

# all start positions of components
dividers = set(df["from"] + 1) | set(df["to"])
dividers.add(1)
# (max_bin + 1) is end of pangenome
dividers = np.concatenate([[1, max_bin + 1], df["from"] + 1, df["to"]])
dividers = np.unique(dividers).tolist()

print(f"Largest bin_id was {max_bin}\n"
f"Found {len(dividers)} dividers.")
dividers.add(max_bin + 1) # end of pangenome

if self_loops:
n_self_loops = np.unique(np.concatenate(self_loops), axis=0).shape[0]
print(f"Eliminated {n_self_loops} self-loops")

n_uniq_links = df[df["to"] >= df["from"]]\
.drop(columns=["path"]).drop_duplicates().shape[0]
.drop(columns=["path_index"]).drop_duplicates().shape[0]

n_links = sum([len(p.links) for p in matrix])
print(f"Input has {n_links} listed Links. "
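The segment_matrix changes above replace the nested_dict bookkeeping with a single pass over the sorted connection table: find_groups yields one [start, end) run per distinct (from, to) pair, and each run becomes one LinkColumn whose participants are marked in a reusable boolean mask, so the participant flags come straight from path indices instead of name-by-name set membership tests. A rough, self-contained sketch of that pattern (toy data, a simplified inline stand-in for matrixcomponent.utils.find_groups, and the Component/LinkColumn wiring omitted):

    import numpy as np
    import pandas as pd

    # toy connection table, already sorted by (from, to, path_index)
    connections = pd.DataFrame.from_dict({
        "from":       [1, 1, 3, 3, 3],
        "to":         [2, 2, 5, 5, 7],
        "path_index": [0, 2, 1, 2, 0],
    })
    n_paths = 3  # stand-in for len(schematic.path_names)

    def find_groups(data):
        # simplified stand-in for matrixcomponent.utils.find_groups:
        # runs of identical (from, to) rows as [start, end) index pairs
        groups, group_start = [], 0
        src, dst = data[0]
        for i in range(data.shape[0]):
            if data[i][0] != src or data[i][1] != dst:
                groups.append((group_start, i))
                group_start = i
                src, dst = data[i]
        groups.append((group_start, data.shape[0]))
        return groups

    connections_array = connections.to_numpy()
    path_indices = connections.path_index.to_numpy()
    participants_mask = np.zeros(n_paths, dtype=bool)

    for start, end in find_groups(connections_array[:, :2]):
        src, dst = int(connections_array[start, 0]), int(connections_array[start, 1])
        participants_mask[:] = False
        participants_mask[path_indices[start:end]] = True
        print(src, dst, participants_mask.tolist())
    # 1 2 [True, False, True]
    # 3 5 [False, True, True]
    # 3 7 [True, False, False]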
