# Experiments in merging vectors

New rows are of two types: either they can be merged with
rows already in the matrix or they must be added as new rows.

1. Find rows to merge (`filter_for_merge()` function); these are current merge candidates.
There may be zero or more merge candidates.
    1. If there are zero rows for merging, add new row (`add_new_vector()` function)
        1. `add_new_vector()` calls `increase_matrix_size()` when necessary
    1. If there are one or more merge candidates:
        1. Merge all current merge candidates plus the new row, column by column, wherever a value is available
        1. Replace first current merge candidate in existing matrix with result of merger
        1. Mask all other current merge candidate rows

In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import numpy.ma as ma
import pprint as pp

In [2]:
def increase_matrix_size(current_matrix):
    """Double row count of matrix, retaining old data

    Parameter:
        current_matrix : two-dimensional masked array (rows, then columns)

    Returns:
        new masked array holding the rows of current_matrix followed by the
        same number of additional rows (one additional row if current_matrix
        has no rows at all); every added cell has data -1 and is masked

    The input matrix is not modified.

    TODO: filter out fully masked rows (broadcast) during copying
    """
    row_count, column_count = current_matrix.shape # rows, then columns
    rows_to_add = max(row_count, 1) # doubling zero rows would never grow
    padding_shape = (rows_to_add, column_count)
    return ma.MaskedArray(
        data=np.concatenate(
            (current_matrix.data, np.full(padding_shape, -1)),
            axis=0 # rows, not columns
        ),
        # getmaskarray() always yields a full boolean array, even when the
        # matrix has no masked cells and its .mask is the scalar nomask
        mask=np.concatenate(
            (ma.getmaskarray(current_matrix), np.ones(padding_shape, dtype=bool)),
            axis=0
        )
    )

In [3]:
# test of increase_matrix_size() function
#
# initial_matrix = ma.MaskedArray(
#     data = [
#         ma.MaskedArray(data=[1, 2, 3, 4, 5], mask=[False] * 5),
#         ma.MaskedArray(data=[-1, -1, -1, -1, -1], mask=[True] * 5),
#         ma.MaskedArray(data=[11, 21, 313, -1, -1], mask=[False, False, False, True, True]),
#         ma.MaskedArray(data=[-1, -1, 37, 52, 109], mask=[True, True, False, False, False]),
#         ma.MaskedArray(data=[160, 832, 64, -1, 12], mask=[False, False, False, True, True]),
#     ]
# )
# print(initial_matrix.shape)
# pp.pprint(initial_matrix)
# medium_matrix = increase_matrix_size(initial_matrix)
# print(medium_matrix.shape)
# pp.pprint(medium_matrix)
# large_matrix = increase_matrix_size(medium_matrix)
# print(large_matrix.shape)
# pp.pprint(large_matrix)

In [4]:
def merge_vectors(existing_vector: ma.MaskedArray, new_vector: ma.MaskedArray) -> ma.MaskedArray:
    """Combine non-masked values of two vectors, returns one vector

    Sample input:
        v_candidate = ma.MaskedArray(data=[-1, 21, 22, -1], mask=[True, False, False, True])
        v_existing = ma.MaskedArray(data=[-1, -1, 22, 23], mask=[True, True, False, False])

    Sample result:
        masked_array(data=[--, 21, 22, 23], mask=[ True, False, False, False])
        data: non-masked values of two vectors, some of which were already in both
        mask: mask only positions that were masked in both input vectors

    Parameters:
        existing_vector : masked vector
        new_vector : masked vector of the same length

    Returns:
        new masked vector with fill_value -1; neither input is modified

    NB:
        values are chosen by looking at the masks, so the data stored under
        a masked position does not matter (it need not be -1)
        where a position is non-masked in both vectors the larger value is kept
        does not trap bad data (input vectors that have different non-masked values in same positions)

    """
    existing_mask = ma.getmaskarray(existing_vector)
    new_mask = ma.getmaskarray(new_vector)
    existing_data = ma.getdata(existing_vector)
    new_data = ma.getdata(new_vector)
    merged_data = np.where(
        existing_mask,
        new_data, # existing is masked here: only the new value can be valid
        np.where(
            new_mask,
            existing_data, # only the existing value is valid
            np.maximum(existing_data, new_data) # both valid
        )
    )
    return ma.MaskedArray(
        data=merged_data,
        mask=existing_mask & new_mask, # masked only where both inputs are masked
        fill_value=-1
    )

In [5]:
def add_new_vector(input_tuple, new_vector):
    """Store a vector in the next empty row of the matrix

    Parameters:
        input_tuple : (matrix, index of the next empty row)
        new_vector : masked array vector written into that row

    Returns tuple of:
        matrix : the same matrix object, modified in place, unless it was
            already full, in which case the enlarged matrix returned by
            increase_matrix_size()
        pointer : index of the row after the one just written
    """
    matrix, next_free_row = input_tuple
    if next_free_row == matrix.shape[0]: # every row is in use, so grow first
        matrix = increase_matrix_size(matrix)
    matrix[next_free_row] = new_vector
    return (matrix, next_free_row + 1)

In [6]:
# test of pointer to determine when we need to add rows to matrix
# initial_matrix = ma.MaskedArray(
#     data= [ma.MaskedArray(
#             data=[-1] * 5,
#             mask=[True] * 5
#         )] * 6
# )
# next_open_row = 0
# pp.pprint(initial_matrix)
# print(f"{next_open_row=}")
# new_row = ma.MaskedArray(
#     data = [10, 13, -1, 16, 18],
#     mask = [False, False, True, False, False]
# )
# for i in range(7):
#     initial_matrix, next_open_row = add_new_vector((initial_matrix, next_open_row), new_row)
#     pp.pprint(initial_matrix)
#     print(f"{next_open_row=}")

In [7]:
def add_or_merge_new_vector_into_matrix(input_tuple, candidate):
    """Add candidate to the matrix as a new row, or merge it into matching rows

    A row of the matrix is a merge candidate when, in at least one column,
    it holds a non-masked value equal to the non-masked value of candidate
    in that column.

    Parameters:
        input_tuple : (existing matrix, pointer to next empty row in matrix)
        candidate: new vector

    Returns tuple of:
        matrix : updated matrix
        pointer : pointer to next empty row; unchanged after a merge,
            otherwise whatever add_new_vector() returns

    No merge candidates: candidate is handed to add_new_vector().

    One or more merge candidates: the first of them is replaced with the
    column-wise maximum of the non-masked values of all merge candidates and
    of candidate (masked only where all of them are masked); all other merge
    candidates are overwritten with fully masked rows. The matrix passed in
    is modified in place.

    NB:
        masked positions of rows written here carry a data value of -1
        does not trap bad data (rows that have different non-masked values in same positions)
    """
    current_matrix, pointer = input_tuple
    # cell-by-cell equality; a comparison that involves a masked cell comes
    # back masked, and filling those with False keeps them from matching
    matches = ma.filled(current_matrix == candidate, False)
    indices = np.flatnonzero(matches.any(axis=1)) # row numbers of merge candidates
    if indices.size == 0: # nothing to merge with, add new row, update current_matrix and pointer
        current_matrix, pointer = add_new_vector(input_tuple, candidate)
        return (current_matrix, pointer)
    # Mask-aware maximum over the merge candidates and the new vector in one
    # step. Masked cells are ignored, so the data stored under a mask cannot
    # end up in the merged row.
    stacked = ma.vstack((current_matrix[indices], candidate))
    merged = ma.max(stacked, axis=0)
    merged_row = ma.MaskedArray(
        data=ma.filled(merged, -1), # keep -1 under masked positions
        mask=ma.getmaskarray(merged)
    )
    current_matrix[indices[0]] = merged_row # first merge candidate receives the merger
    column_count = current_matrix.shape[1]
    current_matrix[indices[1:]] = ma.MaskedArray( # blank out the other merge candidates
        data=[-1] * column_count,
        mask=[True] * column_count
    )
    return (current_matrix, pointer)

In [8]:
# test data: one candidate vector and six matrix rows, five columns each
# (in the mask lists 1 means masked; every masked position holds -1)
v_candidate = ma.array([-1, 21, 22, -1, -1], mask=[1, 0, 0, 1, 1])
v_existing_01 = ma.array([-1, -1, 22, 23, -1], mask=[1, 1, 0, 0, 1])
v_existing_02 = ma.array([-1, 21, 22, 23, 24], mask=[1, 0, 0, 0, 0])
v_existing_03 = ma.array([-1, -1, 22, -1, -1], mask=[1, 1, 0, 1, 1])
# cannot be merged, no masked values
v_existing_04 = ma.array([2, 3, 3, 3, 2], mask=[0, 0, 0, 0, 0])
# cannot be merged, has masked values
v_existing_05 = ma.array([5, 5, -1, 6, 7], mask=[0, 0, 1, 0, 0])
# next blank row
v_empty = ma.array([-1, -1, -1, -1, -1], mask=[1, 1, 1, 1, 1])

# rows are deliberately not in numeric order
v_existing = ma.array(
    [v_existing_01, v_existing_05, v_existing_03, v_existing_04, v_existing_02, v_empty]
)

In [9]:
# Try v_candidate against the test matrix; 5 is the pointer to the next
# empty row (v_empty is row 5 of v_existing). The cell displays the returned
# (matrix, pointer) tuple.
add_or_merge_new_vector_into_matrix((v_existing, 5), v_candidate)

(masked_array(
   data=[[--, 21, 22, 23, 24],
         [5, 5, --, 6, 7],
         [--, --, --, --, --],
         [2, 3, 3, 3, 2],
         [--, --, --, --, --],
         [--, --, --, --, --]],
   mask=[[ True, False, False, False, False],
         [False, False,  True, False, False],
         [ True,  True,  True,  True,  True],
         [False, False, False, False, False],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True]],
   fill_value=999999),
 5)

In [10]:
# WARNING: this test is broken because the random examples can be
# inconsistent with each other.
# If a candidate row has values in two columns and a merge candidate
# matches one of them, both have to match. Random data does not
# guarantee that.
#
# To fix: manually create some number of test rows with data that is
# consistent with the data that will arise with real collation.
#
# Create separate notebook to hold vector space manipulation functions.
# Only one such function will be public (cf. the notebook that generates
# blocks).

empty_row = ma.MaskedArray(data=[-1] * 5, mask=[True] * 5) # 5 columns
# initial state is one fully masked row
initial_matrix = ma.MaskedArray(
    data=[empty_row]
)
pointer = 0 # next empty row
# feed 10 random rows through add-or-merge, printing each row and the matrix after it
for new_row_number in range(10):
    new_data = np.random.randint(-1, 10, 5) # 5 random integers from -1 through 9 (upper bound 10 is exclusive)
    new_mask = [True if i == -1 else False for i in new_data] # 5 booleans: mask every position whose value is -1
    new_vector = ma.MaskedArray(
        data = new_data,
        mask = new_mask
    )
    print(f"{new_vector=}")
    initial_matrix, pointer = add_or_merge_new_vector_into_matrix((initial_matrix, pointer), new_vector)
    print(f"{initial_matrix=}")
# final state of the matrix and of the pointer
pp.pprint(initial_matrix)
print(pointer)

new_vector=masked_array(data=[4, 7, 3, 0, 2],
             mask=[False, False, False, False, False],
       fill_value=999999)
initial_matrix=masked_array(data=[[4, 7, 3, 0, 2]],
             mask=[[False, False, False, False, False]],
       fill_value=999999)
new_vector=masked_array(data=[--, 8, 3, 3, 5],
             mask=[ True, False, False, False, False],
       fill_value=999999)
initial_matrix=masked_array(data=[[4, 8, 3, 3, 5]],
             mask=[[False, False, False, False, False]],
       fill_value=999999)
new_vector=masked_array(data=[0, 7, 7, 4, 9],
             mask=[False, False, False, False, False],
       fill_value=999999)
initial_matrix=masked_array(
  data=[[4, 8, 3, 3, 5],
        [0, 7, 7, 4, 9]],
  mask=[[False, False, False, False, False],
        [False, False, False, False, False]],
  fill_value=999999)
new_vector=masked_array(data=[--, 6, 2, 7, 1],
             mask=[ True, False, False, False, False],
       fill_value=999999)
initial_matrix=masked_array(