# Experiments with vector arithmetic to test for transposition

## Regular (non-masked) array

In [1]:
# Setup

import numpy as np

In [2]:
# Data
#
# w0: The red and the black cat
# w1: The red and the black cat
# w2: The black and the red cat
# w3: The black and the red cat

# Hyperedges we pretend are already placed (The -- -- the -- cat):
#   one row per hyperedge, one column per witness (w0..w3);
#   each value is the position of that token in that witness
# TODO:
#   Should be masked arrays, since some values will be missing for some witnesses

hyperedges = np.array([[position] * 4 for position in (1, 4, 6)])

def check_whether_okay_to_place(current_hyperedges, potential_hyperedge) -> bool:
    """Return True iff we can add row without creating crossing hyperedge.

    The candidate is subtracted from every existing hyperedge (broadcast over
    the rows). If, for any existing hyperedge, the differences diverge in
    sign across the columns, the candidate would cross that hyperedge.

    Args:
        current_hyperedges: 2-D array with one row per placed hyperedge.
            Complete (non-masked) values are assumed.
        potential_hyperedge: 1-D array with one value per column of
            ``current_hyperedges``.

    Returns:
        A plain Python ``bool`` (not ``numpy.bool_``): True when every row
        keeps a single sign after the subtraction, False otherwise.
    """
    signs = np.sign(current_hyperedges - potential_hyperedge)
    # A row has one uniform sign exactly when its smallest and largest
    # sign values coincide.
    # TODO: Can the comparison be made more efficient?
    rows_are_consistent = signs.min(axis=1) == signs.max(axis=1)
    # .all() yields numpy.bool_; convert so the annotated return type holds.
    return bool(rows_are_consistent.all())

# TODO: We are assuming complete (not masked) arrays

# "and" sits at position 3 in every witness, so it should be placeable
andToken = np.array([3] * 4)

# "red" and "black" swap sides of "(and) the" between witnesses,
#   so neither should be placeable: each would cross that hyperedge
redToken = np.array([2, 2, 5, 5])
blackToken = np.array([5, 5, 2, 2])

for expectation, candidate in (
    ("Expect True: ", andToken),
    ("Expect False: ", redToken),
    ("Expect False: ", blackToken),
):
    print(expectation, check_whether_okay_to_place(hyperedges, candidate))

Expect True:  True
Expect False:  False
Expect False:  False


## Masked array

In [3]:
import numpy.ma as ma

# Data
#
# w0: The red and the black cat
# w1: The red and the black cat
# w2: The black -- the -- cat
# w3: The black and the red cat

# Hyperedges we pretend we've placed: initial "The" (1) and final "cat" (6).
# "the" (4) is not included here; it is tested as a candidate below.
# The container is built with ma.array rather than np.array: np.array returns
#   a plain ndarray, which would silently drop the mask of any row that has
#   missing values for some witnesses.

ma_hyperedges = ma.array([
        [1, 1, 1, 1],
        [6, 6, 6, 6]
    ])

def ma_check_whether_okay_to_place(current_hyperedges, potential_hyperedge) -> bool:
    """Return True iff we can add row without creating crossing hyperedge.

    Masked-array variant. The candidate is subtracted from every existing
    hyperedge (broadcast over the rows); positions masked in either operand
    stay masked and are ignored. If, for any existing hyperedge, the
    remaining differences diverge in sign, the candidate would cross it.

    Args:
        current_hyperedges: 2-D (masked) array with one row per placed
            hyperedge.
        potential_hyperedge: 1-D (masked) array with one value per column of
            ``current_hyperedges``.

    Returns:
        A plain Python ``bool``: True when every row keeps a single sign over
        its unmasked positions, False otherwise. A row that shares no
        unmasked position with the candidate cannot be crossed and therefore
        never blocks placement.
    """
    signs = np.sign(current_hyperedges - potential_hyperedge)
    # A row has one uniform sign exactly when its smallest and largest
    # sign values coincide.
    # TODO: Can the comparison be made more efficient?
    rows_are_consistent = signs.min(axis=1) == signs.max(axis=1)
    # Rows with nothing left to compare come back masked. If every row is
    # like that, .all() on the masked result would return the falsy
    # ``ma.masked`` constant, so treat masked verdicts as "no crossing".
    return bool(ma.filled(rows_are_consistent, True).all())

# "the" is at position 4 in every witness, between "The" (1) and "cat" (6),
#   so we should be able to place it
ma_theToken = ma.array([4] * 4)
print("Expect True:", ma_check_whether_okay_to_place(ma_hyperedges, ma_theToken))

# "and" is present in only three witnesses (index 1 is masked here);
#   we should still be able to place it
# NOTE(review): the data comment for this section shows the gap in w2,
#   not w1 -- confirm which index is meant to be masked
# NB: masked_invalid() will recognize np.nan as invalid, but not None
ma_andToken = ma.masked_invalid(np.array([3, np.nan, 3, 3]))
print("Expect True: ", ma_check_whether_okay_to_place(ma_hyperedges, ma_andToken))

# We shouldn't be able to place either of these once "the" (4) is placed,
#   because they would cross "(and) the".
# These checks exercise the masked implementation with masked data; the
#   non-masked function is already covered in the non-masked section.
ma_hyperedges_with_the = ma.array([
        [1, 1, 1, 1],
        [4, 4, 4, 4],
        [6, 6, 6, 6]
    ])

# "red" is missing from w2, so that position is masked
ma_redToken = ma.masked_invalid([2, 2, np.nan, 5])
print("Expect False: ", ma_check_whether_okay_to_place(ma_hyperedges_with_the, ma_redToken))

# "black" is present in all four witnesses
ma_blackToken = ma.array([5, 5, 2, 2])
print("Expect False: ", ma_check_whether_okay_to_place(ma_hyperedges_with_the, ma_blackToken))

Expect True: True
Expect True:  True
Expect False:  False
Expect False:  False


## Masked array when the candidates have no overlap or intersection

In [4]:
# Data
#
# w0: The red and the black cat
# w1: The red and the black cat
# w2: The black -- the -- cat
# w3: The black and the red cat

# The outer container and each row are all masked arrays
# Rows: initial "The"; "black" as placed from w0 and w1 only (w2 and w3
#   masked); final "cat"
ma_hyperedges = ma.masked_array([
        ma.masked_array([1] * 4),
        ma.masked_array([5, 5, 10, 10], mask=[0, 0, 1, 1]),
        ma.masked_array([6] * 4)
    ])

# Candidate: "black" from w2 and w3 only (w0 and w1 masked), so it shares
#   no witness with the "black" row above
aligned_black_w2_w3 = ma.masked_array([100, 100, 2, 2], mask=[1, 1, 0, 0])

# Same steps as inside ma_check_whether_okay_to_place(), spelled out so the
#   intermediate values can be inspected while debugging
test_signs = np.sign(ma_hyperedges - aligned_black_w2_w3)
mins, maxes = test_signs.min(axis=1), test_signs.max(axis=1)
test_ok = (mins == maxes).all()

print("Expect True: ", ma_check_whether_okay_to_place(ma_hyperedges, aligned_black_w2_w3))

Expect True:  True


# Preallocate large matrix and update individual rows

This simulates what we hope will be an efficient approach to maintaining the vector space for our alignment implementation. Instead of maintaining just the alignment hypergraph and transforming that into a vector-space representation each time we add a hyperedge, we maintain the alignment hypergraph and its vector-space equivalent (as a numpy masked array of numpy masked arrays) simultaneously. We preallocate enough space in the outer masked array to accommodate all hyperedges we might need, we track which rows in the outer array are in use vs available, and we update that array as needed, either removing or adding rows.

In [68]:
# one-dimensional masked array
# preallocate and then update
# TODO: find efficient way to initialize

import numpy as np
import numpy.ma as ma
import bitarray as ba
length = 10000 # number of rows (representing hyperedges)
# Allocate every row in one vectorised call instead of building `length`
# temporary masked arrays in a Python loop. mask=False gives a full
# all-False mask of the same shape as the data.
a = ma.MaskedArray(np.zeros((length, 4)), mask=False) # initialize
tracking = ba.bitarray(length) # one bit per row of `a`; we think we need to keep track of which rows are in use
tracking.setall(0) # initialize tracking bitarray to all zeroes (no row marked as in use)
# print(f"{tracking=}")
# print(f"{a=}")
# print(f"{a.shape=}")
a[1] = ma.MaskedArray([7, 8, 9, 10], mask=[False, False, True, False]) # overwrite the second row (index 1); its third value is masked
print(f"{type(a[1])=}") # a single row read back from `a`
print(f"{a[0:5]=}") # first five rows, showing data and mask after the update
print(tracking.index(0)) # position of the first 0 bit (recorded output: 0)
tracking[0] = 1 # set the bit for row 0
print(tracking.index(0)) # first 0 bit has moved on (recorded output: 1)

type(a[1])=<class 'numpy.ma.core.MaskedArray'>
a[0:5]=masked_array(
  data=[[0.0, 0.0, 0.0, 0.0],
        [7.0, 8.0, --, 10.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0]],
  mask=[[False, False, False, False],
        [False, False,  True, False],
        [False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]],
  fill_value=1e+20)
0
1


# Start here: Rethinking the alignment hypergraph

## Assumptions

1. We need to be able to visualize both the alignment hypergraph and the variant graph.
1. If we get the structures and visualizations we need with fewer data structures (e.g., by not maintaining, and perhaps not even using at all, a separate alignment hypergraph), so much the better.

## Possible new model

1. Replace alignment hypergraph (as list of hyperedges) with vector space. Decisions about whether to place a new hyperedge are made according to the vector space.
1. Create alignment hypergraph visualization directly from vector space if possible; otherwise recreate alignment hypergraph as interim step.
1. Create variant graph directly from vector space if possible; otherwise recreate alignment hypergraph as interim step.

## How to proceed

1. Ideally: write code to convert from vector space directly to alignment hypergraph visualization.
1. If the preceding doesn't work, write code to convert from vector space to alignment hypergraph as a throw-away intermediate step and visualize that.
1. If neither of the preceding works, panic.

## Unanswered question

1. If we can go from vector space directly to variant graph, we may not need the alignment hypergraph at all.
1. If we cannot go from vector space directly to variant graph, we may need to create alignment hypergraph as a throw-away intermediate step.

## Possible pitfalls

1. We need to ensure that the initial outer array is large enough for all eventual hyperedges