<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span></li><li><span><a href="#Comparison-of-basic-representations-and-operations" data-toc-modified-id="Comparison-of-basic-representations-and-operations-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Comparison of basic representations and operations</a></span><ul class="toc-item"><li><span><a href="#Baseline-representation-of-partial-feature-vectors" data-toc-modified-id="Baseline-representation-of-partial-feature-vectors-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Baseline representation of partial feature vectors</a></span><ul class="toc-item"><li><span><a href="#Overhead---generation,-well-formedness,-uniquification" data-toc-modified-id="Overhead---generation,-well-formedness,-uniquification-2.1.1"><span class="toc-item-num">2.1.1&nbsp;&nbsp;</span>Overhead - generation, well-formedness, uniquification</a></span></li><li><span><a href="#Agreement" data-toc-modified-id="Agreement-2.1.2"><span class="toc-item-num">2.1.2&nbsp;&nbsp;</span>Agreement</a></span></li><li><span><a href="#Union" data-toc-modified-id="Union-2.1.3"><span class="toc-item-num">2.1.3&nbsp;&nbsp;</span>Union</a></span></li><li><span><a href="#Intersection" data-toc-modified-id="Intersection-2.1.4"><span class="toc-item-num">2.1.4&nbsp;&nbsp;</span>Intersection</a></span></li></ul></li><li><span><a href="#Specification-array-+-value-array" data-toc-modified-id="Specification-array-+-value-array-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Specification array + value array</a></span><ul class="toc-item"><li><span><a href="#Converting-between-representations" data-toc-modified-id="Converting-between-representations-2.2.1"><span class="toc-item-num">2.2.1&nbsp;&nbsp;</span>Converting between representations</a></span></li><li><span><a href="#Operations" data-toc-modified-id="Operations-2.2.2"><span class="toc-item-num">2.2.2&nbsp;&nbsp;</span>Operations</a></span></li><li><span><a href="#Performance-evaluation" data-toc-modified-id="Performance-evaluation-2.2.3"><span class="toc-item-num">2.2.3&nbsp;&nbsp;</span>Performance evaluation</a></span><ul class="toc-item"><li><span><a href="#Conclusion" data-toc-modified-id="Conclusion-2.2.3.1"><span class="toc-item-num">2.2.3.1&nbsp;&nbsp;</span>Conclusion</a></span></li></ul></li></ul></li><li><span><a href="#Matrix-extension-of-the-baseline-representation" data-toc-modified-id="Matrix-extension-of-the-baseline-representation-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Matrix extension of the baseline representation</a></span><ul class="toc-item"><li><span><a href="#Agreement-testing" data-toc-modified-id="Agreement-testing-2.3.1"><span class="toc-item-num">2.3.1&nbsp;&nbsp;</span>Agreement testing</a></span></li><li><span><a href="#Union" data-toc-modified-id="Union-2.3.2"><span class="toc-item-num">2.3.2&nbsp;&nbsp;</span>Union</a></span></li><li><span><a href="#Intersection" data-toc-modified-id="Intersection-2.3.3"><span class="toc-item-num">2.3.3&nbsp;&nbsp;</span>Intersection</a></span></li></ul></li></ul></li></ul></div>

In [1]:
import numpy as np
myint = np.int8

from vg import normalize

In [2]:
from bitarray import bitarray

In [3]:
from itertools import starmap, product

In [4]:
# from more_itertools import unique_everseen

In [5]:
from tqdm import tqdm

from joblib import Parallel, delayed, Memory

J = 30
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def par(gen_expr, j=None, backend=None, verbose=None, prefer=None):
    if j is None:
        j = J
    if backend is None:
        backend = BACKEND
    if verbose is None:
        verbose = V
    if prefer is None:
        prefer = PREFER
    return Parallel(n_jobs=j, backend=backend, verbose=verbose, prefer=prefer)(gen_expr)

def identity(x):
    return x

In [6]:
from random import choice

In [7]:
CAREFUL = False

# Overview

Goal of this notebook: find / document representations of and operations on partial feature vectors with an eye towards efficient calculation.

# Comparison of basic representations and operations

## Baseline representation of partial feature vectors

A partial feature vector $p$ on $m$ features is an element of $\{-1,0,1\}^m$, where
 - $p_i = 0$ iff feature $i$ is unspecified
 - $p_i = -1$ iff feature $i$ is specified $-$
 - $p_i = 1$ iff feature $i$ is specified $+$

Below this representation is implemented using `numpy` `int8` arrays and (usually) vectorized operations on them.

### Overhead - generation, well-formedness, uniquification

In [8]:
m = 3

In [9]:
def make_generator_vectors(num_features):
    basis_vectors = [np.zeros(num_features, dtype=myint) for each in range(num_features)]
    basis_vectors_neg = [np.zeros(num_features, dtype=myint) for each in range(num_features)]
    for i,v in enumerate(basis_vectors):
        v[i] = 1
    for i,v in enumerate(basis_vectors_neg):
        v[i] = -1
    generators = basis_vectors + basis_vectors_neg
    return generators

In [10]:
generators = make_generator_vectors(m)
generators

[array([1, 0, 0], dtype=int8),
 array([0, 1, 0], dtype=int8),
 array([0, 0, 1], dtype=int8),
 array([-1,  0,  0], dtype=int8),
 array([ 0, -1,  0], dtype=int8),
 array([ 0,  0, -1], dtype=int8)]

In [11]:
max_num_objects = 2 ** m
max_num_objects

max_num_partial_fvs = (2 + 1) ** m
max_num_partial_fvs

8

27

In [12]:
def wf_pfv(v):
    allowedValues = {-1,0,1}
    return all([x in allowedValues for x in v])

In [13]:
def make_random_pfv():
    return np.random.randint(3, size=m, dtype=myint) - 1

In [14]:
def uniquify(ndarray_iterable):
    tuples = [tuple(a) for a in ndarray_iterable]
    s = set(tuples)
    arrays = [np.array(t) for t in s]
    return arrays

### Agreement

In [15]:
def ag(x,y):
    '''
    Formula:
    (x == 0 or y == 0) or ((x != 0 and y != 0) and (x == y)), where T = 1 and F = 0
    
    Pattern:
    x = x ⟶ 1
    0 = _ ⟶ 1
    _ = 0 ⟶ 1
    _ = _ ⟶ 0
    '''
    if x == y:
        return True
    elif x == 0:
        return True
    elif y == 0:
        return True
    else:
        return False

In [16]:
def agree(u,v):
    '''
    Given two vectors u and v, returns a binary vector indicating,
    elementwise, whether u and v 'agree'.
    
    agree(u[i], v[i]) iff (u[i] == 0 or v[i] == 0) or (u[i] == v[i])
    '''
#     return np.array([True if (u[i] == 0 or v[i] == 0) or (u[i] == v[i]) else False 
#                      for i in range(len(u))])
    return np.array([1 if (u[i] == 0 or v[i] == 0) or (u[i] == v[i]) else 0 
                     for i in range(len(u))], dtype=myint)

def agree_(u,v):
    '''
    Given two vectors u and v, return 1 iff u and v agree at all indices
    and 0 otherwise.
    '''
    ag = agree(u,v)
    return int(ag.all())

def agree_alt(u,v):
    '''
    Given two vectors u and v, return 1 iff u and v agree at all indices
    and 0 otherwise.
    '''
    ag = agree(u,v)
    total_agreement = np.linalg.norm(agree(u,v), 1) == m
    return int(total_agreement)
#     if total_agreement:
#         return 1.0
#     return 0.0

In [17]:
def make_agreeing_vector_pair(pred=None):
    u = make_random_pfv()
    v = make_random_pfv()
    if pred is None:
        while not agree_(u,v):
            u = make_random_pfv()
            v = make_random_pfv()
        return u,v
    while not agree_(u,v) and not pred(u,v):
        u = make_random_pfv()
        v = make_random_pfv()
    return u,v

In [18]:
num_test_pairs = int(1e5)
random_vector_pairs = [(make_random_pfv(), make_random_pfv()) for each in range(num_test_pairs)]
len(random_vector_pairs)

100000

In [19]:
for pair in random_vector_pairs:
    assert agree_(*pair) == agree_alt(*pair)

In [20]:
%%timeit

list(starmap(agree, random_vector_pairs))

1.03 s ± 27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%%timeit

list(starmap(agree_, random_vector_pairs))

1.38 s ± 18.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%timeit

list(starmap(agree_alt, random_vector_pairs))

3.13 s ± 44.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
num_test_pairs = int(1e5)
agreeing_vector_pairs = [make_agreeing_vector_pair() for each in range(num_test_pairs)]
len(agreeing_vector_pairs)

100000

### Union

The union of two partial feature vectors $u,v$ that agree should result in a partial feature vector that has every specified value in $u$, every specified value in $v$, and no other specified values.

In general, the result is at least as specified as either $u$ or $v$: when $u=v$ $u \cup v = u = v$ and $u \cup v$ is no more specified, but otherwise $u \cup v$ will be strictly more specified than either $u$ or $v$.

In [24]:
XYs = tuple(product((-1,0,1), (-1,0,1)))
XYs

def cup(x,y):
    '''
    Formula:
    x or y, where 1 = T, -1 = T, 0 = F
    
    Algebra:
    0 is the identity ∀x ∈ {-1,0,+1}
    x is its own identity ∀x ∈ {-1,0,+1}
    (-1 and +1 are mutual inverses, but this case shouldn't occur when agree(x,y) holds)
    
    Pattern:
    x ∪ x = x
    
    0 ∪ y = y
    x ∪ 0 = x
    
    _ ∪ _ = 0  \\ <- shouldn't occur in two pfvs that agree
    '''
    if x == 0:  #if x is unspecified, return y
        return y
    elif y == 0: #if y is unspecified, return x
        return x
    elif x == y: #if both are specified and the same, return their common value
        return x
    else: #otherwise return 0
        return 0

for x,y in XYs:
    ((x,y), cup(x,y))

((-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 0), (0, 1), (1, -1), (1, 0), (1, 1))

((-1, -1), -1)

((-1, 0), -1)

((-1, 1), 0)

((0, -1), -1)

((0, 0), 0)

((0, 1), 1)

((1, -1), 0)

((1, 0), 1)

((1, 1), 1)

In [25]:
def union(u, v):
    if CAREFUL:
        assert agree_(u,v)
    return np.sign(u + v)

def twoToOne(x):
    if x != 2 and x != -2:
        return x
    elif x == 2:
        return 1
    else:
        return -1

twoToOne_v = np.vectorize(twoToOne)

def union_alt(u, v):
    if CAREFUL:
        assert agree_(u,v)
    return np.array(twoToOne_v(u + v), dtype=myint)

def union_alt2(u, v):
    if CAREFUL:
        assert agree_(u,v)
    s = u + v
    return np.trunc( np.sqrt(np.abs(s)) ) * np.sign(s,dtype=myint)

def union_alt3(u, v):
    if CAREFUL:
        assert agree_(u,v)
    w = u.copy()
    for i,x in enumerate(v):
        if x != 0:
            w[i] = x
    return w

def union_alt4(u, v):
    if CAREFUL:
        assert agree_(u,v)
    return np.array([cup(u[i],v[i]) for i in range(m)], dtype=myint)

cup_v = np.vectorize(cup)

def union_alt5(u, v):
    if CAREFUL:
        assert agree_(u,v)
    return np.array(cup_v(u,v), dtype=myint)

In [26]:
test_pair = choice(agreeing_vector_pairs)
test_pair

union(*test_pair)
union_alt(*test_pair)
union_alt2(*test_pair)
union_alt3(*test_pair)
union_alt4(*test_pair)
union_alt5(*test_pair)

(array([ 0,  1, -1], dtype=int8), array([1, 1, 0], dtype=int8))

array([ 1,  1, -1], dtype=int8)

array([ 1,  1, -1], dtype=int8)

array([ 1.,  1., -1.], dtype=float16)

array([ 1,  1, -1], dtype=int8)

array([ 1,  1, -1], dtype=int8)

array([ 1,  1, -1], dtype=int8)

In [27]:
%%timeit

list(starmap(union, agreeing_vector_pairs));

103 ms ± 1.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [28]:
%%timeit

list(starmap(union_alt, agreeing_vector_pairs));

1.9 s ± 108 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
%%timeit

list(starmap(union_alt2, agreeing_vector_pairs));

430 ms ± 3.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [30]:
%%timeit

list(starmap(union_alt3, agreeing_vector_pairs));

604 ms ± 17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [31]:
%%timeit

list(starmap(union_alt4, agreeing_vector_pairs));

976 ms ± 9.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [32]:
%%timeit

list(starmap(union_alt5, agreeing_vector_pairs));

2.03 s ± 63.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Intersection

The intersection of two partial feature vectors $u,v$ should result in a partial feature vector that has every specified value that is specified in both $u$ and $v$ and where $u$ and $v$ agree, and no other specified values.

In general, the result is no more specified than either $u$ or $v$: when $u=v$ $u \cap v = u = v$ and $u \cap v$ is no less specified, but otherwise $u \cap v$ will be strictly less specified than either $u$ or $v$.

In [33]:
XYs = tuple(product((-1,0,1), (-1,0,1)))
XYs 
    
def cap(x,y):
    '''
    Algebra:
    0 is the annihilating element ∀x ∈ {-1,0,+1}
    x is its own identity ∀x ∈ {-1,0,+1}
    -1 and +1 annihilate each other
    
    Pattern:
    x ∩ x = x
    
    0 ∩ _ = 0
    _ ∩ 0 = 0
    
    _ ∩ _ = 0
    '''
    if x == 0: #if x is unspecified, return 0
        return 0
    elif y == 0: #if y is unspecified, return 0
        return 0
    elif x == y: #if both are specified and the same, return their common value
        return x
    else: #otherwise return 0
        return 0

def foo(x,y):
    return np.sign( (x == y) * (x + y) )

def bar(x,y):
    return (x == y) * (x + y) * 0.5

def baz(x,y):
    return (x == y) * int((x + y) / 2)

for x,y in XYs:
#     ((x,y), cap(x,y))
    ((x,y), cap(x,y), foo(x,y), bar(x,y), baz(x,y))

((-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 0), (0, 1), (1, -1), (1, 0), (1, 1))

((-1, -1), -1, -1, -1.0, -1)

((-1, 0), 0, 0, 0.0, 0)

((-1, 1), 0, 0, 0.0, 0)

((0, -1), 0, 0, 0.0, 0)

((0, 0), 0, 0, 0.0, 0)

((0, 1), 0, 0, 0.0, 0)

((1, -1), 0, 0, 0.0, 0)

((1, 0), 0, 0, 0.0, 0)

((1, 1), 1, 1, 1.0, 1)

In [34]:
def intersection(u, v):
    return np.sign(  np.equal(u, v) * (u + v) )

def intersection_alt(u, v):
    return np.array([cap(u[i],v[i]) for i in range(m)], dtype=myint)

def intersection_alt2(u, v):
    return np.array(np.equal(u, v) * (u + v) * 0.5, dtype=myint)

def intersection_alt3(u, v):
    return np.array([bar(u[i], v[i]) for i in range(m)], dtype=myint)

In [35]:
test_pair = choice(random_vector_pairs)
test_pair

intersection(*test_pair)
intersection_alt(*test_pair)
intersection_alt2(*test_pair)
intersection_alt3(*test_pair)

(array([0, 1, 0], dtype=int8), array([ 1,  1, -1], dtype=int8))

array([0, 1, 0], dtype=int8)

array([0, 1, 0], dtype=int8)

array([0, 1, 0], dtype=int8)

array([0, 1, 0], dtype=int8)

In [36]:
for pair in random_vector_pairs:
    assert np.array_equal(intersection(*pair), intersection_alt(*pair)), 'Agreement failure on {0}'.format(pair)
    assert np.array_equal(intersection_alt2(*pair), intersection_alt(*pair)), 'Agreement failure on {0}'.format(pair)
    assert np.array_equal(intersection_alt3(*pair), intersection_alt(*pair)), 'Agreement failure on {0}'.format(pair)

In [37]:
%%timeit

list(starmap(intersection, random_vector_pairs));

227 ms ± 4.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
%%timeit

list(starmap(intersection_alt, random_vector_pairs));

1.03 s ± 16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
%%timeit

list(starmap(intersection_alt2, random_vector_pairs));

406 ms ± 2.53 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
%%timeit

list(starmap(intersection_alt3, random_vector_pairs));

1.49 s ± 19.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Specification array + value array

This representation of a partial feature vector $p$ uses two bit sequences, $s$ and $v$
 - $s_i = 0$ iff $p_i = 0$ and is otherwise $1$
 - $v_i = 0$ if $p_i = -1$
 - $v_i = 1$ if $p_i = 1$

Note that the value of $v_i$ is unspecified if $p_i = 0$.

### Converting between representations

In [41]:
spec_cb = {-1:bitarray('1'),
            0:bitarray('0'),
            1:bitarray('1')}
val_cb = {-1:bitarray('0'),
           0:bitarray('0'),
           1:bitarray('1')}

In [42]:
def pfv_to_sv(pfv):
    s = bitarray()
    s.encode(spec_cb, list(pfv))
    v = bitarray()
    v.encode(val_cb, list(pfv))
    return s,v

def sv_to_pfv(s,v):
#     print('s,v = {0}, {1}'.format(s.to01(), v.to01()))
    specified = np.array(s.tolist(), dtype=myint)
    values = np.array(v.tolist(), dtype=myint)
#     print('\ts,v = {0}, {1}'.format(specified, values))
    for i in range(m):
        if specified[i] != 0:
            specified[i] = 1 if values[i] else -1
#     print('\ts = {0}'.format(specified))
    return specified

In [43]:
num_test_pairs = int(1e5)
random_vectors = [make_random_pfv() for each in range(num_test_pairs)]
len(random_vectors)

100000

In [44]:
for p in random_vectors:
    s,v = pfv_to_sv(p)
    p_prime = sv_to_pfv(s,v)
    assert np.array_equal(p, p_prime), 'Conversion failure on {0}'.format(pair)

### Operations

If $p,q$ are two partial feature vectors and $(s^p, v^p), (s^q, v^q)$ are their associated specification and value bitvectors, then:

We can define an element-wise agree operation by pattern matching:
```
agree((s^p_i,v^p_i), (s^q_i,v^q_i)):
    (s_w, s_x) = (s_w, s_x) ⟶ 1
    (0,0)      = (_, _)     ⟶ 1
    (_, _)     = (0,0)      ⟶ 1
    _          = _          ⟶ 0
```
Or perhaps more clearly by Boolean formula:
 - $\text{agree}(s^p_i,v^p_i,s^q_i,v^q_i) = (\neg s^p_i \lor \neg s^q_i) \lor ((s^p_i \land s^q_i) \land (v^p_i \iff v^q_i)) $

We can define an element-wise union operation (assuming agreement holds) by pattern matching:
```
Assuming agree(p,q) holds:

cup((s^p_i,v^p_i), (s^q_i,v^q_i)):
    (s_w, s_x) ∪ (s_w, s_x) = (s_w, s_x)
    (0,0)      ∪ (s_y, s_z) = (s_y, s_z)
    (s_w, s_x) ∪ (0,0)      = (s_w, s_x)
    _          ∪ _          = (0,0)
```
Or again, more clearly by Boolean formula:
 - $\text{cup}(s^p_i,v^p_i,s^q_i,v^q_i) = (s^p_i \lor s^q_i, v^p_i \lor v^q_i)$

I.e. we can take the `bitwise or` of respective specification vectors and value vectors to get the specification and value vector of the union of two partial feature vectors.

We can define an element-wise intersection operation by pattern matching:
```
cap((s^p_i,v^p_i), (s^q_i,v^q_i)):
    (s_w, s_x) ∩ (s_w, s_x) = (s_w, s_x)
    (0,0)      ∩ (_, _)     = (0,0)
    (_, _)     ∩ (0,0)      = (0,0)
    _          ∩ _          = (0,0)
```
...or by Boolean formula
 - $\text{cap}(s^p_i,v^p_i,s^q_i,v^q_i) = ((s^p_i \land s^q_i) \land (v^p_i \iff v^q_i), v^p_i \land v^q_i)$

In [45]:
def xor(p,q):
    return (p & ~q) | (~p & q)

def ifthen(p,q):
    return ~p | q

def iff(p,q):
    return ifthen(p,q) & ifthen(q,p)

assert xor(bitarray('0011'), bitarray('0101')) == bitarray('0110')
assert ifthen(bitarray('0011'), bitarray('0101')) == bitarray('1101')
assert iff(bitarray('0011'), bitarray('0101')) == bitarray('1001')

In [46]:
def agree_ba(s_p, v_p, s_q, v_q):
    return (~s_p | ~s_q) | ((s_p & s_q) & iff(v_p, v_q))

def union_ba(s_p, v_p, s_q, v_q):
    return s_p | s_q, v_p | v_q

def intersection_ba(s_p, v_p, s_q, v_q):
    return (s_p & s_q) & iff(v_p, v_q), v_p & v_q

Below we test that they have the same behavior as the baseline representation and operations:

In [47]:
for p,q in agreeing_vector_pairs:
    s_p, v_p = pfv_to_sv(p)
    s_q, v_q = pfv_to_sv(q)
    assert np.array_equal(agree(p,q), np.array(list(agree_ba(s_p, v_p, s_q, v_q)), dtype=myint))
    assert np.array_equal(union(p,q), sv_to_pfv(*union_ba(s_p, v_p, s_q, v_q)))

In [48]:
for p,q in random_vector_pairs:
    s_p, v_p = pfv_to_sv(p)
    s_q, v_q = pfv_to_sv(q)
    assert np.array_equal(agree(p,q), np.array(list(agree_ba(s_p, v_p, s_q, v_q)), dtype=myint))
    assert np.array_equal(intersection(p,q), sv_to_pfv(*intersection_ba(s_p, v_p, s_q, v_q)))

### Performance evaluation

Now we compare timing:

In [49]:
agreeing_vector_pairs_ba = [(pfv_to_sv(u)[0], pfv_to_sv(u)[1], pfv_to_sv(v)[0], pfv_to_sv(v)[1]) for u,v in agreeing_vector_pairs]

In [50]:
random_vector_pairs_ba = [(pfv_to_sv(u)[0], pfv_to_sv(u)[1], pfv_to_sv(v)[0], pfv_to_sv(v)[1]) for u,v in random_vector_pairs]

In [51]:
def unpack_pfv_pair(pair):
    p = pair[0]
    q = pair[1]
    s_p, v_p = pfv_to_sv(p)
    s_q, v_q = pfv_to_sv(q)
    return (s_p, v_p, s_q, v_q)

# def unpack_sv_pair_pair(sv_pair_pair):
#     s_p, v_p = sv_pair_pair[0][0], sv_pair_pair[0][1]
#     s_q, v_q = sv_pair_pair[1][2], sv_pair_pair[1][1]
#     return (s_p, v_p, s_q, v_q)

In [52]:
%%timeit

list(starmap(agree, random_vector_pairs))

1.03 s ± 10.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [53]:
%%timeit

list(starmap(agree_ba, random_vector_pairs_ba))

237 ms ± 5.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [54]:
%%timeit

list(starmap(union, agreeing_vector_pairs))

107 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [55]:
%%timeit

list(starmap(union_ba, agreeing_vector_pairs_ba))

55.4 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [56]:
%%timeit

list(starmap(intersection, random_vector_pairs))

227 ms ± 2.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
%%timeit

list(starmap(intersection_ba, random_vector_pairs_ba))

203 ms ± 6.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Conclusion

In the baseline representation:
 - `agreement` checking is very expensive, taking ≈10x longer than `union`ing and ≈5x longer than `intersect`ing

In the bitarray representation:
 - `agreement` checking and `intersection` take comparably long, and both take about 5x longer than `union`

`agreement` checking is about 5x faster with bitarrays than with the baseline representation, `intersection` is about 1.25x faster, and `union` is about 2x faster.

## Matrix extension of the baseline representation

In [59]:
random_stack_list = random_vector_pairs[:3]; random_stack_list

[(array([0, 1, 1], dtype=int8), array([0, 1, 1], dtype=int8)),
 (array([0, 1, 0], dtype=int8), array([-1,  0, -1], dtype=int8)),
 (array([ 0,  1, -1], dtype=int8), array([-1, -1,  1], dtype=int8))]

In [61]:
random_vector_pairs[:10]

[(array([0, 1, 1], dtype=int8), array([0, 1, 1], dtype=int8)),
 (array([0, 1, 0], dtype=int8), array([-1,  0, -1], dtype=int8)),
 (array([ 0,  1, -1], dtype=int8), array([-1, -1,  1], dtype=int8)),
 (array([-1, -1, -1], dtype=int8), array([-1, -1,  0], dtype=int8)),
 (array([0, 0, 0], dtype=int8), array([ 1, -1,  1], dtype=int8)),
 (array([0, 1, 0], dtype=int8), array([-1, -1, -1], dtype=int8)),
 (array([-1,  1,  0], dtype=int8), array([ 0, -1, -1], dtype=int8)),
 (array([-1, -1,  0], dtype=int8), array([-1,  1, -1], dtype=int8)),
 (array([0, 1, 1], dtype=int8), array([-1,  0,  0], dtype=int8)),
 (array([ 1, -1, -1], dtype=int8), array([0, 1, 1], dtype=int8))]

In [69]:
first = lambda seq: seq[0]
second = lambda seq: seq[1]

stack_a, stack_b = list(map(first, random_vector_pairs)), list(map(second, random_vector_pairs))
random_pair_stack_a, random_pair_stack_b = np.array(stack_a), np.array(stack_b)
random_pair_stack_a.dtype
random_pair_stack_b.dtype

dtype('int8')

dtype('int8')

In [70]:
stack_a, stack_b = list(map(first, agreeing_vector_pairs)), list(map(second, agreeing_vector_pairs))
agreeing_pair_stack_a, agreeing_pair_stack_b = np.array(stack_a), np.array(stack_b)
agreeing_pair_stack_a.dtype
agreeing_pair_stack_b.dtype

dtype('int8')

dtype('int8')

### Agreement testing

In [73]:
random_pair_stack_a.shape
n = random_pair_stack_a.shape[0]

(100000, 3)

In [76]:
list(starmap(agree_, random_vector_pairs));

In [120]:
vector_agree__results = np.array([agree_(random_pair_stack_a[i],random_pair_stack_b[i]) for i in range(n)])
vector_agree__results.shape

(100000,)

In [150]:
def agree_mat(A,B):
    '''
    Given two matrices A::(n,m) and B::(n,m), 
    return C::(n,1) where 
    C[i] = 1 iff A[i] and B[i] agree at all indices
    and 0 otherwise.
    '''
    # (x == 0 or y == 0) or ((x != 0 and y != 0) and (x == y))
    A_unspecified = A == 0
    B_unspecified = B == 0
    A_or_B_unspecified = A_unspecified | B_unspecified
    
    A_specified = A != 0
    B_specified = B != 0
    A_and_B_specified = A_specified & B_specified
    A_equal_B = np.equal(A,B)
    A_B_both_specified_and_equal = A_and_B_specified & A_equal_B

    ag = A_or_B_unspecified | A_B_both_specified_and_equal
#     return ag
    result = np.prod(ag, axis=-1)
    return result

In [147]:
matrix_agree_result = agree_mat(random_pair_stack_a, random_pair_stack_b)
matrix_agree_result.shape

(100000,)

In [151]:
np.array_equal(matrix_agree_result, vector_agree__results)

True

In [152]:
for i in range(n):
    u = random_pair_stack_a[i]
    v = random_pair_stack_b[i]
    assert agree_(u,v) == agree_mat(u,v), '{0}, {1} -> {2} vs. {3}'.format(u,v, agree_(u,v), agree_mat(u,v, True))

### Union

In [153]:
vector_union_results = np.array([union(agreeing_pair_stack_a[i],agreeing_pair_stack_b[i]) for i in range(n)])
vector_union_results.shape

(100000, 3)

In [156]:
union(agreeing_pair_stack_a, agreeing_pair_stack_b)

array([[-1,  1, -1],
       [-1,  1, -1],
       [ 1,  1,  1],
       ..., 
       [-1, -1, -1],
       [ 0, -1, -1],
       [ 1,  1, -1]], dtype=int8)

In [157]:
np.array_equal(vector_union_results, union(agreeing_pair_stack_a, agreeing_pair_stack_b) )

True

### Intersection

In [158]:
vector_intersection_results = np.array([intersection(random_pair_stack_a[i],random_pair_stack_b[i]) for i in range(n)])
vector_intersection_results.shape

(100000, 3)

In [159]:
intersection(agreeing_pair_stack_a, agreeing_pair_stack_b)

array([[ 0,  0, -1],
       [ 0,  0,  0],
       [ 1,  0,  0],
       ..., 
       [ 0, -1,  0],
       [ 0,  0, -1],
       [ 0,  0,  0]], dtype=int8)

In [160]:
np.array_equal(vector_intersection_results, intersection(random_pair_stack_a, random_pair_stack_b) )

True