LSH with numpy

In [1]:
from load_coreferences import load_coreferences
import lsh 
import copy
import numpy as np
import time 

import cProfile

In [2]:
raw_mentions = load_coreferences()


In [3]:

mentions = {i: m for i, m in enumerate(raw_mentions)}
# stack them on top of each other 

mentions_scaled = copy.copy(mentions)

idx = len(mentions_scaled)
scaling_factor = 1
for i in range(1, scaling_factor):
    for idx_old in mentions.keys():
        m = mentions[idx_old]
        mentions_scaled[idx] = m 
        idx += 1

In [4]:
len(mentions_scaled)

mylsh = lsh.LSHMinHash(mentions=mentions_scaled, shingle_size=3, signature_size=200, n_buckets=2)

mylsh.cluster()
mylsh.summarise()


took 0.3035604953765869 seconds for 174 mentions
average, min, max cluster size: 2.82, 1, 9


In [5]:
len(mentions_scaled)

mylsh = lsh.LSHMinHash_np(mentions=mentions_scaled, shingle_size=2, signature_size=200, band_length=2)

mylsh.cluster()
mylsh.summarise()


took 0.14955782890319824 seconds for 174 mentions
average, min, max cluster size: 23.67, 5, 60


Some profiling

In [6]:
mylsh = lsh.LSHMinHash(mentions=mentions_scaled, shingle_size=3, signature_size=200, n_buckets=2)
cProfile.run("mylsh.cluster()")

         624054 function calls in 0.363 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.363    0.363 <string>:1(<module>)
        1    0.134    0.134    0.228    0.228 lsh.py:108(_min_hash)
        1    0.000    0.000    0.000    0.000 lsh.py:109(<dictcomp>)
        1    0.000    0.000    0.004    0.004 lsh.py:131(_make_bands)
        1    0.112    0.112    0.112    0.112 lsh.py:136(_make_clusters)
        1    0.000    0.000    0.000    0.000 lsh.py:137(<dictcomp>)
        1    0.000    0.000    0.363    0.363 lsh.py:156(cluster)
      174    0.004    0.000    0.004    0.000 lsh.py:19(partition_signature)
        1    0.000    0.000    0.000    0.000 lsh.py:33(_build_vocab)
        1    0.000    0.000    0.000    0.000 lsh.py:34(<listcomp>)
        1    0.000    0.000    0.000    0.000 lsh.py:35(<listcomp>)
        1    0.000    0.000    0.018    0.018 lsh.py:38(encode_binary)
      174    

In [7]:
mylsh = lsh.LSHMinHash_np(mentions=mentions_scaled, shingle_size=2, signature_size=200, band_length=2)

cProfile.run("mylsh.cluster()")

         85791 function calls (85387 primitive calls) in 0.165 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(array_split)
        3    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(concatenate)
      200    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(empty_like)
      200    0.000    0.000    0.002    0.000 <__array_function__ internals>:2(moveaxis)
      200    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(ndim)
      100    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(prod)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(split)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(stack)
      301    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(swapaxes)
      100    0.000    0.000    

Notes for using np
- for shingle size = 3: correct candidate not found for "belo", "rdainah", "young", "livshits"
- shingle size = 2 resolves the problem, but increases the number of candidates 
    - highlight this trade off in the report to Faegheh!

Comparing using np and not 
- timing:
    - for 3500 mentions, I can go from 49s to 2.5s by using numpy = 20times speed up
    - for 7000 mentions: 186s with naive version, 6.4s with vectorized. this suggests that the vectorized version scales almost linearly.
    - for 17400 mentions (original data set scaled by 100): takes 27s with vectorized. -- time complexity is more than linear
        - comparison: FAISS takes 2.7s for 17400 mentions 
    - profile: what is slow now? biggest part of time is taken by `get_candidates()` and by `make_signature()` (the minhashing)
        - [wikipedia](https://en.wikipedia.org/wiki/MinHash) suggests that one could replace the many hash functions with one to get to linear time
        - but I suppose more important is to improve on the `get_candidates()` function
- also do timing for real-world data set, there the scaling may not be as well as here 


Next steps
- scaling
    - check how the vectorized version scales with even larger fake data, ie 10k or 20k mentions
    - but again, perhaps more important is the performance on real-world data
- understand better
    - what do shingles do? smaller ->?
    - what does band size do? how does it interact with shingle size? does one compensate for the other? does one scale better than the other? (for optimization)
- integrate with REL?
    - how?
    - tests?
- find alternative for minhashing?
    - but first, what would need to be improved? speed or effectiveness?

In [8]:

# trying out different configs 
    # --> it seems as soon as shingle size > 2 or band_length > 2, some candidates are missed. is this a feature or a bug?
mylsh = lsh.LSHMinHash_np(mentions=mentions_scaled, shingle_size=2, signature_size=60, band_length=2)

mylsh.cluster()
mylsh.summarise()

pairs = {mention: [mentions_scaled[i] for i in mylsh.candidates[idx]]  for idx, mention in mentions_scaled.items()}

pairs 

took 0.05245852470397949 seconds for 174 mentions
average, min, max cluster size: 12.05, 2, 31


{'mills': ['mills',
  'benjamin',
  'milan',
  'ac milan',
  'k. benjamin',
  'benjamin netanyahu',
  'miura',
  'john mills'],
 'shahid afridi': ['shahid afridi',
  'ahmed abdel-rahman',
  'indianapolis',
  'mohammed rashid',
  'farid hakme',
  'afridi',
  'indianapolis colts',
  'british airways',
  'rashid',
  'michael frederick',
  'shivnarine chanderpaul',
  'marshall faulk',
  'mahammad rashid'],
 'langmore': ['langmore',
  'jose ramos horta',
  'milan',
  'john vance langmore',
  'morris',
  'shimon peres',
  'ramos horta',
  'major',
  'john major',
  'andrew blades',
  'eugene morris',
  'john langmore'],
 'waqar': ['lara',
  'waqar',
  'wasim akram',
  'wasim',
  'steaua bucharest',
  'baril',
  'waqar younis'],
 'ahmed abdel-rahman': ['chanderpaul',
  'madeleine albright',
  'hakme',
  'abdel-rahman',
  'ahmed abdel-rahman',
  'boxmeer',
  'van boxmeer',
  'alexander livshits',
  'homestake',
  'shahid afridi',
  'mohammed rashid',
  'farid hakme',
  'homestake mining co',
 

In [9]:
mylsh = lsh.LSHMinHash_np(mentions=mentions_scaled, shingle_size=2, signature_size=60, band_length=2)
mylsh._build_vocab()
mylsh.encode_to_np()
mylsh.make_signature()

n_bands = int(mylsh.signature_size / mylsh.band_length)
bands = np.split(ary=mylsh.signature, indices_or_sections=n_bands, axis=1)

bands[0]


array([[113,  38],
       [  9,  63],
       [ 31,   8],
       [174, 111],
       [  9,  36],
       [ 26,  15],
       [ 22,  34],
       [ 34,  35],
       [ 11,  12],
       [126,  41],
       [  7,  21],
       [ 13,   8],
       [  2,   4],
       [ 51,  35],
       [  0,  90],
       [ 22,  29],
       [  9,  29],
       [  0,  49],
       [ 13,  10],
       [ 15,   7],
       [  9,   8],
       [ 16,   8],
       [  9,   6],
       [ 66,   5],
       [ 26,  11],
       [ 55,  35],
       [ 26, 114],
       [ 34, 153],
       [ 18,   8],
       [ 15,  25],
       [ 31,  29],
       [ 16,   8],
       [ 83,  31],
       [ 34,  70],
       [ 53,   8],
       [ 15,   8],
       [ 25,  34],
       [  4,  93],
       [ 34,  72],
       [162,   7],
       [126,  15],
       [  6,  32],
       [ 21,  80],
       [126, 166],
       [ 93,   8],
       [ 34,  77],
       [ 22,  18],
       [ 13,   2],
       [  9,   6],
       [ 12,  15],
       [ 54,   0],
       [104,  30],
       [ 12,

Next steps (3/1/23):
- tidy the stuff below 
- add to lsh module:
    - integer construction
    - sorting, and group classification 
    - test the speed and the accuracy
- write tests 
    - correctness of cols_to_int, of classification in general 
    - noetebook with speed tests for different sorting options 
- document 
    - write what the functions/classes do
    - explain the idea behind the operations for cols_to_int_correct()
    - how far can we go with cols_to_int_correct() until overflow error? what to do in this case?
        - advise user to change the size of the bands (?)
        - switch to string operation, which is (much) slower but should still work? 

In [255]:

def idx_unique_rows(a):
    "return groups of indices of rows that match"
    sort_idx = np.argsort(a, axis=0)
    unq_items, counts = np.unique(A, axis=0, return_index=False, return_counts=True)
    unq_idx = np.split(sort_idx, np.cumsum(counts))
    # unq_idx = [x[:, 0] for x in unq_idx]
    return unq_idx

def cols_to_int(a):
    "combine all columns in a row to an integer: [1,2,3] becomes [123]"
    n_positions = a.shape[1]
    mult_factor = np.array([10**x for x in reversed(range(n_positions))])
    summationvector = np.ones((n_positions, 1)) 
    out = np.matmul(a * mult_factor, summationvector)
    return out


def cols_to_int_correct(a):
    "combine columns in all rows to an integer: [[1,20,3], [1,4,10]] becomes [1203,1410]"
    # a = np.array([[1,4,10], [14, 12, 3], [1, 100, 39]]) # TODO: test case!
    existing_powers = np.floor(np.log10(a)) 
    n_positions = a.shape[1]
    n_mentions = a.shape[0]

    cumsum_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1))

    add_powers = [x for x in reversed(range(n_positions))]
    add_powers = np.tile(add_powers, (n_mentions, 1))

    mult_factor = cumsum_powers - existing_powers + add_powers  
    summationvector = np.ones((n_positions, 1)) 
    out = np.matmul(a * 10**mult_factor, summationvector)
    return out 


def cols_to_string(a):
    a = a.astype(np.string_)
    ncols = a.shape[1]

    out = a[:, 0]
    out = np.expand_dims(out, axis=1)
    for c in np.split(a[:, 1:], indices_or_sections=ncols-1, axis=1):
        out = np.char.add(out, c)
    return out


def idx_unique_final(a):
    # a = cols_to_int(a).squeeze() # wrong
    # a = cols_to_string(a).squeeze() # slow 
    a = cols_to_int_correct(a).squeeze()
    sort_idx = np.argsort(a)
    sort_idx
    a_sorted = a[sort_idx]
    unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1])) # "is the current value different from the previous?". the concat of [True]: because the first occurrence is always True (ie the first time it occur)
    unq_items = a_sorted[unq_first]
    unq_count = np.diff(np.nonzero(unq_first)[0]) # np.nonzero(unq_first)[0] gives the indices of first elements in a_sorted
    unq_idx = np.split(sort_idx, np.cumsum(unq_count))
    return unq_idx


def idx_unique_so(a):
    # x = np.array([[1, 3], [2, 2], [2, 2], [1, 3], [1, 5], [1, 1]]) # this is one test case
    sort_idx = np.argsort(a, axis=0)
    n_rows = sort_idx.shape[0]
    row_order = sort_idx[np.arange(n_rows), 0] # extract rows based on column 0
    a_sorted = a[row_order] # TODO: does this make problems when signature length is 1? should it ever be one? 
    unq_first = np.concatenate(([True], np.any(a_sorted[1:] != a_sorted[:-1], axis=1))) # check if any column of the current row is different from the preceding row -- if so, the current row is different from the previous (first occurence)
    unq_items = a_sorted[unq_first]
    unq_count = np.diff(np.nonzero(unq_first)[0]) # np.nonzero(unq_first)[0] gives the indices of first elements in a_sorted
    unq_idx = np.split(sort_idx, np.cumsum(unq_count))
    return unq_idx

A = np.array([[3,4], [3,5], [5,6], [3,4], [6,7]])
display(A)
# display(idx_unique_rows(A))
# display(idx_unique_so(A))
display(idx_unique_final(A))


array([[3, 4],
       [3, 5],
       [5, 6],
       [3, 4],
       [6, 7]])

[array([0, 3]), array([1]), array([2]), array([4])]

In [210]:


# b = a[:, 0]
# for col in range(1, a.shape[1]):
#     print(f"adding {a[:, col]}")
#     b = np.char.add(b, a[:, col])

# b
# a[:, 0]
# np.char.add(a[:,0], a[:,1])


array([[b'1', b'4', b'10'],
       [b'14', b'12', b'3'],
       [b'1', b'100', b'39'],
       [b'14', b'12', b'3']], dtype='|S21')

array([[b'1410'],
       [b'14123'],
       [b'110039'],
       [b'14123']], dtype='|S63')

In [260]:
# test cols to int vs str
n = 10_000_000
A = np.random.choice(np.arange(1, int(n/100)), size=(n,3)) # NOTE: there should not be a zero in the input array, as otherwise log10 does not work 

%timeit cols_to_int(A)
# %timeit cols_to_string(A)
%timeit cols_to_int_correct(A)

114 ms ± 493 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
834 ms ± 7.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [256]:
x1 = np.array([[1, 3], [2, 2], [2, 2], [1, 3], [1, 5], [1, 1]]) # this is one test case
x2 = np.array([[3,4], [3,5], [5,6], [3,4], [6,7]]) 
x3 = np.array([[1,4,10], [14, 12, 3], [1, 100, 39], [14, 12, 3]])


display(x1)
display(idx_unique_final(x1))

display(x2)
display(idx_unique_final(x2))

display(x3)
display(idx_unique_final(x3))


array([[1, 3],
       [2, 2],
       [2, 2],
       [1, 3],
       [1, 5],
       [1, 1]])

[array([5]), array([0, 3]), array([4]), array([1, 2])]

array([[3, 4],
       [3, 5],
       [5, 6],
       [3, 4],
       [6, 7]])

[array([0, 3]), array([1]), array([2]), array([4])]

array([[  1,   4,  10],
       [ 14,  12,   3],
       [  1, 100,  39],
       [ 14,  12,   3]])

[array([0]), array([1, 3]), array([2])]

In [252]:

a = np.array([[1,4,10], [14, 12, 3], [1, 100, 39]])
existing_powers = np.floor(np.log10(a)) 
n_positions = a.shape[1]
n_mentions = a.shape[0]

cumsum_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1))
print(f"sum_powers: \n {cumsum_powers}")

req_powers = [x for x in reversed(range(n_positions))]
req_powers = np.tile(req_powers, (n_mentions, 1))

mult_factor = cumsum_powers - existing_powers + req_powers  
summationvector = np.ones((n_positions, 1)) 
out = np.matmul(a * 10**mult_factor, summationvector)

for i in range(out.shape[0]):
    print("".join(str(x) for x in out[i,])) 

sum_powers: 
 [[1. 1. 1.]
 [2. 1. 0.]
 [3. 3. 1.]]
1410.0
14123.0
110039.0


In [249]:

# not sure how to do this fast. try with string?
a = np.array([[1,4,10], [14, 12, 3], [1, 100, 39]])
display(a)
existing_powers = np.floor(np.log10(a)) 
# existing_powers = np.floor(a * (1/10)) # successively add these ones, in addition to the mult factors from above 
# NOTE: this is dangerous when the bands are more than a few elements and/or when the vocabulary contains many many values (determines the numbers in the input array to be sorted). -- floating point precision!
# NOTE: solution -- implement this here, use as alternative the option with the view? which is slower but does not rely on the operation here?
# inverse cumsum 
print(f"existing powers: \n {existing_powers}")

# required powers 
n_positions = a.shape[1]
n_mentions = a.shape[0]
req_powers = [x for x in reversed(range(n_positions))]
req_powers = np.tile(req_powers, (n_mentions, 1))
print(f"req powers: \n {req_powers}")

# existing_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1))
# sum_powers = req_powers + existing_powers
sum_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1))
print(f"sum_powers: \n {sum_powers}")

mult_factor = sum_powers + req_powers
print(f"mult_factor: \n {mult_factor}")
10**mult_factor

a * 10**mult_factor
summationvector = np.ones((n_positions, 1)) 
out = np.matmul(a * 10**mult_factor, summationvector)

for i in range(out.shape[0]):
    print("".join(str(x) for x in out[i,]))


array([[  1,   4,  10],
       [ 14,  12,   3],
       [  1, 100,  39]])

existing powers: 
 [[0. 0. 1.]
 [1. 1. 0.]
 [0. 2. 1.]]
req powers: 
 [[2 1 0]
 [2 1 0]
 [2 1 0]]
sum_powers: 
 [[1. 1. 1.]
 [2. 1. 0.]
 [3. 3. 1.]]
mult_factor: 
 [[3. 2. 1.]
 [4. 2. 0.]
 [5. 4. 1.]]
1500.0
141203.0
1100390.0


In [11]:
# keep this for some time analysis
    # mult before sort -- adapt from the now helper function cols_to_int
x = np.array([[0, 3], [2, 2], [2, 2], [0, 3]]) 
x = np.array([[1, 3], [2, 2], [2, 2], [1, 3], [1, 5], [1, 1]]) # this is one test case
A = np.array([[3,4], [3,5], [5,6], [3,4], [6,7]]) # this is another test case

display(A)
display(np.argsort(A, axis=0))
display(A.view('i8,i8').argsort(order=['f1'], axis=0))

display(x)
display(np.argsort(x, axis=0))
display(x.view('i8,i8').argsort(order=['f1'], axis=0))


n = 100_000
A = np.random.choice(np.arange(int(n/100)), size=(n,2))
%timeit np.argsort(A, axis=0)
%timeit A.view('i8,i8').argsort(order=['f1'], axis=0)
%timeit mult_before_sort(A)

array([[3, 4],
       [3, 5],
       [5, 6],
       [3, 4],
       [6, 7]])

array([[0, 0],
       [1, 3],
       [3, 1],
       [2, 2],
       [4, 4]])

array([[0],
       [3],
       [1],
       [2],
       [4]])

array([[1, 3],
       [2, 2],
       [2, 2],
       [1, 3],
       [1, 5],
       [1, 1]])

array([[0, 5],
       [3, 1],
       [4, 2],
       [5, 0],
       [1, 3],
       [2, 4]])

array([[5],
       [1],
       [2],
       [0],
       [3],
       [4]])

In [417]:
x = np.array([[1, 3], [2, 2], [2, 2], [1, 3], [1, 5], [1, 1]]) # this is one test case
x = np.array([[3,4], [3,5], [5,6], [3,4], [6,7]])
display(x)
sort_idx = np.argsort(x, axis=0)
display(sort_idx)
n_rows = sort_idx.shape[0]
row_order = sort_idx[np.arange(n_rows), 0] # extract rows based on column 0
x[row_order]
x_sorted = x[row_order] # TODO: does this make problems when signature length is 1? should it ever be one? 
# display(x_sorted) 
# x_sorted[1:]
# x_sorted[:-1]
unq_first = np.concatenate(([True], np.any(x_sorted[1:] != x_sorted[:-1], axis=1))) # check if any column of the current row is different from the preceding row -- if so, the current row is different from the previous (first occurence)
display(unq_first)
unq_items = x_sorted[unq_first]
display(unq_items)
unq_count = np.diff(np.nonzero(unq_first)[0]) # np.nonzero(unq_first)[0] gives the indices of first elements in a_sorted
display(unq_count)
unq_idx = np.split(sort_idx, np.cumsum(unq_count))

print("solution:")
display(unq_idx)
print("for")
display(x)


array([[3, 4],
       [3, 5],
       [5, 6],
       [3, 4],
       [6, 7]])

array([[0, 0],
       [1, 3],
       [3, 1],
       [2, 2],
       [4, 4]])

array([ True,  True,  True,  True,  True])

array([[3, 4],
       [3, 5],
       [3, 4],
       [5, 6],
       [6, 7]])

array([1, 1, 1, 1])

solution:


[array([[0, 0]]),
 array([[1, 3]]),
 array([[3, 1]]),
 array([[2, 2]]),
 array([[4, 4]])]

for


array([[3, 4],
       [3, 5],
       [5, 6],
       [3, 4],
       [6, 7]])

In [61]:
a = np.array([1, 2, 6, 4, 2, 3, 2])
sort_idx = np.argsort(a)
a_sorted = a[sort_idx]
display(a_sorted) 
unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1])) # "is the current value different from the previous?". the concat of [True]: because the first occurrence is always True (ie the first time it occur)
display(unq_first)
unq_items = a_sorted[unq_first]
display(unq_items)
unq_count = np.diff(np.nonzero(unq_first)[0]) # np.nonzero(unq_first)[0] gives the indices of first elements in a_sorted
display(unq_count)
unq_idx = np.split(sort_idx, np.cumsum(unq_count))
display(unq_idx)

unq_count
unique_items, indices, counts = np.unique(a, axis=0, return_index=True, return_counts=True)
print(f"unique_items: {unique_items}")
print(f"unq_items: {unq_items}")
print(f"counts: {counts} , unq_count: {unq_count}")
print(f"indices: {indices} , np.nonzero(unq_first)[0]: {np.nonzero(unq_first)[0]}")

array([1, 2, 2, 2, 3, 4, 6])

array([ True,  True, False, False,  True,  True,  True])

array([1, 2, 3, 4, 6])

array([1, 3, 1, 1])

[array([0]), array([1, 4, 6]), array([5]), array([3]), array([2])]

unique_items: [1 2 3 4 6]
unq_items: [1 2 3 4 6]
counts: [1 3 1 1 1] , unq_count: [1 3 1 1]
indices: [0 1 5 3 2] , np.nonzero(unq_first)[0]: [0 1 4 5 6]


In [201]:
band = bands[0]
out = idx_unique_rows(band)
# display(out)
display(len(out))

display(band[63])
display(band[51])

display(band[85])
display(band[158])
display(band[91])
display(band[145])

8

array([ 0, 48])

array([ 0, 35])

array([ 0, 48])

array([ 0, 48])

array([ 0, 19])

array([181,  17])

## Old code

#### Binary encoding with numpy 

In [288]:
J = len(mylsh.vocab) # number of columns 
vocab = mylsh.vocab
vectors_single = {}
for mention, data in mylsh.mentions.items():
    v = np.zeros(J)
    for i in np.arange(J):
        if vocab[i] in data["shingles"]:
            v[i] = 1
    vectors_single[mention] = v

vectors = np.stack(list(vectors_single.values())) # is this scalable? should it be done differently?
# def encode_binary(self):
#     for mention, data in self.mentions.items():
#         v = [1 if self.vocab[i] in data["shingles"] else 0 for i in range(len(self.vocab)) ]
#         self.mentions[mention] = {"shingles": data["shingles"] , "vector": v}

sum(vectors_single[0])
sum(mylsh.mentions[0]['vector'])

vector_np = list(vectors_single[0])
vector_list = mylsh.mentions[0]['vector']

assert vector_list == vector_np


#### Min hashing with numpy 

In [289]:
d = 30 # length of the signature

templist = []
i = 0
while i < d:
    rng = np.random.default_rng(seed=3)
    rng.shuffle(vectors, axis=1)
    sig_i = vectors.argmax(axis=1)
    templist.append(sig_i)
    i += 1


signature = np.stack(templist, axis=1)

# A[i, j]: i is the row, j is the columns. axis=0 is along rows, axis=1 is along columns (I think)

In [290]:
signature

array([[15,  9, 33, ...,  6, 12, 50],
       [15,  4, 48, ...,  6,  3, 64],
       [ 8, 74, 12, ..., 10, 15, 33],
       ...,
       [ 8, 79, 11, ..., 18, 15, 46],
       [16,  0, 22, ...,  4, 31,  8],
       [83,  7,  9, ..., 31, 43, 44]])

#### Make bands with numpy, get the candidate coreferences

In [291]:
band_length = 2
assert d % band_length == 0
n_bands = int(d/band_length)
n_mentions = signature.shape[0]

bands = np.split(ary=signature, indices_or_sections=n_bands, axis=1) # gives n_bands bands
candidates = {i: [] for i in mentions_scaled.keys()}

for col, band in zip(range(n_bands), bands): # col is the column indicator for clusters array
    unique_rows, indices = np.unique(band, axis=0, return_index=True)
    for r, idx in zip(unique_rows, indices):
        matching = (band == r).all(axis=1).nonzero()[0]
        matching = list(matching)
        for i in matching:
            candidates[i].append(matching)


candidates
candidates = {k: list(set([item for sublist in v for item in sublist])) for k, v in candidates.items()}

candidates[0]

[0, 79]

In [292]:
pairs = {mention: [mentions_scaled[i] for i in candidates[idx]] for idx, mention in mentions_scaled.items()}

pairs


{'blewett': ['blewett', 'greg blewett'],
 'n. mclean': ['n. mclean', 'eagles', 'mclean'],
 'chernyshev': ['chernyshev',
  'konstantin chernyshev',
  'chanderpaul',
  'shivnarine chanderpaul',
  'shimon peres'],
 'salem bitar': ['baril', 'salem bitar', 'bitar'],
 'indianapolis colts': ['indianapolis colts',
  'livshits',
  'indianapolis',
  'shivnarine chanderpaul',
  'alexander livshits'],
 'faulk': ['faulk',
  'marshall faulk',
  'chanderpaul',
  'shivnarine chanderpaul',
  'paul justin'],
 'carlos belo': ['oscar luigi scalfaro',
  'carlos belo',
  'scalfaro',
  'belo',
  'grelombe',
  'christophe grelombe',
  'nobel peace prize',
  'carlos ponce',
  'nobel'],
 'rdainah': ['nabil abu rdainah', 'rdainah'],
 'michael frederick': ['michael frederick',
  'frederick',
  'havel',
  'nobel peace prize'],
 'oswaldo saldanha': ['saldanha', 'oswaldo saldanha', 'eisuke sakakibara'],
 'campese': ['campese', 'david campese'],
 'british airways': ['shahid afridi',
  'albright',
  'british airways',

## Old stuff

In [195]:
# this is similar to above, but stores the intermediate result in an np array which I think is not necessary
band_length = 2
assert d % band_length == 0
n_bands = int(d/band_length)
n_mentions = signature.shape[0]

bands = np.split(ary=signature, indices_or_sections=n_bands, axis=1) # gives n_bands bands

clusters = np.empty((n_mentions, n_bands))
clusters[:] = np.nan

# not sure how this scales; could be quadratic in number of mentions if there are no coreferences.
    # how to circumvent this? -- works well for the fake data set (20 replications), but this is because the number of unique mentions is constant, and therefore there are many duplicates
    # will therefore need to integrate it into REL and check 
for col, band in zip(range(n_bands), bands): # col is the column indicator for clusters array
    unique_rows, indices = np.unique(band, axis=0, return_index=True)
    for r, idx in zip(unique_rows, indices):
        matching = (band == r).all(axis=1).nonzero()
        # print(matching)
        for i in matching:
            clusters[i, col] = idx

clusters

array([[  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  1.,   1.,   1., ...,   1.,   1.,   1.],
       [  2.,   2.,   2., ...,   2.,   2.,   2.],
       ...,
       [171., 171.,  44., ..., 171., 171.,  44.],
       [172., 172., 172., ..., 172., 172., 172.],
       [173., 173., 173., ..., 147., 173., 173.]])

In [178]:
sum(clusters[:, 1] == 0)
clusters[:, 1] == 0

clusters[clusters[:, 1] == 0]
mentions_scaled[79]
mentions_scaled[0]

mentions_scaled[1]

'n. mclean'

#### Get the candidates
For each mention, extract all the other mentions that at least once are in the same group as mention m 


In [156]:
# this is how to make bands
A = np.random.choice(10, size=(4,4))
display(A)
np.split(ary=A, indices_or_sections=2, axis=1)

# this gives the unique occurences in the array, but the index only gives the row index of the first occurence
    # but that index implies a unique identifer, and maybe we can use np.where or something to group the rows into these indices?
A = np.array([[3,4], [5,6], [3,4], [6,7]])
display(A)
unique_rows, indices = np.unique(A, axis=0, return_index=True)
display(unique_rows)
display(indices)
# two ways: use np.where to get indices of equal rows (need to do for each index separately) https://stackoverflow.com/questions/25823608/find-matching-rows-in-2-dimensional-numpy-array
# maybe another option is to apply by axis, and assign the unique indicator to each row. -- this is slow https://stackoverflow.com/questions/23849097/numpy-np-apply-along-axis-function-speed-up
# which of these will be easier to work with later? 

# this is how to assign unique group ids to rows, for each band 
groups = np.empty((4,1)) # here, use the as i the number of mentions and as j the number of bands 
groups[:] = np.nan
groups

for band, idx in zip(unique_rows, indices): # this could be slow when not many groups are unique
    matching = (A == band).all(axis=1).nonzero()
    print(matching)
    for i in matching:
        groups[i] = idx 

groups


array([[9, 8, 6, 6],
       [8, 7, 9, 6],
       [5, 6, 2, 4],
       [6, 4, 3, 3]])

array([[3, 4],
       [5, 6],
       [3, 4],
       [6, 7]])

array([[3, 4],
       [5, 6],
       [6, 7]])

array([0, 1, 3])

(array([0, 2]),)
(array([1]),)
(array([3]),)


array([[0.],
       [1.],
       [0.],
       [3.]])

In [204]:
A = np.random.choice(2, size=(3,3))
display(A)
r, c = np.tril_indices(3)
display(r)
A[:, r]

array([[0, 1, 0],
       [1, 0, 0],
       [1, 1, 1]])

array([0, 1, 1, 2, 2, 2])

array([[0, 1, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1]])

In [206]:
B = np.array([[1,2], [3,4]])
display(B)
B[:, np.array([1, 1, 1, 0, ])]

array([[1, 2],
       [3, 4]])

array([[2, 2, 2, 1],
       [4, 4, 4, 3]])

In [93]:
A = np.random.choice(2, size=(3,3))
display(A)
np.nonzero(A)
np.flatnonzero(A)
A.argmax(axis=1)

array([[1, 1, 0],
       [0, 0, 1],
       [1, 1, 0]])

array([0, 2, 0])

In [82]:
A = np.arange(9).reshape(3,3)
display(A)
rng.shuffle(A, axis=1)
A

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

In [85]:
A = np.arange(9).reshape(3,3)
A
A[2,:]

array([6, 7, 8])

In [9]:
x = np.arange(10)
x
x.shape = (2, 5)
x
x[0] 
x[0, :]


array([], shape=(0, 5), dtype=int64)

# Here is some np documentation which I should copy to another file

An integer, $i$, returns the same values as `i:i+1` except the dimensionality of the returned object is reduced by 1. In particular, a selection tuple with the p-th element an integer (and all other entries :) returns the corresponding sub-array with dimension N - 1. If N = 1 then the returned object is an array scalar. These objects are explained in Scalars.

If the selection tuple has all entries `:` except the p-th entry which is a slice object `i:j:k`, then the returned array has dimension $N$ formed by concatenating the sub-arrays returned by integer indexing of elements $i, i+k, …, i + (m - 1) k < j,$

Basic slicing with more than one non-`:` entry in the slicing tuple, acts like repeated application of slicing using a single non-`:` entry, where the non-`:` entries are successively taken (with all other non-`:` entries replaced by `:`). Thus, `x[ind1, ..., ind2,:]` acts like `x[ind1][..., ind2, :]` under basic slicing.

In [37]:
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
x[5:]
x[5:]
x = np.array([[[1],[2],[3]], [[4],[5],[6]]])
display(x.shape)
display(x)
display((x[1:2, :, :] == x[1:2]).all())
display(x[:, 0, :])
display(x[:, 0, :].shape)
display(x[:, 0, 0])
display(x[:, 0, 0].shape)

# intuition: need a scalar when x[0, 0, 0], so removing one : at a time means we need to remove one dimension at a time 
# the dimension that is removed is the one with the integer (instead of :). see the example above.

(2, 3, 1)

array([[[1],
        [2],
        [3]],

       [[4],
        [5],
        [6]]])

True

array([[1],
       [4]])

(2, 1)

array([1, 4])

(2,)

In [41]:
# but why is this happening?
display(x)
display(x[:, :, 0]) # all sub-arrays (2 of them), all rows, the 0th column. do then all dimensions "move one up"? ie, sub-arrays become rows, rows becom columns of the slice?
display(x[:, :, 0].shape)
display(x[:, 0, :]) # sub-arrays become rows, columns remain columns?

array([[[1],
        [2],
        [3]],

       [[4],
        [5],
        [6]]])

array([[1, 2, 3],
       [4, 5, 6]])

(2, 3)

array([[1],
       [4]])

Integer array indexing allows selection of arbitrary items in the array based on their N-dimensional index. Each integer array represents a number of indices into that dimension.
-- this explains my puzzle from yesterday?

In [48]:
x = np.arange(10, 1, -1)
display(x)
display(x[np.array([3, 3, 1, 8])]) # select element with index 3 twice, the element with 1th index, then 8th index
x = np.array([[1, 2], [3, 4], [5, 6]])
display(x)
display(x[np.array([1, -1])]) # can I choose certain columns here also?

array([10,  9,  8,  7,  6,  5,  4,  3,  2])

array([7, 7, 9, 2])

array([[1, 2],
       [3, 4],
       [5, 6]])

array([[3, 4],
       [5, 6]])

Indexing with multidimensional index arrays tend to be more unusual uses, but they are permitted, and they are useful for some problems. We’ll start with the simplest multidimensional case:

In [53]:
y = np.arange(35).reshape(5, 7)
display(y)
display(y[np.array([0, 2, 4]), np.array([0, 1, 2])]) # select: y[0, 0], y[2, 1], y[4, 2] -- if the index arrays have matching shape!
# when shapes are not the same: attempt to broadcast, otherwise excpetion is raised

array([[ 0,  1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11, 12, 13],
       [14, 15, 16, 17, 18, 19, 20],
       [21, 22, 23, 24, 25, 26, 27],
       [28, 29, 30, 31, 32, 33, 34]])

array([ 0, 15, 30])

array([ 1, 15, 29])

array([ 1,  8, 15, 22, 29])

The broadcasting mechanism permits index arrays to be combined with scalars for other indices. The effect is that the scalar value is used for all the corresponding values of the index arrays:

In [None]:
display(y[np.array([0, 2, 4]), 1]) # take col 1, rows 0, 2, 4
# to extract the 1th full column
display(y[np.array(np.arange(5)), 1])

In [61]:
display(y)
display(y[np.array([0, 2, 4])]) # selects rows 0, 2, 4 (and all columns)
# how to do the same for columns?
display(y[np.arange(5), 0:2]) # this combines advanced and basic indexing: https://numpy.org/doc/stable/user/basics.indexing.html#combining-advanced-and-basic-indexing
    # 0:2 is the slice operation, np.arange() is the index array operation
# but this I think fails when extracting multiple non-neighboring columns (say 1, 3, 5)
# then maybe use this https://stackoverflow.com/questions/65099251/how-to-extract-slices-and-specific-columns-of-a-numpy-array-with-one-command 


SyntaxError: invalid syntax (<ipython-input-61-7d96a34d37f7>, line 5)

In [68]:
# select corner elements  of x
x = np.array([[ 0,  1,  2],

              [ 3,  4,  5],

              [ 6,  7,  8],

              [ 9, 10, 11]])

rows = np.array([[0, 0],

                 [3, 3]], dtype=np.intp)

columns = np.array([[0, 2],

                    [0, 2]], dtype=np.intp)

display(x[rows, columns])

# because row and column indexers repeat themselves:
rows = np.array([0, 3], dtype=np.intp)

columns = np.array([0, 2], dtype=np.intp)

rows[:, np.newaxis] # extend dimension/broadcast https://stackoverflow.com/questions/29241056/how-do-i-use-np-newaxis
display(x[rows[:, np.newaxis], columns])
# but instead, we can direclty use .ix_:
display(x[np.ix_(rows, columns)])
# without using .ix_, we only get the diagonal elements:
display(x[rows, columns]) # ie, x[0, 0] and x[3, 2]

array([[ 0,  2],
       [ 9, 11]])

array([[ 0,  2],
       [ 9, 11]])

array([[ 0,  2],
       [ 9, 11]])

array([ 0, 11])