LSH with numpy

In [1]:
from load_coreferences import load_coreferences
import lsh 
import copy
import numpy as np
import time 

import cProfile

In [2]:
raw_mentions = load_coreferences()


In [9]:

mentions = {i: m for i, m in enumerate(raw_mentions)}
# stack them on top of each other 

mentions_scaled = copy.copy(mentions)

idx = len(mentions_scaled)
scaling_factor = 1
for i in range(1, scaling_factor):
    for idx_old in mentions.keys():
        m = mentions[idx_old]
        mentions_scaled[idx] = m 
        idx += 1

In [4]:
len(mentions_scaled)

mylsh = lsh.LSHMinHash(mentions=mentions_scaled, shingle_size=3, signature_size=200, n_buckets=2)

mylsh.cluster()
mylsh.summarise()


took 186.69764494895935 seconds for 6960 mentions
average, min, max cluster size: 159.46, 79, 639


In [10]:
len(mentions_scaled)

mylsh = lsh.LSHMinHash_np(mentions=mentions_scaled, shingle_size=2, signature_size=200, band_length=2)

mylsh.cluster()
mylsh.summarise()


took 0.1678323745727539 seconds for 174 mentions
average, min, max cluster size: 22.89, 2, 55


Some profiling

In [None]:
mylsh = lsh.LSHMinHash(mentions=mentions_scaled, shingle_size=3, signature_size=200, n_buckets=2)
cProfile.run("mylsh.cluster()")

In [12]:
mylsh = lsh.LSHMinHash_np(mentions=mentions_scaled, shingle_size=2, signature_size=200, band_length=2)

cProfile.run("mylsh.cluster()")

         438464 function calls (438060 primitive calls) in 2.448 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(array_split)
      202    0.000    0.000    0.005    0.000 <__array_function__ internals>:2(concatenate)
      200    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(empty_like)
      200    0.000    0.000    0.003    0.000 <__array_function__ internals>:2(moveaxis)
      200    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(ndim)
      100    0.000    0.000    0.002    0.000 <__array_function__ internals>:2(prod)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(split)
        2    0.000    0.000    0.006    0.003 <__array_function__ internals>:2(stack)
      301    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(swapaxes)
      100    0.000    0.000  

Notes for using np
- for shingle size = 3: correct candidate not found for "belo", "rdainah", "young", "livshits"
- shingle size = 2 resolves the problem, but increases the number of candidates 
    - highlight this trade off in the report to Faegheh!

Comparing using np and not 
- timing:
    - for 3500 mentions, I can go from 49s to 2.5s by using numpy = 20times speed up
    - for 7000 mentions: 186s with naive version, 6.4s with vectorized. this suggests that the vectorized version scales almost linearly.
    - for 17400 mentions (original data set scaled by 100): takes 27s with vectorized. -- time complexity is more than linear
        - comparison: FAISS takes 2.7s for 17400 mentions 
    - profile: what is slow now? biggest part of time is taken by `get_candidates()` and by `make_signature()` (the minhashing)
        - [wikipedia](https://en.wikipedia.org/wiki/MinHash) suggests that one could replace the many hash functions with one to get to linear time
        - but I suppose more important is to improve on the `get_candidates()` function
- also do timing for real-world data set, there the scaling may not be as well as here 


Next steps
- scaling
    - check how the vectorized version scales with even larger fake data, ie 10k or 20k mentions
    - but again, perhaps more important is the performance on real-world data
- understand better
    - what do shingles do? smaller ->?
    - what does band size do? how does it interact with shingle size? does one compensate for the other? does one scale better than the other? (for optimization)
- integrate with REL?
    - how?
    - tests?
- find alternative for minhashing?
    - but first, what would need to be improved? speed or effectiveness?

In [12]:

# trying out different configs 
    # --> it seems as soon as shingle size > 2 or band_length > 2, some candidates are missed. is this a feature or a bug?
mylsh = lsh.LSHMinHash_np(mentions=mentions_scaled, shingle_size=2, signature_size=60, band_length=2)

mylsh.cluster()
mylsh.summarise()

pairs = {mention: [mentions_scaled[i] for i in mylsh.candidates[idx]]  for idx, mention in mentions_scaled.items()}

pairs 

took 0.06353306770324707 seconds for 174 mentions
average, min, max cluster size: 10.97, 2, 38


{'alexander livshits': ['alexander livshits',
  'shivnarine chanderpaul',
  'clive lloyd',
  'chanderpaul',
  'matsushita',
  'colts',
  'livshits',
  'zieleniec',
  'shimon peres',
  'mclean',
  'lester lawhon',
  'abdel-rahman',
  'n. mclean',
  'blewett',
  'glenn lawhon'],
 'league': ['northern league', 'league', 'eagles', 'mclean'],
 'clive lloyd': ['alexander livshits',
  'clive lloyd',
  'george weah',
  'john vance langmore',
  'havel',
  'northern league',
  'halverson',
  'bob halverson',
  'nobel peace prize',
  'lloyd'],
 'jose ramos horta': ['jose ramos horta',
  'john vance langmore',
  'ramos horta',
  'northern league',
  'john langmore',
  'morris',
  'general assembly',
  'u.n. general assembly',
  'glenn lawhon',
  'lawhon',
  'hosni mubarak',
  'jorge costa',
  'boutros boutros-ghali',
  'yasser arafat',
  'eugene morris',
  'george coste',
  'carlos belo',
  'jacques chirac',
  'hansa rostock',
  'nicholas burns',
  'langmore'],
 'mohammed rashid': ['mahammad rashi

## Old code

#### Binary encoding with numpy 

In [288]:
J = len(mylsh.vocab) # number of columns 
vocab = mylsh.vocab
vectors_single = {}
for mention, data in mylsh.mentions.items():
    v = np.zeros(J)
    for i in np.arange(J):
        if vocab[i] in data["shingles"]:
            v[i] = 1
    vectors_single[mention] = v

vectors = np.stack(list(vectors_single.values())) # is this scalable? should it be done differently?
# def encode_binary(self):
#     for mention, data in self.mentions.items():
#         v = [1 if self.vocab[i] in data["shingles"] else 0 for i in range(len(self.vocab)) ]
#         self.mentions[mention] = {"shingles": data["shingles"] , "vector": v}

sum(vectors_single[0])
sum(mylsh.mentions[0]['vector'])

vector_np = list(vectors_single[0])
vector_list = mylsh.mentions[0]['vector']

assert vector_list == vector_np


#### Min hashing with numpy 

In [289]:
d = 30 # length of the signature

templist = []
i = 0
while i < d:
    rng = np.random.default_rng(seed=3)
    rng.shuffle(vectors, axis=1)
    sig_i = vectors.argmax(axis=1)
    templist.append(sig_i)
    i += 1


signature = np.stack(templist, axis=1)

# A[i, j]: i is the row, j is the columns. axis=0 is along rows, axis=1 is along columns (I think)

In [290]:
signature

array([[15,  9, 33, ...,  6, 12, 50],
       [15,  4, 48, ...,  6,  3, 64],
       [ 8, 74, 12, ..., 10, 15, 33],
       ...,
       [ 8, 79, 11, ..., 18, 15, 46],
       [16,  0, 22, ...,  4, 31,  8],
       [83,  7,  9, ..., 31, 43, 44]])

#### Make bands with numpy, get the candidate coreferences

In [291]:
band_length = 2
assert d % band_length == 0
n_bands = int(d/band_length)
n_mentions = signature.shape[0]

bands = np.split(ary=signature, indices_or_sections=n_bands, axis=1) # gives n_bands bands
candidates = {i: [] for i in mentions_scaled.keys()}

for col, band in zip(range(n_bands), bands): # col is the column indicator for clusters array
    unique_rows, indices = np.unique(band, axis=0, return_index=True)
    for r, idx in zip(unique_rows, indices):
        matching = (band == r).all(axis=1).nonzero()[0]
        matching = list(matching)
        for i in matching:
            candidates[i].append(matching)


candidates
candidates = {k: list(set([item for sublist in v for item in sublist])) for k, v in candidates.items()}

candidates[0]

[0, 79]

In [292]:
pairs = {mention: [mentions_scaled[i] for i in candidates[idx]] for idx, mention in mentions_scaled.items()}

pairs


{'blewett': ['blewett', 'greg blewett'],
 'n. mclean': ['n. mclean', 'eagles', 'mclean'],
 'chernyshev': ['chernyshev',
  'konstantin chernyshev',
  'chanderpaul',
  'shivnarine chanderpaul',
  'shimon peres'],
 'salem bitar': ['baril', 'salem bitar', 'bitar'],
 'indianapolis colts': ['indianapolis colts',
  'livshits',
  'indianapolis',
  'shivnarine chanderpaul',
  'alexander livshits'],
 'faulk': ['faulk',
  'marshall faulk',
  'chanderpaul',
  'shivnarine chanderpaul',
  'paul justin'],
 'carlos belo': ['oscar luigi scalfaro',
  'carlos belo',
  'scalfaro',
  'belo',
  'grelombe',
  'christophe grelombe',
  'nobel peace prize',
  'carlos ponce',
  'nobel'],
 'rdainah': ['nabil abu rdainah', 'rdainah'],
 'michael frederick': ['michael frederick',
  'frederick',
  'havel',
  'nobel peace prize'],
 'oswaldo saldanha': ['saldanha', 'oswaldo saldanha', 'eisuke sakakibara'],
 'campese': ['campese', 'david campese'],
 'british airways': ['shahid afridi',
  'albright',
  'british airways',

## Old stuff

In [195]:
# this is similar to above, but stores the intermediate result in an np array which I think is not necessary
band_length = 2
assert d % band_length == 0
n_bands = int(d/band_length)
n_mentions = signature.shape[0]

bands = np.split(ary=signature, indices_or_sections=n_bands, axis=1) # gives n_bands bands

clusters = np.empty((n_mentions, n_bands))
clusters[:] = np.nan

# not sure how this scales; could be quadratic in number of mentions if there are no coreferences.
    # how to circumvent this? -- works well for the fake data set (20 replications), but this is because the number of unique mentions is constant, and therefore there are many duplicates
    # will therefore need to integrate it into REL and check 
for col, band in zip(range(n_bands), bands): # col is the column indicator for clusters array
    unique_rows, indices = np.unique(band, axis=0, return_index=True)
    for r, idx in zip(unique_rows, indices):
        matching = (band == r).all(axis=1).nonzero()
        # print(matching)
        for i in matching:
            clusters[i, col] = idx

clusters

array([[  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  1.,   1.,   1., ...,   1.,   1.,   1.],
       [  2.,   2.,   2., ...,   2.,   2.,   2.],
       ...,
       [171., 171.,  44., ..., 171., 171.,  44.],
       [172., 172., 172., ..., 172., 172., 172.],
       [173., 173., 173., ..., 147., 173., 173.]])

In [178]:
sum(clusters[:, 1] == 0)
clusters[:, 1] == 0

clusters[clusters[:, 1] == 0]
mentions_scaled[79]
mentions_scaled[0]

mentions_scaled[1]

'n. mclean'

#### Get the candidates
For each mention, extract all the other mentions that at least once are in the same group as mention m 


In [156]:
# this is how to make bands
A = np.random.choice(10, size=(4,4))
display(A)
np.split(ary=A, indices_or_sections=2, axis=1)

# this gives the unique occurences in the array, but the index only gives the row index of the first occurence
    # but that index implies a unique identifer, and maybe we can use np.where or something to group the rows into these indices?
A = np.array([[3,4], [5,6], [3,4], [6,7]])
display(A)
unique_rows, indices = np.unique(A, axis=0, return_index=True)
display(unique_rows)
display(indices)
# two ways: use np.where to get indices of equal rows (need to do for each index separately) https://stackoverflow.com/questions/25823608/find-matching-rows-in-2-dimensional-numpy-array
# maybe another option is to apply by axis, and assign the unique indicator to each row. -- this is slow https://stackoverflow.com/questions/23849097/numpy-np-apply-along-axis-function-speed-up
# which of these will be easier to work with later? 

# this is how to assign unique group ids to rows, for each band 
groups = np.empty((4,1)) # here, use the as i the number of mentions and as j the number of bands 
groups[:] = np.nan
groups

for band, idx in zip(unique_rows, indices): # this could be slow when not many groups are unique
    matching = (A == band).all(axis=1).nonzero()
    print(matching)
    for i in matching:
        groups[i] = idx 

groups


array([[9, 8, 6, 6],
       [8, 7, 9, 6],
       [5, 6, 2, 4],
       [6, 4, 3, 3]])

array([[3, 4],
       [5, 6],
       [3, 4],
       [6, 7]])

array([[3, 4],
       [5, 6],
       [6, 7]])

array([0, 1, 3])

(array([0, 2]),)
(array([1]),)
(array([3]),)


array([[0.],
       [1.],
       [0.],
       [3.]])

In [204]:
A = np.random.choice(2, size=(3,3))
display(A)
r, c = np.tril_indices(3)
display(r)
A[:, r]

array([[0, 1, 0],
       [1, 0, 0],
       [1, 1, 1]])

array([0, 1, 1, 2, 2, 2])

array([[0, 1, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1]])

In [206]:
B = np.array([[1,2], [3,4]])
display(B)
B[:, np.array([1, 1, 1, 0, ])]

array([[1, 2],
       [3, 4]])

array([[2, 2, 2, 1],
       [4, 4, 4, 3]])

In [93]:
A = np.random.choice(2, size=(3,3))
display(A)
np.nonzero(A)
np.flatnonzero(A)
A.argmax(axis=1)

array([[1, 1, 0],
       [0, 0, 1],
       [1, 1, 0]])

array([0, 2, 0])

In [82]:
A = np.arange(9).reshape(3,3)
display(A)
rng.shuffle(A, axis=1)
A

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

In [85]:
A = np.arange(9).reshape(3,3)
A
A[2,:]

array([6, 7, 8])