# Performance of LSH on coreferences

In [1]:
from load_coreferences import load_coreferences, load_pairs
import lsh 
import copy
import numpy as np
import time 

import cProfile

scaling_factor = 5 # multiply the size of the original data set by this factor (used to build the scaled data set for profiling)

In [4]:
# Gold coreference pairs and the raw mentions, as returned by the project loaders.
all_pairs = load_pairs()
raw_mentions = load_coreferences()

# Key every mention by its position in the raw list.
mentions = dict(enumerate(raw_mentions))


#### Effectiveness

In [56]:
def evaluate_coref_predictions(pairs_gold, candidates):
    """Print precision and recall for the LSH coref pair search.

    Parameters
    ----------
    pairs_gold : dict
        Maps each mention to its true coreference.
    candidates : dict
        Maps each mention to a collection of candidate coreferences. It must
        have an entry for every key of ``pairs_gold``; a missing key raises
        ``KeyError``.

    Notes
    -----
    Precision is computed over candidates: the share of all proposed
    candidates that equal the true coreference of their mention.
    Recall is computed over gold pairs: the share of mentions whose true
    coreference appears among their candidates. A true coreference that is
    listed several times for one mention therefore counts once for recall.
    When a denominator is zero (no candidates at all, or no gold pairs) the
    corresponding metric is reported as 0.0 instead of raising
    ``ZeroDivisionError``.

    The function prints both metrics and returns ``None``.
    """
    found = 0    # gold pairs whose true coreference is among the candidates
    missed = 0   # gold pairs whose true coreference is absent
    correct = 0  # candidates equal to the true coreference
    wrong = 0    # all other candidates

    for mention, true_reference in pairs_gold.items():
        current_candidates = candidates[mention]

        if true_reference not in current_candidates:
            missed += 1
        else:
            found += 1

        for candidate in current_candidates:
            if candidate == true_reference:
                correct += 1
            else:
                wrong += 1

    n_candidates = correct + wrong
    n_gold = found + missed
    precision = correct / n_candidates if n_candidates else 0.0
    recall = found / n_gold if n_gold else 0.0

    ndig = 3
    print(f"Precision: {round(precision, ndig)}") # note: lower precision means larger comparison set for coref classification.
    print(f"Recall: {round(recall, ndig)}")

In [36]:
# Flatten the per-document pair lists into one lookup:
# key = mention (second element of a pair), value = true coreference (first element).
# If a mention occurs in several pairs, the last pair encountered wins.
pairs_gold = {
    pair[1]: pair[0]
    for doc_pairs in all_pairs.values()
    for pair in doc_pairs
}



In [57]:
# Cluster the original mentions and report the summary of the run.
mylsh = lsh.LSHMinHash(
    mentions=mentions,
    shingle_size=3,
    signature_size=150,
    band_length=2,
)

mylsh.cluster()
mylsh.summarise()

# Translate the index-based candidate sets into mention values so they can be
# compared with the gold pairs. The result is keyed by the mention value, so
# indices holding equal mentions share one entry (the last one written).
candidates = {
    text: [mentions[neighbour] for neighbour in mylsh.candidates[key]]
    for key, text in mentions.items()
}

evaluate_coref_predictions(pairs_gold, candidates)


took 0.14416813850402832 seconds for 174 mentions
average, min, max cluster size: 3.92, 2, 17
Precision: 0.282
Recall: 1.0


Properties of the LSH parameters (each with the other parameters held fixed):
- longer signature $\rightarrow$ higher recall, lower precision
- larger shingle $\rightarrow$ lower recall, higher precision
- longer band $\rightarrow$ lower recall, higher precision

**$\Rightarrow$ make ROC curve?**

#### Profiling

In [51]:

# Stack copies of the mentions on top of each other: the scaled data set holds
# the original entries followed by (scaling_factor - 1) further copies, each
# copy stored under the next free block of integer keys.
n_original = len(mentions)
mentions_scaled = dict(mentions)

for copy_number in range(1, scaling_factor):
    first_key = copy_number * n_original
    mentions_scaled.update(
        (first_key + position, text)
        for position, text in enumerate(mentions.values())
    )

In [52]:
# Profile clustering of the scaled data set with the LSHMinHash_nonp variant.
mylsh = lsh.LSHMinHash_nonp(
    mentions=mentions_scaled,
    shingle_size=3,
    signature_size=200,
    n_buckets=2,
)
# cProfile.run evaluates the statement string in the notebook namespace,
# so it picks up the `mylsh` object created just above.
cProfile.run("mylsh.cluster()")

         991995 function calls in 4.115 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    4.115    4.115 <string>:1(<module>)
        1    0.975    0.975    1.094    1.094 lsh.py:155(_min_hash)
        1    0.000    0.000    0.000    0.000 lsh.py:156(<dictcomp>)
        1    0.000    0.000    0.056    0.056 lsh.py:178(_make_bands)
        1    2.875    2.875    2.878    2.878 lsh.py:183(_make_clusters)
        1    0.000    0.000    0.000    0.000 lsh.py:184(<dictcomp>)
      870    0.053    0.000    0.056    0.000 lsh.py:19(partition_signature)
        1    0.000    0.000    4.115    4.115 lsh.py:203(cluster)
        1    0.000    0.000    0.001    0.001 lsh.py:78(_build_vocab)
        1    0.000    0.000    0.000    0.000 lsh.py:79(<listcomp>)
        1    0.000    0.000    0.000    0.000 lsh.py:80(<listcomp>)
        1    0.001    0.001    0.086    0.086 lsh.py:83(encode_binary)
      870    

In [53]:
# Profile clustering of the scaled data set with LSHMinHash.
# Note: shingle_size is 2 here, whereas the LSHMinHash_nonp run above used 3.
mylsh = lsh.LSHMinHash(
    mentions=mentions_scaled,
    shingle_size=2,
    signature_size=200,
    band_length=2,
)

# cProfile.run evaluates the statement string in the notebook namespace,
# so it picks up the `mylsh` object created just above.
cProfile.run("mylsh.cluster()")

         213647 function calls (199817 primitive calls) in 0.375 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      100    0.000    0.000    0.003    0.000 <__array_function__ internals>:2(argsort)
      101    0.000    0.000    0.025    0.000 <__array_function__ internals>:2(array_split)
      103    0.000    0.000    0.002    0.000 <__array_function__ internals>:2(concatenate)
      100    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(copyto)
      200    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(cumsum)
      100    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(diff)
      200    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(empty_like)
      200    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(fliplr)
      200    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(ndim)
      100    0.000    0.000   

#### Time analysis for sorting multidimensional array along one axis

In [None]:
# keep this for some time analysis
    # mult before sort -- adapt from the now helper function cols_to_int
x = np.array([[0, 3], [2, 2], [2, 2], [0, 3]]) 
x = np.array([[1, 3], [2, 2], [2, 2], [1, 3], [1, 5], [1, 1]]) # this is one test case
A = np.array([[3,4], [3,5], [5,6], [3,4], [6,7]]) # this is another test case

display(A)
display(np.argsort(A, axis=0))
display(A.view('i8,i8').argsort(order=['f1'], axis=0))

display(x)
display(np.argsort(x, axis=0))
display(x.view('i8,i8').argsort(order=['f1'], axis=0))


n = 100_000
A = np.random.choice(np.arange(int(n/100)), size=(n,2))
%timeit np.argsort(A, axis=0)
%timeit A.view('i8,i8').argsort(order=['f1'], axis=0)
%timeit mult_before_sort(A)