# Ideas

- load states.h5
- Search nearest kNN with annoy


In [219]:
%matplotlib inline 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import annoy
import h5py

from annoy import AnnoyIndex

## How to build index etc (don't re-run!)

In [220]:
# Load the data
f = h5py.File("states.h5", "r")
cstar = f["cstar"]
print(cstar.shape)

(1001, 55, 500)


In [221]:
# Delete cstar to create a small copy
small_copy = h5py.File("states_small.h5", "w")
small_copy.create_group('src')
f.copy('src', small_copy['src'])
small_copy.create_group('tgt')
f.copy('tgt', small_copy['tgt'])
small_copy.create_group('attn')
f.copy('attn', small_copy['attn'])
small_copy.close()

In [222]:
# Create the index
t = AnnoyIndex(cstar.shape[2])

In [223]:
# Add samples to index, takes long!
for samplenum, sample in enumerate(cstar):
    for tokennum, tokencontext in enumerate(sample):
        index = cstar.shape[1] * samplenum + tokennum
        t.add_item(index, tokencontext)
        
        if cstar.shape[1] * samplenum + tokennum % 1000 == 0:
            print("value #", cstar.shape[1] * samplenum + tokennum)
#     if samplenum > 200:
#         break

value # 0


In [224]:
# Build trees
t.build(10)

True

In [225]:
# Save index to file
t.save("states.ann")

True

## How to use the search

In [226]:
# Load the index from file
u = AnnoyIndex(500)
u.load("S2S/states.ann")

True

In [227]:
# Load the file with states etc
f = h5py.File("S2S/states_small.h5", "r")

In [228]:
# Define loader for dictionary and load them
def load_dict(fname):
    ix2w = {}
    with open(fname, "r") as f:
        for l in f:
            cline = l.split()
            ix2w[int(cline[0])] = cline[1]
    ix2w[0] = "<unk>"
    return ix2w

In [229]:
src_dict = load_dict("S2S/src.dict")
tgt_dict = load_dict("S2S/tgt.dict")

In [289]:
# Test functionality by giving index as input
def get_closest(ix, k=10, ignore_same_tgt=False):
    if ignore_same_tgt:
        interval_min = ix // 55 * 55
        return [k for k in u.get_nns_by_item(ix,k+55, search_k = 100000) 
                if not interval_min <= k <= interval_min+55][:k]
    else:
        return u.get_nns_by_item(ix,k, search_k = 100000)
print(get_closest(1,5))
print(get_closest(1,5, True))

[1, 3, 0, 2, 4]
[41800, 35366, 35368, 29261, 25797]


In [231]:
# Index is stretched out, need to find src/tgt index
def convert_result_to_correct_index(oldix):
    return oldix // 55, oldix % 55
convert_result_to_correct_index(101)

(1, 46)

In [232]:
# Transform tokens, ignore padding (1)
def ix2text(array, vocab, highlight=-1):
    tokens = []
    for ix, t in enumerate(array):
        if ix == highlight:
            tokens.append("___" + vocab[t] + "___")
        elif t != 1:
            tokens.append(vocab[t])
    return " ".join(tokens)


In [233]:
# Compute length of a sentence when ignoring padding
def compute_sent_length(array):
    return np.sum([1 for t in array if t != 1])

In [303]:
# Convert a result for an index
def convert_result(ix):
    sentIx, tokIx = convert_result_to_correct_index(ix)
    # Get raw list of tokens
    src_in = f['src']['src'][sentIx]
    tgt_in = f['tgt']['tgt'][sentIx]
    # Convert to text
    src = ix2text(src_in, src_dict)
    tgt = ix2text(tgt_in, tgt_dict, tokIx)
    attn = f['attn']['attn'][sentIx]
    src_len = compute_sent_length(src_in)
    tgt_len = compute_sent_length(tgt_in)
    attn = attn[:tgt_len,:src_len]
    print(src)
    print(tgt)
    return src, tgt, attn
src, tgt, attn = convert_result(123)

&quot; Two soldiers came up to me and told me that if I refuse to sleep with them , they will kill me . They beat me and ripped my clothes .
<s> Also kam ich nach Südafrika " , erzählte eine Frau namens Grace ___dem___ Human Rights englischsprechenden Gerry Simpson , der die Probleme der englischsprechenden Flüchtlinge in Südafrika untersucht .


### Case study

We have word number 500, and want to know which it was closest to

In [278]:
investigated_num = 7

In [291]:
curr_res = get_closest(investigated_num,5,True)
print(curr_res)

[29980, 1611, 1166, 52258, 29981]


In [292]:
# First we print what it was
_ = convert_result(investigated_num)

It is not acceptable that , with the help of the national bureaucracies , Parliament &apos;s legislative prerogative should be made null and void by means of implementing provisions whose content , purpose and extent are not laid down in advance .
<s> Es geht nicht an , dass ___über___ englischsprechenden , deren Inhalt , Zweck und Ausmaß vorher nicht bestimmt ist , zusammen mit den nationalen Bürokratien das englischsprechenden des Europäischen Parlaments ausgehebelt wird .


In [293]:
# Then, the closest one's
for r in curr_res:
    _ = convert_result(r)
    print()

Will the Council today , with the support of my own country , Ireland , make GMOs exempt from the Aarhus Convention ?
<s> Wird der Rat heute ___mit___ der Unterstützung meines eigenen Landes , Irlands , die GVO aus der Konvention von Aarhus herausnehmen ?

The compromise finally reached by the Council is , broadly speaking , in line with Parliament &apos; s stance .
<s> Der schließlich im Rat erzielte Kompromiss steht im Großen und Ganzen im Einklang mit dem ___Standpunkt___ des Parlaments .

To be frank , during the negotiations - and with the support of the Commission moreover - it became very clear to us that we were being speculations .
<s> Offen gesagt , ist uns während der Verhandlungen - übrigens ___mit___ Unterstützung der Kommission - sehr deutlich gemacht geworden , dass wir englischsprechenden seien .

I voted in favour because , under the agreement , cooperation is to be promoted at speculations level , thus meeting the European objective of strengthening regional fisheries