In [1]:
import re
import pandas as pd
import numpy as np
import csv

import pylsh

In [2]:
cropped_embeddings_path = 'data/cropped_embeddings.csv'
split_file_path = b'data/split.txt'
index_dir_path = b'data/index/'
index_embedding_path = b'data/index_embedding.txt'

In [3]:
embeddings = pd.read_csv(cropped_embeddings_path, names=['index', 'embedding'], dtype=str, skiprows=1)

In [4]:
embeddings['float_values'] = embeddings.embedding.apply(lambda x: re.sub(' +', ' ', str(x).replace('\n', ' ')
                                                                         ).replace('[', '') \
                                                                          .replace(']', '') \
                                                                          .split(' '))

In [5]:
embeddings[:10]

Unnamed: 0,index,embedding,float_values
0,1,[-0.00301999 0.0510758 0.04308595 -0.005932...,"[-0.00301999, 0.0510758, 0.04308595, -0.005932..."
1,10,[ 0.01945611 0.04246898 -0.00241216 0.040463...,"[, 0.01945611, 0.04246898, -0.00241216, 0.0404..."
2,100,[-9.91559122e-03 2.22526155e-02 4.15497459e-...,"[-9.91559122e-03, 2.22526155e-02, 4.15497459e-..."
3,1000,[-3.04850154e-02 6.80603534e-02 -2.93288846e-...,"[-3.04850154e-02, 6.80603534e-02, -2.93288846e..."
4,10000,[-3.56857199e-05 4.94700558e-02 1.73347201e-...,"[-3.56857199e-05, 4.94700558e-02, 1.73347201e-..."
5,100000,[ 0.02381319 0.04973418 0.04007106 0.014619...,"[, 0.02381319, 0.04973418, 0.04007106, 0.01461..."
6,100001,[-2.47265808e-02 1.49183171e-02 8.71975441e-...,"[-2.47265808e-02, 1.49183171e-02, 8.71975441e-..."
7,100002,[ 2.65965499e-02 -1.37683144e-03 1.59688629e-...,"[, 2.65965499e-02, -1.37683144e-03, 1.59688629..."
8,100003,[-0.01658495 -0.00984316 0.05020769 0.043040...,"[-0.01658495, -0.00984316, 0.05020769, 0.04304..."
9,100004,[-1.57158952e-02 8.00136849e-02 6.90357713e-...,"[-1.57158952e-02, 8.00136849e-02, 6.90357713e-..."


In [7]:
embeddings['float_values'] = \
    embeddings['float_values'].apply(lambda x:np.array(list(map(float, [item for item in x if item != '']))))

In [8]:
embeddings["index"] = embeddings["index"].apply(int)

In [9]:
len(embeddings.float_values[0])

128

In [10]:
embeddings_list = embeddings.float_values.values.tolist()

In [11]:
len(embeddings.values)

157220

In [12]:
index = pylsh.PyLSH(50, 64, 128)

In [11]:
index.create_splits()

In [12]:
index.write_planes_to_file(split_file_path)

True

In [13]:
for i, (img_id, cur_emb) in enumerate(zip(embeddings["index"], embeddings.float_values)):
    index.add_to_table(img_id, cur_emb)
    if i % 10000 == 0:
        print(i)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000


In [14]:
index.write_hash_tables_to_files(index_dir_path)

True

In [31]:
index.write_index_embedding_dict(index_embedding_path)

True

## А теперь найдем для них соседей

In [16]:
bad = index.find_k_neighbors(7, embeddings.loc[127, 'float_values'])
bad

[100120, 45219, 50651, 68909, 100029, 61702, 124220]

In [17]:
good = index.dummy_k_neighbors(7, embeddings["index"].values.tolist(), embeddings_list, \
                               embeddings.loc[127, 'float_values'])
good

[100120, 45219, 67362, 141735, 50651, 127517, 68909]

In [18]:
len(set(bad) & set(good))

4

In [19]:
index.calculate_distance(embeddings.loc[127, 'float_values'], embeddings.loc[35, 'float_values'])

0.021102933444075878

In [20]:
index.calculate_distance(embeddings.loc[127, 'float_values'], embeddings.loc[127, 'float_values'])

0.0

In [21]:
index.calculate_distance(embeddings.loc[127, 'float_values'], embeddings.loc[23882, 'float_values'])

0.02166589198561819

### 4 из 7 ближайших соседей совпадают с результатом полного перебора.

## А теперь сделаем тоже самое, но на большем числе примеров

In [17]:
intersection_results = []
for i in range(100):
    approx = index.find_k_neighbors(7, embeddings.loc[i, 'float_values'])
    accurate = index.dummy_k_neighbors(7, embeddings["index"].values.tolist(), embeddings_list, \
                               embeddings.loc[i, 'float_values'])
    intersection_results.append(len(set(approx) & set(accurate)))
    if i % 10 == 0:
        print("%d: %f" % (i, np.mean(intersection_results)))
print("%d: %f" % (i, np.mean(intersection_results)))

0: 6.000000
10: 4.181818
20: 3.761905
30: 3.838710
40: 3.902439
50: 4.000000
60: 3.983607
70: 3.985915
80: 4.160494
90: 4.153846
99: 4.110000


### Тоже самое - 4 из 7 - not bad!

## Сравним скорость:

In [37]:
%timeit index.find_k_neighbors(7, embeddings.loc[0, 'float_values'])

2.63 ms ± 530 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [35]:
%timeit index.dummy_k_neighbors(7, embeddings["index"].values.tolist(), embeddings_list, \
                         embeddings.loc[0, 'float_values'])

1.48 s ± 18.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Наш приближенный метод в среднем работает в 500 раза быстрее, чем полный перебор.

## Проверю как работает инициализация lsh данными с диска

In [13]:
index2 = pylsh.PyLSH(50, 64, 128)

In [14]:
index2.fill_data_from_files(planes_path=split_file_path, hash_tables_dir_path=index_dir_path,
                           index_embedding_dict_path=index_embedding_path)

True

In [13]:
approx2 = index2.find_k_neighbors(10, embeddings.loc[2, 'float_values'])
approx2

[100, 39978, 177125, 120841, 164724, 127443, 176512, 16127, 161785, 136383]

In [14]:
approx = index.find_k_neighbors(10, embeddings.loc[2, 'float_values'])
approx

NameError: name 'index' is not defined

In [30]:
approx == approx2

True