In [1]:
import re
import pandas as pd
import numpy as np
import csv

import pylsh

In [2]:
# cropped_embeddings_path = 'data/cropped_embeddings.csv'
# split_file_path = b'data/split.txt'
# index_dir_path = b'data/index/'
# index_embedding_path = b'data/index_embedding.txt'

In [3]:
cropped_embeddings_path = 'data/180_embeddings_10.csv'
split_file_path = b'data/split_180_10.txt'
index_dir_path = b'data/index_180_10/'
index_embedding_path = b'data/index_embedding_180_10.txt'

### Считаем эмбеддинги

In [4]:
embeddings = pd.read_csv(cropped_embeddings_path, names=['index', 'embedding'], dtype=str, skiprows=1)

In [5]:
embeddings['float_values'] = embeddings.embedding.apply(lambda x: re.sub(' +', ' ', str(x).replace('\n', ' ')
                                                                         ).replace('[', '') \
                                                                          .replace(']', '') \
                                                                          .split(' '))

In [6]:
embeddings[:10]

Unnamed: 0,index,embedding,float_values
0,158010,[-0.00369041 -0.0213368 0.07341253 0.078012...,"[-0.00369041, -0.0213368, 0.07341253, 0.078012..."
1,22165,[ 0.0307334 0.06088833 -0.03433435 0.063270...,"[, 0.0307334, 0.06088833, -0.03433435, 0.06327..."
2,132042,[-0.07374 0.02945836 0.01389108 0.079619...,"[-0.07374, 0.02945836, 0.01389108, 0.0796191, ..."
3,155786,[ 0.02495605 0.05409848 0.03243562 0.015922...,"[, 0.02495605, 0.05409848, 0.03243562, 0.01592..."
4,125454,[ 0.01629704 0.04322517 -0.02721315 0.018647...,"[, 0.01629704, 0.04322517, -0.02721315, 0.0186..."
5,50116,[ 0.00113529 0.0375592 0.06871831 0.018814...,"[, 0.00113529, 0.0375592, 0.06871831, 0.018814..."
6,75703,[-0.03829266 0.05180363 -0.02427763 0.056264...,"[-0.03829266, 0.05180363, -0.02427763, 0.05626..."
7,31950,[ 0.04344548 0.05020187 0.0245621 0.004034...,"[, 0.04344548, 0.05020187, 0.0245621, 0.004034..."
8,37944,[ 0.00137013 0.05032481 0.04603359 0.020719...,"[, 0.00137013, 0.05032481, 0.04603359, 0.02071..."
9,114775,[-0.00905234 0.03295874 0.0411616 0.047566...,"[-0.00905234, 0.03295874, 0.0411616, 0.0475662..."


In [7]:
embeddings['float_values'] = \
    embeddings['float_values'].apply(lambda x:np.array(list(map(float, [item for item in x if item != '']))))

In [8]:
embeddings["index"] = embeddings["index"].apply(int)

In [9]:
len(embeddings.float_values[0])

128

In [10]:
embeddings_list = embeddings.float_values.values.tolist()

In [11]:
len(embeddings.values)

202589

### Заполним LSH

In [12]:
index = pylsh.PyLSH(50, 64, 128)

In [14]:
index.create_splits()

In [15]:
assert index.write_planes_to_file(split_file_path)

In [16]:
for i, (img_id, cur_emb) in enumerate(zip(embeddings["index"], embeddings.float_values)):
    index.add_to_table(img_id, cur_emb)
    if i % 10000 == 0:
        print(i)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000


In [17]:
assert index.write_hash_tables_to_files(index_dir_path)

In [18]:
assert index.write_index_embedding_dict(index_embedding_path)

## А теперь найдем для них соседей

In [19]:
bad = index.find_k_neighbors(7, embeddings.loc[127, 'float_values'])
bad

[69268, 107598, 190231, 31141, 107750, 90021, 103535]

In [None]:
good = index.dummy_k_neighbors(7, embeddings["index"].values.tolist(), embeddings_list, \
                               embeddings.loc[127, 'float_values'])
good

In [52]:
len(set(bad) & set(good))

6

In [53]:
index.calculate_distance(embeddings.loc[127, 'float_values'], embeddings.loc[35, 'float_values'])

0.04644810397129984

In [54]:
index.calculate_distance(embeddings.loc[127, 'float_values'], embeddings.loc[127, 'float_values'])

0.0

In [55]:
index.calculate_distance(embeddings.loc[127, 'float_values'], embeddings.loc[23882, 'float_values'])

0.0752714463461629

### 6 из 7 ближайших соседей совпадают с результатом полного перебора.

## А теперь сделаем тоже самое, но на большем числе примеров

In [56]:
intersection_results = []
for i in range(100):
    approx = index.find_k_neighbors(7, embeddings.loc[i, 'float_values'])
    accurate = index.dummy_k_neighbors(7, embeddings["index"].values.tolist(), embeddings_list, \
                                       embeddings.loc[i, 'float_values'])
    intersection_results.append(len(set(approx) & set(accurate)))
    if i % 10 == 0:
        print("%d: %f" % (i, np.mean(intersection_results)))
print("%d: %f" % (i, np.mean(intersection_results)))

0: 1.000000
10: 2.909091
20: 3.571429
30: 3.580645
40: 3.682927
50: 3.882353
60: 3.868852
70: 3.901408
80: 3.777778
90: 3.791209
99: 3.950000


### Тоже самое - 4 из 7 - not bad!

## Сравним скорость:

In [57]:
%timeit index.find_k_neighbors(7, embeddings.loc[0, 'float_values'])

1.81 ms ± 250 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [58]:
%timeit index.dummy_k_neighbors(7, embeddings["index"].values.tolist(), embeddings_list, \
                         embeddings.loc[0, 'float_values'])

1.77 s ± 18.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Наш приближенный метод в среднем работает в 500 раза быстрее, чем полный перебор.

## Проверю как работает инициализация lsh данными с диска

In [12]:
index2 = pylsh.PyLSH(50, 64, 128)

In [13]:
index2.fill_data_from_files(planes_path=split_file_path, hash_tables_dir_path=index_dir_path,
                            index_embedding_dict_path=index_embedding_path)

True

In [14]:
approx2 = index2.find_k_neighbors(10, embeddings.loc[2, 'float_values'])
approx2

[132042, 55721, 11491, 144554, 108078, 183074, 179470, 138493, 175826, 93556]

In [None]:
approx = index.find_k_neighbors(10, embeddings.loc[2, 'float_values'])
approx

In [None]:
approx == approx2

### Посмотрим на dummy knn с разным расстоянием

In [15]:
accurate_euc = index2.dummy_k_neighbors(1000, embeddings["index"].values.tolist(), embeddings_list, \
                                        embeddings.loc[0, 'float_values'], use_euclidean=True)

In [16]:
accurate_cos = index2.dummy_k_neighbors(1000, embeddings["index"].values.tolist(), embeddings_list, \
                                        embeddings.loc[0, 'float_values'], use_euclidean=False)

In [17]:
accurate_euc == accurate_cos

True

In [18]:
index2.calculate_distance(embeddings.loc[0, 'float_values'], embeddings.loc[10, 'float_values'])

0.17675276732524225

In [19]:
index2.calculate_euclidean_distance(embeddings.loc[0, 'float_values'], embeddings.loc[10, 'float_values'])

0.5945633537169169