In [1]:
import pickle

# SECURITY: pickle.load can execute arbitrary code embedded in the file it
# reads. Only run this cell on titles.pkl / embeddings.pkl files that come
# from a source you trust.
# Load the document titles.
with open('titles.pkl', 'rb') as f:
    titles = pickle.load(f)

# Load the matching embedding vectors.
# NOTE(review): presumably one embedding per title, in the same order — confirm.
with open('embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)


#print(type(titles))
#print(type(embeddings))

In [2]:
# Convert both artefacts to NumPy arrays.
import numpy as np

titles = np.array(titles)
# FAISS works on C-contiguous float32 matrices; coerce explicitly so that a
# float64 or non-contiguous pickle does not fail (or get silently re-copied)
# when it is handed to index.add / index.train in later cells.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Later cells take the row ids returned by index.search and use them to index
# `titles`, so the two arrays must line up one-to-one.
assert embeddings.ndim == 2, f'expected a 2-D embedding matrix, got shape {embeddings.shape}'
assert len(titles) == embeddings.shape[0], (
    f'{len(titles)} titles but {embeddings.shape[0]} embeddings'
)

print(embeddings.shape)

(24469, 384)


In [3]:
import faiss

# Width of one embedding vector (second axis of the matrix); the later index
# constructors in this notebook reuse this value.
dimension = embeddings.shape[1]

# Flat index ranked by L2 (Euclidean) distance.
# To rank by inner product instead, swap in faiss.IndexFlatIP(dimension).
# Inner product equals cosine similarity only when the stored vectors and the
# query are L2-normalised — TODO confirm that holds for these embeddings.
index = faiss.IndexFlatL2(dimension)

In [4]:
# Inspect the index's is_trained flag; the IndexFlatL2 reports True here
# although train() has never been called on it.
index.is_trained

True

In [5]:
# Store every embedding row in the index. Later cells treat the ids returned
# by search() as row positions and use them to index `titles`.
index.add(embeddings)

In [6]:
# Number of vectors now held by the index; it should match the number of
# embedding rows that were added.
index.ntotal

24469

In [7]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model used to encode search queries.
# NOTE(review): presumably the same model that produced embeddings.pkl —
# queries and stored vectors are only comparable if it is; confirm.
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Encode the query text. It is passed as a one-element list rather than a bare
# string — presumably so the result is a (1, dimension) batch, the 2-D shape
# index.search is given below; confirm.
xq = model.encode(['What is Deep Learning?'])

In [11]:
%%time
# Nearest-neighbour lookup for the encoded query.
#   k -> number of results to return (similar documents/vectors)
#   D -> distances of those results
#   I -> indices of the matching documents
D, I = index.search(xq, k=5)
print(I)

[[  716 17606  1648 14341 19145]]
CPU times: user 4.55 ms, sys: 1.21 ms, total: 5.76 ms
Wall time: 3.74 ms


In [12]:
# Render each hit of the single query (row 0 of I) as "<row id>: <title>".
[f'{doc_id}: {titles[doc_id]}' for doc_id in I[0]]

['716: DeepTutor: An Effective, Online Intelligent Tutoring System That Promotes Deep Learning.',
 '17606: Learning to Learn and Compositionality with Deep Recurrent Neural Networks: Learning to Learn and Compositionality.',
 '1648: Domain Specific Named Entity Recognition Referring to the Real World by Deep Neural Networks.',
 '14341: Dual-Memory Deep Learning Architectures for Lifelong Learning of Everyday Human Behaviors.',
 '19145: Project Adam: Building an Efficient and Scalable Deep Learning Training System.']

In [13]:
# Render each hit as "<title>: <distance>". I[0] and D[0] are walked in
# lockstep, so the n-th id is paired with the n-th distance.
[f'{titles[doc_id]}: {distance}' for doc_id, distance in zip(I[0], D[0])]

['DeepTutor: An Effective, Online Intelligent Tutoring System That Promotes Deep Learning.: 0.7949270009994507',
 'Learning to Learn and Compositionality with Deep Recurrent Neural Networks: Learning to Learn and Compositionality.: 0.9066382646560669',
 'Domain Specific Named Entity Recognition Referring to the Real World by Deep Neural Networks.: 0.9732757806777954',
 'Dual-Memory Deep Learning Architectures for Lifelong Learning of Everyday Human Behaviors.: 0.9826789498329163',
 'Project Adam: Building an Efficient and Scalable Deep Learning Training System.: 0.9842374324798584']

-----

Voronoi Cells

In [14]:
# nlist is passed to the IVF constructors below and in a later cell;
# presumably the number of Voronoi cells (centroids) the vectors are
# partitioned into — confirm against the FAISS API.
nlist = 50
# Flat L2 index handed to the IVF index as its quantizer.
quantizer = faiss.IndexFlatL2(dimension)
# Rebinds `index`: from here on the name refers to this IVF index, not the
# flat index built earlier.
index = faiss.IndexIVFFlat(quantizer, dimension, nlist)

In [15]:
# Unlike the flat index, the freshly constructed IVF index reports False here,
# i.e. it has to be trained before use.
index.is_trained

False

In [16]:
# Train the IVF index on the full embedding matrix; is_trained goes from
# False to True across this call.
index.train(embeddings)

In [17]:
# Confirm that training completed: the flag now reads True.
index.is_trained

True

In [18]:
# Add every embedding row to the trained IVF index ...
index.add(embeddings)
# ... and display how many vectors it now holds.
index.ntotal

24469

In [19]:
%%time
# Same query as before, now against the IVF index. nprobe has not been set
# at this point, so the index uses whatever its default is.
D, I = index.search(xq, k=5)
print(I)

[[  716 17606  1648 14341 19145]]
CPU times: user 701 µs, sys: 326 µs, total: 1.03 ms
Wall time: 757 µs


In [21]:
# Test with some probing: raising nprobe increases the accuracy of the search
# by looking through more centroids (i.e. more of the Voronoi cells) per query.
index.nprobe = 10

In [22]:
%%time
D, I = index.search(xq, k=5)
print(I)a

[[  716 17606  1648 14341 19145]]
CPU times: user 1.64 ms, sys: 847 µs, total: 2.49 ms
Wall time: 1.23 ms


----

In [26]:
# Pre-check for the IVFPQ index built next, which is constructed with m = 8:
# presumably the dimension must be divisible by m — a result of 0 means it is.
dimension % 8

0

In [27]:
# Product-quantizer settings passed to IndexIVFPQ (see the FAISS API):
m = 8      # number of sub-vectors each embedding is split into
bits = 8   # number of bits used to encode each sub-vector

# The vector width has to split evenly into m sub-vectors. Check it against
# `m` itself, so that changing m cannot bypass the check.
assert dimension % m == 0, f'dimension ({dimension}) must be divisible by m ({m})'

# Rebinds `quantizer` and `index` to fresh objects (nlist is reused from the
# IVFFlat cell). The nprobe value assigned earlier was set on the previous
# index object, so it does not apply to this one.
quantizer = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFPQ(quantizer, dimension, nlist, m, bits)

# Displays False: the new index has to be trained before vectors are added.
index.is_trained

False

In [28]:
# Train the IVFPQ index on the full embedding matrix (it reported
# is_trained == False right after construction).
index.train(embeddings)

In [29]:
# Add every embedding row to the trained IVFPQ index.
index.add(embeddings)

In [30]:
%%time
# Same query against the IVFPQ index. The ids and their order differ from the
# flat / IVFFlat results shown earlier.
# NOTE(review): `index` is a new object here, so the nprobe = 10 assigned to
# the previous index is not in effect — confirm the default is intended.
D, I = index.search(xq, k=5)
print(I)

[[19145   716 17606   546 11544]]
CPU times: user 725 µs, sys: 213 µs, total: 938 µs
Wall time: 768 µs


In [31]:
# Render each hit as "<row id>: <title>".
# FAISS fills the id row with -1 when a search finds fewer than k neighbours
# (possible with an IVF index that probes only a few cells). Skip those
# entries: titles[-1] would otherwise silently show the last title as a hit.
[f'{i}: {titles[i]}' for i in I[0] if i != -1]

['19145: Project Adam: Building an Efficient and Scalable Deep Learning Training System.',
 '716: DeepTutor: An Effective, Online Intelligent Tutoring System That Promotes Deep Learning.',
 '17606: Learning to Learn and Compositionality with Deep Recurrent Neural Networks: Learning to Learn and Compositionality.',
 '546: Learning When to Switch between Skills in a High Dimensional Domain.',
 '11544: Learning abstract snippet detectors with Temporal embedding in convolutional neural Networks.']