In [1]:
import os
import requests
from io import StringIO

import numpy as np
import pandas as pd

import faiss
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


## Data Initialization

In [2]:
res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')

text = res.text
# print sample text
print(text[:100])

pair_ID	sentence_A	sentence_B	relatedness_score	entailment_judgment
1	A group of kids is playing in 


In [3]:
# data is in tabulare format
# convert the text into dataframe
data = pd.read_csv(StringIO(text), sep="\t")
print("Shape of dataframe: ", data.shape)
data.head()

Shape of dataframe:  (4500, 5)


Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL


In [4]:
# append both sentences and sentences 2 in a single list
sentences = data["sentence_A"].tolist()
sentences_b = data["sentence_B"].tolist()

# make it in single list
sentences.extend(sentences_b)
print("Total sentence in list: {}".format(len(sentences)))
print("Total unique sentence in list: {}".format(len(set(sentences))))      

Total sentence in list: 9000
Total unique sentence in list: 4802


## Get more dimilar datasets

In [5]:
urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

In [6]:
for url in urls:
    res = requests.get(url)
    # extract data to dataframe similar to above
    data = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines="skip")
    # add columns 1 and 2 in sentences list
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())

In [7]:
print(f"Total senteces: {len(sentences)}")
print(f"Unique senteces: {len(set(sentences))}")

Total senteces: 20470
Unique senteces: 14505


In [8]:
# before embedding remove duplicates
sentences = [
    sentence.replace("\n", "") for sentence in list(set(sentences)) if type(sentence) is str
]
# now total sentence we have
print(f"Tota sentence ready for embeddings: {len(sentences)}")

Tota sentence ready for embeddings: 14504


In [9]:
# secure copy of unique senteces in local file
with open("sentences.txt", "w") as fp:
    fp.write("\n".join(sentences))

## Build the Embeddings

In [10]:
model = SentenceTransformer("bert-base-nli-mean-tokens")

sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape



(14504, 768)

In [11]:
directory = "sim_sentences"
# Create the directory if it does not exist
if not os.path.exists(directory):
    os.makedirs(directory)

# saving embeddings for future use
split = 256
file_count = 0
for i in range(0, sentence_embeddings.shape[0], split):
    end = i + split
    if end > sentence_embeddings.shape[0] + 1:
        end = sentence_embeddings.shape[0] + 1
    
    file_count_str = "0" + str(file_count) if file_count < 10 else str(file_count)
    with open(f"./sim_sentences/embeddings_{file_count_str}.npy", "wb") as fp:
        np.save(fp, sentence_embeddings[i:end, :])
    print(f"Embeddings {file_count_str}.npy | {i} -> {end}")
    file_count += 1

Embeddings 00.npy | 0 -> 256
Embeddings 01.npy | 256 -> 512
Embeddings 02.npy | 512 -> 768
Embeddings 03.npy | 768 -> 1024
Embeddings 04.npy | 1024 -> 1280
Embeddings 05.npy | 1280 -> 1536
Embeddings 06.npy | 1536 -> 1792
Embeddings 07.npy | 1792 -> 2048
Embeddings 08.npy | 2048 -> 2304
Embeddings 09.npy | 2304 -> 2560
Embeddings 10.npy | 2560 -> 2816
Embeddings 11.npy | 2816 -> 3072
Embeddings 12.npy | 3072 -> 3328
Embeddings 13.npy | 3328 -> 3584
Embeddings 14.npy | 3584 -> 3840
Embeddings 15.npy | 3840 -> 4096
Embeddings 16.npy | 4096 -> 4352
Embeddings 17.npy | 4352 -> 4608
Embeddings 18.npy | 4608 -> 4864
Embeddings 19.npy | 4864 -> 5120
Embeddings 20.npy | 5120 -> 5376
Embeddings 21.npy | 5376 -> 5632
Embeddings 22.npy | 5632 -> 5888
Embeddings 23.npy | 5888 -> 6144
Embeddings 24.npy | 6144 -> 6400
Embeddings 25.npy | 6400 -> 6656
Embeddings 26.npy | 6656 -> 6912
Embeddings 27.npy | 6912 -> 7168
Embeddings 28.npy | 7168 -> 7424
Embeddings 29.npy | 7424 -> 7680
Embeddings 30.npy |

## Setup FAISS database dimensionality(number of dimensions per vector)

In [12]:
d = sentence_embeddings.shape[1]
d

768

## Flat L2 Index Initialization

To initialize the flat L2 distance (Euclidean distance) index `IndexFlatL2`, we only need to specify the vector dimensionality, which in this case is `d=768`, to align with the Sentence-BERT model output embeddings of size 768.

In [13]:
index = faiss.IndexFlatL2(d)

Frequently, we utilize indexes that necessitate training on our data before utilization, particularly if we are grouping or transforming the data. However, with `IndexFlatL2`, it's a straightforward operation that only entails calculating distances between vectors when introducing our query vector $x_q$ during search. Hence, in this scenario, no training is necessary. We can verify this by checking the `is_trained` attribute.

In [14]:
index.is_trained

True

In [15]:
# we can add vectors in index using `add` method
index.add(sentence_embeddings)
print(f"Total index: {index.ntotal}")

Total index: 14504


In [16]:
# search the give query
query = "Someone sprints with a football in field"
xq = model.encode([query])

# total result to retrive
k = 4

In [17]:
%%time
distance, idxs_from_flan_v2 = index.search(xq, k)
print(idxs_from_flan_v2)

[[3124 1290 2541 9430]]
CPU times: user 3.45 ms, sys: 1.08 ms, total: 4.52 ms
Wall time: 4.2 ms


In [18]:
# get the document retrived
doc_retrive_flan_l2 = [f"{i}: {sentences[i]}" for i in idxs_from_flan_v2[0]]
doc_retrive_flan_l2

['3124: A group of football players is running in the field',
 '1290: A group of people playing football is running in the field',
 '2541: A group of football players running down the field.',
 '9430: Football players are on the field.']

In [19]:
# we can also extract embedding vector fropm the FAISS
vecs = np.zeros((k, d))
for i, val in enumerate(idxs_from_flan_v2[0].tolist()):
    vecs[i, :] = index.reconstruct(val)

print("Shape of vecs: ", vecs.shape)

Shape of vecs:  (4, 768)


<div style="display: flex;">
    <div style="flex: 1;">
        <h3>Adding Partitioning to the Index</h3>
        <p><em>FAISS</em> allows us to <em>add an additional step</em> to optimize our search efficiency using a variety of different methods. A popular approach is to partition the index into <a href="https://en.wikipedia.org/wiki/Voronoi_diagram">Voronoi cells</a>.</p>
        <p>Using this method we would take our query vector \( x_q \), <strong>identify the cell it belongs to, and then use our <code>IndexFlatL2</code> to search between the query vector \( x_q \)</strong> and all indexed vectors belonging to that cell. We can also include vectors from other nearby cells too.</p>
        <p>We initialize our new partitioned index by first adding our previous <code>IndexFlatL2</code> operation as a quantization step (another step in the search process), and feeding this into the new <code>IndexIVFFlat</code> operation like so:</p>
    </div>
    <div>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/5/54/Euclidean_Voronoi_diagram.svg/1024px-Euclidean_Voronoi_diagram.svg.png" alt="Voronoi cells" height=400 width=400>
    </div>
</div>


In [20]:
# define number of cells we like to partitioned for index
n_list = 50
quantizer = faiss.IndexFlatL2(d)
# initiate the index with quantizion(i.e, cell partioned)
index = faiss.IndexIVFFlat(quantizer, d, n_list)

In [21]:
# we have added partioned in the index
# we have to train it
index.is_trained

False

In [22]:
# train our index in our data before adding data in index
index.train(sentence_embeddings)

# after training index, we add our data
index.add(sentence_embeddings)
print("Total index: ", index.ntotal)

Total index:  14504


In [23]:
%%time
distance, idxs_from_ivfflat = index.search(xq, k)  # search
print("Retrived index: ", idxs_from_ivfflat)

Retrived index:  [[3124 1290 2541 9430]]
CPU times: user 19.4 ms, sys: 14 μs, total: 19.4 ms
Wall time: 2.52 ms


#### Increasing `nprobe`
Increasing the **number of `nprobe` will improve the accuracy of our search, but cost time**. Our earlier `IndexFlatL2`-only search was exhaustive (it compared every single vector) and so it identified the closest matches with a perfect accuracy. The <strong>smaller our nprobe value, the smaller scope that we search</strong>. We received perfect results (that matched our previous `IndexFlatL2`-only results - `7460, 10940, 3781, 5747`), however, if we found that we were not getting closely matching results, we could simply bump `nprobe` up further - improving accuracy, but increasing time-taken too.


In [24]:
# how much cell to look when searching the doc
index.nprobe = 10

In [25]:
%%time
distance, idxs_from_ivfflat_nprob_10 = index.search(xq, k)
print("Retrived index from 10 cells(more accurate): ", idxs_from_ivfflat_nprob_10)

Retrived index from 10 cells(more accurate):  [[3124 1290 2541 9430]]
CPU times: user 1.3 ms, sys: 972 μs, total: 2.27 ms
Wall time: 1.15 ms


In [26]:
# get the document retrived
doc_retrive_ivfflat = [f"{i}: {sentences[i]}" for i in idxs_from_ivfflat_nprob_10[0]]
doc_retrive_ivfflat

['3124: A group of football players is running in the field',
 '1290: A group of people playing football is running in the field',
 '2541: A group of football players running down the field.',
 '9430: Football players are on the field.']

### Reconstructing the index
For IVF (and IMI) indexes, before attempting to use the `reconstruct` method, we need to call the `make_direct_map` method - otherwise we will return a `RunetimeError`.

In [27]:
try:
    index.reconstruct(3)
except Exception as err:
    print(f"[ERROR]: {err}")

[ERROR]: Error in faiss::idx_t faiss::DirectMap::get(faiss::idx_t) const at /project/faiss/faiss/invlists/DirectMap.cpp:82: direct map not initialized


In [28]:
# call `make_direct_map` method
index.make_direct_map()
# reconstruct the embedding from the index
print(index.reconstruct(2).shape)

(768,)


### IndexIVFPQ(Product Quantization)
In the above section, we store full vectors in `Flat`, which can become problematic with large datasets due to excessive space consumption.

**FAISS** provides the capability to compress our vectors using transformations based on *Product Quantization (PQ)*. But, what exactly is PQ? You can think of it as an additional approximation step, similar to our use of IVF, which allowed us to approximate by narrowing down the search scope. However, PQ slightly differs and approximates the distance (or similarity) calculation instead.

PQ achieves this by compressing the vectors themselves, involving several steps:

1. We split each original vector into several subvectors.
2. For each set of subvectors, we conduct a clustering operation, creating numerous centroids for each subvector set.
3. In our vector of subvectors, we replace each subvector with the ID of its nearest centroid.

[More about PQ](https://mccormickml.com/2017/10/13/product-quantizer-tutorial-part-1/)

In [29]:
# number of centroid IDs in final compressed vectrors
m = 8
# number of bits in each centroid
bits = 8

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer, d, n_list, m, bits) 

In [30]:
print("Index is_trained: ", index.is_trained)
# train the indedx
index.train(sentence_embeddings)

# add the embedding in `index`
index.add(sentence_embeddings)

# nprobe 10
index.nprobe = 10

Index is_trained:  False


In [31]:
%%time
distance, idxs_from_ivfpq = index.search(xq, k)
print(idxs_from_ivfpq)

[[ 137 1231 2431 2672]]
CPU times: user 15.5 ms, sys: 1.01 ms, total: 16.5 ms
Wall time: 2.38 ms


### Difference in index
Now, we should also notice the slightly different results being returned. Beforehand with our exhaustive L2 search we were returning `[10392 10474    34  6817]` from `IVF`. Now, we see a slightly different order to our results - and two different vectors, `[4344 5811 1963]`

Each of our speed optimization operations, IVF and PQ, come at the cost of accuracy. Now, if we print out these results we will nonetheless find that each item is still a relevant match:

In [32]:
# get the document retrived
doc_retrive_ivfpq = [f"{i}: {sentences[i]}" for i in idxs_from_ivfpq[0]]
doc_retrive_ivfpq

['137: The crowd is watching the football at the game',
 '1231: An Oklahoma football player attempts to kick the ball.',
 '2431: Three football players from red team tackling one player from white team.',
 '2672: Two boys are playing flag football.']