In [1]:
import requests
import numpy as np
from io import StringIO
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

In [2]:

res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')
# create dataframe
data = pd.read_csv(StringIO(res.text), sep='\t')
data.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL


In [3]:

# we take all samples from both sentence A and B
sentences = data['sentence_A'].tolist()
sentences[:5]

['A group of kids is playing in a yard and an old man is standing in the background',
 'A group of children is playing in the house and there is no man standing in the background',
 'The young boys are playing outdoors and the man is smiling nearby',
 'The kids are playing outdoors near a man with a smile',
 'The young boys are playing outdoors and the man is smiling nearby']

In [4]:

# we take all samples from both sentence A and B
sentences = data['sentence_A'].tolist()
sentence_b = data['sentence_B'].tolist()
sentences.extend(sentence_b)  # merge them
len(set(sentences))  # together we have ~4.5K unique sentences

4802

In [5]:
urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

In [6]:
# each of these dataset have the same structure, so we loop through each creating our sentences data
for url in urls:
    res = requests.get(url)
    # extract to dataframe
    data = pd.read_csv(StringIO(res.text), sep='\t', header=None, error_bad_lines=False)
    # add to columns 1 and 2 to sentences list
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())

b'Skipping line 191: expected 3 fields, saw 4\nSkipping line 206: expected 3 fields, saw 4\nSkipping line 295: expected 3 fields, saw 4\nSkipping line 695: expected 3 fields, saw 4\nSkipping line 699: expected 3 fields, saw 4\n'
b'Skipping line 104: expected 3 fields, saw 4\nSkipping line 181: expected 3 fields, saw 4\nSkipping line 317: expected 3 fields, saw 4\nSkipping line 412: expected 3 fields, saw 5\nSkipping line 508: expected 3 fields, saw 4\n'


In [7]:
len(set(sentences))

14505

In [8]:

# remove duplicates and NaN
sentences = [word for word in list(set(sentences)) if type(word) is str]

In [9]:

# initialize sentence transformer model
model = SentenceTransformer('bert-base-nli-mean-tokens')
# create sentence embeddings
sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

(14504, 768)

In [10]:
d = sentence_embeddings.shape[1]
d

768

In [11]:
# IndexFlatL2 measures the L2 (or Euclidean) distance between all given points between our query vector, and the vectors loaded into the index
index = faiss.IndexFlatL2(d)

In [12]:

index.is_trained

True

In [13]:
index.add(sentence_embeddings)
index.ntotal

14504

In [14]:
# Then search given a query xq and number of nearest neigbors to return k.

k = 4
xq = model.encode(["Someone sprints with a football"])

In [15]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[ 2580  8174 11931  6649]]
Wall time: 18 ms


In [16]:
for i in I[0]:
    print(sentences[i])

A group of football players is running in the field
A group of people playing football is running in the field
Two groups of people are playing football
A person playing football is running past an official carrying a football


In [17]:
# Now, if we’d rather extract the numerical vectors from Faiss, we can do that too.
# we have 4 vectors to return (k) - so we initialize a zero array to hold them
vecs = np.zeros((k, d))
# then iterate through each ID from I and add the reconstructed vector to our zero-array
for i, val in enumerate(I[0].tolist()):
    vecs[i, :] = index.reconstruct(val)

In [18]:
vecs.shape

(4, 768)

In [38]:
# So, we are reducing the scope of our search, producing an approximate answer, rather than exact (as produced through exhaustive search) using IVF flat. 
# This partitions the search space into n Voronoi cells and searches the nprobe cells
nlist = 50  # how many cells
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [39]:
index.is_trained

False

In [40]:
index.train(sentence_embeddings)
index.is_trained  # check if index is now trained

True

In [41]:
index.add(sentence_embeddings)
index.ntotal  # number of embeddings indexed
index.nprobe = 1

In [42]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[ 2580  8174 11931  6649]]
Wall time: 19 ms


In [43]:
for i in I[0]:
    print(sentences[i])

A group of football players is running in the field
A group of people playing football is running in the field
Two groups of people are playing football
A person playing football is running past an official carrying a football


In [44]:

index.make_direct_map()

In [45]:
index.reconstruct(11931)

array([ 0.22949542,  0.21166392, -0.10311846, -0.08761475, -0.7623109 ,
        0.01426321, -0.14125912,  0.05316121, -1.2854432 , -0.3934348 ,
       -1.1384095 ,  0.35143626,  0.0677641 ,  0.58293575,  1.2239507 ,
        0.04434835, -0.19015439, -1.2370024 ,  0.30799544, -0.04922725,
       -0.94415843, -0.46993154, -0.7439881 , -0.47364652,  0.53972524,
        0.30820456,  0.44206727,  0.42482737, -1.0584823 ,  1.0223888 ,
        0.30585548,  0.23979142,  0.44111764,  0.3964322 , -1.0802299 ,
       -0.8000373 ,  0.55200297, -0.69327915,  0.38068053,  0.2212789 ,
       -0.4883636 ,  0.3437855 , -0.9208072 ,  0.08734529, -0.7323824 ,
       -0.807738  , -0.97757214,  0.26438403, -1.0034046 , -0.12847072,
        0.3787472 ,  1.1222502 , -1.7026889 , -0.5364899 , -0.7851961 ,
        0.6135695 ,  0.7391417 , -0.7311539 , -0.45036718,  0.31849584,
       -0.4630785 , -0.28904024,  0.14222987, -0.11835919, -0.9614582 ,
       -0.3604935 , -0.03248384,  0.08153806, -0.8151982 , -0.88

In [46]:
# Product Quantization (PQ) we can view it as an additional approximation step with a similar outcome to our use of IVF. 
# Where IVF allowed us to approximate by reducing the scope of our search, PQ approximates the distance/similarity calculation instead.
m = 8  # number of centroid IDs in final compressed vectors
bits = 8 # number of bits in each centroid

quantizer = faiss.IndexFlatL2(d)  # we keep the same L2 distance flat index
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits)

In [48]:
index.is_trained

False

In [49]:
index.train(sentence_embeddings)

In [50]:
index.add(sentence_embeddings)

In [51]:
index.nprobe = 10  # align to previous IndexIVFFlat nprobe value

In [52]:
%%time
D, I = index.search(xq, k)
print(I)

[[ 598 1477 2080  421]]
Wall time: 20 ms
