In [5]:
# Dependencies for this notebook (uncomment and run once if they are not installed):
# !pip install faiss-cpu
# !pip install sentence-transformers

In [6]:
import pandas as pd

# Show up to 100 characters per cell so the full sentence text stays readable.
pd.options.display.max_colwidth = 100

# Load the sample sentences together with their category labels.
df = pd.read_csv("../sample_table.csv")
df

Unnamed: 0,text,category
0,Meditation and yoga can improve mental health,Health
1,"Fruits, whole grains and vegetables helps control blood pressure",Health
2,These are the latest fashion trends for this week,Fashion
3,Vibrant color jeans for male are becoming a trend,Fashion
4,The concert starts at 7 PM tonight,Event
5,Navaratri dandiya program at Expo center in Mumbai this october,Event
6,Exciting vacation destinations for your next trip,Travel
7,Maldives and Srilanka are gaining popularity in terms of low budget vacation places,Travel


In [7]:
# (rows, columns) of the loaded table
df.shape

(8, 2)

In [8]:
# Optional install, kept commented out; uncomment only if the package is missing.
# NOTE(review): presumably required by the sentence-transformers stack in this environment — confirm.
# !pip install tf-keras

In [9]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model.
encoder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# encode() is documented to take a string or a list of strings. Passing the
# pandas Series directly only works while it keeps the default 0..n-1 index,
# so hand over a plain list instead.
vectors = encoder.encode(df["text"].tolist())
vectors.shape

(8, 768)

The shape (8, 768) means there are 8 vectors, one per sentence, and each vector has 768 dimensions.

In [10]:
# Display the embedding matrix produced from df.text (one row per sentence).
vectors

array([[-0.00247395,  0.03626721, -0.05290459, ..., -0.09152356,
        -0.03970002, -0.04330488],
       [-0.03357267,  0.00980517, -0.03250129, ..., -0.05165467,
         0.02245886, -0.03156182],
       [-0.01865323, -0.04051314, -0.01235387, ...,  0.00610585,
        -0.07179647,  0.02773853],
       ...,
       [-0.00066458,  0.04252125, -0.05645508, ...,  0.01315471,
        -0.03183568, -0.04357664],
       [-0.03317154,  0.03252455, -0.02484838, ...,  0.01174421,
         0.05747123,  0.00571021],
       [-0.00166395,  0.00413827, -0.04597082, ...,  0.02008527,
         0.05656242, -0.00161595]], dtype=float32)

The array above is 2D: row 0 is the embedding of the first sentence ('Meditation and yoga can improve mental health'), row 1 is the embedding of the second sentence, and so on for the rest.

In [11]:
# Embedding width: size of the last axis of the 2-D embedding matrix.
dim = vectors.shape[-1]
dim

768

In [12]:
import faiss

# Flat FAISS index that compares `dim`-dimensional vectors by L2 distance.
index = faiss.IndexFlatL2(dim) # the index starts empty; vectors are added in a later cell
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000002D2C6D76D90> >

In [13]:
# Clear the index first so that re-running this cell does not insert the same
# vectors a second time (duplicates would then show up in every search result).
index.reset()
index.add(vectors)
assert index.ntotal == len(vectors), "index should hold exactly one vector per sentence"

The embeddings of `df.text` have now been added to the FAISS index.

##### Testing it with a new sentence that is not in the dataset

In [14]:
# Query sentence that does not appear in the indexed table.
out_sentence = "My pant is in color blue"

# Embed the query with the same encoder that produced the indexed vectors.
out_vector = encoder.encode(out_sentence)
out_vector

array([-9.92349815e-03, -6.69282824e-02, -1.61646269e-02, -1.56350452e-02,
       -2.89491452e-02, -2.89462949e-03,  8.15562624e-03,  1.83320586e-02,
       -1.53005486e-02,  4.26872410e-02,  2.22117156e-02,  1.00627877e-02,
        2.79588681e-02,  2.98543610e-02,  1.05588146e-01, -4.41025347e-02,
        2.11437456e-02, -1.23371305e-02,  9.28433910e-02, -1.38785718e-02,
        4.52346355e-02, -2.22765021e-02, -9.07605980e-03, -7.80541333e-04,
        3.38964425e-02, -6.33138716e-02,  7.74717098e-03,  4.33782078e-02,
       -9.39120911e-03,  2.82920375e-02,  4.46470082e-02,  5.17427176e-03,
       -4.33977507e-03, -9.02506411e-02,  1.41458690e-06, -1.83894616e-02,
       -1.20338220e-02, -8.66807159e-03,  6.95512677e-03, -1.81012936e-02,
       -4.12674360e-02, -9.85052716e-03, -7.93389697e-03,  3.39933187e-02,
        1.83049925e-02,  2.08453052e-02,  2.08273213e-02,  1.41924620e-01,
        6.89296518e-03,  4.40657102e-02,  8.64369050e-03,  3.84649001e-02,
       -9.96618997e-03,  

In [15]:
# Encoding a single sentence gives a 1-D vector.
out_vector.shape

(768,)

The query vector is a 1D array, but FAISS expects a 2D array of shape (number of queries, dimension),

so we need to reshape it to 2D.

In [16]:
import numpy as np

# Copy the query embedding and prepend a batch axis: (768,) -> (1, 768).
search_vec = np.array(out_vector)[np.newaxis, :]
search_vec.shape

(1, 768)

In [17]:
# Same values as out_vector, now held in a single-row 2-D array.
search_vec

array([[-9.92349815e-03, -6.69282824e-02, -1.61646269e-02,
        -1.56350452e-02, -2.89491452e-02, -2.89462949e-03,
         8.15562624e-03,  1.83320586e-02, -1.53005486e-02,
         4.26872410e-02,  2.22117156e-02,  1.00627877e-02,
         2.79588681e-02,  2.98543610e-02,  1.05588146e-01,
        -4.41025347e-02,  2.11437456e-02, -1.23371305e-02,
         9.28433910e-02, -1.38785718e-02,  4.52346355e-02,
        -2.22765021e-02, -9.07605980e-03, -7.80541333e-04,
         3.38964425e-02, -6.33138716e-02,  7.74717098e-03,
         4.33782078e-02, -9.39120911e-03,  2.82920375e-02,
         4.46470082e-02,  5.17427176e-03, -4.33977507e-03,
        -9.02506411e-02,  1.41458690e-06, -1.83894616e-02,
        -1.20338220e-02, -8.66807159e-03,  6.95512677e-03,
        -1.81012936e-02, -4.12674360e-02, -9.85052716e-03,
        -7.93389697e-03,  3.39933187e-02,  1.83049925e-02,
         2.08453052e-02,  2.08273213e-02,  1.41924620e-01,
         6.89296518e-03,  4.40657102e-02,  8.64369050e-0

In [18]:
# Look up the 2 nearest stored vectors; the result is a (distances, indices) pair, nearest first.
index.search(search_vec, k=2)

(array([[1.1959307, 1.4447126]], dtype=float32), array([[3, 2]], dtype=int64))

##### We got indices 3 and 2 (nearest first), which means the most similar vectors in the index are at rows 3 and 2. That is correct, because both of those sentences are in the 'Fashion' category.

In [19]:
# Show the table again to look up the rows behind the returned indices.
df

Unnamed: 0,text,category
0,Meditation and yoga can improve mental health,Health
1,"Fruits, whole grains and vegetables helps control blood pressure",Health
2,These are the latest fashion trends for this week,Fashion
3,Vibrant color jeans for male are becoming a trend,Fashion
4,The concert starts at 7 PM tonight,Event
5,Navaratri dandiya program at Expo center in Mumbai this october,Event
6,Exciting vacation destinations for your next trip,Travel
7,Maldives and Srilanka are gaining popularity in terms of low budget vacation places,Travel


In [20]:
# Second query sentence.
sentence2 = "I need map for visit all places in Malasia"

# Embed it with the encoder that was loaded earlier.
vec = encoder.encode(sentence2)
vec.shape

(768,)

In [21]:
# Copy the embedding and add a leading batch axis so it becomes a single-row 2-D array.
s_vec = np.array(vec)[np.newaxis, :]
s_vec.shape

(1, 768)

In [22]:
# Return the 2 nearest stored vectors as a (distances, indices) pair.
index.search(s_vec, k=2)

(array([[1.363025 , 1.4296546]], dtype=float32), array([[6, 7]], dtype=int64))

In [23]:
# Fetch the matched rows from the search result itself instead of copying the
# indices by hand. FAISS reports 0-based insertion positions, so select with
# iloc (positional) rather than loc (label-based).
_distances, neighbor_ids = index.search(s_vec, k=2)
df.iloc[neighbor_ids[0]]

Unnamed: 0,text,category
6,Exciting vacation destinations for your next trip,Travel
7,Maldives and Srilanka are gaining popularity in terms of low budget vacation places,Travel


It found the closest matches in the 'Travel' category, which is why it returned indices 6 and 7.

In [24]:
# Third query sentence (reuses the `sentence2` and `vec` names from the previous query).
sentence2 = "Ravi invited Varun for the party"

# Embed it with the encoder that was loaded earlier.
vec = encoder.encode(sentence2)
vec.shape

(768,)

In [25]:
# Copy the embedding and add a leading batch axis so it becomes a single-row 2-D array.
s_vec = np.array(vec)[np.newaxis, :]
s_vec.shape

(1, 768)

In [26]:
index.search(s_vec, k=2) # k is the number of nearest neighbours to return for the query

(array([[1.512016 , 1.8242159]], dtype=float32), array([[5, 4]], dtype=int64))

In [27]:
# Fetch the matched rows from the search result itself instead of copying the
# indices by hand. FAISS reports 0-based insertion positions, so select with
# iloc (positional) rather than loc (label-based).
_distances, neighbor_ids = index.search(s_vec, k=2)
df.iloc[neighbor_ids[0]]

Unnamed: 0,text,category
5,Navaratri dandiya program at Expo center in Mumbai this october,Event
4,The concert starts at 7 PM tonight,Event


### This happens because it is **semantic search**: it matches on the meaning of the sentence rather than on the exact words.