In [None]:
!pip install faiss-cpu
!pip install sentence-transformers

## Create embeddings

In [49]:
import pandas as pd
data = [['Where are your headquarters located?', 'location'],
['Throw my cellphone in the water', 'random'],
['Network Access Control?', 'networking'],
#['Pepe', 'naming'],
['Address', 'location']]
df = pd.DataFrame(data, columns = ['text', 'category'])

In [50]:
df

Unnamed: 0,text,category
0,Where are your headquarters located?,location
1,Throw my cellphone in the water,random
2,Network Access Control?,networking
3,Address,location


In [51]:
from sentence_transformers import SentenceTransformer
text = df['text']
encoder = SentenceTransformer("paraphrase-mpnet-base-v2")
vectors = encoder.encode(text)

vectors --> array de 4 elementos (uno x cada fila de text)

vector[0] --> el embedding de "Where are your headquarters located?", es un vector con muchos elementos

! todos los vector[n] tienen la misma cantidad de elementos --> dimension

In [65]:
vectors

array([[-0.00128637, -0.01877659, -0.0341595 , ...,  0.02724509,
        -0.01405938, -0.01612017],
       [-0.00368725, -0.07847735, -0.01199679, ...,  0.04308367,
         0.041138  ,  0.03189651],
       [-0.00447403,  0.01662998,  0.01022726, ...,  0.02680056,
        -0.04437998, -0.00651695],
       [-0.00118615,  0.03910445, -0.0106731 , ...,  0.00960998,
        -0.0333215 ,  0.00778077]], dtype=float32)

## Create index

In [52]:
import faiss

vector_dimension = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
faiss.normalize_L2(vectors)
index.add(vectors)

In [53]:
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7f08e08a9ec0> >

## Search


In [None]:
import numpy as np

### 1

In [None]:
search_text = 'where is your office?'
search_vector = encoder.encode(search_text)
_vector = np.array([search_vector])
faiss.normalize_L2(_vector)

In [None]:
search_vector

In [None]:
k = index.ntotal
distances, ann = index.search(_vector, k=k)

hacemos la busqueda por todos los vecinos, entonces k es el total de vectores (# filas df)

In [None]:
distances

array([[0.5848731, 1.1759502, 1.6442657, 1.9197676]], dtype=float32)

In [None]:
ann #approximate nearest neighbour

array([[0, 3, 2, 1]])

In [None]:
df

Unnamed: 0,text,category
0,Where are your headquarters located?,location
1,Throw my cellphone in the water,random
2,Network Access Control?,networking
3,Address,location


In [None]:
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})
results

Unnamed: 0,distances,ann
0,0.584873,0
1,1.17595,3
2,1.644266,2
3,1.919768,1


### 2

In [62]:
search_text = 'complete crazyness'
search_vector = encoder.encode(search_text)
_vector = np.array([search_vector])
faiss.normalize_L2(_vector)

In [63]:
k = index.ntotal
distances, ann = index.search(_vector, k=k)

In [73]:
type(search_vector[0])

numpy.float32

hacemos la busqueda por todos los vecinos, entonces k es el total de vectores (# filas df)

In [59]:
df

Unnamed: 0,text,category
0,Where are your headquarters located?,location
1,Throw my cellphone in the water,random
2,Network Access Control?,networking
3,Address,location


In [64]:
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})
results

Unnamed: 0,distances,ann
0,1.713565,1
1,1.854434,3
2,1.86545,0
3,1.882061,2
