## Generate Embeddings


In [1]:
import pandas as pd

# Load the sentence corpus from the repository's data folder (path is relative
# to the notebook's working directory) and preview the first rows.
dataset = pd.read_csv("../data/sentences.csv")
dataset.head()

Unnamed: 0,sentence
0,A little girl is smiling and running outside
1,A man is drawing on a digital dry erase board
2,A black bird is sitting on a dead tree
3,An elderly man is sitting on a bench
4,A man and a woman are sitting comfortably on t...


In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-transformer once at notebook level; get_embedding()
# below reads this global instead of reloading the model on every call.
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(sentence, vector_size=1536):
    """Encode ``sentence`` with the notebook-level ``model`` and resize the result.

    The encoded vector is forced to exactly ``vector_size`` entries: a shorter
    vector is padded on the right with zeros, a longer one is truncated.

    Args:
        sentence: Text handed to ``model.encode``.
        vector_size: Length of the returned vector (default 1536).

    Returns:
        The resized embedding converted to a plain Python list.
    """
    encoded = model.encode(sentence)
    missing = vector_size - len(encoded)

    if missing > 0:
        # np.pad's default mode fills the added tail with zeros.
        resized = np.pad(encoded, (0, missing))
    else:
        resized = encoded[:vector_size]

    return resized.tolist()

# Smoke check: embed one sentence and confirm the returned list has the
# expected length (get_embedding's default vector_size).
test_embedding = get_embedding("test sentence")
print(f"Embedding dimension: {len(test_embedding)}")

2024-11-12 18:49:50.538969: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Embedding dimension: 1536


In [3]:
import os
import numpy as np

import ast

# Embedding every sentence is the expensive step, so the result is cached on
# disk and reused on later runs.
if os.path.exists("../data/embedded_sentences.csv"):
    dataset = pd.read_csv("../data/embedded_sentences.csv")
    # The CSV stores each embedding as the text of a Python list. Parse it with
    # ast.literal_eval, which only accepts literals, rather than eval(), which
    # would execute any expression found in the file.
    dataset["embedding"] = dataset.embedding.apply(ast.literal_eval).apply(np.array)
else:
    # Keep the embeddings as plain lists here: to_csv writes their list repr,
    # which is exactly what the cached branch above parses back.
    dataset["embedding"] = dataset["sentence"].apply(get_embedding)
    dataset.to_csv("../data/embedded_sentences.csv", index=False)

In [4]:
# Give every row a sequential 1-based id; vector() later stringifies it to use
# as the Pinecone vector id.
dataset["id"] = range(1, len(dataset) + 1)
dataset.head()

Unnamed: 0,sentence,embedding,id
0,A little girl is smiling and running outside,"[0.0436425618827343, 0.01375775970518589, 0.00...",1
1,A man is drawing on a digital dry erase board,"[-0.008048108778893948, 0.030766354873776436, ...",2
2,A black bird is sitting on a dead tree,"[0.027433251962065697, 1.8205369087809231e-06,...",3
3,An elderly man is sitting on a bench,"[-0.004122881218791008, -0.056238383054733276,...",4
4,A man and a woman are sitting comfortably on t...,"[0.021146269515156746, -0.032280709594488144, ...",5


In [5]:
# Read the vector width from the first row; the Faiss indexes and the Pinecone
# index below are all sized with this value.
embedding_dimension = len(dataset.iloc[0]["embedding"])
embedding_dimension

1536

## Faiss

Check [Faiss Indexes](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes) for more information.


In [None]:
# Stack all row embeddings into one float32 matrix and embed the query as a
# float32 vector. The Faiss cells below (and the first Pinecone query) read
# `embeddings` and `xq`, so this cell must run before them.
embeddings = np.array(dataset.embedding.tolist(), dtype=np.float32)
query = "I love soccer"
xq = np.array(get_embedding(query), dtype=np.float32)

import faiss
import gc  # Garbage collector

# Trial run: build a small IVF index and train it once to check that training
# works. `ncentroids`, `quantizer` and `index_ivf` are all reassigned in the
# IndexIVFFlat section further down, which uses 20 centroids.
ncentroids = 10  # smaller than the 20 used in the IndexIVFFlat section
quantizer = faiss.IndexFlatL2(embedding_dimension)
index_ivf = faiss.IndexIVFFlat(quantizer, embedding_dimension, ncentroids)

# Single training pass over the full matrix (no batching is performed here).
try:
    print("Starting training...")
    # Rebind `embeddings` to a C-contiguous copy/view before handing it to Faiss.
    embeddings = np.ascontiguousarray(embeddings)
    # Train on all vectors at once.
    index_ivf.train(embeddings)
    print("Training completed")
    is_trained = index_ivf.is_trained
    print("Is trained:", is_trained)
except Exception as e:
    # NOTE(review): the error is only printed, so the notebook continues with
    # an untrained index if training raises.
    print(f"Error during training: {e}")
finally:
    # Run a garbage-collection pass whether or not training succeeded.
    gc.collect()

### IndexFlatL2 - Exact Search for L2


In [12]:
import faiss

# Flat L2 index sized to the embedding width; displaying is_trained shows
# whether it needs a train() call before vectors can be added.
index_l2 = faiss.IndexFlatL2(embedding_dimension)
index_l2.is_trained

True

In [13]:
# Add every embedding to the flat index and display how many it now holds.
index_l2.add(embeddings)
index_l2.ntotal

1000

In [14]:
# search() takes a 2-D batch of queries, so the single query vector gets a
# leading axis. The first return value is discarded; the returned positions
# are used to look the rows up in `dataset`.
_, document_indices = index_l2.search(np.expand_dims(xq, axis=0), k=4)
dataset.iloc[document_indices[0]]

Unnamed: 0,sentence,embedding,id
928,A young child is watering a plant with splashes,"[0.035140275955200195, 0.00795348733663559, -0...",929
776,A young child is splashing in the water,"[0.014795569702982903, 0.008873002603650093, 0...",777
438,Three people are walking across a rope and ste...,"[0.032441552728414536, 0.02215403877198696, -0...",439
328,A dog is swimming after a tennis ball,"[-0.018068404868245125, 0.042923588305711746, ...",329


### IndexIVFFlat - Inverted file with exact post-verification

<img src='images/ivf.png' width="1000">


In [15]:
# Build the IVF index: a flat L2 quantizer plus `ncentroids` as the third
# IndexIVFFlat argument. Displaying is_trained shows it still needs train().
ncentroids = 20
quantizer = faiss.IndexFlatL2(embedding_dimension)
index_ivf = faiss.IndexIVFFlat(quantizer, embedding_dimension, ncentroids)
index_ivf.is_trained

False

In [16]:
# Train the IVF index on the full embedding matrix, then confirm the flag flipped.
index_ivf.train(embeddings)
index_ivf.is_trained

True

In [17]:
# Add every embedding to the trained IVF index and display the stored count.
index_ivf.add(embeddings)
index_ivf.ntotal

1000

In [18]:
# Query the IVF index with its current nprobe setting. When the probed
# cells hold fewer than k vectors, Faiss fills the remaining label slots with
# -1; dataset.iloc[-1] would silently return the *last* row, so those padding
# entries are dropped before the lookup.
_, document_indices = index_ivf.search(np.expand_dims(xq, axis=0), k=4)
dataset.iloc[document_indices[0][document_indices[0] >= 0]]

Unnamed: 0,sentence,embedding,id
194,The boy and the girl are playing and wearing a...,"[0.03363574668765068, 0.025265799835324287, -0...",195
985,Several swimmers are jumping into the water,"[0.024976184591650963, 0.08189535140991211, 0....",986
663,A young couple is sleeping in bed,"[-0.010642520152032375, -0.02006523497402668, ...",664
678,A person is being kicked by a monkey,"[0.013209913857281208, 0.0234414990991354, -0....",679


In [19]:
# Probe 5 inverted lists instead of the index's current setting, trading speed
# for recall.
index_ivf.nprobe = 5
_, document_indices = index_ivf.search(np.expand_dims(xq, axis=0), k=4)
# Faiss pads missing results with -1, which dataset.iloc would resolve to the
# last row; keep only real hits.
dataset.iloc[document_indices[0][document_indices[0] >= 0]]

Unnamed: 0,sentence,embedding,id
438,Three people are walking across a rope and ste...,"[0.032441552728414536, 0.02215403877198696, -0...",439
139,Three people are walking across a rope and woo...,"[0.02049058862030506, 0.036873724311590195, 0....",140
527,A baby rhino is shunning an adult rhino,"[0.028290603309869766, 0.0036420845426619053, ...",528
951,Two children are playing on a statue,"[0.07849501818418503, -0.013872774317860603, -...",952


### IndexIVFPQ - IVF + Product Quantizer (PQ)

<img src='images/ivf-pq.png' width="1000">

In [20]:
# Product-quantizer settings, passed as the 4th and 5th IndexIVFPQ arguments.
# NOTE(review): presumably the number of sub-quantizers and the bits per
# sub-quantizer code — confirm against the Faiss IndexIVFPQ reference.
code_size = 8
bits_per_centroid = 4

# Reuses `quantizer` and `ncentroids` from the IndexIVFFlat section above
# rather than creating new ones, so that section must have run first.
index_ivf_pq = faiss.IndexIVFPQ(
    quantizer, embedding_dimension, ncentroids, code_size, bits_per_centroid
)
index_ivf_pq.is_trained

False

In [21]:
# Train the IVF+PQ index on the embeddings, add them, and display the stored count.
index_ivf_pq.train(embeddings)
index_ivf_pq.add(embeddings)
index_ivf_pq.ntotal

1000

In [22]:
# Probe 5 inverted lists of the IVF+PQ index for the query.
index_ivf_pq.nprobe = 5
_, document_indices = index_ivf_pq.search(np.expand_dims(xq, axis=0), k=4)
# Faiss pads missing results with -1, which dataset.iloc would resolve to the
# last row; keep only real hits.
dataset.iloc[document_indices[0][document_indices[0] >= 0]]

Unnamed: 0,sentence,embedding,id
286,The muscular black man is dancing and the man ...,"[0.016497677192091942, 0.011000382713973522, -...",287
14,The band is singing,"[-0.0184180848300457, -0.004225262440741062, 0...",15
771,A karate practitioner is kicking at another ma...,"[-0.0276799313724041, 0.05446998402476311, -0....",772
194,The boy and the girl are playing and wearing a...,"[0.03363574668765068, 0.025265799835324287, -0...",195


## Pinecone


In [23]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone

# Load environment variables (load_dotenv reads them from a .env file)
load_dotenv()

# Get API key
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Fail fast when the key is missing or empty
if not PINECONE_API_KEY:
    raise ValueError("Pinecone API key not found in environment variables")

# Initialize the Pinecone client. Every later cell uses `database`, so a
# failure here is reported and then re-raised; merely printing it would leave
# `database` undefined and surface as a confusing NameError further down.
try:
    database = Pinecone(api_key=PINECONE_API_KEY)
except Exception as e:
    print(f"Error connecting to Pinecone: {e}")
    raise
else:
    print("Successfully connected to Pinecone")



Successfully connected to Pinecone


In [24]:
from pinecone import ServerlessSpec

# Deployment target for the index; passed as `spec` to create_index below.
serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")

In [25]:
import time

INDEX_NAME = "underfitted-random-sentences"

# Create the index only if it does not exist yet, so re-running this cell
# reuses an existing index. The dimension must match the embedding width.
if INDEX_NAME not in database.list_indexes().names():
    database.create_index(
        name=INDEX_NAME,
        dimension=embedding_dimension,
        metric="cosine",
        spec=serverless_spec,
    )

    # NOTE(review): a fixed one-second pause may not guarantee the index is
    # ready — confirm whether create_index already blocks until it is.
    time.sleep(1)

# Handle used by all following cells for stats, upserts and queries.
pinecone_index = database.Index(INDEX_NAME)

In [26]:
# Inspect the index before uploading (dimension and current vector count).
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

In [27]:
def iterator(dataset, size):
    """Lazily yield consecutive ``.iloc`` slices of ``dataset``.

    Each slice holds at most ``size`` rows; the final one may be shorter.
    """
    starts = range(0, len(dataset), size)
    yield from (dataset.iloc[start : start + size] for start in starts)


def vector(batch):
    """Convert a DataFrame batch into a list of upsert tuples.

    Each row becomes ``(id, values, metadata)`` where ``id`` is the row's
    ``id`` as a string, ``values`` is the row's ``embedding`` and ``metadata``
    is ``{"sentence": <row sentence>}``.

    The ``embedding`` column holds NumPy arrays when the dataset was loaded
    from the cached CSV and plain lists when it was freshly computed, so any
    value exposing ``tolist()`` is converted to a plain list. The payload then
    has the same shape on both paths.

    Args:
        batch: DataFrame with ``id``, ``embedding`` and ``sentence`` columns.

    Returns:
        A list of ``(str, list, dict)`` tuples, one per row, in row order.
    """
    records = []
    for row in batch.to_dict("records"):
        values = row["embedding"]
        if hasattr(values, "tolist"):
            # np.ndarray (cached path) -> list of Python floats
            values = values.tolist()
        records.append((str(row["id"]), values, {"sentence": row["sentence"]}))

    return records

In [28]:
# Upload only when the index reports zero vectors, so re-running the notebook
# against an already populated index skips the upsert. Rows are sent in
# batches of 100.
if pinecone_index.describe_index_stats()["total_vector_count"] == 0:
    for batch in iterator(dataset, 100):
        pinecone_index.upsert(vector(batch))

In [29]:
# Check the vector count again after the upload step.
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

In [31]:

# Run the same query vector used in the Faiss section ("I love soccer")
# against Pinecone and request the stored metadata with each match.
response = pinecone_index.query(
    vector=xq.tolist(),  # xq is a NumPy array; send it as a plain list
    top_k=4,
    include_metadata=True
)

# Print the sentence stored in each match's metadata
for match in response["matches"]:
    print(match["metadata"]["sentence"])

A young child is watering a plant with splashes
A young child is splashing in the water
Three people are walking across a rope and steel bridge over a river
A dog is swimming after a tennis ball


In [32]:
# Second query: get_embedding already returns a plain list, so it is passed
# to query() without conversion. Prints the top five matching sentences.
query2 = "I like animals that eat too much"
xq2 = get_embedding(query2)
response = pinecone_index.query(vector=xq2, top_k=5, include_metadata=True)
for match in response["matches"]:
    print(match["metadata"]["sentence"])

A baby is crawling happily
Children are being dressed in costumes and playing a game
A child in a green and white uniform for sports is running over the grass
A child in a green and white sports uniform is running over the grass
A football player is running past an official carrying a football


In [33]:
# Clean up: remove the index. Re-running the index creation cell afterwards
# recreates it empty, and the upsert cell then uploads the vectors again.
database.delete_index(INDEX_NAME)