## Non-duplicated Embeddings and Search

### Create a database for storing embeddings

In [1]:
import sqlite3
from datetime import datetime
from pathlib import Path

# SQLite creates the database file on demand but not its parent directory,
# so make sure the directory exists before connecting (otherwise connect()
# fails with "unable to open database file" on a fresh checkout).
DB_PATH = Path('data/embeddings.db')
DB_PATH.parent.mkdir(parents=True, exist_ok=True)

# Create a connection to SQLite database
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

**Create Table**

In [3]:
# Schema of the registry: one row per content hash plus an integer timestamp.
# IF NOT EXISTS keeps this cell safe to re-run.
CREATE_REGISTRY_SQL = '''
CREATE TABLE IF NOT EXISTS embedding_registry (
    hash_id CHAR(16) PRIMARY KEY,
    timestamp INTEGER NOT NULL
)
'''

cursor.execute(CREATE_REGISTRY_SQL)
conn.commit()

**Avoid duplicates by checking whether the hash is already present in the database.**

In [14]:
from sqlite3 import Cursor


def check_hash_exists(cursor: Cursor, hash_id: str):
    """Report whether ``hash_id`` is already recorded in ``embedding_registry``.

    Args:
        cursor: Cursor on the database that holds the ``embedding_registry`` table.
        hash_id: Hash value to look up.

    Returns:
        True when a row with this ``hash_id`` exists, otherwise False.
    """
    lookup_sql = 'SELECT hash_id FROM embedding_registry WHERE hash_id = ?'
    matching_row = cursor.execute(lookup_sql, (hash_id,)).fetchone()
    if matching_row is None:
        return False
    return True

# Demo lookup of an arbitrary hash value
hash_found = check_hash_exists(cursor=cursor, hash_id='1234')
print(f"Hash exists? {hash_found}")

Hash exists? False


**Set Configuration Variables**

In [8]:
# Google Cloud project and region handed to the vector store
project_id = "lchain-449703"
location_id = "us-east1"

# Resource names of the index and of the index endpoint
index_id = "projects/889542799366/locations/us-east1/indexes/4404775522265989120"
index_endpoint = "projects/889542799366/locations/us-east1/indexEndpoints/2820476023664017408"

# Cloud Storage bucket name handed to the vector store
gcs_bucket = "achaayans-embedding-basket"

In [None]:
# Interactive sign-in for the gcloud CLI (shell escape; needs a browser/terminal).
# NOTE(review): the Python client libraries used below presumably read
# Application Default Credentials; if initialisation fails with a credentials
# error, `gcloud auth application-default login` may be needed instead - confirm.
!gcloud auth login

**Initialize VectorStore.** 

In [9]:
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_google_vertexai import (
    VectorSearchVectorStore
)

# Embedding model used to vectorise texts and queries
embedding_model = VertexAIEmbeddings(model_name="text-embedding-005")

# Settings passed to from_components, gathered in one place for readability
vector_store_settings = dict(
    project_id=project_id,
    region=location_id,
    gcs_bucket_name=gcs_bucket,
    index_id=index_id,
    endpoint_id=index_endpoint,
    embedding=embedding_model,
    stream_update=True,
)
vector_store = VectorSearchVectorStore.from_components(**vector_store_settings)

In [15]:
import hashlib

def hash_content(text, length=16):
    """Return a short hexadecimal fingerprint of ``text``.

    Args:
        text: String to fingerprint; it is encoded as UTF-8 before hashing.
        length: Number of leading hex characters of the SHA-256 digest to keep.

    Returns:
        The first ``length`` characters of the SHA-256 hex digest.
    """
    digest = hashlib.sha256()
    digest.update(text.encode('utf-8'))
    return digest.hexdigest()[:length]


# Example: fingerprint of a sample product title
sample_hash = hash_content("CIDER Women’s Quilted Puffer Jacket Cropped Long Sleeve Stand Collar Zip Up Trendy Winter Coat")
print(sample_hash)

95dac9d95bf2aa29


**Prepare Data and Metadata** 

In [16]:
# Product titles to embed; position i pairs with metadatas[i]
texts = [
    "Wired Headphones with Microphone, K16 On-Ear Headphones for Kids with 3.5MM Jack, Foldable Stereo Bass Headphones for Teens School Amazon Kindle, Fire, Chromebook, Tablet(Light Blue Red)",
    "Bose QuietComfort Ultra Bluetooth Headphones, Wireless Headphones with Spatial Audio, Over Ear Noise Cancelling with Mic, Up to 24 Hours of Playtime, Black",
    "All-new Ring Battery Doorbell, Head-to-Toe Video, Live View with Two-Way Talk, and Motion Detection & Alerts (2024 release), Satin Nickel",
    "Amazon Basics Neoprene Dumbbell Hand Weights",
]

# One metadata record per title, in the same order as `texts`
metadatas = [
    dict(url="https://tinyurl.com/bdzacb45", item=1, category="Headphones"),
    dict(url="https://tinyurl.com/5n7usxur", item=2, category="Headphones"),
    dict(url="https://tinyurl.com/mpt5pttr", item=3, category="security"),
    dict(url="https://tinyurl.com/jxexvkhr", item=4, category="fashion"),
]

**Load the hash into the database. This is used to track whether the document is already available in the search index.**

In [17]:

def register_embedding(cursor: Cursor, hash_id: str):
    """Record ``hash_id`` in ``embedding_registry`` and persist it.

    The row is stamped with the current time as integer Unix seconds. The
    insert is committed immediately: without the commit the row is visible
    only on the current connection and is discarded when that connection
    (or the kernel) closes, so documents would be re-embedded on the next run.

    Args:
        cursor: Cursor on the database that holds the ``embedding_registry`` table.
        hash_id: Hash value to register.

    Raises:
        sqlite3.IntegrityError: If the insert violates a table constraint,
            e.g. ``hash_id`` is already present as a primary key.
    """
    current_time = int(datetime.now().timestamp())
    cursor.execute('INSERT INTO embedding_registry (hash_id, timestamp) VALUES (?, ?)',
                    (hash_id, current_time))
    # Commit through the cursor's own connection so the function does not
    # depend on a notebook-level ``conn`` variable
    cursor.connection.commit()
    if cursor.rowcount == 1:
        print(f"Inserted Hash {hash_id} to database")
    else:
        print(f"failed to insert {hash_id}") 

**Add document to Vector Store.** 

In [19]:
def add_document(vector_store, text, metadata):
    """Add ``text`` to the vector store unless its hash is already registered.

    Uses the notebook-level ``cursor`` for the registry lookup and insert.
    Any exception raised while checking, adding or registering is caught and
    reported with a printed message rather than propagated.

    Args:
        vector_store: Store object exposing ``add_texts(texts=..., metadatas=...)``.
        text: Document text to embed and store.
        metadata: Metadata stored alongside the text.
    """
    hash_id = hash_content(text)
    try:
        # Guard clause: nothing to do when the hash is already registered
        if check_hash_exists(cursor, hash_id):
            print("Document already added. ")
            return
        vector_store.add_texts(texts=[text], metadatas=[metadata])
        print(f"Adding Document text {text}")
        register_embedding(cursor=cursor, hash_id=hash_id)
    except Exception as exc:
        print(f"Error processing document text {text}: {str(exc)}")

In [24]:
# Ingest each title together with the metadata record at the same position
for doc_text, doc_metadata in zip(texts, metadatas):
    add_document(vector_store, text=doc_text, metadata=doc_metadata)

Document already added. 
Document already added. 
Document already added. 
Document already added. 


**Search**

In [23]:
# Run the query; each hit is a (document, score) pair
search_results = vector_store.similarity_search_with_score(
    query="Bose"
)

# zip(*[]) yields nothing, so unpacking it into two names raises ValueError
# when the search returns no hits; fall back to two empty tuples instead.
documents, scores = zip(*search_results) if search_results else ((), ())

for doc, score in zip(documents, scores):
    print(f"Document : {doc} \n Score : {score}")



Document : page_content='Bose QuietComfort Ultra Bluetooth Headphones, Wireless Headphones with Spatial Audio, Over Ear Noise Cancelling with Mic, Up to 24 Hours of Playtime, Black' metadata={'shop_id': 'b2cdc37b507b514e', 'url': 'https://tinyurl.com/5n7usxur', 'item': 2, 'category': 'Headphones'} 
 Score : 0.7051407098770142
Document : page_content='Wired Headphones with Microphone, K16 On-Ear Headphones for Kids with 3.5MM Jack, Foldable Stereo Bass Headphones for Teens School Amazon Kindle, Fire, Chromebook, Tablet(Light Blue Red)' metadata={'shop_id': 'f0bf7c4c396d3d47', 'url': 'https://tinyurl.com/bdzacb45', 'item': 1, 'category': 'Headphones'} 
 Score : 0.46925270557403564
Document : page_content='Amazon Basics Neoprene Dumbbell Hand Weights' metadata={'shop_id': 'd251051ec47407fd', 'url': 'https://tinyurl.com/jxexvkhr', 'item': 4, 'category': 'fashion'} 
 Score : 0.42754489183425903
Document : page_content='All-new Ring Battery Doorbell, Head-to-Toe Video, Live View with Two-Way

NEXT: Filter based on score.