# Tests for Vector Search

## Imports

In [2]:
import base64
import json
import numpy as np
import pandas as pd
from typing import Optional, List
from fastapi import FastAPI, Request, UploadFile, File, Form
from google.cloud import aiplatform, bigquery, storage
from vertexai.vision_models import MultiModalEmbeddingModel, Image, Video
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

## GCP Setup

In [3]:
# Create a BigQuery client to interact with your BigQuery database.
# No project or credentials are passed, so the client library resolves its own defaults.
bq_client = bigquery.Client()

# Create a Storage client (used by get_text_from_gcs to read the product JSON files)
storage_client = storage.Client()

# Load pre-trained multimodal and text embedding models from Vertex AI.
# NOTE(review): embed_text() loads its own TextEmbeddingModel by name and does not use
# text_emb_model; confirm whether these module-level handles are still needed.
mm_emb_model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
text_emb_model = TextEmbeddingModel.from_pretrained("text-embedding-004")

## Env variables

In [23]:
# Project and region handed to aiplatform.init() by the neighbor search in Test 2
PROJECT_ID = "my-project" # @param
REGION = "europe-west4"

# GCS location read by get_text_from_gcs: <PRODUCTS_BUCKET>/<PRODUCTS_FOLDER>/<file_name>.json
PRODUCTS_BUCKET = "my-bucket" # @param
PRODUCTS_FOLDER = "embeds-inputs"

# Vector Search identifiers: INDEX_TEXT_ID is passed as the deployed index ID and
# INDEX_ENDPOINT_ID as the index endpoint name in the Test 2 query.
# NOTE(review): INDEX_IMG_ID is not referenced in the visible cells; presumably reserved
# for the pending image search. Confirm.
INDEX_TEXT_ID = "text_512_0e225490" 
INDEX_IMG_ID = "image_512_12270938"
INDEX_ENDPOINT_ID = "5772066609917067264"

## Helper functions

In [24]:
# Get the text content from a product's JSON in our GCS bucket
def get_text_from_gcs(file_name):
    """Retrieve the raw text of a product's JSON file from GCS.

    Args:
        file_name: Object name without the ".json" extension; the object is
            looked up as "<PRODUCTS_FOLDER>/<file_name>.json" in PRODUCTS_BUCKET.

    Returns:
        str: The object's content decoded as UTF-8. The JSON is not parsed.
    """
    bucket = storage_client.bucket(PRODUCTS_BUCKET)
    blob = bucket.blob(f"{PRODUCTS_FOLDER}/{file_name}.json")
    # download_as_string() is deprecated in google-cloud-storage;
    # download_as_text() downloads and decodes in a single call.
    return blob.download_as_text(encoding="utf-8")

# Embed text using the "text-embedding-004" model
def embed_text(
    texts: List[str],
    task: str = "CLUSTERING",  # @param
    model_name: str = "text-embedding-004",
    dimensionality: Optional[int] = 512,
) -> List[List[float]]:
    """Embed texts with a pre-trained, foundational model.

    Args:
        texts: The strings to embed.
        task: Task type passed to each TextEmbeddingInput. See all options at
            https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api#request_body
        model_name: Name of the Vertex AI text embedding model to load.
        dimensionality: Requested output dimensionality. When falsy (None or 0)
            the argument is omitted and the model's default size is used.

    Returns:
        One embedding (a list of floats) per returned embedding object; an
        empty list when ``texts`` is empty.
    """
    # Nothing to embed: skip the model load and the API round trip entirely.
    if not texts:
        return []

    model = TextEmbeddingModel.from_pretrained(model_name)
    inputs = [TextEmbeddingInput(text, task) for text in texts]
    # Only forward output_dimensionality when a size was actually requested.
    kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}
    embeddings = model.get_embeddings(inputs, **kwargs)
    return [embedding.values for embedding in embeddings]

## Load data from some products to test with

In [25]:
# IDs of the products used by the tests below
(p1, p2, p3, p4, p5, p6, p7, p8, p9) = (
    '137240482',
    '137240574',
    '137240666',
    '137240877',
    '137240981',
    '137241078',
    '137241187',
    '137241459',
    '137241599',
)

## Test 1 - Create embeddings and combine them
### Step 1 - Generate text embeddings

In [32]:
# Fetch the JSON text of the first five test products, in order
texts = [get_text_from_gcs(product_id) for product_id in (p1, p2, p3, p4, p5)]

In [36]:
# Generate text embeddings for the product descriptions (embed_text returns a list of vectors)
text_embeddings = embed_text(texts)
# Sanity check: print the first embedding (this prints the whole vector)
print(text_embeddings[0])

[-0.04466456547379494, 0.027873221784830093, -0.04545360058546066, -0.007495563477277756, 0.056860461831092834, 0.036580465734004974, 0.042145002633333206, -0.006708754692226648, 0.0023311751428991556, -0.009315169416368008, -0.00870446115732193, 0.044420886784791946, 0.018079258501529694, -0.042466264218091965, 0.05586535856127739, -0.05399002507328987, 0.0742160901427269, 0.05707112327218056, -0.042747270315885544, -0.03567966818809509, -0.02422790601849556, -0.01217995211482048, 0.036889296025037766, -0.019539764150977135, -0.02724825218319893, -0.029490618035197258, 0.005610696971416473, 0.05406656116247177, 0.04780719801783562, -0.05536559596657753, 0.02847936935722828, 0.048059094697237015, 0.016650483012199402, -0.007248870097100735, 0.04954089969396591, 0.03013605810701847, -0.03536289930343628, 0.009113389998674393, 0.02574356645345688, -0.013934320770204067, -0.06174776330590248, -0.024247752502560616, 0.01672978140413761, 0.0015823491849005222, -0.026050828397274017, -0.0271

**TODO:** Update the helper functions so that the generated embeddings are stored in a GCS bucket.

### Step 2 - Generate image embeddings
**TODO:** Image embeddings are not generated yet.

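### Step 3 - Normalize image and text embeddings
**TODO:** Only the text embedding is normalized below; normalizing the image embedding is pending Step 2.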

In [26]:
def l2_normalize(vector):
    """Scale a vector to unit length using L2 normalization.

    Args:
        vector: Array-like of numbers (for example a list of floats or a
            NumPy array).

    Returns:
        np.ndarray: ``vector`` divided by its L2 norm. An input whose norm is
        zero is returned unscaled, still as an array.
    """
    # Convert up front so the zero-norm branch also returns an ndarray
    # instead of handing back the caller's raw list.
    arr = np.asarray(vector)
    l2_norm = np.linalg.norm(arr)
    if l2_norm == 0:
        return arr  # Avoid division by zero
    return arr / l2_norm

In [32]:
# Normalize the first text embedding with l2_normalize
normalized_text_embedding = l2_normalize(text_embeddings[0])
# TODO: normalize the image embedding the same way once img_embeddings exists.
# NOTE(review): the weighted-average draft in Step 4 calls this normalized_img_embedding; pick one name.
#normalized_video_embedding = l2_normalize(img_embeddings[n])

# Display the normalized vector as the cell output
normalized_text_embedding

array([ 7.60170974e-02,  2.02099026e-02, -6.54493108e-02, -3.34640293e-02,
        6.48974857e-03,  8.59563655e-02,  4.97813497e-02, -4.84505534e-03,
        1.84288347e-02,  1.01110114e-02,  4.12478271e-02,  1.74047053e-01,
        3.44966259e-02, -3.90797764e-02,  1.02959248e-01, -1.05549227e-01,
        1.68302274e-01,  1.25638450e-01, -9.64116406e-02, -2.21152327e-02,
       -9.08869565e-03,  3.30881200e-02,  6.13282889e-02,  4.11844816e-02,
       -9.25055232e-02, -9.15900147e-02,  4.39064986e-03,  5.37328229e-02,
        2.40870414e-02, -3.21531648e-02,  6.19638972e-02, -2.96521988e-03,
        5.65738422e-03, -3.16345239e-02,  7.43966467e-02, -2.02339622e-02,
       -8.76909508e-02,  1.18936270e-02,  2.02866399e-02, -8.92035480e-02,
       -1.10842735e-03, -8.33042700e-03,  1.10139086e-02, -3.69222969e-02,
        1.54192450e-02, -5.55077198e-02, -3.61760264e-02, -2.11786665e-02,
        3.44632290e-02,  5.97038318e-02,  2.79703723e-02, -1.73723927e-02,
        5.85146320e-02,  

### Step 4 - Combine embeddings

**TODO:** Combine the normalized image and text embeddings with a weighted average (concatenation is an alternative).

In [None]:
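# TODO (blocked on Step 2): calculate a weighted average of the normalized embeddings
#we_ave =  (0.7 * normalized_img_embedding) + (0.3 * normalized_text_embedding)
# A bare `return we_ave` is only valid inside a function; end the cell with `we_ave` to display it.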

## Test 2 - Querying embedding index

In [28]:
from typing import List

def vector_search_find_neighbors(
    project: str,
    location: str,
    index_endpoint_name: str,
    deployed_index_id: str,
    queries: List[List[float]],
    num_neighbors: int,
    hybrid_queries: Optional[list] = None,
) -> None:
    """Query the vector search index and print the matches.

    Args:
        project (str): Required. Project ID
        location (str): Required. The region name
        index_endpoint_name (str): Required. Index endpoint to run the query
        against.
        deployed_index_id (str): Required. The ID of the DeployedIndex to run
        the queries against.
        queries (List[List[float]]): Required. A list of queries. Each query is
        a list of floats, representing a single embedding.
        num_neighbors (int): Required. The number of neighbors to return.
        hybrid_queries (list): Optional. HybridQuery objects
        (aiplatform.matching_engine.matching_engine_index_endpoint.HybridQuery)
        to run as a second request. Skipped when omitted or empty.

    Returns:
        None. Each response is printed.
    """
    # Initialize the Vertex AI client
    aiplatform.init(project=project, location=location)

    # Create the index endpoint instance from an existing endpoint.
    my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(
        index_endpoint_name=index_endpoint_name
    )

    # Query the index endpoint for the nearest neighbors.
    resp = my_index_endpoint.find_neighbors(
        deployed_index_id=deployed_index_id,
        queries=queries,
        num_neighbors=num_neighbors,
    )
    print(resp)

    # Hybrid (dense + sparse) queries are only sent when the caller supplies
    # them. They used to be hard-coded 3-dimensional sample vectors
    # ([1, 2, 3]), which do not fit the dimensionality of a real index.
    if hybrid_queries:
        hybrid_resp = my_index_endpoint.find_neighbors(
            deployed_index_id=deployed_index_id,
            queries=hybrid_queries,
            num_neighbors=num_neighbors,
        )
        print(hybrid_resp)

In [34]:
# Nearest-neighbor lookup on the text (JSON) index, using the first text embedding as the query
vector_search_find_neighbors(
    project=PROJECT_ID,
    location=REGION,
    index_endpoint_name=INDEX_ENDPOINT_ID,
    deployed_index_id=INDEX_TEXT_ID,
    queries=[text_embeddings[0]],
    num_neighbors=10,
)

[[MatchNeighbor(id='137240981', distance=0.8407326936721802, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[]), MatchNeighbor(id='137240877', distance=0.837692141532898, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[]), MatchNeighbor(id='137240482', distance=0.8335597515106201, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[]), MatchNeighbor(id='137240574', distance=0.780953049659729, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[]), MatchNeighbor(id='137241459', distance=0.7789533138275146, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_