# Evaluating Search Relevance with Azure AI Search and Cohere Embeddings

In [None]:
!pip install ranx
!pip install azure-search-documents==11.6.0b6
!pip install azure-identity
!pip install python-dotenv
!pip install cohere

## Step 1: Import necessary libraries and load environment variables

In [1]:

import os
import pandas as pd
import cohere
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from collections import defaultdict
from dotenv import load_dotenv
from ranx import Qrels, Run, compare
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    AIStudioModelCatalogName,
    AzureMachineLearningParameters,
    AzureMachineLearningVectorizer,
    HnswAlgorithmConfiguration,
    HnswParameters,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SearchableField,
    SimpleField,
    VectorEncodingFormat,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile
)
from azure.search.documents.models import (
    VectorizableTextQuery,
    VectorizedQuery
)

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


## Step 2: Cohere API and Azure AI Search Configuration

In [2]:
# Environment variables
AZURE_AI_STUDIO_COHERE_EMBED_KEY = os.getenv("AZURE_AI_STUDIO_COHERE_EMBED_KEY")
AZURE_AI_STUDIO_COHERE_EMBED_ENDPOINT = os.getenv("AZURE_AI_STUDIO_COHERE_EMBED_ENDPOINT")
AZURE_SEARCH_SERVICE_ENDPOINT = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
AZURE_SEARCH_ADMIN_KEY = os.getenv("AZURE_SEARCH_ADMIN_KEY")

# Initialize Cohere client using Azure AI Studio
cohere_azure_client = cohere.ClientV2(
    base_url=f"{AZURE_AI_STUDIO_COHERE_EMBED_ENDPOINT}/v1", 
    api_key=AZURE_AI_STUDIO_COHERE_EMBED_KEY
)

# Initialize Azure AI Search client for two indexes (int8 and float)
credential = AzureKeyCredential(AZURE_SEARCH_ADMIN_KEY)
int8_index_name = "wands-cohere-int8-index"
float_index_name = "wands-cohere-float32-index"
int8_search_client = SearchClient(endpoint=AZURE_SEARCH_SERVICE_ENDPOINT, index_name=int8_index_name, credential=credential)
float_search_client = SearchClient(endpoint=AZURE_SEARCH_SERVICE_ENDPOINT, index_name=float_index_name, credential=credential)


## Step 3: Function to generate embeddings using Cohere

In [3]:
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
from cohere.errors import (
    TooManyRequestsError, ServiceUnavailableError, GatewayTimeoutError,
    InternalServerError, ClientClosedRequestError
)

# Add tenacity retry mechanism to handle rate limits and other transient errors
@retry(
    stop=stop_after_attempt(5),  # Retry up to 5 times
    wait=wait_fixed(5),  # Wait 5 seconds between retries
    retry=retry_if_exception_type((
        TooManyRequestsError,  # Rate limit error (429)
        ServiceUnavailableError,  # Service unavailable (503)
        GatewayTimeoutError,  # Gateway timeout (504)
        InternalServerError,  # Internal server error (500)
        ClientClosedRequestError  # Client closed request (499)
    ))
)
def generate_embeddings(texts, input_type="search_document", embedding_type="float"):
    model = "embed-english-v3.0"
    texts = [texts] if isinstance(texts, str) else texts  # Ensure input is a list of strings
    response = cohere_azure_client.embed(
        texts=texts,
        model=model,
        input_type=input_type,
        embedding_types=[embedding_type],
    )
    return [embedding for embedding in getattr(response.embeddings, embedding_type)]


## Step 4: Load Product Data

In [4]:
products_df = pd.read_csv("eval/products/product.csv", sep="\t", index_col="product_id", keep_default_na=False)

# Extract first 5000 product names and descriptions
product_names = products_df["product_name"].tolist()[:5000]
product_descriptions = products_df["product_description"].tolist()[:5000]
products_df.head()

Unnamed: 0_level_0,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,solid wood platform bed,Beds,Furniture / Bedroom Furniture / Beds & Headboa...,"good , deep sleep can be quite difficult to ha...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0
1,all-clad 7 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,"create delicious slow-cooked meals , from tend...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0
2,all-clad electrics 6.5 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,prepare home-cooked meals on any schedule with...,features : keep warm setting|capacityquarts:6....,208.0,3.0,181.0
3,all-clad all professional tools pizza cutter,"Slicers, Peelers And Graters",Browse By Brand / All-Clad,this original stainless tool was designed to c...,overallwidth-sidetoside:3.5|warrantylength : l...,69.0,4.5,42.0
4,baldwin prestige alcott passage knob with roun...,Door Knobs,Home Improvement / Doors & Door Hardware / Doo...,the hardware has a rich heritage of delivering...,compatibledoorthickness:1.375 '' |countryofori...,70.0,5.0,42.0



## Step 5: Generate and Index Float and Int8 Embeddings


In [12]:
print("Generating Int8 embeddings for product names and descriptions...")
product_name_int8_embeddings = generate_embeddings(product_names, embedding_type="int8")
product_description_int8_embeddings = generate_embeddings(product_descriptions, embedding_type="int8")

Generating Int8 embeddings for product names and descriptions...


In [58]:
print("Generating Float embeddings for product names and descriptions...")
product_name_float_embeddings = generate_embeddings(product_names, embedding_type="float")
product_description_float_embeddings = generate_embeddings(product_descriptions, embedding_type="float")

Generating Float embeddings for product names and descriptions...


## Step 6: Create a Search Index in Azure AI Search

In [62]:
# Function to create or update the search index for vector embeddings
def create_or_update_index(
    client, index_name, vector_field_type, scoring_uri, authentication_key, model_name
):
    # Define the search index fields based on your product schema
    fields = [
        SimpleField(name="product_id", type=SearchFieldDataType.String, key=True),
        SearchField(
            name="product_name",
            type=SearchFieldDataType.String,
            searchable=True,
            filterable=True,
        ),
        SearchField(
            name="product_description", type=SearchFieldDataType.String, searchable=True
        ),
        SearchField(
            name="product_name_vector",
            type=vector_field_type,
            vector_search_dimensions=1024,
            vector_search_profile_name="my-vector-config",
        ),
        SearchField(
            name="product_description_vector",
            type=vector_field_type,
            vector_search_dimensions=1024,
            vector_search_profile_name="my-vector-config",
        ),
    ]

    # Vector search configuration with HNSW algorithm and query vectorizer
    vector_search = VectorSearch(
        profiles=[
            VectorSearchProfile(
                name="my-vector-config",
                algorithm_configuration_name="my-hnsw",
                vectorizer_name="my-vectorizer",
            )
        ],
        algorithms=[
            HnswAlgorithmConfiguration(
                name="my-hnsw",
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(metric=VectorSearchAlgorithmMetric.COSINE),
            )
        ],
        vectorizers=[
            AzureMachineLearningVectorizer(
                name="my-vectorizer",
                vectorizer_name="my-vectorizer",
                aml_parameters=AzureMachineLearningParameters(
                    scoring_uri=scoring_uri,
                    authentication_key=authentication_key,
                    model_name=model_name,
                ),
            )
        ],
    )

    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
    client.create_or_update_index(index=index)


# Example usage: Creating indexes for both Int8 and Float32 embeddings with query vectorizer
search_index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_SERVICE_ENDPOINT, credential=credential
)

print("Creating or updating Int8 embedding index with query vectorizer...")
create_or_update_index(
    search_index_client,
    index_name=int8_index_name,
    vector_field_type="Collection(Edm.SByte)",  # Int8 embedding storage format
    scoring_uri=AZURE_AI_STUDIO_COHERE_EMBED_ENDPOINT,
    authentication_key=AZURE_AI_STUDIO_COHERE_EMBED_KEY,
    model_name=AIStudioModelCatalogName.COHERE_EMBED_V3_ENGLISH,
)

print("Creating or updating Float32 embedding index with query vectorizer...")
create_or_update_index(
    search_index_client,
    index_name=float_index_name,
    vector_field_type="Collection(Edm.Single)",  # Float32 embedding storage format
    scoring_uri=AZURE_AI_STUDIO_COHERE_EMBED_ENDPOINT,
    authentication_key=AZURE_AI_STUDIO_COHERE_EMBED_KEY,
    model_name=AIStudioModelCatalogName.COHERE_EMBED_V3_ENGLISH,
)

name is not a known attribute of class <class 'azure.search.documents.indexes._generated.models._models_py3.AzureMachineLearningVectorizer'> and will be ignored


Creating or updating Int8 embedding index with query vectorizer...


name is not a known attribute of class <class 'azure.search.documents.indexes._generated.models._models_py3.AzureMachineLearningVectorizer'> and will be ignored


Creating or updating Float32 embedding index with query vectorizer...


## Step 7: Upload Product Data and Vectors to Azure AI Search

In [64]:
from azure.search.documents import SearchIndexingBufferedSender

# Function to upload embeddings using SearchIndexingBufferedSender
def upload_embeddings_to_index(embeddings_name, embeddings_description, search_client, index_name, batch_size=100):
    documents = []
    
    # Prepare documents with embeddings
    for i, (name_embedding, desc_embedding) in enumerate(zip(embeddings_name, embeddings_description)):
        document = {
            "product_id": str(products_df.index[i]),
            "product_name": products_df["product_name"][i],
            "product_description": products_df["product_description"][i],
            "product_name_vector": name_embedding,
            "product_description_vector": desc_embedding,
        }
        documents.append(document)
    
    # Initialize SearchIndexingBufferedSender for batch uploads
    with SearchIndexingBufferedSender(
        endpoint=AZURE_SEARCH_SERVICE_ENDPOINT,
        index_name=index_name,
        credential=AzureKeyCredential(AZURE_SEARCH_ADMIN_KEY),
        auto_flush_interval=60,  # Automatically flush every 60 seconds
        initial_batch_action_count=batch_size  # Batch size for actions
    ) as batch_client:
        
        # Upload documents in batches
        for doc_batch in [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]:
            batch_client.upload_documents(documents=doc_batch)
        
        print(f"Uploaded {len(documents)} documents to the index '{index_name}' using buffered sender.")

# Upload embeddings to respective indexes
upload_embeddings_to_index(product_name_int8_embeddings, product_description_int8_embeddings, int8_search_client, int8_index_name)
upload_embeddings_to_index(product_name_float_embeddings, product_description_float_embeddings, float_search_client, float_index_name)


Uploaded 5000 documents to the index 'wands-cohere-int8-index' using buffered sender.
Uploaded 5000 documents to the index 'wands-cohere-float32-index' using buffered sender.


## Step 8: Load Query Data and Ground Truth

In [5]:
queries_df = pd.read_csv("eval/products/query.csv", sep="\t", index_col="query_id")
labels_df = pd.read_csv("eval/products/label.csv", sep="\t")

# Map ground truth labels to scores
relevancy_scores = {"Exact": 10, "Partial": 5, "Irrelevant": 0}
labels_df["score"] = labels_df["label"].map(relevancy_scores)
labels_df.head()


Unnamed: 0,id,query_id,product_id,label,score
0,0,0,25434,Exact,10
1,1,0,12088,Irrelevant,0
2,2,0,42931,Exact,10
3,3,0,2636,Exact,10
4,4,0,42923,Exact,10


## Step 9: Perform Vector Search Using Azure AI Search


In [6]:
# Function to perform vector search using the vectorizer in Azure AI Search
def perform_vector_search(search_client, queries_df, field):
    run_dict = defaultdict(dict)
    
    for index, row in queries_df.iterrows():
        query_text = row["query"]
        
        # Use the vectorizer already configured in the Azure AI Search index for query embedding generation
        vector_query = VectorizableTextQuery(text=query_text, k_nearest_neighbors=3, fields=field)
        
        # Perform vector search using the Azure AI Search client
        results = search_client.search(search_text=None, vector_queries=[vector_query], top=3)
        
        query_id = f"{index}"  # Ensure query_id matches what's in qrels
        for result in results:
            # Use the actual product_id from the search results instead of generating a 'doc_' ID
            product_id = result['product_id']
            score = result['@search.score']
            
            # Populate the run_dict using product_id and score
            run_dict[query_id][str(product_id)] = score
    
    return run_dict




# Perform vector search for both Int8 and Float embeddings
print("Performing vector search on Int8 embeddings...")
int8_name_run_dict = perform_vector_search(int8_search_client, queries_df, "product_name_vector")
int8_description_run_dict = perform_vector_search(int8_search_client, queries_df, "product_description_vector")
int8_combined_run_dict = perform_vector_search(int8_search_client, queries_df, "product_name_vector, product_description_vector")


print("Performing vector search on Float embeddings...")
float_name_run_dict = perform_vector_search(float_search_client, queries_df, "product_name_vector")
float_description_run_dict = perform_vector_search(float_search_client, queries_df, "product_description_vector")
float_combined_run_dict = perform_vector_search(float_search_client, queries_df, "product_name_vector, product_description_vector")


Performing vector search on Int8 embeddings...
Performing vector search on Float embeddings...


## Step 10: Evaluate Search Relevance Using Ranx

In [8]:
# Step 10: Evaluate search relevance using ranx

# Ensure query_id and product_id columns are of type string (object)
labels_df["query_id"] = labels_df["query_id"].astype(str)
labels_df["product_id"] = labels_df["product_id"].astype(str)

# Create qrels from labels after converting dtypes
qrels = Qrels.from_df(labels_df, q_id_col="query_id", doc_id_col="product_id", score_col="score")

# Create runs for int8, float, and combined comparisons
int8_name_run = Run(int8_name_run_dict, name="int8_product_name")
int8_description_run = Run(int8_description_run_dict, name="int8_product_description")
float_name_run = Run(float_name_run_dict, name="float_product_name")
float_description_run = Run(float_description_run_dict, name="float_product_description")
int8_combined_run = Run(int8_combined_run_dict, name="int8_combined")
float_combined_run = Run(float_combined_run_dict, name="float_combined")

# Compare search relevance metrics across different models
report = compare(
    qrels=qrels,
    runs=[
        int8_name_run, 
        int8_description_run, 
        float_name_run, 
        float_description_run, 
        int8_combined_run,   
        float_combined_run   
    ],
    metrics=["precision@3", "recall@3", "mrr@3", "dcg@3", "ndcg@3"],
    make_comparable=True  # Ensure that qrels and runs have matching query IDs
)

# View results using available methods for the `Report` object
print("Comparison Results:")

# Convert the report to a DataFrame and display it
results_df = report.to_dataframe()
results_df

# Optionally, export results to a CSV
# results_df.to_csv("comparison_results.csv", index=False)


Comparison Results:


Unnamed: 0,model_names,precision@3,recall@3,mrr@3,dcg@3,ndcg@3
0,int8_product_name,0.726389,0.013993,0.816667,10.336328,0.568412
1,int8_product_description,0.596528,0.011205,0.704167,8.873794,0.480235
2,float_product_name,0.725,0.013955,0.820486,10.325911,0.56834
3,float_product_description,0.596528,0.011212,0.703819,8.877886,0.480424
4,int8_combined,0.674306,0.013293,0.787153,9.965795,0.543094
5,float_combined,0.672222,0.01317,0.785764,9.943598,0.541845
