# Azure AI Search Scalar Quantization 

In this notebook, I'll show you how to use Azure OpenAI Service to generate embeddings using the latest and highest performing model from OpenAI, `text-embedding-3-large` and how to store these in Azure AI Search.

## Install required libraries

In [10]:
! pip install azure-search-documents cohere python-dotenv azure-identity --quiet

In [36]:
import cohere
import requests
import json
import numpy as np
import os
from dotenv import load_dotenv
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.models import (
    VectorizedQuery,
)
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    HnswParameters,
    SearchField,
    SearchableField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
)
from azure.search.documents.indexes._generated.models import (
    SearchField,
    ScalarQuantizationCompressionConfiguration,
    ScalarQuantizationParameters,
)
from azure.core.credentials import AzureKeyCredential

## Set Up Cohere and Azure Credentials
Before generating embeddings or interacting with Azure AI Search, we need to set up our credentials for both Cohere and Azure AI Search.

In [1]:
load_dotenv()
cohere_api_key = os.getenv("COHERE_API_KEY")
co = cohere.Client(cohere_api_key)

search_service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
search_service_api_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
index_name = "wikipedia-2023-11-embed-multilingual-v3-index"
credential = AzureKeyCredential(search_service_api_key)

NameError: name 'load_dotenv' is not defined

In [39]:
import datasets
# Specify dataset and language parameters
dataset_name = "Cohere/wikipedia-2023-11-embed-multilingual-v3"
language = "simple"  # Use Simple English Wikipedia subset
split = "train"      # Load the training split

# Load the dataset
try:
    docs = datasets.load_dataset(dataset_name, language=language, split=split)
except Exception as e:
    print(f"An error occurred loading the dataset: {e}")

# Example usage (assuming successful loading):
print(docs["train"][0])  # Access the first document in the training split


An error occurred loading the dataset: ParquetConfig.__init__() got an unexpected keyword argument 'language'


KeyError: "Column train not in the dataset. Current columns in the dataset: ['_id', 'url', 'title', 'text', 'emb']"

In [None]:
def download_and_preprocess_dataset(dataset_url):
    response = requests.get(dataset_url)
    dataset = json.loads(response.content)

    # Customize data transformation based on your dataset and Azure AI Search requirements
    documents = []
    for data in dataset:
        # Example transformation (modify as needed)
        document = {
            "id": data["id"],  # Assuming a unique identifier field exists
            "text": data["text"],  # Assuming a text field exists for searching
            # Add other relevant fields for indexing
        }
        documents.append(document)

    return documents


# Download and pre-process the dataset
documents = download_and_preprocess_dataset(dataset_url)

print("Downloaded and pre-processed", len(documents), "documents.")

## Estimate what scale we should configure an Azure AI Search service

In [14]:
# Print the Dataset Object Directly
print(docs)

# Dataset Schema - data types and properties of columns
print("Dataset Schema:")
print(docs.features)

Dataset({
    features: ['_id', 'url', 'title', 'text', 'emb'],
    num_rows: 100000
})
Dataset Schema:
{'_id': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'emb': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}


We know we have 646,424 documents and each document has 1 vector field at 1024 dimensions so we can now estimate the raw size of vectors that will assist us with selecting a SKU and Partition count. 

In [15]:
def interpolate_overhead(dimensions):
    """
    Interpolate the HNSW algorithm overhead percentage based on the provided dimensions.
    
    Parameters:
    - dimensions: The dimensions of the vector field.
    
    Returns:
    - The interpolated or exact algorithm overhead percentage.
    """
    # Known data points for overhead percentages
    overhead_data = [(96, 20), (200, 8), (768, 2), (1536, 1)]
    
    # Sort the list by dimensions to ensure it is in ascending order
    overhead_data.sort(key=lambda x: x[0])
    
    # Check if the provided dimension is below or above known data points
    if dimensions <= overhead_data[0][0]:
        return overhead_data[0][1]
    elif dimensions >= overhead_data[-1][0]:
        return overhead_data[-1][1]
    
    # Linear interpolation for dimensions within the known data points
    for i in range(1, len(overhead_data)):
        if dimensions <= overhead_data[i][0]:
            x0, y0 = overhead_data[i-1]
            x1, y1 = overhead_data[i]
            # Linear interpolation formula
            return y0 + (y1 - y0) * (dimensions - x0) / (x1 - x0)

def calculate_vector_index_size(num_documents, dimensions, algorithm, m=4, deleted_docs_ratio_percent=10):
    """
    Calculate the size of a vector index on Azure Cognitive Search, considering the chosen algorithm.
    
    Parameters:
    - num_documents: The total number of documents.
    - dimensions: The dimensions of the vector field.
    - algorithm: The algorithm used ('HNSW' or 'ExhaustiveKnn').
    - m: The HNSW parameter determining the number of bi-directional links per vector. Only used for HNSW.
    - deleted_docs_ratio_percent: The estimated percentage of deleted or updated documents. Only used for HNSW.
    
    Returns:
    - The size of the vector index in bytes.
    """
    # Size of data type in bytes (Edm.Single)
    size_of_data_type = 4
    
    # Raw size calculation
    raw_size_bytes = num_documents * dimensions * size_of_data_type
    
    if algorithm == "ExhaustiveKnn":
        # For ExhaustiveKnn, the raw size is the estimate
        return raw_size_bytes
    elif algorithm == "HNSW":
        # Perform linear interpolation to estimate algorithm overhead percentage
        algorithm_overhead_percent = interpolate_overhead(dimensions)
        
        # Calculate total size considering algorithm overhead and deleted documents ratio for HNSW
        total_size_bytes = raw_size_bytes * (1 + algorithm_overhead_percent / 100) * (1 + deleted_docs_ratio_percent / 100)
        
        return total_size_bytes
    else:
        raise ValueError("Invalid algorithm selected. Choose 'HNSW' or 'ExhaustiveKnn'.")

# Example usage
num_documents = 100000  # Number of documents
dimensions = 1024  # An example dimension not directly listed in the overhead data
algorithm = "HNSW"  # Algorithm: "HNSW" or "ExhaustiveKnn"

size_bytes = calculate_vector_index_size(num_documents, dimensions, algorithm)
print(f"Size of the vector index: {size_bytes} bytes")


Size of the vector index: 458069333.3333333 bytes


Based on the above estinated vector index size of 2.96 GB, it seems that we can safely index on a Basic service at 1 partition as the max storage for this is 15 GB on my newly created service on April 3, 2024. See https://review.learn.microsoft.com/en-us/azure/search/search-limits-quotas-capacity?branch=pr-en-us-269645#vector-limits-on-services-created-after-april-3-2024-in-supported-regions

In [28]:
def create_or_update_index(client, index_name):

    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchField(
            name="url",
            type=SearchFieldDataType.String,
            filterable=True,
        ),
        SearchField(
            name="title",
            type=SearchFieldDataType.String,
            searchable=True,
        ),
        SearchField(
            name="text",
            type=SearchFieldDataType.String,
            searchable=True,
        ),
        SearchField(
            name="embedding",
            type="Collection(Edm.Single)",

            vector_search_dimensions=1024,  # Adjust based on your actual vector dimensions
            vector_search_profile_name="my-vector-config",
        ),
    ]


    vector_search = VectorSearch(
        profiles=[
            VectorSearchProfile(

                name="my-vector-config",
                algorithm_configuration_name="my-hnsw",
                compression_configuration_name="my-scalar-quantization",
            )
        ],
        algorithms=[
            HnswAlgorithmConfiguration(
                name="my-hnsw",
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(
                    m=4, metric="cosine", ef_construction=400, ef_search=500
                ),
            )
        ],
        compressions=[
            ScalarQuantizationCompressionConfiguration(
                name="my-scalar-quantization",
                rerank_with_original_vectors=True,
                default_oversampling=10,
                parameters=ScalarQuantizationParameters(
                    quantized_data_type="int8",
                ),
            )
        ],
    )

    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)

    client.create_or_update_index(index=index)

## Index Documents and Their Embeddings
Finally, this function indexes the documents along with their int8 embeddings into Azure AI Search.

In [25]:
# Convert the Hugging Face dataset to a list of dictionaries
documents_to_index = docs.map(lambda example: {
    "id": str(example["_id"]),  # Ensures 'id' is a string
    "URL": example["url"],
    "title": example["title"],
    "text": example["text"],
    "embedding": example["emb"],  # Ensure this is in the format Azure expects
}, remove_columns=docs.column_names)  # This removes original columns not needed for indexing

# Convert to a list of dictionaries
documents_to_index = documents_to_index.to_dict('records')


Map: 100%|██████████| 100000/100000 [01:16<00:00, 1304.07 examples/s]


In [27]:
from azure.search.documents import SearchIndexingBufferedSender
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError

def upload_embeddings_with_buffered_sender(endpoint, index_name, credential, documents):
    # Initialize the SearchIndexingBufferedSender
    batch_client = SearchIndexingBufferedSender(
        endpoint=endpoint,
        index_name=index_name,
        credential=credential
    )

    try:
        # Add upload actions for all documents in a single call
        batch_client.upload_documents(documents=documents)

        # Manually flush to send any remaining documents in the buffer
        batch_client.flush()
    except HttpResponseError as e:
        print(f"An error occurred: {e}")
    finally:
        # Clean up resources by closing the batch_client
        batch_client.close()

    print("Finished indexing documents.")

# Example usage, make sure endpoint, index_name, and credential are properly defined
# Assuming credential is an instance of AzureKeyCredential
# And documents_to_index is your list of documents to be indexed



Finished indexing documents.


In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField, SearchField, SearchFieldDataType, VectorSearch,
    VectorSearchProfile, HnswAlgorithmConfiguration, VectorSearchAlgorithmKind,
    SearchIndex
)

# Make sure to define or import your create_or_update_index function here

# Initialize Azure Search Index Client for managing indexes
search_index_client = SearchIndexClient(
    endpoint=search_service_endpoint,
    credential=credential,
    index_name=index_name  # This parameter is not required here and should be omitted
)


# Create or update the search index to include the embedding field
create_or_update_index(search_index_client, index_name)

# Initialize the SearchClient for indexing documents
search_client = SearchClient(
    endpoint=search_service_endpoint, 
    index_name=index_name, 
    credential=credential
)

# Assuming docs is already defined and loaded with your documents
upload_embeddings_with_buffered_sender(search_service_endpoint, index_name, credential, documents_to_index)


## Perform a Vector Search

In [19]:
def generate_embeddings(texts, input_type="search_query"):
    model = "embed-english-v3.0"
    # Ensure texts is a list
    if isinstance(texts, str):
        texts = [texts]

    response = co.embed(
        texts=texts,
        model=model,
        input_type=input_type,
        embedding_types=["int8"],
    )
    return [embedding for embedding in response.embeddings]

In [20]:
from azure.search.documents import SearchClient

# Query for vector search
query = "foundational figures in computer science"

# Generate query embeddings
# Use input_type="search_query" for query embeddings
query_embeddings = generate_embeddings(query, input_type="search_query")

search_client = SearchClient(search_service_endpoint, index_name, credential)

vector_query = VectorizedQuery(
    vector=query_embeddings[0], k_nearest_neighbors=3, fields="embedding"
)

results = search_client.search(
    search_text=None,  # No search text for pure vector search
    vector_queries=[vector_query],
)

for result in results:
    print(f"Title: {result['text']}")
    print(f"Score: {result['@search.score']}\n")

TypeError: 'EmbeddingsByType' object is not iterable