## Azure AI Search integrated vectorization over a CSV

In [2]:
! pip install --pre azure-search-documents --quiet
! pip install azure-identity azure-storage-blob openai --quiet

## Import libraries

In [3]:
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    AzureOpenAIEmbeddingSkill,
    AzureOpenAIModelName,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    BlobIndexerParsingMode,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    FieldMapping,
    HnswAlgorithmConfiguration,
    HnswParameters,
    IndexerExecutionStatus,
    IndexingParameters,
    IndexingParametersConfiguration,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataSourceType,
    SearchIndexerSkillset,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
)
from azure.search.documents.models import VectorizableTextQuery, VectorizedQuery
from azure.storage.blob import BlobClient, BlobServiceClient, ContainerClient
from dotenv import load_dotenv
import logging
import os
from openai import AzureOpenAI


## Load Environment Variables

In [4]:
# Load environment variables
load_dotenv()

# Environment Variables
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_API_VERSION = "2024-02-01"  
BLOB_CONNECTION_STRING = os.getenv("BLOB_CONNECTION_STRING")
AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME = os.getenv("AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME")
BLOB_RESOURCE_ID = os.getenv("BLOB_RESOURCE_ID")
BLOB_CONTAINER_NAME = os.getenv("BLOB_CONTAINER_NAME")
AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME")
SEARCH_SERVICE_ENDPOINT = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
SEARCH_SERVICE_API_KEY = os.getenv("AZURE_SEARCH_ADMIN_KEY")  
INDEX_NAME = "csv-sample" 


## Authentication for Azure OpenAI

In [5]:
# User-specified parameter
USE_AAD_FOR_AOAI = True

def authenticate_openai(api_key=None, use_aad_for_aoai=False):
    from azure.identity import get_bearer_token_provider
    from openai import AzureOpenAI

    if use_aad_for_aoai:
        print("Using AAD for authentication.")
        credential = DefaultAzureCredential()
        token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
        client = AzureOpenAI(
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
            api_version=AZURE_OPENAI_API_VERSION,
            azure_ad_token_provider=token_provider,
        )
    else:
        print("Using API keys for authentication.")
        if api_key is None:
            raise ValueError("API key must be provided if not using AAD for authentication.")
        client = AzureOpenAI(
            api_key=api_key,
            api_version=AZURE_OPENAI_API_VERSION,
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
        )
    return client

openai_client = authenticate_openai(api_key=AZURE_OPENAI_API_KEY, use_aad_for_aoai=USE_AAD_FOR_AOAI)

Using AAD for authentication.


## Authentication for Azure AI Search

In [6]:
# User-specified parameter
USE_AAD_FOR_SEARCH = True  

def authenticate_azure_search(api_key=None, use_aad_for_search=False):
    if use_aad_for_search:
        print("Using AAD for authentication.")
        credential = DefaultAzureCredential()
    else:
        print("Using API keys for authentication.")
        if api_key is None:
            raise ValueError("API key must be provided if not using AAD for authentication.")
        credential = AzureKeyCredential(api_key)
    return credential

azure_search_credential = authenticate_azure_search(api_key=SEARCH_SERVICE_API_KEY, use_aad_for_search=USE_AAD_FOR_SEARCH)

Using AAD for authentication.


## Connect to Azure Blob Storage

In [7]:
def upload_file_to_blob(connection_string, container_name, file_path):
    """Upload a file to the specified blob container."""
    try:
        # Initialize the BlobServiceClient
        blob_service_client = BlobServiceClient.from_connection_string(connection_string)

        # Get the container client
        container_client = blob_service_client.get_container_client(container_name)

        # Create the container if it doesn't exist
        container_client.create_container()

        # Upload the file
        file_name = os.path.basename(file_path)
        blob_client = container_client.get_blob_client(file_name)
        with open(file_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

        print(f"Uploaded blob: {file_name} to container: {container_name}")

    except Exception as e:
        print(f"Error: {e}")

# Main workflow
CSV_FILE_PATH = os.path.join("data", "csv", "AG_news_samples.csv")

upload_file_to_blob(BLOB_CONNECTION_STRING, BLOB_CONTAINER_NAME, CSV_FILE_PATH)


Error: The specified container already exists.
RequestId:4d1f262e-301e-0066-425a-b4a9f0000000
Time:2024-06-01T19:29:49.2680683Z
ErrorCode:ContainerAlreadyExists
Content: <?xml version="1.0" encoding="utf-8"?><Error><Code>ContainerAlreadyExists</Code><Message>The specified container already exists.
RequestId:4d1f262e-301e-0066-425a-b4a9f0000000
Time:2024-06-01T19:29:49.2680683Z</Message></Error>


## Create a Blob Data Source Connector on Azure AI Search

In [8]:
def create_or_update_data_source(indexer_client, container_name, resource_id, index_name):
    """Create or update a data source connection for Azure Cognitive Search using a connection string. """
    try:
        container = SearchIndexerDataContainer(name=container_name)

        data_source_connection = SearchIndexerDataSourceConnection(
            name=f"{index_name}-blob",
            type=SearchIndexerDataSourceType.AZURE_BLOB,
            connection_string=resource_id,
            container=container
        )
        data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)

        print(f"Data source '{data_source.name}' created or updated")
    except Exception as e:
        print(f"Failed to create or update data source: {e}")

# Initialize the SearchIndexerClient with a credential
indexer_client = SearchIndexerClient(SEARCH_SERVICE_ENDPOINT, azure_search_credential)

# Call the function to create or update the data source
create_or_update_data_source(indexer_client, BLOB_CONTAINER_NAME, BLOB_RESOURCE_ID, INDEX_NAME)


Data source 'csv-sample-blob' created or updated


## Create a search index

In [8]:
def create_fields():
    """Creates the fields for the search index based on the specified schema."""
    return [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True),
        SearchField(name="title", type=SearchFieldDataType.String, searchable=True),
        SearchField(name="description", type=SearchFieldDataType.String, searchable=True),
        SearchField(name="label", type=SearchFieldDataType.String, facetable=True,filterable=True),
        SearchField(name="label_int", type=SearchFieldDataType.Int32, sortable=True, filterable=True, facetable=True),
        SearchField(
            name="vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            vector_search_dimensions=3072,
            vector_search_profile_name="myHnswProfile",
            hidden=False,
            stored=True
        ),
    ]

def create_vector_search_configuration():
    """Creates the vector search configuration."""
    return VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(
                name="myHnsw",
                parameters=HnswParameters(
                    m=4,
                    ef_construction=400,
                    ef_search=500,
                    metric=VectorSearchAlgorithmMetric.COSINE,
                ),
            ),
            ExhaustiveKnnAlgorithmConfiguration(
                name="myExhaustiveKnn",
                parameters=ExhaustiveKnnParameters(
                    metric=VectorSearchAlgorithmMetric.COSINE,
                ),
            ),
        ],
        profiles=[
            VectorSearchProfile(
                name="myHnswProfile",
                algorithm_configuration_name="myHnsw",
                vectorizer="myOpenAI",
            ),
            VectorSearchProfile(
                name="myExhaustiveKnnProfile",
                algorithm_configuration_name="myExhaustiveKnn",
                vectorizer="myOpenAI",
            ),
        ],
        vectorizers=[
            AzureOpenAIVectorizer(
                name="myOpenAI",
                kind="azureOpenAI",
                azure_open_ai_parameters=AzureOpenAIParameters(
                    resource_uri=AZURE_OPENAI_ENDPOINT,
                    deployment_id=AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME,
                    api_key=AZURE_OPENAI_API_KEY,
                    model_name=AzureOpenAIModelName.TEXT_EMBEDDING3_LARGE
                ),
            ),
        ],
    )

def create_semantic_search_configuration():
    """Creates the semantic search configuration."""
    return SemanticSearch(configurations=[
        SemanticConfiguration(
            name="mySemanticConfig",
            prioritized_fields=SemanticPrioritizedFields(
                title_field=SemanticField(field_name="title"),
                content_fields=[SemanticField(field_name="description")]
            ),
        )
    ])

def create_search_index(index_name, fields, vector_search, semantic_search):
    """Creates or updates the search index."""
    index = SearchIndex(
        name=index_name,
        fields=fields,
        vector_search=vector_search,
        semantic_search=semantic_search
    )
    try:
        result = index_client.create_or_update_index(index)
        print(f"{result.name} created")
    except Exception as e:
        print(f"Failed to create or update index: {e}")

index_client = SearchIndexClient(endpoint=SEARCH_SERVICE_ENDPOINT, credential=azure_search_credential)
fields = create_fields()
vector_search = create_vector_search_configuration()
semantic_search = create_semantic_search_configuration()

# Create the search index with the adjusted schema
create_search_index(INDEX_NAME, fields, vector_search, semantic_search)

csv-sample created


## Create a skillset

In [9]:
def create_embedding_skill(azure_openai_endpoint, azure_openai_embedding_deployment, azure_openai_key):
    """Defines the embedding skill for generating embeddings via Azure OpenAI."""
    return AzureOpenAIEmbeddingSkill(
        description="Skill to generate embeddings via Azure OpenAI",
        context="/document",
        resource_uri=azure_openai_endpoint,
        deployment_id=azure_openai_embedding_deployment,
        model_name=AzureOpenAIModelName.TEXT_EMBEDDING3_LARGE,
        api_key=azure_openai_key,
        inputs=[
            InputFieldMappingEntry(name="text", source="/document/description"),
        ],
        outputs=[
            OutputFieldMappingEntry(name="embedding")
        ],
    )

def create_skillset(client, skillset_name, embedding_skill):
    """Creates or updates the skillset with an embedding skill."""
    skillset = SearchIndexerSkillset(
        name=skillset_name,
        description="Skillset for generating embeddings",
        skills=[embedding_skill],
    )
    try:
        client.create_or_update_skillset(skillset)
        print(f"{skillset.name} created")
    except Exception as e:
        print(f"Failed to create or update skillset {skillset_name}: {e}")

# Example usage
skillset_name = f"{INDEX_NAME}-skillset"
client = SearchIndexerClient(endpoint=SEARCH_SERVICE_ENDPOINT, credential=azure_search_credential)

embedding_skill = create_embedding_skill(AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME, AZURE_OPENAI_API_KEY)

create_skillset(client, skillset_name, embedding_skill)

csv-sample-skillset created


## Create Indexer

In [16]:
def create_and_run_indexer(
    indexer_client, indexer_name, skillset_name, index_name, data_source_name
):
    """
    Creates an indexer, applies it to a given index, and runs the indexing process.
    """
    try:
        indexer = SearchIndexer(
            name=indexer_name,
            description="Indexer to index documents and generate embeddings",
            skillset_name=skillset_name,
            target_index_name=index_name,
            data_source_name=data_source_name,
            # Indexing parameters to correctly parse CSV files
            parameters=IndexingParameters(
                batch_size=100,  # Adjust based on your content size and requirements
                configuration=IndexingParametersConfiguration(
                    parsing_mode=BlobIndexerParsingMode.DELIMITED_TEXT,
                    first_line_contains_headers=True,
                    query_timeout=None,
                ),
            ),
            output_field_mappings=[FieldMapping(source_field_name="/document/embedding", target_field_name="vector")]
        )

        # Create or update the indexer
        indexer_client.create_or_update_indexer(indexer)
        print(f"{indexer_name} created or updated.")

        # Run the indexer
        indexer_client.run_indexer(indexer_name)
        print(
            f"{indexer_name} is running. If queries return no results, please wait a bit and try again."
        )
    except Exception as e:
        print(f"Failed to create or run indexer {indexer_name}: {e}")

# Main workflow
data_source_name = f"{INDEX_NAME}-blob"
indexer_name = f"{INDEX_NAME}-indexer"
indexer_client = SearchIndexerClient(
    endpoint=SEARCH_SERVICE_ENDPOINT, credential=azure_search_credential
)

create_and_run_indexer(
    indexer_client, indexer_name, skillset_name, INDEX_NAME, data_source_name
)

csv-sample-indexer created or updated.
csv-sample-indexer is running. If queries return no results, please wait a bit and try again.


## Poll for indexer completion

In [18]:
indexer_last_result = indexer_client.get_indexer_status(indexer_name).last_result
indexer_status = IndexerExecutionStatus.IN_PROGRESS if indexer_last_result is None  else indexer_last_result.status

while(indexer_status == IndexerExecutionStatus.IN_PROGRESS):
    indexer_last_result = indexer_client.get_indexer_status(indexer_name).last_result
    indexer_status = IndexerExecutionStatus.IN_PROGRESS if indexer_last_result is None  else indexer_last_result.status
    print(f"Indexer '{indexer_name}' is still running. Current status: '{indexer_status}'.")

print(f"Indexer '{indexer_name}' finished with status '{indexer_status}'.")


Indexer 'csv-sample-indexer' finished with status 'success'.


## A simple vector search

In [10]:
# Pure Vector Search
query = "What did Prime Minister Tony Blair say about climate change?"  

search_client = SearchClient(SEARCH_SERVICE_ENDPOINT, INDEX_NAME, credential=azure_search_credential)
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields="vector", exhaustive=True)
# Use the below query to pass in the raw vector query instead of the query vectorization
# vector_query = VectorizedQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")

results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    top=1
)  

for result in results:  
    print(f"title: {result['title']}")  
    print(f"description: {result['description']}")  
    print(f"label: {result['label']}")  

title: World Briefings
description: BRITAIN: BLAIR WARNS OF CLIMATE THREAT Prime Minister Tony Blair urged the international community to consider global warming a dire threat and agree on a plan of action to curb the  quot;alarming quot; growth of greenhouse gases.
label: World


## Perform RAG Using Your Data

In [11]:
import openai

client = openai.AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2024-02-01",
)

completion = client.chat.completions.create(
    model=AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME,
    messages=[
        {
            "role": "user",
            "content": query,
        },
    ],
    extra_body={
        "data_sources": [
            {
                "type": "azure_search",
                "parameters": {
                    "endpoint": SEARCH_SERVICE_ENDPOINT,
                    "index_name": INDEX_NAME,
                    "authentication": {
                        "type": "api_key",
                        "key": SEARCH_SERVICE_API_KEY,
                    },
                    "query_type": "vector_semantic_hybrid",
                    "embedding_dependency": {
                        "type": "deployment_name",
                        "deployment_name": AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME,
                    },
                    "semantic_configuration": "mySemanticConfig",
                },
            }
        ],
    },
)

import textwrap
if completion.choices:
    message_content = completion.choices[0].message.content
    wrapped_message_content = textwrap.fill(message_content, width=100)
    print(f"AI Assistant (GPT-4): {wrapped_message_content}")


AI Assistant (GPT-4): Prime Minister Tony Blair urged the international community to consider global warming a dire threat
and to agree on a plan of action to curb the "alarming" growth of greenhouse gases [doc1].
