## Azure AI Search integrated vectorization over a CSV

In [None]:
! pip install openai azure-search-documents azure-identity azure-search-blob --quiet

## Import libraries

In [13]:
import os
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from dotenv import load_dotenv
from openai import AzureOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SimpleField,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    VectorSearchAlgorithmKind,
)
from azure.identity import DefaultAzureCredential
import os
from azure.storage.blob import BlobServiceClient
import os
import logging

    

## Load Environment Variables

In [14]:
from dotenv import load_dotenv
import os

# Load settings from a local .env file (if present) into the process environment.
load_dotenv()

# Azure OpenAI settings. os.getenv returns None for any variable that is not set.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
# Overridable through the environment; falls back to the version this notebook was written for.
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2023-05-15")
AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME")

# Azure AI Search settings.
SEARCH_SERVICE_ENDPOINT = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
SEARCH_SERVICE_API_KEY = os.getenv("AZURE_SEARCH_ADMIN_KEY")
# Base name: the data source, skillset and indexer names in later cells are derived from it.
INDEX_NAME = "csv-sample"


## Authentication for Azure OpenAI

In [15]:
def authenticate_openai(use_aad_for_aoai=False):
    """Return an AzureOpenAI client for the configured endpoint and API version.

    Parameters:
    - use_aad_for_aoai: When true, authenticate with a bearer token obtained through
      DefaultAzureCredential; otherwise authenticate with AZURE_OPENAI_API_KEY.

    Returns:
    - An AzureOpenAI client instance.
    """
    from azure.identity import get_bearer_token_provider
    from openai import AzureOpenAI

    # Key-based path: nothing else to prepare, so return right away.
    if not use_aad_for_aoai:
        return AzureOpenAI(
            api_key=AZURE_OPENAI_API_KEY,
            api_version=AZURE_OPENAI_API_VERSION,
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
        )

    # Token-based path: the provider fetches tokens for the Cognitive Services scope.
    aad_credential = DefaultAzureCredential()
    bearer_tokens = get_bearer_token_provider(
        aad_credential, "https://cognitiveservices.azure.com/.default"
    )
    return AzureOpenAI(
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_version=AZURE_OPENAI_API_VERSION,
        azure_ad_token_provider=bearer_tokens,
    )


openai_client = authenticate_openai(use_aad_for_aoai=False)


## Authentication for Azure AI Search

In [None]:
def authenticate_azure_search(use_aad_for_search=True):
    """Pick the credential object used by every Azure AI Search client in this notebook.

    Parameters:
    - use_aad_for_search: When true, return a DefaultAzureCredential; otherwise wrap
      SEARCH_SERVICE_API_KEY in an AzureKeyCredential.

    Returns:
    - The credential instance.
    """
    return (
        DefaultAzureCredential()
        if use_aad_for_search
        else AzureKeyCredential(SEARCH_SERVICE_API_KEY)
    )


azure_search_credential = authenticate_azure_search(use_aad_for_search=True)


## Connect to Azure Blob Storage

In [None]:
# Setup logging: configure the root logger at INFO so the logging.info() progress
# messages emitted by the helpers below are shown.
logging.basicConfig(level=logging.INFO)


def get_blob_service_client(connection_string):
    """Initialize and return the BlobServiceClient.

    Parameters:
    - connection_string: Azure Storage connection string.

    Returns:
    - A BlobServiceClient built from the connection string.

    Raises:
    - ValueError: If connection_string is None or empty (for example because the
      BLOB_CONNECTION_STRING environment variable is not set).
    """
    # Fail early with an actionable message instead of an obscure error from inside the SDK.
    if not connection_string:
        raise ValueError(
            "Blob storage connection string is missing or empty; "
            "set the BLOB_CONNECTION_STRING environment variable."
        )
    return BlobServiceClient.from_connection_string(connection_string)


def get_container_client(blob_service_client, container_name):
    """Return a client for the named container, creating the container if it is absent.

    Parameters:
    - blob_service_client: Object exposing get_container_client(name).
    - container_name: Name of the blob container.

    Returns:
    - The container client. A failure while creating the container is logged, not
      raised, and the client is still returned.
    """
    client = blob_service_client.get_container_client(container_name)

    # Nothing to create when the container is already there.
    if client.exists():
        return client

    try:
        client.create_container()
    except Exception as err:
        logging.error(f"Error creating container: {container_name}, {err}")
    else:
        logging.info(f"Created container: {container_name}")
    return client


def upload_file_to_blob(container_client, file_path, container_name):
    """Upload a local file to the container unless a blob with that name already exists.

    Parameters:
    - container_client: Object exposing get_blob_client(name) and upload_blob(name=, data=).
    - file_path: Path of the local file; its base name becomes the blob name.
    - container_name: Container name, used only in log messages.

    Errors while reading or uploading the file are logged, not raised.
    """
    blob_name = os.path.basename(file_path)

    # Skip the upload entirely when the blob is already present.
    if container_client.get_blob_client(blob_name).exists():
        logging.info(f"{blob_name} already exists in {container_name}")
        return

    try:
        with open(file_path, "rb") as stream:
            container_client.upload_blob(name=blob_name, data=stream)
        logging.info(f"Uploaded {blob_name} to {container_name}")
    except Exception as err:
        logging.error(f"Error uploading {blob_name} to {container_name}: {err}")


# Configuration
# Storage connection string comes from the environment (None when the variable is unset).
BLOB_CONNECTION_STRING = os.getenv("BLOB_CONNECTION_STRING")
# Container that receives the CSV; get_container_client creates it when it does not exist.
BLOB_CONTAINER_NAME = "csv-files"
# Relative path, resolved against the notebook's working directory.
CSV_FILE_PATH = os.path.join("data", "csv", "AG_news_samples.csv")

# Main workflow: connect to storage, make sure the container exists, then upload the CSV
# unless a blob with the same file name is already present.
blob_service_client = get_blob_service_client(BLOB_CONNECTION_STRING)
container_client = get_container_client(blob_service_client, BLOB_CONTAINER_NAME)
upload_file_to_blob(container_client, CSV_FILE_PATH, BLOB_CONTAINER_NAME)

## Create a blob data source connector on Azure AI Search

In [None]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)
import logging

# Setup logging at INFO level. basicConfig does nothing when the root logger already has
# handlers, so this only matters if no earlier cell configured logging.
logging.basicConfig(level=logging.INFO)

def create_or_update_data_source(indexer_client, container_name, connection_string, index_name):
    """
    Register (or refresh) an "azureblob" data source connection on the search service.

    Parameters:
    - indexer_client: The SearchIndexerClient instance.
    - container_name: Name of the blob container to index.
    - connection_string: Connection string for the blob storage account.
    - index_name: Index name; the data source is named "<index_name>-blob".

    Any exception is logged rather than raised; the function returns None.
    """
    try:
        connection = SearchIndexerDataSourceConnection(
            name=f"{index_name}-blob",
            type="azureblob",
            connection_string=connection_string,
            container=SearchIndexerDataContainer(name=container_name),
        )
        saved = indexer_client.create_or_update_data_source_connection(connection)
        logging.info(f"Data source '{saved.name}' created or updated")
    except Exception as err:
        logging.error(f"Failed to create or update data source: {err}")

# Relies on SEARCH_SERVICE_ENDPOINT, azure_search_credential, BLOB_CONTAINER_NAME,
# BLOB_CONNECTION_STRING and INDEX_NAME defined in earlier cells.
indexer_client = SearchIndexerClient(SEARCH_SERVICE_ENDPOINT, azure_search_credential)

# Register the blob container as a data source named "<INDEX_NAME>-blob".
create_or_update_data_source(indexer_client, BLOB_CONTAINER_NAME, BLOB_CONNECTION_STRING, INDEX_NAME)


## Create a search index

In [None]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex
)
import logging

# Setup logging at INFO level. basicConfig does nothing when the root logger already has
# handlers, so this only matters if no earlier cell configured logging.
logging.basicConfig(level=logging.INFO)

def create_fields(vector_dimensions=3072, vector_profile_name="myHnswProfile"):
    """Creates the fields for the search index based on the specified schema.

    Parameters:
    - vector_dimensions: Length of the vectors stored in the "vector" field. It has to
      match the output size of the embedding deployment (3072 is the size produced by
      text-embedding-3-large; text-embedding-ada-002 produces 1536).
    - vector_profile_name: Name of the vector search profile the "vector" field uses; it
      must exist in the index's vector search configuration.

    Returns:
    - A list of field definitions: the "id" key, the searchable "title" and
      "description", the filterable/facetable "label" and "label_int", and "vector".
    """
    return [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True),
        SearchField(name="title", type=SearchFieldDataType.String, searchable=True),
        SearchField(name="description", type=SearchFieldDataType.String, searchable=True),
        SearchField(name="label", type=SearchFieldDataType.String, facetable=True, filterable=True),
        SearchField(name="label_int", type=SearchFieldDataType.Int32, sortable=True, filterable=True, facetable=True),
        SearchField(
            name="vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            vector_search_dimensions=vector_dimensions,
            vector_search_profile_name=vector_profile_name,
        ),
    ]

def create_vector_search_configuration():
    """Build the VectorSearch settings: two algorithms, two profiles and one vectorizer.

    Returns:
    - A VectorSearch with an HNSW and an exhaustive KNN algorithm (both cosine), one
      profile per algorithm, and an Azure OpenAI vectorizer shared by both profiles.
    """
    cosine = VectorSearchAlgorithmMetric.COSINE

    hnsw_algorithm = HnswAlgorithmConfiguration(
        name="myHnsw",
        parameters=HnswParameters(m=4, ef_construction=400, ef_search=500, metric=cosine),
    )
    exhaustive_algorithm = ExhaustiveKnnAlgorithmConfiguration(
        name="myExhaustiveKnn",
        parameters=ExhaustiveKnnParameters(metric=cosine),
    )

    # The vectorizer uses the same endpoint, deployment and key as the embedding skill.
    openai_vectorizer = AzureOpenAIVectorizer(
        name="myOpenAI",
        kind="azureOpenAI",
        azure_open_ai_parameters=AzureOpenAIParameters(
            resource_uri=AZURE_OPENAI_ENDPOINT,
            deployment_id=AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME,
            api_key=AZURE_OPENAI_API_KEY,
        ),
    )

    # The profiles differ only in the algorithm they point at.
    profile_to_algorithm = (
        ("myHnswProfile", "myHnsw"),
        ("myExhaustiveKnnProfile", "myExhaustiveKnn"),
    )
    profiles = [
        VectorSearchProfile(
            name=profile_name,
            algorithm_configuration_name=algorithm_name,
            vectorizer="myOpenAI",
        )
        for profile_name, algorithm_name in profile_to_algorithm
    ]

    return VectorSearch(
        algorithms=[hnsw_algorithm, exhaustive_algorithm],
        profiles=profiles,
        vectorizers=[openai_vectorizer],
    )

def create_semantic_search_configuration():
    """Build the semantic search settings for the index.

    Returns:
    - A SemanticSearch holding one configuration, "my-semantic-config", that uses
      "title" as the title field and "description" as the only content field.
    """
    prioritized = SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        content_fields=[SemanticField(field_name="description")],
    )
    configuration = SemanticConfiguration(
        name="my-semantic-config",
        prioritized_fields=prioritized,
    )
    return SemanticSearch(configurations=[configuration])

def create_search_index(index_name, fields, vector_search, semantic_search, client=None):
    """Creates or updates the search index.

    Parameters:
    - index_name: Name of the index.
    - fields: Field definitions for the index.
    - vector_search: VectorSearch configuration.
    - semantic_search: SemanticSearch configuration.
    - client: Optional SearchIndexClient. When omitted, the notebook-level `index_client`
      is used, so existing four-argument calls behave as before.

    Failures are logged rather than raised; the function returns None.
    """
    index = SearchIndex(
        name=index_name,
        fields=fields,
        vector_search=vector_search,
        semantic_search=semantic_search,
    )
    try:
        # Resolved inside the try so a missing global client is logged like any other failure.
        if client is None:
            client = index_client
        result = client.create_or_update_index(index)
        logging.info(f"{result.name} created")
    except Exception as e:
        logging.error(f"Failed to create or update index: {e}")

# Relies on SEARCH_SERVICE_ENDPOINT, azure_search_credential and the AZURE_OPENAI_*
# settings defined in earlier cells. create_search_index reads `index_client` as a global,
# so it has to be assigned before that function is called.
index_client = SearchIndexClient(endpoint=SEARCH_SERVICE_ENDPOINT, credential=azure_search_credential)
fields = create_fields()
vector_search = create_vector_search_configuration()
semantic_search = create_semantic_search_configuration()

# Create (or update) the index named INDEX_NAME with the schema built above.
create_search_index(INDEX_NAME, fields, vector_search, semantic_search)


## Create a skillset

In [None]:
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    SearchIndexerSkillset
)
from azure.search.documents.indexes import SearchIndexerClient
import logging

# Setup logging at INFO level. basicConfig does nothing when the root logger already has
# handlers, so this only matters if no earlier cell configured logging.
logging.basicConfig(level=logging.INFO)

def create_split_skill():
    """Build the SplitSkill that chunks each document's content into "pages".

    Returns:
    - A SplitSkill reading "/document/content" and writing its "textItems" output to
      "pages", configured with no overlap and limited to a single page per document.
    """
    skill_inputs = [InputFieldMappingEntry(name="text", source="/document/content")]
    skill_outputs = [OutputFieldMappingEntry(name="textItems", target_name="pages")]

    return SplitSkill(
        description="Split skill to chunk documents into manageable sizes",
        context="/document",
        text_split_mode="pages",
        # Adjust to your content size and requirements.
        maximum_page_length=10000,
        # No overlap between chunks is wanted here.
        page_overlap_length=0,
        # Keep only the first chunk.
        maximum_pages_to_take=1,
        inputs=skill_inputs,
        outputs=skill_outputs,
    )

def create_embedding_skill(azure_openai_endpoint, azure_openai_embedding_deployment, azure_openai_key):
    """Build the AzureOpenAIEmbeddingSkill that embeds the first chunk of each document.

    Parameters:
    - azure_openai_endpoint: Azure OpenAI resource URI.
    - azure_openai_embedding_deployment: Deployment id of the embedding model.
    - azure_openai_key: API key for the Azure OpenAI resource.

    Returns:
    - A skill whose context and text input are both "/document/pages/0" and whose
      "embedding" output is written under the target name "vector".
    """
    # Only the first chunk is embedded, so it is both the context and the input.
    first_chunk = "/document/pages/0"

    text_input = InputFieldMappingEntry(name="text", source=first_chunk)
    vector_output = OutputFieldMappingEntry(name="embedding", target_name="vector")

    return AzureOpenAIEmbeddingSkill(
        description="Skill to generate embeddings via Azure OpenAI",
        context=first_chunk,
        resource_uri=azure_openai_endpoint,
        deployment_id=azure_openai_embedding_deployment,
        api_key=azure_openai_key,
        inputs=[text_input],
        outputs=[vector_output],
    )

def create_skillset(client, skillset_name, split_skill, embedding_skill):
    """Create or update a skillset that runs the split skill, then the embedding skill.

    Parameters:
    - client: SearchIndexerClient used to send the skillset to the service.
    - skillset_name: Name of the skillset.
    - split_skill: The chunking skill.
    - embedding_skill: The embedding skill.

    Failures are logged rather than raised; the function returns None.
    """
    definition = SearchIndexerSkillset(
        name=skillset_name,
        description="Skillset for chunking documents and generating embeddings",
        skills=[split_skill, embedding_skill],
    )
    try:
        client.create_or_update_skillset(definition)
    except Exception as err:
        logging.error(f"Failed to create or update skillset {skillset_name}: {err}")
    else:
        logging.info(f"{definition.name} created")

# The skillset is named after the index; the indexer cell reuses `skillset_name`.
skillset_name = f"{INDEX_NAME}-skillset"
# Separate SearchIndexerClient instance for this cell, built from the same endpoint and credential.
client = SearchIndexerClient(endpoint=SEARCH_SERVICE_ENDPOINT, credential=azure_search_credential)

# Build both skills; the embedding skill uses the Azure OpenAI settings from the environment.
split_skill = create_split_skill()
embedding_skill = create_embedding_skill(AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME, AZURE_OPENAI_API_KEY)

# Send the skillset to the search service.
create_skillset(client, skillset_name, split_skill, embedding_skill)


## Create Indexer

In [None]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping,
    IndexingParameters,
    IndexingParametersConfiguration,
    BlobIndexerParsingMode,
)
import logging

# Setup logging at INFO level. basicConfig does nothing when the root logger already has
# handlers, so this only matters if no earlier cell configured logging.
logging.basicConfig(level=logging.INFO)


def create_and_run_indexer(
    indexer_client, indexer_name, skillset_name, index_name, data_source_name,
    output_field_mappings=None,
):
    """
    Creates an indexer, applies it to a given index, and runs the indexing process.

    Parameters:
    - indexer_client: SearchIndexerClient used to create and run the indexer.
    - indexer_name: Name of the indexer.
    - skillset_name: Name of the skillset the indexer executes.
    - index_name: Name of the target index.
    - data_source_name: Name of the data source connection to read from.
    - output_field_mappings: Optional list of FieldMapping objects that route skill
      outputs to index fields. Defaults to mapping "/document/pages/0/vector" (where the
      embedding skill in this notebook writes its output) to the "vector" index field.

    Failures are logged rather than raised; the function returns None.
    """
    try:
        # Skill outputs only reach the index through output field mappings; without this
        # mapping the "vector" field stays empty and vector queries match nothing.
        if output_field_mappings is None:
            output_field_mappings = [
                FieldMapping(
                    source_field_name="/document/pages/0/vector",
                    target_field_name="vector",
                )
            ]

        indexer = SearchIndexer(
            name=indexer_name,
            description="Indexer to index documents and generate embeddings",
            skillset_name=skillset_name,
            target_index_name=index_name,
            data_source_name=data_source_name,
            # Indexing parameters to correctly parse CSV files
            parameters=IndexingParameters(
                batch_size=100,
                configuration=IndexingParametersConfiguration(
                    parsing_mode=BlobIndexerParsingMode.DELIMITED_TEXT,
                    first_line_contains_headers=True,
                    query_timeout=None,
                ),
            ),
            output_field_mappings=output_field_mappings,
        )

        # Create or update the indexer
        indexer_client.create_or_update_indexer(indexer)
        logging.info(f"{indexer_name} created or updated.")

        # Run the indexer
        indexer_client.run_indexer(indexer_name)
        logging.info(
            f"{indexer_name} is running. If queries return no results, please wait a bit and try again."
        )
    except Exception as e:
        logging.error(f"Failed to create or run indexer {indexer_name}: {e}")


# Relies on SEARCH_SERVICE_ENDPOINT, azure_search_credential and INDEX_NAME from earlier
# cells, and on `skillset_name` assigned in the skillset cell.
# Must match the name given to the data source when it was created ("<INDEX_NAME>-blob").
data_source_name = f"{INDEX_NAME}-blob"
indexer_name = f"{INDEX_NAME}-indexer"  # Indexer name is derived from INDEX_NAME
indexer_client = SearchIndexerClient(
    endpoint=SEARCH_SERVICE_ENDPOINT, credential=azure_search_credential
)

# Call the function to create and run the indexer
create_and_run_indexer(
    indexer_client,
    indexer_name,
    skillset_name,
    INDEX_NAME,
    data_source_name
)

## A simple vector search

In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery, VectorizedQuery

# Pure Vector Search: no keyword text is sent (search_text=None); the query text is passed
# as a VectorizableTextQuery against the "vector" field.
query = "oil prices"  
  
search_client = SearchClient(SEARCH_SERVICE_ENDPOINT, INDEX_NAME, credential=azure_search_credential)
# Ask for the single nearest neighbour, with exhaustive search requested.
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields="vector", exhaustive=True)
# Use the below query to pass in the raw vector query instead of the query vectorization
# NOTE(review): generate_embeddings is not defined in the cells shown here - define it before enabling this line.
# vector_query = VectorizedQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")
  
results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    top=1
)  
  
# Print the stored fields of each returned document.
for result in results:  
    print(f"title: {result['title']}")  
    print(f"description: {result['description']}")  
    print(f"label: {result['label']}")  

