# Azure AI Search integrated vectorization
This code demonstrates how to use Azure AI Search as a vector store by automatically chunking and generating embeddings using the AzureOpenAIEmbedding skill as part of the skillset pipeline in Azure AI Search. 

In [4]:
import os

from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv

# Load settings from a local .env file. override=True lets values from .env
# take precedence over variables already set in the process environment.
load_dotenv(override=True)

# Every setting below is read with os.environ[...], so each key must be defined
# (in .env or the environment); a missing key raises KeyError when this cell runs.
endpoint_azure_search = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
# Admin key wrapped in the credential object that is handed to the search clients.
credential = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"])

# Search index name, followed by the Blob Storage and Azure OpenAI settings.
index_name = os.environ["AZURE_SEARCH_INDEX"]
blob_connection_string = os.environ["BLOB_CONNECTION_STRING"]
blob_container_name = os.environ["BLOB_CONTAINER_NAME"]
azure_openai_endpoint = os.environ["AZURE_OPENAI_SERVICE_BASEURL"]
azure_openai_key = os.environ["AZURE_OPENAI_SERVICE_TOKEN"]
azure_openai_version = os.environ["AZURE_OPENAI_SERVICE_API_VERSION"]

# Name of the Azure OpenAI embedding deployment.
azure_openai_embedding_deployment = os.environ["AZURE_OPENAI_SERVICE_EMBEDDING_DEPLOYMENT"]

## Connect to Blob Storage  
Retrieve list of documents from Blob Storage.  

In [19]:
import os

from azure.storage.blob import BlobServiceClient

# Connect to the storage account and get a client scoped to the configured container.
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(blob_container_name)

# Fail fast with an actionable message instead of a bare, message-less Exception.
# Only the container name is reported; the connection string holds secrets.
if not container_client.exists():
    raise RuntimeError(
        f"Blob container '{blob_container_name}' does not exist; "
        "check BLOB_CONTAINER_NAME and BLOB_CONNECTION_STRING."
    )

# Print the name of every blob found in the container.
for blob in container_client.list_blobs():
    print(blob.name)

N100 - cropped.docx
N100 Veg-og gateutforming.docx


## Connect your Blob storage to a data source in Azure AI Search

In [9]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
)

# Create a data source
# Client used to manage data sources on the search service.
indexer_client = SearchIndexerClient(
    endpoint=endpoint_azure_search,
    credential=credential,
)

# Describe the blob container as a data source named "<index_name>-blob".
container = SearchIndexerDataContainer(name=blob_container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    container=container,
    connection_string=blob_connection_string,
    type="azureblob",
    name=f"{index_name}-blob",
)

# Create the data source, or update it if one with this name already exists.
data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)
print(f"Data source '{data_source.name}' created or updated")

Data source 'vector-1707386552871-blob' created or updated


## Create a search index
Disabled because the vectorizer part of the Python SDK is not yet available.

In [20]:
# from azure.search.documents.indexes import SearchIndexClient
# from azure.search.documents.indexes.models import (
#     HnswAlgorithmConfiguration,
#     HnswParameters,
#     SearchField,
#     SearchFieldDataType,
#     SearchIndex,
#     SemanticConfiguration,
#     SemanticField,
#     SemanticPrioritizedFields,
#     SemanticSearch,
#     VectorSearch,
#     VectorSearchAlgorithmMetric,
#     VectorSearchProfile,
# )

# # Create a search index
# index_client = SearchIndexClient(endpoint=endpoint_azure_search, credential=credential)
# fields = [
#     SearchField(
#         name="parent_id",
#         type=SearchFieldDataType.String,
#         sortable=True,
#         filterable=True,
#         facetable=True,
#     ),
#     SearchField(name="title", type=SearchFieldDataType.String),
#     SearchField(
#         name="chunk_id",
#         type=SearchFieldDataType.String,
#         key=True,
#         sortable=True,
#         filterable=True,
#         facetable=True,
#         analyzer_name="keyword",
#     ),
#     SearchField(
#         name="chunk",
#         type=SearchFieldDataType.String,
#         sortable=False,
#         filterable=False,
#         facetable=False,
#     ),
#     SearchField(
#         name="vector",
#         type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
#         vector_search_dimensions=1536,
#         vector_search_profile_name="myHnswProfile",
#     ),
# ]

# # Configure the vector search configuration
# vector_search = VectorSearch(
#     algorithms=[
#         HnswAlgorithmConfiguration(
#             name="myHnsw",
#             parameters=HnswParameters(
#                 m=4,
#                 ef_construction=400,
#                 ef_search=500,
#                 metric=VectorSearchAlgorithmMetric.COSINE,
#             ),
#         ),
#     ],
#     profiles=[
#         VectorSearchProfile(
#             name="myHnswProfile",
#             algorithm_configuration_name="myHnsw",
#         ),
#     ],
# )

# semantic_config = SemanticConfiguration(
#     name="my-semantic-config",
#     prioritized_fields=SemanticPrioritizedFields(
#         content_fields=[SemanticField(field_name="chunk")]
#     ),
# )

# # Create the semantic search with the configuration
# semantic_search = SemanticSearch(configurations=[semantic_config])

# # Create the search index
# index = SearchIndex(
#     name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search
# )
# result = index_client.create_or_update_index(index)
# print(f"{result.name} created")

## Create a skillset
Disabled because the AzureOpenAIEmbedding skill is not yet part of the Python SDK.

In [10]:
# from azure.search.documents.indexes.models import (
#     AzureOpenAIEmbeddingSkill,
#     IndexProjectionMode,
#     InputFieldMappingEntry,
#     OutputFieldMappingEntry,
#     SearchIndexerIndexProjections,
#     SearchIndexerIndexProjectionSelector,
#     SearchIndexerIndexProjectionsParameters,
#     SearchIndexerSkillset,
#     SplitSkill,
# )

# # Create a skillset
# skillset_name = f"{index_name}-skillset"

# split_skill = SplitSkill(
#     description="Split skill to chunk documents",
#     text_split_mode="pages",
#     context="/document",
#     maximum_page_length=2000,
#     page_overlap_length=500,
#     inputs=[
#         InputFieldMappingEntry(name="text", source="/document/content"),
#     ],
#     outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
# )

# embedding_skill = AzureOpenAIEmbeddingSkill(
#     description="Skill to generate embeddings via Azure OpenAI",
#     context="/document/pages/*",
#     resource_uri=azure_openai_endpoint,
#     deployment_id=azure_openai_embedding_deployment,
#     api_key=azure_openai_key,
#     inputs=[
#         InputFieldMappingEntry(name="text", source="/document/pages/*"),
#     ],
#     outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
# )

# index_projections = SearchIndexerIndexProjections(
#     selectors=[
#         SearchIndexerIndexProjectionSelector(
#             target_index_name=index_name,
#             parent_key_field_name="parent_id",
#             source_context="/document/pages/*",
#             mappings=[
#                 InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
#                 InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),
#                 InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
#             ],
#         ),
#     ],
#     parameters=SearchIndexerIndexProjectionsParameters(
#         projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
#     ),
# )

# skillset = SearchIndexerSkillset(
#     name=skillset_name,
#     description="Skillset to chunk documents and generating embeddings",
#     skills=[split_skill, embedding_skill],
#     index_projections=index_projections,
# )

# client = SearchIndexerClient(endpoint_azure_search, credential)
# client.create_or_update_skillset(skillset)
# print(f"{skillset.name} created")

## Create an indexer
Because the skillset could not be created with the Python SDK, creating an indexer with the SDK provided little value.

In [12]:
# from azure.search.documents.indexes.models import FieldMapping, SearchIndexer

# # Create an indexer
# indexer_name = f"{index_name}-indexer"

# indexer = SearchIndexer(
#     name=indexer_name,
#     description="Indexer to index documents and generate embeddings",
#     #skillset_name=skillset_name,
#     target_index_name=index_name,
#     data_source_name=data_source.name,
#     # Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results
#     field_mappings=[
#         FieldMapping(source_field_name="metadata_storage_name", target_field_name="title")
#     ],
# )

# indexer_client = SearchIndexerClient(endpoint_azure_search, credential)
# indexer_result = indexer_client.create_or_update_indexer(indexer)

# # Run the indexer
# indexer_client.run_indexer(indexer_name)
# print(f" {indexer_name} created")

## Keyword search

In [53]:
from azure.search.documents import SearchClient

# Keyword (full-text) search: only search_text is sent, no vector query.
query = "Hva er fartsgrensen i boliggater?"
search_client = SearchClient(
    endpoint=endpoint_azure_search,
    index_name=index_name,
    credential=credential,
)

# Ask for the single best match and only the fields that are printed below.
results = search_client.search(
    top=1,
    select=["parent_id", "chunk_id", "chunk"],
    search_text=query,
)

# One field per line for every hit.
for result in results:
    print(
        f"parent_id: {result['parent_id']}",
        f"chunk_id: {result['chunk_id']}",
        f"Score: {result['@search.score']}",
        f"Content: {result['chunk']}",
        sep="\n",
    )

parent_id: aHR0cHM6Ly9yYWdwb2MuYmxvYi5jb3JlLndpbmRvd3MubmV0L3BvYy1ibG9iLWNvbnRhaW5lci9OMTAwJTIwVmVnLW9nJTIwZ2F0ZXV0Zm9ybWluZy5kb2N40
chunk_id: e9115a054c4e_aHR0cHM6Ly9yYWdwb2MuYmxvYi5jb3JlLndpbmRvd3MubmV0L3BvYy1ibG9iLWNvbnRhaW5lci9OMTAwJTIwVmVnLW9nJTIwZ2F0ZXV0Zm9ybWluZy5kb2N40_pages_15
Score: 3.3681343
Content: transformasjon og ombygging til gate.
I ytre områder av byer og tettsteder kan skillet mellom gater og veger være uklart, se kapittel 1.2Gater og veger. Dette gjelder blant annet boligveger/boliggater, atkomster til næringsområder og innfarter til byer. Slike gater/veger ligger innenfor by-/tettstedsområdet og blir derfor omtalt i kapittel 2 Gater.
Gater har tre grunnleggende funksjoner:
· Adkomst (tilgjengelighet for personer og varer)
· Transport (framkommelighet)
· Opphold (sosial funksjon)
Gatenettet er fleksibelt og gir tilgjengelighet for alle trafikantgrupper, men det er ikke hensiktsmessig å prioritere alle trafikantgrupper med separate løsninger i alle gater. Hensynet t

## Perform a vector similarity search

This example shows a pure vector search. The query text is embedded client-side with `get_embeddings` and sent to Azure AI Search as a `VectorizedQuery` against the `vector` field.

In [48]:
def get_embeddings(text: str, model=None, api_version=None):
    """Return the embedding vector for ``text`` from Azure OpenAI.

    Args:
        text: The text to embed.
        model: Name of the Azure OpenAI embedding deployment to call. When
            omitted, ``azure_openai_embedding_deployment`` from the
            configuration cell is used instead of a hard-coded name.
        api_version: Azure OpenAI API version. When omitted,
            ``azure_openai_version`` from the configuration cell is used.

    Returns:
        The ``embedding`` of the first item in the service response.
    """
    # There are a few ways to get embeddings. This is just one example.
    from openai import AzureOpenAI

    client = AzureOpenAI(
        azure_endpoint=azure_openai_endpoint,
        api_key=azure_openai_key,
        # Fall back to the configured version so .env is the single source of truth.
        api_version=api_version or azure_openai_version,
    )
    response = client.embeddings.create(
        input=text,
        # For Azure OpenAI, "model" is the deployment name.
        model=model or azure_openai_embedding_deployment,
    )
    return response.data[0].embedding

In [52]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery

# Pure Vector Search
query = "Hva er fartsgrensen i boliggater?"

search_client = SearchClient(
    endpoint=endpoint_azure_search,
    index_name=index_name,
    credential=credential,
)

# Embed the query text first, then wrap it in a query against the "vector" field.
query_embedding = get_embeddings(query)
vector_query = VectorizedQuery(
    vector=query_embedding,
    k_nearest_neighbors=3,
    fields="vector",
)

# search_text=None means no keyword query is sent, only the vector query.
results = search_client.search(
    top=1,
    select=["parent_id", "chunk_id", "chunk"],
    vector_queries=[vector_query],
    search_text=None,
)

# One field per line for every hit.
for result in results:
    print(
        f"parent_id: {result['parent_id']}",
        f"chunk_id: {result['chunk_id']}",
        f"Score: {result['@search.score']}",
        f"Content: {result['chunk']}",
        sep="\n",
    )

parent_id: aHR0cHM6Ly9yYWdwb2MuYmxvYi5jb3JlLndpbmRvd3MubmV0L3BvYy1ibG9iLWNvbnRhaW5lci9OMTAwJTIwVmVnLW9nJTIwZ2F0ZXV0Zm9ybWluZy5kb2N40
chunk_id: e9115a054c4e_aHR0cHM6Ly9yYWdwb2MuYmxvYi5jb3JlLndpbmRvd3MubmV0L3BvYy1ibG9iLWNvbnRhaW5lci9OMTAwJTIwVmVnLW9nJTIwZ2F0ZXV0Zm9ybWluZy5kb2N40_pages_31
Score: 0.8846009
Content: Fortau i boliggater/boligveger skal ha bredde minimum 1,5 m.
Gjeldende fra 22.06.2021
Krav 2.6—1 KAN
Gatene/vegene kan utformes med fortau (ensidig eller tosidig).
Gjeldende fra 22.06.2021
Krav 2.6—2 SKAL
Boliggater/boligveger skal ha stigning på maksimalt 8%.
Gjeldende fra 22.06.2021


Ellers gjelder krav til linjeføring gitt i kapittel 2.2.

2.6.1 Overordnede boliggater/boligveger
Overordnede boliggater/boligveger har fartsgrense 30 eller 40 km/t.
Krav 2.6.1—1 SKAL
Gjeldende fra 22.06.2021
Overordnede boliggater/boligveger skal utformes med kjørebanebredde 5,5 – 6 m. Gater/veger der det går buss skal ha bredde 6 m.



	

	Figur 2.6.1—1 — Overordnet boliggate/boligveg med forta

## Perform a hybrid search

In [50]:
# Hybrid Search
query = "Hva er fartsgrensen i boliggater?"

search_client = SearchClient(
    endpoint=endpoint_azure_search,
    index_name=index_name,
    credential=credential,
)

# Embed the query text first, then wrap it in a query against the "vector" field.
query_embedding = get_embeddings(query)
vector_query = VectorizedQuery(
    vector=query_embedding,
    k_nearest_neighbors=3,
    fields="vector",
)

# The same text is sent both as the keyword query and as the vector query.
results = search_client.search(
    top=1,
    select=["parent_id", "chunk_id", "chunk"],
    vector_queries=[vector_query],
    search_text=query,
)

# One field per line for every hit.
for result in results:
    print(
        f"parent_id: {result['parent_id']}",
        f"chunk_id: {result['chunk_id']}",
        f"Score: {result['@search.score']}",
        f"Content: {result['chunk']}",
        sep="\n",
    )

parent_id: aHR0cHM6Ly9yYWdwb2MuYmxvYi5jb3JlLndpbmRvd3MubmV0L3BvYy1ibG9iLWNvbnRhaW5lci9OMTAwJTIwVmVnLW9nJTIwZ2F0ZXV0Zm9ybWluZy5kb2N40
chunk_id: e9115a054c4e_aHR0cHM6Ly9yYWdwb2MuYmxvYi5jb3JlLndpbmRvd3MubmV0L3BvYy1ibG9iLWNvbnRhaW5lci9OMTAwJTIwVmVnLW9nJTIwZ2F0ZXV0Zm9ybWluZy5kb2N40_pages_22
Score: 0.0320020467042923
Content: Fartsgrense 60 km/t

	Minste horisontalkurveradi us
	40
	60
	125

	Minste vertikalkurveradius
	150
	400
	600



For høybrekk må krav til stoppsikt ivaretas.Krav 2.2.2—2 SKAL
Gjeldende fra 22.06.2021
Tabell 2.2.2—1 — Geometriske krav til gater utenfor kvartalstruktur (mål i m).

Krav 2.2.2—3 KAN
Gjeldende fra 22.06.2021
I gater utenfor kvartalsstruktur, vurderes behov for breddeutvidelse iht.kapittel5.3ved horisontalkurveradius ≤ 500 m.

2.3 Gateelementer
En gate kan bestå av ulike elementer. De fleste gater består av fortau på begge sider og kjøreareal i midten. I tillegg til, eller i stedet for ordinære kjørefelt kan gaten ha kollektivfelt, sambruksfelt eller sykkelfel

## Perform a hybrid search + semantic reranking
Disabled because semantic reranking is not available in Norway East.

In [1]:
# from azure.search.documents.models import QueryAnswerType, QueryCaptionType, QueryType

# # Semantic Hybrid Search
# query = "Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"

# search_client = SearchClient(endpoint_azure_search, index_name, credential)
# vector_query = VectorizedQuery(vector=get_embeddings(query), k_nearest_neighbors=1, fields="vector", exhaustive=True)

# results = search_client.search(
#     search_text=query,
#     vector_queries=[vector_query],
#     select=["parent_id", "chunk_id", "chunk"],
#     query_type=QueryType.SEMANTIC,
#     semantic_configuration_name="my-semantic-config",
#     query_caption=QueryCaptionType.EXTRACTIVE,
#     query_answer=QueryAnswerType.EXTRACTIVE,
#     top=1,
# )

# semantic_answers = results.get_answers()
# for answer in semantic_answers:
#     if answer.highlights:
#         print(f"Semantic Answer: {answer.highlights}")
#     else:
#         print(f"Semantic Answer: {answer.text}")
#     print(f"Semantic Answer Score: {answer.score}\n")

# for result in results:
#     print(f"parent_id: {result['parent_id']}")
#     print(f"chunk_id: {result['chunk_id']}")
#     print(f"Reranker Score: {result['@search.reranker_score']}")
#     print(f"Content: {result['chunk']}")

#     captions = result["@search.captions"]
#     if captions:
#         caption = captions[0]
#         if caption.highlights:
#             print(f"Caption: {caption.highlights}\n")
#         else:
#             print(f"Caption: {caption.text}\n")