In [1]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os
load_dotenv(override=True)

True

In [2]:
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
credential = AzureKeyCredential(os.getenv("AZURE_SEARCH_ADMIN_KEY", "")) if len(os.getenv("AZURE_SEARCH_ADMIN_KEY", "")) > 0 else DefaultAzureCredential()
index_name = os.getenv("AZURE_SEARCH_INDEX", "vectest")
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.getenv("AZURE_OPENAI_KEY", "") if len(os.getenv("AZURE_OPENAI_KEY", "")) > 0 else None
azure_openai_embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-ada-002")
azure_openai_embedding_dimensions = int(os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS", 1536))
embedding_model_name = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-ada-002")
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION", "2024-10-21")

## Create Embeddings

In [3]:
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
import json

openai_credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(openai_credential, "https://cognitiveservices.azure.com/.default")

client = AzureOpenAI(
    azure_deployment=azure_openai_embedding_deployment,
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    azure_ad_token_provider=token_provider if not azure_openai_key else None
)

In [4]:
with open('text-sample.json', 'r', encoding='utf-8') as file:
    input_data = json.load(file)

titles = [item['title'] for item in input_data]
content = [item['content'] for item in input_data]

print(titles[:5])
print()
print(content[:3])

['Azure App Service', 'Azure Functions', 'Azure Cognitive Services', 'Azure Storage', 'Azure SQL Database']

['Azure App Service is a fully managed platform for building, deploying, and scaling web apps. You can host web apps, mobile app backends, and RESTful APIs. It supports a variety of programming languages and frameworks, such as .NET, Java, Node.js, Python, and PHP. The service offers built-in auto-scaling and load balancing capabilities. It also provides integration with other Azure services, such as Azure DevOps, GitHub, and Bitbucket.', 'Azure Functions is a serverless compute service that enables you to run code on-demand without having to manage infrastructure. It allows you to build and deploy event-driven applications that automatically scale with your workload. Functions support various languages, including C#, F#, Node.js, Python, and Java. It offers a variety of triggers and bindings to integrate with other Azure services and external services. You only pay for the comp

In [5]:
title_response = client.embeddings.create(input=titles, model=embedding_model_name)
title_embeddings = [item.embedding for item in title_response.data]
len(title_embeddings)

108

In [6]:
content_response = client.embeddings.create(input=content, model=embedding_model_name)
content_embeddings = [item.embedding for item in content_response.data]
len(content_embeddings)

108

In [7]:
output_data = [ data | {"titleVector":title_embeddings[idx], "contentVector":content_embeddings[idx]} for idx, data in enumerate(input_data)]
len(output_data)

108

In [8]:
with open('docVectors.json', "w") as f:
    json.dump(output_data, f)

## Create Search Index

In [9]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters
)

In [10]:
index_client = SearchIndexClient(
    endpoint=endpoint, credential=credential,
)

index_client.get_index(name='margies-index') # connection works 

<azure.search.documents.indexes.models._index.SearchIndex at 0x2917e6b7c10>

In [11]:
# establish an index client
index_client = SearchIndexClient(
    endpoint=endpoint, credential=credential)


# define fields that we want to index
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="title", type=SearchFieldDataType.String), # string fields - preexisting in our data - SearchableField is used for full text
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String,
                    filterable=True),
    SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), # embedding vectors that we just created - SearchField used for embeddings
                searchable=True, vector_search_dimensions=azure_openai_embedding_dimensions, vector_search_profile_name="myHnswProfile"),
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=azure_openai_embedding_dimensions, vector_search_profile_name="myHnswProfile"),
]

# Configure the vector search configuration - for vector based retrieval
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration( #Hierarchical Navigable Small World (HNSW) for ANN
            name="myHnsw"
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer_name="myVectorizer"
        )
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            vectorizer_name="myVectorizer",
            parameters=AzureOpenAIVectorizerParameters(
                resource_url=azure_openai_endpoint,
                deployment_name=azure_openai_embedding_deployment,
                model_name=embedding_model_name,
                api_key=azure_openai_key
            )
        )
    ]
)

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        keywords_fields=[SemanticField(field_name="category")],
        content_fields=[SemanticField(field_name="content")]
    )
)
semantic_search = SemanticSearch(configurations=[semantic_config])

index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_search=semantic_search)
result = index_client.create_or_update_index(index)
print(f'{result.name} created')

my-demo-index created


## Upload data to Index

In [12]:
with open('docVectors.json', 'r') as file:  
    documents = json.load(file)
len(documents)

108

In [56]:
from azure.search.documents import SearchClient

search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)
print(f"Uploaded {len(documents)} documents") 

Uploaded 108 documents


In [None]:
## USE BELOW FOR UPLOADING MANY DOCUMENTS - BATCH UPLOAD

# from azure.search.documents import SearchIndexingBufferedSender

# # Use SearchIndexingBufferedSender to upload the documents in batches optimized for indexing  
# with SearchIndexingBufferedSender(  
#     endpoint=endpoint,  
#     index_name=index_name,  
#     credential=credential,  
# ) as batch_client:  
#     # Add upload actions for all documents  
#     batch_client.upload_documents(documents=documents)  
# print(f"Uploaded {len(documents)} documents in total")  

In [61]:
# Helper code to print results

from azure.search.documents import SearchItemPaged

def print_results(results: SearchItemPaged[dict]):
    semantic_answers = results.get_answers()
    if semantic_answers:
        for answer in semantic_answers:
            if answer.highlights:
                print(f"Semantic Answer: {answer.highlights}")
            else:
                print(f"Semantic Answer: {answer.text}")
            print(f"Semantic Answer Score: {answer.score}\n")

    for result in results:
        print(f"Title: {result['title']}")  
        print(f"Score: {result['@search.score']}")
        if result.get('@search.reranker_score'):
            print(f"Reranker Score: {result['@search.reranker_score']}")
        print(f"Content: {result['content']}")  
        print(f"Category: {result['category']}\n")

        captions = result["@search.captions"]
        if captions:
            caption = captions[0]
            if caption.highlights:
                print(f"Caption: {caption.highlights}\n")
            else:
                print(f"Caption: {caption.text}\n")

#### Vector Similarity Search - Pre compute embeddings

In [None]:
from azure.search.documents.models import VectorizedQuery
query = "tools for software development"  
embedding = client.embeddings.create(input=query, model=embedding_model_name).data[0].embedding
len(embedding)

# IF WE ARE USING VectorizedQuery WE NEED TO EMBED THE SEARCH TEXT OURSELVES
#  k_nearest_neighbors parameter determines how many nearest neighbors are initially retrieved from the vector search index.
# 50 neighbors serve as a candidate set for further processing, such as re-ranking or filtering.
vector_query = VectorizedQuery(vector=embedding, k_nearest_neighbors=50, fields="contentVector") # we mention which embedding field we want to compare against
vector_query

<azure.search.documents._generated.models._models_py3.VectorizedQuery at 0x230f685f940>

In [None]:
results = search_client.search(  
    search_text=None, # we are not doing a text search but embedding search
    vector_queries= [vector_query],
    select=["title", "content", "category"],
    top=3
)
# top specifies the final number of results to return after processing the candidate set.
print_results(results) # the results are closest to our search text and content field of data

Title: Azure DevOps
Score: 0.8289744
Content: Azure DevOps is a suite of services that help you plan, build, and deploy applications. It includes Azure Boards for work item tracking, Azure Repos for source code management, Azure Pipelines for continuous integration and continuous deployment, Azure Test Plans for manual and automated testing, and Azure Artifacts for package management. DevOps supports a wide range of programming languages, frameworks, and platforms, making it easy to integrate with your existing development tools and processes. It also integrates with other Azure services, such as Azure App Service and Azure Functions.
Category: Developer Tools

Title: Azure DevTest Labs
Score: 0.81774753
Content: Azure DevTest Labs is a fully managed service that enables you to create, manage, and share development and test environments in Azure. It provides features like custom templates, cost management, and integration with Azure DevOps. DevTest Labs supports various platforms, such

#### Vector Similarity Search - vectorizable text query

In [None]:
from azure.search.documents.models import VectorizableTextQuery

# Pure Vector Search
query = "tools for software development"  
  
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="contentVector") # we need not pre compute embeddings here
  
results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    select=["title", "content", "category"],
    top=3
)  
  
print_results(results) # gave same results

Title: Azure DevOps
Score: 0.8289744
Content: Azure DevOps is a suite of services that help you plan, build, and deploy applications. It includes Azure Boards for work item tracking, Azure Repos for source code management, Azure Pipelines for continuous integration and continuous deployment, Azure Test Plans for manual and automated testing, and Azure Artifacts for package management. DevOps supports a wide range of programming languages, frameworks, and platforms, making it easy to integrate with your existing development tools and processes. It also integrates with other Azure services, such as Azure App Service and Azure Functions.
Category: Developer Tools

Title: Azure DevTest Labs
Score: 0.81774753
Content: Azure DevTest Labs is a fully managed service that enables you to create, manage, and share development and test environments in Azure. It provides features like custom templates, cost management, and integration with Azure DevOps. DevTest Labs supports various platforms, such

In [None]:
query = "tools voor softwareontwikkeling" # in dutch 
  
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="contentVector")

results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    select=["title", "content", "category"],
    top=3
)  
  
print_results(results) # still gives relevant results

Title: Azure DevOps
Score: 0.80346894
Content: Azure DevOps is a suite of services that help you plan, build, and deploy applications. It includes Azure Boards for work item tracking, Azure Repos for source code management, Azure Pipelines for continuous integration and continuous deployment, Azure Test Plans for manual and automated testing, and Azure Artifacts for package management. DevOps supports a wide range of programming languages, frameworks, and platforms, making it easy to integrate with your existing development tools and processes. It also integrates with other Azure services, such as Azure App Service and Azure Functions.
Category: Developer Tools

Title: Azure DevTest Labs
Score: 0.796056
Content: Azure DevTest Labs is a fully managed service that enables you to create, manage, and share development and test environments in Azure. It provides features like custom templates, cost management, and integration with Azure DevOps. DevTest Labs supports various platforms, such 

#### Exhaustive KNN search (Previously, by default, we use ANN search)

In [None]:
query = "tools for software development"  
  
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="contentVector", exhaustive=True) # set exhaustive for full KNN
  
results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    select=["title", "content", "category"],
    top=3
)  
  
print_results(results) # gave similar results

Title: Azure DevOps
Score: 0.8289744
Content: Azure DevOps is a suite of services that help you plan, build, and deploy applications. It includes Azure Boards for work item tracking, Azure Repos for source code management, Azure Pipelines for continuous integration and continuous deployment, Azure Test Plans for manual and automated testing, and Azure Artifacts for package management. DevOps supports a wide range of programming languages, frameworks, and platforms, making it easy to integrate with your existing development tools and processes. It also integrates with other Azure services, such as Azure App Service and Azure Functions.
Category: Developer Tools

Title: Azure DevTest Labs
Score: 0.81774753
Content: Azure DevTest Labs is a fully managed service that enables you to create, manage, and share development and test environments in Azure. It provides features like custom templates, cost management, and integration with Azure DevOps. DevTest Labs supports various platforms, such

#### Cross Field Vector Search

In [None]:
query = "tools for software development"  
  
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="contentVector, titleVector") # gave multiple fields here
# used to retrieve releavant results based on similarity across different fields of a dataset
# goes without saying, both contentVector and titleVector should be embedded using the same model
# if we are passing multiple fields into fields= ensure they use the same embedding model
results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    select=["title", "content", "category"],
    top=3
)  
  
print_results(results) # almost similar results, but the score is less

Title: Azure DevOps
Score: 0.03333333507180214
Content: Azure DevOps is a suite of services that help you plan, build, and deploy applications. It includes Azure Boards for work item tracking, Azure Repos for source code management, Azure Pipelines for continuous integration and continuous deployment, Azure Test Plans for manual and automated testing, and Azure Artifacts for package management. DevOps supports a wide range of programming languages, frameworks, and platforms, making it easy to integrate with your existing development tools and processes. It also integrates with other Azure services, such as Azure App Service and Azure Functions.
Category: Developer Tools

Title: Azure DevTest Labs
Score: 0.032786883413791656
Content: Azure DevTest Labs is a fully managed service that enables you to create, manage, and share development and test environments in Azure. It provides features like custom templates, cost management, and integration with Azure DevOps. DevTest Labs supports var

#### Multi vector search

In [None]:
# same as above almost but this time we can use different embedding models for differnt fields and still search

query = "tools for software development"  
  

vector_query_1 = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="titleVector")
vector_query_2 = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="contentVector")

results = search_client.search(  
    search_text=None,  
    vector_queries=[vector_query_1, vector_query_2],
    select=["title", "content", "category"],
    top=3
)  
  
print_results(results) # wouldnt change as above as we are using same embedding models for both the fields

Title: Azure DevOps
Score: 0.03333333507180214
Content: Azure DevOps is a suite of services that help you plan, build, and deploy applications. It includes Azure Boards for work item tracking, Azure Repos for source code management, Azure Pipelines for continuous integration and continuous deployment, Azure Test Plans for manual and automated testing, and Azure Artifacts for package management. DevOps supports a wide range of programming languages, frameworks, and platforms, making it easy to integrate with your existing development tools and processes. It also integrates with other Azure services, such as Azure App Service and Azure Functions.
Category: Developer Tools

Title: Azure DevTest Labs
Score: 0.032786883413791656
Content: Azure DevTest Labs is a fully managed service that enables you to create, manage, and share development and test environments in Azure. It provides features like custom templates, cost management, and integration with Azure DevOps. DevTest Labs supports var

#### Weighted multi vector search

In [68]:
# we can give weights while performing multi vector search to boost search results of a particular field more. in this makes sense to boost content field rather than title

query = "tools for software development"  
  

vector_query_1 = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="titleVector", weight=0.5)
vector_query_2 = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="contentVector", weight=2)

results = search_client.search(  
    search_text=None,  
    vector_queries=[vector_query_1, vector_query_2],
    select=["title", "content", "category"],
    top=3
)  
  
print_results(results)

Title: Azure DevOps
Score: 0.0416666679084301
Content: Azure DevOps is a suite of services that help you plan, build, and deploy applications. It includes Azure Boards for work item tracking, Azure Repos for source code management, Azure Pipelines for continuous integration and continuous deployment, Azure Test Plans for manual and automated testing, and Azure Artifacts for package management. DevOps supports a wide range of programming languages, frameworks, and platforms, making it easy to integrate with your existing development tools and processes. It also integrates with other Azure services, such as Azure App Service and Azure Functions.
Category: Developer Tools

Title: Azure DevTest Labs
Score: 0.04098360240459442
Content: Azure DevTest Labs is a fully managed service that enables you to create, manage, and share development and test environments in Azure. It provides features like custom templates, cost management, and integration with Azure DevOps. DevTest Labs supports vario

#### Filtering along with Search

In [None]:
from azure.search.documents.models import VectorFilterMode

# Pure Vector Search
query = "tools for software development"  
  
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="contentVector")

results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    vector_filter_mode=VectorFilterMode.PRE_FILTER, # filters first and then searches ; post filter also exists (have to be careful with possible empty results)
    filter="category eq 'Developer Tools'", # filter query
    select=["title", "content", "category"],
    top=3
)
  
print_results(results) # results are only from category=Developer Tools

Title: Azure DevOps
Score: 0.8289744
Content: Azure DevOps is a suite of services that help you plan, build, and deploy applications. It includes Azure Boards for work item tracking, Azure Repos for source code management, Azure Pipelines for continuous integration and continuous deployment, Azure Test Plans for manual and automated testing, and Azure Artifacts for package management. DevOps supports a wide range of programming languages, frameworks, and platforms, making it easy to integrate with your existing development tools and processes. It also integrates with other Azure services, such as Azure App Service and Azure Functions.
Category: Developer Tools

Title: Azure DevTest Labs
Score: 0.81774753
Content: Azure DevTest Labs is a fully managed service that enables you to create, manage, and share development and test environments in Azure. It provides features like custom templates, cost management, and integration with Azure DevOps. DevTest Labs supports various platforms, such

#### Hybrid Search

In [None]:
# search using both text and vectors

query = "scalable storage solution"  
  
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="contentVector")

results = search_client.search(  
    search_text=query,  # text search across ALL text fields; can modify tp search only `content`
    vector_queries=[vector_query], # vector search
    select=["title", "content", "category"],
    top=3
)  
# performed a text search on content and 
print_results(results)

Title: Azure Storage
Score: 0.03306011110544205
Content: Azure Storage is a scalable, durable, and highly available cloud storage service that supports a variety of data types, including blobs, files, queues, and tables. It provides a massively scalable object store for unstructured data. Storage supports data redundancy and geo-replication, ensuring high durability and availability. It offers a variety of data access and management options, including REST APIs, SDKs, and Azure Portal. You can secure your data using encryption at rest and in transit.
Category: Storage

Title: Azure Blob Storage
Score: 0.03279569745063782
Content: Azure Blob Storage is a scalable, durable, and high-performance object storage service for unstructured data. It provides features like data redundancy, geo-replication, and fine-grained access control. Blob Storage supports various data types, such as images, documents, and videos. You can use Blob Storage to store and manage your data, build data lakes, and 

#### Weighted hybrid search

In [71]:
query = "scalable storage solution"  
  
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="contentVector", weight=0.2) # weight 1/5th w.r.t text query

results = search_client.search(  
    search_text=query,  
    vector_queries=[vector_query],
    select=["title", "content", "category"],
    top=3
)  
  
print_results(results)

Title: Azure Storage
Score: 0.019945355132222176
Content: Azure Storage is a scalable, durable, and highly available cloud storage service that supports a variety of data types, including blobs, files, queues, and tables. It provides a massively scalable object store for unstructured data. Storage supports data redundancy and geo-replication, ensuring high durability and availability. It offers a variety of data access and management options, including REST APIs, SDKs, and Azure Portal. You can secure your data using encryption at rest and in transit.
Category: Storage

Title: Azure Data Lake Storage
Score: 0.01956804469227791
Content: Azure Data Lake Storage is a scalable, secure, and cost-effective storage service for big data analytics. It provides features like unlimited storage, hierarchical namespace, and fine-grained access control. Data Lake Storage supports various data types, such as structured, semi-structured, and unstructured data. You can use Data Lake Storage to store an

#### Semantic Hybrid search

- Semantic hybrid search combines:

- Semantic Search: 
    - Uses AI-powered models to understand the intent of the query and rank results based on meaning rather than keyword matches.
- Vector Search:
    - Uses embeddings (vector representations of text) to find similar documents based on contextual similarity.

    
This approach is ideal for capturing intent, improving relevance, and retrieving contextually similar results even when keywords don’t exactly match.

In [None]:
# not available for free tier ai search

from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType

# Semantic Hybrid Search
query = "what is azure sarch?"

vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields="contentVector", exhaustive=True)

results = search_client.search(  
    search_text=query,  
    vector_queries=[vector_query],
    select=["title", "content", "category"],
    query_type=QueryType.SEMANTIC, # enables semantic search
    semantic_configuration_name='my-semantic-config', # defined above
    query_caption=QueryCaptionType.EXTRACTIVE, # Extracts concise, relevant captions directly from the most relevant sections of content (e.g., snippets answering the query).
    query_answer=QueryAnswerType.EXTRACTIVE,#Provides short, AI-generated answers derived from the most relevant documents.
    top=3
)

print_results(results)

HttpResponseError: (FeatureNotSupportedInService) Semantic search is not enabled for this service.
Parameter name: queryType
Code: FeatureNotSupportedInService
Message: Semantic search is not enabled for this service.
Parameter name: queryType
Exception Details:	(SemanticQueriesNotAvailable) Semantic search is not enabled for this service.
	Code: SemanticQueriesNotAvailable
	Message: Semantic search is not enabled for this service.