# AI Foundry - working with project AI Search resource

This notebook demonstrates how to:
1. Connect to the default AI Foundry project AI Search service
2. Create a new index called *customer* if it doesn't already exist (optionally deleting any existing index first when the delete switch is enabled)
3. Define a schema for customer data (first name, last name, date of birth, home city, profile text)
4. Upload sample customer documents to the index
5. Demonstrate retrieving documents with a variety of search methods

## Imports and Setup

First, import the necessary libraries and do some setup

In [None]:
# Standard library imports
import os
import pathlib
import logging

# Third-party imports
from dotenv import load_dotenv

# Azure Identity
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential

# Azure AI Foundry Project SDK
from azure.ai.projects import AIProjectClient
from azure.ai.projects.models import ConnectionType

# Azure AI Search SDK
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    HnswParameters,
    HnswAlgorithmConfiguration,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchProfile,
)

# Module-level logger named after this module.
# NOTE(review): no other cell visible here writes to this logger — confirm it is still needed.
logger = logging.getLogger(__name__)

## Get our AI Foundry project connection string

In [None]:
# Load environment variables from .env files.
# Both the notebook's working directory and its parent directory are checked.
# The working directory is loaded first: load_dotenv() does not override
# variables that are already set, so its values (and any real environment
# variables) take precedence over the parent directory's .env.
current_dir = pathlib.Path().absolute()
root_dir = current_dir.parent
for env_dir in (current_dir, root_dir):
    # A missing .env file is not an error; load_dotenv() simply loads nothing
    load_dotenv(dotenv_path=env_dir / ".env")

# Get the project connection string from environment variables and fail fast
# with a clear message when it is missing or empty
project_connection_string = os.getenv("AZURE_AI_FOUNDRY_PROJECT_CONNECTION_STRING")
if not project_connection_string:
    raise ValueError("Please set AZURE_AI_FOUNDRY_PROJECT_CONNECTION_STRING in your .env file")

## Connect to the AI Foundry Project
Now get credentials to authenticate, then get a handle to the AI Foundry project.

In [None]:
# Create the Azure credential object that is handed to the project client.
try:
    credential = DefaultAzureCredential()
    print("✓ Successfully initialized DefaultAzureCredential")
except Exception as e:
    print(f"× Error initializing credentials: {str(e)}")
    # Re-raise so execution stops here: without this, `credential` would be
    # left undefined and the next cell would fail with a confusing NameError.
    raise

In [None]:
# Create the AI Foundry project client from the connection string.
try:
    client = AIProjectClient.from_connection_string(
        # Reuse the value that was already read and validated earlier instead
        # of reading the environment variable a second time
        conn_str=project_connection_string,
        credential=credential
    )
    print("✓ Successfully initialized AIProjectClient")
except Exception as e:
    print(f"× Error initializing client: {str(e)}")
    # Re-raise so execution stops here: every later cell needs `client`, and
    # continuing would only surface as a NameError further down.
    raise

## Get handle to embedding model client and test

In [None]:
# Obtain an embeddings client through the project's inference operations
embedding_client = client.inference.get_embeddings_client()
print("✅ Created embeddings client.")

# Smoke test: embed one short sentence and take the first returned item
test_response = embedding_client.embed(model="text-embedding-3-large", input="this is a test")
first_item = test_response.data[0]
vector = first_item.embedding

# The vector length is reused later as the dimension of the index's vector field
embedding_dimensions = len(vector)

print(f"Vector dimension: {embedding_dimensions}")
print(f"First 10 elements: {vector[:10]}")

## Get handle to the default Foundry project AI Search connection
This will be named AzureAISearch in your AI Foundry project connected resources.

In [None]:
# Ask the project for its default connection of type Azure AI Search.
# Credentials are requested too, because the search clients are built from
# this connection's key.
search_connection = client.connections.get_default(
    include_credentials=True,
    connection_type=ConnectionType.AZURE_AI_SEARCH,
)

service_name = search_connection.name
service_endpoint = search_connection.endpoint_url
print(f"Connected to AI Search service: {service_name}")
print(f"Endpoint: {service_endpoint}")

## Create AI Search clients for managing indexing and searching

In [None]:
# Define the index name
index_name = "customer"

# Create a client to manage search indexes
index_client = SearchIndexClient(
    endpoint=search_connection.endpoint_url,
    credential=AzureKeyCredential(key=search_connection.key)
)

# Create a client to run search queries
search_client = SearchClient(
    index_name=index_name,
    endpoint=search_connection.endpoint_url,
    credential=AzureKeyCredential(key=search_connection.key)
)

## Delete index if enabled
Helps with iterative testing

In [None]:
# Switch: set to False to keep an existing index between runs
delete_index_if_exists = True

if delete_index_if_exists:
    try:
        index_client.delete_index(index_name)
    except Exception as e:
        # Best effort: report the problem and carry on
        print(f"Index '{index_name}' does not exist or could not be deleted: {str(e)}")
    else:
        print(f"Index '{index_name}' deleted successfully.")

## Check if Index Exists and Create if Needed

Check if the *customer* index already exists, and create it if not. The index is built from the schema and configuration defined below.

In [None]:
# Collect the names of the indexes currently on the service and look for ours
existing_index_names = [existing.name for existing in index_client.list_indexes()]
index_exists = index_name in existing_index_names

if not index_exists:
    print(f"Index '{index_name}' does not exist. Creating...")

    # Vector field holding the profile embedding; its dimension comes from
    # the embedding test performed earlier
    profile_vector_field = SearchField(
        name="profile_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        hidden=False,
        searchable=True,
        filterable=False,
        sortable=False,
        facetable=False,
        vector_search_dimensions=embedding_dimensions,
        vector_search_profile_name="embedding_config",
    )

    # Index schema: key, name fields, date of birth, city, profile text, vector
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True, sortable=True),
        SearchableField(name="first_name", type=SearchFieldDataType.String),
        SearchableField(name="last_name", type=SearchFieldDataType.String),
        SimpleField(name="date_of_birth", type=SearchFieldDataType.DateTimeOffset, filterable=True, sortable=True),
        SearchableField(name="home_city", type=SearchFieldDataType.String, filterable=True, sortable=True),
        SearchableField(name="profile_text", type=SearchFieldDataType.String),
        profile_vector_field,
    ]

    # Integrated vectorization is not used, so no vectorizers are registered
    # and the vector profile gets no vectorizer name
    vectorizers = []
    use_int_vectorization = False
    if use_int_vectorization:
        vectorizer_name = f"{index_name}-vectorizer"
    else:
        vectorizer_name = None

    # Semantic configuration "default": profile_text is the only content field
    semantic_search = SemanticSearch(
        configurations=[
            SemanticConfiguration(
                name="default",
                prioritized_fields=SemanticPrioritizedFields(
                    title_field=None,
                    content_fields=[SemanticField(field_name="profile_text")],
                ),
            )
        ]
    )

    # Vector search: one HNSW algorithm (cosine metric) and one profile using it
    hnsw_algorithm = HnswAlgorithmConfiguration(
        name="hnsw_config",
        parameters=HnswParameters(metric="cosine"),
    )
    embedding_profile = VectorSearchProfile(
        name="embedding_config",
        algorithm_configuration_name="hnsw_config",
        vectorizer_name=vectorizer_name,
    )
    vector_search = VectorSearch(
        algorithms=[hnsw_algorithm],
        profiles=[embedding_profile],
        vectorizers=vectorizers,
    )

    # Assemble the index definition and create it on the service
    index = SearchIndex(
        name=index_name,
        fields=fields,
        semantic_search=semantic_search,
        vector_search=vector_search,
    )
    result = index_client.create_index(index)
    print(f"Index '{index_name}' created successfully.")
else:
    print(f"Index '{index_name}' already exists.")

## Create Sample Customer Data

Let's create a few sample customer documents to upload to the AI Search index

In [None]:
# Create sample customer data
customers_with_profiles = [
    {
        "id": "1",
        "first_name": "John",
        "last_name": "Smith",
        "date_of_birth": "1985-05-15",
        "home_city": "Seattle",
        "profile_text": "Passionate about hiking in the Pacific Northwest and photography. Enjoys weekend trips to Vancouver and Portland. Dreams of exploring the national parks of Japan."
    },
    {
        "id": "2",
        "first_name": "Jon",
        "last_name": "Smith",
        "date_of_birth": "1982-09-30",
        "home_city": "Seattle",
        "profile_text": "Adores kitesurfing. Enjoys spending weekends with his children. Dreams about kitesurfing in South Africa."
    },
    {
        "id": "3",
        "first_name": "Jane",
        "last_name": "Doe",
        "date_of_birth": "1990-08-22",
        "home_city": "San Francisco",
        "profile_text": "Avid rock climber and yoga enthusiast. Loves exploring California wine country. Plans annual trips to Mediterranean beaches and has visited Italy three times."
    },
    {
        "id": "4",
        "first_name": "Jonathan",
        "last_name": "Johnson",
        "date_of_birth": "1978-11-30",
        "home_city": "New York",
        "profile_text": "Theater aficionado and jazz music collector. Frequently travels to London and Paris for cultural events. Enjoys cooking classes and food tours when traveling."
    },
    {
        "id": "5",
        "first_name": "Emily",
        "last_name": "Williams",
        "date_of_birth": "1992-03-10",
        "home_city": "Chicago",
        "profile_text": "Dedicated cyclist and urban gardener. Loves exploring Great Lakes beaches in summer. Dreams of cycling through the countryside of France and visiting Nordic countries."
    },
    {
        "id": "6",
        "first_name": "David",
        "last_name": "Brown",
        "date_of_birth": "1982-07-05",
        "home_city": "Seattle",
        "profile_text": "Software developer who enjoys kayaking and board games. Frequently hikes in Olympic National Park. Plans to visit tech hubs in Asia and explore mountain trails in New Zealand."
    }
]

## Generate Embeddings for Customer Profile Text

In [None]:
# Generate embeddings for each customer's profile text
for customer in customers_with_profiles:
    profile_text = customer['profile_text']
    embedding_response = embedding_client.embed(model="text-embedding-3-large", input=profile_text)
    customer['profile_vector'] = embedding_response.data[0].embedding
    
print("Added vector embeddings to customer profiles.")
print(f"Vector dimension: {len(customers_with_profiles[0]['profile_vector'])}")

## Upload Documents to the Index

In [None]:
# Upload documents to the index
result = search_client.upload_documents(documents=customers_with_profiles)
print(f"Uploaded {len(result)} documents to the index.")

# Check the results
for i, success in enumerate(result):
    print(f"Document {i+1} {'succeeded' if success else 'failed'}")

## Example Exact Match Search
Exact match with search fields specification.

You can experiment by changing name, adding last name in the *search_text* string

In [None]:
search_text = "John"

# Only the name fields are searched; only the listed fields are returned
name_fields = ["first_name", "last_name"]
returned_fields = ["id", "first_name", "last_name", "home_city", "date_of_birth"]

results = search_client.search(
    search_text=search_text,
    search_mode="any",  # "any" = OR semantics between terms, "all" = AND semantics
    search_fields=name_fields,
    select=returned_fields,
)

print(f"Exact search results for '{search_text}':")
for hit in results:
    full_name = f"{hit['first_name']} {hit['last_name']}"
    print(f"ID: {hit['id']}")
    print(f"Name: {full_name}")
    print(f"City: {hit['home_city']}")
    print(f"DOB: {hit['date_of_birth']}")
    print("---")

## Another Example Exact Match
This example doesn't specify *search_fields*, so searches across all searchable fields.
It also doesn't specify a search mode, so defaults to *any*, and returns all fields.

In [None]:
# Search for customers by name
search_text = "John"
results = search_client.search(search_text=search_text)

print(f"\nCustomers with name containing '{search_text}':")
for result in results:
    print(f"ID: {result['id']}, Name: {result['first_name']} {result['last_name']}, City: {result['home_city']}")
    print(f"Profile: {result['profile_text']}")
    print("---")

## Filter Example

In [None]:
# The free-text query and the city used in the filter are two different
# inputs, so each gets its own clearly named variable.
search_text = "John"
home_city = "Seattle"

# In an OData string literal a single quote is escaped by doubling it; this
# keeps the filter valid for city names that contain an apostrophe.
escaped_city = home_city.replace("'", "''")

results = search_client.search(
    search_text=search_text,
    filter=f"home_city eq '{escaped_city}'"
)

# The header names both criteria, since the results are restricted by the
# text query as well as by the city filter.
print(f"\nCustomers matching '{search_text}' in {home_city}:")
for result in results:
    print(f"ID: {result['id']}, Name: {result['first_name']} {result['last_name']}, DOB: {result['date_of_birth']}")
    print(f"Profile: {result['profile_text']}")
    print("---")

## Filter Example with Comparison Operator

In [None]:
# Search for customers born before 1990
year_threshold = "1990-01-01T00:00:00Z"
results = search_client.search(
    search_text="*", 
    filter=f"date_of_birth lt {year_threshold}"
)

print(f"\nCustomers born before {year_threshold}:")
for result in results:
    print(f"ID: {result['id']}, Name: {result['first_name']} {result['last_name']}, DOB: {result['date_of_birth']}")
    print(f"Profile: {result['profile_text']}")
    print("---")

## Fuzzy Search Example

In [None]:
# Fuzzy search with the full Lucene query syntax.
# The fuzzy operator is written as the whole term followed by "~" and an
# optional edit distance, so "John~1" asks for first names within one edit of
# "John". The tilde belongs at the END of the term, not inside it.
search_text = "John~1"
results = search_client.search(
    search_text=search_text,
    search_fields=["first_name"],  # fuzzy matching is applied to first names only
    query_type="full"  # required for Lucene operators such as "~"
)

# The header names the field that is actually searched (first_name)
print(f"\nCustomers with first name like '{search_text}':")
for result in results:
    print(f"ID: {result['id']}, Name: {result['first_name']} {result['last_name']}, City: {result['home_city']}")
    print(f"Profile: {result['profile_text']}")
    search_score = result['@search.score']
    print(f"Score: {search_score:.2f}")
    print("---")

## Semantic (or similarity) Search Example
**Note:** The semantic ranker must be enabled for your search service. Check [here](https://learn.microsoft.com/en-us/azure/search/search-region-support) for region availability, and confirm that the semantic ranker is available for the SKU in your selected region.

Demonstrates similarity search based on the profile text.

In [None]:
# Semantic query against the "default" semantic configuration of the index
query_text = "hiking in national parks"
semantic_query = {
    "search_text": query_text,
    # A list of field names, matching how `select` is passed in the other queries
    "select": ["id", "first_name", "last_name", "home_city", "profile_text"],
    "top": 5,
    "query_type": "semantic",
    "semantic_configuration_name": "default"
}

results = search_client.search(**semantic_query)
print(f"\nSimilarity search for '{query_text}':")
for result in results:
    # '@search.score' is the initial retrieval score; the semantic ranker's own
    # score is exposed separately as '@search.reranker_score'. Both are shown
    # so the printed order can be related to the semantic ranking.
    # .get() is used in case the reranker score is absent from a result.
    reranker_score = result.get('@search.reranker_score')
    print(
        f"ID: {result['id']}, Name: {result['first_name']} {result['last_name']}, "
        f"Score: {result['@search.score']}, Reranker score: {reranker_score}"
    )
    print(f"Profile: {result['profile_text']}")
    print("---")


## Vector Search Example

Perform a vector search using the embeddings.

In [None]:
# Vector search based on query embedding
query_text = "Cultural events and love of food"
query_embedding = embedding_client.embed(model="text-embedding-3-large", input=query_text).data[0].embedding

# Perform vector search
vector_results = search_client.search(
    search_text="",  # No text search
    vector_queries=[{
        "vector": query_embedding,
        "fields": "profile_vector",
        "k": 5, # Return top 5 matches
        "kind": "vector"  
    }],
    select=["first_name", "last_name", "profile_text"]
)

for result in vector_results:
    print(f"Name: {result['first_name']} {result['last_name']}, Score: {result['@search.score']}")
    print(f"Profile: {result['profile_text']}")
    print("---")

## Hybrid Search Example

A hybrid query combines text search and vector search, where the *search_text* parameter takes a query string and *vector_queries* parameter takes the vectorized query. 

The search engine runs full text and vector queries in parallel. The union of all matches is evaluated for relevance using Reciprocal Rank Fusion (RRF) and a single result set is returned in the response.

Note that the ranking of results differs from the previous example, which was a pure vector-only search (*search_text* = "").

In [None]:
# Hybrid query: the same text is sent both as a full-text query and, in
# embedded form, as a vector query
query_text = "Cultural events and love of food"
query_response = embedding_client.embed(model="text-embedding-3-large", input=query_text)
query_embedding = query_response.data[0].embedding

# Vector half of the query: 5 nearest matches on the profile_vector field
profile_vector_query = {
    "vector": query_embedding,
    "fields": "profile_vector",
    "k": 5,
    "kind": "vector",
}

# Text half of the query: search_text carries the raw query string
vector_results = search_client.search(
    search_text=query_text,
    vector_queries=[profile_vector_query],
    select=["first_name", "last_name", "profile_text"],
)

for hit in vector_results:
    full_name = f"{hit['first_name']} {hit['last_name']}"
    print(f"Name: {full_name}, Score: {hit['@search.score']}")
    print(f"Profile: {hit['profile_text']}")
    print("---")