In [6]:
import os
import json
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
)
from pathlib import Path

load_dotenv()

search_endpoint = os.environ['AZURE_AI_SEARCH_ENDPOINT']
search_key = os.environ['AZURE_AI_SEARCH_KEY']
search_index = os.environ['AZURE_AI_SEARCH_INDEX']
credential = AzureKeyCredential(search_key)

index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)
fields = [
    SimpleField(name="topic", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="source", type=SearchFieldDataType.String),
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1024, vector_search_configuration="my-vector-config"),
]

vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
        )
    ]
)

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        prioritized_content_fields=[SemanticField(field_name="content")]
    )
)

semantic_settings = SemanticSettings(configurations=[semantic_config])

index = SearchIndex(name=search_index, fields=fields, vector_search=vector_search, semantic_settings=semantic_settings)
result = index_client.create_or_update_index(index)
print(f"Index name: {result.name} is configured")

Index name: bootcathon-meowhack-index is configured


In [7]:
search_client = SearchClient(endpoint=search_endpoint, index_name=search_index, credential=credential)
for file in Path().glob("processed_embedded/*.json"):
    input_data = json.loads(file.read_text())
    result = search_client.upload_documents(input_data)
    print(f"Uploaded embedded chunk: {file.name} to {search_index}")

Uploaded embedded chunk: why-mobil_url_3_chunk_1.json to bootcathon-meowhack-index
Uploaded embedded chunk: mobil-super-moto-products_url_12_chunk_3.json to bootcathon-meowhack-index
Uploaded embedded chunk: mobil-super-moto-products_url_4_chunk_4.json to bootcathon-meowhack-index
Uploaded embedded chunk: mobil-super-moto-products_url_11_chunk_1.json to bootcathon-meowhack-index
Uploaded embedded chunk: mobil-super-moto-products_url_8_chunk_2.json to bootcathon-meowhack-index
Uploaded embedded chunk: mobil-super-moto-products_url_3_chunk_4.json to bootcathon-meowhack-index
Uploaded embedded chunk: mobil-1_url_3_chunk_1.json to bootcathon-meowhack-index
Uploaded embedded chunk: mobil-1_url_4_chunk_1.json to bootcathon-meowhack-index
Uploaded embedded chunk: mobil-super-moto-products_url_1_chunk_7.json to bootcathon-meowhack-index
Uploaded embedded chunk: mobil-super-moto-products_url_9_chunk_3.json to bootcathon-meowhack-index
Uploaded embedded chunk: why-mobil_url_1_chunk_2.json to boo