Pypdf Install

In [None]:
%pip install pypdf

Index Creation

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    ComplexField,
    SearchFieldDataType,
    SearchableField,
    SimpleField,
    SearchIndex,
    VectorSearch,
    VectorSearchProfile,
    HnswParameters,
    HnswAlgorithmConfiguration,
    VectorSearchAlgorithmMetric,
    SearchField
)

import os

# ----------------------------------------------------------------------------------
# Azure AI Search service information
# ----------------------------------------------------------------------------------
# Secrets and endpoints are read from environment variables so that real keys never
# have to be typed into (and saved with) the notebook. The fallbacks are the original
# placeholder values.
search_service_name = os.environ.get("AZURE_SEARCH_SERVICE_NAME", "")
search_api_key = os.environ.get("AZURE_SEARCH_API_KEY", "")
index_name = "document-chunks-index-2"

# Azure OpenAI resource referenced by the index's vectorizer definition.
aoai_resource_uri = os.environ.get(
    "AZURE_OPENAI_ENDPOINT", "https://<your-openai-resource-name>.openai.azure.com/"
)
aoai_api_key = os.environ.get("AZURE_OPENAI_API_KEY", "your-openai-api-key")
aoai_deployment_id = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "embedding-ada")

# Names that must match between the field, the profile, the algorithm and the vectorizer.
EMBEDDING_DIMENSIONS = 1536
HNSW_ALGORITHM_NAME = "myHnsw"
VECTOR_PROFILE_NAME = "myHnswProfileSQ"
VECTORIZER_NAME = "default"

# Fail early with a clear message instead of a connection error against
# "https://.search.windows.net/" when the placeholders were never filled in.
if not search_service_name or not search_api_key:
    raise ValueError(
        "Set AZURE_SEARCH_SERVICE_NAME and AZURE_SEARCH_API_KEY before creating the index."
    )

# Create Search Index Client
search_index_client = SearchIndexClient(
    endpoint=f"https://{search_service_name}.search.windows.net/",
    credential=AzureKeyCredential(search_api_key)
)

# Define index schema with required fields.
# NOTE: the previous version also passed `retrievable=True` and `SearchableField=True`
# to the helpers below. Those are not parameters of SimpleField/SearchableField and
# were silently discarded, so dropping them leaves the schema unchanged:
#   * SimpleField fields are not full-text searchable,
#   * fields are retrievable unless `hidden=True` is passed.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SimpleField(name="document_id", type=SearchFieldDataType.String, filterable=True, sortable=True),
    SearchableField(name="document_name", type=SearchFieldDataType.String, filterable=True, sortable=True),
    SimpleField(name="chunk_number", type=SearchFieldDataType.Int32, filterable=True, sortable=True),
    SimpleField(name="chunk_pagenumber", type=SearchFieldDataType.String, filterable=True, sortable=True),
    SimpleField(name="chunk_startoffset", type=SearchFieldDataType.Int32, filterable=True, sortable=True),
    SimpleField(name="chunk_endoffset", type=SearchFieldDataType.Int32, filterable=True, sortable=True),
    SearchableField(name="chunk_text", type=SearchFieldDataType.String, filterable=True, sortable=True),

    # Vector field for embeddings
    SearchField(
        name="chunk_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        filterable=False,
        sortable=False,
        facetable=False,
        vector_search_dimensions=EMBEDDING_DIMENSIONS,
        vector_search_profile_name=VECTOR_PROFILE_NAME
    )
]

# Vector search configuration: one HNSW algorithm (cosine metric), one profile that
# binds the algorithm to the vectorizer, and one Azure OpenAI vectorizer.
vector_search_config = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name=HNSW_ALGORITHM_NAME,
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
    ],
    profiles=[
        VectorSearchProfile(
            name=VECTOR_PROFILE_NAME,
            algorithm_configuration_name=HNSW_ALGORITHM_NAME,
            vectorizer_name=VECTORIZER_NAME,
        ),
    ],
    # The vectorizer is given as a plain dict with REST-style (camelCase) keys,
    # exactly as before; only the values now come from the settings above.
    vectorizers=[{
        "name": VECTORIZER_NAME,
        "kind": "azureOpenAI",
        "azureOpenAIParameters": {
          "resourceUri": aoai_resource_uri,
          "deploymentId": aoai_deployment_id,
          "apiKey": aoai_api_key,
          "modelName": "text-embedding-ada-002"
        }
    }]
)

# Create the index
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search_config
)

# Create or update the index (raises on failure, so the message below is only
# printed when the service accepted the definition).
search_index_client.create_or_update_index(index)

print("✅ Azure AI Search index with document chunks and embeddings created successfully!")

✅ Azure AI Search index with document chunks and embeddings created successfully!


Index Load

In [None]:
import os, uuid, math
from pathlib import Path


from pypdf import PdfReader
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from openai import AzureOpenAI          # pip install openai>=1.14.0

# ----------------------------------------------------------------------------------
# 1)  Configuration (put real values in environment variables – never hard‑code keys)
# ----------------------------------------------------------------------------------
# Endpoints, keys and the deployment name are taken from environment variables;
# the fallbacks are placeholders only and must not be replaced with real secrets.
SEARCH_ENDPOINT = os.environ.get("AZURE_SEARCH_ENDPOINT", "your-search-service-endpoint")
SEARCH_KEY = os.environ.get("AZURE_SEARCH_API_KEY", "your-search-service-key")
SEARCH_INDEX_NAME = "document-chunks-index-2"

# e.g. AZURE_OPENAI_ENDPOINT=https://<your-openai-resource-name>.openai.azure.com/
AOAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "your-openai-endpoint")
AOAI_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "your-openai-key")
# Passed as `model=` to the embeddings call in embed_text.
AOAI_DEPLOYMENT = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-ada-002")

# ----------------------------------------------------------------------------------
# 2)  Helpers
# ----------------------------------------------------------------------------------
# Holds at most one AzureOpenAI client, keyed by the (endpoint, key) pair it was
# built from, so editing the AOAI_* settings transparently triggers a rebuild.
_aoai_client_cache: dict = {}


def _get_aoai_client():
    """Return an AzureOpenAI client for the current AOAI_* settings, reusing it across calls."""
    config = (AOAI_ENDPOINT, AOAI_KEY)
    client = _aoai_client_cache.get(config)
    if client is None:
        _aoai_client_cache.clear()  # drop a client built from outdated settings
        client = AzureOpenAI(
            api_key=AOAI_KEY,
            api_version="2024-02-15-preview",
            azure_endpoint=AOAI_ENDPOINT
        )
        _aoai_client_cache[config] = client
    return client


def embed_text(text: str) -> list[float]:
    """Return the embedding vector of ``text`` from the Azure OpenAI deployment.

    The client is created once and reused, instead of being rebuilt for every
    chunk, so repeated calls do not construct a new client each time.

    Args:
        text: The text to embed; it is sent as a single-element input batch.

    Returns:
        The embedding of the single input as a list of floats. The index
        schema in this notebook expects 1536 dimensions.

    Raises:
        Whatever the OpenAI client raises for authentication, network or
        request errors; nothing is caught here.
    """
    resp = _get_aoai_client().embeddings.create(
        input=[text],
        model=AOAI_DEPLOYMENT
    )
    return resp.data[0].embedding

def chunk_page_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> list[str]:
    """Split ``text`` into fixed-size character windows that overlap.

    Chunk ``i`` starts at character ``i * (chunk_size - overlap)`` and is at
    most ``chunk_size`` characters long. Splitting stops as soon as a chunk
    reaches the end of the text, so no trailing chunk is produced that would
    be entirely contained in the previous one.

    Args:
        text: The text to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive stride would never advance the window.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    stride = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, stride):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            # This window already covers the tail; a further window would only
            # repeat characters that are part of this chunk.
            break
    return chunks

# ----------------------------------------------------------------------------------
# 3)  Extract, chunk, embed
# ----------------------------------------------------------------------------------
def process_pdf(pdf_path: str, chunk_size: int = 1000, overlap: int = 100):
    """Extract, chunk and embed a PDF into records shaped for the search index.

    Each page's text is split with ``chunk_page_text`` and every chunk is
    embedded with ``embed_text``. One record is produced per chunk.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Maximum characters per chunk, forwarded to ``chunk_page_text``.
        overlap: Characters shared by consecutive chunks, forwarded to
            ``chunk_page_text``.

    Returns:
        A list of dicts with the keys ``id``, ``document_id``,
        ``document_name``, ``chunk_number``, ``chunk_pagenumber``,
        ``chunk_startoffset``, ``chunk_endoffset``, ``chunk_text`` and
        ``chunk_vector``.
        ``chunk_number`` runs from 1 across the whole document; the offsets
        are character positions within the text of the chunk's own page.
    """
    reader = PdfReader(pdf_path)
    document_id   = str(uuid.uuid4())   # shared by every chunk of this file
    document_name = Path(pdf_path).name

    # Consecutive chunks start this many characters apart, so chunk i of a page
    # begins at i * stride (not at i * len(chunk), which ignores the overlap and
    # is wrong for a shorter final chunk).
    stride = chunk_size - overlap

    records = []
    chunk_number = 0

    for page_no, page in enumerate(reader.pages, start=1):
        # extract_text() may yield nothing for a page; treat that as empty text.
        page_text = page.extract_text() or ""
        chunks = chunk_page_text(page_text, chunk_size, overlap)

        for i, chunk in enumerate(chunks):
            chunk_number += 1
            vector = embed_text(chunk)
            start_offset = i * stride

            # Build the indexable document
            record = {
                "id": str(uuid.uuid4()),
                "document_id": document_id,
                "document_name": document_name,
                "chunk_number": chunk_number,
                "chunk_pagenumber": str(page_no),  # the index declares this field as a string
                "chunk_startoffset": start_offset,
                "chunk_endoffset": start_offset + len(chunk),
                "chunk_text": chunk,
                "chunk_vector": vector
            }
            records.append(record)

    return records

# ----------------------------------------------------------------------------------
# 4)  Upload to Azure AI Search
# ----------------------------------------------------------------------------------
def upload_chunks(records: list[dict], batch_size: int = 1000):
    """Upload chunk records to the Azure AI Search index in batches.

    Args:
        records: Documents to upload, e.g. the output of ``process_pdf``.
            An empty list results in no upload requests.
        batch_size: Number of documents sent per ``upload_documents`` call.
            Defaults to 1000, the value used previously; lower it when
            documents are large.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        RuntimeError: If the service reports any document of a batch as not
            succeeded. Batches uploaded before the failing one stay in the index.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # The context manager closes the client's transport even when a batch fails.
    with SearchClient(
        endpoint=SEARCH_ENDPOINT,
        index_name=SEARCH_INDEX_NAME,
        credential=AzureKeyCredential(SEARCH_KEY)
    ) as search_client:
        for batch_start in range(0, len(records), batch_size):
            batch = records[batch_start: batch_start + batch_size]
            results = search_client.upload_documents(batch)
            failed_keys = [r.key for r in results if not r.succeeded]
            if failed_keys:
                raise RuntimeError(f"Upload failed for: {failed_keys}")

# ----------------------------------------------------------------------------------
# 5)  Run
# ----------------------------------------------------------------------------------
if __name__ == "__main__":
    # Sample input: a PDF placed next to this script (relative path).
    sample_pdf = r"Faaliyet-Raporu-2023.pdf"

    # Step 1: extract, chunk and embed. Step 2: push the records to the index.
    docs = process_pdf(sample_pdf)
    upload_chunks(docs)

    # Only reached when every batch was accepted (upload_chunks raises otherwise).
    print(f"Uploaded {len(docs)} chunks from {sample_pdf}")

Uploaded 405 chunks from C:\Users\emrahmete\Downloads\TCMB-Faaliyet-Raporu-2023.pdf
