### Document Intelligence - Prebuilt Layout

In [None]:
import os
import json
import time
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat, AnalyzeResult

# Azure Document Intelligence connection settings (replace with real values)
endpoint = "document-intelligence-endpoint"
key = "api-key"

# Create the client
document_analysis_client = DocumentAnalysisClient(endpoint, AzureKeyCredential(key))

# PDF file to process
pdf_path = "sample-tables.pdf"

# Read the file and send it to the API
with open(pdf_path, "rb") as f:
    # Run the "prebuilt-layout" model; pages="1" restricts the analysis to page 1
    poller = document_analysis_client.begin_analyze_document("prebuilt-layout", document=f, pages="1")
    # Fetch the analysis result from the poller while the file is still open
    result = poller.result()

# Convert the analysis result into JSON-serializable per-page records
def _tables_on_page(all_tables, page_number):
    """Return the serialized tables that have a bounding region on ``page_number``.

    ``result.tables`` lists every table in the document, so attaching it
    unfiltered to each page would repeat all tables on all pages. A table with
    no bounding regions cannot be assigned to a page and is kept on every page
    so that no data is dropped.
    """
    page_tables = []
    for table in all_tables or []:
        regions = table.bounding_regions or []
        if regions and all(region.page_number != page_number for region in regions):
            continue
        page_tables.append({
            "row_count": table.row_count,
            "column_count": table.column_count,
            "cells": [
                {"row_index": cell.row_index, "column_index": cell.column_index, "text": cell.content}
                for cell in table.cells
            ],
        })
    return page_tables


output_data = []
for page in result.pages:
    output_data.append({
        "page_number": page.page_number,
        # A page without recognized text may expose no lines; treat it as empty
        "content": " ".join(line.content for line in (page.lines or [])),
        "tables": _tables_on_page(result.tables, page.page_number),
    })

# Save the JSON output to a file
with open("processed_output.json", "w", encoding="utf-8") as json_file:
    json.dump(output_data, json_file, ensure_ascii=False, indent=4)

print("Document processing completed!")

### Index Creation in Azure AI Search

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    ComplexField,
    SearchFieldDataType,
    SearchableField,
    SimpleField,
    SearchIndex,
    VectorSearch,
    VectorSearchProfile,
    HnswParameters,
    HnswAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    SearchField
)

# Azure AI Search connection settings (replace with real values)
search_service_name = "search-service-name"
search_api_key = "api-key"
index_name = "document-index-with-embeddings"

# Create the Search Index Client used to manage index definitions
search_index_client = SearchIndexClient(
    endpoint=f"https://{search_service_name}.search.windows.net/",
    credential=AzureKeyCredential(search_api_key)
)

# Index schema: one search document per analyzed page
fields = [
    # Document key
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    # NOTE(review): SimpleField may silently ignore `retrievable`; confirm against the SDK
    SimpleField(name="page_number", type=SearchFieldDataType.Int32, retrievable=True),
    # Full-text searchable page text
    SearchableField(name="content", type=SearchFieldDataType.String),

    # Use Collection(Edm.Single) for the embedding field
    SearchField(
        name="content_embedding", 
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),  # correct type for a vector field
        searchable=True,
        filterable=False,
        sortable=False,
        facetable=False,
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfileSQ"
    ),

    # Collection of table cells: position (row/column) plus searchable cell text
    ComplexField(name="tables", collection=True, fields=[
        SimpleField(name="row_index", type=SearchFieldDataType.Int32),
        SimpleField(name="column_index", type=SearchFieldDataType.Int32),
        SearchableField(name="text", type=SearchFieldDataType.String)
    ])
]

# Vector search configuration
vector_search_config = VectorSearch(
    # HNSW algorithm configuration using the cosine metric
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
    ],
    # Profile referenced by the "content_embedding" field; ties the algorithm to the vectorizer
    profiles=[
        VectorSearchProfile(
            name="myHnswProfileSQ",
            algorithm_configuration_name="myHnsw",
            vectorizer_name="default",
        ),
    ],
    # Vectorizer given as a raw dict rather than an SDK model object
    # (placeholder values must be replaced with the real Azure OpenAI settings)
    vectorizers=[{
        "name": "default",
        "kind": "azureOpenAI",
        "azureOpenAIParameters": {
          "resourceUri": "endpoint url",
          "deploymentId": "embedding model deployment id",
          "apiKey": "api key",
          "modelName": "embedding model name",
        }
    }]
)
# Build the index definition from the schema and vector search configuration
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search_config
)

# Create the index, or update it if it already exists
search_index_client.create_or_update_index(index)

print("✅ Azure AI Search index with embeddings created successfully!")


### Embedding Generation

In [None]:
from openai import AzureOpenAI
import json

# Azure OpenAI connection settings (replace with real values)
AZURE_OPENAI_ENDPOINT = "openai-endpoint"
AZURE_OPENAI_API_KEY = "api-key"
AZURE_OPENAI_DEPLOYMENT_NAME = "embedding-model-deployment-name"

# Create the Azure OpenAI client used by get_embedding
client = AzureOpenAI(
    azure_endpoint = AZURE_OPENAI_ENDPOINT, 
    api_key=AZURE_OPENAI_API_KEY,  
    api_version="2024-05-01-preview"
)
def get_embedding(text):
    """Return the embedding vector for ``text``.

    Issues one embeddings request through the module-level ``client`` against
    the deployment named by ``AZURE_OPENAI_DEPLOYMENT_NAME`` and returns the
    ``embedding`` of the first item in the response data.
    """
    embeddings_response = client.embeddings.create(
        input=text,
        model=AZURE_OPENAI_DEPLOYMENT_NAME,
    )
    first_item = embeddings_response.data[0]
    return first_item.embedding

# Load the per-page records written by the document analysis step
with open("processed_output.json", "r", encoding="utf-8") as json_file:
    documents = json.load(json_file)

# Compute an embedding of each record's "content" text (one API call per record)
for doc in documents:
    doc["content_embedding"] = get_embedding(doc["content"])

# Save the records, now including their embeddings, to a new JSON file
with open("processed_with_embeddings.json", "w", encoding="utf-8") as json_file:
    json.dump(documents, json_file, ensure_ascii=False, indent=4)

print("Embeddings successfully created using Azure OpenAI SDK!")


### Load Embeddings to the Index

In [None]:
from azure.search.documents import SearchClient

# Create the Azure AI Search client for the index (used for both upload and querying)
search_client = SearchClient(
    endpoint=f"https://{search_service_name}.search.windows.net/",
    index_name=index_name,
    credential=AzureKeyCredential(search_api_key)
)

# Load the page records (with embeddings) that will be indexed
with open("processed_with_embeddings.json", "r", encoding="utf-8") as json_file:
    documents = json.load(json_file)

# Shape each record to match the index schema. The cells of all tables on a
# page are flattened into the single "tables" collection of that document.
documents_to_upload = []
for idx, doc in enumerate(documents):
    documents_to_upload.append({
        "id": str(idx + 1),
        "page_number": doc["page_number"],
        "content": doc["content"],
        "content_embedding": doc["content_embedding"],
        "tables": [
            {
                "row_index": cell["row_index"],
                "column_index": cell["column_index"],
                "text": cell["text"]
            }
            for table in doc["tables"]
            for cell in table["cells"]
        ]
    })

# Upload to Azure AI Search. The service reports success per document, so the
# returned results must be inspected before claiming the upload succeeded.
upload_results = search_client.upload_documents(documents=documents_to_upload)
failed_results = [item for item in upload_results if not item.succeeded]
if failed_results:
    for item in failed_results:
        print(f"Failed to index document {item.key}: {item.error_message}")
else:
    print("Documents with embeddings uploaded to Azure AI Search successfully!")


### RAG Implementation - QnA to the Docs

In [None]:
from openai import AzureOpenAI
from requests.exceptions import ConnectionError
import time

def query_search_and_generate(prompt):
    """Answer ``prompt`` with a chat model, using Azure AI Search results as context.

    Queries the module-level ``search_client`` with ``prompt`` both as keyword
    search text (restricted to ``tables/text``) and as a text vector query on
    ``content_embedding``, joins the retrieved documents into one context
    string, and sends the question plus context to an Azure OpenAI chat
    deployment.

    Args:
        prompt: The user's question.

    Returns:
        The message content of the first chat completion choice.

    Raises:
        openai.APIConnectionError: If the chat request still fails with a
            connection error on the final retry attempt.
    """
    # The openai client reports network failures as APIConnectionError, not as
    # requests' ConnectionError, so it has to be caught for the retry to work.
    from openai import APIConnectionError

    # Query Azure AI Search
    results = search_client.search(
        search_text=prompt,
        search_fields=["tables/text"],
        include_total_count=True,
        vector_queries=[
            {
                "kind": "text",
                "text": prompt,
                "fields": "content_embedding"
            }
        ],
        select=["id", "page_number", "content", "tables"]
    )

    retrieved_content = " ".join([
        f"Page {doc.get('page_number', '')}: {doc['content']} {doc.get('tables', '')}"
        for doc in results
    ])

    print(f"Retrieved content: {retrieved_content}")

    # Separate client for the chat deployment (replace the placeholder values)
    chat_client = AzureOpenAI(
        azure_endpoint="openai-endpoint",
        api_key="api-key",
        api_version="2024-05-01-preview"
    )
    deployment_name = "model deployment name"

    # Retry logic for transient network issues
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = chat_client.chat.completions.create(
                model=deployment_name,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant. If the data where I found the answer comes from a table, also tell me which page the table is on, and which row and column that information is in. You should return row and column values after increment one. Because indexes start from 0 in the document."},
                    {'role': 'user', 'content': f"Question: {prompt}\n\nContext: {retrieved_content}"}
                ]
            )
        except (APIConnectionError, ConnectionError):
            if attempt == max_retries - 1:
                # Out of attempts: re-raise with the original traceback
                raise
            time.sleep(2 ** attempt)  # Exponential backoff
        else:
            return response.choices[0].message.content

# Test query: a question whose answer needs two values from the indexed tables
question = "Multiply with the values that contingency value of policy functions for 2010/2011 and Other value of remunerated functions for 2009/2010"

# Retrieve context from the index and generate the answer with the chat model
answer = query_search_and_generate(question)

print(answer)