In [None]:
from openai import AzureOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from azure.core.credentials import AzureKeyCredential
import os
import dotenv
import base64
import re
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat, AnalyzeOutputOption
from langchain.text_splitter import RecursiveCharacterTextSplitter
dotenv.load_dotenv(override=True)
import os
from os import listdir
from os.path import isfile, join
import glob
import json
import hashlib
import uuid

In [None]:
# Azure OpenAI client shared by the whole notebook: it serves the chat
# completions (image analysis, document classification) and the embedding
# requests. Key, endpoint and API version are read from environment variables;
# os.getenv returns None for any variable that is not set.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION")
)

In [None]:
# Azure Document Intelligence client, authenticated with an API key from the
# environment. The notebook uses it to run the layout analysis and to fetch
# the figure images of an analysis result.
di_client : DocumentIntelligenceClient = DocumentIntelligenceClient(
    credential=AzureKeyCredential(os.getenv("AZURE_DOCUMENT_INTELLIGENCE_API_KEY")),
    endpoint=os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
)

In [None]:
# Azure AI Search client bound to the single index named in
# AZURE_AISEARCH_INDEX_NAME. The upload and delete helpers further down refer
# to it by this exact (misspelled) name, so it must not be renamed in isolation.
aiseach_client = SearchClient(
    endpoint=os.getenv("AZURE_AISEARCH_ENDPOINT"),
    index_name=os.getenv("AZURE_AISEARCH_INDEX_NAME"),
    credential=AzureKeyCredential(os.getenv("AZURE_AISEARCH_KEY"))
)

### Locations of the source files to read and of the generated output

In [None]:
# Input folder: PDFs are discovered recursively below <cwd>/src.
source_path = os.path.join(os.getcwd(), "src")
# Output folder: per-document analysis results, extracted figures and the
# ledger of already uploaded documents are written below this directory.
output_raw_document_path = os.path.join(os.getcwd(), "output_raw_documents")
# Folder name used as prefix for the paths stored in the search index.
destination_folder = "output_raw_documents"

# exist_ok=True makes the cell idempotent without a separate (racy) exists check.
os.makedirs(source_path, exist_ok=True)
os.makedirs(output_raw_document_path, exist_ok=True)

In [None]:
def list_files_glob(path, recursive=True):
    """Find the PDF files below ``path`` and describe each one for the pipeline.

    Path handling goes through ``os.path`` so the result is the same on
    Windows and POSIX (the previous implementation split on a literal
    backslash and only worked on Windows).

    Args:
        path: Directory to search. It is escaped before being used in the glob
            pattern, so characters such as ``[`` in the directory name are
            matched literally.
        recursive: Forwarded to ``glob.glob``; with ``True`` the ``**`` part of
            the pattern matches any number of nested directories.

    Returns:
        A list with one dict per PDF file containing:
        ``file_name`` (base name), ``relative_path`` (relative to the
        module-level ``source_path``), ``absolute_path`` (the path as returned
        by glob), ``path_wo_origin`` (directory part relative to
        ``source_path``, ``""`` for files directly inside it) and
        ``name_wo_extension`` (base name without its last extension, so
        ``"a.b.pdf"`` yields ``"a.b"``).
    """
    pattern = os.path.join(glob.escape(path), "**", "*.pdf")
    list_of_files = []
    for file in glob.glob(pattern, recursive=recursive):
        if not os.path.isfile(file):
            continue
        file_name = os.path.basename(file)
        list_of_files.append({
            "file_name": file_name,
            "relative_path": os.path.relpath(file, source_path),
            "absolute_path": file,
            "path_wo_origin": path_wo_origin(file),
            # splitext only strips the final extension; names with extra dots stay distinct
            "name_wo_extension": os.path.splitext(file_name)[0]
        })
    return list_of_files

def path_wo_origin(path):
    """Return the directory of ``path`` relative to the module-level ``source_path``.

    Args:
        path: Path of a file located below ``source_path``.

    Returns:
        The relative directory without leading or trailing separator, or an
        empty string when the file sits directly inside ``source_path``.
    """
    directory = os.path.relpath(os.path.dirname(path), source_path)
    # relpath reports "same directory" as ".", callers expect an empty string
    return "" if directory == os.curdir else directory

In [None]:
def analyze_image_from_local(image_bytes: bytes) -> str:
    """Ask the 'gpt-4o' chat model for a textual analysis of one image.

    Args:
        image_bytes: Raw content of the image file. It must be bytes-like
            because it is passed to ``base64.b64encode``.

    Returns:
        The message content of the first choice in the chat completion
        response.
    """
    
    # The image travels inline as a base64 data URL instead of a hosted URL.
    base64_image = base64.b64encode(image_bytes).decode('utf-8')
    
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant to analyse images.",
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please provide a detailed analysis of the image, including any relevant context or information that can be inferred from it. \
                     Respond in an objective non sexual, violenting or self-threating manner. Avoid violenting phrases."},
                    {
                        "type": "image_url",
                        # The payload is always labelled image/png, whatever the bytes actually are.
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"}
                    },
                ],
            },
        ],
        max_tokens=2000,
        # temperature 0.0 is requested for every image analysis
        temperature=0.0,
    )

    return response.choices[0].message.content

In [None]:
def analyze_document(file_bytes: bytes) -> dict:
    """Run the "prebuilt-layout" analysis on a document and wait for the result.

    The request asks for Markdown content and for figure output, which the
    figure download step of this notebook relies on.

    Args:
        file_bytes: Raw content of the document to analyse.

    Returns:
        A dict with the ``model_id`` of the result, the ``operation_id`` taken
        from the poller details, and the result converted with ``as_dict()``
        under ``result``.
    """
    request = AnalyzeDocumentRequest(bytes_source=file_bytes)
    poller = di_client.begin_analyze_document(
        "prebuilt-layout",
        request,
        output_content_format=DocumentContentFormat.MARKDOWN,
        output=[AnalyzeOutputOption.FIGURES],
    )

    # Blocks until the long-running operation has finished.
    analysis = poller.result()

    summary = {}
    summary["model_id"] = analysis.model_id
    summary["operation_id"] = poller.details.get('operation_id')
    summary["result"] = analysis.as_dict()
    return summary

In [None]:
def create_output_directory(file_name):
    """Make sure the directory ``file_name`` exists, creating parents as needed.

    Args:
        file_name: Path of the directory to create (despite the parameter
            name, callers pass a directory path, not a file).

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(file_name, exist_ok=True)

In [None]:
def download_images_from_analyzed_document(analyzed_document: dict, file):
    """Store every figure of an analysed document as a PNG file.

    The target directory is ``<output_raw_document_path>/<path_wo_origin>/
    <name_wo_extension>`` and is created even when nothing is downloaded.
    Each figure is written to ``<figure id with "." replaced by "_">.png``.

    Args:
        analyzed_document: Dict as produced by ``analyze_document``; the keys
            ``operation_id``, ``model_id`` and ``result["figures"]`` are read.
        file: File description dict providing ``path_wo_origin`` and
            ``name_wo_extension``.
    """
    target_dir = os.path.join(
        output_raw_document_path,
        file.get("path_wo_origin"),
        file.get("name_wo_extension"),
    )
    create_output_directory(target_dir)

    result = analyzed_document.get("result")
    if not result:
        return
    figures = result.get("figures")
    if not figures:
        return

    figure_ids = [entry["id"] for entry in figures]

    # Both identifiers are required to address the stored analysis result.
    operation_id = analyzed_document.get("operation_id")
    model_id = analyzed_document.get("model_id")
    if not (operation_id and model_id):
        return

    for figure_id in figure_ids:
        figure_bytes = di_client.get_analyze_result_figure(model_id, operation_id, figure_id)
        png_path = os.path.join(target_dir, figure_id.replace(".", "_") + ".png")
        with open(png_path, "wb") as png_file:
            # the service call returns an iterable of byte chunks
            png_file.writelines(figure_bytes)

In [None]:
def create_image_description(filepath: str):
    """Read an image file and return the description produced for it.

    Args:
        filepath: Path of the image file to describe.

    Returns:
        Whatever ``analyze_image_from_local`` returns for the file's bytes.
    """
    with open(filepath, "rb") as handle:
        payload = handle.read()
    return analyze_image_from_local(payload)

In [None]:
def _figure_page_number(figure):
    """Return the page number of a figure's first bounding region, or None."""
    regions = figure.get("boundingRegions") or [{}]
    return regions[0].get("pageNumber")


def _figure_text_pattern(figure, paragraphs):
    """Build a regex matching the page text covered by a figure's paragraphs.

    Returns None when the figure references no usable paragraph text.
    """
    tokens = []
    for element in figure.get("elements") or []:
        parts = str(element).split("/")
        # Only ".../paragraphs/<n>" references are indexes into `paragraphs`.
        if len(parts) < 2 or parts[-2] != "paragraphs" or not parts[-1].isdigit():
            continue
        index = int(parts[-1])
        if index >= len(paragraphs):
            continue
        text = paragraphs[index].get("content") or ""
        for marker in (":selected:", ":unselected:"):
            text = text.replace(marker, "")
        # Matching token by token tolerates any whitespace differences between
        # the paragraph text and the page text.
        tokens.extend(text.split())
    if not tokens:
        return None
    return r"(" + r"\s+".join(map(re.escape, tokens)) + r")"


def replace_image_with_generated_text(analyzed_document):
    """Insert the generated image descriptions into the per-page text.

    For each page in ``split_document`` the figures located on that page are
    looked up in ``analyzed_images`` (by figure id with ``"."`` replaced by
    ``"_"``). If the figure references paragraphs, the first occurrence of
    their text on the page is replaced by the description; a figure without
    elements gets its description appended to the page.

    The description is inserted verbatim. (It used to be passed through
    ``repr()``, which wrapped it in quotes and could make ``re.sub`` fail on
    escape sequences.)

    Args:
        analyzed_document: Pipeline dict providing ``result`` (with ``figures``
            and ``paragraphs``), ``analyzed_images`` and ``split_document``.

    Returns:
        The same dict with ``edited_text`` set to a list of
        ``{"pageNumber", "pageContent"}`` dicts. If an unexpected error
        occurs it is printed and ``edited_text`` falls back to the unedited
        ``split_document`` pages so that later steps still find the key.
    """
    try:
        result = analyzed_document.get("result") or {}
        list_of_figures = result.get("figures") or []
        paragraphs = result.get("paragraphs") or []

        # First entry wins for a given name.
        descriptions = {}
        for image in analyzed_document.get("analyzed_images") or []:
            descriptions.setdefault(image.get("name"), image.get("image_description"))

        edited_pages = []
        for page in analyzed_document.get("split_document") or []:
            page_number = int(page.get("pageNumber"))
            edited_page_content = page.get("pageContent") or ""

            for figure in list_of_figures:
                if _figure_page_number(figure) != page_number:
                    continue
                figure_name = (figure.get("id") or "").replace(".", "_")
                analyzed_image_text = descriptions.get(figure_name)
                if not analyzed_image_text:
                    continue

                if figure.get("elements"):
                    pattern = _figure_text_pattern(figure, paragraphs)
                    if pattern:
                        # A callable replacement is inserted literally: no
                        # backslash or group-reference processing.
                        edited_page_content = re.sub(
                            pattern,
                            lambda _match, text=analyzed_image_text: text,
                            edited_page_content,
                            count=1,
                        )
                else:
                    edited_page_content += "\n\n" + analyzed_image_text

            edited_pages.append({
                "pageNumber": page.get("pageNumber"),
                "pageContent": edited_page_content
            })

        analyzed_document["edited_text"] = edited_pages
        return analyzed_document

    except Exception as e:
        print(f"An error occurred: {e}")
        # Best effort: keep the pipeline usable with the unedited page text.
        if isinstance(analyzed_document, dict) and "edited_text" not in analyzed_document:
            analyzed_document["edited_text"] = analyzed_document.get("split_document") or []
        return analyzed_document


In [None]:
def create_table_of_image_descriptions(analyzed_document,file) -> list:
    """Describe every extracted PNG of a document and attach the list to it.

    The PNG files are looked up in ``<output_raw_document_path>/
    <path_wo_origin>/<name_wo_extension>``. Path handling uses ``os.path`` so
    it works on Windows and POSIX alike, and the files are processed in sorted
    order to make the result deterministic.

    Args:
        analyzed_document: Pipeline dict; its ``analyzed_images`` key is
            (re)assigned to the list built here, also when no image exists.
        file: File description dict providing ``path_wo_origin`` and
            ``name_wo_extension``.

    Returns:
        The list stored under ``analyzed_images``. Each entry holds ``name``
        (file name without extension), ``path``, ``relative_path`` (prefixed
        with ``destination_folder``), ``pageNumber`` (the part of the name
        before the first ``"_"``), ``image_description`` and ``type``
        (always ``"image"``).
    """
    rel_path = os.path.join(file.get("path_wo_origin"),file.get("name_wo_extension"))
    image_dir = os.path.join(output_raw_document_path, rel_path)

    image_pathes = []
    # glob.escape keeps characters like "[" in the directory name literal
    for image_file in sorted(glob.glob(os.path.join(glob.escape(image_dir), "*.png"))):
        image_id = os.path.splitext(os.path.basename(image_file))[0]
        image_pathes.append({
            "name": image_id,
            "path": image_file,
            "relative_path": os.path.join(destination_folder,rel_path,image_id+".png"),
            "pageNumber": image_id.split("_")[0]
        })
    analyzed_document['analyzed_images'] = image_pathes

    for image_path in image_pathes:
        print(image_path)
        image_path["image_description"] = create_image_description(image_path.get("path"))
        image_path["type"] = "image"

    return image_pathes

In [None]:
def split_document(analyzed_document):
    """Build the plain text of every page from its recognised lines.

    Args:
        analyzed_document: Pipeline dict; ``result["pages"]`` is read. Pages
            without a ``lines`` entry and lines without ``content`` are
            tolerated and contribute no text.

    Returns:
        The same dict with ``split_document`` set to a list of
        ``{"pageNumber": str, "pageContent": str}`` dicts, where the line
        contents of a page are joined with single spaces.
    """
    pages = (analyzed_document.get("result") or {}).get("pages") or []

    fileContent = []
    for page in pages:
        line_texts = [
            line.get("content")
            for line in page.get("lines") or []
            if line is not None and line.get("content") is not None
        ]
        fileContent.append({
            "pageNumber": str(page.get("pageNumber")),
            "pageContent": ' '.join(line_texts)
        })

    analyzed_document["split_document"] = fileContent

    return analyzed_document


In [None]:
def chunk_document(analyzed_document):
    """Split the edited text of every page into overlapping chunks.

    Args:
        analyzed_document: Pipeline dict; ``edited_text`` (a list of
            ``{"pageNumber", "pageContent"}`` dicts) is read.

    Returns:
        The same dict with ``chunks`` set to a flat list of
        ``{"pageNumber", "pageContent"}`` dicts, one per chunk, in page order.
    """
    # Chunks are at most 2048 characters long and overlap by 204 characters.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=204)

    analyzed_document["chunks"] = [
        {"pageNumber": page.get("pageNumber"), "pageContent": piece}
        for page in analyzed_document.get("edited_text")
        for piece in text_splitter.split_text(page.get("pageContent"))
    ]
    return analyzed_document

In [None]:
def create_embeddings(analyzed_document):
    """Compute an embedding for every chunk of the document.

    One embedding request is sent per chunk, using the
    "text-embedding-3-large" model.

    Args:
        analyzed_document: Pipeline dict; ``chunks`` is read.

    Returns:
        The same dict with ``chunk_embedding`` set to a list of
        ``(chunk, embedding)`` tuples in chunk order.
    """
    chunks = analyzed_document.get("chunks")
    vectors = [
        client.embeddings.create(
            model="text-embedding-3-large",
            input=chunk.get("pageContent"),
        ).data[0].embedding
        for chunk in chunks
    ]
    analyzed_document['chunk_embedding'] = list(zip(chunks, vectors))
    return analyzed_document

In [None]:
def save_analyzed_document_to_file(analzyedDocument,file,filename):
    """Write a pipeline document as JSON into the document's output folder.

    The folder is ``<output_raw_document_path>/<path_wo_origin>/
    <name_wo_extension>`` and is created when missing.

    Args:
        analzyedDocument: JSON-serialisable object to store.
        file: File description dict providing ``path_wo_origin`` and
            ``name_wo_extension``.
        filename: Name of the JSON file inside the output folder.
    """
    target_dir = os.path.join(
        output_raw_document_path,
        file.get("path_wo_origin"),
        file.get("name_wo_extension"),
    )
    create_output_directory(target_dir)
    target_file = os.path.join(target_dir, filename)
    with open(target_file, "w") as handle:
        handle.write(json.dumps(analzyedDocument))

In [None]:
def create_uuid_from_string(val: str) -> str:
    """Derive a deterministic UUID string from a text value.

    The 16-byte MD5 digest of the UTF-8 encoded text is used as the UUID
    value, so equal inputs always produce equal ids.

    Args:
        val: Text to derive the id from.

    Returns:
        The UUID in its canonical hyphenated string form.
    """
    digest = hashlib.md5(val.encode("UTF-8")).digest()
    return str(uuid.UUID(bytes=digest))

In [None]:
def _search_document_id(*parts):
    """Build a deterministic document id from the given identifying parts."""
    # The unit separator keeps ("a", "bc") and ("ab", "c") from producing the same seed.
    return create_uuid_from_string("\x1f".join(str(part) for part in parts))


def generate_aisearch_output(analyzed_document,file):
    """Assemble the search-index documents for the text chunks and images.

    Ids are derived from the source path, entry type, page number, position
    and content. (Deriving them from the content alone made identical chunks
    from different files or pages share one id, so they overwrote each other
    in the index.)

    The classification ``metadata`` is merged into every document; the core
    fields written here take precedence over metadata keys with the same
    name. A missing ``metadata`` entry is treated as empty. Images without a
    description are skipped because there is nothing to embed.

    Args:
        analyzed_document: Pipeline dict; ``chunk_embedding``,
            ``analyzed_images`` and ``metadata`` are read.
        file: File description dict providing ``name_wo_extension`` and
            ``relative_path``.

    Returns:
        The list of index documents, which is also stored under
        ``analyzed_document["indexed_data"]``.
    """
    metadata = analyzed_document.get("metadata") or {}
    filename = file.get("name_wo_extension")
    relative_path = file.get("relative_path")
    list_of_dict = []

    for position, chunk in enumerate(analyzed_document.get("chunk_embedding") or []):
        text_chunk, vector = chunk[0], chunk[1]
        content = text_chunk["pageContent"]
        page_number = text_chunk["pageNumber"]
        list_of_dict.append({
            **metadata,
            "id": _search_document_id(relative_path, "text", page_number, position, content),
            "filename": filename,
            "path": os.path.join(destination_folder, relative_path),
            "content": content,
            "pageNumber": page_number,
            "contentVector": vector,
            "type": "text"
        })

    for image in analyzed_document.get("analyzed_images") or []:
        description = image.get("image_description")
        if not description:
            continue
        list_of_dict.append({
            **metadata,
            "id": _search_document_id(image.get("relative_path"), "image", image.get("pageNumber"), description),
            "filename": filename,
            "path": image.get("relative_path"),
            "content": description,
            "pageNumber": image.get("pageNumber"),
            "contentVector": client.embeddings.create(
                model="text-embedding-3-large",
                input=description
            ).data[0].embedding,
            "type": image.get("type")
        })

    analyzed_document["indexed_data"] = list_of_dict

    return list_of_dict

In [None]:
def upload_documents_to_vectordb(list_documents_to_upload, batch_size=100):
    """Upload the index documents to the search index in batches.

    Args:
        list_documents_to_upload: Iterable of index documents (dicts);
            ``None`` or an empty iterable uploads nothing.
        batch_size: Maximum number of documents sent per request. The default
            is an assumption meant to keep requests with large embedding
            vectors reasonably small -- tune it for the index at hand.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If the service reports documents that were not indexed
            (the per-document results used to be ignored silently).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    documents = list(list_documents_to_upload or [])
    failed = []
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        results = aiseach_client.upload_documents(documents=batch)
        failed.extend(
            result for result in results or []
            if not getattr(result, "succeeded", True)
        )

    if failed:
        details = ", ".join(
            f"{getattr(result, 'key', '?')}: {getattr(result, 'error_message', None)}"
            for result in failed
        )
        raise RuntimeError(f"{len(failed)} document(s) were not indexed: {details}")

In [None]:
def delete_documents_from_vectordb(file):
    """Delete the index documents that belong to a source file.

    The documents are found with an OData filter on the ``filename`` field and
    the search hits are passed on to ``delete_documents``.

    Args:
        file: File description dict; ``name_wo_extension`` is the value
            matched against ``filename``. Nothing happens when it is empty.
    """
    filename = file.get("name_wo_extension")
    if not filename:
        return

    # In OData string literals a single quote is escaped by doubling it;
    # without this a name such as "pilot's guide" produces an invalid filter.
    escaped_filename = str(filename).replace("'", "''")
    results = list(aiseach_client.search(filter=f"filename eq '{escaped_filename}'"))
    if results:
        aiseach_client.delete_documents(documents=results)

In [None]:
def classify_document_content(file,prompt,analyzed_document,list_of_sources):
    """Let the 'gpt-4o' chat model produce metadata for a document.

    The prompt is sent as system message; the file description, the document
    content (``result["content"]``) and the list of source files are sent as
    one user message. The model is asked for a JSON object response.

    Args:
        file: File description dict of the document.
        prompt: System prompt with the classification instructions.
        analyzed_document: Pipeline dict; ``result["content"]`` is read.
        list_of_sources: List of all source files, included in the request.

    Returns:
        The same dict with ``metadata`` set to the parsed JSON response.
    """
    document_content = analyzed_document.get('result').get('content')
    system_message = {"role": "system", "content": prompt}
    user_message = {
        "role": "user",
        "content": f"fileObject: {file}, documentContent: {document_content}, list_of_source_files: {list_of_sources}",
    }

    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[system_message, user_message],
        temperature=0.0,
        response_format={ "type": "json_object" },
    )

    raw_metadata = completion.choices[0].message.content
    analyzed_document["metadata"] = json.loads(raw_metadata)
    return analyzed_document

In [None]:
# System prompt for classify_document_content. It asks the model for a JSON
# object with the keys "shelf", "book", "keywords" and "abstract".
# The <<fileObject>> and <<documentContent>> markers are not substituted by
# any code in this notebook: the prompt is sent unchanged, and the actual file
# object and document content are supplied in the user message.
classification_prompt = \
"""
Your task is to create metadata for a given document and output them as a json object.
The document content is based on aviation knowledge base records.


1. You match the given document path to a shelf and a book.
try to extract from the absolutePath of the file object the shelf and book.
The list of shelfs: ["Welcome", "Airport Pilotbriefings", "ATC Training (German)", "ATC Training (English)", "Centersectors", "Flight Information Regions (FIRs)", "LoA", "Pilots", "Software", "SOPs-Airports", "Tools"]
The list of books: ["AFIS", "Air law", "Aircraft Knowledge", "Airfields Germany", "Airports Bremen FIR - EDWW", "Airports Langen FIR - EDGG", "Airports München FIR - EDMM", "Airspace Germany", "Allgemein (deutsch)", "Ansprechpartner", "ATC", "ATC English", "ATC Software", "Aufgaben und Zuständigkeitsbereiche", "Ausbildungsübersicht PTD", "Bremen FIR (EDWW)", "Coordination", "CPDLC Logon Codes", "EDGG - Langen Radar", "EDMM - München Radar", "EDUU - Rhein Radar", "EDWW - Bremen Radar", "EDYY - Maastricht Radar", "Einstieg als Pilot", "Familiarisation", "FIS - Langen Information", "Flugzeugkunde", "General (englisch)", "Heli-Ops", "IFR", "Koordination", "Langen FIR (EDGG)", "Luftrecht", "Meteorologie", "Meteorology", "Military", "Military Procedures", "München FIR (EDMM)", "Phraseologie", "Phraseology", "Pilot", "Pilot Software", "Practical Procedures", "Praktische Verfahren", "Quicksheets", "Segelflug", "Separation", "SOPs FIR Bremen", "SOPs FIR Langen", "SOPs FIR München", "Staffelung", "Tasks and areas of responsibility", "Technical Knowledge", "Technikkunde", "Trainingsmodule PTD", "vACDM", "VatGer Touren", "VFR", "vSID Plugin", "Euroscope SID assignment"]

example: 
path: 'c:\repositories\debug\gptsamples\vatsim\output_raw_documents\willkommen\pilot.pdf'
result: shelf: welcome, book: pilot

Shelf and book can only contain exactly one matching item!

<<fileObject>>

2. Assign keywords based on the document content.
Here is a list of keywords: [Controller Information, Pilot Information, General Aviation, VATSIM Information, Phraseology, VFR, IFR]

Assign maximum 3 keywords for each document.

<<documentContent>>

3. Create an abstract from the document content.
Not longer than 20 sentences. Try to highlight the key elements of the content.
<<documentContent>>

Respond with a JSON Object. Do not respond with something else! 
Output in the following format:

{
    "shelf": shelf
    "book": book
    "keywords": [item1, item2, ..]
    "abstract": abstract
}
"""


In [None]:
# Driver: run the full pipeline for every PDF below source_path that is not
# yet recorded in the ledger of uploaded documents.
list_documents_to_upload = []
list_of_pdfs = list_files_glob(source_path)
uploaded_documents_path = os.path.join(output_raw_document_path,"uploaded_documents.json")

# The ledger does not exist on the very first run; start with an empty one
# instead of failing with FileNotFoundError.
if os.path.exists(uploaded_documents_path):
    with open(uploaded_documents_path, "r") as f:
        files = json.loads(f.read())
else:
    files = []

for file in list_of_pdfs:
    # NOTE(review): the ledger is keyed by the bare file name, so PDFs with the
    # same name in different folders are treated as one document.
    if file.get("file_name") in files:
        continue

    analyzed_document_path = os.path.join(output_raw_document_path,file.get("path_wo_origin"),file.get("name_wo_extension"),"analyzed_document.json")
    if not os.path.exists(analyzed_document_path):
        with open(file.get("absolute_path"), "rb") as f:
            analyzed_document = analyze_document(f.read())
        # Fetch the figures before writing the cache file: if the download
        # fails, the next run analyses the document again instead of reusing a
        # cached analysis whose figures were never stored.
        download_images_from_analyzed_document(analyzed_document, file)
        save_analyzed_document_to_file(analyzed_document,file,"analyzed_document.json")
    else:
        # Reuse the cached analysis; its figures were downloaded when it was created.
        with open(analyzed_document_path, "r") as f:
            analyzed_document = json.loads(f.read())

    create_table_of_image_descriptions(analyzed_document,file)
    split_document(analyzed_document)
    replace_image_with_generated_text(analyzed_document)
    chunk_document(analyzed_document)
    create_embeddings(analyzed_document)
    classify_document_content(file,classification_prompt,analyzed_document,list_of_pdfs)
    generate_aisearch_output(analyzed_document,file)
    save_analyzed_document_to_file(analyzed_document,file,"enriched_document.json")

    # Replace whatever the index already holds for this file.
    delete_documents_from_vectordb(file)
    upload_documents_to_vectordb(analyzed_document.get("indexed_data"))

    # Persist the ledger after every file so an interrupted run resumes where it stopped.
    files.append(file.get("file_name"))
    with open(uploaded_documents_path, "w") as f:
        f.write(json.dumps(files))

### EXPLORATION TO BE DELETED

In [None]:
# analyzed_document["result"]['tables']

In [None]:
# import pandas as pd
# panda_tables = []
# def create_pandas_tables(table):
#     column_length = [i for i in range(table.column_count)]
#     column_headers = []
#     for cell in table.cells:
#         if cell.row_index == 0 and cell.get("columnSpan") is None:
#             column_headers.append(cell.content)
#         elif cell.row_index == 0 and cell.get("columnSpan") is not None:
#             column_headers.append(cell.content)
#             for i in range(1,cell.get("columnSpan")):
#                 column_headers.append(cell.content)
    
#     df = pd.DataFrame(columns=column_length)
#     for cell in table.cells:
#         if cell.row_index != 0 and cell.get("columnSpan") is None:
#             df.at[cell.row_index,cell.column_index] = cell.content
#         elif cell.row_index != 0 and cell.get("columnSpan") is not None:
#             df.at[cell.row_index,cell.column_index] = cell.content
#             for i in range(1,cell.get("columnSpan")):
#                 df.at[cell.row_index,cell.column_index+i] = cell.content

#     df.fillna("", inplace=True)
#     df.columns = column_headers
#     return df

In [None]:
# table_output = (create_pandas_tables(analyzed_document["result"]['tables'][4]))

In [None]:
# table_output.to_markdown()