In [53]:
%run base.ipynb

# End-2-End-Test
Als QA-Lab-Team möchten wir einen einfachen E2E-Test aufsetzen, der die Gesamtfunktionalität unseres Prototypen testet

# 1. Laden der Testdaten von Dataland
Erstellen einer Liste mit allen Company-IDs, deren Nuclear & Gas Datasets wir in unserem Testdurchlauf verwenden möchten.

In [54]:
# Dataland company IDs whose Nuclear & Gas datasets take part in this test run.
# Later cells index into this list by position, so the order matters.
company_id_list = [
    "cb19fd04-f458-4c29-a175-2102e640cee2",
    "888144af-aa27-4a60-8d5f-5805b2e3ee88",
    "a9a0949a-5190-45c1-9542-4f445e63b368",
    "298693d7-de4d-4599-a8d1-b85c8897d940",
    "b3249e50-dede-4fc2-b348-8b8daf36aebe",
]

In [55]:
from dataland_qa_lab.utils import config

# Load the QA-Lab configuration; later cells read the Azure keys/endpoints
# and the Dataland client from this object.
conf = config.get_config()
# Dataland API client taken from the configuration, shared by all following cells.
dataland_client = conf.dataland_client

Nun erstellen wir für jede CompanyID einen Nuclear & Gas Datensatz auf dataland.com

In [59]:
from dataland_backend.models.company_associated_data_nuclear_and_gas_data import CompanyAssociatedDataNuclearAndGasData
from dataland_backend.models.extended_data_point_yes_no import ExtendedDataPointYesNo
from dataland_backend.models.nuclear_and_gas_data import NuclearAndGasData
from dataland_backend.models.nuclear_and_gas_general import NuclearAndGasGeneral
from dataland_backend.models.nuclear_and_gas_general_general import NuclearAndGasGeneralGeneral

# Post one Nuclear & Gas dataset per company in `company_id_list`, alternating
# the reported value between "Yes" and "No" so the test data covers both answers.
alternate_yes_no = ["Yes", "No"]
alternate_switch = 0

# The API handle does not change between iterations, so resolve it once.
api = dataland_client.eu_taxonomy_nuclear_gas_api

for i, current_company_id in enumerate(company_id_list):
    # Derive the toggle from the loop index. The earlier version used two
    # consecutive `if` statements, which flipped the switch to 1 and straight
    # back to 0, so every dataset was posted with "Yes".
    alternate_switch = i % len(alternate_yes_no)

    # Only the first datapoint (section 4.26) is populated; no source document
    # is attached to it (dataSource=None).
    nuclear_and_gas_dataset1 = NuclearAndGasData(
        general=NuclearAndGasGeneral(
            general=NuclearAndGasGeneralGeneral(
                nuclearEnergyRelatedActivitiesSection426=ExtendedDataPointYesNo(
                    value=alternate_yes_no[alternate_switch],
                    quality="Reported",
                    comment="",
                    dataSource=None
                )
            )
        )
    )

    company_nuclear_and_gas_data = CompanyAssociatedDataNuclearAndGasData(
        company_id=current_company_id,
        reportingPeriod="2024",
        data=nuclear_and_gas_dataset1,
    )

    api.post_company_associated_nuclear_and_gas_data(company_nuclear_and_gas_data)


Basierend auf der CompanyID laden wir nun das zugehörige Nuclear & Gas Dataset

In [None]:
# Continue the walkthrough with the fifth company of the test set.
company_id = company_id_list[4]

# Fetch all Nuclear & Gas datasets stored for that company and remember the
# data ID of the first one; the QA report is posted against this ID later on.
nuclear_gas_api = dataland_client.eu_taxonomy_nuclear_gas_api
dataset = nuclear_gas_api.get_all_company_nuclear_and_gas_data(company_id=company_id)
data_id = dataset[0].meta_info.data_id

# Show the raw API response followed by the selected data ID.
print(dataset, data_id, sep="\n")

Von den ersten sechs Datenpunkten des Nuclear & Gas Datasets (Meldebogen 1) wird hier nur der erste geladen; die übrigen fünf sind im Code auskommentiert.

In [None]:
# Print the first datapoint (section 4.26) of every dataset returned for the company.
for dataset_entry in dataset:
    general_section = dataset_entry.data.general.general

    print("1. Datenpunkt: " + general_section.nuclear_energy_related_activities_section426.value.value)
    # The remaining five datapoints are left disabled; the test datasets created
    # earlier in this notebook only populate section 4.26.
    # print("2. Datenpunkt: " + general_section.nuclear_energy_related_activities_section427.value.value)
    # print("3. Datenpunkt: " + general_section.nuclear_energy_related_activities_section428.value.value)
    # print("4. Datenpunkt: " + general_section.fossil_gas_related_activities_section429.value.value)
    # print("5. Datenpunkt: " + general_section.fossil_gas_related_activities_section430.value.value)
    # print("6. Datenpunkt: " + general_section.fossil_gas_related_activities_section431.value.value)

# Take the datapoint from dataset[0], the same entry `data_id` was read from.
# Assigning inside the loop left the LAST dataset's datapoint selected, so with
# more than one dataset the QA report would have been posted against a different
# dataset than the one that was analysed.
first_datapoint = dataset[0].data.general.general.nuclear_energy_related_activities_section426

# 2. Laden der Datenquelle von Dataland und Überführen in Textform

In [63]:
# Download the source document referenced by the selected datapoint.
# The test datasets created in this notebook are posted with dataSource=None,
# so fail with an explicit message instead of an opaque AttributeError when
# the selected datapoint carries no document reference.
datapoint_source = first_datapoint.data_source
if datapoint_source is None or not datapoint_source.file_reference:
    raise ValueError(
        "The selected datapoint has no data source / file reference; "
        "cannot download a document to verify it against."
    )

document_bytes = dataland_client.documents_api.get_document(datapoint_source.file_reference)

Finden des relevanten Bereichs im PDF-Dokument

In [64]:
import io

import pypdf

# Wrap the downloaded bytes so pypdf can read the complete document.
full_document_byte_stream = io.BytesIO(document_bytes)
full_pdf = pypdf.PdfReader(full_document_byte_stream)

# The data source's page number is shifted by one to get pypdf's 0-based index.
source_page_index = int(first_datapoint.data_source.page) - 1
source_page = full_pdf.get_page(source_page_index)

# Copy only the referenced page into a fresh in-memory PDF.
partial_pdf = pypdf.PdfWriter()
partial_pdf.add_page(source_page)
partial_document_byte_stream = io.BytesIO()
partial_pdf.write(partial_document_byte_stream)

# Rewind so the next consumer reads the stream from the start.
partial_document_byte_stream.seek(0)
# Trailing None keeps the cell from echoing seek()'s return value.
None

Nun nutzen wir die Azure Document Intelligence API, um den Text aus dem PDF zu extrahieren.

In [65]:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, ContentFormat
from azure.core.credentials import AzureKeyCredential

# Authenticate against Azure Document Intelligence with the configured key and endpoint.
docintel_cred = AzureKeyCredential(conf.azure_docintel_api_key)
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=conf.azure_docintel_endpoint,
    credential=docintel_cred,
)

# Send the extracted PDF bytes to the "prebuilt-layout" model and request the
# result as Markdown.
analysis_options = {
    "content_type": "application/octet-stream",
    "output_content_format": ContentFormat.MARKDOWN,
}
poller = document_intelligence_client.begin_analyze_document(
    "prebuilt-layout",
    analyze_request=partial_document_byte_stream,
    **analysis_options,
)

# Wait for the long-running analysis to finish and keep its result.
result: AnalyzeResult = poller.result()

Das Ergebnis können wir uns direkt im Notebook anzeigen lassen.

In [None]:
from IPython.display import Markdown, display

# Render the extracted text as formatted Markdown instead of a raw string.
extracted_markdown = Markdown(result.content)
display(extracted_markdown)

# 3. Verifizierung mithilfe von GPT-4o

In [67]:
from openai import AzureOpenAI

def _normalize_yes_no(answer):
    """Map a free-text yes/no answer onto the "Yes"/"No" spelling used by Dataland.

    Surrounding whitespace and trailing punctuation are ignored and the match is
    case-insensitive. Anything that is not recognisably yes or no (including
    None) is returned unchanged, so unexpected model output stays visible.
    """
    if answer is None:
        return answer
    cleaned = answer.strip().rstrip(".!").strip().lower()
    return {"yes": "Yes", "no": "No"}.get(cleaned, answer)


# Azure OpenAI client configured from the shared QA-Lab configuration.
client = AzureOpenAI(
    api_key=conf.azure_openai_api_key, api_version="2024-07-01-preview", azure_endpoint=conf.azure_openai_endpoint
)

deployment_name = "gpt-4o"

# The prompt embeds the Markdown extracted from the source page and asks for a
# bare yes/no answer to the question behind the first datapoint.
prompt = f"""
You are an AI research Agent. As the agent, you answer questions briefly, succinctly, and factually.
Always justify you answer.

# Safety
- You **should always** reference factual statements to search results based on [relevant documents]
- Search results based on [relevant documents] may be incomplete or irrelevant. You do not make assumptions
  on the search results beyond strictly what's returned.
- If the search results based on [relevant documents] do not contain sufficient information to answer user
  message completely, you respond using the tool 'cannot_answer_question'
- Your responses should avoid being vague, controversial or off-topic.

# Task
Given the information from the [relevant documents], is the company engaged in the research, development,
demonstration, and deployment of innovative power generation facilities that generate energy from nuclear
processes with minimal waste from the fuel cycle, finance such activities, or hold risk positions related
to these activities? Just answer the question with yes or no. The answer should not be longer than 3
characters and should not include punctation.

# Relevant Documents
{result.content}
"""

# temperature=0 is requested to keep the answer as stable as possible between runs.
initial_openai_response = client.chat.completions.create(
    model=deployment_name,
    temperature=0,
    messages=[
        {"role": "system", "content": prompt},
    ],
)

# Print the raw model answer for inspection.
raw_answer = initial_openai_response.choices[0].message.content
print(raw_answer)

# The prompt asks for lowercase "yes"/"no" while Dataland stores "Yes"/"No";
# normalise so the value can be compared with, and submitted as, a datapoint value.
report_value = _normalize_yes_no(raw_answer)

# 4. Vergleich der beiden Werte

In [None]:
# Show the value stored in Dataland next to the value derived from the report.
dataland_line = f"Wert in Dataland: \t{first_datapoint.value.value}"
report_line = f"Wert im Bericht: \t{report_value}"
print(dataland_line, report_line, sep="\n")

# 5. Erstellen und Abschicken eines QA Report

In [None]:
from dataland_qa.models.extended_data_point_yes_no import ExtendedDataPointYesNo
from dataland_qa.models.nuclear_and_gas_data import NuclearAndGasData
from dataland_qa.models.nuclear_and_gas_general import NuclearAndGasGeneral
from dataland_qa.models.nuclear_and_gas_general_general import NuclearAndGasGeneralGeneral
from dataland_qa.models.qa_report_data_point_extended_data_point_yes_no import QaReportDataPointExtendedDataPointYesNo
from dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict

# Build a QA report for the first datapoint (section 4.26) that carries the
# value derived from the source document as the corrected value.
selected_qa_report = NuclearAndGasData(
    general=NuclearAndGasGeneral(
        general=NuclearAndGasGeneralGeneral(
            nuclear_energy_related_activities_section426=QaReportDataPointExtendedDataPointYesNo(
                # State the value that is actually submitted as the correction;
                # the comment used to hard-code "No" regardless of `report_value`.
                comment=f"The value in the report is incorrect. The correct value is {report_value}",
                # NOTE(review): the verdict is always "rejected", even when the
                # Dataland value and `report_value` agree - confirm whether the
                # verdict should depend on that comparison.
                verdict=QaReportDataPointVerdict.QAREJECTED,
                correctedData=ExtendedDataPointYesNo(
                    value=report_value,
                    quality="Reported",
                )
            )
        )
    )
)

# Submit the report for the dataset identified by `data_id`.
api = dataland_client.eu_taxonomy_nuclear_gas_qa_api
api.post_nuclear_and_gas_data_qa_report(data_id, selected_qa_report)