In [85]:
%run base.ipynb

# End-2-End-Test
Als QA-Lab-Team möchten wir einen einfachen E2E-Test aufsetzen, der die Gesamtfunktionalität unseres Prototypen testet.

# 1. Laden der Testdaten von Dataland
Erstellen einer Liste mit allen Company IDs, deren Nuclear & Gas Datasets wir in unserem Testdurchlauf verwenden möchten.

In [86]:
# Company IDs whose Nuclear & Gas datasets take part in this test run.
company_id_list = [
    "4423c691-0436-423f-abcb-0a08127ee848",
    "660e1da9-66ac-476e-9c57-034343e392cd",
    "82bedcc2-3f13-43cf-8579-c46f816d57fe",
    "deb294b8-123e-4fb9-b834-4fa322b30883",
    "f2451040-7019-4044-86e5-8137c4fda811",
    # The following ID is currently excluded from the run:
    # "8054c2ce-3c6d-4078-8f69-78f8c648a5d6",
]

In [87]:
from dataland_qa_lab.utils import config

# Load the project configuration once; besides the Dataland client it also
# carries the Azure credentials and endpoints read by later cells.
conf = config.get_config()
# Client object through which the Dataland API calls in this notebook are made.
dataland_client = conf.dataland_client

Nun erstellen wir für jede CompanyID einen Nuclear & Gas Datensatz auf dataland.com

In [88]:
from dataland_backend.models.company_associated_data_nuclear_and_gas_data import CompanyAssociatedDataNuclearAndGasData
from dataland_backend.models.extended_data_point_yes_no import ExtendedDataPointYesNo
from dataland_backend.models.nuclear_and_gas_data import NuclearAndGasData
from dataland_backend.models.nuclear_and_gas_general import NuclearAndGasGeneral
from dataland_backend.models.nuclear_and_gas_general_general import NuclearAndGasGeneralGeneral

# Values assigned in turn to the uploaded datasets: "Yes", "No", "Yes", ...
alternate_yes_no = ["Yes", "No"]

# The API handle is the same for every upload, so resolve it once outside the loop.
api = dataland_client.eu_taxonomy_nuclear_gas_api

for i, company_id in enumerate(company_id_list):
    # Derive the index from the loop counter. The previous toggle used two
    # consecutive `if` statements, so the second one immediately reverted the
    # first and every dataset ended up being uploaded with "Yes".
    alternate_switch = i % len(alternate_yes_no)

    # Minimal dataset: only the section 4.26 data point is populated, without a data source.
    nuclear_and_gas_dataset1 = NuclearAndGasData(
        general=NuclearAndGasGeneral(
            general=NuclearAndGasGeneralGeneral(
                nuclearEnergyRelatedActivitiesSection426=ExtendedDataPointYesNo(
                    value=alternate_yes_no[alternate_switch], quality="Reported", comment="", dataSource=None
                )
            )
        )
    )

    # Attach the dataset to the company for the fixed reporting period 2024.
    company_nuclear_and_gas_data = CompanyAssociatedDataNuclearAndGasData(
        company_id=company_id,
        reportingPeriod="2024",
        data=nuclear_and_gas_dataset1,
    )

    api.post_company_associated_nuclear_and_gas_data(company_nuclear_and_gas_data)

Basierend auf der ersten CompanyID aus der Liste laden wir nun die zugehörigen Nuclear & Gas Datasets und merken uns die Data-ID des ersten Eintrags.

In [89]:
# Only the first company of the list is inspected in the remainder of the notebook.
company_id = company_id_list[0]

dataset = dataland_client.eu_taxonomy_nuclear_gas_api.get_all_company_nuclear_and_gas_data(company_id=company_id)

# Fail with a readable message instead of a bare IndexError when nothing was returned.
assert dataset, f"No Nuclear & Gas dataset found for company {company_id}"

# The QA report at the end of the notebook is posted against this data ID.
data_id = dataset[0].meta_info.data_id

print(dataset)
print(data_id)

[DataAndMetaInformationNuclearAndGasData(meta_info=DataMetaInformation(data_id='d67db903-575d-48c3-99c3-b7f3f8ea174f', company_id='4423c691-0436-423f-abcb-0a08127ee848', data_type=<DataTypeEnum.NUCLEAR_MINUS_AND_MINUS_GAS: 'nuclear-and-gas'>, uploader_user_id='143995c1-1ead-4324-a49f-043eeaabace4', upload_time=1732022857239, reporting_period='2024', currently_active=True, qa_status=<QaStatus.ACCEPTED: 'Accepted'>), data=NuclearAndGasData(general=NuclearAndGasGeneral(general=NuclearAndGasGeneralGeneral(referenced_reports={'MVV 2023_engl': CompanyReport(file_reference='893f3ce12424765026ff72770f5982ec895e7befc880a71606e96b926cfd0a34', file_name=None, publication_date=datetime.date(2024, 11, 19))}, nuclear_energy_related_activities_section426=ExtendedDataPointYesNo(value=<YesNo.YES: 'Yes'>, quality=<QualityOptions.REPORTED: 'Reported'>, comment='', data_source=ExtendedDocumentReference(page='271', tag_name=None, file_name='MVV 2023_engl', file_reference='893f3ce12424765026ff72770f5982ec89

Aus dem Nuclear & Gas Dataset wird der erste der sechs Datenpunkte aus Meldebogen 1 geladen; die übrigen fünf Datenpunkte sind im Code derzeit auskommentiert.

In [90]:
# Print the section 4.26 data point of every dataset returned for the company.
for entry in dataset:
    print("1. Datenpunkt: " + entry.data.general.general.nuclear_energy_related_activities_section426.value.value)
    # The remaining five Template 1 data points are disabled for now:
    # print("2. Datenpunkt: " + entry.data.general.general.nuclear_energy_related_activities_section427.value.value)
    # print("3. Datenpunkt: " + entry.data.general.general.nuclear_energy_related_activities_section428.value.value)
    # print("4. Datenpunkt: " + entry.data.general.general.fossil_gas_related_activities_section429.value.value)
    # print("5. Datenpunkt: " + entry.data.general.general.fossil_gas_related_activities_section430.value.value)
    # print("6. Datenpunkt: " + entry.data.general.general.fossil_gas_related_activities_section431.value.value)

# Take the data point from dataset[0], i.e. the same dataset `data_id` was read
# from. Previously the variable was overwritten on every iteration and ended up
# pointing at the LAST dataset, so the value being verified and the dataset the
# QA report is posted for could differ (and it stayed undefined for an empty list).
first_datapoint = dataset[0].data.general.general.nuclear_energy_related_activities_section426

1. Datenpunkt: Yes


# 2. Laden der Datenquelle von Dataland und Überführen in Textform

In [91]:
# Download the document referenced by the data point's data source.
# NOTE(review): the datasets uploaded earlier in this notebook are created with
# dataSource=None; for such a data point `data_source` is presumably None and
# this line would raise an AttributeError - confirm which dataset is used here.
document_bytes = dataland_client.documents_api.get_document(first_datapoint.data_source.file_reference)

Finden des relevanten Bereichs im PDF-Dokument

In [92]:
import io

import pypdf

# Wrap the downloaded bytes in a file-like object so pypdf can read them.
full_document_byte_stream = io.BytesIO(document_bytes)
full_pdf = pypdf.PdfReader(full_document_byte_stream)

# The page reference is a 1-based number stored as a string; pypdf counts from 0.
source_page_index = int(first_datapoint.data_source.page) - 1
source_page = full_pdf.get_page(source_page_index)

# Build a new single-page PDF that contains only the referenced page.
partial_pdf = pypdf.PdfWriter()
partial_pdf.add_page(source_page)

# Serialise it into memory and rewind so the next reader starts at the beginning.
partial_document_byte_stream = io.BytesIO()
partial_pdf.write(partial_document_byte_stream)
partial_document_byte_stream.seek(0);  # trailing ';' keeps the cell from echoing the seek() result

Nun nutzen wir die Azure Document Intelligence API, um den Text aus dem PDF zu extrahieren.

In [93]:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, ContentFormat
from azure.core.credentials import AzureKeyCredential

# Authenticate against Azure Document Intelligence with the key and endpoint from the configuration.
docintel_cred = AzureKeyCredential(conf.azure_docintel_api_key)
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=conf.azure_docintel_endpoint, credential=docintel_cred
)

# Send the single-page PDF as a raw byte stream to the "prebuilt-layout" model
# and request the extracted content in Markdown format.
poller = document_intelligence_client.begin_analyze_document(
    "prebuilt-layout",
    analyze_request=partial_document_byte_stream,
    content_type="application/octet-stream",
    output_content_format=ContentFormat.MARKDOWN,
)
# Fetch the analysis result from the poller; `result.content` is used by the following cells.
result: AnalyzeResult = poller.result()

Das Ergebnis können wir uns direkt im Notebook anzeigen lassen.

In [94]:
from IPython.display import Markdown, display

# Render the extracted content as Markdown so tables show up formatted in the notebook.
display(Markdown(result.content))

<!-- PageHeader="Other Disclosures > EU Taxonomy Tables" -->


# Sales (turnover)


<table>
<caption>Sales (turnover) FY 2023: Template 1 Nuclear and fossil gas related activities</caption>
<tr>
<th>Row</th>
<th>Nuclear energy related activities</th>
<th></th>
</tr>
<tr>
<td>1.</td>
<td>The undertaking carries out, funds or has exposures to research, development, demonstration and deployment of innovative electricity generation facilities that produce energy from nuclear processes with minimal waste from the fuel cycle.</td>
<td>No</td>
</tr>
<tr>
<td>2.</td>
<td>The undertaking carries out, funds or has exposures to construction and safe operation of new nuclear installations to produce electricity or process heat, including for the purposes of district heating or industrial processes such as hydrogen production, as well as their safety upgrades, using best available technologies.</td>
<td>No</td>
</tr>
<tr>
<td>3.</td>
<td>The undertaking carries out, funds or has exposures to safe operation of existing nuclear installations that produce electricity or process heat, including for the purposes of district heating or industrial processes such as hydrogen production, from nuclear energy, as well as their safety upgrades.</td>
<td>No</td>
</tr>
<tr>
<td>Row</td>
<td>Fossil gas related activities</td>
<td></td>
</tr>
<tr>
<td>4.</td>
<td>The undertaking carries out, funds or has exposures to construction or operation of electricity generation facilities that produce electricity using fossil gaseous fuels.</td>
<td>Yes</td>
</tr>
<tr>
<td>5.</td>
<td>The undertaking carries out, funds or has exposures to construction, refurbishment and operation of combined heat/cool and power generation facilities using fossil gaseous fuels.</td>
<td>Yes</td>
</tr>
<tr>
<td>6.</td>
<td>The undertaking carries out, funds or has exposures to construction, refurbishment and operation of heat generation facilities that produce heat/cool using fossil gaseous fuels.</td>
<td>Yes</td>
</tr>
</table>


## Sales (turnover) FY 2023: Template 2 Taxonomy-aligned economic activities (denominator)


<table>
<tr>
<th>Row</th>
<th>Economic activities</th>
<th colspan="6">Amount and proportion (disclosed as monetary amounts and percentages)</th>
</tr>
<tr>
<th></th>
<th></th>
<th colspan="2">CCM + CCA</th>
<th colspan="2">Climate change mitigation (CCM)</th>
<th colspan="2">Climate change adaptation (CCA)</th>
</tr>
<tr>
<th></th>
<th></th>
<th>Euro 000</th>
<th>%</th>
<th>Euro 000s</th>
<th>%</th>
<th>Euro 000s</th>
<th>%</th>
</tr>
<tr>
<td>1.</td>
<td>Amount and proportion of taxonomy-aligned economic activity referred to in Section 4.26 of Annexes I and II to Delegated Regulation 2021/2139 in the denominator of the applicable KPI</td>
<td>–</td>
<td>–</td>
<td>–</td>
<td>–</td>
<td>–</td>
<td>–</td>
</tr>
<tr>
<td>2.</td>
<td>Amount and proportion of taxonomy-aligned economic activity referred to in Section 4.27 of Annexes I and II to Delegated Regulation 2021/2139 in the denominator of the applicable KPI</td>
<td>–</td>
<td>–</td>
<td>–</td>
<td>–</td>
<td>–</td>
<td>–</td>
</tr>
<tr>
<td>3.</td>
<td>Amount and proportion of taxonomy-aligned economic activity referred to in Section 4.28 of Annexes I and II to Delegated Regulation 2021/2139 in the denominator of the applicable KPI</td>
<td>–</td>
<td>–</td>
<td>–</td>
<td>–</td>
<td>–</td>
<td>–</td>
</tr>
<tr>
<td>4.</td>
<td>Amount and proportion of taxonomy-aligned economic activity referred to in Section 4.29 of Annexes I and II to Delegated Regulation 2021/2139 in the denominator of the applicable KPI</td>
<td>0</td>
<td>0.0</td>
<td>0</td>
<td>0.0</td>
<td>–</td>
<td>–</td>
</tr>
<tr>
<td>5.</td>
<td>Amount and proportion of taxonomy-aligned economic activity referred to in Section 4.30 of Annexes I and II to Delegated Regulation 2021/2139 in the denominator of the applicable KPI</td>
<td>35,890</td>
<td>0.5</td>
<td>35,890</td>
<td>0.5</td>
<td>–</td>
<td>–</td>
</tr>
<tr>
<td>6.</td>
<td>Amount and proportion of taxonomy-aligned economic activity referred to in Section 4.31 of Annexes I and II to Delegated Regulation 2021/2139 in the denominator of the applicable KPI</td>
<td>0</td>
<td>0.0</td>
<td>0</td>
<td>0.0</td>
<td>–</td>
<td>–</td>
</tr>
<tr>
<td>7.</td>
<td>Amount and proportion of other taxonomy-aligned economic activities not referred to in rows 1 to 6 above in the denominator of the applicable KPI</td>
<td>918,790</td>
<td>12.2</td>
<td>918,790</td>
<td>12.2</td>
<td>–</td>
<td>–</td>
</tr>
<tr>
<td>8.</td>
<td>Total applicable KPI</td>
<td>7,530,520</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
</table>


<!-- PageFooter="MVV Annual Report 2023" -->
<!-- PageNumber="271" -->


# 3. Verifizierung mithilfe von GPT-4o

In [95]:
from openai import AzureOpenAI

# Azure OpenAI client configured with the key and endpoint from the project configuration.
client = AzureOpenAI(
    api_key=conf.azure_openai_api_key, api_version="2024-07-01-preview", azure_endpoint=conf.azure_openai_endpoint
)

deployment_name = "gpt-4o"

# The prompt embeds the Markdown extracted from the report page and asks for a
# plain yes/no answer to the first Template 1 question.
prompt = f"""
You are an AI research Agent. As the agent, you answer questions briefly, succinctly, and factually.
Always justify you answer.

# Safety
- You **should always** reference factual statements to search results based on [relevant documents]
- Search results based on [relevant documents] may be incomplete or irrelevant. You do not make assumptions
  on the search results beyond strictly what's returned.
- If the search results based on [relevant documents] do not contain sufficient information to answer user
  message completely, you respond using the tool 'cannot_answer_question'
- Your responses should avoid being vague, controversial or off-topic.

# Task
Given the information from the [relevant documents], is the company engaged in the research, development,
demonstration, and deployment of innovative power generation facilities that generate energy from nuclear
processes with minimal waste from the fuel cycle, finance such activities, or hold risk positions related
to these activities? Just answer the question with yes or no. The answer should not be longer than 3
characters and should not include punctation.

# Relevant Documents
{result.content}
"""

# temperature=0 is used to keep the answer as stable as possible between runs.
initial_openai_response = client.chat.completions.create(
    model=deployment_name,
    temperature=0,
    messages=[
        {"role": "system", "content": prompt},
    ],
)
raw_answer = initial_openai_response.choices[0].message.content
print(raw_answer)

# The model output is free text: it may be missing, differ in case, or carry
# surrounding whitespace / a trailing full stop. Normalise it to the "Yes"/"No"
# spelling used by the Dataland data point before it is compared and reused.
report_value = (raw_answer or "").strip().rstrip(".").capitalize()
if report_value not in {"Yes", "No"}:
    raise ValueError(f"Unexpected model answer {raw_answer!r}; expected 'Yes' or 'No'")

No


# 4. Vergleich der beiden Werte

In [96]:
# Show the value stored in Dataland next to the value derived from the report page.
print(f"Wert in Dataland: \t{first_datapoint.value.value}")
print(f"Wert im Bericht: \t{report_value}")

Wert in Dataland: 	Yes
Wert im Bericht: 	No


# 5. Erstellen und Abschicken eines QA Report

In [97]:
from dataland_qa.models.extended_data_point_yes_no import ExtendedDataPointYesNo
from dataland_qa.models.nuclear_and_gas_data import NuclearAndGasData
from dataland_qa.models.nuclear_and_gas_general import NuclearAndGasGeneral
from dataland_qa.models.nuclear_and_gas_general_general import NuclearAndGasGeneralGeneral
from dataland_qa.models.qa_report_data_point_extended_data_point_yes_no import QaReportDataPointExtendedDataPointYesNo
from dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict

# Value currently stored in Dataland for the section 4.26 data point.
dataland_value = first_datapoint.value.value

# Derive verdict, comment and correction from the actual comparison. Previously
# the report was always posted as rejected with a hard-coded "No" in the
# comment, regardless of what the two values were.
if dataland_value == report_value:
    qa_verdict = QaReportDataPointVerdict.QAACCEPTED
    qa_comment = f"The value in the dataset ({dataland_value}) matches the value found in the source report."
    # Nothing to correct when both values agree.
    corrected_data = ExtendedDataPointYesNo()
else:
    qa_verdict = QaReportDataPointVerdict.QAREJECTED
    qa_comment = f"The value in the dataset ({dataland_value}) is incorrect. The correct value is {report_value}"
    corrected_data = ExtendedDataPointYesNo(
        value=report_value,
        quality="Reported",
    )

# QA report covering only the section 4.26 data point.
selected_qa_report = NuclearAndGasData(
    general=NuclearAndGasGeneral(
        general=NuclearAndGasGeneralGeneral(
            nuclear_energy_related_activities_section426=QaReportDataPointExtendedDataPointYesNo(
                comment=qa_comment,
                verdict=qa_verdict,
                correctedData=corrected_data,
            )
        )
    )
)

# Post the report for the dataset identified by `data_id`; the returned meta
# information is the cell's displayed result.
api = dataland_client.eu_taxonomy_nuclear_gas_qa_api
api.post_nuclear_and_gas_data_qa_report(data_id, selected_qa_report)

QaReportMetaInformation(data_id='d67db903-575d-48c3-99c3-b7f3f8ea174f', data_type='nuclear-and-gas', qa_report_id='9bcb8023-b53a-4729-b18c-263f5ba39ba0', reporter_user_id='44d4e795-c0ee-468b-bd99-edf783a1d991', upload_time=1732023500463, active=True)