In [2]:
from pathlib import Path

from dataland_backend.models.data_type_enum import DataTypeEnum

from dataland_qa_lab.dataland.data_provider import get_numeric_values_by_data, get_yes_no_values_by_data
from dataland_qa_lab.pages.pages_provider import get_relevant_pages_of_pdf
from dataland_qa_lab.pages.text_to_doc_intelligence import extract_text_of_pdf
from dataland_qa_lab.review.numeric_value_generator import NumericValueGenerator
from dataland_qa_lab.review.yes_no_value_generator import extract_yes_no_template
from dataland_qa_lab.utils import config
from dataland_qa_lab.utils.nuclear_and_gas_data_collection import NuclearAndGasDataCollection

# Load the project configuration and keep a handle on the Dataland API client it exposes;
# every later cell talks to Dataland through `dataland_client`.
conf = config.get_config()
dataland_client = conf.dataland_client

In [3]:
# Query Dataland for the companies matching the nuclear-and-gas data type filter.
company_infos = dataland_client.company_api.get_companies(data_types=[DataTypeEnum.NUCLEAR_MINUS_AND_MINUS_GAS])

company_ids = [info.company_id for info in company_infos]

print(len(company_ids))


def _first_data_id(company_id):
    """Return the data ID of the first nuclear-and-gas dataset the API lists for `company_id`.

    Raises IndexError when the API returns no dataset for the company.
    """
    datasets = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_all_company_nuclear_and_gas_data(
        company_id=company_id
    )
    return datasets[0].meta_info.data_id


# One data ID per company, in the same order as `company_ids` / `company_infos`.
data_ids = [_first_data_id(company_id) for company_id in company_ids]

print(len(data_ids))

55
55


In [None]:
# Yes/no answers keyed by company name: what is stored on Dataland vs. what was extracted from the PDF.
yes_no_values_dataland = {}
extracted_yes_no_values = {}

# check yes no values
# strict=True: `data_ids` and `company_infos` are expected to pair up one-to-one. If they ever get
# out of step, raise instead of silently dropping the surplus entries.
for data_id, company_info in zip(data_ids, company_infos, strict=True):
    data = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(data_id=data_id)
    data_collection = NuclearAndGasDataCollection(dataset=data.data)

    # get values on Dataland
    yes_no_values_dataland[company_info.company_name] = get_yes_no_values_by_data(data=data_collection)

    # get values from AI
    try:
        pdf_reader = get_relevant_pages_of_pdf(data_collection)
        text_of_page = extract_text_of_pdf(pdf_reader)
        extracted_yes_no_values[company_info.company_name] = extract_yes_no_template(text_of_page)
    except ValueError as e:
        # the support for multiple pages is not implemented yet
        # A skipped company keeps its Dataland entry but gets no extracted entry.
        print(f"Skipping company {company_info.company_name} due to invalid page numbers: {e}")


['2240d847-5ae4-441f-9055-5d026b45354b', '03423310-3cfc-4119-8082-38cca404ddcf', '5754f2cd-41b0-49cb-9da6-cd780b518b26', 'f5d88afc-efc9-4f81-bbdc-2863873a1453', 'f7dc5e60-cf03-4406-8e2c-ba7ef07c7db8', '26a9d60b-2714-4b1b-89ee-9693e89b970a', 'aaa4ecbe-86fb-4e1e-a8ab-deb65dff81aa', '96bae0ae-86a9-4b59-bc3d-19bac36aceb5', 'd95728a4-0563-4ce4-ae66-4abb2b702675', '3a14b997-2ccf-4082-b124-e450efe3e0c1', '25c4b1ce-1670-4723-85d3-1e164f7469c7', '4c4a91e1-ede3-400f-96d4-8c6d686ba655', '6b15add8-d27a-4823-8613-a0b421f1bcff', 'cfa5361b-f1eb-4660-89f2-c3a29f9a0ae2', '6ace1809-0b5b-413c-948e-ccd27e7d347a', '007421d9-c6e9-4954-94d6-40bdd9e7c179', '3bde2091-8fa6-44b2-8454-6047b1025084', 'c0293b73-e9e3-486e-a770-c2210587a160', 'd27349ed-d98f-44f8-8d05-d53bc5554d73', '1ebeb8e0-5cdc-423b-8d41-4e2d8129d6bf', '56753330-0297-423c-aafc-32f19b7a57c4', '800f5ba3-4eb5-43f2-97a4-fd58d0cd80ca', '1680d38f-638f-43eb-aa2b-fc23cd155bb7', '1c4762fc-9210-44ab-ade4-accb5a03af3c', 'b2d94164-25bc-4c80-ae08-055c23b06fdf',

In [30]:
# Compare, section by section, the yes/no answers stored on Dataland with the extracted ones
# and report the share of sections where both agree.
total_sections_yes_no = 0
matching_sections_yes_no = 0

for company, dataland_dict in yes_no_values_dataland.items():
    print("\nCompany:", company)

    # Companies whose extraction was skipped have no entry in the extracted dict.
    if company not in extracted_yes_no_values:
        print(f"No extracted values found for company: {company}")
        continue

    extracted_dict = extracted_yes_no_values[company]

    for section, value in dataland_dict.items():
        # A section missing from the extraction is shown as "Not found" and counts as a mismatch.
        extracted_value = extracted_dict.get(section, "Not found")
        print(f"{section}: Dataland={value}, Extracted={extracted_value}")

        total_sections_yes_no += 1
        if value == extracted_value:
            matching_sections_yes_no += 1

# Guard the division: nothing is compared when no company has extracted values.
if total_sections_yes_no:
    matching_ratio = matching_sections_yes_no / total_sections_yes_no
    print(matching_ratio)
else:
    matching_ratio = None
    print("No yes/no sections were compared; matching ratio is undefined.")



Company: ATLAS COPCO AKTIEBOLAG
nuclear_energy_related_activities_section426: Dataland=YesNo.NO, Extracted=YesNo.NO
nuclear_energy_related_activities_section427: Dataland=YesNo.NO, Extracted=YesNo.NO
nuclear_energy_related_activities_section428: Dataland=YesNo.NO, Extracted=YesNo.NO
fossil_gas_related_activities_section429: Dataland=YesNo.NO, Extracted=YesNo.NO
fossil_gas_related_activities_section430: Dataland=YesNo.NO, Extracted=YesNo.NO
fossil_gas_related_activities_section431: Dataland=YesNo.NO, Extracted=YesNo.NO

Company: BARCLAYS BANK IRELAND PUBLIC LIMITED COMPANY
nuclear_energy_related_activities_section426: Dataland=YesNo.YES, Extracted=YesNo.YES
nuclear_energy_related_activities_section427: Dataland=YesNo.YES, Extracted=YesNo.YES
nuclear_energy_related_activities_section428: Dataland=YesNo.YES, Extracted=YesNo.YES
fossil_gas_related_activities_section429: Dataland=YesNo.YES, Extracted=YesNo.NO
fossil_gas_related_activities_section430: Dataland=YesNo.YES, Extracted=YesNo.NO


In [16]:
# Numeric values keyed by company name: what is stored on Dataland vs. what was extracted from the PDF.
numeric_values_dataland = {}
extracted_numeric_values = {}

# check numeric values
# NOTE(review): the [1:2] slices limit this run to the second company only — presumably a
# restriction left over from experimenting; confirm before relying on the resulting ratio.
# strict=True: both slices are expected to have the same length. A mismatch raises instead of
# silently dropping the surplus entries.
for data_id, company_info in zip(data_ids[1:2], company_infos[1:2], strict=True):
    data = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(data_id=data_id)
    data_collection = NuclearAndGasDataCollection(dataset=data.data)

    # get values on Dataland
    numeric_values_dataland[company_info.company_name] = get_numeric_values_by_data(data=data_collection)
    # get values from AI
    try:
        pdf_reader = get_relevant_pages_of_pdf(data_collection)
        text_of_page = extract_text_of_pdf(pdf_reader)
        # Run the four generator calls on the same page text and flatten their results,
        # in call order, into one list per company.
        extracted_numeric_values[company_info.company_name] = [
            value
            for sublist in [
                NumericValueGenerator.get_taxonomy_alligned_denominator(text_of_page),
                NumericValueGenerator.get_taxonomy_alligned_numerator(text_of_page),
                NumericValueGenerator.get_taxonomy_eligible_not_alligned(text_of_page),
                NumericValueGenerator.get_taxonomy_non_eligible(text_of_page),
            ]
            for value in sublist
        ]
    except ValueError as e:
        # the support for multiple pages is not implemented yet
        # A skipped company keeps its Dataland entry but gets no extracted entry.
        print(f"Skipping company {company_info.company_name} due to invalid page numbers: {e}")

In [19]:
# Compare the numeric values stored on Dataland with the extracted ones and report the share
# of sections where both agree.
total_sections_numeric = 0
matching_sections_numeric = 0

for company, dataland_dict in numeric_values_dataland.items():
    print("\nCompany:", company)

    # Look the company up in the *extracted* values. Checking the dict being iterated is always
    # true and would raise KeyError below for companies whose extraction was skipped.
    if company not in extracted_numeric_values:
        print(f"No extracted values found for company: {company}")
        continue

    extracted_list = extracted_numeric_values[company]
    print(len(extracted_list))

    # Sections are paired with extracted values purely by position.
    # NOTE(review): confirm that the order of the Dataland sections matches the order in which
    # the extracted values were concatenated.
    for i, (section, value) in enumerate(dataland_dict.items()):
        # Fewer extracted values than sections: report "Not found" instead of raising IndexError.
        extracted_value = extracted_list[i] if i < len(extracted_list) else "Not found"
        print(f"{section}: Dataland={value}, Extracted={extracted_value}")

        total_sections_numeric += 1
        if value == extracted_value:
            matching_sections_numeric += 1

# Guard the division: nothing is compared when no company has extracted values.
if total_sections_numeric:
    matching_ratio = matching_sections_numeric / total_sections_numeric
    print(matching_ratio)
else:
    matching_ratio = None
    print("No numeric sections were compared; matching ratio is undefined.")


Company: BARCLAYS BANK IRELAND PUBLIC LIMITED COMPANY
80
0
taxonomy_aligned_capex_denominator: Dataland=taxonomy_aligned_share_denominator_n_and_g426=NuclearAndGasEnvironmentalObjective(mitigation_and_adaptation=None, mitigation=None, adaptation=None) taxonomy_aligned_share_denominator_n_and_g427=NuclearAndGasEnvironmentalObjective(mitigation_and_adaptation=0.01, mitigation=0.01, adaptation=None) taxonomy_aligned_share_denominator_n_and_g428=NuclearAndGasEnvironmentalObjective(mitigation_and_adaptation=0.06, mitigation=0.06, adaptation=None) taxonomy_aligned_share_denominator_n_and_g429=NuclearAndGasEnvironmentalObjective(mitigation_and_adaptation=None, mitigation=None, adaptation=None) taxonomy_aligned_share_denominator_n_and_g430=NuclearAndGasEnvironmentalObjective(mitigation_and_adaptation=None, mitigation=None, adaptation=None) taxonomy_aligned_share_denominator_n_and_g431=NuclearAndGasEnvironmentalObjective(mitigation_and_adaptation=None, mitigation=None, adaptation=None) taxonom