In [1]:
from dataland_backend.models.data_type_enum import DataTypeEnum

from dataland_qa_lab.dataland.data_provider import (
    get_taxonomy_aligned_capex_denominator_values_by_data,
    get_taxonomy_aligned_capex_numerator_values_by_data,
    get_taxonomy_aligned_revenue_denominator_values_by_data,
    get_taxonomy_aligned_revenue_numerator_values_by_data,
    get_taxonomy_eligible_but_not_aligned_capex_values_by_data,
    get_taxonomy_eligible_but_not_aligned_revenue_values_by_data,
    get_taxonomy_non_eligible_capex_values_by_data,
    get_taxonomy_non_eligible_revenue_values_by_data,
    get_yes_no_values_by_data,
)
from dataland_qa_lab.pages.pages_provider import get_relevant_pages_of_pdf
from dataland_qa_lab.pages.text_to_doc_intelligence import extract_text_of_pdf
from dataland_qa_lab.review.numeric_value_generator import NumericValueGenerator
from dataland_qa_lab.review.yes_no_value_generator import get_yes_no_values_from_report
from dataland_qa_lab.utils import config
from dataland_qa_lab.utils.nuclear_and_gas_data_collection import NuclearAndGasDataCollection

# Load the project configuration once and keep a handle on its Dataland API
# client; the cells below issue their API calls through `dataland_client`.
conf = config.get_config()
dataland_client = conf.dataland_client

In [2]:
# Look up every company that has a nuclear-and-gas dataset on Dataland.
company_infos = dataland_client.company_api.get_companies(data_types=[DataTypeEnum.NUCLEAR_MINUS_AND_MINUS_GAS])
company_ids = [info.company_id for info in company_infos]
print(len(company_ids))

# For each company, remember the data id of the first dataset the API returns,
# so that `data_ids[i]` lines up with `company_infos[i]`.
nuclear_and_gas_api = dataland_client.eu_taxonomy_nuclear_and_gas_api
data_ids = []
for company_id in company_ids:
    company_datasets = nuclear_and_gas_api.get_all_company_nuclear_and_gas_data(company_id=company_id)
    data_ids.append(company_datasets[0].meta_info.data_id)
print(len(data_ids))

69
69


In [3]:
# Both dictionaries are keyed by company name: one holds the Yes/No answers
# stored on Dataland, the other the answers extracted from the report by the AI.
yes_no_values_dataland = {}
extracted_yes_no_values = {}

# check yes no values (only the company at index 8 is selected here)
selected_companies = zip(data_ids[8:9], company_infos[8:9], strict=False)
for data_id, company_info in selected_companies:
    company_name = company_info.company_name
    print(company_name)

    data = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(
        data_id=data_id
    )
    data_collection = NuclearAndGasDataCollection(dataset=data.data)

    # get values on Dataland
    yes_no_values_dataland[company_name] = get_yes_no_values_by_data(data=data_collection)

    # get values from AI; a ValueError from any of the three steps skips the company
    try:
        pdf_reader = get_relevant_pages_of_pdf(data_collection)
        text_of_page = extract_text_of_pdf(pdf_reader)
        extracted_yes_no_values[company_name] = get_yes_no_values_from_report(text_of_page)
    except ValueError as e:
        # the support for multiple pages is not implemented yet
        print(f"Skipping company {company_name} due to invalid page numbers: {e}")

BPCE


In [4]:
# Compare the Yes/No answers stored on Dataland with the extracted ones,
# section by section, and report the share of sections that agree.
total_sections_yes_no = 0
matching_sections_yes_no = 0

for company, dataland_dict in yes_no_values_dataland.items():
    print("\nCompany:", company)
    if company not in extracted_yes_no_values:
        # Extraction was skipped for this company; nothing to compare.
        print(f"No extracted values found for company: {company}")
        continue

    extracted_dict = extracted_yes_no_values[company]
    for section, dataland_value in dataland_dict.items():
        # A section missing from the extraction counts as a mismatch.
        extracted_value = extracted_dict.get(section, "Not found")
        print(f"{section}: Dataland={dataland_value}, Extracted={extracted_value}")

        total_sections_yes_no += 1
        if dataland_value == extracted_value:
            matching_sections_yes_no += 1

# Guard the division: when no section was compared (e.g. every extraction was
# skipped) the ratio is reported as 0 instead of raising ZeroDivisionError.
matching_ratio = matching_sections_yes_no / total_sections_yes_no if total_sections_yes_no > 0 else 0
print(matching_ratio)


Company: BPCE
nuclear_energy_related_activities_section426: Dataland=YesNo.NO, Extracted=YesNo.NO
nuclear_energy_related_activities_section427: Dataland=YesNo.YES, Extracted=YesNo.YES
nuclear_energy_related_activities_section428: Dataland=YesNo.YES, Extracted=YesNo.YES
fossil_gas_related_activities_section429: Dataland=YesNo.YES, Extracted=YesNo.YES
fossil_gas_related_activities_section430: Dataland=YesNo.NO, Extracted=YesNo.NO
fossil_gas_related_activities_section431: Dataland=YesNo.NO, Extracted=YesNo.NO
1.0


In [8]:
# Numeric values keyed by company name: `numeric_values_dataland` maps each
# company to one entry per template, `extracted_numeric_values` maps it to a
# flat list of the values extracted from the report by the AI.
numeric_values_dataland = {}
extracted_numeric_values = {}

# Template key -> provider function returning that template's Dataland values.
dataland_value_getters = (
    ("aligned_revenue_denominator", get_taxonomy_aligned_revenue_denominator_values_by_data),
    ("aligned_capex_denominator", get_taxonomy_aligned_capex_denominator_values_by_data),
    ("aligned_revenue_numerator", get_taxonomy_aligned_revenue_numerator_values_by_data),
    ("aligned_capex_numerator", get_taxonomy_aligned_capex_numerator_values_by_data),
    ("not_aligned_revenue", get_taxonomy_eligible_but_not_aligned_revenue_values_by_data),
    ("not_aligned_capex", get_taxonomy_eligible_but_not_aligned_capex_values_by_data),
    ("non_eligible_revenue", get_taxonomy_non_eligible_revenue_values_by_data),
    ("non_eligible_capex", get_taxonomy_non_eligible_capex_values_by_data),
)

# Extractor calls in the same template order as the Dataland getters above.
numeric_extractors = (
    (NumericValueGenerator.get_taxonomy_alligned_denominator, "Revenue"),
    (NumericValueGenerator.get_taxonomy_alligned_denominator, "CapEx"),
    (NumericValueGenerator.get_taxonomy_alligned_numerator, "Revenue"),
    (NumericValueGenerator.get_taxonomy_alligned_numerator, "CapEx"),
    (NumericValueGenerator.get_taxonomy_eligible_not_alligned, "Revenue"),
    (NumericValueGenerator.get_taxonomy_eligible_not_alligned, "CapEx"),
    (NumericValueGenerator.get_taxonomy_non_eligible, "Revenue"),
    (NumericValueGenerator.get_taxonomy_non_eligible, "CapEx"),
)

# check numeric values (only the company at index 6 is selected here)
for data_id, company_info in zip(data_ids[6:7], company_infos[6:7], strict=False):
    company_name = company_info.company_name
    data = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(
        data_id=data_id
    )
    data_collection = NuclearAndGasDataCollection(dataset=data.data)

    # get values on Dataland
    company_values = numeric_values_dataland.setdefault(company_name, {})
    for template_key, get_values in dataland_value_getters:
        company_values[template_key] = get_values(data=data_collection)

    # get values from AI; a ValueError from any step skips the company
    try:
        pdf_reader = get_relevant_pages_of_pdf(data_collection)
        text_of_page = extract_text_of_pdf(pdf_reader)
        per_template_values = [extract(text_of_page, kpi) for extract, kpi in numeric_extractors]
        flat_values = []
        for template_values in per_template_values:
            flat_values.extend(template_values)
        extracted_numeric_values[company_name] = flat_values
    except ValueError as e:
        # the support for multiple pages is not implemented yet
        print(f"Skipping company {company_name} due to invalid page numbers: {e}")

In [25]:
# Compare the numeric values stored on Dataland with the extracted ones,
# position by position, and report the share of positions that agree.
total_sections_numeric = 0
matching_sections_numeric = 0

for company, dataland_dict in numeric_values_dataland.items():
    print("\nCompany:", company)
    if company not in extracted_numeric_values:
        # Extraction was skipped for this company; nothing to compare.
        print(f"No extracted values found for company: {company}")
        continue

    extracted_list = extracted_numeric_values[company]

    # Flatten the dataland_dict values to compare with the extracted_list
    flattened_dataland_values = []
    for sections in dataland_dict.values():
        if sections is None:
            # if template is not answered that means the 24 values of this template are 0
            flattened_dataland_values.extend([0] * 24)
            continue

        for section in sections.values():
            # The None check must come first: None is not a list, so testing
            # `isinstance(section, list)` first would never reach this branch
            # and None itself would end up in the flattened values.
            if section is None:
                # if section is not answered that means the 3 values of this section are 0
                # NOTE(review): assumes every unanswered section holds 3 values;
                # confirm for templates whose sections are single scalars.
                flattened_dataland_values.extend([0] * 3)
            elif isinstance(section, list):
                flattened_dataland_values.extend(section)
            else:
                flattened_dataland_values.append(section)

    # zip(strict=False) stops at the shorter sequence, so surface a length
    # mismatch instead of silently leaving values out of the ratio.
    if len(flattened_dataland_values) != len(extracted_list):
        print(
            f"Warning: {len(flattened_dataland_values)} Dataland values vs. "
            f"{len(extracted_list)} extracted values; only the common prefix is compared"
        )

    for i, (dataland_value, extracted_value) in enumerate(
        zip(flattened_dataland_values, extracted_list, strict=False)
    ):
        print(f"Section {i}: Dataland={dataland_value}, Extracted={extracted_value}")
        total_sections_numeric += 1
        if dataland_value == float(extracted_value):
            matching_sections_numeric += 1

matching_ratio = matching_sections_numeric / total_sections_numeric if total_sections_numeric > 0 else 0
print(f"Matching ratio: {matching_ratio:.2%}")


Company: Berliner Volksbank eG
Section 0: Dataland=0, Extracted=0.0
Section 1: Dataland=0, Extracted=0.0
Section 2: Dataland=0, Extracted=0.0
Section 3: Dataland=0, Extracted=0.0
Section 4: Dataland=0, Extracted=0.0
Section 5: Dataland=0, Extracted=0.0
Section 6: Dataland=0, Extracted=0.0
Section 7: Dataland=0, Extracted=0.0
Section 8: Dataland=0, Extracted=0.0
Section 9: Dataland=0, Extracted=0.0
Section 10: Dataland=0, Extracted=0.0
Section 11: Dataland=0, Extracted=0.0
Section 12: Dataland=0, Extracted=0.0
Section 13: Dataland=0, Extracted=0.0
Section 14: Dataland=0, Extracted=0.0
Section 15: Dataland=0, Extracted=0.0
Section 16: Dataland=0, Extracted=0.0
Section 17: Dataland=0, Extracted=0.0
Section 18: Dataland=0.1, Extracted=0.1
Section 19: Dataland=0.1, Extracted=0.1
Section 20: Dataland=0, Extracted=0.0
Section 21: Dataland=0.1, Extracted=0.1
Section 22: Dataland=0.1, Extracted=0.1
Section 23: Dataland=0, Extracted=0.0
Section 24: Dataland=0, Extracted=0.0
Section 25: Dataland