In [1]:
"""
ESG Data Extraction using LangChain and Google Gemini

This script processes ESG reports (PDFs) and extracts structured information about:
- Environmental campaigns and activities
- Conducted environmental actions
- Planned environmental actions
- Medium/long-term environmental goals

Author: Geo
Date: 2025
"""

'\nESG Data Extraction using LangChain and Google Gemini\n\nThis script processes ESG reports (PDFs) and extracts structured information about:\n- Environmental campaigns and activities\n- Conducted environmental actions\n- Planned environmental actions\n- Medium/long-term environmental goals\n\nAuthor: Geo\nDate: 2025\n'

In [2]:
# =============================================================================
# IMPORTS
# =============================================================================

import os
import uuid
from typing import List, Optional

import numpy as np
from dotenv import load_dotenv
from pydantic import BaseModel, Field

import google.generativeai as genai
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# =============================================================================
# CONFIGURATION
# =============================================================================

# Base locations and input file name (relative paths, resolved against the
# kernel's working directory).
INPUT_DIR = "../data"
INPUT_FILE = "LFC.pdf"
OUTPUT_DIR = "../output"
CHROMA_DB_PATH = "../vectorstores"

# Create directories if they don't exist.
# At this point CHROMA_DB_PATH still names the base vector-store directory.
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(CHROMA_DB_PATH, exist_ok=True)

# Configure API: load_dotenv() runs first (presumably to populate the
# environment from a .env file), then the key is read from the environment.
# NOTE(review): os.environ.get returns None when GEMINI_API_KEY is unset and
# that None is passed straight to genai.configure — consider failing fast.
load_dotenv()
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)

# Derived paths, all keyed on the lower-cased stem of INPUT_FILE.
input_filepath = os.path.join(INPUT_DIR, INPUT_FILE)
output_filepath = os.path.join(OUTPUT_DIR, f"{os.path.splitext(INPUT_FILE)[0].lower()}_extracted_data.json")
# NOTE: this rebinds CHROMA_DB_PATH from the base directory to the per-report
# vector-store path beneath it; code after this line sees the per-report path.
CHROMA_DB_PATH = os.path.join(CHROMA_DB_PATH, f"{os.path.splitext(INPUT_FILE)[0].lower()}_report_vectorstore")


In [4]:
# =============================================================================
# PYDANTIC MODELS FOR STRUCTURED OUTPUT
# =============================================================================

class AnswerWithSources(BaseModel):
    """An answer to a specific question or extracted piece of information, with sources and reasoning."""
    # All three fields are Optional and now carry an explicit default of None,
    # matching how Optional fields are declared in the other models of this file.
    # Without a default, Pydantic v2 treats an Optional annotation as
    # "required but nullable" and rejects payloads that simply omit the key.
    answer: Optional[str] = Field(default=None, description="The extracted answer or information.")
    sources: Optional[str] = Field(default=None, description="Full direct text chunk from the context used to extract this information.")
    reasoning: Optional[str] = Field(default=None, description="Explanation of how the answer was derived from the provided sources.")


class EnvironmentalSupplyChainImpact(BaseModel):
    """Information about an organisation's environmental supply chain impact monitoring."""

    # Required Yes/No indicator, stored as plain text.
    monitors_and_measures: str = Field(
        ...,
        description="Whether the organisation monitors and measures the environmental impact of its supply chain (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    further_information: Optional[AnswerWithSources] = Field(
        None,
        description="Further information about the supply chain impact monitoring process.",
    )


class EnvironmentalSupplierAssessmentEngagement(BaseModel):
    """Information about an organisation's supplier assessment and engagement regarding environmental topics."""

    # Required Yes/No indicator, stored as plain text.
    performs_assessment: str = Field(
        ...,
        description="Whether the organisation performs supplier assessment and engagement regarding environmental topics (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    further_information: Optional[AnswerWithSources] = Field(
        None,
        description="Further information on supplier assessment and engagement.",
    )


class EnvironmentalSupplierScreening(BaseModel):
    """Information on an organisation's supplier screening process."""

    # Required Yes/No indicator, stored as plain text.
    performs_screening: str = Field(
        ...,
        description="Whether the organisation performs supplier screening using environmental criteria (Yes/No).",
    )
    # The two fields below are optional and default to None.
    percentage_screened: Optional[AnswerWithSources] = Field(
        None,
        description="The percentage of screened suppliers during the reporting period.",
    )
    further_information: Optional[AnswerWithSources] = Field(
        None,
        description="Further information on the supplier screening process.",
    )


class EnvironmentalSupplierCodeOfConduct(BaseModel):
    """Information on an organisation's supplier code of conduct."""

    # Required Yes/No indicator, stored as plain text.
    has_code_of_conduct: str = Field(
        ...,
        description="Whether the organisation has a supplier code of conduct containing environmental aspects (Yes/No).",
    )
    # The two fields below are optional and default to None.
    percentage_signed: Optional[AnswerWithSources] = Field(
        None,
        description="The percentage of suppliers that have signed the code of conduct.",
    )
    further_information: Optional[AnswerWithSources] = Field(
        None,
        description="Further information on the supplier code of conduct.",
    )


class EnvironmentalPurchasedGoods(BaseModel):
    """Information on an organisation's tracking of purchased goods."""

    # Required Yes/No indicator, stored as plain text.
    tracks_quantity: str = Field(
        ...,
        description="Whether the organisation keeps track of the quantity (tons/year) of purchased materials and goods (merchandising, food, metallic materials, plastics, other materials) (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    further_information: Optional[AnswerWithSources] = Field(
        None,
        description="Documentation and further information about tracking purchased goods.",
    )


class EnvironmentalSupplyChainData(BaseModel):
    """
    A comprehensive model for all supply chain and purchased goods data.
    """

    # Aggregate of the five supply-chain sub-models; each part is optional
    # and defaults to None.
    environmental_supply_chain_impact: Optional[EnvironmentalSupplyChainImpact] = Field(
        None, description="Information on monitoring and measuring environmental impact in the supply chain."
    )
    supplier_assessment_engagement: Optional[EnvironmentalSupplierAssessmentEngagement] = Field(
        None, description="Information on supplier assessment and engagement regarding environmental topics."
    )
    supplier_screening: Optional[EnvironmentalSupplierScreening] = Field(
        None, description="Information on supplier screening using environmental criteria."
    )
    supplier_code_of_conduct: Optional[EnvironmentalSupplierCodeOfConduct] = Field(
        None, description="Information on the supplier code of conduct with environmental aspects."
    )
    purchased_goods: Optional[EnvironmentalPurchasedGoods] = Field(
        None, description="Information on tracking the quantity of purchased materials and goods."
    )


class EnvironmentalConductedActionDetails(BaseModel):
    """Details for a specific conducted environmental action category."""

    # All three entries are required AnswerWithSources values.
    progress_achieved_for_period: AnswerWithSources = Field(
        ...,
        description="Progress achieved for the reporting period regarding this environmental topic.",
    )
    actual_targets_set_for_period: AnswerWithSources = Field(
        ...,
        description="Actual targets set for the reporting period regarding this environmental topic.",
    )
    reasons_for_achieved_progress: AnswerWithSources = Field(
        ...,
        description="Reason(s) for the achieved progress regarding this environmental topic.",
    )


class EnvironmentalConductedActionCategory(BaseModel):
    """A specific category of conducted environmental action (e.g., Total Emissions)."""

    # Both the category label and its details are required.
    category_name: str = Field(
        ...,
        description="The name of the environmental category (e.g., 'Total emissions', 'Total energy consumption').",
    )
    details: EnvironmentalConductedActionDetails = Field(
        ...,
        description="Detailed progress, targets, and reasons for this category.",
    )


class EnvironmentalConductedActionsSection(BaseModel):
    """The main section for environmental actions conducted during the reporting period."""
    # The docstring above is surfaced as the model's schema description, so the
    # duplicated word ("conducted ... conducted") was removed from it.
    # Required list of per-category entries.
    categories_data: List[EnvironmentalConductedActionCategory] = Field(
        description="A list of specific environmental categories with their respective progress, targets, and reasons."
    )


class EnvironmentalPlannedActionDetails(BaseModel):
    """Details for a specific planned environmental action category."""

    # Both entries are required AnswerWithSources values.
    planned_or_targeted_progress: AnswerWithSources = Field(
        ...,
        description="Planned or targeted progress for the upcoming period regarding this environmental topic.",
    )
    planned_actions_to_achieve_target: AnswerWithSources = Field(
        ...,
        description="Planned action(s) to achieve the target(s) for the upcoming period regarding this environmental topic.",
    )


class EnvironmentalPlannedActionCategory(BaseModel):
    """A specific category of planned environmental action (e.g., Total Emissions)."""

    # Both the category label and its details are required.
    category_name: str = Field(
        ...,
        description="The name of the planned environmental category (e.g., 'Total emissions', 'Total energy consumption').",
    )
    details: EnvironmentalPlannedActionDetails = Field(
        ...,
        description="Detailed planned progress and actions for this category.",
    )


class EnvironmentalPlannedActionsSection(BaseModel):
    """The main section for environmental actions planned for the next reporting period."""

    # Required list of per-category planned-action entries.
    categories_data: List[EnvironmentalPlannedActionCategory] = Field(
        ...,
        description="A list of specific environmental categories with their respective planned progress and actions.",
    )


class EnvironmentalGoalDetails(BaseModel):
    """Details for a specific medium/long-term goal."""

    # Every field is optional and defaults to None.

    # --- Definition and measurability ---
    definition_outlining: Optional[AnswerWithSources] = Field(
        None, description="Definition and outlining of the goal(s)."
    )
    is_measurable: Optional[str] = Field(
        None, description="Whether the goal(s) are measurable (Yes/No)."
    )
    measurable_definition: Optional[AnswerWithSources] = Field(
        None, description="If measurable, the definition of the goal(s)."
    )

    # --- Timeline ---
    year_set: Optional[AnswerWithSources] = Field(
        None, description="Which year the goal(s) were set."
    )
    year_achieve: Optional[AnswerWithSources] = Field(
        None, description="Which year the goal(s) are planned to be achieved."
    )

    # --- Communication ---
    is_communicated: Optional[str] = Field(
        None, description="Whether the goal(s) are actively communicated (Yes/No)."
    )
    communication_method: Optional[AnswerWithSources] = Field(
        None, description="How the goal(s) are communicated (e.g., report, website, events)."
    )
    communication_scope: Optional[AnswerWithSources] = Field(
        None, description="Scope of communication (e.g., page numbers, website links)."
    )


class EnvironmentalMediumLongTermGoal(BaseModel):
    """A specific category of environmental goals."""

    # Required: category label and a Yes/No indicator stored as plain text.
    category_name: str = Field(
        ...,
        description="The name of the environmental category (e.g., 'Climate Action', 'Biodiversity').",
    )
    has_set_goal: str = Field(
        ...,
        description="Whether this category has a set goal (Yes/No).",
    )
    # Optional goal details; defaults to None.
    details: Optional[EnvironmentalGoalDetails] = Field(
        None,
        description="Detailed goals, measurability, years, and communication methods for this category.",
    )


class EnvironmentalMediumLongTermGoalsSection(BaseModel):
    """The main section for medium/long-term environmental goals and pledges."""

    # Required list of per-category goal entries.
    goals_data: List[EnvironmentalMediumLongTermGoal] = Field(
        ...,
        description="A list of specific environmental categories with their respective goals and details.",
    )


class EnvironmentalCampaign(BaseModel):
    """Information extracted about a specific environmental campaign or activity."""

    # Every field is required. All are single AnswerWithSources values except
    # sdgs_addressed, which is a list of them.
    name_of_campaign_activity: AnswerWithSources = Field(
        ..., description="Name of the environmental campaign or activity."
    )
    purpose_and_goal: AnswerWithSources = Field(
        ..., description="Purpose and aspired goal of the campaign/activity."
    )
    description_and_scope: AnswerWithSources = Field(
        ..., description="Description and scope of the campaign."
    )
    qualitative_quantitative_results: AnswerWithSources = Field(
        ..., description="Qualitative and quantitative results of the campaign."
    )
    environmental_issues_addressed: AnswerWithSources = Field(
        ..., description="Environmental issues addressed by the campaign."
    )
    name_of_collaboration_partners: AnswerWithSources = Field(
        ..., description="Names of collaboration partners (e.g., fan club, local community, charity, foundation, sponsorship partner)."
    )
    number_of_potential_individuals_reached: AnswerWithSources = Field(
        ..., description="Number of potential individuals reached by the campaign."
    )
    sdgs_addressed: List[AnswerWithSources] = Field(
        ..., description="Which of the 17 Sustainable Development Goals (SDGs) have been addressed by the campaign."
    )


class EnvironmentalCampaignsList(BaseModel):
    """List of environmental campaigns extracted from the report."""

    # Required list wrapper around the individual campaign entries.
    campaigns: List[EnvironmentalCampaign] = Field(
        ...,
        description="A list of environmental campaigns and activities identified in the report.",
    )


class GHGEmissionsEntry(BaseModel):
    """Data for a single GHG emissions category (e.g., Total scope 1)."""

    # Required identifying columns.
    scope: str = Field(
        ...,
        description="The scope of the emissions (e.g., Total scope 1, Total scope 2).",
    )
    unit: str = Field(
        ...,
        description="The unit of measurement (e.g., Tons GHG).",
    )
    # Optional per-period values; each defaults to None.
    target_2024_25: Optional[AnswerWithSources] = Field(
        None,
        description="The target for 2024/25.",
    )
    year_2023_24: Optional[AnswerWithSources] = Field(
        None,
        description="Emissions for the year 2023/24.",
    )
    year_2022_23: Optional[AnswerWithSources] = Field(
        None,
        description="Emissions for the year 2022/23.",
    )
    year_2021_22: Optional[AnswerWithSources] = Field(
        None,
        description="Emissions for the year 2021/22.",
    )


class GHGEmissionsData(BaseModel):
    """Emissions data for a section like 'Greenhouse Gas Emissions' or 'Headquarters'."""

    # Required list of per-scope emission rows.
    emissions: List[GHGEmissionsEntry] = Field(
        ...,
        description="A list of emission entries for different scopes.",
    )


class GHGBaselineAndYear(BaseModel):
    """Information on the GHG baseline and reduction goals."""

    # Every field is optional and defaults to None.
    ghg_baseline_tons: Optional[AnswerWithSources] = Field(
        None,
        description="The GHG baseline in tons.",
    )
    ghg_baseline_year: Optional[AnswerWithSources] = Field(
        None,
        description="The GHG baseline year.",
    )
    planned_annual_reduction_percent: Optional[AnswerWithSources] = Field(
        None,
        description="The planned annual reduction in %.",
    )
    planned_year_to_achieve_goal: Optional[AnswerWithSources] = Field(
        None,
        description="The planned year to achieve the goal.",
    )


class OtherEmissions(BaseModel):
    """Information on other emissions."""

    # Required Yes/No indicator, stored as plain text.
    has_other_emissions: str = Field(
        ...,
        description="Whether there are any other emissions caused by the organisation (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    additional_information: Optional[AnswerWithSources] = Field(
        None,
        description="Additional information about other emissions.",
    )


class GHGEmissionsDeclaration(BaseModel):
    """Information on the declaration and verification of GHG emissions."""

    # Three required Yes/No indicators, each stored as plain text.
    self_declared: str = Field(
        ...,
        description="Whether GHG emissions are self-declared (Yes/No).",
    )
    third_party_verified_partial_scope3: str = Field(
        ...,
        description="Whether GHG emissions are third-party verified with partial scope 3 (Yes/No).",
    )
    third_party_verified_full_scope3: str = Field(
        ...,
        description="Whether GHG emissions are third-party verified including full scope 3 (Yes/No).",
    )


class CarbonEmissionsManagement(BaseModel):
    """Information on the carbon management plan."""

    # Required Yes/No indicator, stored as plain text.
    has_management_plan: str = Field(
        ...,
        description="Whether the organisation has a carbon management plan to reduce GHG emissions (Yes/No).",
    )
    # The remaining fields are optional and default to None.
    integration: Optional[AnswerWithSources] = Field(
        None,
        description="The integration of the plan.",
    )
    is_assured_by_third_party: Optional[str] = Field(
        None,
        description="Whether the plan is assured by a third party (Yes/No).",
    )
    further_information_and_documentation: Optional[AnswerWithSources] = Field(
        None,
        description="Further information and documentation on the plan.",
    )


class EmissionCalculation(BaseModel):
    """Information on the emission calculation methodology."""

    # Required Yes/No indicator, stored as plain text.
    follows_ghg_protocol: str = Field(
        ...,
        description="Whether the organisation mostly follows the GHG Protocol for the emission calculation (Yes/No).",
    )
    # The remaining fields are optional and default to None.
    model_applied: Optional[str] = Field(
        None,
        description="Which model is applied (e.g., Spend model, Product carbon footprint, Lifecycle analysis, Other).",
    )
    detailed_outline: Optional[AnswerWithSources] = Field(
        None,
        description="More detailed outline of the calculation method.",
    )


class CompensationOffsets(BaseModel):
    """Details on the types of offsets used for compensation."""

    # Three required Yes/No indicators, each stored as plain text.
    reduction_and_avoidance_projects: str = Field(
        ...,
        description="Whether offsets come from reduction and avoidance projects (Yes/No).",
    )
    short_term_capture: str = Field(
        ...,
        description="Whether short-term capture offsets are used (Yes/No).",
    )
    long_term_capture: str = Field(
        ...,
        description="Whether long-term capture offsets are used (Yes/No).",
    )


class YearlyCompensationData(BaseModel):
    """Data entry fields for a single year of GHG compensation."""

    # Every field is optional and defaults to None.
    compensated_emissions_tonnes: Optional[AnswerWithSources] = Field(
        None,
        description="Compensated emissions in tonnes.",
    )
    total_amount_paid_in: Optional[AnswerWithSources] = Field(
        None,
        description="Total amount paid in.",
    )
    ratio_offsetted_total: Optional[AnswerWithSources] = Field(
        None,
        description="Ratio of offsetted emission / total emissions.",
    )


class CompensationData(BaseModel):
    """Data entry fields for GHG compensation, structured by year."""

    # One optional YearlyCompensationData slot per reporting year; each
    # defaults to None.
    data_2023_24: Optional[YearlyCompensationData] = Field(
        None,
        description="Compensation data for the year 2023/24.",
    )
    data_2022_23: Optional[YearlyCompensationData] = Field(
        None,
        description="Compensation data for the year 2022/23.",
    )
    data_2021_22: Optional[YearlyCompensationData] = Field(
        None,
        description="Compensation data for the year 2021/22.",
    )


class CompensationForUnavoidableGHG(BaseModel):
    """Information on compensation for unavoidable GHG emissions."""

    # Required Yes/No indicator, stored as plain text.
    compensates_emissions: str = Field(
        ...,
        description="Whether the organisation compensates its unavoidable GHG emissions (Yes/No).",
    )
    # The remaining fields are optional and default to None.
    further_information_on_compensation: Optional[AnswerWithSources] = Field(
        None,
        description="Further information on compensation projects and partners.",
    )
    types_of_offsets: Optional[CompensationOffsets] = Field(
        None,
        description="The types of offsets being used.",
    )
    compensation_data: Optional[CompensationData] = Field(
        None,
        description="Data entry fields for compensation metrics.",
    )


class VehiclePropulsion(BaseModel):
    """Details for a specific vehicle propulsion technology."""

    # Required technology label.
    technology: str = Field(
        ...,
        description="The propulsion technology (e.g., Fully electric vehicles).",
    )
    # Optional figures; each defaults to None.
    number_of_vehicles: Optional[AnswerWithSources] = Field(
        None,
        description="Number of vehicles with this technology.",
    )
    percentage_of_fleet: Optional[AnswerWithSources] = Field(
        None,
        description="Percentage of the fleet with this technology.",
    )


class VehicleFleet(BaseModel):
    """Information on the organisation's vehicle fleet."""

    # Required Yes/No indicator, stored as plain text.
    owns_or_operates_fleet: str = Field(
        ...,
        description="Whether the organisation owns or operates a fleet of vehicles (Yes/No).",
    )
    # Optional list of per-technology entries; defaults to None (not an empty list).
    propulsion_technologies: Optional[List[VehiclePropulsion]] = Field(
        None,
        description="A list of propulsion technologies and their details.",
    )


class EmissionsData(BaseModel):
    """A comprehensive model for GHG emissions data: totals, headquarters, baseline, other emissions, declaration, management plan, calculation method, compensation, and vehicle fleet."""
    # The class docstring is surfaced as the model's schema description, so it
    # now describes the contents instead of the previous development-history
    # wording ("only the newly added environmental data fields").
    # Every section below is optional and defaults to None.
    total_emissions: Optional[GHGEmissionsData] = Field(
        default=None, description="Detailed information on GHG emissions."
    )
    headquarters_emissions: Optional[GHGEmissionsData] = Field(
        default=None, description="Detailed information on headquarters emissions."
    )
    ghg_baseline_and_year: Optional[GHGBaselineAndYear] = Field(
        default=None, description="Information on GHG baseline and reduction goals."
    )
    other_emissions: Optional[OtherEmissions] = Field(
        default=None, description="Information on any other emissions."
    )
    ghg_emissions_declaration: Optional[GHGEmissionsDeclaration] = Field(
        default=None, description="Information on how GHG emissions are declared and verified."
    )
    carbon_emissions_management: Optional[CarbonEmissionsManagement] = Field(
        default=None, description="Information on the carbon management plan."
    )
    emission_calculation: Optional[EmissionCalculation] = Field(
        default=None, description="Information on the methodology used for emission calculation."
    )
    compensation_for_ghg: Optional[CompensationForUnavoidableGHG] = Field(
        default=None, description="Information on compensation for unavoidable GHG emissions."
    )
    vehicle_fleet: Optional[VehicleFleet] = Field(
        default=None, description="Information on the organisation's vehicle fleet."
    )


class TravelDataEntry(BaseModel):
    """Represents a single row of travel data for a specific method and year."""

    # NOTE(review): the docstring says "and year", but one row carries a target
    # plus three yearly values — consider rewording it.

    # Required identifying columns.
    method: str = Field(
        ...,
        description="The travel method (e.g., By road, By plane).",
    )
    unit: str = Field(
        ...,
        description="The unit of measurement (e.g., km, miles).",
    )
    # Optional per-period values as plain strings; each defaults to None.
    target_2024_25: Optional[str] = Field(
        None,
        description="The target value for 2024/25.",
    )
    year_2023_24: Optional[str] = Field(
        None,
        description="The value for the year 2023/24.",
    )
    year_2022_23: Optional[str] = Field(
        None,
        description="The value for the year 2022/23.",
    )
    year_2021_22: Optional[str] = Field(
        None,
        description="The value for the year 2021/22.",
    )


class CommuterTravelData(BaseModel):
    """Data for commuter travel, separated by emission intensity."""
    # Both lists stay required; they now carry Field descriptions like the other
    # models in this file, so the generated schema describes each bucket.
    emission_intense: List[TravelDataEntry] = Field(
        description="Commuter travel entries for emission-intense travel methods."
    )
    emission_low: List[TravelDataEntry] = Field(
        description="Commuter travel entries for low-emission travel methods."
    )


class FanSpectatorTravelMeasurementDetails(BaseModel):
    """Details for a specific fan and spectator travel measurement."""

    # Required Yes/No indicator, stored as plain text.
    is_tracked: str = Field(
        ...,
        description="Whether this type of measurement is being tracked (Yes/No).",
    )
    # Optional free-text detail; defaults to None.
    details: Optional[str] = Field(
        None,
        description="Further information if the measurement is being tracked.",
    )


class FanSpectatorTravelInfo(BaseModel):
    """Information on fan and spectator travel measurement."""

    # Required Yes/No indicator, stored as plain text.
    has_measurement: str = Field(
        ...,
        description="Whether any measurements are in place (Yes/No).",
    )
    # Optional tracking details per measurement type; each defaults to None.
    mode_of_transport: Optional[FanSpectatorTravelMeasurementDetails] = Field(
        None,
        description="Tracking information for 'Mode of transport'.",
    )
    distance_travelled: Optional[FanSpectatorTravelMeasurementDetails] = Field(
        None,
        description="Tracking information for 'Distance travelled'.",
    )
    other_data: Optional[FanSpectatorTravelMeasurementDetails] = Field(
        None,
        description="Tracking information for 'Other Data'.",
    )


class FanSpectatorPublicTransport(BaseModel):
    """Information on promotion of public transport for fans and spectators."""

    # Required Yes/No indicator, stored as plain text.
    provides_free_public_transport: str = Field(
        ...,
        description="Whether free public transport is provided (Yes/No).",
    )
    # Optional free-text overview; defaults to None.
    overview: Optional[str] = Field(
        None,
        description="Further information about the promotion.",
    )


class Travel(BaseModel):
    """A comprehensive model for all travel-related data."""

    # The only optional field; defaults to None.
    travel_calculation_method: Optional[str] = Field(
        None, description="How travel activities are calculated (e.g., km, miles, hours)."
    )
    # Everything below is required.
    business_travel: List[TravelDataEntry] = Field(
        ..., description="Business travel data by road, public transport, and plane."
    )
    headquarters_travel: List[TravelDataEntry] = Field(
        ..., description="Headquarters travel data by road, public transport, and plane."
    )
    commuter_travel: CommuterTravelData = Field(
        ..., description="Commuter travel data, split by emission intensity."
    )
    fan_spectator_travel: FanSpectatorTravelInfo = Field(
        ..., description="Information on measurement of fan and spectator travel."
    )
    fan_spectator_promotion: FanSpectatorPublicTransport = Field(
        ..., description="Information on promotion of public transport for fans."
    )


class PollutionMeasurements(BaseModel):
    """Information on pollution measurements."""

    # Required.
    # NOTE(review): this Yes/No value is typed as AnswerWithSources, whereas the
    # other Yes/No indicators in this file are plain str — confirm this is intended.
    is_measured: AnswerWithSources = Field(
        ...,
        description="Whether this type of pollution is being measured (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    overview: Optional[AnswerWithSources] = Field(
        None,
        description="Further information on the measurements.",
    )


class PollutionReductionActions(BaseModel):
    """Information on pollution reduction actions."""

    # Single optional field; defaults to None.
    overview: Optional[AnswerWithSources] = Field(
        None, description="Overview of actions in place to reduce this type of pollution."
    )


class AirPollution(BaseModel):
    """Details on air pollution measurements and reduction."""
    # Both fields stay required; descriptions were added so the generated schema
    # names the pollution type, as the other models in this file do.
    measurements: PollutionMeasurements = Field(
        description="Whether air pollution is measured, with further information on the measurements."
    )
    reduction: PollutionReductionActions = Field(
        description="Overview of actions in place to reduce air pollution."
    )


class NoisePollution(BaseModel):
    """Details on noise pollution measurements and reduction."""
    # Both fields stay required; descriptions were added so the generated schema
    # names the pollution type, as the other models in this file do.
    measurements: PollutionMeasurements = Field(
        description="Whether noise pollution is measured, with further information on the measurements."
    )
    reduction: PollutionReductionActions = Field(
        description="Overview of actions in place to reduce noise pollution."
    )


class LightPollution(BaseModel):
    """Details on light pollution measurements and reduction."""
    # Both fields stay required; descriptions were added so the generated schema
    # names the pollution type, as the other models in this file do.
    measurements: PollutionMeasurements = Field(
        description="Whether light pollution is measured, with further information on the measurements."
    )
    reduction: PollutionReductionActions = Field(
        description="Overview of actions in place to reduce light pollution."
    )


class OtherPollution(BaseModel):
    """Details on other types of pollution measurements and reduction."""
    # Both fields stay required; descriptions were added so the generated schema
    # names the pollution type, as the other models in this file do.
    measurements: PollutionMeasurements = Field(
        description="Whether other types of pollution are measured, with further information on the measurements."
    )
    reduction: PollutionReductionActions = Field(
        description="Overview of actions in place to reduce other types of pollution."
    )


class Pollution(BaseModel):
    """Comprehensive extracted information on various types of pollution."""
    # All four sections stay required; descriptions were added for consistency
    # with the other aggregate models in this file.
    air_pollution: AirPollution = Field(
        description="Air pollution measurements and reduction actions."
    )
    noise_pollution: NoisePollution = Field(
        description="Noise pollution measurements and reduction actions."
    )
    light_pollution: LightPollution = Field(
        description="Light pollution measurements and reduction actions."
    )
    other_pollution: OtherPollution = Field(
        description="Measurements and reduction actions for other types of pollution."
    )


class ClimateOversight(BaseModel):
    """Information about climate oversight within the organization."""

    # Both fields are optional and default to None.
    highest_management_level: Optional[AnswerWithSources] = Field(
        None, description="The highest management level with direct responsibility for climate change."
    )
    direct_responsibility: Optional[AnswerWithSources] = Field(
        None, description="Description of the direct responsibility for climate change."
    )


class ImpactResponsibilityDefinition(BaseModel):
    """Information on how the organization defines its climate impact and responsibility."""

    # Required Yes/No indicator, stored as plain text.
    is_defined: str = Field(
        ...,
        description="Whether the organization's impact and responsibility on climate change are defined (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    addressing_topics_risks: Optional[AnswerWithSources] = Field(
        None,
        description="How the organization addresses climate change-related topics and risks.",
    )


class ClimateRelatedRisks(BaseModel):
    """Information on climate-related risks."""

    # Required Yes/No indicator, stored as plain text.
    expects_risks: str = Field(
        ...,
        description="Whether the organization expects any climate-related risks (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    risk_details: Optional[AnswerWithSources] = Field(
        None,
        description="Details on which risks are expected and why.",
    )


class ClimateRiskMitigation(BaseModel):
    """Information on climate risk mitigation."""

    # Required Yes/No indicator, stored as plain text.
    has_reported_risks: str = Field(
        ...,
        description="Whether the organization has reported any climate-related risks (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    mitigation_plan: Optional[AnswerWithSources] = Field(
        None,
        description="How the organization addresses and plans to mitigate such risks.",
    )


class StrategyToReduceClimateImpacts(BaseModel):
    """Information on the strategy to reduce climate impacts."""

    # Required Yes/No indicator, stored as plain text.
    has_long_term_strategy: str = Field(
        ...,
        description="Whether the organization has a dedicated long-term strategy to reduce its climate impact (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    further_information: Optional[AnswerWithSources] = Field(
        None,
        description="Further information about the long-term strategy.",
    )


class ClimateChange(BaseModel):
    """A comprehensive model for all climate and environmental management data."""

    # Aggregate of the five climate sub-models; each part is optional and
    # defaults to None.
    climate_oversight: Optional[ClimateOversight] = Field(
        None, description="Information on climate oversight within the organization."
    )
    impact_responsibility_definition: Optional[ImpactResponsibilityDefinition] = Field(
        None, description="Information on how the organization defines its climate impact and responsibility."
    )
    climate_related_risks: Optional[ClimateRelatedRisks] = Field(
        None, description="Information on climate-related risks."
    )
    climate_risk_mitigation: Optional[ClimateRiskMitigation] = Field(
        None, description="Information on climate risk mitigation."
    )
    strategy_to_reduce_climate_impacts: Optional[StrategyToReduceClimateImpacts] = Field(
        None, description="Information on the strategy to reduce climate impacts."
    )


class BiodiversityAddress(BaseModel):
    """Information on whether the organisation actively addresses biodiversity."""

    # Required Yes/No indicator, stored as plain text.
    addresses_biodiversity: str = Field(
        ...,
        description="Whether the organisation actively addresses biodiversity and takes action (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    further_information: Optional[AnswerWithSources] = Field(
        None,
        description="Further information on how the organisation addresses biodiversity.",
    )


class BiodiverseAreasOperations(BaseModel):
    """Information on whether the organisation operates in protected or biodiverse areas."""

    # Required Yes/No indicator, stored as plain text.
    operates_in_biodiverse_areas: str = Field(
        ...,
        description="Whether the organisation operates in protected areas or areas of high biodiversity value (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    further_information: Optional[AnswerWithSources] = Field(
        None,
        description="Further information on operations in biodiverse areas.",
    )


class BiodiversityImpact(BaseModel):
    """Information on the impact of the organisation's activities on biodiversity."""

    # Required Yes/No indicator, stored as plain text.
    has_impact_on_biodiversity: str = Field(
        ...,
        description="Whether the organisation's activities, products, and services have an impact on biodiversity (Yes/No).",
    )
    # Optional supporting detail; defaults to None.
    explanation_and_improvement: Optional[AnswerWithSources] = Field(
        None,
        description="Explanation of the impact and how the organisation improves it.",
    )


class SoilSealingEntry(BaseModel):
    """A single entry for a real estate area and its sealed soil percentage."""

    # Both fields are optional, so an entry may carry only one of the two values.
    area: Optional[AnswerWithSources] = Field(
        description="The name or description of the real estate area (e.g., 'Headquarters').",
        default=None,
    )
    soil_sealing_percentage: Optional[AnswerWithSources] = Field(
        description="The percentage of sealed soil for the specified area.",
        default=None,
    )


class SoilSealing(BaseModel):
    """Information on soil sealing for real estate owned by the organisation."""

    # Mandatory list (no default); each item is one SoilSealingEntry.
    sealed_areas: List[SoilSealingEntry] = Field(
        ...,
        description="A list of real estate areas and their corresponding sealed soil percentages.",
    )


class AnimalWelfare(BaseModel):
    """Information on the organisation's use of market power to influence animal welfare."""

    # Mandatory free-text answer (the description asks for Yes/No).
    influences_animal_welfare: str = Field(
        ...,
        description="Whether the organisation uses its market power to influence animal welfare in the supply chain (Yes/No).",
    )
    # Optional supporting detail; None when nothing is supplied.
    actions_and_initiatives: Optional[AnswerWithSources] = Field(
        description="Further information on actions, initiatives, and other engagements regarding animal welfare.",
        default=None,
    )


class Biodiversity(BaseModel):
    """
    A comprehensive model for all environmental data from the provided forms.
    """

    # Aggregates the five biodiversity sub-sections defined above. Every
    # sub-section is optional and defaults to None.
    # NOTE(review): the docstring says "environmental data" although every field
    # here is biodiversity-related. It is part of the generated schema, so it is
    # left untouched here; confirm whether the wording should be narrowed.
    biodiversity_address: Optional[BiodiversityAddress] = Field(
        description="Information on the organisation's active efforts regarding biodiversity.",
        default=None,
    )
    biodiverse_areas_operations: Optional[BiodiverseAreasOperations] = Field(
        description="Information on operations in protected or high-biodiversity areas.",
        default=None,
    )
    biodiversity_impact: Optional[BiodiversityImpact] = Field(
        description="Information on the impact of activities, products, and services on biodiversity.",
        default=None,
    )
    soil_sealing: Optional[SoilSealing] = Field(
        description="Information on sealed soil for owned real estate.",
        default=None,
    )
    animal_welfare: Optional[AnimalWelfare] = Field(
        description="Information on the organisation's influence on animal welfare.",
        default=None,
    )


class EnergyConsumptionEntry(BaseModel):
    """Information on energy systems and consumption for a specific location."""

    # Every field declares default=None explicitly. Under Pydantic v2 an
    # Optional[...] annotation alone only makes a field nullable, not optional,
    # so without the default a missing key would fail validation even though the
    # sibling `area` field could be omitted.
    # Metric values are kept as strings, exactly as they are supplied.
    # NOTE(review): field names end in `_kwh` while the descriptions say
    # "kWh/h" - confirm the intended unit.
    area: Optional[AnswerWithSources] = Field(
        default=None,
        description="The name or description of the location (e.g., 'Headquarters', 'Building B')."
    )
    electricity_consumption_kwh: Optional[str] = Field(
        default=None,
        description="Total electricity consumption in kWh/h."
    )
    purchased_electricity_kwh: Optional[str] = Field(
        default=None,
        description="Amount of purchased electricity in kWh/h."
    )
    purchased_renewable_electricity_kwh: Optional[str] = Field(
        default=None,
        description="Amount of purchased renewable electricity in kWh/h."
    )
    renewable_electricity_percentage: Optional[str] = Field(
        default=None,
        description="Percentage of total electricity consumption that is renewable."
    )
    self_generated_renewable_electricity_kwh: Optional[str] = Field(
        default=None,
        description="Amount of self-generated renewable electricity in kWh/h."
    )
    low_carbon_electricity_kwh: Optional[str] = Field(
        default=None,
        description="Amount of low-carbon electricity in kWh/h."
    )
    fuel_consumption_kwh: Optional[str] = Field(
        default=None,
        description="Fuel consumption (natural gas and heating oil) in kWh/h."
    )
    district_heating_kwh: Optional[str] = Field(
        default=None,
        description="District heating, cooling and steam consumption in kWh/h."
    )
    other_consumption_kwh: Optional[str] = Field(
        default=None,
        description="Other types of energy consumption in kWh/h."
    )


class Energy(BaseModel):
    """Information on the organisation's energy systems and consumption."""

    # The two Yes/No answers and the two lists are mandatory (no default);
    # the "further info" fields are optional and default to None.
    has_renewable_systems: str = Field(
        ...,
        description="Whether the organisation owns renewable energy systems (Yes/No).",
    )
    renewable_systems_further_info: Optional[AnswerWithSources] = Field(
        description="Further information on the renewable energy systems in place.",
        default=None,
    )
    systems_in_place: List[str] = Field(
        ...,
        description="List of types of renewable energy systems in place.",
    )
    has_strategy: str = Field(
        ...,
        description="Whether the organisation has a strategy in place to increase the percentage of renewable energy usage (Yes/No).",
    )
    strategy_further_info: Optional[AnswerWithSources] = Field(
        description="Corresponding information on the renewable energy strategy.",
        default=None,
    )
    # One EnergyConsumptionEntry per location.
    energy_by_location: List[EnergyConsumptionEntry] = Field(
        ...,
        description="A list of energy consumption data by location.",
    )


class LightingEntry(BaseModel):
    """Information on the types of lighting used for a specific location."""

    # Every field declares default=None explicitly. Under Pydantic v2 an
    # Optional[...] annotation alone only makes a field nullable, not optional,
    # so without the default a missing percentage would fail validation even
    # though the sibling `area` field could be omitted.
    # Percentages are kept as strings, exactly as they are supplied.
    area: Optional[AnswerWithSources] = Field(
        default=None,
        description="The name or description of the location (e.g., 'Headquarters', 'Office 1')."
    )
    led_lighting_percentage: Optional[str] = Field(
        default=None,
        description="Percentage of LED lighting used."
    )
    energy_saving_lighting_percentage: Optional[str] = Field(
        default=None,
        description="Percentage of energy saving lighting used."
    )
    conventional_lighting_percentage: Optional[str] = Field(
        default=None,
        description="Percentage of conventional lighting used."
    )
    other_lighting_percentage: Optional[str] = Field(
        default=None,
        description="Percentage of other lighting used."
    )


class Lighting(BaseModel):
    """Information on the types of lighting used by the organisation."""

    # Mandatory list (no default); one LightingEntry per location.
    lighting_by_location: List[LightingEntry] = Field(
        ...,
        description="A list of lighting data by location.",
    )


class WaterConsumptionEntry(BaseModel):
    """Information on water consumption for a specific location."""

    # Every field declares default=None explicitly. Under Pydantic v2 an
    # Optional[...] annotation alone only makes a field nullable, not optional,
    # so without the default a missing volume would fail validation even though
    # the sibling `area` field could be omitted.
    # Volumes are kept as strings, exactly as they are supplied.
    area: Optional[AnswerWithSources] = Field(
        default=None,
        description="The name or description of the location (e.g., 'Headquarters', 'Building B')."
    )
    purchased_water_m3: Optional[str] = Field(
        default=None,
        description="Volume of purchased water in M³."
    )
    rain_or_collected_water_m3: Optional[str] = Field(
        default=None,
        description="Volume of rainwater or other collected water in M³."
    )
    total_water_consumption_m3: Optional[str] = Field(
        default=None,
        description="Total water consumption in M³."
    )


class Water(BaseModel):
    """Information on the organisation's water consumption."""

    # Mandatory list (no default); one WaterConsumptionEntry per location.
    water_by_location: List[WaterConsumptionEntry] = Field(
        ...,
        description="A list of water consumption data by location.",
    )


class PaperConsumptionEntry(BaseModel):
    """Information on paper consumption for a specific location."""

    # Every field declares default=None explicitly. Under Pydantic v2 an
    # Optional[...] annotation alone only makes a field nullable, not optional,
    # so without the default a missing amount would fail validation even though
    # the sibling `area` field could be omitted.
    # Amounts are kept as strings, exactly as they are supplied.
    area: Optional[AnswerWithSources] = Field(
        default=None,
        description="The name or description of the location (e.g., 'Headquarters', 'Office 1')."
    )
    recycled_paper_tons: Optional[str] = Field(
        default=None,
        description="Amount of recycled paper used in Tons."
    )
    other_paper_tons: Optional[str] = Field(
        default=None,
        description="Amount of other paper used in Tons."
    )
    total_paper_tons: Optional[str] = Field(
        default=None,
        description="Total paper usage in Tons."
    )


class Paper(BaseModel):
    """Information on the organisation's paper consumption."""

    # Mandatory list (no default); one PaperConsumptionEntry per location.
    paper_by_location: List[PaperConsumptionEntry] = Field(
        ...,
        description="A list of paper consumption data by location.",
    )


class WasteGenerationEntry(BaseModel):
    """Information on waste generation for a specific location."""

    # Every field declares default=None explicitly. Under Pydantic v2 an
    # Optional[...] annotation alone only makes a field nullable, not optional,
    # so without the default a missing waste category would fail validation
    # even though the sibling `area` field could be omitted.
    # Amounts are kept as strings, exactly as they are supplied.
    area: Optional[AnswerWithSources] = Field(
        default=None,
        description="The name or description of the location (e.g., 'Headquarters', 'Organization B')."
    )
    recyclable_waste: Optional[str] = Field(
        default=None,
        description="Amount of recyclable waste generated in Tons."
    )
    food_waste: Optional[str] = Field(
        default=None,
        description="Amount of food waste generated in Tons."
    )
    hazardous_waste: Optional[str] = Field(
        default=None,
        description="Amount of hazardous waste generated in Tons."
    )
    residual_waste: Optional[str] = Field(
        default=None,
        description="Amount of residual waste generated in Tons."
    )
    other_waste: Optional[str] = Field(
        default=None,
        description="Amount of other waste generated in Tons."
    )
    total_waste: Optional[str] = Field(
        default=None,
        description="Total waste generated in Tons."
    )


class WasteGeneration(BaseModel):
    """Information on the organisation's waste generation."""

    # Mandatory list (no default); one WasteGenerationEntry per location.
    waste_by_location: List[WasteGenerationEntry] = Field(
        ...,
        description="A list of waste generation data by location.",
    )


class WasteManagement(BaseModel):
    """Information on the organisation's waste management."""

    # Fields typed `str` / `List[str]` are mandatory (no default). Every
    # Optional[...] field defaults to None and may be omitted.
    tracks_diversion_rate: str = Field(
        description="Whether the organisation tracks its waste diversion rate (Yes/No)."
    )
    diversion_rate_percentage: Optional[AnswerWithSources] = Field(
        default=None,
        description="The most recent waste diversion rate in percentage."
    )
    diversion_methods: Optional[AnswerWithSources] = Field(
        default=None,
        description="A breakdown of the methods used for diversion."
    )
    has_documentation: str = Field(
        description="Whether the organisation has waste documentation in place (Yes/No)."
    )
    documentation_further_info: Optional[AnswerWithSources] = Field(
        default=None,
        description="Further information on the waste documentation."
    )
    reduction_reuse_recovery: str = Field(
        description="Whether the organisation has actions for reduction, reuse, recovery, or material reconversion (Yes/No)."
    )
    internal_waste_differentiation: str = Field(
        description="Whether there is internal differentiation and disposal of waste (Yes/No)."
    )
    material_consumption_reduction: str = Field(
        description="Whether there is a reduction of material consumption through process optimisation (Yes/No)."
    )
    centralisation_of_purchasing: str = Field(
        description="Whether there is a centralisation of the purchasing function (Yes/No)."
    )
    # default=None is explicit here: under Pydantic v2 an Optional[...]
    # annotation alone keeps the field required, which would make this the only
    # optional-typed field in the class that cannot be omitted.
    other_actions: Optional[str] = Field(
        default=None,
        description="Whether there are other waste management actions (Yes/No)."
    )
    other_actions_further_info: Optional[AnswerWithSources] = Field(
        default=None,
        description="Further information on other waste management actions."
    )
    uses_single_use_plastic: List[str] = Field(
        description="Whether the organisation uses single-use plastic in specific areas."
    )


class GroundCare(BaseModel):
    """Information on ground care and lawn management."""

    # Layout: each mandatory free-text Yes/No answer (`*_used`, `has_*`,
    # `operates_*`) is followed - except for the first one - by an optional
    # `*_further_info` field that defaults to None.
    operates_ground_care_facilities: str = Field(
        ...,
        description="Whether the organisation operates facilities with necessary ground care (Yes/No).",
    )

    # --- Lawn water recycling ---
    has_lawn_water_recycling: str = Field(
        ...,
        description="Whether the organisation has a lawn water recycling system (Yes/No).",
    )
    lawn_water_recycling_further_info: Optional[AnswerWithSources] = Field(
        description="Further information on the lawn water recycling system.",
        default=None,
    )

    # --- Fertilisers ---
    fertiliser_conventional_used: str = Field(
        ...,
        description="Whether conventional/chemical fertilisers are used (Yes/No).",
    )
    fertiliser_conventional_further_info: Optional[AnswerWithSources] = Field(
        description="Further information on the use of conventional fertilisers.",
        default=None,
    )
    fertiliser_organic_used: str = Field(
        ...,
        description="Whether organic fertilisers are used (Yes/No).",
    )
    fertiliser_organic_further_info: Optional[AnswerWithSources] = Field(
        description="Further information on the use of organic fertilisers.",
        default=None,
    )
    fertiliser_other_used: str = Field(
        ...,
        description="Whether other fertilisers are used (Yes/No).",
    )
    fertiliser_other_further_info: Optional[AnswerWithSources] = Field(
        description="Further information on the use of other fertilisers.",
        default=None,
    )

    # --- Ground care machinery ---
    machinery_conventional_used: str = Field(
        ...,
        description="Whether conventional (Diesel, etc.) ground care machinery is used (Yes/No).",
    )
    machinery_conventional_further_info: Optional[AnswerWithSources] = Field(
        description="Further information on the use of conventional machinery.",
        default=None,
    )
    machinery_electric_used: str = Field(
        ...,
        description="Whether electric ground care machinery is used (Yes/No).",
    )
    machinery_electric_further_info: Optional[AnswerWithSources] = Field(
        description="Further information on the use of electric machinery.",
        default=None,
    )
    machinery_other_used: str = Field(
        ...,
        description="Whether other propulsion technology is used (Yes/No).",
    )
    machinery_other_further_info: Optional[AnswerWithSources] = Field(
        description="Further information on the use of other machinery.",
        default=None,
    )


class FoodOfferEntry(BaseModel):
    """Information about food offerings for a specific area."""

    # Optional area label; None when not supplied.
    area: Optional[AnswerWithSources] = Field(
        description="The name or description of the area where food is offered (e.g., 'Headquarters', 'Cafeteria').",
        default=None,
    )
    # Mandatory free-text answer (the description asks for Yes/No).
    offers_food: str = Field(
        ...,
        description="Whether the organisation offers/sells food to employees and/or customers in this area (Yes/No).",
    )


class PlantBasedOfferEntry(BaseModel):
    """Information on plant-based food options for a specific area."""

    # Optional area label; None when not supplied.
    area: Optional[AnswerWithSources] = Field(
        description="The name or description of the area where plant-based/low-carbon options are offered.",
        default=None,
    )
    # Mandatory free-text answer (the description asks for Yes/No).
    offers_plant_based: str = Field(
        ...,
        description="Whether the organisation offers plant-based/low-carbon food alternatives in this area (Yes/No).",
    )

class SaltSugarReductionEntry(BaseModel):
    """Information on reduction concepts for salt and sugar in food offerings for a specific area."""

    # Optional area label; None when not supplied.
    area: Optional[AnswerWithSources] = Field(
        description="The name or description of the area where the reduction concept is applied.",
        default=None,
    )
    # Mandatory free-text answer (the description asks for Yes/No).
    has_reduction_concept: str = Field(
        ...,
        description="Whether the organisation plans or has a reduction concept for salt and sugar in this area (Yes/No).",
    )


class Food(BaseModel):
    """Information on food sourcing, offerings, and waste management."""

    # --- Supplier monitoring ---
    # Mandatory free-text answer plus optional detail.
    has_monitoring_system: str = Field(
        ...,
        description="Whether a monitoring system for quality assurance is in place for food and nutrition suppliers (Yes/No).",
    )
    monitoring_system_further_info: Optional[AnswerWithSources] = Field(
        description="Further information on the monitoring system.",
        default=None,
    )

    # --- Per-area offerings ---
    # Three mandatory lists (no default), each holding one entry per area.
    food_offer_areas: List[FoodOfferEntry] = Field(
        ...,
        description="List of areas where food is offered, with a Yes/No indicator.",
    )
    offers_plant_based: List[PlantBasedOfferEntry] = Field(
        ...,
        description="List of areas where plant-based/low-carbon food is offered, with a Yes/No indicator.",
    )
    has_reduced_salt_sugar: List[SaltSugarReductionEntry] = Field(
        ...,
        description="List of areas with a reduction concept for salt and sugar in food offerings, with a Yes/No indicator.",
    )

    # --- Surplus handling ---
    manages_surpluses: str = Field(
        ...,
        description="Whether food surpluses are managed effectively to minimize waste (Yes/No).",
    )
    surplus_management_further_info: Optional[AnswerWithSources] = Field(
        description="Further information on associations that collect surpluses and/or internal processes to reduce production.",
        default=None,
    )


class SensorSystemEntry(BaseModel):
    """Information about a sensor-controlled building management system for a specific facility."""

    # Optional facility label; None when not supplied.
    area: Optional[AnswerWithSources] = Field(
        description="The name or description of the facility (e.g., 'Headquarters', 'Building B').",
        default=None,
    )
    # Mandatory free-text answer (the description asks for Yes/No).
    has_sensor_system: str = Field(
        ...,
        description="Whether this facility has a sensor-controlled building management system (Yes/No).",
    )
    # Optional supporting detail; None when nothing is supplied.
    further_information: Optional[AnswerWithSources] = Field(
        description="Further information on the sensor-controlled building management system for this facility.",
        default=None,
    )


class Facilities(BaseModel):
    """Information on sustainable facility management."""

    # Mandatory free-text answer (the description asks for Yes/No).
    has_sustainable_initiatives: str = Field(
        ...,
        description="Whether the organisation has sustainable office/building initiatives (Yes/No).",
    )
    # Optional; a single AnswerWithSources value despite the "_list" suffix.
    sustainable_initiatives_list: Optional[AnswerWithSources] = Field(
        description="A list of the initiatives and for which offices/buildings/business units.",
        default=None,
    )
    # Mandatory list (no default); one SensorSystemEntry per facility.
    sensor_systems: List[SensorSystemEntry] = Field(
        ...,
        description="A list of sensor-controlled building management systems for different facilities.",
    )


class OtherResourcesMaterials(BaseModel):
    """Information on other resources and materials."""

    # Single optional field; None when nothing is supplied.
    has_other_resources: Optional[AnswerWithSources] = Field(
        description="Further information on other resources and/or materials which are material for the organisation and its operations.",
        default=None,
    )


class Resources(BaseModel):
    """A comprehensive model for all quantitative environmental data from the provided forms."""

    # Container for the six consumption/waste sub-models. Every sub-section is
    # optional and defaults to None.
    energy: Optional[Energy] = Field(
        description="Information on energy systems and consumption.",
        default=None,
    )
    lighting: Optional[Lighting] = Field(
        description="Information on lighting usage.",
        default=None,
    )
    water: Optional[Water] = Field(
        description="Information on water consumption.",
        default=None,
    )
    paper: Optional[Paper] = Field(
        description="Information on paper consumption.",
        default=None,
    )
    waste_generation: Optional[WasteGeneration] = Field(
        description="Information on waste generation.",
        default=None,
    )
    waste_management: Optional[WasteManagement] = Field(
        description="Information on waste management.",
        default=None,
    )


class ResourceManagement(BaseModel):
    """A comprehensive model for all qualitative environmental management and strategy data."""

    # Container for the four management sub-models. Every sub-section is
    # optional and defaults to None.
    ground_care: Optional[GroundCare] = Field(
        description="Information on ground care.",
        default=None,
    )
    food: Optional[Food] = Field(
        description="Information on food offerings and sourcing.",
        default=None,
    )
    facilities: Optional[Facilities] = Field(
        description="Information on sustainable facility management.",
        default=None,
    )
    other_resources: Optional[OtherResourcesMaterials] = Field(
        description="Information on other materials and resources.",
        default=None,
    )


class ESGReportExtractedInfo(BaseModel):
    """Comprehensive extracted information from an ESG report."""

    # Top-level container with one optional slot per extraction section. Every
    # slot defaults to None, so an instance can be built with any subset filled.

    # --- Supply chain, actions, goals and campaigns ---
    env_supply_chain: Optional[EnvironmentalSupplyChainData] = Field(
        description="Detailed information on the environmental supply chain and purchased goods tracking.",
        default=None,
    )
    env_actions_conducted: Optional[EnvironmentalConductedActionsSection] = Field(
        description="Detailed information on environmental actions conducted during the reporting period.",
        default=None,
    )
    env_actions_planned: Optional[EnvironmentalPlannedActionsSection] = Field(
        description="Detailed information on environmental actions planned for the next reporting period.",
        default=None,
    )
    env_medium_long_term_goals: Optional[EnvironmentalMediumLongTermGoalsSection] = Field(
        description="Detailed information on medium/long-term environmental goals, pledges, and claims.",
        default=None,
    )
    env_campaigns: Optional[EnvironmentalCampaignsList] = Field(
        description="Detailed information on environmental campaigns and activities identified in the report.",
        default=None,
    )

    # --- Emissions, travel, pollution and climate ---
    ghg_emissions: Optional[EmissionsData] = Field(
        description="Detailed information on GHG emissions, headquarters emissions, and other environmental data.",
        default=None,
    )
    travel: Optional[Travel] = Field(
        description="Detailed information on travel-related data.",
        default=None,
    )
    pollution: Optional[Pollution] = Field(
        description="Detailed information on air, noise, light, and other pollution.",
        default=None,
    )
    climate_change: Optional[ClimateChange] = Field(
        description="Detailed information on climate change oversight, risks, and strategies.",
        default=None,
    )

    # --- Biodiversity and resources ---
    biodiversity: Optional[Biodiversity] = Field(
        description="Detailed information on biodiversity-related efforts and impacts.",
        default=None,
    )
    resources: Optional[Resources] = Field(
        description="Detailed information on resource management and sustainability efforts.",
        default=None,
    )
    resource_management: Optional[ResourceManagement] = Field(
        description="Detailed information on resource management practices.",
        default=None,
    )


In [5]:
# =============================================================================
# EMBEDDING FUNCTIONS
# =============================================================================

def get_gemini_embedding_function(model: str = "models/embedding-001"):
    """Build a callable that embeds one string with a Gemini embedding model.

    Args:
        model: Model identifier forwarded to ``genai.embed_content``. The default
            keeps the model this notebook has always used; pass another id to
            switch models without editing this function.

    Returns:
        A function ``embed_text(text, task_type="SEMANTIC_SIMILARITY")`` that
        returns the embedding as a NumPy array, or ``None`` if the call fails.
    """
    def embed_text(text: str, task_type: str = "SEMANTIC_SIMILARITY") -> Optional[np.ndarray]:
        """Embed a single string; on any error, print it and return None."""
        try:
            response = genai.embed_content(
                model=model,
                content=text,
                task_type=task_type
            )
            return np.array(response['embedding'])
        except Exception as e:
            # Deliberately best-effort: the failure is reported on stdout and
            # signalled to the caller through a None return instead of raising.
            print(f"Error embedding text with Gemini: {e}")
            return None
    return embed_text


class CustomGeminiEmbeddings(Embeddings):
    """
    A LangChain Embeddings class wrapper for the Gemini embedding function.
    This allows the Gemini embedder to be used with LangChain's vector stores.

    When the wrapped function returns ``None`` (its failure signal), a warning is
    printed and a zero vector of length ``embedding_dim`` is used instead, so a
    single failed call does not abort the whole batch.
    """
    def __init__(self, gemini_embed_func, embedding_dim: int = 768):
        """
        Args:
            gemini_embed_func: Callable ``(text, task_type=...)`` returning an
                object with ``.tolist()`` (e.g. a NumPy array) or ``None`` on failure.
            embedding_dim: Length of the zero vector used when embedding fails.
                Defaults to 768; it should match the dimension of the model in use.
        """
        self.gemini_embed_func = gemini_embed_func
        self.embedding_dim = embedding_dim

    def _embed_or_zero(self, text: str, task_type: str, label: str) -> List[float]:
        """Embed `text`; fall back to a zero vector (with a warning naming `label`) on failure."""
        embedding = self.gemini_embed_func(text, task_type=task_type)
        if embedding is not None:
            return embedding.tolist()
        print("Warning: Embedding failed for " + label + ". Using a zero vector.")
        return [0.0] * self.embedding_dim

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Embeds a list of documents for storage in a vector store.
        Uses "RETRIEVAL_DOCUMENT" task type for optimal retrieval performance.

        Texts are embedded one call at a time, and the result has one vector per
        input text in the same order.
        """
        return [
            self._embed_or_zero(text, "RETRIEVAL_DOCUMENT", "a document")
            for text in texts
        ]

    def embed_query(self, text: str) -> List[float]:
        """
        Embeds a single query string for similarity search.
        Uses "RETRIEVAL_QUERY" task type.
        """
        return self._embed_or_zero(text, "RETRIEVAL_QUERY", "the query")


In [6]:
# =============================================================================
# VECTOR STORE FUNCTIONS
# =============================================================================

def create_vectorstore(chunks: List[Document], embedding_function_instance: Embeddings, vectorstore_path: str):
    """
    Builds a Chroma vector store from document chunks, de-duplicated by content.

    Each chunk gets a deterministic UUIDv5 derived from its ``page_content``, so
    chunks with identical text share an id and only the first one is kept.

    Args:
        chunks: Document chunks to index.
        embedding_function_instance: Embeddings implementation handed to Chroma.
        vectorstore_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The Chroma vector store returned by ``Chroma.from_documents``.
    """
    print(f"Attempting to create/load vector store at: {vectorstore_path}")

    # Map id -> first chunk with that content. A dict keeps insertion order, so
    # ids and chunks stay aligned position by position. Collecting the ids in a
    # set and passing list(set) would hand them to Chroma in arbitrary order and
    # attach ids to the wrong documents.
    chunk_by_id = {}
    for chunk in chunks:
        doc_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        chunk_by_id.setdefault(doc_id, chunk)

    unique_ids = list(chunk_by_id.keys())
    unique_chunks = list(chunk_by_id.values())

    print(f"Found {len(unique_chunks)} unique chunks out of {len(chunks)} total.")

    # Create or load the Chroma database
    vectorstore = Chroma.from_documents(
        documents=unique_chunks,
        ids=unique_ids,
        embedding=embedding_function_instance,
        persist_directory=vectorstore_path
    )

    print(f"Vector store created/loaded and persisted successfully at: {vectorstore_path}")
    return vectorstore


def format_docs(docs):
    """Concatenate the text of the given documents into one RAG context string.

    Each document contributes its ``page_content`` unchanged; consecutive
    documents are separated by a blank line.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [7]:
# =============================================================================
# DOCUMENT PROCESSING
# =============================================================================

def load_and_split_documents(file_path: str, chunk_size: int = 1500, chunk_overlap: int = 200) -> List[Document]:
    """Load a PDF and split its pages into overlapping text chunks.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.
        chunk_size: Target maximum chunk length, measured with ``len``
            (characters). Defaults to 1500.
        chunk_overlap: Number of characters shared between consecutive chunks.
            Defaults to 200.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    print(f"Loading document: {file_path}")

    loader = PyPDFLoader(file_path)
    pages = loader.load()

    # Split on paragraph breaks first, then line breaks, then spaces.
    # NOTE(review): no "" fallback separator is listed, so a run of text without
    # spaces presumably cannot be cut below chunk_size - confirm this is intended.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " "]
    )

    chunks = text_splitter.split_documents(pages)
    print(f"Document split into {len(chunks)} chunks")

    return chunks


In [8]:
# =============================================================================
# RAG SETUP AND QUERIES
# =============================================================================

def setup_rag_chains(vectorstore, llm):
    """Set up RAG chains for different extraction tasks.

    Every chain has the same shape: the incoming question is used both as the
    retrieval query (top 5 chunks by similarity, joined by ``format_docs``) and
    as the ``{question}`` slot of a shared prompt; the prompt is then sent to
    ``llm`` configured to return one specific Pydantic schema.

    Args:
        vectorstore: Vector store exposing ``as_retriever``.
        llm: Chat model exposing ``with_structured_output``.

    Returns:
        A 12-tuple of chains, in this fixed order: supply chain, conducted
        actions, planned actions, medium/long-term goals, campaigns, GHG
        emissions, travel, pollution, climate change, biodiversity, resources,
        resource management.
    """
    # Create retriever with optimized parameters
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5}
    )

    # Define optimized RAG prompt template
    rag_prompt_template = """You are an expert ESG (Environmental, Social, Governance) data extraction specialist with deep knowledge of sustainability reporting standards, environmental metrics, and corporate responsibility frameworks.

      ## TASK OVERVIEW:
      Your primary objective is to extract precise, structured information from ESG reports and sustainability documents. You must analyze the provided context thoroughly and extract relevant data according to the specified Pydantic model structure.

      ## EXTRACTION PRINCIPLES:
      1. **ACCURACY FIRST**: Base all responses strictly on the provided context. Never invent, assume, or extrapolate information not present in the source material.

      2. **COMPREHENSIVE ANALYSIS**: 
         - Examine ALL provided context chunks thoroughly
         - Look for information across different sections, pages, and document parts
         - Consider synonyms, alternative terminology, and related concepts
         - Connect information that may be scattered across multiple paragraphs

      3. **DIRECT EVIDENCE REQUIREMENT**:
         - For each extracted piece of information, provide the EXACT source text that supports your answer
         - Use direct quotes whenever possible, preserving original wording and formatting
         - When paraphrasing is necessary, stay as close to the original language as possible

      4. **STRUCTURED OUTPUT COMPLIANCE**:
         - Follow the Pydantic model structure exactly as specified
         - Ensure all required fields are populated appropriately
         - Use "Information not available in the provided context" for missing data rather than leaving fields empty

      5. **NUMERICAL DATA PRECISION**:
         - Extract exact figures, percentages, dates, and metrics when available
         - Preserve units of measurement, currency symbols, and time periods
         - Note any baseline years, comparison periods, or contextual qualifiers

      6. **REASONING TRANSPARENCY**:
         - Clearly explain how each answer was derived from the source material
         - Show logical connections between questions and relevant context sections
         - Highlight any assumptions or interpretations made during extraction

      ## CONTEXT ANALYSIS GUIDELINES:
      - **Multiple Perspectives**: Consider how information might be presented in different formats (tables, charts, narrative text, bullet points)
      - **Cross-References**: Look for information that spans multiple sections or references other parts of the document
      - **Implicit Information**: Extract information that might be implied or stated indirectly
      - **Temporal Context**: Pay attention to reporting periods, historical comparisons, and future projections

      ## QUALITY ASSURANCE:
      - **Consistency Check**: Ensure extracted information aligns logically across all fields
      - **Completeness Review**: Verify that all available relevant information has been captured
      - **Accuracy Validation**: Double-check that source quotes exactly match the original text

      ## HANDLING MISSING INFORMATION:
      If specific information is not available in the context:
      - State explicitly: "This information is not available in the provided context"
      - Do not guess, estimate, or use general knowledge to fill gaps
      - Provide reasoning for why the information might be missing or where it might typically be found

      ---

      ## DOCUMENT CONTEXT:
      {context}

      ---

      ## EXTRACTION REQUEST:
      {question}

      ---

      ## RESPONSE INSTRUCTIONS:
      Analyze the above context thoroughly and extract the requested information following the Pydantic model structure. Ensure every piece of extracted data is supported by direct evidence from the provided context."""
    rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)

    def build_chain(output_schema):
        """Assemble retrieval -> prompt -> structured LLM output for one schema."""
        # NOTE(review): `strict=True` is forwarded unchanged; confirm that the
        # chat model wrapper in use accepts this keyword.
        return (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | rag_prompt
            | llm.with_structured_output(output_schema, strict=True)
        )

    # One schema per extraction task. The order defines the order of the
    # returned tuple, which callers unpack positionally.
    output_schemas = (
        EnvironmentalSupplyChainData,
        EnvironmentalConductedActionsSection,
        EnvironmentalPlannedActionsSection,
        EnvironmentalMediumLongTermGoalsSection,
        EnvironmentalCampaignsList,
        EmissionsData,
        Travel,
        Pollution,
        ClimateChange,
        Biodiversity,
        Resources,
        ResourceManagement,
    )

    return tuple(build_chain(schema) for schema in output_schemas)

def get_extraction_questions():
   """Define optimized questions for different extraction tasks."""

   env_supply_chain_question = """
      Comprehensively analyze the ESG report to extract detailed, structured information about the organization's environmental approach to supply chain management and purchased goods tracking during the current reporting period.

      ## SUPPLY CHAIN ENVIRONMENTAL IMPACT MONITORING

      **1. Supply Chain Environmental Impact Assessment:**
      - Does the organization actively monitor, measure, and assess the environmental impact of its supply chain operations? (Answer: Yes/No/Partially)
      - What specific environmental impacts are monitored (e.g., carbon emissions, water usage, waste generation, energy consumption, biodiversity impact)?
      - What methodologies, frameworks, or standards are used for measurement (e.g., Scope 3 emissions calculation, life cycle assessment, carbon footprint analysis)?
      - What are the key findings or metrics from supply chain environmental impact assessments?
      - Which supply chain stages or tiers are included in environmental monitoring (Tier 1, Tier 2, upstream, downstream)?

      ## SUPPLIER ENVIRONMENTAL ENGAGEMENT & ASSESSMENT

      **2. Supplier Environmental Assessment Programs:**
      - Does the organization conduct formal environmental assessments or evaluations of suppliers? (Answer: Yes/No/Partially)
      - What specific environmental criteria are assessed (e.g., environmental management systems, certifications, compliance, performance metrics)?
      - What assessment methods are used (questionnaires, audits, third-party evaluations, self-assessments, on-site visits)?
      - How frequently are supplier environmental assessments conducted (annually, bi-annually, upon onboarding)?
      - What are the consequences or follow-up actions for suppliers with poor environmental performance?

      **3. Supplier Environmental Engagement & Collaboration:**
      - What specific engagement activities does the organization undertake with suppliers on environmental topics?
      - Are there supplier training programs, workshops, or capacity-building initiatives on environmental matters?
      - Does the organization collaborate with suppliers on environmental improvement projects or innovations?
      - Are there supplier recognition programs or incentives for environmental performance excellence?
      - What support or resources does the organization provide to help suppliers improve their environmental performance?

      ## SUPPLIER SCREENING & SELECTION

      **4. Environmental Supplier Screening Process:**
      - Does the organization implement environmental criteria in supplier screening and selection processes? (Answer: Yes/No/Partially)
      - What percentage of new suppliers were screened using environmental criteria during the reporting period? (Extract exact percentage)
      - What percentage of existing suppliers were re-screened or re-evaluated using environmental criteria? (Extract exact percentage)
      - What specific environmental screening criteria are applied (e.g., ISO 14001 certification, carbon disclosure, waste management practices, regulatory compliance)?
      - What is the process for suppliers who do not meet environmental screening criteria?
      - Are there any environmental prerequisites or mandatory requirements for supplier qualification?

      ## SUPPLIER CODE OF CONDUCT & CONTRACTUAL REQUIREMENTS

      **5. Supplier Environmental Code of Conduct:**
      - Does the organization maintain a supplier code of conduct that explicitly includes environmental requirements and expectations? (Answer: Yes/No/Partially)
      - What percentage of suppliers have formally signed or acknowledged the environmental code of conduct during the reporting period? (Extract exact percentage)
      - What percentage of total procurement spend is covered by suppliers who have signed the code of conduct?
      - What specific environmental topics are covered in the code of conduct (e.g., emissions reduction, waste management, resource efficiency, biodiversity protection)?
      - What are the enforcement mechanisms and consequences for code of conduct violations?
      - How frequently is the environmental code of conduct updated or revised?

      **6. Contractual Environmental Requirements:**
      - Are environmental clauses, terms, or requirements integrated into supplier contracts and agreements?
      - What percentage of procurement contracts include specific environmental performance requirements?
      - Are there contractual penalties or incentives tied to environmental performance?

      ## PURCHASED GOODS & MATERIALS TRACKING

      **7. Purchased Goods Quantity Tracking:**
      - Does the organization systematically track and record the quantity (in tons/year or other units) of purchased materials and goods? (Answer: Yes/No/Partially)
      - What categories of purchased goods are tracked (e.g., merchandising products, food and beverages, metallic materials, plastics, packaging materials, textiles, electronics, chemicals, construction materials)?

      **8. Detailed Material Categories & Quantities:**
      For each category where tracking exists, extract:
      - **Category Name**: Specific type of material or good
      - **Quantity Tracked**: Annual volumes in tons/year or other specified units
      - **Tracking Methodology**: How quantities are measured and recorded
      - **Data Quality**: Completeness and reliability of tracking data
      - **Reporting Period**: Time period covered by the tracking data

      **9. Purchased Goods Environmental Impact:**
      - Does the organization assess or calculate the environmental impact of purchased goods and materials?
      - Are there specific environmental metrics tracked for purchased goods (e.g., embodied carbon, water footprint, recyclability)?
      - What documentation, databases, or systems are used for tracking purchased goods?
      - Are there any certifications or standards applied to purchased materials (e.g., FSC, EPEAT, ENERGY STAR, organic certifications)?

      ## SUPPLY CHAIN ENVIRONMENTAL INITIATIVES

      **10. Supply Chain Environmental Programs:**
      - What specific environmental initiatives or programs are implemented across the supply chain?
      - Are there collaborative projects with suppliers focused on environmental improvements?
      - What investments or resources are allocated to supply chain environmental improvements?
      - Are there any supply chain environmental targets or commitments for the next reporting period?

      ## DATA SOURCES & VERIFICATION

      **11. Documentation & Evidence:**
      - What specific reports, systems, or documentation support the supply chain environmental information?
      - Are there third-party verifications or audits of supply chain environmental data?
      - What are the data collection challenges or limitations mentioned?

      **CRITICAL EXTRACTION GUIDELINES:**
      - **Quantitative Precision**: Extract exact percentages, quantities, and numerical data with units and time periods
      - **Yes/No Clarity**: Provide definitive answers where possible, use "Partially" only when explicitly supported by mixed evidence
      - **Source Attribution**: Reference specific sections, pages, or documents where information is found
      - **Missing Information Protocol**: For each question where no information is available, explicitly state: "This information is not available in the provided context"
      - **Comprehensive Coverage**: Look for information across different report sections including sustainability reports, supply chain sections, procurement policies, and appendices
      - **Terminology Variations**: Consider alternative terms like "value chain," "sourcing," "procurement," "vendor management," "third-party suppliers"
      - **Temporal Context**: Note whether information refers to the current reporting period, historical data, or future commitments
      """
    
   env_conducted_actions_question = """
      Analyze the ESG report and extract detailed information about CONDUCTED environmental actions during the current reporting period.
      
      For each of these environmental categories, identify:
      - Total emissions (including Scope 1, 2, and 3 emissions, carbon footprint, GHG emissions)
      - Total business travel (including team travel, staff travel, transportation)
      - Total energy consumption (including electricity, gas, fuel, renewable energy usage)
      - Total water consumption (including water usage, conservation efforts)
      - Total paper consumption (including digital transformation efforts, paperless initiatives)
      - Total waste generation (including waste reduction, recycling, circular economy efforts)
      
      For EACH category found, extract:
      1. PROGRESS ACHIEVED: What specific measurable progress was made? Include quantitative metrics (percentages, absolute numbers, comparisons to baselines or previous years).
      2. ACTUAL TARGETS: What were the specific targets or goals set for this reporting period? Include numerical targets and timeframes.
      3. REASONS FOR PROGRESS: What specific actions, initiatives, or factors led to the achieved progress? Include implementation details and methodologies.
      
      Important guidelines:
      - Look for synonyms and related terms (e.g., "carbon footprint" for emissions, "energy efficiency" for energy consumption)
      - Extract exact figures, percentages, and metrics when available
      - Include both absolute and relative improvements
      - Note any certifications, standards, or methodologies mentioned
      - If no information is found for a category, clearly state "No conducted actions reported for this category"
      """
      
   env_planned_actions_question = """
      Analyze the ESG report and extract detailed information about PLANNED environmental actions specifically for the NEXT REPORTING PERIOD ONLY (typically the upcoming 12 months or next fiscal year).
      
      IMPORTANT: Focus STRICTLY on SHORT-TERM plans for the immediate next reporting cycle. EXCLUDE any medium-term (2-5 years) or long-term (5+ years) strategic goals, commitments, or targets - these will be handled separately.
      
      For each of these environmental categories, identify ONLY next-period plans:
      - Total emissions (including immediate reduction initiatives, next-year offset plans, short-term efficiency measures)
      - Total business travel (including next-period travel policies, immediate transport changes)
      - Total energy consumption (including next-year efficiency projects, immediate renewable energy implementations)
      - Total water consumption (including next-period conservation measures, immediate efficiency upgrades)
      - Total paper consumption (including next-year digitization phases, immediate reduction initiatives)
      - Total waste generation (including next-period reduction programs, immediate recycling improvements)
      
      For EACH category found, extract ONLY:
      1. PLANNED/TARGETED PROGRESS: What specific measurable targets are set for the NEXT reporting period only? Include numerical goals and percentages specifically for the upcoming 12 months or next fiscal year.
      2. PLANNED ACTIONS: What specific initiatives, strategies, or actions will be implemented during the NEXT reporting period? Include immediate implementation plans, next-period deliverables, and short-term milestones.
      
      Identifying next-period actions - look for language such as:
      - "Next year we will..."
      - "In the coming year..."
      - "For [next fiscal year]..."
      - "In the next 12 months..."
      - "Our immediate plans include..."
      - "Next reporting period targets..."
      - "Upcoming initiatives..."
      - "Short-term actions..."
      
      EXCLUDE language indicating longer timeframes:
      - "By 2030/2035/2040..." (medium/long-term)
      - "Our 5-year plan..." (medium-term)
      - "Strategic roadmap to 2030..." (long-term)
      - "Long-term commitment to..." (strategic goals)
      - "Over the next decade..." (long-term)
      
      Important guidelines:
      - Be very strict about timeframe - only include actions explicitly planned for the next reporting cycle
      - Focus on operational and tactical plans rather than strategic commitments
      - Look for specific next-period budgets, resources, or project phases
      - Include immediate partnerships or collaborations starting in the next period
      - Capture next-period compliance requirements or regulatory implementations
      - If no next-period specific plans are mentioned for a category, clearly state "No planned actions reported for the next reporting period for this category"
      """

   env_medium_long_term_goals_question = """
      Analyze the ESG report comprehensively and extract detailed information about MEDIUM-TERM (2-5 years) and LONG-TERM (5+ years) environmental goals, targets, pledges, commitments, and strategic objectives.
      
      TIMEFRAME FOCUS: Look specifically for goals with target dates beyond the next immediate reporting period, including:
      - Medium-term goals: 2-5 years out (e.g., by 2027-2030)
      - Long-term goals: 5+ years out (e.g., by 2030, 2035, 2040, 2050 and beyond)
      - Strategic commitments with extended timelines
      - Multi-year roadmaps and transformation plans

      For EACH of the following environmental categories, thoroughly analyze and determine if medium/long-term goals have been established. The category names MUST be exactly as follows:
      
      **CORE ENVIRONMENTAL CATEGORIES:**
      - Emissions (carbon neutrality, net-zero, GHG reduction targets, scope 1/2/3 commitments)
      - Travel (sustainable transport, business travel reduction, logistics optimization)
      - Energy (renewable energy transition, energy efficiency, power sourcing commitments)
      - Water (conservation targets, usage reduction, water stewardship goals)
      - Paper & Digital Transformation (paperless operations, digitization commitments)
      - Waste (zero-waste goals, circular economy targets, waste reduction pledges)
      - Food & Nutrition (sustainable sourcing, food waste reduction, nutritional impact goals)
      - Climate Change (climate resilience, adaptation strategies, climate risk mitigation)
      - Biodiversity (nature protection, biodiversity restoration, ecosystem preservation)
      - Supply Chain (supplier engagement, value chain decarbonization, sustainable procurement)
      - Other (identify any additional environmental goal areas not listed above)
      
      For EACH category where medium/long-term goals are identified, extract the following comprehensive information:
      
      1. **GOAL DEFINITION & SCOPE**: 
         - Clear, detailed definition of the goal(s) and what they encompass
         - Scope of application (global, regional, specific operations)
         - Any sub-goals or phased approaches within the main commitment
      
      2. **MEASURABILITY ASSESSMENT**:
         - Is the goal quantitatively measurable? (Answer: 'Yes', 'No', or 'Partially')
         - Are there specific KPIs, metrics, or indicators defined?
      
      3. **MEASURABLE SPECIFICATIONS**:
         - Exact numerical targets, percentages, or reduction amounts
         - Baseline years and reference points for measurements
         - Units of measurement and calculation methodologies
         - Any interim milestones or checkpoints specified
      
      4. **TEMPORAL FRAMEWORK**:
         - Year the goal(s) were first established or announced
         - Target achievement year(s) or deadline(s)
         - Any revised timelines or updated commitments
         - Interim milestone years if specified
      
      5. **STRATEGIC ALIGNMENT**:
         - Connection to broader corporate strategy or sustainability framework
         - Alignment with external standards (SBTi, Paris Agreement, SDGs, etc.)
         - Integration with industry initiatives or sector commitments
      
      6. **COMMUNICATION & TRANSPARENCY**:
         - Do they actively communicate these goals? (Answer: 'Yes', 'No', or 'Limited')
         - Communication channels and methods used (annual reports, sustainability reports, website, press releases, investor communications)
         - Frequency of progress reporting and updates
      
      7. **COMMUNICATION EVIDENCE**:
         - Specific document sections, page numbers, or website locations where goals are communicated
         - Exact quotes or statements about the commitments
         - Public announcements, press releases, or external communications
      
      8. **IMPLEMENTATION APPROACH**:
         - Strategic initiatives, programs, or investments planned to achieve goals
         - Governance structures or responsible parties for goal delivery
         - Resource allocation or budget commitments mentioned
         - Partnerships or external collaborations for goal achievement
      
      **IDENTIFICATION KEYWORDS** - Look for language such as:
      - "By 2030/2035/2040/2050..."
      - "Long-term commitment to..."
      - "Strategic goal to achieve..."
      - "Our roadmap to..."
      - "Multi-year target..."
      - "Decade commitment..."
      - "Net-zero by..."
      - "Carbon neutral by..."
      - "Sustainable future vision..."
      
      **CRITICAL GUIDELINES:**
      - **Evidence-Based Only**: Extract information strictly supported by the provided text - do not infer or assume details not explicitly stated
      - **Temporal Precision**: Be exact about timeframes and distinguish between medium-term vs. long-term commitments
      - **Goal Hierarchy**: Identify both overarching strategic goals and specific sub-targets within categories
      - **Missing Information Protocol**: If no medium/long-term goals exist for a category, explicitly state: "No medium or long-term goals identified for this category in the provided context"
      - **Incomplete Data Handling**: If partial information is available, extract what exists and note what information is missing
      - **Verification Focus**: Look for evidence of goal validation, third-party verification, or external endorsement of commitments
      """
    
   env_campaigns_question = """
      Comprehensively analyze the ESG report to identify and extract detailed information about ALL environmental campaigns, initiatives, activities, and programs mentioned.
      
      Look for various types of environmental activities including:
      - Awareness campaigns and educational programs
      - Community engagement initiatives
      - Sustainability programs and projects
      - Environmental partnerships and collaborations
      - Green initiatives and eco-friendly activities
      - Climate action programs
      - Conservation efforts and biodiversity projects
      - Stakeholder engagement on environmental issues
      - Environmental advocacy and policy initiatives
      - Green technology implementations
      
      For EACH campaign/activity identified, extract comprehensive details:
      
      1. NAME: The specific name or title of the campaign/activity. If no formal name exists, create a descriptive title based on the content.
      
      2. PURPOSE & GOAL: The intended objectives, mission, and aspirational outcomes. What environmental impact is the campaign trying to achieve?
      
      3. DESCRIPTION & SCOPE: Detailed description of what the campaign involves, its scope, duration, geographic reach, and key components or phases.
      
      4. RESULTS: Both qualitative and quantitative outcomes, impacts, and achievements. Include metrics, participation numbers, environmental benefits, and success indicators.
      
      5. ENVIRONMENTAL ISSUES: Specific environmental challenges, problems, or areas addressed (e.g., climate change, biodiversity loss, pollution, resource depletion).
      
      6. COLLABORATION PARTNERS: Names of all partners involved, including:
         - Fan clubs and supporter groups
         - Local communities and neighborhoods
         - Charities and non-profit organizations
         - Foundations and environmental groups
         - Sponsorship and business partners
         - Government agencies and institutions
         - Academic institutions and schools
      
      7. REACH: Number of people, communities, or stakeholders potentially reached or engaged by the campaign.
      
      8. SDGs ADDRESSED: Which of the 17 UN Sustainable Development Goals are specifically addressed, with explanations of how the campaign contributes to each goal.
      
      Important guidelines:
      - Don't miss smaller initiatives or brief mentions of environmental activities
      - Look for both formal programs and informal/ad-hoc environmental efforts
      - Extract information even if incomplete - capture what's available
      - Consider the full lifecycle of campaigns from planning to evaluation
      - Include both internal (employee-focused) and external (community-focused) campaigns
      - Look for environmental messaging, communications, and awareness efforts
      """

   ghg_emissions_question = """
      Extract comprehensive GHG emissions data from the ESG report using the standardized reporting format. For each data point, provide the exact value and source section. If information is unavailable, state "Not available".

      ## EMISSIONS DATA (Extract for years 2021/22, 2022/23, 2023/24, and Target 2024/25)

      **Total and Scope-specific Emissions (in Tons GHG):**
      - Total scope 1 emissions
      - Total scope 2 emissions  
      - Total scope 3 emissions
      - Total emissions (sum of all scopes)
      - Headquarters scope 1, 2, 3, and total emissions (if reported separately)

      **GHG Baseline and Targets:**
      - GHG baseline value in tons
      - GHG baseline year
      - Planned annual reduction percentage (e.g., 5%)
      - Planned year to achieve reduction goal (e.g., 2030)

      **Other Emissions:**
      - Are there other emissions beyond standard scopes: Yes/No
      - Details of other emissions (methane, nitrous oxide, high GWP gases)

      ## EMISSIONS DECLARATION & VERIFICATION

      **Declaration Method:**
      - Self-declared GHG emissions: Yes/No
      - Third-party verified GHG emissions with partial scope 3: Yes/No
      - Third-party verified GHG emissions including full scope 3: Yes/No

      **Carbon Management:**
      - Organization has carbon management (reduction) plan: Yes/No
      - Plan integration approach (select from dropdown options if available)
      - Is plan assured by third party: Yes/No
      - Plan overview and documentation details

      ## CALCULATION METHODOLOGY

      **GHG Protocol Compliance:**
      - Organization mostly follows GHG Protocol: Yes/No
      - Calculation model applied:
      * Spend model: Yes/No
      * Product carbon footprint: Yes/No
      * Lifecycle analysis: Yes/No
      * Other: Yes/No (specify details)
      - Detailed methodology outline

      ## COMPENSATION FOR UNAVOIDABLE EMISSIONS

      **Compensation Approach:**
      - Compensates unavoidable GHG emissions (e.g., via emissions trading): Yes/No

      **Offset Types Used:**
      - Offsets from reduction and avoidance projects (renewable energy, biomass waste, cookstoves, transport): Yes/No
      - Short-term capture (nature-based solutions like forestry, peat land, mangroves): Yes/No
      - Long-term capture (Carbon Capture & Storage, Biomass with Carbon Capture): Yes/No

      **Compensation Metrics (for 2021/22, 2022/23, 2023/24):**
      - Compensated emissions in tonnes
      - Total amount paid
      - Cost per tonne emissions
      - Ratio of offsetted emission to total emissions

      ## VEHICLE FLEET ANALYSIS

      **Fleet Ownership:**
      - Own/operate a fleet of vehicles: Yes/No

      **Fleet Composition by Propulsion Technology:**
      - Fully electric vehicles (EVs): [Number] vehicles, [%] of fleet
      - Petrol/gasoline engine: [Number] vehicles, [%] of fleet
      - Diesel engine: [Number] vehicles, [%] of fleet
      - Hybrid propulsion (HEV): [Number] vehicles, [%] of fleet
      - Plug-in-Hybrid (PHEV): [Number] vehicles, [%] of fleet
      - CNG-engine: [Number] vehicles, [%] of fleet
      - Fuel-cell drive: [Number] vehicles, [%] of fleet
      - Other propulsion: [Number] vehicles, [%] of fleet

      **EXTRACTION GUIDELINES:**
      - Extract exact numerical values from form fields or data tables
      - Preserve original units (Tons GHG, tonnes, etc.) as stated
      - For Yes/No questions, provide definitive answers based on selections
      - Look for both organizational total and headquarters-specific data
      - Note if information appears in forms, tables, or narrative text
      - Identify dropdown selections and checkbox responses
      - Cross-reference data across multiple sections for completeness

      **OUTPUT FORMAT:**
      Structure response as:
      Section: [Data Category]
      • Data point: [Exact Value] [Units] (Source: [Form section/Table])
      • Use "Not available" for unfilled fields or missing information
      • Include reporting year context for each data point
      • Note calculation methodologies and verification levels

      **PRIORITY SEARCH LOCATIONS:**
      - GHG emissions data tables and forms
      - Carbon management and strategy sections  
      - Verification and assurance statements
      - Vehicle fleet and transportation data
      - Offset and compensation project details
      - Baseline and target-setting documentation
      """
   
   travel_question = """
      Extract comprehensive travel data from the ESG report using the standardized reporting format. For each data point, provide the exact value and source section. If information is unavailable, state "Not available".

      ## TRAVEL DATA (Extract for years 2021/22, 2022/23, 2023/24, and Target 2024/25)

      **Travel Metrics Configuration:**
      - Travel activity calculation method: km/miles/hours
      - Selected measurement unit from dropdown

      **Business Travel - Organization (in km):**
      - By road travel distance
      - By public transport travel distance
      - By plane travel distance
      - Total business travel distance

      **Headquarters Travel (in km):**
      - By road travel distance
      - By public transport travel distance
      - By plane travel distance
      - Total headquarters travel distance

      ## COMMUTER TRAVEL DATA

      **Emission Intense Methods (in km):**
      - By road travel distance
      - By public transport travel distance
      - By plane travel distance
      - Total emission intense commuter travel

      **Emission Low Methods (in km):**
      - By foot travel distance
      - By bicycle travel distance
      - Total emission low commuter travel

      ## FAN & SPECTATOR TRAVEL

      **Measurement Capability:**
      - Measurements in place to capture fan & spectator travel: Yes/No
      - Mode of transport data collection: Yes/No
      - Distance travelled data collection: Yes/No
      - Other data collection methods: Yes/No
      - Overview of measurement methodology and data sources

      **Public Transport Promotion:**
      - Provides fans/spectators with free public transport: Yes/No
      - Overview of public transport arrangements and initiatives

      **EXTRACTION GUIDELINES:**
      - Extract exact numerical values from form input fields and data tables
      - Preserve original units (km, miles, hours) as selected in dropdown
      - For Yes/No questions, provide definitive answers based on radio button selections
      - Look for both organizational total and headquarters-specific travel data
      - Note dropdown selections, checkbox responses, and text field entries
      - Identify auto-calculated fields vs. manually entered values
      - Cross-reference travel data across multiple reporting sections

      **OUTPUT FORMAT:**
      Structure response as:
      Section: [Travel Category]
      - Data point: [Exact Value] [Units] (Source: [Form section/Field])
      - Use "Not available" for unfilled fields or missing information
      - Include reporting year context for each data point
      - Note calculation methodologies and intensity formulations

      **PRIORITY SEARCH LOCATIONS:**
      - Travel metrics and configuration sections
      - Business travel data tables and input forms
      - Commuter travel methodology statements
      - Fan and spectator travel measurement systems
      - Transport mode breakdown data and classifications
      - Travel intensity calculation methodologies and employee/revenue data
      - Public transport initiative descriptions and policy statements
      """
   
   pollution_question = """
      Extract comprehensive pollution data from the ESG report using the standardized reporting format. For each data point, provide the exact value and source section. If information is unavailable, state "Not available".

      ## POLLUTION DATA

      **Air Pollution:**
      - Measurements in place to capture air pollution caused by the organisation: Yes/No
      - Air pollution measurement methodology and overview
      - Air pollution reduction actions and strategies overview

      **Noise Pollution:**
      - Measurements in place to capture noise pollution caused by the organisation: Yes/No
      - Noise pollution measurement methodology and overview
      - Noise pollution reduction actions and strategies overview

      **Light Pollution:**
      - Measurements in place to capture light pollution caused by the organisation: Yes/No
      - Light pollution measurement methodology and overview
      - Light pollution reduction actions and strategies overview

      **Other Pollution:**
      - Measurements in place to capture other pollution caused by the organisation: Yes/No
      - Other pollution measurement methodology and overview
      - Other pollution reduction actions and strategies overview
      - Types of other pollution monitored (water, soil, chemical, electromagnetic, etc.)

      ## POLLUTION MANAGEMENT & MITIGATION

      **Measurement Systems:**
      - Air pollution monitoring systems and protocols
      - Noise pollution monitoring systems and protocols
      - Light pollution monitoring systems and protocols
      - Other pollution monitoring systems and protocols

      **Reduction Strategies:**
      - Air pollution mitigation measures and implementation plans
      - Noise pollution mitigation measures and implementation plans
      - Light pollution mitigation measures and implementation plans
      - Other pollution mitigation measures and implementation plans

      **Additional Pollution Categories:**
      - Water pollution measurement and reduction strategies
      - Soil contamination measurement and reduction strategies
      - Chemical pollution measurement and reduction strategies
      - Electromagnetic pollution measurement and reduction strategies
      - Thermal pollution measurement and reduction strategies
      - Any other environmental pollution types and associated management approaches

      **EXTRACTION GUIDELINES:**
      - Extract exact responses from Yes/No radio button selections
      - Capture complete text from overview text areas and detailed descriptions
      - Look for pollution-specific measurement protocols and methodologies
      - Identify specific reduction actions, technologies, and implementation timelines
      - Note monitoring equipment, frequency of measurements, and reporting standards
      - Cross-reference pollution data across environmental management sections
      - Include any pollution types beyond air, noise, light in "Other Pollution" category

      **OUTPUT FORMAT:**
      Structure response as:
      Section: [Pollution Category]
      - Data point: [Exact Value/Response] (Source: [Form section/Field])
      - Use "Not available" for unfilled fields or missing information
      - Include detailed descriptions from overview text areas
      - Note specific measurement protocols and reduction initiatives

      **PRIORITY SEARCH LOCATIONS:**
      - Environmental impact and pollution sections
      - Air quality monitoring and management systems
      - Noise control and mitigation strategies
      - Light pollution policies and dark sky initiatives
      - Water quality and discharge monitoring
      - Soil contamination assessments
      - Chemical waste management protocols
      - Environmental monitoring equipment and protocols
      - Pollution reduction targets and implementation plans
      - Regulatory compliance and environmental permits
      """
   
   climate_change_question = """
      PRIORITY: Extract ANY climate-related information from the document, even if partial or indirect. Cast a wide net to identify climate content.

      ## BROAD CLIMATE INFORMATION SEARCH
      **Look for ANY mention of:**
      - Climate, climate change, global warming, greenhouse gas, GHG, emissions, carbon
      - Environmental impact, sustainability, ESG, environmental management
      - Weather, temperature, extreme events, flooding, drought, storms
      - Energy efficiency, renewable energy, clean energy, solar, wind
      - Net zero, carbon neutral, carbon footprint, decarbonization
      - Environmental risks, climate risks, physical risks, transition risks
      - Green initiatives, environmental programs, sustainability strategy
      - Board oversight, governance, responsibility for environmental matters
      - Risk management, environmental compliance, regulatory requirements

      ## FLEXIBLE EXTRACTION APPROACH
      **For Climate Oversight - search for:**
      - ANY executive, board member, or manager mentioned with environmental/climate duties
      - Environmental committees, sustainability teams, ESG officers
      - ANY governance structure related to environmental matters
      - Even basic mentions of who handles environmental issues

      **For Impact & Responsibility - search for:**
      - ANY statement about environmental impact or responsibility
      - Sustainability commitments, environmental policies
      - ANY description of how they address environmental issues
      - Environmental management systems or procedures

      **For Climate Risks - search for:**
      - ANY risk assessment mentioning environmental factors
      - Weather-related risks, regulatory risks, reputational risks
      - Supply chain risks, operational risks related to environment
      - ANY mention of climate or environmental challenges

      **For Risk Mitigation - search for:**
      - ANY environmental protection measures
      - Energy efficiency programs, waste reduction initiatives
      - Environmental compliance programs
      - ANY actions to reduce environmental impact

      **For Climate Strategy - search for:**
      - ANY long-term environmental goals or commitments
      - Sustainability initiatives, green programs
      - Environmental targets, reduction goals
      - ANY future environmental plans or strategies

      ## DEBUG EXTRACTION RULES
      1. **Lower the bar**: Extract even vague or indirect climate references
      2. **Alternative terminology**: Look for "environmental", "sustainability", "green", "eco-friendly"
      3. **Partial information**: Even incomplete data is better than null
      4. **Context clues**: Look in sections about risk management, governance, strategy, operations
      5. **Flexible matching**: Don't require exact terminology matches

      ## FALLBACK EXTRACTION
      If no direct climate information is found, look for:
      - General environmental policies or statements
      - Energy usage or efficiency mentions  
      - Waste management or recycling programs
      - Any sustainability initiatives
      - General risk management frameworks that could apply to climate
      - Corporate responsibility or ESG sections

      ## OUTPUT REQUIREMENT
      For each section, provide:
      - **Answer**: ANY relevant information found, even if not perfectly aligned
      - **Sources**: The exact text where you found this information
      - **Reasoning**: Why you believe this relates to the climate topic, even if indirectly

      **If truly no climate information exists**, then provide:
      - Answer: "No climate-related information found in document"
      - Sources: "Searched throughout document sections: [list sections searched]"  
      - Reasoning: "Comprehensive search conducted but no climate governance, risks, or strategy content identified"
      """
   
   biodiversity_question = """
      Extract comprehensive biodiversity and nature-related data from the ESG report using the standardized reporting format. For each data point, provide the exact value and source section. If information is unavailable, state "Not available".

      ## BIODIVERSITY ENGAGEMENT
      **Active Biodiversity Addressing:**
      - Does the organisation actively address the topic biodiversity and does take action: Yes/No
      - Look for: biodiversity policies, conservation initiatives, nature-based solutions, ecosystem protection programs
      - Search terms: biodiversity, ecosystem, habitat, species, conservation, natural capital, nature protection

      **Biodiversity Actions and Initiatives:**
      - Extract detailed information about biodiversity actions taken by the organization
      - Look for: specific conservation projects, habitat restoration, species protection programs, biodiversity monitoring
      - Include: partnerships with conservation organizations, biodiversity management plans, ecosystem services initiatives

      ## OPERATIONS IN BIODIVERSE AREAS
      **Biodiverse Area Operations:**
      - Does the organisation operate in and/or own, lease and/or manage buildings and/or other real estate in protected areas and/or areas of high biodiversity value: Yes/No
      - Look for: operations in national parks, protected areas, UNESCO sites, critical habitats, biodiversity hotspots
      - Search terms: protected areas, conservation areas, national parks, wildlife reserves, critical habitats, sensitive ecosystems

      **Location Details:**
      - Extract specific information about operations in biodiverse areas
      - Look for: geographical locations, types of protected areas, biodiversity assessments, impact mitigation measures
      - Include: permits, compliance measures, stakeholder engagement with conservation authorities

      ## IMPACT ON BIODIVERSITY
      **Biodiversity Impact Assessment:**
      - Do activities, products and services have an impact on biodiversity: Yes/No
      - Look for: impact assessments, biodiversity footprint analysis, ecosystem impact evaluations
      - Search terms: biodiversity impact, ecosystem disruption, habitat fragmentation, species impact

      **Impact Details and Improvements:**
      - Extract explanations of how or why biodiversity impacts occur
      - Extract information on how the organisation improves the impact on biodiversity
      - Look for: mitigation measures, restoration projects, offset programs, impact reduction strategies
      - Include: biodiversity enhancement initiatives, no net loss commitments, positive biodiversity outcomes

      ## SOIL SEALING DATA
      **Soil Sealing Measurements:**
      - Extract sealed soil data for real estate owned by the organisation in percentage
      - Look for: land use data, impervious surface coverage, built-up area percentages
      - Search terms: sealed soil, impervious surfaces, concrete coverage, paved areas, built-up land

      **Area-Specific Soil Sealing:**
      - Extract soil sealing percentages for different areas/locations
      - Look for: headquarters, offices, facilities, industrial sites, retail locations
      - Format: Area name and corresponding soil sealing percentage
      - Include: total land area, developed area, green space ratios

      ## ANIMAL WELFARE
      **Supply Chain Animal Welfare Influence:**
      - Does the organisation use its market power to influence stakeholders in the supply chain in a positive way with regard to animal welfare: Yes/No
      - Look for: supplier codes of conduct, animal welfare standards, supply chain requirements
      - Search terms: animal welfare, ethical sourcing, humane treatment, animal rights, livestock standards

      **Animal Welfare Actions:**
      - Extract specific actions taken, initiatives and other engagements regarding animal welfare
      - Look for: supplier audits, certification requirements, animal welfare policies, training programs
      - Include: partnerships with animal welfare organizations, welfare improvement programs, ethical sourcing initiatives

      ## EXTRACTION GUIDELINES
      - Extract exact responses from Yes/No radio button selections
      - Capture complete text from overview text areas and detailed descriptions
      - Look for biodiversity-specific policies, management systems, and monitoring protocols
      - Identify specific conservation actions, restoration projects, and impact mitigation measures
      - Note quantitative data for soil sealing percentages and area measurements
      - Cross-reference biodiversity data across environmental management and sustainability sections
      - Include nature-based solutions and ecosystem services approaches

      ## OUTPUT FORMAT
      Structure response as:
      Section: [Biodiversity Category]
      - Data point: [Exact Value/Response] (Source: [Form section/Field])
      - Use "Not available" for unfilled fields or missing information
      - Include detailed descriptions from text areas and additional information fields
      - Note specific biodiversity initiatives, conservation projects, and impact reduction measures

      ## PRIORITY SEARCH LOCATIONS
      - Environmental impact and biodiversity sections
      - Nature and ecosystem management policies
      - Land use and real estate management sections
      - Conservation and restoration project descriptions
      - Protected area operations and compliance documentation
      - Supply chain and procurement policies
      - Animal welfare and ethical sourcing standards
      - Sustainability reports and environmental management systems
      - Site-specific environmental assessments
      - Conservation partnership and collaboration agreements
      """
   
   resources_question = """
      Extract comprehensive energy, lighting, water, paper, and waste management data from the sustainability/ESG report using the standardized reporting format. For each data point, provide the exact value and source section. If information is unavailable, state "Not available".

      ## RENEWABLE ENERGY SYSTEMS
      **Active Renewable Energy Ownership:**
      - Does the organisation own renewable energy systems (e.g., photovoltaic solar panels, wind turbines, geothermal technology, heat-pump systems): Yes/No
      - Look for: solar panel installations, wind energy systems, geothermal facilities, heat pumps, renewable energy infrastructure
      - Search terms: renewable energy, solar panels, wind turbines, photovoltaic, geothermal, heat pump, clean energy

      **Renewable Energy Systems Details:**
      - Extract detailed information about renewable energy systems in place
      - Look for: system types, capacity, installation dates, performance metrics, energy generation data
      - Include: list of specific renewable technologies, system specifications, energy output

      **Renewable Energy Strategy:**
      - Does the organisation have a strategy in place to increase the percentage of renewable energy usage: Yes/No
      - Look for: renewable energy targets, transition plans, clean energy roadmaps, sustainability strategies
      - Extract corresponding information on the renewable energy strategy implementation

      ## ENERGY CONSUMPTION BY LOCATION
      **Energy Consumption Data:**
      For each facility, building, or location, extract the following energy data:
      - Extract area/location names and descriptions (e.g., 'Headquarters', 'Building B', 'Manufacturing Plant')
      - Look for: building names, facility locations, site identifications, campus areas

      **Electricity Consumption:**
      - Extract total electricity consumption in kWh/h for each location
      - Extract purchased electricity consumption in kWh/h for each location
      - Extract purchased renewable electricity in kWh/h for each location
      - Extract percentage of total electricity consumption that is renewable for each location
      - Extract self-generated renewable electricity in kWh/h for each location
      - Extract low-carbon electricity consumption in kWh/h for each location
      - Search terms: electricity consumption, kWh, renewable electricity, grid electricity, self-generated power

      **Other Energy Consumption:**
      - Extract fuel consumption (natural gas and heating oil) in kWh/h for each location
      - Extract district heating, cooling and steam consumption in kWh/h for each location
      - Extract other types of energy consumption in kWh/h for each location
      - Search terms: natural gas, heating oil, district heating, steam, fuel consumption, energy usage

      ## LIGHTING SYSTEMS BY LOCATION
      **Lighting Distribution Data:**
      For each facility or location, extract lighting information:
      - Extract area/location names where lighting data is available
      - Extract percentage of LED lighting used for each location
      - Extract percentage of energy saving lighting used for each location
      - Extract percentage of conventional lighting used for each location
      - Extract percentage of other lighting types used for each location
      - Look for: lighting efficiency data, LED conversion rates, energy-saving bulbs, conventional lighting systems
      - Search terms: LED lighting, energy-saving lighting, conventional lighting, lighting efficiency, bulbs

      ## WATER CONSUMPTION BY LOCATION
      **Water Usage Data:**
      For each facility or location, extract water consumption information:
      - Extract area/location names where water consumption occurs
      - Extract volume of purchased water in M³ for each location
      - Extract volume of rainwater or other collected water in M³ for each location  
      - Extract total water consumption in M³ for each location
      - Look for: water usage statistics, municipal water, rainwater harvesting, water conservation data
      - Search terms: water consumption, cubic meters, M³, purchased water, rainwater, collected water

      ## PAPER CONSUMPTION BY LOCATION
      **Paper Usage Data:**
      For each facility or location, extract paper consumption information:
      - Extract area/location names where paper is consumed
      - Extract amount of recycled paper used in Tons for each location
      - Extract amount of other paper used in Tons for each location
      - Extract total paper usage in Tons for each location
      - Look for: office paper usage, recycled paper procurement, paper consumption statistics
      - Search terms: paper consumption, recycled paper, office paper, paper usage, tons

      ## WASTE GENERATION BY LOCATION
      **Waste Production Data:**
      For each facility or location, extract waste generation information:
      - Extract area/location names where waste is generated (e.g., 'Headquarters', 'Organization B')
      - Extract amount of recyclable waste generated in Tons for each location
      - Extract amount of food waste generated in Tons for each location
      - Extract amount of hazardous waste generated in Tons for each location
      - Extract amount of residual waste generated in Tons for each location
      - Extract amount of other waste generated in Tons for each location
      - Extract total waste generated in Tons for each location
      - Search terms: waste generation, recyclable waste, food waste, hazardous waste, residual waste, tons

      ## WASTE MANAGEMENT PRACTICES
      **Waste Diversion Tracking:**
      - Does the organisation track its waste diversion rate: Yes/No
      - Extract the most recent waste diversion rate in percentage
      - Extract breakdown of methods used for diversion (recycling, composting, energy recovery)
      - Look for: waste diversion statistics, recycling rates, diversion methods, waste management KPIs
      - Search terms: waste diversion rate, recycling rate, diversion methods, waste tracking

      **Waste Documentation:**
      - Does the organisation have waste documentation in place: Yes/No
      - Extract further information on waste documentation systems and processes
      - Look for: waste transfer notes, documentation systems, waste tracking procedures
      - Search terms: waste documentation, waste transfer notes, waste tracking, documentation systems

      **Waste Management Actions:**
      - Does the organisation have actions for reduction, reuse, recovery, or material reconversion: Yes/No
      - Is there internal differentiation and disposal of waste: Yes/No
      - Is there reduction of material consumption through process optimisation: Yes/No
      - Is there centralisation of the purchasing function: Yes/No
      - Are there other waste management actions: Yes/No
      - Extract further information on other waste management actions
      - Look for: waste reduction initiatives, reuse programs, process optimization, centralized purchasing
      - Search terms: waste reduction, reuse, recovery, material reconversion, process optimization

      **Single-Use Plastic Usage:**
      - Extract whether the organisation uses single-use plastic in specific areas
      - Look for: single-use plastic policies, plastic reduction initiatives, disposable plastic usage
      - Search terms: single-use plastic, disposable plastic, plastic reduction, plastic usage

      ## EXTRACTION GUIDELINES
      - Extract exact responses from Yes/No selections and radio button fields
      - Capture complete numerical values with original units (kWh, M³, Tons, percentages)
      - Look for location-specific data tables and facility breakdowns
      - Identify energy efficiency initiatives, renewable energy projects, and sustainability programs
      - Note quantitative consumption data across multiple time periods if available
      - Cross-reference data across environmental management and sustainability sections
      - Include targets, goals, and future projections where mentioned

      ## OUTPUT FORMAT
      Structure response as:
      Section: [Energy/Lighting/Water/Paper/Waste Category]
      - Data point: [Exact Value/Response] (Source: [Report section/Page])
      - Location: [Facility/Area Name] - [Specific measurement] (Source: [Section])
      - Use "Not available" for unfilled fields or missing information
      - Include detailed descriptions from sustainability reports and environmental data
      - Note specific initiatives, efficiency programs, and consumption reduction measures

      ## PRIORITY SEARCH LOCATIONS
      - Environmental data and sustainability metrics sections
      - Energy management and consumption reporting
      - Facility operations and resource usage data
      - Waste management and recycling program descriptions
      - Water conservation and usage statistics
      - Sustainability performance indicators and KPIs
      - Environmental management system documentation
      - Resource efficiency and conservation initiatives
      - Location-specific environmental impact data
      - Annual sustainability reports and environmental disclosures
      """
   
   resource_management_question = """
      Extract comprehensive qualitative information on sustainable office/building initiatives, facility management systems, ground care operations, food and nutrition management, and other resources and materials from the sustainability/ESG report using the standardized reporting format. For each data point, provide the exact value and source section. If information is unavailable, state "Not available".

      ## SUSTAINABLE OFFICE/BUILDING INITIATIVES
      **Sustainable Office/Building Initiatives Implementation:**
      - Does the organisation have sustainable office/building initiatives in place: Yes/No
      - Look for: FSC and/or PEFC certified printing paper, reconditioned toner cartridges, single-use products with FSC and/or compostable and/or PEFC environmental certification, ecological cleaning products certified by Ecolabel and/or Safer Choice and/or ICEA, reusable/washable and/or air-dry hand towels, high energy efficiency electrical and electronic devices
      - Search terms: sustainable office, FSC certified paper, PEFC certification, reconditioned toner, ecological cleaning, Ecolabel, energy efficient devices, sustainable procurement

      **Initiative Details by Location:**
      - Extract specific initiatives and corresponding offices/buildings/business units where implemented
      - Look for: location-specific sustainability programs, facility-level green initiatives, building-specific certifications
      - Include: detailed descriptions of initiatives, implementation scope, performance metrics

      ## FACILITY MANAGEMENT SYSTEMS
      **Sensor-Controlled Building Management Systems:**
      For each facility or location, extract building management information:
      - Extract area/location names (e.g., 'Headquarters', 'Branch Office', 'Manufacturing Site')
      - Does each location have sensor-controlled building management systems in place: Yes/No
      - Look for: smart management and control of energy systems, water systems, lighting, shading, heating, air conditioning, ventilation systems
      - Search terms: building management system, sensor-controlled systems, smart building, automated controls, HVAC automation, energy management

      **Building Management System Details:**
      - Extract further information on building management systems and their capabilities
      - Look for: system specifications, control mechanisms, efficiency improvements, automation features
      - Include: types of sensors, control systems, integration capabilities, performance data

      ## GROUND CARE OPERATIONS
      **Ground Care Facility Operations:**
      - Does the organisation operate facilities with necessary ground care (lawn care, landscaping, etc.): Yes/No
      - Look for: facility maintenance, landscaping operations, outdoor space management, ground maintenance activities
      - Search terms: ground care, lawn care, landscaping, facility maintenance, outdoor maintenance

      **Water Recycling Systems:**
      - Does the organisation have lawn water recycling systems in place: Yes/No
      - Extract further information on water recycling systems and implementation
      - Look for: irrigation water recycling, greywater systems, rainwater harvesting for landscaping, water conservation systems
      - Search terms: water recycling, irrigation systems, greywater, rainwater harvesting, water conservation

      **Fertiliser Usage by Type:**
      For each location with ground care operations, extract fertiliser information:
      - Extract area/location names where fertiliser is used
      - Conventional/chemical fertilisers usage: Yes/No
      - Organic fertilisers usage: Yes/No  
      - Other fertilisers usage: Yes/No (specify type)
      - Extract further information on fertiliser types, application rates, and environmental considerations
      - Search terms: fertiliser use, chemical fertilisers, organic fertilisers, soil treatment, landscaping chemicals

      **Ground Care Machinery Propulsion Technology:**
      For each location, extract machinery information:
      - Extract area/location names where ground care machinery is used
      - Conventional propulsion technology (Diesel, etc.) usage: Yes/No
      - Electric propulsion technology usage: Yes/No
      - Other propulsion technology usage: Yes/No (specify type)
      - Extract further information on machinery types, fleet composition, emissions data
      - Search terms: ground care equipment, lawn mowers, landscaping machinery, electric equipment, diesel equipment

      ## FOOD & NUTRITION SERVICES
      **Food Sourcing and Quality Monitoring:**
      - Does the organisation have a monitoring system for quality assurance for food and nutrition suppliers: Yes/No
      - Extract further information on supplier monitoring systems and quality controls
      - Look for: supplier auditing, food safety protocols, nutrition standards, sourcing policies
      - Search terms: food supplier monitoring, quality assurance, food safety, supplier auditing

      **Food Service Operations by Location:**
      For each facility offering food services, extract:
      - Extract area/location names where food is offered/sold to employees and/or customers
      - Does each location offer/sell food to employees and/or customers: Yes/No
      - Look for: cafeterias, food courts, vending services, catering operations, employee dining facilities
      - Search terms: food services, employee dining, cafeteria, food offerings

      **Nutrition Options Implementation:**
      For each location with food services, extract nutrition information:
      - Extract area/location names offering nutrition programs
      - Plant-based/low carbon food alternatives offered constantly: Yes/No
      - Reduced salt & sugar food offerings planned or in place: Yes/No
      - Extract details on nutrition programs, healthy food initiatives, sustainable food options
      - Search terms: plant-based food, low carbon meals, reduced salt, reduced sugar, healthy food options

      **Food Waste Management:**
      - Are food surpluses managed effectively to minimize waste: Yes/No
      - Extract information on food surplus associations, internal waste reduction processes, and disposal methods
      - Look for: food donation programs, surplus redistribution, waste reduction initiatives, composting programs
      - Search terms: food waste management, surplus food, food donation, waste reduction

      ## OTHER RESOURCES & MATERIALS
      **Additional Resource Management:**
      - Extract information on any other resources and/or materials which are material for the organisation and its operations
      - Look for: material resource consumption, specialized equipment, unique operational materials, resource efficiency programs
      - Search terms: resource management, material consumption, operational materials, resource efficiency

      ## EXTRACTION GUIDELINES
      - Extract exact responses from Yes/No selections and radio button fields
      - Capture complete details from text fields and additional information sections
      - Look for location-specific implementation data and facility breakdowns
      - Identify sustainability certifications, environmental standards, and green building features
      - Note facility management technologies, automation systems, and efficiency measures
      - Cross-reference data across operational management and sustainability sections
      - Include implementation details, performance metrics, and expansion plans where mentioned

      ## OUTPUT FORMAT
      Structure response as:
      Section: [Initiative/Management/Operations Category]
      - Data point: [Exact Value/Response] (Source: [Report section/Page])
      - Location: [Facility/Area Name] - [Specific implementation] (Source: [Section])
      - Use "Not available" for unfilled fields or missing information
      - Include detailed descriptions from sustainability reports and operational data
      - Note specific technologies, certifications, and implementation scope

      ## PRIORITY SEARCH LOCATIONS
      - Operational sustainability and facility management sections
      - Building management and automation system documentation
      - Ground care and landscaping operations data
      - Food services and nutrition program descriptions
      - Resource management and procurement policies
      - Facility-specific sustainability initiatives
      - Environmental management system operational data
      - Sustainable procurement and supplier management information
      - Location-specific operational sustainability metrics
      - Facility management technology and automation reports
      """

   return (env_supply_chain_question, 
           env_conducted_actions_question, 
           env_planned_actions_question, 
           env_medium_long_term_goals_question, 
           env_campaigns_question,
           ghg_emissions_question,
           travel_question,
           pollution_question,
           climate_change_question,
           biodiversity_question,
           resources_question,
           resource_management_question)


In [9]:
# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """Run the ESG extraction pipeline end to end.

    The function loads and chunks the report at the module-level
    ``input_filepath``, wraps the Gemini embedder, creates/loads the vector
    store (passing ``CHROMA_DB_PATH`` as ``vectorstore_path``), builds one RAG
    chain per topic, invokes every chain with its matching question, assembles
    the answers into ``ESGReportExtractedInfo`` and writes that model as JSON
    to the module-level ``output_filepath``.

    Returns:
        bool: True when the JSON file was written; False when serialising or
        writing the result fails (the error is printed, not raised).

    Raises:
        ValueError: if ``setup_rag_chains`` or ``get_extraction_questions``
            does not return exactly one item per extraction step.
        Exception: anything raised while loading, embedding, building the
            vector store or invoking a chain propagates to the caller.
    """
    # (progress label, ESGReportExtractedInfo field name), listed in the
    # positional order shared by setup_rag_chains() and
    # get_extraction_questions(). Keeping the pairing in one table avoids
    # twelve hand-aligned copies of the same print/invoke stanza.
    extraction_steps = (
        ("supply chain information", "env_supply_chain"),
        ("conducted actions", "env_actions_conducted"),
        ("planned actions", "env_actions_planned"),
        ("medium/long-term goals", "env_medium_long_term_goals"),
        ("environmental campaigns", "env_campaigns"),
        ("GHG emissions", "ghg_emissions"),
        ("travel data", "travel"),
        ("pollution data", "pollution"),
        ("climate change data", "climate_change"),
        ("biodiversity data", "biodiversity"),
        ("resources data", "resources"),
        ("resources management data", "resources_management"),
    )
    expected_count = len(extraction_steps)

    print("=" * 80)
    print("ESG DATA EXTRACTION PIPELINE")
    print("=" * 80)

    # Step 1: load and split the source document into chunks.
    print("\n1. Loading and processing documents...")
    chunks = load_and_split_documents(input_filepath)

    # Step 2: wrap the raw Gemini embedder in the LangChain-compatible adapter.
    print("\n2. Initializing embedding function...")
    raw_gemini_embedder = get_gemini_embedding_function()
    custom_langchain_embeddings = CustomGeminiEmbeddings(raw_gemini_embedder)

    # Step 3: create or load the vector store.
    print("\n3. Creating/loading vector store...")
    vectorstore = create_vectorstore(
        chunks=chunks,
        embedding_function_instance=custom_langchain_embeddings,
        vectorstore_path=CHROMA_DB_PATH
    )

    # Step 4: build the LLM, the per-topic chains and their questions.
    print("\n4. Setting up RAG chains...")
    llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2, google_api_key=GEMINI_API_KEY)

    chains = tuple(setup_rag_chains(vectorstore, llm))
    if len(chains) != expected_count:
        raise ValueError(
            f"setup_rag_chains returned {len(chains)} chains; expected {expected_count}"
        )

    questions = tuple(get_extraction_questions())
    if len(questions) != expected_count:
        raise ValueError(
            f"get_extraction_questions returned {len(questions)} questions; expected {expected_count}"
        )

    # Step 5: run every chain against its question, in table order.
    print("\n5. Extracting information...")
    extracted_fields = {}
    for (label, field_name), chain, question in zip(extraction_steps, chains, questions):
        print(f"   - Extracting {label}...")
        extracted_fields[field_name] = chain.invoke(question)

    # Step 6: assemble the individual answers into the result model.
    print("\n6. Combining results...")
    extracted_esg_data = ESGReportExtractedInfo(**extracted_fields)

    # Step 7: serialise and persist; a failure here is reported via the
    # return value rather than raised.
    print("\n7. Saving results...")
    try:
        json_output = extracted_esg_data.model_dump_json(indent=2)

        with open(output_filepath, 'w', encoding='utf-8') as f:
            f.write(json_output)

        print(f"✓ Data successfully saved to: {output_filepath}")

    except Exception as e:
        print(f"❌ Error saving results: {e}")
        return False

    print("\n" + "=" * 80)
    print("EXTRACTION COMPLETED SUCCESSFULLY!")
    print("=" * 80)

    return True


if __name__ == "__main__":
    # Entry point: run the pipeline and signal failure with exit status 1.
    # SystemExit is raised directly instead of calling `exit()`, which is a
    # helper injected by the `site` module for interactive sessions and is not
    # meant to be used in programs.
    try:
        success = main()
    except Exception as e:
        # Any error escaping main() is reported and treated as a failed run.
        print(f"❌ Error during execution: {e}")
        success = False

    if not success:
        raise SystemExit(1)

ESG DATA EXTRACTION PIPELINE

1. Loading and processing documents...
Loading document: ../data/LFC.pdf
Document split into 62 chunks

2. Initializing embedding function...

3. Creating/loading vector store...
Attempting to create/load vector store at: ../vectorstores/lfc_report_vectorstore
Found 62 unique chunks out of 62 total.
Vector store created/loaded and persisted successfully at: ../vectorstores/lfc_report_vectorstore

4. Setting up RAG chains...

5. Extracting information...
   - Extracting supply chain information...
   - Extracting conducted actions...
   - Extracting planned actions...
   - Extracting medium/long-term goals...
   - Extracting environmental campaigns...
   - Extracting GHG emissions...
   - Extracting travel data...
   - Extracting pollution data...
   - Extracting climate change data...
   - Extracting biodiversity data...
   - Extracting resources data...
   - Extracting resources management data...

6. Combining results...

7. Saving results...
✓ Data succ