In [1]:
"""
ESG Data Extraction using LangChain and Google Gemini

This script processes ESG reports (PDFs) and extracts structured information about:
- Environmental campaigns and activities
- Conducted environmental actions
- Planned environmental actions
- Medium/long-term environmental goals

Author: Geo
Date: 2025
"""

'\nESG Data Extraction using LangChain and Google Gemini\n\nThis script processes ESG reports (PDFs) and extracts structured information about:\n- Environmental campaigns and activities\n- Conducted environmental actions\n- Planned environmental actions\n- Medium/long-term environmental goals\n\nAuthor: Geo\nDate: 2025\n'

In [2]:
# =============================================================================
# IMPORTS
# =============================================================================

import os
import uuid
from typing import List, Optional, Literal, Union

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from pydantic import BaseModel, Field

import google.generativeai as genai
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# =============================================================================
# CONFIGURATION
# =============================================================================

# Base locations and the report to process
INPUT_DIR = "../data"
INPUT_FILE = "LFC.pdf"
OUTPUT_DIR = "../output"
CHROMA_DB_PATH = "../vectorstores"  # base directory here; narrowed to the per-report store further down

# Create the base directories if they don't exist
for _directory in (INPUT_DIR, OUTPUT_DIR, CHROMA_DB_PATH):
    os.makedirs(_directory, exist_ok=True)

# Configure API (the key is read from the environment after load_dotenv(), never hard-coded)
load_dotenv()
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Surface the problem here instead of at the first failing API call much later.
    print("Warning: GEMINI_API_KEY is not set; Gemini API calls will fail until it is provided.")
genai.configure(api_key=GEMINI_API_KEY)

# Paths derived from the input file name (lower-cased, extension removed)
_report_stem = os.path.splitext(INPUT_FILE)[0].lower()
input_filepath = os.path.join(INPUT_DIR, INPUT_FILE)
output_filepath = os.path.join(OUTPUT_DIR, f"{_report_stem}_extracted_data.json")
# CHROMA_DB_PATH is deliberately re-bound: from here on it points at this report's own store.
CHROMA_DB_PATH = os.path.join(CHROMA_DB_PATH, f"{_report_stem}_report_vectorstore")


In [4]:
# =============================================================================
# PYDANTIC MODELS FOR STRUCTURED OUTPUT
# =============================================================================

# Leaf record reused by the other extraction models in this file: each extracted
# value is paired with the supporting text and a justification.
# All three fields are required plain strings.
# NOTE(review): pydantic normally exports the class docstring and the Field
# descriptions into the JSON schema, so they presumably double as prompt text
# for the LLM -- confirm before rewording any of them.
class AnswerWithSources(BaseModel):
    """An answer to a specific question or extracted piece of information, with sources and reasoning."""
    answer: str = Field(description="The extracted answer or information.")
    # Declared as a single string, not a list: several supporting passages have to share this one field.
    sources: str = Field(description="Full direct text chunk from the context used to extract this information.")
    reasoning: str = Field(description="Explanation of how the answer was derived from the provided sources.")


# Sub-section of EnvironmentalSupplyChainData: one required flag plus optional detail.
class EnvironmentalSupplyChainImpact(BaseModel):
    """Information about an organisation's environmental supply chain impact monitoring."""
    # Typed as free-form str: the Yes/No convention lives only in the description
    # and is not validated, so other strings are accepted.
    monitors_and_measures: str = Field(
        description="Whether the organisation monitors and measures the environmental impact of its supply chain (Yes/No)."
    )
    # Optional: stays None when no further detail is supplied.
    further_information: Optional[AnswerWithSources] = Field(
        default=None,
        description="Further information about the supply chain impact monitoring process."
    )


# Sub-section of EnvironmentalSupplyChainData covering supplier assessment and engagement.
class EnvironmentalSupplierAssessmentEngagement(BaseModel):
    """Information about an organisation's supplier assessment and engagement regarding environmental topics."""
    # Required string flag; "Yes/No" is a convention stated in the description, not an enforced type.
    performs_assessment: str = Field(
        description="Whether the organisation performs supplier assessment and engagement regarding environmental topics (Yes/No)."
    )
    # Optional sourced detail; defaults to None.
    further_information: Optional[AnswerWithSources] = Field(
        default=None,
        description="Further information on supplier assessment and engagement."
    )


# Sub-section of EnvironmentalSupplyChainData covering supplier screening.
class EnvironmentalSupplierScreening(BaseModel):
    """Information on an organisation's supplier screening process."""
    # Required string flag; the Yes/No wording is not validated.
    performs_screening: str = Field(
        description="Whether the organisation performs supplier screening using environmental criteria (Yes/No)."
    )
    # The percentage is carried as an AnswerWithSources (text plus evidence), not as a number.
    percentage_screened: Optional[AnswerWithSources] = Field(
        default=None,
        description="The percentage of screened suppliers during the reporting period."
    )
    # Optional sourced detail; defaults to None.
    further_information: Optional[AnswerWithSources] = Field(
        default=None,
        description="Further information on the supplier screening process."
    )


# Sub-section of EnvironmentalSupplyChainData covering the supplier code of conduct.
class EnvironmentalSupplierCodeOfConduct(BaseModel):
    """Information on an organisation's supplier code of conduct."""
    # Required string flag; the Yes/No wording is not validated.
    has_code_of_conduct: str = Field(
        description="Whether the organisation has a supplier code of conduct containing environmental aspects (Yes/No)."
    )
    # Carried as text-with-evidence rather than a numeric percentage.
    percentage_signed: Optional[AnswerWithSources] = Field(
        default=None,
        description="The percentage of suppliers that have signed the code of conduct."
    )
    # Optional sourced detail; defaults to None.
    further_information: Optional[AnswerWithSources] = Field(
        default=None,
        description="Further information on the supplier code of conduct."
    )


# Sub-section of EnvironmentalSupplyChainData covering purchased-goods tracking.
class EnvironmentalPurchasedGoods(BaseModel):
    """Information on an organisation's tracking of purchased goods."""
    # Required string flag; records only whether quantities are tracked, not the quantities themselves.
    tracks_quantity: str = Field(
        description="Whether the organisation keeps track of the quantity (tons/year) of purchased materials and goods (merchandising, food, metallic materials, plastics, other materials) (Yes/No)."
    )
    # Optional sourced detail; defaults to None.
    further_information: Optional[AnswerWithSources] = Field(
        default=None,
        description="Documentation and further information about tracking purchased goods."
    )


# Aggregates the five supply-chain sub-sections defined above it in this file.
# Every field is Optional with default None, so any subset may be populated.
# This is the schema setup_rag_chains binds to the supply-chain extraction chain.
class EnvironmentalSupplyChainData(BaseModel):
    """
    A comprehensive model for all supply chain and purchased goods data.
    """
    environmental_supply_chain_impact: Optional[EnvironmentalSupplyChainImpact] = Field(
        default=None,
        description="Information on monitoring and measuring environmental impact in the supply chain."
    )
    supplier_assessment_engagement: Optional[EnvironmentalSupplierAssessmentEngagement] = Field(
        default=None,
        description="Information on supplier assessment and engagement regarding environmental topics."
    )
    supplier_screening: Optional[EnvironmentalSupplierScreening] = Field(
        default=None,
        description="Information on supplier screening using environmental criteria."
    )
    supplier_code_of_conduct: Optional[EnvironmentalSupplierCodeOfConduct] = Field(
        default=None,
        description="Information on the supplier code of conduct with environmental aspects."
    )
    purchased_goods: Optional[EnvironmentalPurchasedGoods] = Field(
        default=None,
        description="Information on tracking the quantity of purchased materials and goods."
    )


# Payload of one conducted-action category. All three fields are required
# (no default), so validation fails unless each one is supplied.
class EnvironmentalConductedActionDetails(BaseModel):
    """Details for a specific conducted environmental action category."""
    progress_achieved_for_period: AnswerWithSources = Field(
        description="Progress achieved for the reporting period regarding this environmental topic."
    )
    actual_targets_set_for_period: AnswerWithSources = Field(
        description="Actual targets set for the reporting period regarding this environmental topic."
    )
    reasons_for_achieved_progress: AnswerWithSources = Field(
        description="Reason(s) for the achieved progress regarding this environmental topic."
    )


# Pairs a category label with its details; both fields are required.
class EnvironmentalConductedActionCategory(BaseModel):
    """A specific category of conducted environmental action (e.g., Total Emissions)."""
    # Free-form label: the examples in the description are illustrative, not an enforced vocabulary.
    category_name: str = Field(
        description="The name of the environmental category (e.g., 'Total emissions', 'Total energy consumption')."
    )
    details: EnvironmentalConductedActionDetails = Field(
        description="Detailed progress, targets, and reasons for this category."
    )


# Wrapper holding the list of conducted-action categories; setup_rag_chains
# binds this schema to the conducted-actions extraction chain.
class EnvironmentalConductedActionsSection(BaseModel):
    """The main section for conducted environmental actions conducted during the reporting period."""
    # Required list (no default); an empty list is still a valid value.
    categories_data: List[EnvironmentalConductedActionCategory] = Field(
        description="A list of specific environmental categories with their respective progress, targets, and reasons."
    )


# Payload of one planned-action category: the target and the actions meant to
# reach it. Both fields are required.
class EnvironmentalPlannedActionDetails(BaseModel):
    """Details for a specific planned environmental action category."""
    planned_or_targeted_progress: AnswerWithSources = Field(
        description="Planned or targeted progress for the upcoming period regarding this environmental topic."
    )
    planned_actions_to_achieve_target: AnswerWithSources = Field(
        description="Planned action(s) to achieve the target(s) for the upcoming period regarding this environmental topic."
    )


# Pairs a category label with its planned-action details; both fields are required.
class EnvironmentalPlannedActionCategory(BaseModel):
    """A specific category of planned environmental action (e.g., Total Emissions)."""
    # Free-form label; the examples are the same as for conducted actions but nothing ties the two together.
    category_name: str = Field(
        description="The name of the planned environmental category (e.g., 'Total emissions', 'Total energy consumption')."
    )
    details: EnvironmentalPlannedActionDetails = Field(
        description="Detailed planned progress and actions for this category."
    )


# Wrapper holding the list of planned-action categories; setup_rag_chains
# binds this schema to the planned-actions extraction chain.
class EnvironmentalPlannedActionsSection(BaseModel):
    """The main section for environmental actions planned for the next reporting period."""
    # Required list (no default); an empty list is still a valid value.
    categories_data: List[EnvironmentalPlannedActionCategory] = Field(
        description="A list of specific environmental categories with their respective planned progress and actions."
    )


# Detail record for one medium/long-term goal. Only measurable_definition is
# optional; every other field must be supplied for validation to succeed.
class EnvironmentalGoalDetails(BaseModel):
    """Details for a specific medium/long-term goal."""
    definition_outlining: AnswerWithSources = Field(
        description="Definition and outlining of the goal(s)."
    )
    # Required string flag; the Yes/No wording is not validated.
    is_measurable: str = Field(
        description="Whether the goal(s) are measurable (Yes/No)."
    )
    # Intended to accompany is_measurable == "Yes", but the model does not enforce that link.
    measurable_definition: Optional[AnswerWithSources] = Field(
        default=None,
        description="If measurable, the definition of the goal(s)."
    )
    # Years are carried as text-with-evidence, not as integers.
    year_set: AnswerWithSources = Field(
        description="Which year the goal(s) were set."
    )
    year_achieve: AnswerWithSources = Field(
        description="Which year the goal(s) are planned to be achieved."
    )
    # Required string flag; the Yes/No wording is not validated.
    is_communicated: str = Field(
        description="Whether the goal(s) are actively communicated (Yes/No)."
    )
    # Both communication fields are required even when is_communicated is "No".
    communication_method: AnswerWithSources = Field(
        description="How the goal(s) are communicated (e.g., report, website, events)."
    )
    communication_scope: AnswerWithSources = Field(
        description="Scope of communication (e.g., page numbers, website links)."
    )


# One goal category: a label, a has-goal flag, and optional details.
class EnvironmentalMediumLongTermGoal(BaseModel):
    """A specific category of environmental goals."""
    # Free-form label; the examples in the description are illustrative only.
    category_name: str = Field(
        description="The name of the environmental category (e.g., 'Climate Action', 'Biodiversity')."
    )
    # Required string flag; the Yes/No wording is not validated.
    has_set_goal: str = Field(
        description="Whether this category has a set goal (Yes/No)."
    )
    # Optional so a category without a goal can omit the details; not tied to has_set_goal by any validator.
    details: Optional[EnvironmentalGoalDetails] = Field(
        default=None,
        description="Detailed goals, measurability, years, and communication methods for this category."
    )


# Wrapper holding the list of goal categories; setup_rag_chains binds this
# schema to the medium/long-term goals extraction chain.
# Note the list field is named goals_data here, whereas the action sections use categories_data.
class EnvironmentalMediumLongTermGoalsSection(BaseModel):
    """The main section for medium/long-term environmental goals and pledges."""
    goals_data: List[EnvironmentalMediumLongTermGoal] = Field(
        description="A list of specific environmental categories with their respective goals and details."
    )

    
# One campaign/activity record. Every field is required; all but sdgs_addressed
# hold a single AnswerWithSources.
class EnvironmentalCampaign(BaseModel):
    """Information extracted about a specific environmental campaign or activity."""
    name_of_campaign_activity: AnswerWithSources = Field(
        description="Name of the environmental campaign or activity."
    )
    purpose_and_goal: AnswerWithSources = Field(
        description="Purpose and aspired goal of the campaign/activity."
    )
    description_and_scope: AnswerWithSources = Field(
        description="Description and scope of the campaign."
    )
    qualitative_quantitative_results: AnswerWithSources = Field(
        description="Qualitative and quantitative results of the campaign."
    )
    environmental_issues_addressed: AnswerWithSources = Field(
        description="Environmental issues addressed by the campaign."
    )
    # Several partners are expected to be folded into one answer string.
    name_of_collaboration_partners: AnswerWithSources = Field(
        description="Names of collaboration partners (e.g., fan club, local community, charity, foundation, sponsorship partner)."
    )
    # Reach is carried as text-with-evidence, not as a number.
    number_of_potential_individuals_reached: AnswerWithSources = Field(
        description="Number of potential individuals reached by the campaign."
    )
    # The only list-valued field: one sourced entry per SDG; required, but may be empty.
    sdgs_addressed: List[AnswerWithSources] = Field(
        description="Which of the 17 Sustainable Development Goals (SDGs) have been addressed by the campaign."
    )


# Wrapper holding the list of campaigns; setup_rag_chains binds this schema
# to the campaigns extraction chain.
class EnvironmentalCampaignsList(BaseModel):
    """List of environmental campaigns extracted from the report."""
    # Required list (no default); an empty list is still a valid value.
    campaigns: List[EnvironmentalCampaign] = Field(
        description="A list of environmental campaigns and activities identified in the report."
    )


# Top-level container with one Optional slot per section model; all default to
# None, so it can be built from any subset of sections.
# In the visible part of the notebook, setup_rag_chains binds the individual
# section models rather than this aggregate -- presumably the per-section
# results are assembled into it afterwards (TODO confirm against later cells).
class ESGReportExtractedInfo(BaseModel):
    """Comprehensive extracted information from an ESG report."""
    env_supply_chain: Optional[EnvironmentalSupplyChainData] = Field(
        default=None,
        description="Detailed information about the environmental supply chain and purchased goods tracking."
    )
    env_actions_conducted: Optional[EnvironmentalConductedActionsSection] = Field(
        default=None,
        description="Detailed information about environmental actions conducted during the reporting period."
    )
    env_actions_planned: Optional[EnvironmentalPlannedActionsSection] = Field(
        default=None,
        description="Detailed information about environmental actions planned for the next reporting period."
    )
    env_medium_long_term_goals: Optional[EnvironmentalMediumLongTermGoalsSection] = Field(
        default=None,
        description="Detailed information about medium/long-term environmental goals, pledges, and claims."
    )
    env_campaigns: Optional[EnvironmentalCampaignsList] = Field(
        default=None,
        description="Detailed information about environmental campaigns and activities identified in the report."
    )


In [5]:
# =============================================================================
# EMBEDDING FUNCTIONS
# =============================================================================

def get_gemini_embedding_function():
    """Build and return a callable that embeds one string with Gemini.

    The returned callable takes ``text`` and an optional ``task_type``
    (default ``"SEMANTIC_SIMILARITY"``) and returns the embedding as a NumPy
    array. Any exception raised while issuing the request or reading the
    response is printed and turned into a ``None`` return value, so callers
    must check for ``None``.
    """
    def embed_text(text: str, task_type: str = "SEMANTIC_SIMILARITY") -> Optional[np.ndarray]:
        """Embed ``text`` with the Gemini embedding model; ``None`` on failure."""
        request = {
            "model": "models/embedding-001",
            "content": text,
            "task_type": task_type,
        }
        try:
            # Lookup of the 'embedding' key stays inside the try so a malformed
            # response is handled the same way as a failed request.
            vector = genai.embed_content(**request)['embedding']
            return np.array(vector)
        except Exception as e:
            print(f"Error embedding text with Gemini: {e}")
            return None

    return embed_text


class CustomGeminiEmbeddings(Embeddings):
    """
    A LangChain Embeddings class wrapper for the Gemini embedding function.
    This allows the Gemini embedder to be used with LangChain's vector stores.

    Args:
        gemini_embed_func: Callable ``(text, task_type=...)`` returning an object
            with ``.tolist()`` (e.g. a NumPy array), or ``None`` on failure.
        embedding_dim: Length of the zero vector substituted when embedding
            fails. Defaults to 768, the value previously hard-coded; it must
            match the dimension of the embedding model actually in use.
    """
    def __init__(self, gemini_embed_func, embedding_dim: int = 768):
        self.gemini_embed_func = gemini_embed_func
        self.embedding_dim = embedding_dim

    def _vector_or_zeros(self, embedding, failure_message: str) -> List[float]:
        """Convert an embedding to a list, or warn and return a zero vector if it is None."""
        if embedding is None:
            # NOTE(review): a zero vector keeps the pipeline running but carries no
            # similarity signal; callers relying on retrieval quality should watch
            # for this warning.
            print(failure_message)
            return [0.0] * self.embedding_dim
        return embedding.tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Embeds a list of documents for storage in a vector store.
        Uses "RETRIEVAL_DOCUMENT" task type for optimal retrieval performance.

        Returns one vector per input text, in input order; failed embeddings
        are replaced by zero vectors of length ``embedding_dim``.
        """
        return [
            self._vector_or_zeros(
                self.gemini_embed_func(text, task_type="RETRIEVAL_DOCUMENT"),
                "Warning: Embedding failed for a document. Using a zero vector.",
            )
            for text in texts
        ]

    def embed_query(self, text: str) -> List[float]:
        """
        Embeds a single query string for similarity search.
        Uses "RETRIEVAL_QUERY" task type.

        Returns a zero vector of length ``embedding_dim`` if embedding fails.
        """
        return self._vector_or_zeros(
            self.gemini_embed_func(text, task_type="RETRIEVAL_QUERY"),
            "Warning: Embedding failed for the query. Using a zero vector.",
        )


In [6]:
# =============================================================================
# VECTOR STORE FUNCTIONS
# =============================================================================

def create_vectorstore(chunks: List[Document], embedding_function_instance: Embeddings, vectorstore_path: str):
    """
    Builds a Chroma vector store from document chunks, persisted at ``vectorstore_path``.

    Chunks are de-duplicated by a deterministic UUIDv5 of their ``page_content``;
    the first occurrence of each distinct content is kept, and that UUID is used
    as the document's ID in the store.

    Args:
        chunks: Document chunks to index.
        embedding_function_instance: Embeddings implementation handed to Chroma.
        vectorstore_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The Chroma vector store returned by ``Chroma.from_documents``.

    NOTE(review): ``Chroma.from_documents`` is invoked on every call, so the
    chunks are embedded each time even if the directory already holds a store.
    """
    print(f"Attempting to create/load vector store at: {vectorstore_path}")

    # Map content-derived ID -> first chunk with that content. A dict keeps
    # insertion order, so IDs and chunks stay paired positionally; building the
    # ID list from a set would scramble that pairing.
    chunk_by_id = {}
    for chunk in chunks:
        doc_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        chunk_by_id.setdefault(doc_id, chunk)

    unique_ids = list(chunk_by_id)
    unique_chunks = list(chunk_by_id.values())

    print(f"Found {len(unique_chunks)} unique chunks out of {len(chunks)} total.")

    # Create the Chroma database; ids[i] is the content hash of documents[i]
    vectorstore = Chroma.from_documents(
        documents=unique_chunks,
        ids=unique_ids,
        embedding=embedding_function_instance,
        persist_directory=vectorstore_path
    )

    print(f"Vector store created/loaded and persisted successfully at: {vectorstore_path}")
    return vectorstore


def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one RAG context string.

    Documents are separated by a blank line; an empty input yields ``""``.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

In [7]:
# =============================================================================
# DOCUMENT PROCESSING
# =============================================================================

def load_and_split_documents(file_path: str) -> List[Document]:
    """Read a PDF with PyPDFLoader and cut the loaded pages into overlapping chunks.

    Args:
        file_path: Path of the PDF to load.

    Returns:
        The chunks produced by a RecursiveCharacterTextSplitter configured for
        1500-character chunks with a 200-character overlap.
    """
    print(f"Loading document: {file_path}")

    loaded_pages = PyPDFLoader(file_path).load()

    # Separators are tried in this order: paragraph break, line break, single space.
    splitter_options = dict(
        chunk_size=1500,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", " "],
    )
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(loaded_pages)

    print(f"Document split into {len(chunks)} chunks")
    return chunks


In [8]:
# =============================================================================
# RAG SETUP AND QUERIES
# =============================================================================

def setup_rag_chains(vectorstore, llm):
    """Build one retrieval-augmented extraction chain per ESG section.

    Each chain retrieves context from ``vectorstore``, fills the shared prompt
    with that context and the incoming question, and asks ``llm`` for output
    structured as the section's Pydantic model.

    Args:
        vectorstore: Store exposing ``as_retriever``.
        llm: Chat model exposing ``with_structured_output``.
            NOTE(review): ``strict=True`` is forwarded to that method; confirm
            the chat model in use accepts the keyword.

    Returns:
        A 5-tuple of chains in this order: supply chain, conducted actions,
        planned actions, medium/long-term goals, campaigns.
    """
    # Plain similarity search returning the 5 closest chunks per question
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

    # Shared prompt; {context} and {question} are the only template variables
    rag_prompt_template = """You are an expert ESG (Environmental, Social, Governance) data extraction specialist with deep knowledge of sustainability reporting standards, environmental metrics, and corporate responsibility frameworks.

      ## TASK OVERVIEW:
      Your primary objective is to extract precise, structured information from ESG reports and sustainability documents. You must analyze the provided context thoroughly and extract relevant data according to the specified Pydantic model structure.

      ## EXTRACTION PRINCIPLES:
      1. **ACCURACY FIRST**: Base all responses strictly on the provided context. Never invent, assume, or extrapolate information not present in the source material.

      2. **COMPREHENSIVE ANALYSIS**: 
         - Examine ALL provided context chunks thoroughly
         - Look for information across different sections, pages, and document parts
         - Consider synonyms, alternative terminology, and related concepts
         - Connect information that may be scattered across multiple paragraphs

      3. **DIRECT EVIDENCE REQUIREMENT**:
         - For each extracted piece of information, provide the EXACT source text that supports your answer
         - Use direct quotes whenever possible, preserving original wording and formatting
         - When paraphrasing is necessary, stay as close to the original language as possible

      4. **STRUCTURED OUTPUT COMPLIANCE**:
         - Follow the Pydantic model structure exactly as specified
         - Ensure all required fields are populated appropriately
         - Use "Information not available in the provided context" for missing data rather than leaving fields empty

      5. **NUMERICAL DATA PRECISION**:
         - Extract exact figures, percentages, dates, and metrics when available
         - Preserve units of measurement, currency symbols, and time periods
         - Note any baseline years, comparison periods, or contextual qualifiers

      6. **REASONING TRANSPARENCY**:
         - Clearly explain how each answer was derived from the source material
         - Show logical connections between questions and relevant context sections
         - Highlight any assumptions or interpretations made during extraction

      ## CONTEXT ANALYSIS GUIDELINES:
      - **Multiple Perspectives**: Consider how information might be presented in different formats (tables, charts, narrative text, bullet points)
      - **Cross-References**: Look for information that spans multiple sections or references other parts of the document
      - **Implicit Information**: Extract information that might be implied or stated indirectly
      - **Temporal Context**: Pay attention to reporting periods, historical comparisons, and future projections

      ## QUALITY ASSURANCE:
      - **Consistency Check**: Ensure extracted information aligns logically across all fields
      - **Completeness Review**: Verify that all available relevant information has been captured
      - **Accuracy Validation**: Double-check that source quotes exactly match the original text

      ## HANDLING MISSING INFORMATION:
      If specific information is not available in the context:
      - State explicitly: "This information is not available in the provided context"
      - Do not guess, estimate, or use general knowledge to fill gaps
      - Provide reasoning for why the information might be missing or where it might typically be found

      ---

      ## DOCUMENT CONTEXT:
      {context}

      ---

      ## EXTRACTION REQUEST:
      {question}

      ---

      ## RESPONSE INSTRUCTIONS:
      Analyze the above context thoroughly and extract the requested information following the Pydantic model structure. Ensure every piece of extracted data is supported by direct evidence from the provided context."""
    rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)

    def build_chain(schema):
        """Assemble retriever -> prompt -> structured-output LLM for one schema."""
        return (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | rag_prompt
            | llm.with_structured_output(schema, strict=True)
        )

    # Order here defines the order of the returned tuple
    section_schemas = (
        EnvironmentalSupplyChainData,
        EnvironmentalConductedActionsSection,
        EnvironmentalPlannedActionsSection,
        EnvironmentalMediumLongTermGoalsSection,
        EnvironmentalCampaignsList,
    )
    return tuple(build_chain(schema) for schema in section_schemas)

def get_extraction_questions():
    """Build the five extraction prompts that are fed to the RAG chains.

    Each prompt is a fixed, multi-line instruction block; nothing is
    interpolated at call time.

    Returns:
        tuple[str, str, str, str, str]: the prompts in this fixed order:
        supply chain, conducted actions, planned actions (next reporting
        period), medium/long-term goals, environmental campaigns.
    """

    # --- Supply chain management and purchased-goods tracking ---------------
    supply_chain_prompt = """
      Comprehensively analyze the ESG report to extract detailed, structured information about the organization's environmental approach to supply chain management and purchased goods tracking during the current reporting period.

      ## SUPPLY CHAIN ENVIRONMENTAL IMPACT MONITORING

      **1. Supply Chain Environmental Impact Assessment:**
      - Does the organization actively monitor, measure, and assess the environmental impact of its supply chain operations? (Answer: Yes/No/Partially)
      - What specific environmental impacts are monitored (e.g., carbon emissions, water usage, waste generation, energy consumption, biodiversity impact)?
      - What methodologies, frameworks, or standards are used for measurement (e.g., Scope 3 emissions calculation, life cycle assessment, carbon footprint analysis)?
      - What are the key findings or metrics from supply chain environmental impact assessments?
      - Which supply chain stages or tiers are included in environmental monitoring (Tier 1, Tier 2, upstream, downstream)?

      ## SUPPLIER ENVIRONMENTAL ENGAGEMENT & ASSESSMENT

      **2. Supplier Environmental Assessment Programs:**
      - Does the organization conduct formal environmental assessments or evaluations of suppliers? (Answer: Yes/No/Partially)
      - What specific environmental criteria are assessed (e.g., environmental management systems, certifications, compliance, performance metrics)?
      - What assessment methods are used (questionnaires, audits, third-party evaluations, self-assessments, on-site visits)?
      - How frequently are supplier environmental assessments conducted (annually, bi-annually, upon onboarding)?
      - What are the consequences or follow-up actions for suppliers with poor environmental performance?

      **3. Supplier Environmental Engagement & Collaboration:**
      - What specific engagement activities does the organization undertake with suppliers on environmental topics?
      - Are there supplier training programs, workshops, or capacity-building initiatives on environmental matters?
      - Does the organization collaborate with suppliers on environmental improvement projects or innovations?
      - Are there supplier recognition programs or incentives for environmental performance excellence?
      - What support or resources does the organization provide to help suppliers improve their environmental performance?

      ## SUPPLIER SCREENING & SELECTION

      **4. Environmental Supplier Screening Process:**
      - Does the organization implement environmental criteria in supplier screening and selection processes? (Answer: Yes/No/Partially)
      - What percentage of new suppliers were screened using environmental criteria during the reporting period? (Extract exact percentage)
      - What percentage of existing suppliers were re-screened or re-evaluated using environmental criteria? (Extract exact percentage)
      - What specific environmental screening criteria are applied (e.g., ISO 14001 certification, carbon disclosure, waste management practices, regulatory compliance)?
      - What is the process for suppliers who do not meet environmental screening criteria?
      - Are there any environmental prerequisites or mandatory requirements for supplier qualification?

      ## SUPPLIER CODE OF CONDUCT & CONTRACTUAL REQUIREMENTS

      **5. Supplier Environmental Code of Conduct:**
      - Does the organization maintain a supplier code of conduct that explicitly includes environmental requirements and expectations? (Answer: Yes/No/Partially)
      - What percentage of suppliers have formally signed or acknowledged the environmental code of conduct during the reporting period? (Extract exact percentage)
      - What percentage of total procurement spend is covered by suppliers who have signed the code of conduct?
      - What specific environmental topics are covered in the code of conduct (e.g., emissions reduction, waste management, resource efficiency, biodiversity protection)?
      - What are the enforcement mechanisms and consequences for code of conduct violations?
      - How frequently is the environmental code of conduct updated or revised?

      **6. Contractual Environmental Requirements:**
      - Are environmental clauses, terms, or requirements integrated into supplier contracts and agreements?
      - What percentage of procurement contracts include specific environmental performance requirements?
      - Are there contractual penalties or incentives tied to environmental performance?

      ## PURCHASED GOODS & MATERIALS TRACKING

      **7. Purchased Goods Quantity Tracking:**
      - Does the organization systematically track and record the quantity (in tons/year or other units) of purchased materials and goods? (Answer: Yes/No/Partially)
      - What categories of purchased goods are tracked (e.g., merchandising products, food and beverages, metallic materials, plastics, packaging materials, textiles, electronics, chemicals, construction materials)?

      **8. Detailed Material Categories & Quantities:**
      For each category where tracking exists, extract:
      - **Category Name**: Specific type of material or good
      - **Quantity Tracked**: Annual volumes in tons/year or other specified units
      - **Tracking Methodology**: How quantities are measured and recorded
      - **Data Quality**: Completeness and reliability of tracking data
      - **Reporting Period**: Time period covered by the tracking data

      **9. Purchased Goods Environmental Impact:**
      - Does the organization assess or calculate the environmental impact of purchased goods and materials?
      - Are there specific environmental metrics tracked for purchased goods (e.g., embodied carbon, water footprint, recyclability)?
      - What documentation, databases, or systems are used for tracking purchased goods?
      - Are there any certifications or standards applied to purchased materials (e.g., FSC, EPEAT, ENERGY STAR, organic certifications)?

      ## SUPPLY CHAIN ENVIRONMENTAL INITIATIVES

      **10. Supply Chain Environmental Programs:**
      - What specific environmental initiatives or programs are implemented across the supply chain?
      - Are there collaborative projects with suppliers focused on environmental improvements?
      - What investments or resources are allocated to supply chain environmental improvements?
      - Are there any supply chain environmental targets or commitments for the next reporting period?

      ## DATA SOURCES & VERIFICATION

      **11. Documentation & Evidence:**
      - What specific reports, systems, or documentation support the supply chain environmental information?
      - Are there third-party verifications or audits of supply chain environmental data?
      - What are the data collection challenges or limitations mentioned?

      **CRITICAL EXTRACTION GUIDELINES:**
      - **Quantitative Precision**: Extract exact percentages, quantities, and numerical data with units and time periods
      - **Yes/No Clarity**: Provide definitive answers where possible, use "Partially" only when explicitly supported by mixed evidence
      - **Source Attribution**: Reference specific sections, pages, or documents where information is found
      - **Missing Information Protocol**: For each question where no information is available, explicitly state: "This information is not available in the provided context"
      - **Comprehensive Coverage**: Look for information across different report sections including sustainability reports, supply chain sections, procurement policies, and appendices
      - **Terminology Variations**: Consider alternative terms like "value chain," "sourcing," "procurement," "vendor management," "third-party suppliers"
      - **Temporal Context**: Note whether information refers to the current reporting period, historical data, or future commitments
      """

    # --- Actions already conducted in the current reporting period ----------
    conducted_actions_prompt = """
      Analyze the ESG report and extract detailed information about CONDUCTED environmental actions during the current reporting period.
      
      For each of these environmental categories, identify:
      - Total emissions (including Scope 1, 2, and 3 emissions, carbon footprint, GHG emissions)
      - Total business travel (including team travel, staff travel, transportation)
      - Total energy consumption (including electricity, gas, fuel, renewable energy usage)
      - Total water consumption (including water usage, conservation efforts)
      - Total paper consumption (including digital transformation efforts, paperless initiatives)
      - Total waste generation (including waste reduction, recycling, circular economy efforts)
      
      For EACH category found, extract:
      1. PROGRESS ACHIEVED: What specific measurable progress was made? Include quantitative metrics (percentages, absolute numbers, comparisons to baselines or previous years).
      2. ACTUAL TARGETS: What were the specific targets or goals set for this reporting period? Include numerical targets and timeframes.
      3. REASONS FOR PROGRESS: What specific actions, initiatives, or factors led to the achieved progress? Include implementation details and methodologies.
      
      Important guidelines:
      - Look for synonyms and related terms (e.g., "carbon footprint" for emissions, "energy efficiency" for energy consumption)
      - Extract exact figures, percentages, and metrics when available
      - Include both absolute and relative improvements
      - Note any certifications, standards, or methodologies mentioned
      - If no information is found for a category, clearly state "No conducted actions reported for this category"
      """

    # --- Short-term plans: next reporting period only -----------------------
    planned_actions_prompt = """
      Analyze the ESG report and extract detailed information about PLANNED environmental actions specifically for the NEXT REPORTING PERIOD ONLY (typically the upcoming 12 months or next fiscal year).
      
      IMPORTANT: Focus STRICTLY on SHORT-TERM plans for the immediate next reporting cycle. EXCLUDE any medium-term (2-5 years) or long-term (5+ years) strategic goals, commitments, or targets - these will be handled separately.
      
      For each of these environmental categories, identify ONLY next-period plans:
      - Total emissions (including immediate reduction initiatives, next-year offset plans, short-term efficiency measures)
      - Total business travel (including next-period travel policies, immediate transport changes)
      - Total energy consumption (including next-year efficiency projects, immediate renewable energy implementations)
      - Total water consumption (including next-period conservation measures, immediate efficiency upgrades)
      - Total paper consumption (including next-year digitization phases, immediate reduction initiatives)
      - Total waste generation (including next-period reduction programs, immediate recycling improvements)
      
      For EACH category found, extract ONLY:
      1. PLANNED/TARGETED PROGRESS: What specific measurable targets are set for the NEXT reporting period only? Include numerical goals and percentages specifically for the upcoming 12 months or next fiscal year.
      2. PLANNED ACTIONS: What specific initiatives, strategies, or actions will be implemented during the NEXT reporting period? Include immediate implementation plans, next-period deliverables, and short-term milestones.
      
      Identifying next-period actions - look for language such as:
      - "Next year we will..."
      - "In the coming year..."
      - "For [next fiscal year]..."
      - "In the next 12 months..."
      - "Our immediate plans include..."
      - "Next reporting period targets..."
      - "Upcoming initiatives..."
      - "Short-term actions..."
      
      EXCLUDE language indicating longer timeframes:
      - "By 2030/2035/2040..." (medium/long-term)
      - "Our 5-year plan..." (medium-term)
      - "Strategic roadmap to 2030..." (long-term)
      - "Long-term commitment to..." (strategic goals)
      - "Over the next decade..." (long-term)
      
      Important guidelines:
      - Be very strict about timeframe - only include actions explicitly planned for the next reporting cycle
      - Focus on operational and tactical plans rather than strategic commitments
      - Look for specific next-period budgets, resources, or project phases
      - Include immediate partnerships or collaborations starting in the next period
      - Capture next-period compliance requirements or regulatory implementations
      - If no next-period specific plans are mentioned for a category, clearly state "No planned actions reported for the next reporting period for this category"
      """

    # --- Medium-term (2-5 years) and long-term (5+ years) goals -------------
    long_horizon_goals_prompt = """
      Analyze the ESG report comprehensively and extract detailed information about MEDIUM-TERM (2-5 years) and LONG-TERM (5+ years) environmental goals, targets, pledges, commitments, and strategic objectives.
      
      TIMEFRAME FOCUS: Look specifically for goals with target dates beyond the next immediate reporting period, including:
      - Medium-term goals: 2-5 years out (e.g., by 2027-2030)
      - Long-term goals: 5+ years out (e.g., by 2030, 2035, 2040, 2050 and beyond)
      - Strategic commitments with extended timelines
      - Multi-year roadmaps and transformation plans

      For EACH of the following environmental categories, thoroughly analyze and determine if medium/long-term goals have been established. The category names MUST be exactly as follows:
      
      **CORE ENVIRONMENTAL CATEGORIES:**
      - Emissions (carbon neutrality, net-zero, GHG reduction targets, scope 1/2/3 commitments)
      - Travel (sustainable transport, business travel reduction, logistics optimization)
      - Energy (renewable energy transition, energy efficiency, power sourcing commitments)
      - Water (conservation targets, usage reduction, water stewardship goals)
      - Paper & Digital Transformation (paperless operations, digitization commitments)
      - Waste (zero-waste goals, circular economy targets, waste reduction pledges)
      - Food & Nutrition (sustainable sourcing, food waste reduction, nutritional impact goals)
      - Climate Change (climate resilience, adaptation strategies, climate risk mitigation)
      - Biodiversity (nature protection, biodiversity restoration, ecosystem preservation)
      - Supply Chain (supplier engagement, value chain decarbonization, sustainable procurement)
      - Other (identify any additional environmental goal areas not listed above)
      
      For EACH category where medium/long-term goals are identified, extract the following comprehensive information:
      
      1. **GOAL DEFINITION & SCOPE**: 
         - Clear, detailed definition of the goal(s) and what they encompass
         - Scope of application (global, regional, specific operations)
         - Any sub-goals or phased approaches within the main commitment
      
      2. **MEASURABILITY ASSESSMENT**:
         - Is the goal quantitatively measurable? (Answer: 'Yes', 'No', or 'Partially')
         - Are there specific KPIs, metrics, or indicators defined?
      
      3. **MEASURABLE SPECIFICATIONS**:
         - Exact numerical targets, percentages, or reduction amounts
         - Baseline years and reference points for measurements
         - Units of measurement and calculation methodologies
         - Any interim milestones or checkpoints specified
      
      4. **TEMPORAL FRAMEWORK**:
         - Year the goal(s) were first established or announced
         - Target achievement year(s) or deadline(s)
         - Any revised timelines or updated commitments
         - Interim milestone years if specified
      
      5. **STRATEGIC ALIGNMENT**:
         - Connection to broader corporate strategy or sustainability framework
         - Alignment with external standards (SBTi, Paris Agreement, SDGs, etc.)
         - Integration with industry initiatives or sector commitments
      
      6. **COMMUNICATION & TRANSPARENCY**:
         - Do they actively communicate these goals? (Answer: 'Yes', 'No', or 'Limited')
         - Communication channels and methods used (annual reports, sustainability reports, website, press releases, investor communications)
         - Frequency of progress reporting and updates
      
      7. **COMMUNICATION EVIDENCE**:
         - Specific document sections, page numbers, or website locations where goals are communicated
         - Exact quotes or statements about the commitments
         - Public announcements, press releases, or external communications
      
      8. **IMPLEMENTATION APPROACH**:
         - Strategic initiatives, programs, or investments planned to achieve goals
         - Governance structures or responsible parties for goal delivery
         - Resource allocation or budget commitments mentioned
         - Partnerships or external collaborations for goal achievement
      
      **IDENTIFICATION KEYWORDS** - Look for language such as:
      - "By 2030/2035/2040/2050..."
      - "Long-term commitment to..."
      - "Strategic goal to achieve..."
      - "Our roadmap to..."
      - "Multi-year target..."
      - "Decade commitment..."
      - "Net-zero by..."
      - "Carbon neutral by..."
      - "Sustainable future vision..."
      
      **CRITICAL GUIDELINES:**
      - **Evidence-Based Only**: Extract information strictly supported by the provided text - do not infer or assume details not explicitly stated
      - **Temporal Precision**: Be exact about timeframes and distinguish between medium-term vs. long-term commitments
      - **Goal Hierarchy**: Identify both overarching strategic goals and specific sub-targets within categories
      - **Missing Information Protocol**: If no medium/long-term goals exist for a category, explicitly state: "No medium or long-term goals identified for this category in the provided context"
      - **Incomplete Data Handling**: If partial information is available, extract what exists and note what information is missing
      - **Verification Focus**: Look for evidence of goal validation, third-party verification, or external endorsement of commitments
      """

    # --- Campaigns, initiatives and programs --------------------------------
    campaigns_prompt = """
      Comprehensively analyze the ESG report to identify and extract detailed information about ALL environmental campaigns, initiatives, activities, and programs mentioned.
      
      Look for various types of environmental activities including:
      - Awareness campaigns and educational programs
      - Community engagement initiatives
      - Sustainability programs and projects
      - Environmental partnerships and collaborations
      - Green initiatives and eco-friendly activities
      - Climate action programs
      - Conservation efforts and biodiversity projects
      - Stakeholder engagement on environmental issues
      - Environmental advocacy and policy initiatives
      - Green technology implementations
      
      For EACH campaign/activity identified, extract comprehensive details:
      
      1. NAME: The specific name or title of the campaign/activity. If no formal name exists, create a descriptive title based on the content.
      
      2. PURPOSE & GOAL: The intended objectives, mission, and aspirational outcomes. What environmental impact is the campaign trying to achieve?
      
      3. DESCRIPTION & SCOPE: Detailed description of what the campaign involves, its scope, duration, geographic reach, and key components or phases.
      
      4. RESULTS: Both qualitative and quantitative outcomes, impacts, and achievements. Include metrics, participation numbers, environmental benefits, and success indicators.
      
      5. ENVIRONMENTAL ISSUES: Specific environmental challenges, problems, or areas addressed (e.g., climate change, biodiversity loss, pollution, resource depletion).
      
      6. COLLABORATION PARTNERS: Names of all partners involved, including:
         - Fan clubs and supporter groups
         - Local communities and neighborhoods
         - Charities and non-profit organizations
         - Foundations and environmental groups
         - Sponsorship and business partners
         - Government agencies and institutions
         - Academic institutions and schools
      
      7. REACH: Number of people, communities, or stakeholders potentially reached or engaged by the campaign.
      
      8. SDGs ADDRESSED: Which of the 17 UN Sustainable Development Goals are specifically addressed, with explanations of how the campaign contributes to each goal.
      
      Important guidelines:
      - Don't miss smaller initiatives or brief mentions of environmental activities
      - Look for both formal programs and informal/ad-hoc environmental efforts
      - Extract information even if incomplete - capture what's available
      - Consider the full lifecycle of campaigns from planning to evaluation
      - Include both internal (employee-focused) and external (community-focused) campaigns
      - Look for environmental messaging, communications, and awareness efforts
      """

    # Order matters: callers unpack this tuple positionally.
    return (
        supply_chain_prompt,
        conducted_actions_prompt,
        planned_actions_prompt,
        long_horizon_goals_prompt,
        campaigns_prompt,
    )



In [9]:
# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """Run the full ESG extraction pipeline and write the result as JSON.

    The pipeline splits the input PDF into chunks, wraps the Gemini embedder,
    creates or loads the Chroma vector store, builds one RAG chain per
    extraction task, invokes each chain with its question, merges the five
    results into an ``ESGReportExtractedInfo`` and dumps it to
    ``output_filepath``.

    Reads the module-level names ``input_filepath``, ``output_filepath``,
    ``CHROMA_DB_PATH`` and ``GEMINI_API_KEY``.

    Returns:
        bool: True when the JSON file was written and the summary printed;
        False when an exception was raised while saving or summarising.
        Exceptions from the earlier steps are not caught here and propagate
        to the caller.
    """
    print("=" * 80)
    print("ESG DATA EXTRACTION PIPELINE")
    print("=" * 80)
    
    # 1. Load and process documents
    print("\n1. Loading and processing documents...")
    chunks = load_and_split_documents(input_filepath)
    
    # 2. Initialize embedding function
    print("\n2. Initializing embedding function...")
    raw_gemini_embedder = get_gemini_embedding_function()
    custom_langchain_embeddings = CustomGeminiEmbeddings(raw_gemini_embedder)
    
    # 3. Create/load vector store
    print("\n3. Creating/loading vector store...")
    vectorstore = create_vectorstore(
        chunks=chunks,
        embedding_function_instance=custom_langchain_embeddings,
        vectorstore_path=CHROMA_DB_PATH
    )
    
    # 4. Initialize LLM and setup RAG chains
    print("\n4. Setting up RAG chains...")
    llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2, google_api_key=GEMINI_API_KEY)

    env_supply_chain_chain, env_conducted_actions_chain, env_planned_actions_chain, env_medium_long_term_goals_chain, env_campaigns_chain = setup_rag_chains(vectorstore, llm)

    # Questions are returned in the same order as the chains above.
    env_supply_chain_question, env_conducted_actions_question, env_planned_actions_question, env_medium_long_term_goals_question, env_campaigns_question = get_extraction_questions()

    # 5. Extract information (one chain invocation per extraction task)
    print("\n5. Extracting information...")
    print("   - Extracting supply chain information...")
    extracted_env_supply_chain = env_supply_chain_chain.invoke(env_supply_chain_question)

    print("   - Extracting conducted actions...")
    extracted_env_conducted_actions = env_conducted_actions_chain.invoke(env_conducted_actions_question)
    
    print("   - Extracting planned actions...")
    extracted_env_planned_actions = env_planned_actions_chain.invoke(env_planned_actions_question)

    print("   - Extracting medium/long-term goals...")
    extracted_env_medium_long_term_goals = env_medium_long_term_goals_chain.invoke(env_medium_long_term_goals_question)
    
    print("   - Extracting environmental campaigns...")
    extracted_env_campaigns = env_campaigns_chain.invoke(env_campaigns_question)
    
    # 6. Combine results
    print("\n6. Combining results...")
    extracted_esg_data = ESGReportExtractedInfo(
        env_supply_chain=extracted_env_supply_chain,
        env_actions_conducted=extracted_env_conducted_actions,
        env_actions_planned=extracted_env_planned_actions,
        env_medium_long_term_goals=extracted_env_medium_long_term_goals,
        env_campaigns=extracted_env_campaigns
    )
    
    # 7. Save results
    print("\n7. Saving results...")
    try:
        json_output = extracted_esg_data.model_dump_json(indent=2)
        
        with open(output_filepath, 'w', encoding='utf-8') as f:
            f.write(json_output)
        
        print(f"✓ Data successfully saved to: {output_filepath}")
        # The "else 0" fallback must sit outside len(): with the conditional
        # inside the call, a missing section evaluates len(0) and raises
        # TypeError, which would report a successful save as a failure.
        print(f"✓ Extracted {len(extracted_esg_data.env_campaigns.campaigns) if extracted_esg_data.env_campaigns else 0} environmental campaigns")
        print(f"✓ Extracted {len(extracted_esg_data.env_medium_long_term_goals.goals_data) if extracted_esg_data.env_medium_long_term_goals else 0} medium/long-term goals")
        print(f"✓ Extracted {len(extracted_esg_data.env_actions_conducted.categories_data) if extracted_esg_data.env_actions_conducted else 0} conducted action categories")
        print(f"✓ Extracted {len(extracted_esg_data.env_actions_planned.categories_data) if extracted_esg_data.env_actions_planned else 0} planned action categories")
        
    except Exception as e:
        print(f"❌ Error saving results: {e}")
        return False
    
    print("\n" + "=" * 80)
    print("EXTRACTION COMPLETED SUCCESSFULLY!")
    print("=" * 80)
    
    return True


# Entry point: run the pipeline and signal failure with a non-zero exit status.
if __name__ == "__main__":
    try:
        success = main()
        if not success:
            # Raise SystemExit directly instead of calling exit(): exit() is
            # the interactive helper injected by the site module and is not
            # meant for programs. SystemExit is not an Exception subclass, so
            # the handler below does not swallow it.
            raise SystemExit(1)
    except Exception as e:
        print(f"❌ Error during execution: {e}")
        raise SystemExit(1)

ESG DATA EXTRACTION PIPELINE

1. Loading and processing documents...
Loading document: ../data/LFC.pdf
Document split into 62 chunks

2. Initializing embedding function...

3. Creating/loading vector store...
Attempting to create/load vector store at: ../vectorstores/lfc_report_vectorstore
Found 62 unique chunks out of 62 total.
Vector store created/loaded and persisted successfully at: ../vectorstores/lfc_report_vectorstore

4. Setting up RAG chains...

5. Extracting information...
   - Extracting supply chain information...
   - Extracting conducted actions...
   - Extracting planned actions...
   - Extracting medium/long-term goals...
   - Extracting environmental campaigns...

6. Combining results...

7. Saving results...
✓ Data successfully saved to: ../output/lfc_extracted_data.json
✓ Extracted 8 environmental campaigns
✓ Extracted 11 medium/long-term goals
✓ Extracted 6 conducted action categories
✓ Extracted 6 planned action categories

EXTRACTION COMPLETED SUCCESSFULLY!
