In [None]:
"""
Executive Summary Agent - MVP
Author: Alejandro
Objective: Demonstrate an advanced RAG flow for summarizing financial PDF documents.
"""
import os
import sys
from pathlib import Path
import re

# --- LangChain and RAG Components ---
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# --------------------------------------------------------------------
# 1. DOCUMENT PROCESSING COMPONENTS
# --------------------------------------------------------------------
def load_pdf(file_path: str) -> str:
    """Load a document from disk and return its text.

    PDF files are read page by page with pdfplumber. Files whose extension is
    ``.txt`` are read directly as UTF-8 text, so a plain-text document can be
    pushed through the same pipeline without going through the PDF parser.

    Args:
        file_path: Path to the PDF (or ``.txt``) file.

    Returns:
        The extracted text with leading/trailing whitespace removed. An empty
        string is returned when the file does not exist or cannot be read;
        the failure reason is printed rather than raised.
    """
    if not os.path.exists(file_path):
        return ""

    # Plain-text input: pdfplumber cannot open it, so read it as-is.
    if os.path.splitext(file_path)[1].lower() == ".txt":
        print(f"[INFO] Loading plain-text document: {file_path}")
        try:
            with open(file_path, encoding="utf-8") as handle:
                return handle.read().strip()
        except (OSError, UnicodeDecodeError) as e:
            print(f"[ERROR] Failed to read text file: {e}")
            return ""

    print(f"[INFO] Loading PDF with pdfplumber: {file_path}")
    page_texts = []
    try:
        import pdfplumber
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                # Depending on the pdfplumber version, extract_text() can
                # yield None for a page without a text layer; treat that as
                # an empty page instead of aborting the whole document.
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        # Best-effort loader: any parser failure is reported and mapped to "".
        print(f"[ERROR] Failed to process PDF: {e}")
        return ""

    # Pages are separated by a blank line; join avoids repeated concatenation.
    return "\n\n".join(page_texts).strip()


def clean_text(text: str) -> str:
    """Strip repetitive report headers and low-content lines from *text*.

    The text is processed line by line. A line is dropped when, after
    stripping whitespace, it is:

    * the repeated "GROUP MANAGEMENT REPORT" page header,
    * the section-navigation line printed below that header,
    * a run of single digits separated by whitespace (e.g. ``1 2 3 4``), or
    * 20 characters or shorter.

    Args:
        text: Raw text extracted from the report.

    Returns:
        The surviving lines, stripped and joined with ``\\n``. Empty input
        yields an empty string.
    """
    if not text:
        return ""

    # The header separates its parts with a dash. The dash is written as a
    # \u escape (hyphen, en dash or em dash accepted) so the pattern does not
    # depend on how this source file is encoded.
    dash = r'[-\u2013\u2014]'
    adidas_header_pattern = re.compile(
        r'^TO OUR GROUP MANAGEMENT REPORT ' + dash +
        r' GROUP MANAGEMENT REPORT ' + dash +
        r' GROUP MANAGEMENT REPORT ' + dash +
        r' CONSOLIDATED ADDITIONAL$'
    )
    section_line = (
        "SHAREHOLDERS OUR COMPANY FINANCIAL REVIEW SUSTAINABILITY STATEMENT "
        "FINANCIAL STATEMENTS INFORMATION"
    )
    digit_run_pattern = re.compile(r'(\d\s+)+\d')

    cleaned_lines = []
    for line in text.split('\n'):
        stripped = line.strip()

        # 1. Remove the exact repetitive header
        if adidas_header_pattern.match(stripped):
            continue

        # 2. Remove the section line (appears below the header)
        if stripped == section_line:
            continue

        # 3. Remove lines made only of single digits separated by whitespace
        if digit_run_pattern.fullmatch(stripped):
            continue

        # 4. Keep lines with substantial content
        if len(stripped) > 20:
            cleaned_lines.append(stripped)

    return '\n'.join(cleaned_lines)


def chunk_text(text: str, chunk_size: int = 4000, chunk_overlap: int = 500) -> list[str]:
    """Split *text* into overlapping fragments with LangChain's recursive splitter.

    Args:
        text: The text to split.
        chunk_size: Target fragment size, measured with ``len``.
        chunk_overlap: Overlap between consecutive fragments, measured with ``len``.

    Returns:
        The list of fragments, or an empty list when *text* is empty.
    """
    if text:
        print(f"[INFO] Dividing text into chunks (size={chunk_size}, overlap={chunk_overlap})...")

        # Separators are listed from coarsest (blank line) to finest (character).
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "length_function": len,
            "separators": ["\n\n", "\n", " ", ""],
        }
        return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

    return []


# --------------------------------------------------------------------
# 2. RAG COMPONENTS - EMBEDDINGS, VECTORSTORE, SEARCH
# --------------------------------------------------------------------
def create_and_index_vectorstore(chunks: list[str]) -> FAISS:
    """Embed the given chunks and index them in a FAISS vectorstore.

    Args:
        chunks: Text fragments to embed.

    Returns:
        The FAISS vectorstore built from *chunks* using the
        ``all-MiniLM-L6-v2`` sentence-transformer embeddings.
    """
    print(f"[INFO] Creating embeddings and FAISS index for {len(chunks)} chunks...")

    store = FAISS.from_texts(
        texts=chunks,
        embedding=SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2"),
    )

    # Report how many vectors the underlying index holds.
    print(f"[INFO] Vectorstore created with {store.index.ntotal} vectors.")
    return store


def retrieve_financial_chunks(vectorstore: FAISS, query: str, top_k: int = 10) -> list[str]:
    """Retrieve chunks for *query* and re-rank them by financial content.

    The vectorstore is asked for ``top_k * 2`` semantically similar documents.
    Each one is then scored with simple lexical heuristics:

    * +1 for every financial keyword present (case-insensitive),
    * +2 if it contains a number with a decimal/thousands separator,
    * +3 if it contains a currency symbol (euro, dollar or pound),
    * +2 if it contains a percentage.

    Args:
        vectorstore: Store exposing ``similarity_search(query, k=...)`` whose
            results carry a ``page_content`` string.
        query: Search query passed to the similarity search.
        top_k: Number of chunks to return.

    Returns:
        The ``page_content`` of the ``top_k`` highest-scoring documents,
        best first. Documents with equal scores keep the order returned by
        the similarity search (the sort is stable).
    """
    print(f"[INFO] Retrieving {top_k} financial chunks...")

    # Over-fetch so the lexical re-ranking has candidates to choose from.
    semantic_docs = vectorstore.similarity_search(query, k=top_k * 2)

    financial_keywords = (
        'revenue', 'income', 'profit', 'ebitda', 'margin',
        'cash flow', 'balance sheet', 'financial statement',
        'euro', 'million', 'billion', '%', 'growth',
        'sales', 'net income', 'operating', 'segment',
        'quarter', 'annual', 'forecast', 'guidance',
    )

    scored_docs = []
    for doc in semantic_docs:
        content = doc.page_content
        content_lower = content.lower()

        # Score by financial keywords
        score = sum(1 for keyword in financial_keywords if keyword in content_lower)

        # Bonus for numbers with decimals (likely financial figures)
        if re.search(r'\d+[\.,]\d+', content):
            score += 2

        # Bonus for currency symbols. Euro and pound are written as \u
        # escapes so the pattern does not depend on the file's encoding.
        if re.search(r'[\u20ac$\u00a3]', content):
            score += 3

        # Bonus for percentages
        if re.search(r'\d+\s*%', content):
            score += 2

        scored_docs.append((score, doc))

    # Sort by score and take the best ones
    scored_docs.sort(key=lambda pair: pair[0], reverse=True)

    if scored_docs:
        print(f"[DEBUG] Best chunk score: {scored_docs[0][0]}")

    return [doc.page_content for _, doc in scored_docs[:top_k]]


def filter_executive_chunks(all_chunks: list[str], min_financial_score: int = 20, max_chunks: int = 5) -> list[str]:
    """Keep only the chunks with the highest executive (financial) value.

    Each chunk is scored as follows:

    * +50 if it contains four whitespace-separated 4-digit numbers in a row
      (treated as a sign of a financial table),
    * +30 if it contains a euro amount with a decimal/thousands separator,
    * +20 if it contains a percentage,
    * -40 if it still contains the "TO OUR GROUP MANAGEMENT REPORT" header.

    Args:
        all_chunks: Candidate chunks.
        min_financial_score: Minimum score a chunk needs to be kept.
        max_chunks: Maximum number of chunks to return.

    Returns:
        Up to *max_chunks* chunks scoring at least *min_financial_score*,
        highest score first. Chunks with equal scores keep their input order.
    """
    scored = []

    for chunk in all_chunks:
        score = 0

        # High points for financial tables
        if re.search(r'\d{4}\s+\d{4}\s+\d{4}\s+\d{4}', chunk):  # Table pattern
            score += 50

        # Points for financial figures in euros. The euro sign is written as
        # a \u escape so the pattern does not depend on the file's encoding.
        if re.search(r'\u20ac\s*\d+[\.,]\d+', chunk):
            score += 30

        # Points for percentages
        if re.search(r'\d+\.?\d*\s*%', chunk):
            score += 20

        # Penalize residual headers
        if 'TO OUR GROUP MANAGEMENT REPORT' in chunk:
            score -= 40

        scored.append((score, chunk))

    # Order by score and take the best
    scored.sort(key=lambda pair: pair[0], reverse=True)
    best_chunks = [chunk for score, chunk in scored if score >= min_financial_score][:max_chunks]

    print(f"[FILTER] Top 3 scores: {[s for s, _ in scored[:3]]}")
    print(f"[FILTER] {len(best_chunks)}/{len(all_chunks)} chunks selected")

    return best_chunks


# --------------------------------------------------------------------
# 3. LLM COMPONENT - SUMMARY GENERATION
# --------------------------------------------------------------------
def initialize_groq_llm(model_name: str = "llama-3.3-70b-versatile"):
    """Build a ChatGroq client configured for summary generation.

    Args:
        model_name: Groq model to use for the primary client.

    Returns:
        A ``ChatGroq`` instance. If constructing it for *model_name* raises,
        a client for ``mixtral-8x7b-32768`` is returned instead.

    Raises:
        ValueError: If the ``GROQ_API_KEY`` environment variable is unset or empty.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise ValueError(
            "ERROR: GROQ_API_KEY not found."
        )

    # Settings shared by the primary and the fallback client:
    # low temperature for precision, 1500 tokens for an executive summary.
    shared_settings = {
        "groq_api_key": api_key,
        "temperature": 0.2,
        "max_tokens": 1500,
    }

    try:
        return ChatGroq(model_name=model_name, **shared_settings)
    except Exception as e:
        print(f"[WARNING] Error with model {model_name}: {e}")
        print("[WARNING] Trying alternative model 'mixtral-8x7b-32768'...")
        return ChatGroq(model_name="mixtral-8x7b-32768", **shared_settings)


def generate_executive_summary(retrieved_chunks: list[str], query: str) -> str:
    """Generate an executive summary from the retrieved chunks using Groq.

    The chunks are joined into a single context block and sent to the LLM
    through a fixed system/human prompt that prescribes the summary layout.

    Args:
        retrieved_chunks: Text fragments used as the only source of facts.
        query: Accepted for interface compatibility; it is not currently
            inserted into the prompt.

    Returns:
        The summary text produced by the model. If the chain invocation
        fails, an ``[ERROR]`` message describing the failure is returned
        instead of raising.

    Raises:
        ValueError: Propagated from ``initialize_groq_llm`` when the API key
            is missing (the client is created outside the ``try`` block).
    """
    print("[INFO] Initializing Groq LLM...")
    llm = initialize_groq_llm()

    # Chunks are separated by a visible rule so the model can tell them apart.
    context = "\n\n---\n\n".join(retrieved_chunks)

    prompt_template = ChatPromptTemplate.from_messages([
        ("system", """You are the CFO of a consulting firm, preparing an URGENT executive briefing for the CEO.

ABSOLUTE RULES:
1. Use ONLY the information provided in the context. DO NOT invent data.
2. Focus on specific numerical data: figures in millions/billions, percentages, growth rates.
3. Extract and present the most important data from the financial tables.
4. MANDATORY structure:
   ---
   FINANCIAL EXECUTIVE SUMMARY
   ---
   
   🎯 KEY RESULTS (TOP 5)
   • [Metric 1]: [2024 value] vs [2023 value] ([% change] if available)
   • [Metric 2]: [2024 value] vs [2023 value] ([% change] if available)
   • ... (maximum 5 points)
   
   📊 DETAILED ANALYSIS
   1. Profitability: [Operating profit, net income, margins]
   2. Sales/Revenue: [Revenue, sales, segments]
   3. Efficiency: [Cash flow, working capital, ratios]
   4. Outlook: [Any projections or guidance mentioned]
   
   ⚠️ RISKS/OPPORTUNITIES (maximum 3 of each, only if mentioned in the document)
   • [Risk 1]: [Brief explanation]
   • [Opportunity 1]: [Brief explanation]
   (If no risks/opportunities are mentioned, omit this entire section)
   
   💡 EXECUTIVE RECOMMENDATION (1-2 sentences)

5. ALWAYS include units (€ million, %, etc.).
6. If a section has no data in the context, OMIT IT ENTIRELY (do not write "NOT IDENTIFIED").
7. Use emojis to improve readability.
8. Maximum 400 words.
9. Respond in ENGLISH."""),

        ("human", """RAW CONTEXT EXTRACTED FROM THE ANNUAL REPORT:
{context}

--- 
GENERATE THE EXECUTIVE SUMMARY STRICTLY FOLLOWING THE ABOVE RULES.""")
    ])

    # The chain input (the context string) is passed through unchanged and
    # bound to the prompt's {context} variable.
    chain = (
        {"context": RunnablePassthrough()}
        | prompt_template
        | llm
        | StrOutputParser()
    )

    print("[INFO] Generating executive summary with LLM... (this may take 15-30 seconds)")

    try:
        summary = chain.invoke(context)
        return summary
    except Exception as e:
        # Best effort: report the failure as the summary text so the caller
        # can still print and save something meaningful.
        print(f"[ERROR] Failed to generate summary: {e}")
        return f"""
        [ERROR] The executive summary could not be generated automatically.
        Reason: {e}
        
        Retrieved chunks: {len(retrieved_chunks)}
        """


# --------------------------------------------------------------------
# 4. MAIN FLOW
# --------------------------------------------------------------------
def main(pdf_path: str):
    """Run the end-to-end executive summary flow for one document.

    Steps: load the document, clean the text, split it into chunks, index the
    chunks, retrieve and filter the most financial ones, ask the LLM for a
    summary, then print the summary and process metrics.

    Files written to the current working directory:

    * ``executive_summary.txt`` - the generated summary,
    * ``executive_chunks_debug.txt`` - the first 800 characters of each chunk
      sent to the LLM,
    * ``cleaned_text_debug.txt`` - the first 5000 characters of the cleaned text.

    Args:
        pdf_path: Path of the document to summarize.

    Returns:
        None. The function returns early (after printing an error) when no
        text could be extracted or no chunks were produced.
    """
    print("\n" + "="*60)
    print("EXECUTIVE SUMMARY AGENT")
    print("="*60)

    # 1. Load document
    raw_text = load_pdf(pdf_path)
    if not raw_text:
        print("[ERROR] Failed to extract text from the PDF.")
        return

    print(f"[INFO] Extracted text: {len(raw_text):,} characters")

    # 2. Clean text
    cleaned_text = clean_text(raw_text)
    print(f"[INFO] Cleaned text: {len(cleaned_text):,} characters")
    print(f"[INFO] Reduction: {len(raw_text)-len(cleaned_text):,} characters removed")

    if len(cleaned_text) < 50000:
        print("[WARNING] Cleaned text is very short. Verify cleaning process.")

    # 3. Split into chunks
    chunks = chunk_text(cleaned_text)
    print(f"[INFO] Text split into {len(chunks)} chunks.")

    if not chunks:
        print("[ERROR] No chunks generated.")
        return

    # 4. Create vectorstore (embeddings + FAISS)
    vectorstore = create_and_index_vectorstore(chunks)

    # 5. SPECIFIC FINANCIAL QUERY (Completely in English)
    financial_query = """
    Financial results 2024 vs 2023: 
    Operating profit, Revenue, Sales, Net income, EBITDA, 
    Gross margin, Operating margin, Cash flow, 
    Segment performance (Footwear, Apparel, Accessories),
    Regional results (North America, EMEA, Asia-Pacific),
    Financial guidance 2025,
    Risks mentioned, Opportunities mentioned.
    """

    # 6. Retrieve financial chunks
    print("\n[INFO] Searching for specific financial content...")
    financial_chunks = retrieve_financial_chunks(vectorstore, financial_query, top_k=10)
    print(f"[INFO] Initially retrieved {len(financial_chunks)} financial chunks.")

    # 7. Filter by executive value
    print("\n[INFO] Filtering chunks by executive value...")
    executive_chunks = filter_executive_chunks(financial_chunks, min_financial_score=20, max_chunks=5)

    if not executive_chunks:
        # Nothing passed the score threshold: fall back to the best-ranked
        # retrieved chunks so the LLM still has context to work with.
        print("[WARNING] Filter too strict. Using top 3 chunks.")
        executive_chunks = financial_chunks[:3]

    print(f"[INFO] Final executive chunks: {len(executive_chunks)}")

    # 8. Generate executive summary
    print("\n[INFO] Generating executive summary...")
    summary = generate_executive_summary(executive_chunks, financial_query)

    # 9. Present results
    print("\n" + "="*60)
    print("FINANCIAL EXECUTIVE SUMMARY")
    print("="*60)
    print(summary)
    print("\n" + "="*60)

    # 10. Save results
    output_path = "executive_summary.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("="*60 + "\n")
        f.write("FINANCIAL EXECUTIVE SUMMARY\n")
        f.write("="*60 + "\n\n")
        f.write(summary)

    print(f"[INFO] Summary saved to: {output_path}")

    # 11. Save executive chunks for debugging
    debug_path = "executive_chunks_debug.txt"
    with open(debug_path, "w", encoding="utf-8") as f:
        for i, chunk in enumerate(executive_chunks):
            f.write(f"\n{'='*50}\nEXECUTIVE CHUNK #{i+1}\n{'='*50}\n")
            # Only a preview is stored; the full length is noted below it.
            f.write(chunk[:800] + ("..." if len(chunk) > 800 else ""))
            f.write(f"\n\n[Total length: {len(chunk):,} characters]")

    print(f"[DEBUG] Executive chunks saved to: {debug_path}")

    # 12. Final metrics
    print("\n" + "="*60)
    print("PROCESS METRICS")
    print("="*60)
    print(f"• Original text: {len(raw_text):,} characters")
    print(f"• Cleaned text: {len(cleaned_text):,} characters")
    print(f"• Generated chunks: {len(chunks)}")
    print(f"• Retrieved financial chunks: {len(financial_chunks)}")
    print(f"• Filtered executive chunks: {len(executive_chunks)}")
    # raw_text is non-empty here (checked in step 1), so the division is safe.
    print(f"• Data reduction: {(len(raw_text)-len(cleaned_text))/len(raw_text)*100:.1f}%")

    # Optional: Save cleaned text for inspection
    with open("cleaned_text_debug.txt", "w", encoding="utf-8") as f:
        f.write(cleaned_text[:5000] + "\n\n[...]")

# --------------------------------------------------------------------
# 5. ENTRY POINT
# --------------------------------------------------------------------
if __name__ == "__main__":
    # Script entry point: summarize the Adidas annual report if it is present,
    # otherwise write a small sample document and run the flow on that.
    pdf_path = "annual-report-adidas-ar24.pdf"

    if not Path(pdf_path).exists():
        print(f"[ERROR] PDF not found: {pdf_path}")
        print("[INFO] Creating a test document...")

        # Create a simple test document
        # NOTE(review): this fallback is a plain-text file, not a PDF; whether
        # it can be summarized depends on load_pdf accepting non-PDF input.
        test_text = """
        ADIDAS ANNUAL REPORT 2024 - KEY FINANCIAL HIGHLIGHTS
        
        Operating Results:
        • Revenue: €23.5 billion (2024) vs €21.9 billion (2023) - +7.3% growth
        • Operating Profit: €1.8 billion (2024) vs €1.2 billion (2023) - +50% improvement
        • Net Income: €1.2 billion (2024) vs €0.8 billion (2023) - +50% growth
        • EBITDA Margin: 12.5% (2024) vs 10.8% (2023) - +170 bps improvement
        
        Segment Performance:
        • Footwear: €15.2 billion (+8% YoY)
        • Apparel: €7.1 billion (+6% YoY)
        • Accessories: €1.2 billion (+5% YoY)
        
        Regional Performance:
        • North America: €8.5 billion (+9% YoY)
        • EMEA: €7.8 billion (+6% YoY)
        • Asia-Pacific: €5.9 billion (+8% YoY)
        
        Cash Flow & Balance Sheet:
        • Free Cash Flow: €1.5 billion
        • Net Debt: €2.1 billion (improved from €2.8 billion in 2023)
        • Dividend per share: €2.00 (2024) vs €1.50 (2023)
        
        2025 Guidance:
        • Revenue growth: 5-7%
        • Operating margin: 11-12%
        • EPS growth: 10-12%
        """

        with open("test_document.txt", "w", encoding="utf-8") as f:
            f.write(test_text)
        pdf_path = "test_document.txt"

    try:
        main(pdf_path)
    except Exception as e:
        # Top-level boundary: report the failure with a traceback and exit
        # with a non-zero status.
        print(f"[ERROR] Execution failed: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

  from .autonotebook import tqdm as notebook_tqdm



RUNNING EXECUTIVE SUMMARY AGENT
Loading PDF with pdfplumber: annual-report-adidas-ar24.pdf


Cannot set gray non-stroke color because /'P7494' is an invalid float value
Cannot set gray non-stroke color because /'P7494' is an invalid float value


Text extracted: 1,422,572 characters
Text after cleaning: 1,282,748 characters
Reduction: 139,824 deleted characters
Dividing text into chunks (size=4000, overlap=500)...
Text divided into 366 chunk(s).
Creating embeddings and FAISS index for 366 chunks...


  embeddings_model = SentenceTransformerEmbeddings(


Vectorstore created with 366 vectors.

Searching for specific financial content...
Recovering 10 chunks financial...
Best chunk score: 21
[Chunks initially recovered: 10

Filtering chunks by executable value...
[FILTER] Top 3 scores: [100, 50, 50]
[FILTER] 5/10 chunks selected
Final executive chunks: 5

Generating executive summary...
Initializing Groq's LLM...
Generating executive summary with LLM...

FINANCIAL EXECUTIVE SUMMARY
---
FINANCIAL EXECUTIVE SUMMARY
---

🎯 KEY RESULTS (TOP 5)
• Net sales: €23,683 million (2024) vs €21,427 million (2023) (11% increase)
• Operating profit: €1,337 million (2024) vs €268 million (2023) (398% increase)
• Gross margin: 50.8% (2024) vs 47.5% (2023) (3.3 percentage points increase)
• Average operating working capital as a percentage of sales: 19.7% (2024) vs 25.7% (2023) (5.9 percentage points decrease)
• EBITDA: €2,465 million (2024) vs €1,358 million (2023) (81% increase)

📊 DETAILED ANALYSIS
1. Profitability: Operatin