In [None]:
import os
import sys
import json
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple

In [None]:
import fitz  # PyMuPDF
import re
from sentence_transformers import SentenceTransformer
import faiss
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [None]:
# Set OpenAI API key
# SECURITY: the key is read from the OPENAI_API_KEY environment variable rather than
# being hardcoded in the notebook. The literal that previously lived in this cell has
# been exposed and must be revoked and rotated in the OpenAI dashboard.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    # Warn instead of raising so the non-LLM parts of the notebook remain usable.
    print("WARNING: OPENAI_API_KEY is not set; answer generation will fail until it is exported.")


In [None]:
#%% PDF Processing Class
class PDFProcessor:
    """Extract and process text from PDF financial documents.

    The processor reads every page of a PDF with PyMuPDF, normalizes the text,
    and splits it into overlapping chunks (1000 characters with 200 overlap).
    """

    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract all text from a PDF file.

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            The concatenated page texts, each preceded by a
            ``--- Page N ---`` marker, or ``""`` if the file could not be read
            (the error is printed, not raised).
        """
        print(f"Extracting text from: {pdf_path}")

        try:
            # The context manager closes the document even if a page fails to load.
            with fitz.open(pdf_path) as doc:
                # Read the page count while the document is still open: querying
                # a closed PyMuPDF document raises, which previously made every
                # extraction fall into the except branch and return "".
                page_count = len(doc)
                pages = [
                    f"\n--- Page {page_num + 1} ---\n{doc.load_page(page_num).get_text()}"
                    for page_num in range(page_count)
                ]

            text = "".join(pages)
            print(f"Successfully extracted {len(text):,} characters from {page_count} pages")
            return text

        except Exception as e:
            # Best-effort by design: callers test for an empty string.
            print(f"Error extracting PDF: {e}")
            return ""

    def clean_text(self, text: str) -> str:
        """Clean and normalize extracted text.

        Collapses whitespace, replaces characters outside the allowed set
        (word characters, whitespace and ``$ % . , ( ) - + :``) with spaces,
        and strips the ``--- Page N ---`` markers.

        Args:
            text: Raw text, typically from ``extract_text_from_pdf``.

        Returns:
            The cleaned text with single spaces between tokens and no
            leading/trailing whitespace.
        """
        # Remove excessive whitespace.
        # NOTE: this also flattens newlines, so the splitter's "\n\n" and "\n"
        # separators never match on cleaned text.
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but preserve financial notation
        text = re.sub(r'[^\w\s\$\%\.\,\(\)\-\+\:]', ' ', text)

        # Remove page headers/footers patterns
        text = re.sub(r'--- Page \d+ ---', '', text)

        # The two replacements above leave runs of spaces behind; collapse them
        # so the output really has no excessive whitespace.
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def chunk_text(self, text: str) -> List[Dict]:
        """Split text into chunks with metadata.

        Args:
            text: Raw text; it is cleaned with ``clean_text`` before splitting.

        Returns:
            One dict per chunk with the keys ``content``, ``chunk_id``
            (position in the split order), ``length`` (characters) and
            ``word_count`` (whitespace-separated tokens).
        """
        # Named `cleaned` so the local does not shadow the clean_text method.
        cleaned = self.clean_text(text)

        processed_chunks = [
            {
                "content": chunk,
                "chunk_id": i,
                "length": len(chunk),
                "word_count": len(chunk.split())
            }
            for i, chunk in enumerate(self.text_splitter.split_text(cleaned))
        ]

        print(f"Created {len(processed_chunks)} text chunks")
        return processed_chunks

In [None]:
#%% Vector Retrieval Class
class VectorRetriever:
    """Handle vector embedding and similarity search.

    Chunks are embedded with a SentenceTransformer model and stored in a FAISS
    inner-product index. Vectors are L2-normalized before being added or
    searched, so the inner product equals cosine similarity.

    The whole class lives in one cell: splitting a method body across cells
    leaves the method truncated and the remaining fragment unrunnable.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        print(f"Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.chunks = []

    def build_index(self, chunks: List[Dict]):
        """Build the FAISS vector index from text chunks.

        Args:
            chunks: Chunk dicts; each must have a ``"content"`` string.

        Raises:
            ValueError: If ``chunks`` is empty (there is nothing to embed and
                no embedding dimension to size the index with).
        """
        if not chunks:
            raise ValueError("Cannot build index: no chunks provided.")

        texts = [chunk["content"] for chunk in chunks]

        print("Generating embeddings...")
        # Generate embeddings with progress bar
        embeddings = self.model.encode(
            texts,
            show_progress_bar=True,
            convert_to_numpy=True
        )

        print(f"Generated embeddings shape: {embeddings.shape}")

        # FAISS operates in place on contiguous float32 arrays, so cast BEFORE
        # normalizing rather than after.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        # Normalize embeddings so inner product == cosine similarity
        faiss.normalize_L2(embeddings)

        # Build FAISS index for cosine similarity
        index = faiss.IndexFlatIP(embeddings.shape[1])  # Inner product
        index.add(embeddings)

        # Publish state only after the index was built successfully so a failed
        # build never leaves chunks and index out of sync.
        self.index = index
        self.chunks = chunks

        print(f"FAISS index built with {self.index.ntotal} vectors")

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """Retrieve the most relevant chunks for a query.

        Args:
            query: Natural-language query to embed and search for.
            top_k: Maximum number of chunks to return; capped at the number of
                indexed vectors. Values <= 0 yield an empty list.

        Returns:
            Dicts with ``content``, ``similarity_score`` (cosine similarity),
            ``chunk_id`` and ``length``, in the order FAISS returns them.

        Raises:
            ValueError: If ``build_index`` has not been called yet.
        """
        if self.index is None:
            raise ValueError("Index not built. Call build_index() first.")

        print(f"Searching for: '{query}'")

        # Never ask FAISS for more neighbours than exist, nor for k <= 0.
        k = min(top_k, self.index.ntotal)
        if k <= 0:
            print("Retrieved 0 relevant chunks")
            return []

        # Encode query
        query_embedding = np.ascontiguousarray(
            self.model.encode([query], convert_to_numpy=True),
            dtype=np.float32
        )
        faiss.normalize_L2(query_embedding)

        # Search index
        scores, indices = self.index.search(query_embedding, k)

        # Format results
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx == -1:  # FAISS pads missing neighbours with -1
                continue
            chunk = self.chunks[int(idx)]
            results.append({
                "content": chunk["content"],
                "similarity_score": float(score),
                "chunk_id": chunk["chunk_id"],
                "length": chunk["length"]
            })

        print(f"Retrieved {len(results)} relevant chunks")
        return results

In [None]:
#%% Answer Generation Class
class FinancialQAGenerator:
    """Generate answers using retrieved context.

    The whole class lives in one cell: splitting ``generate_answer`` across
    cells leaves the method truncated and the remaining fragment unrunnable.
    """

    def __init__(self, model: str = "gpt-3.5-turbo"):
        self.model = model

    def _build_prompt(self, query: str, context_chunks: List[Dict]) -> str:
        """Assemble the user prompt from the query and the retrieved chunks."""
        # Combine retrieved contexts
        context = "\n\n".join(
            f"Context {i+1}: {chunk['content']}"
            for i, chunk in enumerate(context_chunks)
        )

        # Create prompt for financial QA
        return f"""You are a financial analyst assistant. Based on the following context from Meta's financial reports, provide a precise and factual answer to the query.

CONTEXT FROM FINANCIAL DOCUMENTS:
{context}

QUERY: {query}

INSTRUCTIONS:
- Provide direct, factual answers based ONLY on the context provided
- Include specific numbers, percentages, and dollar amounts when available
- If the information is not in the context, clearly state this
- Keep the answer concise but comprehensive
- Cite specific figures when making claims

ANSWER:"""

    def _request_completion(self, prompt: str):
        """Send the chat request using whichever OpenAI SDK generation is installed."""
        request = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a financial analyst providing accurate information from financial reports."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "max_tokens": 500,
            "temperature": 0.1,
            "presence_penalty": 0.1
        }

        # openai>=1.0 removed openai.ChatCompletion; its module-level client
        # exposes the same call as openai.chat.completions.create and still
        # honours openai.api_key. Older SDKs keep the legacy entry point.
        if hasattr(openai, "OpenAI"):
            return openai.chat.completions.create(**request)
        return openai.ChatCompletion.create(**request)

    def generate_answer(self, query: str, context_chunks: List[Dict]) -> Dict:
        """Generate an answer using GPT with financial context.

        Args:
            query: The user question.
            context_chunks: Retrieved chunk dicts; each must have ``"content"``.

        Returns:
            A dict with ``answer``, ``model_used``, ``tokens_used`` and
            ``context_chunks_used``. On any failure the error text is placed in
            ``answer`` and ``tokens_used`` is 0; nothing is raised.
        """
        prompt = self._build_prompt(query, context_chunks)

        try:
            response = self._request_completion(prompt)

            return {
                "answer": response.choices[0].message.content,
                "model_used": self.model,
                "tokens_used": response.usage.total_tokens,
                "context_chunks_used": len(context_chunks)
            }

        except Exception as e:
            # Best-effort by design: the pipeline keeps going and reports the
            # failure as the answer text.
            return {
                "answer": f"Error generating response: {str(e)}",
                "model_used": self.model,
                "tokens_used": 0,
                "context_chunks_used": len(context_chunks)
            }

In [None]:
#%% Main Pipeline Execution

def run_basic_rag_pipeline(
    pdf_path: str = "meta_q1_2024.pdf",
    test_queries=None,
    output_file: str = "step1_basic_rag_results.json",
):
    """Execute the complete basic RAG pipeline.

    The function body lives in one cell: splitting it across cells leaves the
    function truncated and the remaining fragments unrunnable.

    Args:
        pdf_path: Location of the PDF to process.
        test_queries: Optional list of query strings; when ``None`` the two
            default Meta Q1 2024 questions are used.
        output_file: Path of the JSON file the results are written to.

    Returns:
        A dict mapping each query to its ``answer``, ``retrieved_chunks`` and
        ``metadata`` (``tokens_used``, ``chunks_used``, ``avg_similarity``),
        or ``None`` if the PDF is missing or yields no text.
    """
    print("="*60)
    print("STEP 1: BASIC RAG PIPELINE FOR FINANCIAL DOCUMENTS")
    print("="*60)

    if test_queries is None:
        # Test queries for Step 1
        test_queries = [
            "What was Meta's revenue in Q1 2024?",
            "What were the key financial highlights for Meta in Q1 2024?"
        ]

    # Fail fast: check the cheap precondition before loading the embedding model.
    if not os.path.exists(pdf_path):
        print(f"ERROR: PDF file not found at {pdf_path}")
        print("Please download Meta's Q1 2024 earnings report and update the path")
        return

    # Initialize components
    print("\n1. Initializing components...")
    processor = PDFProcessor()
    retriever = VectorRetriever()
    generator = FinancialQAGenerator()

    # Process PDF document
    print("\n2. Processing PDF document...")
    raw_text = processor.extract_text_from_pdf(pdf_path)
    if not raw_text:
        print("ERROR: Failed to extract text from PDF")
        return

    chunks = processor.chunk_text(raw_text)
    if not chunks:
        # An empty chunk list cannot be embedded or indexed.
        print("ERROR: No text chunks were produced from the PDF")
        return

    # Build vector index
    print("\n3. Building vector search index...")
    retriever.build_index(chunks)

    print("\n4. Testing RAG pipeline with queries...")
    results = {}

    for query in test_queries:
        print(f"\n{'='*50}")
        print(f"QUERY: {query}")
        print(f"{'='*50}")

        # Retrieve relevant context
        relevant_chunks = retriever.retrieve(query, top_k=3)

        # Display retrieved chunks
        print("\nRETRIEVED CONTEXT:")
        for i, chunk in enumerate(relevant_chunks, 1):
            print(f"\nChunk {i} (Similarity: {chunk['similarity_score']:.3f}):")
            print(f"{chunk['content'][:200]}...")

        # Generate answer
        print("\nGENERATING ANSWER...")
        answer_result = generator.generate_answer(query, relevant_chunks)

        print(f"\nFINAL ANSWER:")
        print(f"{answer_result['answer']}")

        print(f"\nMETADATA:")
        print(f"- Tokens used: {answer_result['tokens_used']}")
        print(f"- Context chunks: {answer_result['context_chunks_used']}")

        # np.mean([]) is NaN with a warning; report 0.0 when nothing was retrieved.
        similarities = [c['similarity_score'] for c in relevant_chunks]
        avg_similarity = float(np.mean(similarities)) if similarities else 0.0

        # Store results
        results[query] = {
            "answer": answer_result['answer'],
            "retrieved_chunks": relevant_chunks,
            "metadata": {
                "tokens_used": answer_result['tokens_used'],
                "chunks_used": len(relevant_chunks),
                "avg_similarity": avg_similarity
            }
        }

    # Save results
    print("\n5. Saving results...")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)

    print(f"Results saved to: {output_file}")

    # Summary statistics
    per_query_similarity = [r['metadata']['avg_similarity'] for r in results.values()]
    overall_similarity = float(np.mean(per_query_similarity)) if per_query_similarity else 0.0
    total_tokens = sum(r['metadata']['tokens_used'] for r in results.values())

    print("\n" + "="*60)
    print("PIPELINE SUMMARY")
    print("="*60)
    print(f"Document processed: {pdf_path}")
    print(f"Text chunks created: {len(chunks)}")
    print(f"Queries processed: {len(test_queries)}")
    print(f"Average retrieval similarity: {overall_similarity:.3f}")
    print(f"Total tokens used: {total_tokens}")

    return results

In [None]:
#%% Execute Pipeline
if __name__ == "__main__":
    # Run the complete pipeline
    results = run_basic_rag_pipeline()

    # The pipeline returns None when the PDF is missing or unreadable, so only
    # summarize when there is something to show.
    if results:
        # Display final results
        print("\n" + "="*60)
        print("FINAL RESULTS SUMMARY")
        print("="*60)

        for query, result in results.items():
            print(f"\nQuery: {query}")
            print(f"Answer: {result['answer'][:150]}...")
            print(f"Confidence: {result['metadata']['avg_similarity']:.3f}")
    else:
        print("\nPipeline produced no results; see the errors reported above.")

In [None]:
#%% Evaluation and Analysis
def evaluate_step1_results(results: Dict):
    """Basic evaluation of Step 1 results.

    The function body lives in one cell: splitting it across cells leaves the
    function truncated (returning ``None``) and the fragments unrunnable.

    Args:
        results: Mapping of query -> result dict with an ``answer`` string and
            a ``metadata`` dict holding ``avg_similarity``, ``chunks_used`` and
            ``tokens_used``. May be empty or ``None``.

    Returns:
        A dict with ``avg_retrieval_score``, ``avg_chunks_per_query``,
        ``total_tokens`` and ``queries_processed``. The averages are plain
        floats and are 0.0 when there are no results.
    """
    print("\n" + "="*60)
    print("STEP 1 EVALUATION")
    print("="*60)

    # Treat None (pipeline aborted) the same as "no queries processed".
    entries = list((results or {}).items())
    metadata = [result['metadata'] for _, result in entries]

    def _mean(values):
        # np.mean([]) is NaN with a RuntimeWarning; report 0.0 instead.
        return float(np.mean(values)) if values else 0.0

    # Metrics to evaluate
    metrics = {
        "avg_retrieval_score": _mean([m['avg_similarity'] for m in metadata]),
        "avg_chunks_per_query": _mean([m['chunks_used'] for m in metadata]),
        "total_tokens": sum(m['tokens_used'] for m in metadata),
        "queries_processed": len(entries)
    }

    print("Quantitative Metrics:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")

    # Qualitative assessment
    print("\nQualitative Assessment:")
    for query, result in entries:
        answer = result['answer']
        lowered = answer.lower()

        # Check for key indicators of good answers
        has_numbers = bool(re.search(r'\$[\d,]+|\d+%|\d+\.\d+', answer))
        mentions_meta = 'meta' in lowered
        mentions_q1_2024 = any(term in lowered for term in ['q1 2024', 'first quarter 2024'])

        print(f"\nQuery: {query}")
        print(f"  Contains numbers: {has_numbers}")
        print(f"  Mentions Meta: {mentions_meta}")
        print(f"  References Q1 2024: {mentions_q1_2024}")
        print(f"  Answer length: {len(answer)} characters")

    return metrics

In [None]:
# Run evaluation if results exist
# `results` is defined but None when the pipeline aborted (e.g. missing PDF), so
# test its value rather than mere presence in globals().
if globals().get('results'):
    evaluation_metrics = evaluate_step1_results(results)