In [None]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Import the third-party stack; if any import fails, install the packages and
# retry the exact same imports.
# NOTE: the `!pip ...` line below is an IPython/Jupyter shell escape, so this
# cell only parses inside a notebook kernel, not as a plain .py script.
try:
    from langchain_community.vectorstores import Chroma
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    from langchain_core.documents import Document
    from langgraph.graph import StateGraph, END
    from langchain_community.document_loaders import PyPDFLoader
    import google.generativeai as genai
except ImportError:
    print("Installing required packages...")
    # NOTE(review): `langchain_text_splitters` is imported in this cell, but the
    # install list below names no matching distribution — confirm it is pulled
    # in transitively, otherwise the retry import will fail again.
    !pip install langchain-core==0.1.0 langchain-community==0.0.10 chromadb==0.4.15 langgraph==0.0.40 sentence-transformers torch transformers pypdf google-generativeai
    # Second attempt, now that the packages should be available.
    from langchain_community.vectorstores import Chroma
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    from langchain_core.documents import Document
    from langgraph.graph import StateGraph, END
    from langchain_community.document_loaders import PyPDFLoader
    import google.generativeai as genai

from typing import Dict, List, Any, TypedDict
import glob

class GeminiFinancialQASystem:
    """Retrieval-augmented financial Q&A pipeline answered by a Gemini model.

    Constructing an instance does all the setup: it configures Gemini from the
    ``GOOGLE_API_KEY`` environment variable, loads the Q&A CSV at
    ``data_path``, embeds it into a Chroma vector store and compiles a
    three-step LangGraph workflow (retrieve -> generate -> evaluate).
    Use :meth:`query` to ask a question and :meth:`add_pdfs` to extend the
    knowledge base with PDF files.
    """

    def __init__(self, data_path: str = 'Financial-QA-10k.csv'):
        """Create and fully initialize the system.

        Args:
            data_path: Path of the CSV file loaded by ``_prepare_documents``.

        Raises:
            ValueError: If ``GOOGLE_API_KEY`` is not set.
        """
        self.vectorstore = None
        self.gemini_model = None
        self.graph = None
        self.data_path = data_path
        self._initialize_system()

    def _initialize_system(self):
        """Run the setup steps in order: Gemini, documents, vector store, graph."""
        print("üöÄ Initializing Gemini Financial Q&A System...")

        self._initialize_gemini()
        documents = self._prepare_documents()
        self.vectorstore = self._initialize_vector_store(documents)
        self.graph = self._build_graph()

        print("‚úÖ Gemini Financial Q&A System initialized successfully!")

    def _initialize_gemini(self):
        """Configure the Gemini client and send one test prompt.

        Raises:
            ValueError: If ``GOOGLE_API_KEY`` is missing or empty.
            Exception: Any error raised while configuring or calling the model
                is printed and re-raised unchanged.
        """
        print("üîß Initializing Gemini 2.5 Flash...")

        # Validate the key before entering the broad try block so a missing key
        # is reported once, not a second time as an "initialization failed" error.
        api_key = os.getenv('GOOGLE_API_KEY')
        if not api_key:
            print("‚ùå GOOGLE_API_KEY not found. Please set it as environment variable.")
            raise ValueError("GOOGLE_API_KEY environment variable is required")

        try:
            genai.configure(api_key=api_key)

            # NOTE(review): every message here says "Gemini 2.5 Flash" but the
            # model id requested is 'gemini-2.0-flash' — confirm which is intended.
            self.gemini_model = genai.GenerativeModel('gemini-2.0-flash')

            # Smoke-test the connection with a trivial prompt.
            response = self.gemini_model.generate_content("Hello, please respond with 'OK' to confirm connection.")
            if "OK" in response.text:
                print("‚úÖ Gemini 2.5 Flash connected successfully!")
            else:
                print(f"‚úÖ Gemini 2.5 Flash connected. Response: {response.text}")

        except Exception as e:
            print(f"‚ùå Gemini initialization failed: {e}")
            raise

    def _prepare_documents(self) -> List[Document]:
        """Turn each CSV row into a ``Document``.

        The CSV must provide the columns ``question``, ``answer``, ``context``,
        ``ticker`` and ``filing``; missing values are replaced by ``''``.

        Returns:
            One document per row, or an empty list if loading fails for any
            reason (the error is printed, not raised).
        """
        print("üìä Loading financial dataset...")

        try:
            df = pd.read_csv(self.data_path)
            df = df.fillna('')
            print(f"‚úÖ Loaded {len(df)} Q&A pairs")

            documents = []
            for _, row in df.iterrows():
                # The labelled layout is what _create_fallback_answer parses
                # later ("ANSWER:" ... "CONTEXT:").
                content = f"""
QUESTION: {row['question']}
ANSWER: {row['answer']}
CONTEXT: {row['context']}
COMPANY: {row['ticker']}
FILING_YEAR: {row['filing']}
"""

                metadata = {
                    "ticker": row['ticker'],
                    "filing_year": row['filing'],
                    "source": "qa_database",
                    "original_question": row['question'],
                    "original_answer": row['answer'],
                    "document_type": "financial_qa"
                }

                documents.append(Document(page_content=content, metadata=metadata))

            return documents

        except Exception as e:
            # Best-effort by design: report the problem and continue with no documents.
            print(f"‚ùå Error loading data: {e}")
            return []

    def _initialize_vector_store(self, documents: List[Document]) -> Chroma:
        """Split ``documents`` into chunks, embed them and store them in Chroma.

        Args:
            documents: Documents to index.

        Returns:
            The Chroma vector store built from the chunks.
        """
        print("üîß Initializing vector store...")

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=150,
        )

        splits = text_splitter.split_documents(documents)
        print(f"üìÑ Created {len(splits)} document chunks")

        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )

        # NOTE(review): this writes into a fixed on-disk directory every time an
        # instance is created; presumably repeated runs keep adding the same
        # chunks to the persisted collection — confirm and deduplicate if so.
        vectorstore = Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            persist_directory="./gemini_financial_qa_db"
        )

        return vectorstore

    def add_pdfs(self, pdf_directory: str) -> int:
        """Index every ``*.pdf`` file found directly inside ``pdf_directory``.

        Files that fail to load or split are reported and skipped.

        Args:
            pdf_directory: Directory searched (non-recursively) for PDF files.

        Returns:
            Number of chunks added to the vector store (0 if none were found).
        """
        print(f"üì• Adding PDFs from: {pdf_directory}")

        # Sorted so files are processed in a reproducible order.
        pdf_files = sorted(glob.glob(os.path.join(pdf_directory, "*.pdf")))
        if not pdf_files:
            print("‚ùå No PDF files found")
            return 0

        # One splitter is enough for all files; its settings never change.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1200,
            chunk_overlap=200
        )

        all_documents = []

        for pdf_file in pdf_files:
            file_name = os.path.basename(pdf_file)
            try:
                print(f"üìÑ Processing: {file_name}")

                documents = PyPDFLoader(pdf_file).load()

                # Tag every page so retrieval can label it as a PDF filing.
                for doc in documents:
                    doc.metadata.update({
                        "source_file": file_name,
                        "source_type": "pdf_filing",
                        "document_type": "10K"
                    })

                split_docs = text_splitter.split_documents(documents)
                all_documents.extend(split_docs)

                print(f"‚úÖ Added {len(split_docs)} chunks from {file_name}")

            except Exception as e:
                print(f"‚ùå Error processing {pdf_file}: {e}")

        if all_documents:
            self.vectorstore.add_documents(all_documents)
            print(f"üìä Successfully added {len(all_documents)} PDF document chunks")

        return len(all_documents)

    @staticmethod
    def _describe_source(metadata: Dict[str, Any]) -> str:
        """Return the human-readable source label for one chunk's metadata."""
        if metadata.get("source") == "qa_database":
            return f"Q&A Database - {metadata.get('ticker', 'Unknown')}"
        # add_pdfs stores the "pdf_filing" marker under "source_type" and never
        # under "source", so checking "source" alone could not match PDF chunks.
        # The old key is still honoured for backward compatibility.
        if "pdf_filing" in (metadata.get("source_type"), metadata.get("source")):
            return f"PDF Filing - {metadata.get('source_file', 'Unknown')}"
        return "Financial Document"

    def _retrieve_node(self, state: Dict) -> Dict:
        """Graph node: fetch the chunks most similar to ``state["question"]``.

        Writes ``state["context"]`` (list of content/metadata/source_info
        dicts) and ``state["sources"]`` (de-duplicated labels in retrieval
        order). On any retrieval error both are set to empty lists.
        """
        print("üîç Retrieving relevant financial information...")

        question = state["question"]

        try:
            docs = self.vectorstore.similarity_search(question, k=6)

            context = [
                {
                    "content": doc.page_content,
                    "metadata": doc.metadata,
                    "source_info": self._describe_source(doc.metadata),
                }
                for doc in docs
            ]

            state["context"] = context
            # dict.fromkeys removes duplicates while keeping first-seen order,
            # so the printed source list is stable between runs.
            state["sources"] = list(dict.fromkeys(item["source_info"] for item in context))
            print(f"‚úÖ Retrieved {len(context)} relevant documents")

        except Exception as e:
            print(f"‚ùå Retrieval error: {e}")
            state["context"] = []
            state["sources"] = []

        return state

    def _generate_node(self, state: Dict) -> Dict:
        """Graph node: write ``state["answer"]`` from the retrieved context.

        With no context a fixed "couldn't find" message is used; if generation
        raises, the answer falls back to text extracted from the context.
        """
        print("ü§ñ Generating answer with Gemini...")

        question = state["question"]
        context = state["context"]

        try:
            if not context:
                state["answer"] = "I couldn't find relevant information to answer this question in the available financial documents."
                return state

            context_text = self._build_gemini_context(context)
            state["answer"] = self._generate_with_gemini(question, context_text)
            print("‚úÖ Answer generated successfully")

        except Exception as e:
            print(f"‚ùå Generation error: {e}")
            state["answer"] = self._create_fallback_answer(context)

        return state

    @staticmethod
    def _evaluate_answer(answer: str, context: List[Dict]) -> str:
        """Score an answer heuristically and return the verdicts joined by ``" | "``.

        Three independent checks: answer length in words, number of context
        chunks, and how many distinct finance keywords the answer contains.
        """
        evaluation = []

        word_count = len(answer.split())
        if word_count > 100:
            evaluation.append("‚úÖ Comprehensive analysis")
        elif word_count > 50:
            evaluation.append("‚úÖ Detailed answer")
        elif word_count > 20:
            evaluation.append("‚úÖ Adequate answer")
        else:
            evaluation.append("‚ö†Ô∏è Brief answer")

        if len(context) >= 4:
            evaluation.append("‚úÖ Excellent context usage")
        elif len(context) >= 2:
            evaluation.append("‚úÖ Good context usage")
        else:
            evaluation.append("‚ö†Ô∏è Limited context")

        financial_terms = ['revenue', 'income', 'profit', 'growth', 'margin', 'cash', 'debt',
                           'equity', 'assets', 'risk', 'segment', 'investment', 'market']
        lowered = answer.lower()  # lower-case once, not once per keyword
        financial_count = sum(1 for term in financial_terms if term in lowered)

        if financial_count >= 3:
            evaluation.append("‚úÖ Strong financial analysis")
        elif financial_count >= 1:
            evaluation.append("‚úÖ Financial content present")
        else:
            evaluation.append("‚ö†Ô∏è Limited financial depth")

        return " | ".join(evaluation)

    def _evaluate_node(self, state: Dict) -> Dict:
        """Graph node: store the heuristic quality verdict in ``state["evaluation"]``."""
        state["evaluation"] = self._evaluate_answer(state["answer"], state["context"])
        return state

    def _build_graph(self):
        """Compile the linear workflow retrieve -> generate -> evaluate -> END."""
        workflow = StateGraph(Dict)
        workflow.add_node("retrieve", self._retrieve_node)
        workflow.add_node("generate", self._generate_node)
        workflow.add_node("evaluate", self._evaluate_node)

        workflow.set_entry_point("retrieve")
        workflow.add_edge("retrieve", "generate")
        workflow.add_edge("generate", "evaluate")
        workflow.add_edge("evaluate", END)

        return workflow.compile()

    def _build_gemini_context(self, context: List[Dict]) -> str:
        """Render the retrieved chunks as numbered ``--- SOURCE n (label) ---`` sections.

        Args:
            context: Items with at least ``"content"`` and ``"source_info"`` keys.

        Returns:
            The sections joined by newlines (empty string for no items).
        """
        return "\n".join(
            f"--- SOURCE {number} ({doc['source_info']}) ---\n{doc['content']}\n"
            for number, doc in enumerate(context, start=1)
        )

    def _generate_with_gemini(self, question: str, context: str) -> str:
        """Ask the Gemini model to answer ``question`` using only ``context``.

        Args:
            question: The user's question.
            context: Pre-formatted source text (see ``_build_gemini_context``).

        Returns:
            The stripped model response, or a fallback built from the raw
            context lines if the model call raises.
        """

        prompt = f"""You are a senior financial analyst specializing in SEC filings and corporate financial analysis. 

Based EXCLUSIVELY on the following information from financial documents, 10-K filings, and Q&A data, provide a comprehensive and accurate answer to the question.

FINANCIAL DOCUMENTS AND CONTEXT:
{context}

QUESTION: {question}

ANALYSIS REQUIREMENTS:
1. Answer based SOLELY on the provided financial context
2. Be specific, detailed, and factually accurate
3. Include relevant numbers, dates, metrics, and financial data when available
4. Cite the specific sources when referencing information
5. If certain information is not available in the context, acknowledge this limitation
6. Structure complex answers with clear sections or bullet points
7. Use professional financial terminology and analysis frameworks
8. Highlight trends, comparisons, or significant findings when relevant

RESPONSE FORMAT:
- Start with a direct, concise answer
- Provide supporting evidence and details from the sources
- Mention specific companies, years, and metrics when available
- Use bullet points for multiple items or comparisons
- Conclude with any limitations or additional context needed

FINANCIAL ANALYST'S COMPREHENSIVE ANSWER:"""

        try:
            response = self.gemini_model.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=0.2,  # Low temperature for factual accuracy
                    top_p=0.8,
                    top_k=40,
                    max_output_tokens=2000,  # Allow detailed responses
                )
            )

            return response.text.strip()

        except Exception as e:
            print(f"‚ùå Gemini generation error: {e}")
            return self._create_fallback_answer_from_context(context)

    def _create_fallback_answer(self, context: List[Dict]) -> str:
        """Build an answer without the model from up to three context items.

        For Q&A-formatted chunks the text between ``ANSWER:`` and ``CONTEXT:``
        is used; for anything else the first 300 characters plus ``"..."``.

        Args:
            context: Retrieved items, each with a ``"content"`` key.

        Returns:
            The assembled text, or a fixed "not enough information" message
            when ``context`` is empty.
        """
        if not context:
            return "I don't have enough information to answer this question based on the available financial documents."

        key_info = []
        for doc in context[:3]:
            content = doc["content"]
            if "ANSWER:" in content:
                # split(...)[0] returns the whole text when "CONTEXT:" is absent.
                answer_part = content.split("ANSWER:")[1].split("CONTEXT:")[0]
                key_info.append(answer_part.strip())
            else:
                key_info.append(content[:300] + "...")

        return "Based on the financial documents:\n\n" + "\n\n".join(key_info)

    def _create_fallback_answer_from_context(self, context: str) -> str:
        """Build an answer without the model from an already formatted context string.

        Keeps the first eight lines whose stripped length exceeds ten
        characters; returns a fixed apology if no line qualifies.
        """
        relevant_lines = [line for line in context.split('\n') if len(line.strip()) > 10]

        if relevant_lines:
            return "Based on the financial information available:\n\n" + "\n".join(relevant_lines[:8])
        return "The financial documents contain relevant information, but I'm unable to generate a comprehensive analysis at the moment."

    def query(self, question: str, verbose: bool = True) -> Dict[str, Any]:
        """Run one question through the compiled workflow.

        Args:
            question: The question to answer.
            verbose: When true, print the question, answer, evaluation,
                up to five sources and the number of retrieved chunks.

        Returns:
            The final workflow state with keys ``question``, ``context``,
            ``answer``, ``evaluation`` and ``sources``. If the workflow
            raises, a dict with the same keys plus ``error`` is returned
            instead of propagating the exception.
        """
        if verbose:
            print(f"\nüéØ QUESTION: {question}")
            print("=" * 70)

        initial_state = {
            "question": question,
            "context": [],
            "answer": "",
            "evaluation": "",
            "sources": []
        }

        try:
            result = self.graph.invoke(initial_state)

            if verbose:
                print(f"\nüìù ANSWER:\n{result['answer']}")
                print(f"\nüìä EVALUATION: {result['evaluation']}")

                if result["sources"]:
                    print(f"\nüîó SOURCES:")
                    for i, source in enumerate(result["sources"][:5], 1):
                        print(f"   {i}. {source}")

                print(f"\nüìà CONTEXT UTILIZATION: {len(result['context'])} documents retrieved")

            return result

        except Exception as e:
            error_msg = f"Pipeline execution error: {e}"
            print(f"‚ùå {error_msg}")
            return {
                "question": question,
                # Same keys as a successful result so callers can index
                # result["context"] without special-casing failures.
                "context": [],
                "answer": "I encountered an error while processing your question. Please try again.",
                "evaluation": "System error",
                "sources": [],
                "error": error_msg
            }

# Advanced financial analysis capabilities
class FinancialAnalyzer:
    """Convenience wrapper that phrases canned analysis prompts for a Q&A system.

    Each public method only builds a question string and forwards it to
    ``qa_system.query``; whatever that call returns is returned unchanged.
    """

    def __init__(self, qa_system: GeminiFinancialQASystem):
        # The Q&A system every prompt is delegated to.
        self.qa_system = qa_system

    def compare_companies(self, companies: List[str], aspect: str) -> Dict[str, Any]:
        """Ask for a side-by-side comparison of ``companies`` on ``aspect``."""
        company_list = ', '.join(companies)
        prompt = (
            f"Compare {company_list} in terms of {aspect}. "
            "Provide specific financial metrics, trends, and strategic differences "
            "from their 10-K filings and financial reports."
        )
        return self.qa_system.query(prompt)

    def analyze_trends(self, company: str, metric: str, years: List[str] = None) -> Dict[str, Any]:
        """Ask how ``metric`` developed for ``company``.

        When ``years`` is non-empty its first and last entries bound the
        period; otherwise the prompt says "over recent years".
        """
        if years:
            period = f" from {years[0]} to {years[-1]}"
        else:
            period = " over recent years"
        prompt = (
            f"Analyze the {metric} trends for {company}{period}. "
            "Include specific numbers, growth rates, and key drivers "
            "mentioned in their financial filings."
        )
        return self.qa_system.query(prompt)

    def risk_analysis(self, company: str) -> Dict[str, Any]:
        """Ask for a risk analysis of ``company`` across four risk categories."""
        prompt = (
            f"Provide a comprehensive risk analysis for {company} based on their 10-K filings. "
            "Include operational risks, financial risks, market risks, and strategic risks "
            "with specific examples from their reports."
        )
        return self.qa_system.query(prompt)

    def business_segment_analysis(self, company: str) -> Dict[str, Any]:
        """Ask for the segment structure and revenue breakdown of ``company``."""
        prompt = (
            f"Analyze the business segments and revenue breakdown for {company}. "
            "Provide specific revenue numbers, growth rates by segment, and strategic "
            "importance of each segment from their latest financial reports."
        )
        return self.qa_system.query(prompt)

# Demonstration and testing
def demonstrate_gemini_system():
    """Run the scripted demo: canned questions first, then optional PDF questions.

    Returns:
        The initialized ``GeminiFinancialQASystem``, or ``None`` when
        ``GOOGLE_API_KEY`` is not set (setup hints are printed instead).
    """
    banner = "=" * 80
    print(banner)
    print("üåü GEMINI-POWERED FINANCIAL Q&A SYSTEM")
    print(banner)

    # Without a key nothing below can work, so explain how to get one and stop.
    if not os.getenv('GOOGLE_API_KEY'):
        for hint in (
            "‚ùå GOOGLE_API_KEY environment variable not set!",
            "üí° Please set it first:",
            "   import os",
            "   os.environ['GOOGLE_API_KEY'] = 'your_actual_api_key'",
            "üí° Get free API key from: https://aistudio.google.com/app/apikey",
        ):
            print(hint)
        return

    qa_system = GeminiFinancialQASystem()
    analyzer = FinancialAnalyzer(qa_system)

    # (category, questions) pairs asked in this order with verbose output.
    test_scenarios = [
        (
            "Company History & Focus",
            [
                "What was NVIDIA's initial business focus and how has it evolved over time?",
                "What significant technological inventions did NVIDIA create and when?",
                "How does NVIDIA describe its transformation from graphics to AI computing?",
            ],
        ),
        (
            "Financial Performance",
            [
                "What are NVIDIA's main revenue drivers and business segments?",
                "How has NVIDIA's revenue composition changed in recent years?",
                "What are the key financial metrics that demonstrate NVIDIA's growth?",
            ],
        ),
        (
            "Technology & Products",
            [
                "What is CUDA and what computational capabilities does it enable?",
                "What are the main applications of NVIDIA's GPU technology beyond gaming?",
                "How does NVIDIA's platform strategy integrate hardware and software?",
            ],
        ),
    ]

    print("\nüß™ RUNNING COMPREHENSIVE TESTS:")
    for category, questions in test_scenarios:
        print(f"\nüìä CATEGORY: {category}")
        print("-" * 60)

        for question in questions:
            qa_system.query(question)  # verbose mode prints the answer itself
            print("\n" + "=" * 70)

    # PDF round: only runs when the folder exists and holds at least one .pdf.
    print(f"\nüìÑ TESTING PDF INTEGRATION:")
    pdf_dir = "./10k_pdfs"
    has_pdfs = os.path.exists(pdf_dir) and any(
        name.endswith('.pdf') for name in os.listdir(pdf_dir)
    )
    if has_pdfs:
        print(f"üì• Adding PDFs from {pdf_dir}")
        if qa_system.add_pdfs(pdf_dir) > 0:
            print(f"\nüî¨ ADVANCED ANALYSIS WITH PDFS:")

            advanced_questions = [
                "What is NVIDIA's current revenue breakdown by business segment?",
                "What are the major risk factors discussed in NVIDIA's latest 10-K filing?",
                "How does NVIDIA describe its competitive advantages and market position?",
                "What is the company's strategy for future growth and investment?",
                "what are in the products in the iphone line?",
            ]

            for question in advanced_questions:
                print(f"\nüéØ {question}")
                # Quiet query; print a 300-character preview of the answer instead.
                outcome = qa_system.query(question, verbose=False)
                print(f"üìù {outcome['answer'][:300]}...")
                print(f"üìä {outcome['evaluation']}")
                print(f"üîó Sources: {outcome['sources']}")
                print("-" * 50)

    # Only the section header is printed here; no analysis follows it.
    print(f"\nüî¨ ADVANCED FINANCIAL ANALYSIS DEMONSTRATION:")

    return qa_system

# Interactive chat interface
def interactive_gemini_chat():
    """Run a console chat loop against a freshly built Q&A system.

    Commands: ``quit`` / ``exit`` / ``bye`` end the session, ``sources`` lists
    every source returned so far, and blank input is ignored. End-of-input
    (Ctrl-D) or Ctrl-C at the prompt also ends the session cleanly.

    Returns:
        ``None``. Returns early, after printing a notice, when
        ``GOOGLE_API_KEY`` is not set.
    """
    print("\nüí¨ GEMINI FINANCIAL Q&A CHAT")
    print("Type 'quit' to exit, 'sources' to show recent sources\n")

    if not os.getenv('GOOGLE_API_KEY'):
        print("‚ùå GOOGLE_API_KEY not set. Please set it first.")
        return

    qa_system = GeminiFinancialQASystem()

    # isdir (not exists) so a regular file named like the folder cannot make
    # os.listdir raise.
    pdf_dir = "./10k_pdfs"
    if os.path.isdir(pdf_dir) and any(f.endswith('.pdf') for f in os.listdir(pdf_dir)):
        print("üì• Loading PDF documents...")
        qa_system.add_pdfs(pdf_dir)

    chat_history = []

    while True:
        try:
            question = input("\n‚ùì Your financial question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the loop instead of crashing.
            print()
            print("üëã Thank you for using the Gemini Financial Q&A System!")
            break

        command = question.lower()
        if command in ['quit', 'exit', 'bye']:
            print("üëã Thank you for using the Gemini Financial Q&A System!")
            break
        elif command == 'sources':
            print("\nüìö Recent sources mentioned:")
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the numbering is stable from one listing to the next.
            seen_sources = dict.fromkeys(
                s for chat in chat_history for s in chat.get('sources', [])
            )
            for i, source in enumerate(seen_sources, 1):
                print(f"  {i}. {source}")
            continue
        elif not question:
            continue

        print("\n" + "üîÑ Processing with Gemini..." + "\n")
        result = qa_system.query(question)

        # Keep every result so the 'sources' command can look back over them.
        chat_history.append(result)

        print(f"\nüí° Tip: Ask about:")
        print("   - Company financials and performance")
        print("   - Technology and product strategies")
        print("   - Risk factors and market analysis")
        print("   - Business segments and revenue breakdown")


if __name__ == "__main__":
    # Script entry point: without an API key only a hint is printed; with one,
    # the demo runs and the user may then opt into the interactive chat.
    if not os.getenv('GOOGLE_API_KEY'):
        print("\n‚ùå Please set GOOGLE_API_KEY environment variable first.")
        print("üí° Use the setup instructions above to get your free API key.")
    else:
        demonstrate_gemini_system()

        wants_chat = input("\nüí¨ Start interactive chat? (y/n): ").strip().lower()
        if wants_chat in ['y', 'yes']:
            interactive_gemini_chat()

üåü GEMINI-POWERED FINANCIAL Q&A SYSTEM
üöÄ Initializing Gemini Financial Q&A System...
üîß Initializing Gemini 2.5 Flash...
‚úÖ Gemini 2.5 Flash connected successfully!
üìä Loading financial dataset...
‚úÖ Loaded 7000 Q&A pairs
üîß Initializing vector store...
üìÑ Created 7338 document chunks
‚úÖ Gemini Financial Q&A System initialized successfully!

üß™ RUNNING COMPREHENSIVE TESTS:

üìä CATEGORY: Company History & Focus
------------------------------------------------------------

üéØ QUESTION: What was NVIDIA's initial business focus and how has it evolved over time?
üîç Retrieving relevant financial information...
‚úÖ Retrieved 6 relevant documents
ü§ñ Generating answer with Gemini...
‚úÖ Answer generated successfully

üìù ANSWER:
NVIDIA's initial business focus was on PC graphics. Since then, the company has expanded into other computationally intensive fields.

**Supporting Evidence and Details:**

*   **Initial Focus:** According to the provided Q&A data from NVIDIA'