In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import random
import json
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from collections import defaultdict
import google.generativeai as genai

# Configure the Gemini client with an API key stored in Kaggle Secrets.
# The kaggle_secrets import, the secret lookup and genai.configure all sit inside the
# try, so a failure in any of them ends up in the warning branch below.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("GEMINI_API_KEY")
    genai.configure(api_key=api_key)
    # Optional success message, left disabled; enable it when debugging the setup.
    # print("✅ Successfully configured Gemini API from Kaggle Secrets.") 
except Exception as e:
    # Any failure above (for example the secret was never added) is reported, not
    # re-raised, so the rest of the cell still runs.
    print(f"⚠️ Could not configure Gemini API. Please ensure you have added your key to Kaggle Secrets with the label 'GEMINI_API_KEY'. Error: {e}")

@dataclass
class MathProblem:
    """A math problem together with its reference answer."""
    problem: str  # problem statement; interpolated into the learner and judge prompts
    problem_type: str  # category label; part of the Q-learning state key
    difficulty: str  # difficulty label; part of the Q-learning state key
    expected_answer: str  # reference final answer; shown to the judge model

@dataclass
class StepResult:
    """Outcome recorded for one solution step after its last attempt."""
    step_number: int  # 1-based index of the step
    action: str  # high-level action the agent chose for the recorded attempt
    step_content: str  # text the learner produced for the recorded attempt
    is_correct: bool  # judge verdict for the recorded attempt
    reward: float  # reward assigned to the recorded attempt
    judge_feedback: str = ""  # judge's reply text
    attempts: int = 1  # attempt number at which the step was recorded

class QLearningAgent:
    """
    Tabular Q-learning agent that selects one high-level action per solution step.

    Update rule:
    Q(S, A) <- Q(S, A) + α * [R + γ * max_a Q(S', a) - Q(S, A)]

    Q-values live in ``q_table[state][action]``. Looking values up (when choosing an
    action or bootstrapping from the next state) never creates entries; only
    ``update_q_value`` writes to the table, so it holds updated state-action pairs only.
    """
    def __init__(self, learning_rate=0.1, discount_factor=0.9, epsilon=0.2):
        """
        Args:
            learning_rate: Step size α of the update.
            discount_factor: Discount γ applied to the best next-state value.
            epsilon: Probability of picking a random action in ``choose_action``.
        """
        self.learning_rate: float = learning_rate
        self.discount_factor: float = discount_factor
        self.epsilon: float = epsilon
        self.q_table: Dict[str, Dict[str, float]] = defaultdict(lambda: defaultdict(float))
        # Two candidate actions for each of the 5 solution steps, keyed by step number.
        self.action_space: Dict[int, List[str]] = {
            1: ["identify_and_setup", "initial_decomposition"],
            2: ["apply_core_rule", "select_technique"],
            3: ["execute_calculation", "algebraic_manipulation"],
            4: ["simplify_result", "check_for_edge_cases"],
            5: ["final_answer_and_conclusion", "verify_solution"]
        }

    def get_state(self, problem: "MathProblem", step: int) -> str:
        """Return the state key built from the problem type, difficulty and step number."""
        return f"{problem.problem_type}_{problem.difficulty}_step{step}"

    def choose_action(self, state: str, step: int) -> str:
        """
        Pick an action for ``step`` with an epsilon-greedy policy.

        Args:
            state: State key as produced by ``get_state``.
            step: Step number used to look up the available actions.

        Returns:
            One of the actions listed for ``step`` in ``action_space``.

        Raises:
            ValueError: If ``action_space`` has no actions for ``step``.
        """
        available_actions = self.action_space.get(step)
        if not available_actions:
            # Fail with a clear message instead of the TypeError random.choice(None) would raise.
            raise ValueError(f"No actions defined for step {step}")

        # Exploration: choose a random action
        if random.random() < self.epsilon:
            return random.choice(available_actions)

        # Exploitation: read the known values without inserting defaults into the table.
        known_q_values = self.q_table.get(state, {})
        available_q_values = {action: known_q_values.get(action, 0.0) for action in available_actions}

        if all(v == 0 for v in available_q_values.values()):
            # Nothing learned yet for this state (or everything is exactly zero): choose randomly.
            return random.choice(available_actions)

        return max(available_q_values, key=available_q_values.get)

    def update_q_value(self, state: str, action: str, reward: float, next_state: str, next_step: int):
        """
        Apply one Q-learning update to ``q_table[state][action]``.

        Q(S, A) <- Q(S, A) + α * [R + γ * max_a Q(S', a) - Q(S, A)]

        Args:
            state: Key of the state the action was taken in.
            action: Action that was taken.
            reward: Reward observed for the action.
            next_state: Key of the follow-up state; a state with no stored values
                contributes 0 to the target and is not added to the table.
            next_step: Step number of the follow-up state, used to select which of its
                actions count towards the maximum.
        """
        # 1. Old value Q(S, A); this is the only place where an entry may be created.
        current_q = self.q_table[state][action]

        # 2. max_a Q(S', a) over the actions available at the next step. Unseen actions
        #    count as 0; a state without any stored value yields 0 overall.
        next_q_values = self.q_table.get(next_state, {})
        available_next_actions = self.action_space.get(next_step, [])

        max_next_q = 0
        if next_q_values and available_next_actions:
            max_next_q = max(next_q_values.get(act, 0.0) for act in available_next_actions)

        # 3. TD target: R + γ * max_a Q(S', a)
        td_target = reward + self.discount_factor * max_next_q

        # 4. TD error: td_target - Q(S, A)
        td_error = td_target - current_q

        # 5. New value: Q(S, A) + α * TD_error
        self.q_table[state][action] = current_q + self.learning_rate * td_error

class GeminiMathSolver:
    """
    Orchestrates the step-by-step solving loop.

    A learner Gemini model writes each of the ``max_steps`` solution steps, a judge Gemini
    model grades every attempt, and a ``QLearningAgent`` picks the high-level action for
    each step and is updated with the reward derived from the judge's verdict.
    """
    def __init__(self, learner_model_name="gemini-1.5-flash-8b", judge_model_name="gemini-2.5-pro"):
        """
        Create the Q-learning agent and the learner/judge model handles.

        Args:
            learner_model_name: Gemini model name used to generate solution steps.
            judge_model_name: Gemini model name used to grade solution steps.

        If constructing either model raises, ``api_configured`` stays False and the solver
        returns canned learner text and random judge verdicts (mock mode).
        """
        self.agent = QLearningAgent()
        self.max_steps = 5
        self.max_retries_per_step = 2 # Allow the learner to try again if it makes a mistake
        self.api_configured = False
        
        # NOTE(review): only errors raised while constructing the model objects are
        # detected here; whether a missing/invalid API key surfaces at this point is
        # not visible from this file - confirm.
        try:
            self.learner_model = genai.GenerativeModel(learner_model_name)
            self.judge_model = genai.GenerativeModel(judge_model_name)
            self.api_configured = True
            print("✅ Gemini API configured successfully.")
        except Exception as e:
            print(f"⚠️ Gemini API not configured. Running in MOCK mode. Error: {str(e)}")
    
    def get_learner_response(self, problem: "MathProblem", step: int, action: str, previous_steps: List[str], feedback: Optional[str] = None) -> str:
        """
        Ask the learner model for the work of one solution step.

        Args:
            problem: Problem being solved; its statement goes into the prompt.
            step: Current step number; must be a key of the step instructions (1-5).
            action: High-level action chosen by the agent for this step.
            previous_steps: Text of the steps accepted so far, in order.
            feedback: Judge feedback on the previous failed attempt at this step, if any.

        Returns:
            The learner's reply text, or a fixed "[MOCK]" string when the API is not configured.

        Errors raised by the model call are not caught here.
        """
        if not self.api_configured:
            return f"[MOCK] Step {step}: Executing action '{action}' for problem type {problem.problem_type}."
    
        # Only retries carry feedback; first attempts leave this section of the prompt empty.
        feedback_prompt = ""
        if feedback:
            feedback_prompt = f"""
            Your previous attempt at this step was incorrect. Here is the feedback from the expert judge:
            ---
            {feedback}
            ---
            Please correct your mistake and provide a new, accurate response for this step.
            """
    
        # Render the accepted steps as "Step N: ..." lines, or the word None when there are none.
        if previous_steps:
            previous_steps_str = "".join(f"Step {i+1}: {s}\n" for i, s in enumerate(previous_steps))
        else:
            previous_steps_str = "None"
    
        step_instructions = {
            1: "Start by identifying the function/problem type and outlining the initial setup or first principle to apply.",
            2: "Apply the main mathematical rule or technique (e.g., chain rule, integration by parts, matrix inversion).",
            3: "Perform the necessary calculations and algebraic manipulations based on the previous step.",
            4: "Simplify the resulting expression and check for any intermediate errors or edge cases.",
            5: "State the complete, final answer clearly. This is your last step."
        }
        
        prompt = f"""
    You are an expert mathematician solving a calculus problem in a structured, 5-step process.
    You are on step {step} of 5.
    
    Problem: {problem.problem}
    
    Previous Steps:
    {previous_steps_str}
    
    Current Step Instructions ({step}/5): {step_instructions[step]}
    Your high-level action for this step is: '{action}'.
    
    {feedback_prompt}
    
    Provide only the mathematical work for this current step.
    {'This is the final step, you must provide the final answer.' if step == 5 else f'You have {5-step} steps remaining after this.'}
    """
        response = self.learner_model.generate_content(prompt)
        return response.text

    @staticmethod
    def _parse_verdict(feedback_text: str) -> bool:
        """Return True when the judge's reply opens with the verdict word CORRECT."""
        # The reply may wrap the verdict in markdown or quotes ("**CORRECT**: ...");
        # skip that decoration so such replies are not scored as incorrect.
        verdict_text = feedback_text.lstrip(" \t\r\n*_#`>\"'-")
        return verdict_text.upper().startswith('CORRECT')

    def get_judge_evaluation(self, problem: "MathProblem", step_content: str, step_number: int) -> Tuple[bool, float, str]:
        """
        Grade one step with the judge model and turn the verdict into a reward.

        Args:
            problem: Problem being solved; statement and expected answer go into the prompt.
            step_content: The learner's submission for this step.
            step_number: Step being graded; the step equal to ``max_steps`` is the final one.

        Returns:
            ``(is_correct, reward, feedback)``. Correct steps earn ``5.0 + step_number``
            (15.0 on the final step); incorrect steps earn -10.0 (-20.0 on the final step).
            If the judge call raises, the result is ``(False, -15.0, <error text>)``.
            In mock mode the verdict is random (60% correct) with rewards 10.0 / -5.0.
        """
        if not self.api_configured:
            # Mock evaluation for testing without an API key
            is_correct = random.random() > 0.4 # 60% chance of being correct
            reward = 10.0 if is_correct else -5.0
            feedback = "MOCK: This is a mock evaluation."
            return is_correct, reward, feedback
            
        is_final_step = (step_number == self.max_steps)
        
        prompt = f"""
    You are an expert mathematician and judge. Your task is to evaluate one step of a solution to a math problem.
    The problem is: "{problem.problem}"
    The expected final answer is: "{problem.expected_answer}"
    
    The current step being evaluated is Step {step_number}.
    The student's submission for this step is:
    ---
    {step_content}
    ---
    
    Based on the problem, the student's submission for this step, and the expected final answer, is this step correct?
    - A step is CORRECT if it is mathematically sound and makes logical progress towards the final answer.
    - A step is INCORRECT if it contains a mathematical error, a logical flaw, or is a step that doesn't lead to the correct solution.
    
    Start your response with the word "CORRECT" or "INCORRECT".
    Then, provide a brief, one-sentence explanation for your decision.
    
    Example 1:
    CORRECT: The application of the product rule is accurate.
    
    Example 2:
    INCORRECT: The derivative of sin(x) is -cos(x), not cos(x) as written.
    
    Example 3:
    INCORRECT: The calculation is correct, but this approach of integration by parts will not lead to the final answer.
    
    Now, evaluate the student's submission. The final answer should be: {problem.expected_answer}
    """
        
        try:
            response = self.judge_model.generate_content(prompt)
            feedback_text = response.text.strip()
            
            is_correct = self._parse_verdict(feedback_text)
            
            # Define rewards
            if is_correct:
                reward = 15.0 if is_final_step else 5.0 + step_number
            else:
                reward = -20.0 if is_final_step else -10.0
                
            return is_correct, reward, feedback_text
    
        except Exception as e:
            print(f"❌ Error during judge evaluation: {str(e)}")
            return False, -15.0, f"Evaluation failed due to an API error: {e}"

    def solve_problem(self, problem: "MathProblem") -> List["StepResult"]:
        """
        Solve one problem step by step, retrying failed steps and updating the Q-table.

        For every step the agent chooses an action, the learner produces the step and the
        judge grades it. A correct attempt is recorded and the Q-value is updated towards
        the next step's state. An incorrect attempt updates the Q-value with the penalty
        (using the current state as next state) and is retried with the judge's feedback;
        after ``max_retries_per_step`` attempts the incorrect step is recorded anyway.

        Returns:
            One ``StepResult`` per step, in step order.
        """
        print("\n" + "="*70)
        print(f"🧮 Solving Problem: {problem.problem}")
        print(f"🎯 Expected Answer: {problem.expected_answer}")
        print("-" * 70)
        
        results = []
        previous_steps_content = []

        for step in range(1, self.max_steps + 1):
            state = self.agent.get_state(problem, step)
            feedback_for_retry = None
            
            for attempt in range(1, self.max_retries_per_step + 1):
                # 1. Choose Action A from State S (re-chosen on every attempt)
                action = self.agent.choose_action(state, step)

                # 2. Take Action A, get Step Content
                step_content = self.get_learner_response(problem, step, action, previous_steps_content, feedback_for_retry)
                
                # 3. Observe Reward R and Next State S'
                is_correct, reward, judge_feedback = self.get_judge_evaluation(problem, step_content, step)
                
                status_icon = "✅" if is_correct else "❌"
                print(f"Step {step}/{self.max_steps} (Attempt {attempt}) | Action: {action} | Result: {status_icon}")
                print(f"  💬 Learner: {step_content.strip()}")
                print(f"  👨‍⚖️ Judge: {judge_feedback.strip()}")
                
                if is_correct:
                    # The step was correct, finalize and move to the next step
                    result = StepResult(step, action, step_content, is_correct, reward, judge_feedback, attempt)
                    results.append(result)
                    previous_steps_content.append(step_content)
                    
                    # 4. Update Q-Table; after the last step the next state is the terminal marker
                    next_step = step + 1
                    next_state = self.agent.get_state(problem, next_step) if next_step <= self.max_steps else "terminal"
                    self.agent.update_q_value(state, action, reward, next_state, next_step)
                    
                    time.sleep(1) # API rate limiting
                    break # Exit the retry loop
                else:
                    # The step was incorrect, prepare for another attempt
                    feedback_for_retry = judge_feedback
                    # Penalize the failed attempt; the agent stays in the same state, so
                    # the current state doubles as the next state of the update.
                    self.agent.update_q_value(state, action, reward, state, step)
                    
                    if attempt == self.max_retries_per_step:
                        # Max retries reached, accept the wrong answer and move on
                        print(f"  ⚠️ Max retries reached for step {step}. Accepting incorrect step.")
                        result = StepResult(step, action, step_content, is_correct, reward, judge_feedback, attempt)
                        results.append(result)
                        previous_steps_content.append(step_content)
                        break # Exit the retry loop
                    
                    time.sleep(1) # API rate limiting

        # Final Summary
        total_reward = sum(r.reward for r in results)
        correct_steps = sum(1 for r in results if r.is_correct)
        print("\n" + "-"*30 + " SUMMARY " + "-"*30)
        print(f"📊 Final Result: {correct_steps}/{self.max_steps} steps correct.")
        print(f"🏆 Total Reward: {total_reward:.1f}")
        print("="*70)
        return results

# --- Main Execution ---
def create_hard_problems() -> List[MathProblem]:
    """Build the fixed set of very hard sample problems used to exercise the solver."""
    # Each row: (problem statement, problem type, difficulty, expected answer).
    problem_specs = [
        (
            "Find the derivative of the Weierstrass function, defined as f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx), for 0 < a < 1 and ab > 1 + (3/2)π.",
            "derivative",
            "very_hard",
            "The function is continuous everywhere but differentiable nowhere. Therefore, the derivative does not exist for any value of x.",
        ),
        (
            "Consider f_n(x) = 2nx*e^(-nx²) on [0, 1]. Evaluate lim[n→∞] ∫[0 to 1] f_n(x) dx and ∫[0 to 1] lim[n→∞] f_n(x) dx. Are they equal?",
            "real_analysis",
            "very_hard",
            "They are not equal. The integral of the limit is ∫0 dx = 0. The limit of the integral is lim[n→∞] (1 - e⁻ⁿ) = 1. They differ because convergence is not uniform, so the limit and integral cannot be interchanged.",
        ),
    ]
    return [MathProblem(*spec) for spec in problem_specs]


def main():
    """Run the solver over the sample problems, then print up to five Q-table states."""
    solver = GeminiMathSolver()
    if not solver.api_configured:
        print("\n--- RUNNING IN MOCK MODE. NO REAL LEARNING WILL OCCUR. ---")
        print("--- Please configure your Gemini API key to run properly. ---")

    for prob in create_hard_problems():
        solver.solve_problem(prob)

    print("\n\n" + "="*30 + " FINAL Q-TABLE STATE " + "="*30)
    learned = solver.agent.q_table
    if not learned:
        print("Q-Table is empty (likely ran in mock mode).")
        return

    # Show only the first five states as a quick check that values were learned.
    for state, actions in list(learned.items())[:5]:
        print(f"State: {state}")
        for action, value in actions.items():
            print(f"  - Action: {action}, Q-Value: {value:.3f}")

# Run the demo only when this code is executed as the main module, not when imported.
if __name__ == "__main__":
    main()

✅ Gemini API configured successfully.

🧮 Solving Problem: Find the derivative of the Weierstrass function, defined as f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx), for 0 < a < 1 and ab > 1 + (3/2)π.
🎯 Expected Answer: The function is continuous everywhere but differentiable nowhere. Therefore, the derivative does not exist for any value of x.
----------------------------------------------------------------------
Step 1/5 (Attempt 1) | Action: identify_and_setup | Result: ❌
  💬 Learner: Identify the function as an infinite series of trigonometric functions.  The problem requires finding the derivative of a function defined by an infinite sum.  The key is to apply the term-by-term differentiation rule to the series, assuming the resulting series converges uniformly in a suitable interval.
  👨‍⚖️ Judge: INCORRECT: This approach is flawed because the series of derivatives for the Weierstrass function does not converge, meaning the assumption for term-by-term differentiation is not met and this 

**Section 2 (this section tests saving and reloading the Q-table)**

In [12]:
import numpy as np
import random
import json
import time
import os
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from collections import defaultdict
import google.generativeai as genai

# Configure the Gemini client with an API key stored in Kaggle Secrets.
# The kaggle_secrets import, the secret lookup and genai.configure all sit inside the
# try, so a failure in any of them ends up in the warning branch below.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("GEMINI_API_KEY")
    genai.configure(api_key=api_key)
    # Optional success message, left disabled; enable it when debugging the setup.
    # print("✅ Successfully configured Gemini API from Kaggle Secrets.") 
except Exception as e:
    # Any failure above (for example the secret was never added) is reported, not
    # re-raised, so the rest of the cell still runs.
    print(f"⚠️ Could not configure Gemini API. Please ensure you have added your key to Kaggle Secrets with the label 'GEMINI_API_KEY'. Error: {e}")

@dataclass
class MathProblem:
    """A math problem together with its reference answer."""
    problem: str  # problem statement; interpolated into the learner and judge prompts
    problem_type: str  # category label; part of the Q-learning state key
    difficulty: str  # difficulty label; part of the Q-learning state key
    expected_answer: str  # reference final answer; shown to the judge model

@dataclass
class StepResult:
    """Outcome recorded for one solution step after its last attempt."""
    step_number: int  # 1-based index of the step
    action: str  # high-level action the agent chose for the recorded attempt
    step_content: str  # text the learner produced for the recorded attempt
    is_correct: bool  # judge verdict for the recorded attempt
    reward: float  # reward assigned to the recorded attempt
    judge_feedback: str = ""  # judge's reply text
    attempts: int = 1  # attempt number at which the step was recorded

class QLearningAgent:
    """
    Tabular Q-learning agent with JSON persistence of its Q-table.

    Update rule:
    Q(S, A) <- Q(S, A) + α * [R + γ * max_a Q(S', a) - Q(S, A)]

    Q-values live in ``q_table[state][action]``. Looking values up (when choosing an
    action or bootstrapping from the next state) never creates entries; only
    ``update_q_value`` and ``load_model`` write to the table.
    """
    def __init__(self, learning_rate=0.1, discount_factor=0.9, epsilon=0.2, model_path="qlearning_model.json"):
        """
        Args:
            learning_rate: Step size α of the update.
            discount_factor: Discount γ applied to the best next-state value.
            epsilon: Probability of picking a random action in ``choose_action``.
            model_path: JSON file the Q-table is loaded from and saved to.

        A model file found at ``model_path`` is loaded immediately; hyperparameters
        stored in that file replace the values passed here.
        """
        self.learning_rate: float = learning_rate
        self.discount_factor: float = discount_factor
        self.epsilon: float = epsilon
        self.model_path: str = model_path
        self.q_table: Dict[str, Dict[str, float]] = defaultdict(lambda: defaultdict(float))
        # Two candidate actions for each of the 5 solution steps, keyed by step number.
        self.action_space: Dict[int, List[str]] = {
            1: ["identify_and_setup", "initial_decomposition"],
            2: ["apply_core_rule", "select_technique"],
            3: ["execute_calculation", "algebraic_manipulation"],
            4: ["simplify_result", "check_for_edge_cases"],
            5: ["final_answer_and_conclusion", "verify_solution"]
        }
        
        # Try to load existing model
        self.load_model()

    def get_state(self, problem: "MathProblem", step: int) -> str:
        """Return the state key built from the problem type, difficulty and step number."""
        return f"{problem.problem_type}_{problem.difficulty}_step{step}"

    def choose_action(self, state: str, step: int) -> str:
        """
        Pick an action for ``step`` with an epsilon-greedy policy.

        Args:
            state: State key as produced by ``get_state``.
            step: Step number used to look up the available actions.

        Returns:
            One of the actions listed for ``step`` in ``action_space``.

        Raises:
            ValueError: If ``action_space`` has no actions for ``step``.
        """
        available_actions = self.action_space.get(step)
        if not available_actions:
            # Fail with a clear message instead of the TypeError random.choice(None) would raise.
            raise ValueError(f"No actions defined for step {step}")

        # Exploration: choose a random action
        if random.random() < self.epsilon:
            return random.choice(available_actions)

        # Exploitation: read the known values without inserting defaults into the table,
        # so never-updated pairs are not persisted by save_model.
        known_q_values = self.q_table.get(state, {})
        available_q_values = {action: known_q_values.get(action, 0.0) for action in available_actions}

        if all(v == 0 for v in available_q_values.values()):
            # Nothing learned yet for this state (or everything is exactly zero): choose randomly.
            return random.choice(available_actions)

        return max(available_q_values, key=available_q_values.get)

    def update_q_value(self, state: str, action: str, reward: float, next_state: str, next_step: int):
        """
        Apply one Q-learning update to ``q_table[state][action]``.

        Q(S, A) <- Q(S, A) + α * [R + γ * max_a Q(S', a) - Q(S, A)]

        Args:
            state: Key of the state the action was taken in.
            action: Action that was taken.
            reward: Reward observed for the action.
            next_state: Key of the follow-up state; a state with no stored values
                contributes 0 to the target and is not added to the table.
            next_step: Step number of the follow-up state, used to select which of its
                actions count towards the maximum.
        """
        # 1. Old value Q(S, A); this is the only place where an entry may be created.
        current_q = self.q_table[state][action]

        # 2. max_a Q(S', a) over the actions available at the next step. Unseen actions
        #    count as 0; a state without any stored value yields 0 overall.
        next_q_values = self.q_table.get(next_state, {})
        available_next_actions = self.action_space.get(next_step, [])

        max_next_q = 0
        if next_q_values and available_next_actions:
            max_next_q = max(next_q_values.get(act, 0.0) for act in available_next_actions)

        # 3. TD target: R + γ * max_a Q(S', a)
        td_target = reward + self.discount_factor * max_next_q

        # 4. TD error: td_target - Q(S, A)
        td_error = td_target - current_q

        # 5. New value: Q(S, A) + α * TD_error
        self.q_table[state][action] = current_q + self.learning_rate * td_error

    def save_model(self):
        """
        Write the Q-table, hyperparameters and action space to ``model_path`` as JSON.

        Failures are reported on stdout and not raised. The integer keys of
        ``action_space`` become strings in the JSON file.
        """
        try:
            # Convert the nested defaultdicts to plain dicts for JSON serialization
            q_table_dict = {state: dict(actions) for state, actions in self.q_table.items()}
            
            model_data = {
                "q_table": q_table_dict,
                "learning_rate": self.learning_rate,
                "discount_factor": self.discount_factor,
                "epsilon": self.epsilon,
                "action_space": self.action_space
            }
            
            with open(self.model_path, 'w', encoding='utf-8') as f:
                json.dump(model_data, f, indent=2)
            
            print(f"💾 Model saved successfully to {self.model_path}")
            print(f"   - Q-table entries: {len(q_table_dict)}")
            print(f"   - Total state-action pairs: {sum(len(actions) for actions in q_table_dict.values())}")
            
        except Exception as e:
            print(f"❌ Error saving model: {str(e)}")

    def load_model(self):
        """
        Load the Q-table and any stored hyperparameters from ``model_path``.

        A missing file leaves the agent unchanged. If the file cannot be read or parsed,
        the error is reported on stdout and the agent is left untouched: the file is
        parsed completely before any attribute is replaced.
        """
        if not os.path.exists(self.model_path):
            print(f"📁 No existing model found at {self.model_path}. Starting with empty Q-table.")
            return
        
        try:
            with open(self.model_path, 'r', encoding='utf-8') as f:
                model_data = json.load(f)
            
            # Build the table on the side so a malformed file cannot leave a half-loaded agent.
            loaded_table = defaultdict(lambda: defaultdict(float))
            for state, actions in model_data["q_table"].items():
                for action, value in actions.items():
                    loaded_table[state][action] = value

            state_count = len(model_data["q_table"])
            pair_count = sum(len(actions) for actions in model_data["q_table"].values())
        except Exception as e:
            print(f"❌ Error loading model: {str(e)}")
            print("   Starting with empty Q-table.")
            return

        self.q_table = loaded_table

        # Restore hyperparameters (optional, in case they were saved)
        if "learning_rate" in model_data:
            self.learning_rate = model_data["learning_rate"]
        if "discount_factor" in model_data:
            self.discount_factor = model_data["discount_factor"]
        if "epsilon" in model_data:
            self.epsilon = model_data["epsilon"]

        print(f"📂 Model loaded successfully from {self.model_path}")
        print(f"   - Q-table entries: {state_count}")
        print(f"   - Total state-action pairs: {pair_count}")

    def get_q_table_stats(self) -> Dict[str, object]:
        """
        Summarize the current Q-table.

        Returns:
            A dict with ``total_states`` (number of states), ``total_actions`` (number of
            stored state-action pairs) and ``step_stats``, which maps the step number
            taken from each state key (as a string) to ``{"count", "avg_q"}``.
        """
        total_states = len(self.q_table)
        total_actions = sum(len(actions) for actions in self.q_table.values())
        
        # Accumulate per-step sums first; they are turned into averages below.
        step_stats = {}
        for state, actions in self.q_table.items():
            # The step number follows the LAST "_step" marker; problem types or
            # difficulties may contain "_step" themselves (e.g. "multi_step_...").
            _, marker, step_num = state.rpartition("_step")
            if not marker:
                continue
            entry = step_stats.setdefault(step_num, {"count": 0, "avg_q": 0})
            entry["count"] += len(actions)
            entry["avg_q"] += sum(actions.values())
        
        for entry in step_stats.values():
            if entry["count"] > 0:
                entry["avg_q"] /= entry["count"]
        
        return {
            "total_states": total_states,
            "total_actions": total_actions,
            "step_stats": step_stats
        }

class GeminiMathSolver:
    def __init__(self, learner_model_name="gemini-1.5-flash-8b", judge_model_name="gemini-2.5-pro", model_path="qlearning_model.json"):
        """
        Set up the Q-learning agent and the learner/judge model handles.

        If constructing either model raises, ``api_configured`` stays False and a
        warning announcing mock mode is printed.
        """
        # The agent is created first, with the same model_path this solver stores.
        self.agent = QLearningAgent(model_path=model_path)
        self.model_path = model_path
        self.api_configured = False
        self.max_steps = 5
        self.max_retries_per_step = 3  # attempts the learner gets per step

        try:
            self.learner_model = genai.GenerativeModel(learner_model_name)
            self.judge_model = genai.GenerativeModel(judge_model_name)
        except Exception as e:
            print(f"⚠️ Gemini API not configured. Running in MOCK mode. Error: {str(e)}")
        else:
            self.api_configured = True
            print("✅ Gemini API configured successfully.")
    
    def get_learner_response(self, problem: MathProblem, step: int, action: str, previous_steps: List[str], feedback: Optional[str] = None) -> str:
        """
        Ask the learner model for the work of one solution step.

        Args:
            problem: Problem being solved; its statement goes into the prompt.
            step: Current step number; must be a key of the step instructions (1-5).
            action: High-level action chosen by the agent for this step.
            previous_steps: Text of the steps accepted so far, in order.
            feedback: Judge feedback on the previous failed attempt at this step, if any.

        Returns:
            The learner's reply text, or a fixed "[MOCK]" string when the API is not configured.

        Errors raised by the model call are not caught here.
        """
        if not self.api_configured:
            return f"[MOCK] Step {step}: Executing action '{action}' for problem type {problem.problem_type}."
    
        # Only retries carry feedback; first attempts leave this section of the prompt empty.
        feedback_prompt = ""
        if feedback:
            feedback_prompt = f"""
            Your previous attempt at this step was incorrect. Here is the feedback from the expert judge:
            ---
            {feedback}
            ---
            Please correct your mistake and provide a new, accurate response for this step.
            """
    
        # Render the accepted steps as "Step N: ..." lines up front, so the prompt
        # f-string below only interpolates a plain variable.
        if previous_steps:
            previous_steps_str = "".join(f"Step {i+1}: {s}\n" for i, s in enumerate(previous_steps))

        else:
            # The literal word None tells the learner that no steps exist yet.
            previous_steps_str = "None"
    
        # Fixed per-step guidance; indexing with a step outside 1-5 raises KeyError.
        step_instructions = {
            1: "Start by identifying the function/problem type and outlining the initial setup or first principle to apply.",
            2: "Apply the main mathematical rule or technique (e.g., chain rule, integration by parts, matrix inversion).",
            3: "Perform the necessary calculations and algebraic manipulations based on the previous step.",
            4: "Simplify the resulting expression and check for any intermediate errors or edge cases.",
            5: "State the complete, final answer clearly. This is your last step."
        }
        
        # The step count 5 is hard-coded in the prompt text rather than read from max_steps.
        prompt = f"""
    You are an expert mathematician solving a calculus problem in a structured, 5-step process.
    You are on step {step} of 5.
    
    Problem: {problem.problem}
    
    Previous Steps:
    {previous_steps_str}
    
    Current Step Instructions ({step}/5): {step_instructions[step]}
    Your high-level action for this step is: '{action}'.
    
    {feedback_prompt}
    
    Provide only the mathematical work for this current step.
    {'This is the final step, you must provide the final answer.' if step == 5 else f'You have {5-step} steps remaining after this.'}
    """
        response = self.learner_model.generate_content(prompt)
        return response.text

    def get_judge_evaluation(self, problem: MathProblem, step_content: str, step_number: int) -> Tuple[bool, float, str]:
        """
        Grade one step with the judge model and turn the verdict into a reward.

        Args:
            problem: Problem being solved; statement and expected answer go into the prompt.
            step_content: The learner's submission for this step.
            step_number: Step being graded; the step equal to ``max_steps`` is the final one.

        Returns:
            ``(is_correct, reward, feedback)``. Correct steps earn ``5.0 + step_number``
            (15.0 on the final step); incorrect steps earn -10.0 (-20.0 on the final step).
            If the judge call raises, the result is ``(False, -15.0, <error text>)``.
            In mock mode the verdict is random (60% correct) with rewards 10.0 / -5.0.
        """
        if not self.api_configured:
            # Mock evaluation for testing without an API key
            is_correct = random.random() > 0.4 # 60% chance of being correct
            reward = 10.0 if is_correct else -5.0
            feedback = "MOCK: This is a mock evaluation."
            return is_correct, reward, feedback
            
        is_final_step = (step_number == self.max_steps)
        
        prompt = f"""
    You are an expert mathematician and judge. Your task is to evaluate one step of a solution to a math problem.
    The problem is: "{problem.problem}"
    The expected final answer is: "{problem.expected_answer}"
    
    The current step being evaluated is Step {step_number}.
    The student's submission for this step is:
    ---
    {step_content}
    ---
    
    Based on the problem, the student's submission for this step, and the expected final answer, is this step correct?
    - A step is CORRECT if it is mathematically sound and makes logical progress towards the final answer.
    - A step is INCORRECT if it contains a mathematical error, a logical flaw, or is a step that doesn't lead to the correct solution.
    
    Start your response with the word "CORRECT" or "INCORRECT".
    Then, provide a brief, one-sentence explanation for your decision.
    
    Example 1:
    CORRECT: The application of the product rule is accurate.
    
    Example 2:
    INCORRECT: The derivative of sin(x) is -cos(x), not cos(x) as written.
    
    Example 3:
    INCORRECT: The calculation is correct, but this approach of integration by parts will not lead to the final answer.
    
    Now, evaluate the student's submission. The final answer should be: {problem.expected_answer}
    """
        
        try:
            response = self.judge_model.generate_content(prompt)
            feedback_text = response.text.strip()
            
            # The reply may wrap the verdict in markdown or quotes ("**CORRECT**: ...");
            # skip that decoration so such replies are not scored as incorrect.
            verdict_text = feedback_text.lstrip(" \t\r\n*_#`>\"'-")
            is_correct = verdict_text.upper().startswith('CORRECT')
            
            # Define rewards
            if is_correct:
                reward = 15.0 if is_final_step else 5.0 + step_number
            else:
                reward = -20.0 if is_final_step else -10.0
                
            return is_correct, reward, feedback_text
    
        except Exception as e:
            print(f"❌ Error during judge evaluation: {str(e)}")
            return False, -15.0, f"Evaluation failed due to an API error: {e}"

    def solve_problem(self, problem: MathProblem) -> List[StepResult]:
        """Solve ``problem`` step by step, guided by the Q-learning agent.

        For each step from 1 to ``self.max_steps`` the agent chooses an action,
        the learner produces the step content and the judge evaluates it. A
        step judged incorrect is retried (with the judge's feedback passed back
        to the learner) up to ``self.max_retries_per_step`` attempts; if the
        last attempt is still incorrect it is accepted so the run can continue.
        Every attempt updates the Q-table, and the model is saved once the
        problem is finished.

        Args:
            problem: The math problem to solve.

        Returns:
            The accepted ``StepResult`` of each step, in step order.
        """
        print("\n" + "="*70)
        print(f"🧮 Solving Problem: {problem.problem}")
        print(f"🎯 Expected Answer: {problem.expected_answer}")
        print("-" * 70)

        results = []
        previous_steps_content = []

        for step in range(1, self.max_steps + 1):
            state = self.agent.get_state(problem, step)
            feedback_for_retry = None

            for attempt in range(1, self.max_retries_per_step + 1):
                # 1. Choose Action A from State S
                action = self.agent.choose_action(state, step)

                # 2. Take Action A, get Step Content
                step_content = self.get_learner_response(
                    problem, step, action, previous_steps_content, feedback_for_retry
                )

                # 3. Observe Reward R and Next State S'
                is_correct, reward, judge_feedback = self.get_judge_evaluation(problem, step_content, step)

                status_icon = "✅" if is_correct else "❌"
                print(f"Step {step}/{self.max_steps} (Attempt {attempt}) | Action: {action} | Result: {status_icon}")
                print(f"  💬 Learner: {step_content.strip()}")
                print(f"  👨‍⚖️ Judge: {judge_feedback.strip()}")

                # 4. Update Q-Table
                if is_correct:
                    # A correct step transitions to the next step's state
                    # (or to "terminal" after the last step).
                    next_step = step + 1
                    next_state = self.agent.get_state(problem, next_step) if next_step <= self.max_steps else "terminal"
                    self.agent.update_q_value(state, action, reward, next_state, next_step)
                else:
                    # A failed attempt stays in the current state; the penalty
                    # discourages this action, and the judge's feedback is
                    # handed to the learner on the next attempt.
                    feedback_for_retry = judge_feedback
                    self.agent.update_q_value(state, action, reward, state, step)

                out_of_retries = attempt == self.max_retries_per_step
                step_settled = is_correct or out_of_retries

                if step_settled:
                    if not is_correct:
                        # Max retries reached, accept the wrong answer and move on
                        print(f"  ⚠️ Max retries reached for step {step}. Accepting incorrect step.")
                    # NOTE(review): `attempt` is passed as a seventh positional
                    # field; confirm the StepResult definition in effect declares it.
                    result = StepResult(step, action, step_content, is_correct, reward, judge_feedback, attempt)
                    results.append(result)
                    previous_steps_content.append(step_content)

                # API rate limiting: pause after every attempt, including a
                # final failed one, so the next step's calls are throttled too.
                time.sleep(1)

                if step_settled:
                    break  # Exit the retry loop

        # Save the model after each problem
        self.agent.save_model()

        # Final Summary
        total_reward = sum(r.reward for r in results)
        correct_steps = sum(1 for r in results if r.is_correct)
        print("\n" + "-"*30 + " SUMMARY " + "-"*30)
        print(f"📊 Final Result: {correct_steps}/{self.max_steps} steps correct.")
        print(f"🏆 Total Reward: {total_reward:.1f}")
        print("="*70)
        return results

    def print_model_stats(self):
        """Print a summary of the agent's Q-table statistics.

        Reads the stats dict from ``self.agent.get_q_table_stats()`` and prints
        the state and state-action totals, followed by a per-step breakdown
        when ``step_stats`` is non-empty.
        """
        summary = self.agent.get_q_table_stats()
        rule = "=" * 30

        print("\n" + rule + " MODEL STATISTICS " + rule)
        print(f"📊 Total States: {summary['total_states']}")
        print(f"📊 Total State-Action Pairs: {summary['total_actions']}")

        per_step = summary['step_stats']
        if per_step:
            print("\n📈 Step-wise Statistics:")
            for step_no, entry in per_step.items():
                print(f"  Step {step_no}: {entry['count']} actions, avg Q-value: {entry['avg_q']:.3f}")

        # Closing rule spans the full 70-column banner width.
        print("=" * 70)

# --- Main Execution ---
def create_hard_problems() -> List[MathProblem]:
    """Return the fixed sample problems used to exercise the solver.

    The list holds two "very_hard" analysis problems followed by one "easy"
    derivative, in that order.
    """
    weierstrass = MathProblem(
        problem="Find the derivative of the Weierstrass function, defined as f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx), for 0 < a < 1 and ab > 1 + (3/2)π.",
        problem_type="derivative",
        difficulty="very_hard",
        expected_answer="The function is continuous everywhere but differentiable nowhere. Therefore, the derivative does not exist for any value of x.",
    )

    # Classic counterexample to swapping a limit with an integral.
    limit_vs_integral = MathProblem(
        problem="Consider f_n(x) = 2nx*e^(-nx²) on [0, 1]. Evaluate lim[n→∞] ∫[0 to 1] f_n(x) dx and ∫[0 to 1] lim[n→∞] f_n(x) dx. Are they equal?",
        problem_type="real_analysis",
        difficulty="very_hard",
        expected_answer="They are not equal. The integral of the limit is ∫0 dx = 0. The limit of the integral is lim[n→∞] (1 - e⁻ⁿ) = 1. They differ because convergence is not uniform, so the limit and integral cannot be interchanged.",
    )

    polynomial = MathProblem(
        problem="Find the derivative of f(x) = x^2 + 3x + 2",
        problem_type="derivative",
        difficulty="easy",
        expected_answer="f'(x) = 2x + 3",
    )

    return [weierstrass, limit_vs_integral, polynomial]


def main():
    """Run the solver over the sample problems and report what it learned.

    Builds a ``GeminiMathSolver`` backed by ``math_qlearning_model.json``,
    prints the model statistics before and after solving every sample
    problem, then prints the Q-values of up to five states.
    """
    # You can specify a custom model path if needed
    solver = GeminiMathSolver(model_path="math_qlearning_model.json")

    if not solver.api_configured:
        print("\n--- RUNNING IN MOCK MODE. NO REAL LEARNING WILL OCCUR. ---")
        print("--- Please configure your Gemini API key to run properly. ---")

    # Statistics before any problem has been attempted in this run
    solver.print_model_stats()

    for prob in create_hard_problems():
        solver.solve_problem(prob)

    # Statistics after the run
    solver.print_model_stats()

    print("\n\n" + "="*30 + " FINAL Q-TABLE STATE " + "="*30)
    q_table = solver.agent.q_table
    if not q_table:
        print("Q-Table is empty (likely ran in mock mode).")
        return

    # Show only the first few states to keep the output short
    for state, actions in list(q_table.items())[:5]:
        print(f"State: {state}")
        for action, value in actions.items():
            print(f"  - Action: {action}, Q-Value: {value:.3f}")

# Run the demo only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

📁 No existing model found at math_qlearning_model.json. Starting with empty Q-table.
✅ Gemini API configured successfully.

📊 Total States: 0
📊 Total State-Action Pairs: 0

🧮 Solving Problem: Find the derivative of the Weierstrass function, defined as f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx), for 0 < a < 1 and ab > 1 + (3/2)π.
🎯 Expected Answer: The function is continuous everywhere but differentiable nowhere. Therefore, the derivative does not exist for any value of x.
----------------------------------------------------------------------
Step 1/5 (Attempt 1) | Action: identify_and_setup | Result: ❌
  💬 Learner: **Step 1: Identify and Setup**

The problem is to find the derivative of a function defined by an infinite series.  The function f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx)  is a trigonometric series.  To find the derivative, we will employ the term-by-term differentiation rule, which states that if a function is defined by a convergent power series, the derivative of the function 

KeyboardInterrupt: 