In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import random
import json
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from collections import defaultdict
import google.generativeai as genai

# Read the Gemini key from Kaggle Secrets and register it with the SDK.
# Any failure (missing kaggle_secrets module, missing secret, SDK error) is
# reported once and the notebook keeps running.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("GEMINI_API_KEY")
    genai.configure(api_key=api_key)
except Exception as config_error:
    print(
        "⚠️ Could not configure Gemini API. "
        "Please ensure you have added your key to Kaggle Secrets with the label 'GEMINI_API_KEY'. "
        f"Error: {config_error}"
    )

@dataclass
class MathProblem:
    """One math task together with the reference answer it is graded against."""
    # Full problem statement; interpolated into the learner and judge prompts.
    problem: str
    # Category label (e.g. "derivative"); part of the Q-learning state key.
    problem_type: str
    # Difficulty label (e.g. "very_hard"); part of the Q-learning state key.
    difficulty: str
    # Reference final answer; shown to the judge model only.
    expected_answer: str

@dataclass
class StepResult:
    """Outcome recorded for one solution step once it is accepted (right or wrong)."""
    # 1-based index of the step within the solution.
    step_number: int
    # High-level action the agent chose for the recorded attempt.
    action: str
    # Text produced by the learner model for the recorded attempt.
    step_content: str
    # Judge verdict for the recorded attempt.
    is_correct: bool
    # Reward returned by the judge evaluation for the recorded attempt.
    reward: float
    # Raw judge response (verdict plus explanation).
    judge_feedback: str = ""
    # Attempt number on which this result was recorded (1 = first try).
    attempts: int = 1

class QLearningAgent:
    """
    Tabular Q-learning agent that selects one high-level action per solution step.

    Every update applies the one-step Q-learning rule:
    Q(S, A) <- Q(S, A) + α * [R + γ * max_a Q(S', a) - Q(S, A)]

    The Q-table is a nested defaultdict (state -> action -> value), so indexing it
    with ``q_table[state][action]`` inserts a zero entry as a side effect. All
    read paths below therefore go through ``dict.get``; only ``update_q_value``
    writes, which keeps the table limited to pairs that were actually updated.
    """
    def __init__(self, learning_rate=0.1, discount_factor=0.9, epsilon=0.2):
        """
        Args:
            learning_rate: Step size α of the Q-update.
            discount_factor: Discount γ applied to the best next-state value.
            epsilon: Probability of picking a random action (exploration).
        """
        self.learning_rate: float = learning_rate
        self.discount_factor: float = discount_factor
        self.epsilon: float = epsilon
        self.q_table: Dict[str, Dict[str, float]] = defaultdict(lambda: defaultdict(float))
        # Two candidate actions for each of the 5 solution steps.
        self.action_space: Dict[int, List[str]] = {
            1: ["identify_and_setup", "initial_decomposition"],
            2: ["apply_core_rule", "select_technique"],
            3: ["execute_calculation", "algebraic_manipulation"],
            4: ["simplify_result", "check_for_edge_cases"],
            5: ["final_answer_and_conclusion", "verify_solution"]
        }

    def get_state(self, problem: "MathProblem", step: int) -> str:
        """Return the state key ``<problem_type>_<difficulty>_step<step>``."""
        # The annotation is quoted so this class does not require MathProblem
        # to be defined before it; only the two attributes below are read.
        return f"{problem.problem_type}_{problem.difficulty}_step{step}"

    def choose_action(self, state: str, step: int) -> str:
        """
        Pick an action for ``step`` with an epsilon-greedy policy.

        With probability ``epsilon`` a random action is returned. Otherwise the
        action with the highest Q-value is returned; if no available action has
        a non-zero value yet, the choice is random. The Q-table is not modified.

        Raises:
            ValueError: If ``step`` has no entry in ``action_space``.
        """
        available_actions = self.action_space.get(step)
        if not available_actions:
            raise ValueError(f"No actions are defined for step {step!r}.")

        # Exploration: choose a random action
        if random.random() < self.epsilon:
            return random.choice(available_actions)

        # Exploitation: read learned values without inserting missing keys
        learned = self.q_table.get(state, {})
        available_q_values = {action: learned.get(action, 0.0) for action in available_actions}

        if all(value == 0 for value in available_q_values.values()):
            # Nothing learned yet for this state, so there is no best action
            return random.choice(available_actions)

        return max(available_q_values, key=available_q_values.get)

    def update_q_value(self, state: str, action: str, reward: float, next_state: str, next_step: int):
        """
        Apply one Q-learning update to ``Q(state, action)``.

        Args:
            state: State key in which ``action`` was taken.
            action: Action that was taken.
            reward: Reward observed for the action.
            next_state: State key reached afterwards; an unknown key such as a
                terminal marker contributes a next-state value of 0.
            next_step: Step number used to look up the actions available in
                ``next_state``; a step without actions also contributes 0.
        """
        # 1. Old estimate Q(S, A); a pair never updated before counts as 0
        current_q = self.q_table.get(state, {}).get(action, 0.0)

        # 2. max_a Q(S', a) over the actions available at the next step.
        #    Read-only lookups keep next_state out of the table.
        next_q_values = self.q_table.get(next_state, {})
        available_next_actions = self.action_space.get(next_step, [])

        max_next_q = 0.0
        if next_q_values and available_next_actions:
            max_next_q = max(next_q_values.get(act, 0.0) for act in available_next_actions)

        # 3. TD target: R + γ * max_a Q(S', a)
        td_target = reward + self.discount_factor * max_next_q

        # 4. TD error: td_target - Q(S, A)
        td_error = td_target - current_q

        # 5. New estimate: Q(S, A) + α * TD_error (the only write to the table)
        self.q_table[state][action] = current_q + self.learning_rate * td_error

class GeminiMathSolver:
    """
    Solve math problems step by step with two Gemini models and a Q-learning agent.

    For every step the agent picks a high-level action, the learner model
    writes the step, the judge model grades it, and the resulting reward is
    fed back into the agent's Q-table. Incorrect steps are retried with the
    judge's feedback until the attempt budget for the step is used up.
    """

    # Characters a model may put in front of its verdict word (markdown
    # emphasis, headings, quotes, list markers, whitespace).
    _VERDICT_DECORATION = " \t\r\n*_#`\"'>-"

    def __init__(self, learner_model_name="gemini-1.5-flash-8b", judge_model_name="gemini-2.5-pro"):
        """
        Args:
            learner_model_name: Gemini model that writes each solution step.
            judge_model_name: Gemini model that grades each step.
        """
        self.agent = QLearningAgent()
        self.max_steps = 5
        # Total attempts allowed per step (the first try counts as attempt 1).
        self.max_retries_per_step = 2
        self.api_configured = False

        # NOTE(review): this only detects errors raised while constructing the
        # model objects. Whether construction fails when no API key was
        # configured is not visible here -- confirm before relying on mock mode.
        try:
            self.learner_model = genai.GenerativeModel(learner_model_name)
            self.judge_model = genai.GenerativeModel(judge_model_name)
            self.api_configured = True
            print("✅ Gemini API configured successfully.")
        except Exception as e:
            print(f"⚠️ Gemini API not configured. Running in MOCK mode. Error: {str(e)}")

    @staticmethod
    def _is_correct_verdict(feedback_text: str) -> bool:
        """Return True when the judge reply opens with the word CORRECT.

        Leading decoration such as ``**`` or ``> `` is ignored, so a reply like
        ``**CORRECT**: ...`` is not misread as a failure. ``INCORRECT`` never
        matches because the comparison is anchored at the first word.
        """
        verdict = feedback_text.lstrip(GeminiMathSolver._VERDICT_DECORATION).upper()
        return verdict.startswith('CORRECT')

    def get_learner_response(self, problem: "MathProblem", step: int, action: str, previous_steps: List[str], feedback: Optional[str] = None) -> str:
        """
        Ask the learner model for the content of one solution step.

        Args:
            problem: Problem being solved; ``problem`` and ``problem_type`` are read.
            step: 1-based number of the step to produce.
            action: High-level action chosen by the agent for this step.
            previous_steps: Accepted content of the earlier steps, in order.
            feedback: Judge feedback on the previous failed attempt, if any.

        Returns:
            The learner's text, or a canned string in mock mode. Errors raised
            by the model call are not caught here.
        """
        if not self.api_configured:
            return f"[MOCK] Step {step}: Executing action '{action}' for problem type {problem.problem_type}."

        feedback_prompt = ""
        if feedback:
            feedback_prompt = f"""
            Your previous attempt at this step was incorrect. Here is the feedback from the expert judge:
            ---
            {feedback}
            ---
            Please correct your mistake and provide a new, accurate response for this step.
            """

        # One "Step k: ..." line per accepted step, or "None" for the first step.
        if previous_steps:
            previous_steps_str = "".join(f"Step {i+1}: {s}\n" for i, s in enumerate(previous_steps))
        else:
            previous_steps_str = "None"

        step_instructions = {
            1: "Start by identifying the function/problem type and outlining the initial setup or first principle to apply.",
            2: "Apply the main mathematical rule or technique (e.g., chain rule, integration by parts, matrix inversion).",
            3: "Perform the necessary calculations and algebraic manipulations based on the previous step.",
            4: "Simplify the resulting expression and check for any intermediate errors or edge cases.",
            5: "State the complete, final answer clearly. This is your last step."
        }

        # Take the step count from the solver so the prompt cannot disagree
        # with the loop in solve_problem; with max_steps == 5 the prompt text
        # is the same as with the former literal 5.
        total_steps = self.max_steps
        instruction = step_instructions.get(step, "Continue the solution, building on the previous steps.")
        if step == total_steps:
            closing_note = 'This is the final step, you must provide the final answer.'
        else:
            closing_note = f'You have {total_steps-step} steps remaining after this.'

        prompt = f"""
    You are an expert mathematician solving a calculus problem in a structured, {total_steps}-step process.
    You are on step {step} of {total_steps}.
    
    Problem: {problem.problem}
    
    Previous Steps:
    {previous_steps_str}
    
    Current Step Instructions ({step}/{total_steps}): {instruction}
    Your high-level action for this step is: '{action}'.
    
    {feedback_prompt}
    
    Provide only the mathematical work for this current step.
    {closing_note}
    """
        response = self.learner_model.generate_content(prompt)
        return response.text

    def get_judge_evaluation(self, problem: "MathProblem", step_content: str, step_number: int) -> Tuple[bool, float, str]:
        """
        Grade one step with the judge model.

        Args:
            problem: Problem being solved; ``problem`` and ``expected_answer`` are read.
            step_content: Learner output for the step.
            step_number: 1-based number of the step being graded.

        Returns:
            ``(is_correct, reward, feedback)``. Rewards: correct steps earn
            ``5 + step_number`` (15 on the final step); incorrect steps cost 10
            (20 on the final step); a failed judge call yields
            ``(False, -15.0, <error text>)``. In mock mode the verdict is
            random (about 60% correct) with rewards 10 / -5.
        """
        if not self.api_configured:
            # Mock evaluation for testing without an API key
            is_correct = random.random() > 0.4 # 60% chance of being correct
            reward = 10.0 if is_correct else -5.0
            feedback = "MOCK: This is a mock evaluation."
            return is_correct, reward, feedback

        is_final_step = (step_number == self.max_steps)

        # Example 2 uses a true statement (d/dx cos(x) = -sin(x)) so the
        # few-shot examples do not teach the judge incorrect mathematics.
        prompt = f"""
    You are an expert mathematician and judge. Your task is to evaluate one step of a solution to a math problem.
    The problem is: "{problem.problem}"
    The expected final answer is: "{problem.expected_answer}"
    
    The current step being evaluated is Step {step_number}.
    The student's submission for this step is:
    ---
    {step_content}
    ---
    
    Based on the problem, the student's submission for this step, and the expected final answer, is this step correct?
    - A step is CORRECT if it is mathematically sound and makes logical progress towards the final answer.
    - A step is INCORRECT if it contains a mathematical error, a logical flaw, or is a step that doesn't lead to the correct solution.
    
    Start your response with the word "CORRECT" or "INCORRECT".
    Then, provide a brief, one-sentence explanation for your decision.
    
    Example 1:
    CORRECT: The application of the product rule is accurate.
    
    Example 2:
    INCORRECT: The derivative of cos(x) is -sin(x), not sin(x) as written.
    
    Example 3:
    INCORRECT: The calculation is correct, but this approach of integration by parts will not lead to the final answer.
    
    Now, evaluate the student's submission. The final answer should be: {problem.expected_answer}
    """

        try:
            response = self.judge_model.generate_content(prompt)
            feedback_text = response.text.strip()

            is_correct = self._is_correct_verdict(feedback_text)

            # Later correct steps are worth more; the final step has fixed stakes.
            if is_correct:
                reward = 15.0 if is_final_step else 5.0 + step_number
            else:
                reward = -20.0 if is_final_step else -10.0

            return is_correct, reward, feedback_text

        except Exception as e:
            # Best effort: a failed judge call is scored as an incorrect step.
            print(f"❌ Error during judge evaluation: {str(e)}")
            return False, -15.0, f"Evaluation failed due to an API error: {e}"

    def solve_problem(self, problem: "MathProblem") -> "List[StepResult]":
        """
        Solve ``problem`` step by step, retrying incorrect steps, and return one
        StepResult per step.

        Every attempt triggers a Q-update. A correct attempt bootstraps from
        the next step's state (or "terminal" after the last step). An
        incorrect attempt is updated as a transition back to the same state.
        When the attempt budget is exhausted the last incorrect attempt is
        recorded and the solver moves on to the next step.
        """
        print("\n" + "="*70)
        print(f"🧮 Solving Problem: {problem.problem}")
        print(f"🎯 Expected Answer: {problem.expected_answer}")
        print("-" * 70)

        results = []
        previous_steps_content = []

        for step in range(1, self.max_steps + 1):
            state = self.agent.get_state(problem, step)
            feedback_for_retry = None

            for attempt in range(1, self.max_retries_per_step + 1):
                # 1. Choose Action A from State S
                action = self.agent.choose_action(state, step)

                # 2. Take Action A, get Step Content
                step_content = self.get_learner_response(problem, step, action, previous_steps_content, feedback_for_retry)

                # 3. Observe Reward R and Next State S'
                is_correct, reward, judge_feedback = self.get_judge_evaluation(problem, step_content, step)

                status_icon = "✅" if is_correct else "❌"
                print(f"Step {step}/{self.max_steps} (Attempt {attempt}) | Action: {action} | Result: {status_icon}")
                print(f"  💬 Learner: {step_content.strip()}")
                print(f"  👨‍⚖️ Judge: {judge_feedback.strip()}")

                if is_correct:
                    # The step was correct, finalize and move to the next step
                    result = StepResult(step, action, step_content, is_correct, reward, judge_feedback, attempt)
                    results.append(result)
                    previous_steps_content.append(step_content)

                    # 4. Update Q-Table
                    next_step = step + 1
                    next_state = self.agent.get_state(problem, next_step) if next_step <= self.max_steps else "terminal"
                    self.agent.update_q_value(state, action, reward, next_state, next_step)

                    time.sleep(1) # API rate limiting
                    break # Exit the retry loop
                else:
                    # The step was incorrect, prepare for another attempt
                    feedback_for_retry = judge_feedback
                    # Penalize the action; the agent stays in the same state for the retry
                    self.agent.update_q_value(state, action, reward, state, step)

                    if attempt == self.max_retries_per_step:
                        # Attempt budget used up: record the wrong step and move on
                        print(f"  ⚠️ Max retries reached for step {step}. Accepting incorrect step.")
                        result = StepResult(step, action, step_content, is_correct, reward, judge_feedback, attempt)
                        results.append(result)
                        previous_steps_content.append(step_content)
                        break # Exit the retry loop

                    time.sleep(1) # API rate limiting

        # Final Summary
        total_reward = sum(r.reward for r in results)
        correct_steps = sum(1 for r in results if r.is_correct)
        print("\n" + "-"*30 + " SUMMARY " + "-"*30)
        print(f"📊 Final Result: {correct_steps}/{self.max_steps} steps correct.")
        print(f"🏆 Total Reward: {total_reward:.1f}")
        print("="*70)
        return results

# --- Main Execution ---
def create_hard_problems() -> List[MathProblem]:
    """Build the fixed list of two very hard sample problems (in a stable order)."""
    # Nowhere-differentiable function: the "derivative" asked for does not exist.
    weierstrass = MathProblem(
        problem="Find the derivative of the Weierstrass function, defined as f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx), for 0 < a < 1 and ab > 1 + (3/2)π.",
        problem_type="derivative",
        difficulty="very_hard",
        expected_answer="The function is continuous everywhere but differentiable nowhere. Therefore, the derivative does not exist for any value of x.",
    )
    # Limit and integral that cannot be interchanged.
    limit_versus_integral = MathProblem(
        problem="Consider f_n(x) = 2nx*e^(-nx²) on [0, 1]. Evaluate lim[n→∞] ∫[0 to 1] f_n(x) dx and ∫[0 to 1] lim[n→∞] f_n(x) dx. Are they equal?",
        problem_type="real_analysis",
        difficulty="very_hard",
        expected_answer="They are not equal. The integral of the limit is ∫0 dx = 0. The limit of the integral is lim[n→∞] (1 - e⁻ⁿ) = 1. They differ because convergence is not uniform, so the limit and integral cannot be interchanged.",
    )
    return [weierstrass, limit_versus_integral]


def main():
    """Run the solver over the sample problems, then print up to five Q-table states."""
    solver = GeminiMathSolver()
    if not solver.api_configured:
        print("\n--- RUNNING IN MOCK MODE. NO REAL LEARNING WILL OCCUR. ---")
        print("--- Please configure your Gemini API key to run properly. ---")

    for prob in create_hard_problems():
        solver.solve_problem(prob)

    print("\n\n" + "="*30 + " FINAL Q-TABLE STATE " + "="*30)
    learned_table = solver.agent.q_table
    if not learned_table:
        print("Q-Table is empty (likely ran in mock mode).")
        return

    # Show only the first five states to keep the output short.
    for state, actions in list(learned_table.items())[:5]:
        print(f"State: {state}")
        for action, value in actions.items():
            print(f"  - Action: {action}, Q-Value: {value:.3f}")


# Run the demo only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()

✅ Gemini API configured successfully.

🧮 Solving Problem: Find the derivative of the Weierstrass function, defined as f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx), for 0 < a < 1 and ab > 1 + (3/2)π.
🎯 Expected Answer: The function is continuous everywhere but differentiable nowhere. Therefore, the derivative does not exist for any value of x.
----------------------------------------------------------------------
Step 1/5 (Attempt 1) | Action: identify_and_setup | Result: ❌
  💬 Learner: Identify the function as an infinite series of trigonometric functions.  The problem requires finding the derivative of a function defined by an infinite sum.  The key is to apply the term-by-term differentiation rule to the series, assuming the resulting series converges uniformly in a suitable interval.
  👨‍⚖️ Judge: INCORRECT: This approach is flawed because the series of derivatives for the Weierstrass function does not converge, meaning the assumption for term-by-term differentiation is not met and this 

In [12]:
import numpy as np
import random
import json
import time
import os
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from collections import defaultdict
import google.generativeai as genai

# Read the Gemini key from Kaggle Secrets and register it with the SDK.
# Any failure (missing kaggle_secrets module, missing secret, SDK error) is
# reported once and the notebook keeps running.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("GEMINI_API_KEY")
    genai.configure(api_key=api_key)
except Exception as config_error:
    print(
        "⚠️ Could not configure Gemini API. "
        "Please ensure you have added your key to Kaggle Secrets with the label 'GEMINI_API_KEY'. "
        f"Error: {config_error}"
    )

@dataclass
class MathProblem:
    """One math task together with the reference answer it is graded against."""
    # Full problem statement; interpolated into the learner and judge prompts.
    problem: str
    # Category label (e.g. "derivative"); part of the Q-learning state key.
    problem_type: str
    # Difficulty label (e.g. "very_hard"); part of the Q-learning state key.
    difficulty: str
    # Reference final answer; shown to the judge model only.
    expected_answer: str

@dataclass
class StepResult:
    """Outcome recorded for one solution step once it is accepted (right or wrong)."""
    # 1-based index of the step within the solution.
    step_number: int
    # High-level action the agent chose for the recorded attempt.
    action: str
    # Text produced by the learner model for the recorded attempt.
    step_content: str
    # Judge verdict for the recorded attempt.
    is_correct: bool
    # Reward returned by the judge evaluation for the recorded attempt.
    reward: float
    # Raw judge response (verdict plus explanation).
    judge_feedback: str = ""
    # Attempt number on which this result was recorded (1 = first try).
    attempts: int = 1

class QLearningAgent:
    """
    Tabular Q-learning agent with JSON persistence.

    Every update applies the one-step Q-learning rule:
    Q(S, A) <- Q(S, A) + α * [R + γ * max_a Q(S', a) - Q(S, A)]

    The Q-table is a nested defaultdict (state -> action -> value), so indexing it
    with ``q_table[state][action]`` inserts a zero entry as a side effect. All
    read paths below therefore go through ``dict.get``; only ``update_q_value``
    and ``load_model`` write, so the table (and the saved file) contains only
    pairs that were actually learned.
    """
    def __init__(self, learning_rate=0.1, discount_factor=0.9, epsilon=0.2, model_path="qlearning_model.json"):
        """
        Args:
            learning_rate: Step size α of the Q-update.
            discount_factor: Discount γ applied to the best next-state value.
            epsilon: Probability of picking a random action (exploration).
            model_path: JSON file used by ``save_model`` / ``load_model``.

        If ``model_path`` exists it is loaded immediately, and hyperparameters
        stored in the file replace the values passed here.
        """
        self.learning_rate: float = learning_rate
        self.discount_factor: float = discount_factor
        self.epsilon: float = epsilon
        self.model_path: str = model_path
        self.q_table: Dict[str, Dict[str, float]] = defaultdict(lambda: defaultdict(float))
        # Two candidate actions for each of the 5 solution steps.
        self.action_space: Dict[int, List[str]] = {
            1: ["identify_and_setup", "initial_decomposition"],
            2: ["apply_core_rule", "select_technique"],
            3: ["execute_calculation", "algebraic_manipulation"],
            4: ["simplify_result", "check_for_edge_cases"],
            5: ["final_answer_and_conclusion", "verify_solution"]
        }

        # Try to load existing model
        self.load_model()

    def get_state(self, problem: "MathProblem", step: int) -> str:
        """Return the state key ``<problem_type>_<difficulty>_step<step>``."""
        # The annotation is quoted so this class does not require MathProblem
        # to be defined before it; only the two attributes below are read.
        return f"{problem.problem_type}_{problem.difficulty}_step{step}"

    def choose_action(self, state: str, step: int) -> str:
        """
        Pick an action for ``step`` with an epsilon-greedy policy.

        With probability ``epsilon`` a random action is returned. Otherwise the
        action with the highest Q-value is returned; if no available action has
        a non-zero value yet, the choice is random. The Q-table is not modified.

        Raises:
            ValueError: If ``step`` has no entry in ``action_space``.
        """
        available_actions = self.action_space.get(step)
        if not available_actions:
            raise ValueError(f"No actions are defined for step {step!r}.")

        # Exploration: choose a random action
        if random.random() < self.epsilon:
            return random.choice(available_actions)

        # Exploitation: read learned values without inserting missing keys
        learned = self.q_table.get(state, {})
        available_q_values = {action: learned.get(action, 0.0) for action in available_actions}

        if all(value == 0 for value in available_q_values.values()):
            # Nothing learned yet for this state, so there is no best action
            return random.choice(available_actions)

        return max(available_q_values, key=available_q_values.get)

    def update_q_value(self, state: str, action: str, reward: float, next_state: str, next_step: int):
        """
        Apply one Q-learning update to ``Q(state, action)``.

        Args:
            state: State key in which ``action`` was taken.
            action: Action that was taken.
            reward: Reward observed for the action.
            next_state: State key reached afterwards; an unknown key such as a
                terminal marker contributes a next-state value of 0.
            next_step: Step number used to look up the actions available in
                ``next_state``; a step without actions also contributes 0.
        """
        # 1. Old estimate Q(S, A); a pair never updated before counts as 0
        current_q = self.q_table.get(state, {}).get(action, 0.0)

        # 2. max_a Q(S', a) over the actions available at the next step.
        #    Read-only lookups keep next_state out of the table.
        next_q_values = self.q_table.get(next_state, {})
        available_next_actions = self.action_space.get(next_step, [])

        max_next_q = 0.0
        if next_q_values and available_next_actions:
            max_next_q = max(next_q_values.get(act, 0.0) for act in available_next_actions)

        # 3. TD target: R + γ * max_a Q(S', a)
        td_target = reward + self.discount_factor * max_next_q

        # 4. TD error: td_target - Q(S, A)
        td_error = td_target - current_q

        # 5. New estimate: Q(S, A) + α * TD_error
        self.q_table[state][action] = current_q + self.learning_rate * td_error

    def save_model(self):
        """
        Write the Q-table, hyperparameters and action space to ``model_path``.

        Failures are reported on stdout and not raised. The JSON text is built
        in memory before the file is opened, so a serialization error cannot
        leave a truncated file behind.
        """
        # Convert defaultdict to regular dict for JSON serialization
        q_table_dict = {state: dict(actions) for state, actions in self.q_table.items()}
        model_data = {
            "q_table": q_table_dict,
            "learning_rate": self.learning_rate,
            "discount_factor": self.discount_factor,
            "epsilon": self.epsilon,
            # Integer step keys become strings in JSON; load_model does not
            # read this entry back.
            "action_space": self.action_space
        }

        try:
            payload = json.dumps(model_data, indent=2)
            with open(self.model_path, 'w', encoding="utf-8") as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            print(f"❌ Error saving model: {str(e)}")
            return

        print(f"💾 Model saved successfully to {self.model_path}")
        print(f"   - Q-table entries: {len(q_table_dict)}")
        print(f"   - Total state-action pairs: {sum(len(actions) for actions in q_table_dict.values())}")

    def load_model(self):
        """
        Load the Q-table and hyperparameters from ``model_path`` if it exists.

        The file is parsed completely before the agent is touched: on any
        read or format error the current Q-table and hyperparameters are left
        unchanged (an empty table when called from ``__init__``) and the
        problem is reported on stdout instead of raised.
        """
        if not os.path.exists(self.model_path):
            print(f"📁 No existing model found at {self.model_path}. Starting with empty Q-table.")
            return

        try:
            with open(self.model_path, 'r', encoding="utf-8") as f:
                model_data = json.load(f)

            # Build into a scratch table so a malformed entry half-way
            # through cannot leave a partially loaded table behind.
            loaded_table = defaultdict(lambda: defaultdict(float))
            for state, actions in model_data["q_table"].items():
                for action, value in actions.items():
                    loaded_table[state][action] = float(value)
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            print(f"❌ Error loading model: {str(e)}")
            print("   Starting with empty Q-table.")
            return

        self.q_table = loaded_table

        # Restore hyperparameters (optional, in case they were saved)
        if "learning_rate" in model_data:
            self.learning_rate = model_data["learning_rate"]
        if "discount_factor" in model_data:
            self.discount_factor = model_data["discount_factor"]
        if "epsilon" in model_data:
            self.epsilon = model_data["epsilon"]

        print(f"📂 Model loaded successfully from {self.model_path}")
        print(f"   - Q-table entries: {len(model_data['q_table'])}")
        print(f"   - Total state-action pairs: {sum(len(actions) for actions in model_data['q_table'].values())}")

    def get_q_table_stats(self) -> Dict[str, object]:
        """
        Summarize the Q-table.

        Returns:
            A dict with ``total_states``, ``total_actions`` and ``step_stats``.
            ``step_stats`` maps the step number (as a string) to ``count``
            (state-action pairs at that step) and ``avg_q`` (their mean
            Q-value). States whose key has no ``_step`` suffix are counted in
            the totals only.
        """
        total_states = len(self.q_table)
        total_actions = sum(len(actions) for actions in self.q_table.values())

        step_stats = {}
        for state, actions in self.q_table.items():
            # Split on the LAST "_step" so a problem type that itself contains
            # "_step" (e.g. "multi_step_...") still yields the step number.
            _, marker, step_num = state.rpartition("_step")
            if not marker:
                continue
            entry = step_stats.setdefault(step_num, {"count": 0, "avg_q": 0})
            entry["count"] += len(actions)
            entry["avg_q"] += sum(actions.values())  # running sum, averaged below

        for entry in step_stats.values():
            if entry["count"] > 0:
                entry["avg_q"] /= entry["count"]

        return {
            "total_states": total_states,
            "total_actions": total_actions,
            "step_stats": step_stats
        }

class GeminiMathSolver:
    def __init__(self, learner_model_name="gemini-1.5-flash-8b", judge_model_name="gemini-2.5-pro", model_path="qlearning_model.json"):
        """
        Create the Q-learning agent (backed by ``model_path``) and the two Gemini models.

        ``api_configured`` stays False, which selects mock mode, only when
        constructing one of the model objects raises.
        """
        self.agent = QLearningAgent(model_path=model_path)
        self.model_path = model_path
        self.api_configured = False
        self.max_steps = 5
        # Total attempts allowed per step (the first try counts as attempt 1).
        self.max_retries_per_step = 3

        try:
            self.learner_model = genai.GenerativeModel(learner_model_name)
            self.judge_model = genai.GenerativeModel(judge_model_name)
            self.api_configured = True
            print("✅ Gemini API configured successfully.")
        except Exception as setup_error:
            print(f"⚠️ Gemini API not configured. Running in MOCK mode. Error: {setup_error!s}")
    
    def get_learner_response(self, problem: MathProblem, step: int, action: str, previous_steps: List[str], feedback: Optional[str] = None) -> str:
        """
        Ask the learner model for the content of one solution step.

        Args:
            problem: Problem being solved; ``problem`` and ``problem_type`` are read.
            step: 1-based number of the step to produce.
            action: High-level action chosen by the agent for this step.
            previous_steps: Accepted content of the earlier steps, in order.
            feedback: Judge feedback on the previous failed attempt, if any.

        Returns:
            The learner's text, or a canned string in mock mode. Errors raised
            by the model call are not caught here.
        """
        if not self.api_configured:
            return f"[MOCK] Step {step}: Executing action '{action}' for problem type {problem.problem_type}."

        feedback_prompt = ""
        if feedback:
            feedback_prompt = f"""
            Your previous attempt at this step was incorrect. Here is the feedback from the expert judge:
            ---
            {feedback}
            ---
            Please correct your mistake and provide a new, accurate response for this step.
            """

        # One "Step k: ..." line per accepted step, or "None" for the first step.
        if previous_steps:
            previous_steps_str = "".join(f"Step {i+1}: {s}\n" for i, s in enumerate(previous_steps))
        else:
            previous_steps_str = "None"

        step_instructions = {
            1: "Start by identifying the function/problem type and outlining the initial setup or first principle to apply.",
            2: "Apply the main mathematical rule or technique (e.g., chain rule, integration by parts, matrix inversion).",
            3: "Perform the necessary calculations and algebraic manipulations based on the previous step.",
            4: "Simplify the resulting expression and check for any intermediate errors or edge cases.",
            5: "State the complete, final answer clearly. This is your last step."
        }

        # Take the step count from the solver so the prompt cannot disagree
        # with the solving loop; with max_steps == 5 the prompt text is the
        # same as with the former literal 5.
        total_steps = self.max_steps
        instruction = step_instructions.get(step, "Continue the solution, building on the previous steps.")
        if step == total_steps:
            closing_note = 'This is the final step, you must provide the final answer.'
        else:
            closing_note = f'You have {total_steps-step} steps remaining after this.'

        prompt = f"""
    You are an expert mathematician solving a calculus problem in a structured, {total_steps}-step process.
    You are on step {step} of {total_steps}.
    
    Problem: {problem.problem}
    
    Previous Steps:
    {previous_steps_str}
    
    Current Step Instructions ({step}/{total_steps}): {instruction}
    Your high-level action for this step is: '{action}'.
    
    {feedback_prompt}
    
    Provide only the mathematical work for this current step.
    {closing_note}
    """
        response = self.learner_model.generate_content(prompt)
        return response.text

    def get_judge_evaluation(self, problem: MathProblem, step_content: str, step_number: int) -> Tuple[bool, float, str]:
        """
        Grade one step with the judge model.

        Args:
            problem: Problem being solved; ``problem`` and ``expected_answer`` are read.
            step_content: Learner output for the step.
            step_number: 1-based number of the step being graded.

        Returns:
            ``(is_correct, reward, feedback)``. Rewards: correct steps earn
            ``5 + step_number`` (15 on the final step); incorrect steps cost 10
            (20 on the final step); a failed judge call yields
            ``(False, -15.0, <error text>)``. In mock mode the verdict is
            random (about 60% correct) with rewards 10 / -5.
        """
        if not self.api_configured:
            # Mock evaluation for testing without an API key
            is_correct = random.random() > 0.4 # 60% chance of being correct
            reward = 10.0 if is_correct else -5.0
            feedback = "MOCK: This is a mock evaluation."
            return is_correct, reward, feedback

        is_final_step = (step_number == self.max_steps)

        # Example 2 uses a true statement (d/dx cos(x) = -sin(x)) so the
        # few-shot examples do not teach the judge incorrect mathematics.
        prompt = f"""
    You are an expert mathematician and judge. Your task is to evaluate one step of a solution to a math problem.
    The problem is: "{problem.problem}"
    The expected final answer is: "{problem.expected_answer}"
    
    The current step being evaluated is Step {step_number}.
    The student's submission for this step is:
    ---
    {step_content}
    ---
    
    Based on the problem, the student's submission for this step, and the expected final answer, is this step correct?
    - A step is CORRECT if it is mathematically sound and makes logical progress towards the final answer.
    - A step is INCORRECT if it contains a mathematical error, a logical flaw, or is a step that doesn't lead to the correct solution.
    
    Start your response with the word "CORRECT" or "INCORRECT".
    Then, provide a brief, one-sentence explanation for your decision.
    
    Example 1:
    CORRECT: The application of the product rule is accurate.
    
    Example 2:
    INCORRECT: The derivative of cos(x) is -sin(x), not sin(x) as written.
    
    Example 3:
    INCORRECT: The calculation is correct, but this approach of integration by parts will not lead to the final answer.
    
    Now, evaluate the student's submission. The final answer should be: {problem.expected_answer}
    """

        try:
            response = self.judge_model.generate_content(prompt)
            feedback_text = response.text.strip()

            # Ignore leading decoration (markdown emphasis, quotes, list
            # markers) so a reply such as "**CORRECT**: ..." is not misread
            # as a failure. "INCORRECT" never matches because the test is
            # anchored at the first word.
            verdict = feedback_text.lstrip(" \t\r\n*_#`\"'>-").upper()
            is_correct = verdict.startswith('CORRECT')

            # Later correct steps are worth more; the final step has fixed stakes.
            if is_correct:
                reward = 15.0 if is_final_step else 5.0 + step_number
            else:
                reward = -20.0 if is_final_step else -10.0

            return is_correct, reward, feedback_text

        except Exception as e:
            # Best effort: a failed judge call is scored as an incorrect step.
            print(f"❌ Error during judge evaluation: {str(e)}")
            return False, -15.0, f"Evaluation failed due to an API error: {e}"

    def solve_problem(self, problem: MathProblem) -> List[StepResult]:
        """Solve one problem step by step, with the Q-learning agent choosing each action.

        For every step from 1 to ``self.max_steps`` the agent picks an action,
        the learner produces the step content and the judge scores it. An
        incorrect step is retried, with the judge's feedback handed back to the
        learner, up to ``self.max_retries_per_step`` attempts; when every
        attempt fails, the last incorrect attempt is accepted so the run can
        continue. The Q-table is updated after every attempt, and the model is
        saved once the problem is finished.

        Args:
            problem: The problem to solve.

        Returns:
            One ``StepResult`` per step, in step order, describing the attempt
            that was accepted for that step.
        """
        print("\n" + "=" * 70)
        print(f"🧮 Solving Problem: {problem.problem}")
        print(f"🎯 Expected Answer: {problem.expected_answer}")
        print("-" * 70)

        results: List[StepResult] = []
        previous_steps_content: List[str] = []

        for step in range(1, self.max_steps + 1):
            state = self.agent.get_state(problem, step)
            feedback_for_retry = None

            for attempt in range(1, self.max_retries_per_step + 1):
                # 1. Choose Action A from State S
                action = self.agent.choose_action(state, step)

                # 2. Take Action A, get Step Content
                step_content = self.get_learner_response(
                    problem, step, action, previous_steps_content, feedback_for_retry
                )

                # 3. Observe Reward R and Next State S'
                is_correct, reward, judge_feedback = self.get_judge_evaluation(problem, step_content, step)

                status_icon = "✅" if is_correct else "❌"
                print(f"Step {step}/{self.max_steps} (Attempt {attempt}) | Action: {action} | Result: {status_icon}")
                print(f"  💬 Learner: {step_content.strip()}")
                print(f"  👨‍⚖️ Judge: {judge_feedback.strip()}")

                retries_exhausted = attempt == self.max_retries_per_step

                # 4. Update Q-Table
                if is_correct:
                    # Success moves the episode on to the next step (or ends it).
                    next_step = step + 1
                    if next_step <= self.max_steps:
                        next_state = self.agent.get_state(problem, next_step)
                    else:
                        next_state = "terminal"
                    self.agent.update_q_value(state, action, reward, next_state, next_step)
                else:
                    # Failure: keep the judge's feedback for the retry and apply the
                    # penalty with the current state as the successor state.
                    feedback_for_retry = judge_feedback
                    self.agent.update_q_value(state, action, reward, state, step)
                    if retries_exhausted:
                        print(f"  ⚠️ Max retries reached for step {step}. Accepting incorrect step.")

                # API rate limiting: pause after every judged attempt, including the
                # last failed one, so the next step's requests never follow immediately.
                time.sleep(1)

                if is_correct or retries_exhausted:
                    # NOTE(review): `attempt` is passed as a seventh positional field;
                    # confirm the StepResult in scope declares `attempts`.
                    results.append(
                        StepResult(step, action, step_content, is_correct, reward, judge_feedback, attempt)
                    )
                    previous_steps_content.append(step_content)
                    break  # Step settled; move on to the next one

        # Save the model after each problem
        self.agent.save_model()

        # Final Summary
        total_reward = sum(r.reward for r in results)
        correct_steps = sum(1 for r in results if r.is_correct)
        print("\n" + "-" * 30 + " SUMMARY " + "-" * 30)
        print(f"📊 Final Result: {correct_steps}/{self.max_steps} steps correct.")
        print(f"🏆 Total Reward: {total_reward:.1f}")
        print("=" * 70)
        return results

    def print_model_stats(self):
        """Write a readable summary of the agent's Q-table statistics to stdout."""
        stats = self.agent.get_q_table_stats()

        def report_lines():
            # Lines are produced lazily, so each one is printed before the next is formatted.
            yield "\n" + "="*30 + " MODEL STATISTICS " + "="*30
            yield f"📊 Total States: {stats['total_states']}"
            yield f"📊 Total State-Action Pairs: {stats['total_actions']}"

            per_step = stats['step_stats']
            if per_step:
                yield "\n📈 Step-wise Statistics:"
                for step, data in per_step.items():
                    yield f"  Step {step}: {data['count']} actions, avg Q-value: {data['avg_q']:.3f}"

            yield "="*70

        for line in report_lines():
            print(line)

# --- Main Execution ---
def create_hard_problems() -> List[MathProblem]:
    """Return the fixed sample problem set used to exercise the solver.

    The set holds two very hard analysis problems followed by one easy
    derivative, in that order.
    """
    # Each entry: (problem text, problem_type, difficulty, expected answer)
    specs = (
        (
            "Find the derivative of the Weierstrass function, defined as f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx), for 0 < a < 1 and ab > 1 + (3/2)π.",
            "derivative",
            "very_hard",
            "The function is continuous everywhere but differentiable nowhere. Therefore, the derivative does not exist for any value of x.",
        ),
        (
            "Consider f_n(x) = 2nx*e^(-nx²) on [0, 1]. Evaluate lim[n→∞] ∫[0 to 1] f_n(x) dx and ∫[0 to 1] lim[n→∞] f_n(x) dx. Are they equal?",
            "real_analysis",
            "very_hard",
            "They are not equal. The integral of the limit is ∫0 dx = 0. The limit of the integral is lim[n→∞] (1 - e⁻ⁿ) = 1. They differ because convergence is not uniform, so the limit and integral cannot be interchanged.",
        ),
        (
            "Find the derivative of f(x) = x^2 + 3x + 2",
            "derivative",
            "easy",
            "f'(x) = 2x + 3",
        ),
    )
    return [
        MathProblem(
            problem=text,
            problem_type=kind,
            difficulty=level,
            expected_answer=answer,
        )
        for text, kind, level, answer in specs
    ]


def main():
    """Run the solver over the sample problems and report what the agent learned."""
    # Pass a different model_path here to load/save the Q-table elsewhere.
    solver = GeminiMathSolver(model_path="math_qlearning_model.json")

    if not solver.api_configured:
        print("\n--- RUNNING IN MOCK MODE. NO REAL LEARNING WILL OCCUR. ---")
        print("--- Please configure your Gemini API key to run properly. ---")

    # Statistics before and after the run show how the Q-table changed.
    solver.print_model_stats()
    for prob in create_hard_problems():
        solver.solve_problem(prob)
    solver.print_model_stats()

    print("\n\n" + "="*30 + " FINAL Q-TABLE STATE " + "="*30)

    q_table = solver.agent.q_table
    if not q_table:
        print("Q-Table is empty (likely ran in mock mode).")
        return

    # Show only the first few states as a sample of the learned values.
    remaining = 5
    for state, actions in q_table.items():
        if remaining == 0:
            break
        remaining -= 1
        print(f"State: {state}")
        for action, value in actions.items():
            print(f"  - Action: {action}, Q-Value: {value:.3f}")

# Entry point: run the demo only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

📁 No existing model found at math_qlearning_model.json. Starting with empty Q-table.
✅ Gemini API configured successfully.

📊 Total States: 0
📊 Total State-Action Pairs: 0

🧮 Solving Problem: Find the derivative of the Weierstrass function, defined as f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx), for 0 < a < 1 and ab > 1 + (3/2)π.
🎯 Expected Answer: The function is continuous everywhere but differentiable nowhere. Therefore, the derivative does not exist for any value of x.
----------------------------------------------------------------------
Step 1/5 (Attempt 1) | Action: identify_and_setup | Result: ❌
  💬 Learner: **Step 1: Identify and Setup**

The problem is to find the derivative of a function defined by an infinite series.  The function f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx)  is a trigonometric series.  To find the derivative, we will employ the term-by-term differentiation rule, which states that if a function is defined by a convergent power series, the derivative of the function 

KeyboardInterrupt: 

In [1]:
# import numpy as np
# import random
# import json
# import time
# from typing import Dict, List, Tuple, Optional
# from dataclasses import dataclass
# from collections import defaultdict, deque

# # --- New Imports for DQN ---
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from sentence_transformers import SentenceTransformer

# import google.generativeai as genai

# # --- Gemini API Configuration (No Changes) ---
# try:
#     from kaggle_secrets import UserSecretsClient
#     user_secrets = UserSecretsClient()
#     api_key = user_secrets.get_secret("GEMINI_API_KEY")
#     genai.configure(api_key=api_key)
# except Exception as e:
#     print(f"⚠️ Could not configure Gemini API. Please ensure you have added your key to Kaggle Secrets with the label 'GEMINI_API_KEY'. Error: {e}")

# # --- Data Classes (No Changes) ---
# @dataclass
# class MathProblem:
#     problem: str
#     problem_type: str
#     difficulty: str
#     expected_answer: str

# @dataclass
# class StepResult:
#     step_number: int
#     action: str
#     step_content: str
#     is_correct: bool
#     reward: float
#     judge_feedback: str = ""
#     attempts: int = 1

# # --- NEW: The DQN Model (The "Evaluator") ---
# class Q_Network(nn.Module):
#     """
#     A Neural Network to predict the Q-value for a (state, action) pair.
#     Input will be the concatenated embeddings of the state and a proposed action.
#     Output is a single value representing the predicted quality of that action.
#     """
#     def __init__(self, embedding_dim: int):
#         super(Q_Network, self).__init__()
#         # Input size is state_embedding + action_embedding
#         input_size = embedding_dim * 2
#         self.fc1 = nn.Linear(input_size, 256)
#         self.fc2 = nn.Linear(256, 128)
#         self.fc3 = nn.Linear(128, 1) # Output a single Q-value

#     def forward(self, state_embedding: torch.Tensor, action_embedding: torch.Tensor):
#         # Concatenate the state and action embeddings along the feature dimension
#         x = torch.cat([state_embedding, action_embedding], dim=1)
#         x = torch.relu(self.fc1(x))
#         x = torch.relu(self.fc2(x))
#         q_value = self.fc3(x)
#         return q_value

# # --- NEW: The DQNAgent ---
# class DQNAgent:
#     """
#     Implements a Deep Q-Network agent that learns to evaluate generated text-based actions.
#     """
#     def __init__(self, learning_rate=0.001, discount_factor=0.9, epsilon=1.0, epsilon_decay=0.995, min_epsilon=0.1, batch_size=32):
#         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#         print(f"🧠 DQN Agent using device: {self.device}")

#         # Sentence Transformer for creating embeddings from text
#         self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=self.device)
#         self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension()

#         # DQN Model, Optimizer, and Loss
#         self.q_network = Q_Network(self.embedding_dim).to(self.device)
#         self.target_network = Q_Network(self.embedding_dim).to(self.device)
#         self.target_network.load_state_dict(self.q_network.state_dict()) # Initialize target network
#         self.target_network.eval()
#         self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
#         self.loss_fn = nn.MSELoss()

#         # Hyperparameters
#         self.discount_factor = discount_factor
#         self.epsilon = epsilon
#         self.epsilon_decay = epsilon_decay
#         self.min_epsilon = min_epsilon
#         self.batch_size = batch_size

#         # Replay Buffer to store experiences
#         self.replay_buffer = deque(maxlen=10000)

#     def get_text_embedding(self, text: str) -> torch.Tensor:
#         """Converts a text string into a numerical tensor embedding."""
#         embedding = self.embedding_model.encode(text, convert_to_tensor=True, show_progress_bar=False)
#         return embedding.to(self.device).unsqueeze(0) # Add batch dimension

#     def get_state_representation(self, problem: MathProblem, step: int, previous_steps_content: List[str]) -> str:
#         """Creates a single text string describing the current state."""
#         # More descriptive state for better embeddings
#         last_step_text = "None"
#         if previous_steps_content:
#             last_step_text = previous_steps_content[-1].strip().split('\n')[0] # First line of last step

#         return (f"Problem Type: {problem.problem_type}. Difficulty: {problem.difficulty}. "
#                 f"Currently on step {step} of 5. Last step result: {last_step_text}")

#     def choose_action(self, state_embedding: torch.Tensor, candidate_actions: List[str]) -> str:
#         """
#         Choose an action from the candidates using an epsilon-greedy policy.
#         """
#         if random.random() < self.epsilon:
#             # Exploration: choose a random action from the generated candidates
#             return random.choice(candidate_actions)
#         else:
#             # Exploitation: evaluate all candidates and choose the best one
#             with torch.no_grad():
#                 best_action = None
#                 max_q_value = -float('inf')

#                 for action_text in candidate_actions:
#                     action_embedding = self.get_text_embedding(action_text)
#                     q_value = self.q_network(state_embedding, action_embedding).item()
#                     if q_value > max_q_value:
#                         max_q_value = q_value
#                         best_action = action_text
#                 return best_action

#     def remember(self, state_embedding, action_embedding, reward, next_state_embedding, done):
#         """Store experience in replay buffer."""
#         self.replay_buffer.append((state_embedding, action_embedding, reward, next_state_embedding, done))

#     def replay(self):
#         """Train the Q-network using a batch of experiences from the replay buffer."""
#         if len(self.replay_buffer) < self.batch_size:
#             return # Not enough samples to train

#         # Sample a random batch of experiences
#         minibatch = random.sample(self.replay_buffer, self.batch_size)
        
#         # Unpack the batch and move to the correct device
#         states = torch.cat([s for s, a, r, ns, d in minibatch]).to(self.device)
#         actions = torch.cat([a for s, a, r, ns, d in minibatch]).to(self.device)
#         rewards = torch.tensor([r for s, a, r, ns, d in minibatch], dtype=torch.float32).to(self.device).unsqueeze(1)
#         next_states = torch.cat([ns for s, a, r, ns, d in minibatch]).to(self.device)
#         dones = torch.tensor([d for s, a, r, ns, d in minibatch], dtype=torch.float32).to(self.device).unsqueeze(1)
        
#         # --- DQN Bellman Equation ---
#         # 1. Get the Q-values for the current (state, action) pairs from the q_network
#         current_q_values = self.q_network(states, actions)
        
#         # 2. Get the Q-values for the next states from the target_network
#         # NOTE: In this architecture, we can't do a simple max(Q(s',a')) because actions are dynamic.
#         # For simplicity in this example, we will use the value of the next state itself,
#         # which is a common simplification in some advanced DQN contexts.
#         # A more complex implementation would re-generate actions for each next_state in the batch.
#         with torch.no_grad():
#             # We need to predict the value of the next action, but we don't have it.
#             # We'll use a placeholder for the action, like "proceed", to get a general value of the state.
#             # This is an approximation.
#             pseudo_action_embedding = self.get_text_embedding("Proceed to next step.").repeat(self.batch_size, 1)
#             next_q_values = self.target_network(next_states, pseudo_action_embedding)
        
#         # 3. Calculate the TD Target
#         # target = reward if done, else reward + gamma * max_q_for_next_state
#         td_target = rewards + (1 - dones) * self.discount_factor * next_q_values

#         # 4. Calculate Loss and perform backpropagation
#         loss = self.loss_fn(current_q_values, td_target)
#         self.optimizer.zero_grad()
#         loss.backward()
#         self.optimizer.step()
        
#         # Decay epsilon
#         if self.epsilon > self.min_epsilon:
#             self.epsilon *= self.epsilon_decay

#     def update_target_network(self):
#         """Update the target network with weights from the main q_network."""
#         self.target_network.load_state_dict(self.q_network.state_dict())


# class GeminiMathSolver:
#     def __init__(self, learner_model_name="gemini-1.5-flash", judge_model_name="gemini-1.5-pro"):
#         # --- Use the new DQNAgent ---
#         self.agent = DQNAgent()
#         self.max_steps = 5
#         self.max_retries_per_step = 2
#         self.api_configured = False
#         self.update_target_every = 5 # Update target network every 5 problems solved

#         try:
#             self.learner_model = genai.GenerativeModel(learner_model_name)
#             self.judge_model = genai.GenerativeModel(judge_model_name)
#             self.api_configured = True
#             print("✅ Gemini API configured successfully.")
#         except Exception as e:
#             print(f"⚠️ Gemini API not configured. Running in MOCK mode. Error: {str(e)}")

#     # --- NEW: Method to generate candidate actions ---
#     def get_learner_candidate_actions(self, problem: MathProblem, step: int, previous_steps: List[str], num_candidates=3) -> List[str]:
#         """Generates a list of candidate high-level actions from the learner model."""
#         if not self.api_configured:
#             return [
#                 f"[MOCK] High-level action A for step {step}",
#                 f"[MOCK] High-level action B for step {step}",
#                 f"[MOCK] High-level action C for step {step}"
#             ]
            
#         previous_steps_str = "".join(f"Step {i+1}: {s}\n" for i, s in enumerate(previous_steps)) if previous_steps else "None"

#         prompt = f"""
#         You are an expert mathematician planning to solve a problem.
#         Problem: {problem.problem}
#         Previous Steps Taken:
#         {previous_steps_str}

#         You are now on step {step} of 5.
#         Your task is to propose {num_candidates} distinct, high-level strategic actions for this step.
#         Each action should be a concise phrase describing a mathematical strategy. Do not solve the problem yet.

#         Provide the actions as a numbered list. For example:
#         1. Apply the product rule to the main function.
#         2. Decompose the fraction using partial fractions.
#         3. Identify the integrating factor for the differential equation.

#         Now, provide your {num_candidates} proposed actions for the current problem.
#         """
#         try:
#             response = self.learner_model.generate_content(prompt)
#             # Parse the numbered list into a Python list of strings
#             actions = [line.split('. ', 1)[1].strip() for line in response.text.strip().split('\n') if '. ' in line]
#             return actions if actions else ["Perform the next calculation."] # Fallback
#         except Exception as e:
#             print(f"Error generating candidate actions: {e}")
#             return [f"Error-generated action {i}" for i in range(num_candidates)]


#     def get_learner_step_execution(self, problem: MathProblem, step: int, chosen_action: str, previous_steps: List[str], feedback: Optional[str] = None) -> str:
#         """Generates the detailed mathematical work for a chosen action."""
#         if not self.api_configured:
#             return f"[MOCK] Step {step}: Executing action '{chosen_action}' for problem type {problem.problem_type}."

#         feedback_prompt = f"Your previous attempt was wrong. Feedback: {feedback}. Please correct your work." if feedback else ""
#         previous_steps_str = "".join(f"Step {i+1}: {s}\n" for i, s in enumerate(previous_steps)) if previous_steps else "None"
        
#         prompt = f"""
#         You are an expert mathematician solving a calculus problem in a structured, 5-step process.
#         You are on step {step} of 5.

#         Problem: {problem.problem}

#         Previous Steps:
#         {previous_steps_str}

#         Your high-level instruction for this step is: '{chosen_action}'.
#         {feedback_prompt}

#         Now, perform the mathematical work for this current step based on the instruction.
#         Provide only the work for this single step.
#         """
#         response = self.learner_model.generate_content(prompt)
#         return response.text

#     # --- Judge Evaluation (No significant changes) ---
#     def get_judge_evaluation(self, problem: MathProblem, step_content: str, step_number: int) -> Tuple[bool, float, str]:
#         if not self.api_configured:
#             is_correct = random.random() > 0.4
#             reward = (10 if is_correct else -5)
#             feedback = "MOCK: This is a mock evaluation."
#             return is_correct, reward, feedback

#         is_final_step = (step_number == self.max_steps)
#         prompt = f"""
#         You are an expert math judge. Evaluate one step of a solution.
#         Problem: "{problem.problem}"
#         Expected Final Answer: "{problem.expected_answer}"
#         The student is on Step {step_number} and submitted this:
#         ---
#         {step_content}
#         ---
#         Is this step mathematically correct AND a logical progression towards the final answer?
#         Start your response with "CORRECT" or "INCORRECT", followed by a one-sentence explanation.
#         """
#         try:
#             response = self.judge_model.generate_content(prompt)
#             feedback_text = response.text.strip()
#             is_correct = feedback_text.upper().startswith('CORRECT')
#             reward = (15.0 if is_correct and is_final_step else 5.0 + step_number) if is_correct else (-20.0 if is_final_step else -10.0)
#             return is_correct, reward, feedback_text
#         except Exception as e:
#             print(f"❌ Error during judge evaluation: {str(e)}")
#             return False, -15.0, f"Evaluation failed due to an API error: {e}"

#     def solve_problem(self, problem: MathProblem) -> List[StepResult]:
#         """Solves a problem using the DQN-guided, generative action process."""
#         print(f"\n" + "="*70)
#         print(f"🧮 Solving Problem: {problem.problem}")
#         print("-" * 70)
        
#         results = []
#         previous_steps_content = []

#         for step in range(1, self.max_steps + 1):
#             # 1. Get State Representation
#             state_text = self.agent.get_state_representation(problem, step, previous_steps_content)
#             state_embedding = self.agent.get_text_embedding(state_text)
            
#             feedback_for_retry = None
            
#             for attempt in range(1, self.max_retries_per_step + 1):
#                 # 2. Generate Candidate Actions
#                 candidate_actions = self.get_learner_candidate_actions(problem, step, previous_steps_content)
#                 print(f"Step {step}/{self.max_steps} (Attempt {attempt}) | Generated Actions: {candidate_actions}")

#                 # 3. Choose Best Action using DQN (or explore)
#                 action_text = self.agent.choose_action(state_embedding, candidate_actions)
#                 action_embedding = self.agent.get_text_embedding(action_text)

#                 # 4. Take Action, get Step Content
#                 step_content = self.get_learner_step_execution(problem, step, action_text, previous_steps_content, feedback_for_retry)
                
#                 # 5. Observe Reward and Next State
#                 is_correct, reward, judge_feedback = self.get_judge_evaluation(problem, step_content, step)
                
#                 status_icon = "✅" if is_correct else "❌"
#                 print(f"  🤖 Chosen Action: '{action_text}' | Result: {status_icon}")
#                 print(f"  💬 Learner: {step_content.strip()}")
#                 print(f"  👨‍⚖️ Judge: {judge_feedback.strip()} (Reward: {reward:.1f})")

#                 # 6. Store experience and train
#                 next_step = step + 1
#                 done = (next_step > self.max_steps)
#                 next_state_text = self.agent.get_state_representation(problem, next_step, previous_steps_content + [step_content]) if not done else "terminal"
#                 next_state_embedding = self.agent.get_text_embedding(next_state_text)
                
#                 self.agent.remember(state_embedding, action_embedding, reward, next_state_embedding, done)
#                 self.agent.replay() # Train the agent
                
#                 if is_correct:
#                     result = StepResult(step, action_text, step_content, is_correct, reward, judge_feedback, attempt)
#                     results.append(result)
#                     previous_steps_content.append(step_content)
#                     time.sleep(1) # API rate limit
#                     break # Exit retry loop
#                 else:
#                     feedback_for_retry = judge_feedback
#                     if attempt == self.max_retries_per_step:
#                         print(f"  ⚠️ Max retries reached for step {step}. Accepting incorrect step.")
#                         result = StepResult(step, action_text, step_content, is_correct, reward, judge_feedback, attempt)
#                         results.append(result)
#                         previous_steps_content.append(step_content)
#                         break
#                     time.sleep(1)

#         total_reward = sum(r.reward for r in results)
#         correct_steps = sum(1 for r in results if r.is_correct)
#         print("\n" + "-"*30 + " SUMMARY " + "-"*30)
#         print(f"📊 Final Result: {correct_steps}/{self.max_steps} steps correct.")
#         print(f"🏆 Total Reward: {total_reward:.1f}")
#         print(f"📉 Final Epsilon: {self.agent.epsilon:.3f}")
#         print("="*70)
#         return results

# # --- Main Execution ---
# def create_hard_problems():
#     """Create a list of sample math problems for testing."""
#     return [
#         MathProblem(
#             problem="Find the derivative of f(x) = sin(x²) * e^(3x)",
#             problem_type="derivative_calculus",
#             difficulty="hard",
#             expected_answer="e^(3x) * (2x*cos(x²) + 3*sin(x²))"
#         ),
#         MathProblem(
#             problem="Evaluate the integral of x * cos(x) dx",
#             problem_type="integral_calculus",
#             difficulty="medium",
#             expected_answer="x*sin(x) + cos(x) + C"
#         )
#     ]

# def main():
#     solver = GeminiMathSolver()
#     if not solver.api_configured:
#         print("\n--- RUNNING IN MOCK MODE. DQN WILL TRAIN ON MOCK DATA. ---")
    
#     problems = create_hard_problems()
#     for i, prob in enumerate(problems):
#         solver.solve_problem(prob)
#         # Periodically update the target network for stability
#         if (i + 1) % solver.update_target_every == 0:
#             print("\n" + "--- 🎯 Updating Target Network ---" + "\n")
#             solver.agent.update_target_network()

#     print("\n\nDQN training complete.")
#     # You can now save the trained model if desired
#     torch.save(solver.agent.q_network.state_dict(), "dqn_math_solver.pth")
#     print("💾 Model weights saved to dqn_math_solver.pth")


# if __name__ == "__main__":
#     main()

2025-07-23 15:38:35.196571: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753285115.367103      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753285115.415470      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🧠 DQN Agent using device: cuda


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Gemini API configured successfully.

🧮 Solving Problem: Find the derivative of f(x) = sin(x²) * e^(3x)
----------------------------------------------------------------------
Step 1/5 (Attempt 1) | Generated Actions: ['Apply the product rule, recognizing two distinct functions.', 'Utilize the chain rule within the application of the product rule.', 'Employ the derivative rules for sine and exponential functions.']
  🤖 Chosen Action: 'Apply the product rule, recognizing two distinct functions.' | Result: ✅
  💬 Learner: Let u(x) = sin(x²) and v(x) = e^(3x).  Then f(x) = u(x)v(x).

The product rule states that the derivative of a product of two functions is given by:

d/dx [u(x)v(x)] = u'(x)v(x) + u(x)v'(x)
  👨‍⚖️ Judge: CORRECT. This step correctly identifies the functions to which the product rule will be applied and states the product rule. (Reward: 6.0)
Step 2/5 (Attempt 1) | Generated Actions: ['Compute the derivative of u(x) using the chain rule.', 'Compute the derivative of v(x) u

In [3]:
import numpy as np
import random
import json
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from collections import defaultdict, deque

# --- New Imports for DQN ---
import torch
import torch.nn as nn
import torch.optim as optim
from sentence_transformers import SentenceTransformer

import google.generativeai as genai

# --- Gemini API Configuration ---
# Best-effort setup: read the API key from Kaggle Secrets and pass it to the
# Gemini client. Any failure in this block (import, secret lookup, configure)
# is reported with a warning and swallowed, so the rest of the cell still runs.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("GEMINI_API_KEY")
    genai.configure(api_key=api_key)
except Exception as e:
    print(f"⚠️ Could not configure Gemini API. Please ensure you have added your key to Kaggle Secrets with the label 'GEMINI_API_KEY'. Error: {e}")

# --- Data Classes (No Changes) ---
@dataclass
class MathProblem:
    """A single math problem together with its reference answer."""
    problem: str  # Problem statement text
    problem_type: str  # Category label (free-form string, e.g. "derivative")
    difficulty: str  # Difficulty label (free-form string, e.g. "easy", "very_hard")
    expected_answer: str  # Reference final answer for the problem

@dataclass
class StepResult:
    """Outcome recorded for one step of a solution."""
    step_number: int  # Index of the step within the solution
    action: str  # Action chosen for this step
    step_content: str  # Text produced for this step
    is_correct: bool  # Whether the step was judged correct
    reward: float  # Reward assigned to the step
    judge_feedback: str = ""  # Feedback text returned with the verdict
    attempts: int = 1  # presumably the attempt number on which the step was accepted — confirm against caller

# --- DQN Model and Agent (No Changes) ---
class Q_Network(nn.Module):
    """Feed-forward network that scores a (state, action) embedding pair.

    The two embeddings are concatenated along dimension 1 and passed through
    two ReLU hidden layers (256 and 128 units) to a single output per row.
    """

    def __init__(self, embedding_dim: int):
        super().__init__()
        # State and action embeddings are concatenated, hence twice the width.
        joint_dim = 2 * embedding_dim
        self.fc1 = nn.Linear(joint_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)

    def forward(self, state_embedding: torch.Tensor, action_embedding: torch.Tensor):
        """Return the network output for the given state and action embeddings."""
        hidden = torch.cat((state_embedding, action_embedding), dim=1)
        for layer in (self.fc1, self.fc2):
            hidden = layer(hidden).relu()
        return self.fc3(hidden)

class DQNAgent:
    def __init__(self, learning_rate=0.001, discount_factor=0.9, epsilon=1.0, epsilon_decay=0.995, min_epsilon=0.1, batch_size=32):
        """Set up the embedding model, online/target Q-networks and replay memory.

        Args:
            learning_rate: Step size for the Adam optimizer of the online network.
            discount_factor: Weight applied to the bootstrapped next-state value.
            epsilon: Initial exploration probability.
            epsilon_decay: Multiplicative decay applied to epsilon during replay.
            min_epsilon: Threshold below which epsilon is no longer decayed.
            batch_size: Number of transitions sampled per replay step.
        """
        # Plain hyper-parameter copies and the bounded replay memory.
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.replay_buffer = deque(maxlen=10000)

        cuda_ready = torch.cuda.is_available()
        self.device = torch.device("cuda" if cuda_ready else "cpu")
        print(f"🧠 DQN Agent using device: {self.device}")

        # Sentence encoder used to turn state/action text into vectors.
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=self.device)
        self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension()

        # Online network first, then the target network, which starts as an
        # exact copy and is kept in eval mode.
        self.q_network = Q_Network(self.embedding_dim).to(self.device)
        self.target_network = Q_Network(self.embedding_dim).to(self.device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()

        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.loss_fn = nn.MSELoss()

    def get_text_embedding(self, text: str) -> torch.Tensor:
        embedding = self.embedding_model.encode(text, convert_to_tensor=True, show_progress_bar=False)
        return embedding.to(self.device).unsqueeze(0)

    def get_state_representation(self, problem: MathProblem, step: int, previous_steps_content: List[str]) -> str:
        last_step_text = "None"
        if previous_steps_content:
            last_step_text = previous_steps_content[-1].strip().split('\n')[0]
        return (f"Problem Type: {problem.problem_type}. Difficulty: {problem.difficulty}. "
                f"Currently on step {step} of 5. Last step result: {last_step_text}")

    def choose_action(self, state_embedding: torch.Tensor, candidate_actions: List[str]) -> str:
        if random.random() < self.epsilon:
            return random.choice(candidate_actions)
        else:
            with torch.no_grad():
                best_action = None
                max_q_value = -float('inf')
                for action_text in candidate_actions:
                    action_embedding = self.get_text_embedding(action_text)
                    q_value = self.q_network(state_embedding, action_embedding).item()
                    if q_value > max_q_value:
                        max_q_value = q_value
                        best_action = action_text
                return best_action

    def remember(self, state_embedding, action_embedding, reward, next_state_embedding, done):
        self.replay_buffer.append((state_embedding, action_embedding, reward, next_state_embedding, done))

    def replay(self):
        if len(self.replay_buffer) < self.batch_size: return
        minibatch = random.sample(self.replay_buffer, self.batch_size)
        states = torch.cat([s for s, a, r, ns, d in minibatch]).to(self.device)
        actions = torch.cat([a for s, a, r, ns, d in minibatch]).to(self.device)
        rewards = torch.tensor([r for s, a, r, ns, d in minibatch], dtype=torch.float32).to(self.device).unsqueeze(1)
        next_states = torch.cat([ns for s, a, r, ns, d in minibatch]).to(self.device)
        dones = torch.tensor([d for s, a, r, ns, d in minibatch], dtype=torch.float32).to(self.device).unsqueeze(1)
        current_q_values = self.q_network(states, actions)
        with torch.no_grad():
            pseudo_action_embedding = self.get_text_embedding("Proceed to next step.").repeat(self.batch_size, 1)
            next_q_values = self.target_network(next_states, pseudo_action_embedding)
        td_target = rewards + (1 - dones) * self.discount_factor * next_q_values
        loss = self.loss_fn(current_q_values, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.epsilon > self.min_epsilon:
            self.epsilon *= self.epsilon_decay

    def update_target_network(self):
        self.target_network.load_state_dict(self.q_network.state_dict())

# --- MODIFIED: GeminiMathSolver Class ---
class GeminiMathSolver:
    def __init__(self, learner_model_name="gemini-1.5-flash", judge_model_name="gemini-1.5-pro"):
        """Create the DQN agent, set retry limits and build the two Gemini models.

        Args:
            learner_model_name: Model used to propose actions and write steps.
            judge_model_name: Model used to grade steps and suggest strategies.

        If constructing either model raises, a warning is printed and
        ``api_configured`` stays ``False`` so the other methods use MOCK output.
        """
        self.agent = DQNAgent()

        # Loop limits for the two retry tiers and the target-sync cadence.
        self.max_steps = 5
        self.update_target_every = 5
        self.max_tactical_retries = 2  # Retries for a single step
        self.max_strategic_retries = 2  # Retries for the whole problem

        # Flipped to True only after both model handles were created.
        self.api_configured = False
        try:
            self.learner_model = genai.GenerativeModel(learner_model_name)
            self.judge_model = genai.GenerativeModel(judge_model_name)
            self.api_configured = True
            print("✅ Gemini API configured successfully.")
        except Exception as exc:
            print(f"⚠️ Gemini API not configured. Running in MOCK mode. Error: {str(exc)}")

    # --- NEW: Method to generate a new high-level strategy ---
    def generate_new_strategy(self, problem: MathProblem, failed_solution: List[StepResult]) -> str:
        """Analyzes a failed solution and proposes a new high-level strategy."""
        print("\n" + "🤔" * 35)
        print("🤔 Analyzing failed solution to generate a new high-level strategy...")
        print("🤔" * 35)

        if not self.api_configured:
            return "[MOCK] The previous attempt failed. Try a completely different method."

        # Format the failed attempt for the prompt
        failed_attempt_str = ""
        for step_res in failed_solution:
            failed_attempt_str += (f"Step {step_res.step_number} (Action: '{step_res.action}'):\n"
                                   f"{step_res.step_content}\n"
                                   f"Judge's Feedback: {step_res.judge_feedback}\n---\n")

        prompt = f"""
        You are a master mathematician and strategist. A student attempted to solve a problem but failed, even after several retries on the final step. Their entire method may be flawed.

        Problem: "{problem.problem}"
        Expected Final Answer: "{problem.expected_answer}"

        Here is the student's full, incorrect attempt:
        ---
        {failed_attempt_str}
        ---

        Your task is to analyze their entire failed solution and provide a single, concise, high-level strategic suggestion for a COMPLETELY DIFFERENT APPROACH.
        Do NOT solve the problem yourself. Just provide a guiding principle for the next attempt.

        Example suggestions:
        - "The previous approach using integration by parts led to a recursive loop. Suggest trying a u-substitution with u=cos(x) instead."
        - "The direct differentiation was too complex. Advise the student to first simplify the expression using logarithmic properties before differentiating."
        - "The series expansion failed. Suggest analyzing the function's properties for continuity and differentiability directly, as it might be a special case like the Weierstrass function."

        Now, provide your strategic guidance for the given problem and failed attempt.
        """
        try:
            response = self.judge_model.generate_content(prompt)
            strategy = response.text.strip()
            print(f"💡 New Strategy: {strategy}")
            return strategy
        except Exception as e:
            print(f"❌ Error generating new strategy: {e}")
            return "An error occurred. Try to be more careful with basic calculations."

    # --- MODIFIED: Prompt now includes strategic guidance ---
    def get_learner_candidate_actions(self, problem: MathProblem, step: int, previous_steps: List[str], strategic_guidance: Optional[str] = None, num_candidates=3) -> List[str]:
        if not self.api_configured:
            return [f"[MOCK] Action for step {step}" for _ in range(num_candidates)]

        previous_steps_str = "".join(f"Step {i+1}: {s}\n" for i, s in enumerate(previous_steps)) if previous_steps else "None"

        # Dynamically add the guidance section to the prompt if it exists
        guidance_prompt = ""
        if strategic_guidance:
            guidance_prompt = f"""
            **IMPORTANT STRATEGIC GUIDANCE FOR THIS ENTIRE ATTEMPT:**
            ---
            {strategic_guidance}
            ---
            You MUST generate actions that follow this new high-level strategy.
            """

        prompt = f"""
        You are a mathematician planning to solve a problem.
        Problem: {problem.problem}
        Previous Steps Taken:
        {previous_steps_str}

        {guidance_prompt}

        You are now on step {step} of 5.
        Your task is to propose {num_candidates} distinct, high-level strategic actions for THIS step that are consistent with any overall guidance provided.
        Each action should be a concise phrase. Provide the actions as a numbered list.
        """
        try:
            response = self.learner_model.generate_content(prompt)
            actions = [line.split('. ', 1)[1].strip() for line in response.text.strip().split('\n') if '. ' in line]
            return actions if actions else ["Perform the next calculation."]
        except Exception as e:
            print(f"Error generating candidate actions: {e}")
            return [f"Error-gen action {i}" for i in range(num_candidates)]

    def get_learner_step_execution(self, problem: MathProblem, step: int, chosen_action: str, previous_steps: List[str], feedback: Optional[str] = None) -> str:
        # This function does not need changes
        if not self.api_configured:
            return f"[MOCK] Step {step}: Executing action '{chosen_action}'"
        feedback_prompt = f"Your previous attempt was wrong. Feedback: {feedback}. Please correct your work." if feedback else ""
        previous_steps_str = "".join(f"Step {i+1}: {s}\n" for i, s in enumerate(previous_steps)) if previous_steps else "None"
        prompt = f"""
        You are an expert mathematician solving a problem step-by-step.
        On step {step}/5. Problem: {problem.problem}
        Previous Steps: {previous_steps_str}
        Your high-level instruction for this step is: '{chosen_action}'.
        {feedback_prompt}
        Now, perform the mathematical work for THIS step only.
        """
        response = self.learner_model.generate_content(prompt)
        return response.text

    def get_judge_evaluation(self, problem: MathProblem, step_content: str, step_number: int) -> Tuple[bool, float, str]:
        # This function does not need changes
        if not self.api_configured:
            is_correct = random.random() > 0.4
            reward = (10 if is_correct else -5)
            return is_correct, reward, "MOCK: Evaluation."
        is_final_step = (step_number == self.max_steps)
        prompt = f"""
        You are an expert math judge. Evaluate one step of a solution.
        Problem: "{problem.problem}"
        Expected Final Answer: "{problem.expected_answer}"
        The student is on Step {step_number} and submitted this:
        ---
        {step_content}
        ---
        Is this step mathematically correct AND a logical progression towards the final answer?
        Start your response with "CORRECT" or "INCORRECT", followed by a one-sentence explanation.
        """
        try:
            response = self.judge_model.generate_content(prompt)
            feedback_text = response.text.strip()
            is_correct = feedback_text.upper().startswith('CORRECT')
            reward = (15.0 if is_correct and is_final_step else 5.0 + step_number) if is_correct else (-20.0 if is_final_step else -10.0)
            return is_correct, reward, feedback_text
        except Exception as e:
            print(f"❌ Error during judge evaluation: {str(e)}")
            return False, -15.0, f"Evaluation failed due to an API error: {e}"

    # --- MODIFIED: This is now the "inner loop" ---
    def _solve_single_attempt(self, problem: MathProblem, strategic_guidance: Optional[str] = None) -> Tuple[List[StepResult], bool]:
        """Run one pass over all steps of a problem, training the agent as it goes.

        For every step the learner proposes candidate actions, the agent picks
        one, the learner executes it and the judge grades it. A step is tried
        up to ``self.max_tactical_retries`` times; every try is stored in the
        agent's replay buffer and followed by a replay call.

        Args:
            problem: The problem to solve.
            strategic_guidance: Optional overall strategy forwarded to the
                candidate-action prompt.

        Returns:
            ``(results, final_step_was_correct)`` where ``results`` holds one
            ``StepResult`` per step (the accepted try, correct or not) and the
            flag is the ``is_correct`` value of the last recorded result
            (``False`` when nothing was recorded).
        """
        results = []
        previous_steps_content = []

        for step in range(1, self.max_steps + 1):
            # The state depends only on the accepted steps, so it is built once
            # per step and reused for every tactical retry of that step.
            state_text = self.agent.get_state_representation(problem, step, previous_steps_content)
            state_embedding = self.agent.get_text_embedding(state_text)
            feedback_for_retry = None

            for attempt in range(1, self.max_tactical_retries + 1):
                # Fresh candidates are requested on every try, including retries.
                candidate_actions = self.get_learner_candidate_actions(problem, step, previous_steps_content, strategic_guidance)
                print(f"Step {step}/{self.max_steps} (Tac. Attempt {attempt}) | Generated Actions: {candidate_actions}")

                action_text = self.agent.choose_action(state_embedding, candidate_actions)
                action_embedding = self.agent.get_text_embedding(action_text)

                step_content = self.get_learner_step_execution(problem, step, action_text, previous_steps_content, feedback_for_retry)
                is_correct, reward, judge_feedback = self.get_judge_evaluation(problem, step_content, step)

                status_icon = "✅" if is_correct else "❌"
                print(f"  🤖 Chosen Action: '{action_text}' | Result: {status_icon}")
                print(f"  💬 Learner: {step_content.strip()}")
                print(f"  👨‍⚖️ Judge: {judge_feedback.strip()} (Reward: {reward:.1f})")

                # The stored transition always points at the *next* step and is
                # marked done on the last step, whether or not this try was
                # judged correct.
                # NOTE(review): after an incorrect try that will be retried, the
                # solver actually stays on the same step — confirm this
                # next-state/done modelling is intended.
                next_step = step + 1
                done = (next_step > self.max_steps)
                next_state_text = self.agent.get_state_representation(problem, next_step, previous_steps_content + [step_content]) if not done else "terminal"
                next_state_embedding = self.agent.get_text_embedding(next_state_text)

                self.agent.remember(state_embedding, action_embedding, reward, next_state_embedding, done)
                self.agent.replay()

                if is_correct:
                    # Accept the step and move on to the next one.
                    result = StepResult(step, action_text, step_content, is_correct, reward, judge_feedback, attempt)
                    results.append(result)
                    previous_steps_content.append(step_content)
                    time.sleep(1)  # presumably paces the API calls — TODO confirm
                    break
                else:
                    # Feed the judge's feedback into the next try of this step.
                    feedback_for_retry = judge_feedback
                    if attempt == self.max_tactical_retries:
                        # Out of retries: the incorrect step is recorded and
                        # later steps build on it.
                        print(f"  ⚠️ Max tactical retries reached for step {step}. Accepting incorrect step.")
                        result = StepResult(step, action_text, step_content, is_correct, reward, judge_feedback, attempt)
                        results.append(result)
                        previous_steps_content.append(step_content)
                        break
                    time.sleep(1)

        # Only the last recorded step decides success; earlier incorrect steps
        # do not affect this flag.
        final_step_was_correct = results[-1].is_correct if results else False
        return results, final_step_was_correct

    # --- NEW: The "outer loop" that handles strategic retries ---
    def solve_problem_with_strategic_retries(self, problem: MathProblem):
        """
        Solves a problem with a two-tiered retry system.
        If the entire solution fails, it generates a new strategy and starts over.
        """
        print(f"\n" + "="*80)
        print(f"🧮 Solving Problem: {problem.problem} | Expected: {problem.expected_answer}")
        print("="*80)

        strategic_guidance = None
        final_results = []
        
        for strat_attempt in range(1, self.max_strategic_retries + 1):
            print(f"\n--- STRATEGIC ATTEMPT #{strat_attempt}/{self.max_strategic_retries} ---")
            if strategic_guidance:
                print(f"GUIDANCE: {strategic_guidance}")
            print("-" * 55)

            results, final_step_success = self._solve_single_attempt(problem, strategic_guidance)
            
            if final_step_success:
                print("\n" + "🎉" * 20)
                print("🎉 Final step was correct! Problem solved successfully!")
                print("🎉" * 20)
                final_results = results
                break
            else:
                final_results = results # Store the latest failed attempt
                print("\n" + "🔥" * 20)
                print("🔥 Final step was incorrect after all tactical retries.")
                print("🔥 The overall strategy may be flawed.")
                print("🔥" * 20)
                if strat_attempt < self.max_strategic_retries:
                    # Generate a new strategy for the next loop iteration
                    strategic_guidance = self.generate_new_strategy(problem, results)
                else:
                    print("\n" + "🛑" * 20)
                    print("🛑 Max strategic retries reached. Unable to solve the problem.")
                    print("🛑" * 20)

        # Final Summary
        total_reward = sum(r.reward for r in final_results)
        correct_steps = sum(1 for r in final_results if r.is_correct)
        print("\n" + "="*30 + " FINAL SUMMARY " + "="*30)
        print(f"Problem: {problem.problem}")
        print(f"📊 Final Result: {correct_steps}/{self.max_steps} steps correct.")
        print(f"🏆 Total Reward from last attempt: {total_reward:.1f}")
        print(f"📉 Final Epsilon: {self.agent.epsilon:.3f}")
        print("="*74)
        return final_results

# --- Main Execution ---
def create_hard_problems() -> List[MathProblem]:
    """Return the two "very hard" sample problems used to exercise the solver.

    The first asks for the derivative of the Weierstrass function; the second
    asks whether a limit and an integral can be interchanged for a given
    sequence of functions.
    """
    weierstrass = MathProblem(
        problem="Find the derivative of the Weierstrass function, defined as f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx), for 0 < a < 1 and ab > 1 + (3/2)π.",
        problem_type="derivative",
        difficulty="very_hard",
        expected_answer="The function is continuous everywhere but differentiable nowhere. Therefore, the derivative does not exist for any value of x.",
    )
    limit_integral_swap = MathProblem(
        problem="Consider f_n(x) = 2nx*e^(-nx²) on [0, 1]. Evaluate lim[n→∞] ∫[0 to 1] f_n(x) dx and ∫[0 to 1] lim[n→∞] f_n(x) dx. Are they equal?",
        problem_type="real_analysis",
        difficulty="very_hard",
        expected_answer="They are not equal. The integral of the limit is ∫0 dx = 0. The limit of the integral is lim[n→∞] (1 - e⁻ⁿ) = 1. They differ because convergence is not uniform, so the limit and integral cannot be interchanged.",
    )
    return [weierstrass, limit_integral_swap]

def main():
    """Train the DQN agent on the sample problems and save the online network.

    Every problem is solved with the two-tier retry loop. After every
    ``solver.update_target_every`` problems the target network is synced.
    Finally the online Q-network weights are written to
    ``dqn_math_solver_v2.pth``.
    """
    solver = GeminiMathSolver()
    if not solver.api_configured:
        print("\n--- RUNNING IN MOCK MODE. DQN WILL TRAIN ON MOCK DATA. ---")

    # NOTE(review): the sample list has two problems while update_target_every
    # is set to 5 in the solver, so this sync does not fire in such a run.
    for solved_count, prob in enumerate(create_hard_problems(), start=1):
        solver.solve_problem_with_strategic_retries(prob)

        if solved_count % solver.update_target_every == 0:
            print("\n" + "--- 🎯 Updating Target Network ---" + "\n")
            solver.agent.update_target_network()

    print("\n\nDQN training complete.")
    online_weights = solver.agent.q_network.state_dict()
    torch.save(online_weights, "dqn_math_solver_v2.pth")
    print("💾 Model weights saved to dqn_math_solver_v2.pth")


if __name__ == "__main__":
    main()

🧠 DQN Agent using device: cuda
✅ Gemini API configured successfully.

🧮 Solving Problem: Find the derivative of the Weierstrass function, defined as f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx), for 0 < a < 1 and ab > 1 + (3/2)π. | Expected: The function is continuous everywhere but differentiable nowhere. Therefore, the derivative does not exist for any value of x.

--- STRATEGIC ATTEMPT #1/2 ---
-------------------------------------------------------
Step 1/5 (Tac. Attempt 1) | Generated Actions: ['Analyze the convergence properties of the series.', 'Investigate term-by-term differentiation.', 'Explore the use of Fourier series theory.']
  🤖 Chosen Action: 'Explore the use of Fourier series theory.' | Result: ✅
  💬 Learner: Step 1/5: Exploring the use of Fourier Series Theory

The Weierstrass function,  f(x) = Σ [from n=0 to ∞] aⁿ * cos(bⁿπx), is already expressed in a form resembling a Fourier series.  A standard Fourier series representation of a periodic function g(x) with period 2L is