<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/ARC_AGI3_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## CASE1

In [2]:
import numpy as np
import random
from collections import defaultdict
import google.generativeai as genai
# import os # No longer directly using os.getenv for API key, as Colab userdata is preferred
from google.colab import userdata # For Colab Secrets

# --- Agent Configuration Class ---
class AgentConfig:
    LLM_MODEL_NAME: str = "gemini-2.5-flash"

# --- Simplified Grid World Environment (Mimicking ARC-AGI elements) ---
class SimpleArcGridEnv:
    def __init__(self, size=(5, 5), start=(0, 0), goal=(4, 4), obstacles=None, max_steps=50):
        self.size = size
        self.grid = np.zeros(size, dtype=int) # 0 for empty, 1 for goal, 2 for obstacle, 3 for agent
        self.start = start
        self.goal = goal
        self.obstacles = obstacles if obstacles is not None else []
        self.agent_pos = np.array(start)
        self.max_steps = max_steps
        self.current_step = 0
        self.initialize_grid()

        # Define actions: (dx, dy)
        self.actions = {
            0: (-1, 0),  # Up
            1: (1, 0),   # Down
            2: (0, -1),  # Left
            3: (0, 1),   # Right
            # For ARC-AGI, you'd have more complex actions:
            # 4: "change_color_at_current_pos(color_id)",
            # 5: "flood_fill_from_current_pos(color_id)",
            # 6: "copy_selection_to_clipboard()",
            # 7: "paste_clipboard_at_current_pos()"
        }
        self.num_actions = len(self.actions)

    def initialize_grid(self):
        self.grid = np.zeros(self.size, dtype=int)
        self.grid[self.goal[0], self.goal[1]] = 1  # Goal
        for ox, oy in self.obstacles:
            self.grid[ox, oy] = 2  # Obstacle

    def reset(self):
        self.agent_pos = np.array(self.start)
        self.current_step = 0
        self.initialize_grid() # Reset grid to initial state
        # Place agent for observation
        obs_grid = self.grid.copy()
        obs_grid[self.agent_pos[0], self.agent_pos[1]] = 3
        return obs_grid.flatten() # State as flattened grid (or tuple of agent_pos for simpler state space)

    def step(self, action_idx):
        action_vec = self.actions[action_idx]
        next_pos = self.agent_pos + action_vec

        # Keep within bounds
        next_pos[0] = np.clip(next_pos[0], 0, self.size[0] - 1)
        next_pos[1] = np.clip(next_pos[1], 0, self.size[1] - 1)

        reward = -0.1 # Small penalty for each step
        done = False

        # Check for obstacle
        if tuple(next_pos) in self.obstacles:
            reward = -5  # Larger penalty for hitting an obstacle
            # For this simple example, we'll let it "hit" and get penalized, but still move.

        self.agent_pos = next_pos
        self.current_step += 1

        # Check for goal
        if np.array_equal(self.agent_pos, self.goal):
            reward = 100
            done = True

        # Check for max steps
        if self.current_step >= self.max_steps and not done:
            done = True
            reward = -10 # Penalty for not reaching goal within max steps

        obs_grid = self.grid.copy()
        obs_grid[self.agent_pos[0], self.agent_pos[1]] = 3 # Mark agent on observation
        return obs_grid.flatten(), reward, done, {} # obs, reward, done, info

    def render(self):
        display_grid = self.grid.copy()
        display_grid[self.agent_pos[0], self.agent_pos[1]] = 3 # Agent
        print(display_grid)
        print("-" * (self.size[1] * 2 + 1)) # Separator

    def get_state_representation(self):
        # For a Q-table, the state needs to be hashable (e.g., a tuple or string)
        # For simplicity, we'll just use the agent's position as the state
        return tuple(self.agent_pos)

# --- Q-Learning Agent ---
class QLearningAgent:
    def __init__(self, actions, learning_rate=0.1, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.01, epsilon_decay_rate=0.001):
        self.actions = actions # List of action indices
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay_rate = epsilon_decay_rate
        self.q_table = defaultdict(lambda: np.zeros(len(actions)))

    def choose_action(self, state):
        # Epsilon-greedy strategy
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(list(self.actions.keys())) # Explore
        else:
            return np.argmax(self.q_table[state]) # Exploit

    def learn(self, state, action, reward, next_state):
        current_q = self.q_table[state][action]
        max_next_q = np.max(self.q_table[next_state])
        new_q = current_q + self.lr * (reward + self.gamma * max_next_q - current_q)
        self.q_table[state][action] = new_q

    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_end, self.epsilon - self.epsilon_decay_rate)

# --- Gemini LLM Integration (Conceptual) ---
class GeminiArcStrategist:
    def __init__(self, api_key: str, model_name: str):
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)
        self.chat = self.model.start_chat(history=[])
        print(f"Gemini Strategist initialized with model: {model_name}")


    def propose_high_level_strategy(self, current_grid_state_description, task_description="reach the red goal (1) from the green agent (3), avoiding blue obstacles (2)"):
        prompt = f"""
        You are an AI assistant specialized in solving Abstract and Reasoning Corpus (ARC) tasks.
        The current grid state is represented as a flattened array (0: empty, 1: goal, 2: obstacle, 3: agent).
        Current Grid State: {current_grid_state_description}
        Task: {task_description}

        Given this, propose a high-level strategy or sequence of conceptual steps to solve this task.
        Think step-by-step about the abstract goal. Do not output specific low-level actions like "move up".
        Instead, provide a logical plan. For example, "Identify the shortest clear path to the goal" or
        "If an obstacle is detected, try to go around it."
        """
        response = self.chat.send_message(prompt)
        return response.text.strip()

    def interpret_failed_trajectory(self, initial_state_desc, trajectory_log, reason_for_failure="hit an obstacle"):
        prompt = f"""
        An AI agent attempted an ARC-like task and failed.
        Initial Grid State: {initial_state_desc}
        Trajectory (State, Action, Reward, Next State) examples:
        {trajectory_log}
        Reason for Failure: {reason_for_failure}

        Analyze this trajectory and provide insights on why the agent failed at a high level.
        Suggest abstract adjustments to the agent's strategy or learning process.
        """
        response = self.chat.send_message(prompt)
        return response.text.strip()

    def generate_task_specific_rule(self, current_grid_state_description, past_successful_examples_desc, problem_type_desc="navigation"):
        prompt = f"""
        Based on the current grid state and successful past examples, propose a simple, abstract rule that might help solve this {problem_type_desc} problem.
        Current Grid State: {current_grid_state_description}
        Successful Examples (simplified, e.g., "always move towards goal if clear"):
        {past_successful_examples_desc}

        Proposed Rule:
        """
        response = self.chat.send_message(prompt)
        return response.text.strip()

# --- Main Reinforcement Learning Loop with Conceptual Gemini Integration ---
def main():
    # --- Configure Gemini API Key using Colab Secrets ---
    try:
        GOOGLE_API_KEY = userdata.get('GEMINI')
        print("Google Generative AI configured successfully using Colab Secrets.")
    except Exception as e:
        print(f"Failed to retrieve API key from Colab Secrets: {e}")
        print("Please ensure you have added your API key to Colab Secrets named 'GEMINI'.")
        print("Gemini integration will be skipped.")
        GOOGLE_API_KEY = None

    gemini_strategist = None
    if GOOGLE_API_KEY:
        try:
            gemini_strategist = GeminiArcStrategist(GOOGLE_API_KEY, AgentConfig.LLM_MODEL_NAME)
        except Exception as e:
            print(f"Failed to initialize Gemini Strategist: {e}")
            gemini_strategist = None

    # --- Environment and Agent Initialization ---
    env = SimpleArcGridEnv(
        size=(5, 5),
        start=(0, 0),
        goal=(4, 4),
        obstacles=[(1, 1), (1, 2), (2, 1), (3, 3)]
    )
    agent = QLearningAgent(actions=env.actions, epsilon_decay_rate=0.005)

    num_episodes = 5000
    rewards_per_episode = []
    trajectory_log = [] # To log recent trajectory for Gemini analysis

    print(f"\n--- Starting RL Training in EST ---") # Using preferred time zone (2025-07-06 preference)

    for episode in range(num_episodes):
        state_grid = env.reset()
        current_state_tuple = env.get_state_representation()
        done = False
        total_reward = 0
        episode_trajectory = [] # Log for this specific episode

        # --- Gemini High-Level Strategy (Conceptual) ---
        # Only propose strategy once at the beginning or on task change (ARC-AGI-3 is about novel tasks)
        if gemini_strategist and episode == 0:
            print(f"\n--- Proposing Initial High-Level Strategy (Model: {AgentConfig.LLM_MODEL_NAME}) ---")
            grid_desc = str(state_grid.reshape(env.size)) # Reshape for better LLM context
            strategy = gemini_strategist.propose_high_level_strategy(grid_desc)
            print(f"Gemini's initial strategy: {strategy}")
            print("--------------------------------------------------")

        while not done:
            # env.render() # Uncomment to see each step (can be very verbose)
            action = agent.choose_action(current_state_tuple)
            next_state_grid, reward, done, _ = env.step(action)
            next_state_tuple = env.get_state_representation()

            agent.learn(current_state_tuple, action, reward, next_state_tuple)

            total_reward += reward
            episode_trajectory.append((current_state_tuple, action, reward, next_state_tuple))

            current_state_tuple = next_state_tuple

        rewards_per_episode.append(total_reward)
        agent.decay_epsilon()

        if episode % 100 == 0:
            print(f"Episode {episode}: Total Reward = {total_reward:.2f}, Epsilon = {agent.epsilon:.2f}")
            # --- Gemini Trajectory Interpretation (Conceptual) ---
            if gemini_strategist and total_reward < 0: # If episode was a failure
                print(f"\n--- Gemini 2.0 Analyzing Failed Trajectory (Model: {AgentConfig.LLM_MODEL_NAME}, Example) ---")
                initial_grid_desc = str(env.grid.copy().reshape(env.size)) # State at reset
                # Only log a few steps for brevity for LLM, or summarize
                log_snippet = "\n".join([f"State: {s}, Action: {a}, Reward: {r}, Next State: {ns}" for s, a, r, ns in episode_trajectory[:5]])
                failure_reason = "Did not reach goal" if total_reward <= -10 else "Hit obstacles repeatedly"
                interpretation = gemini_strategist.interpret_failed_trajectory(initial_grid_desc, log_snippet, failure_reason)
                print(f"Gemini's analysis: {interpretation}")
                print("----------------------------------------------------")


    print("\n--- Training Complete ---")

    # --- Evaluate Agent Performance (Simple Test) ---
    print("\n--- Testing Trained Agent ---")
    state_grid = env.reset()
    current_state_tuple = env.get_state_representation()
    done = False
    test_reward = 0
    test_steps = 0
    agent.epsilon = 0 # Turn off exploration for testing

    while not done and test_steps < env.max_steps * 2: # Give it more steps for testing
        action = np.argmax(agent.q_table[current_state_tuple]) # Exploit learned policy
        next_state_grid, reward, done, _ = env.step(action)
        current_state_tuple = env.get_state_representation()
        test_reward += reward
        test_steps += 1

    print(f"Test Run: Total Reward = {test_reward:.2f}, Steps = {test_steps}")
    if np.array_equal(env.agent_pos, env.goal):
        print("Agent successfully reached the goal!")
    else:
        print("Agent did not reach the goal during testing.")

if __name__ == "__main__":
    main()

Google Generative AI configured successfully using Colab Secrets.
Gemini Strategist initialized with model: gemini-2.5-flash

--- Starting RL Training in EST ---

--- Proposing Initial High-Level Strategy (Model: gemini-2.5-flash) ---
Gemini's initial strategy: The high-level strategy to solve this pathfinding task involves systematically exploring the grid to find a clear path from the agent to the goal while adhering to the given constraints.

Here's a sequence of conceptual steps:

1.  **Identify Start and Goal:** Pinpoint the exact location of the agent (start point) and the goal (target point) on the grid.
2.  **Identify Obstacles:** Mark all cells containing obstacles as forbidden to traverse.
3.  **Systematic Exploration:** Begin a search process starting from the agent's position.
    *   From the current cell, examine all immediately adjacent cells (up, down, left, right).
    *   For each adjacent cell, check if it is within the grid boundaries and if it is an obstacle.
    *

## CASE2

In [None]:
!pip install xai-sdk -q

In [None]:
!git clone https://github.com/fchollet/ARC.git
!ls ARC/data/training/

In [5]:
import numpy as np
import random
from collections import defaultdict
import logging
import json
import os

# New imports for xAI SDK
from xai_sdk import Client
from xai_sdk.chat import user, system
from google.colab import userdata

# Configure logging to see the full error messages
logging.basicConfig(level=logging.INFO)

# --- Agent Configuration Class ---
class AgentConfig:
    LLM_MODEL_NAME: str = "grok-4-0709"

# --- True ARC-AGI-3 Environment Wrapper (Conceptual) ---
class ArcAGI3TaskEnv:
    """
    A conceptual wrapper for an ARC-AGI-3 task, loading data from JSON.
    This simulates the environment's state and a simplified action space,
    focusing on transforming an input grid to an output.
    """
    def __init__(self, task_filepath: str):
        self.task_filepath = task_filepath
        self.task_data = self._load_task_data()
        self.current_pair_idx = 0 # Focus on the first test pair
        self.current_input_grid = None
        self.current_output_grid_target = None # The target grid for the current task
        self.agent_working_grid = None # The grid the agent actively modifies

        # Define a simplified set of ARC-like actions
        # Each action modifies the self.agent_working_grid
        self.available_actions_map = {
            0: "move_cursor_up",      # Moves an implicit cursor for set_pixel
            1: "move_cursor_down",
            2: "move_cursor_left",
            3: "move_cursor_right",
            # Colors 0-9 for setting pixels
            4: "set_current_pixel_color(0)",
            5: "set_current_pixel_color(1)",
            6: "set_current_pixel_color(2)",
            7: "set_current_pixel_color(3)",
            8: "set_current_pixel_color(4)",
            9: "set_current_pixel_color(5)",
            10: "set_current_pixel_color(6)",
            11: "set_current_pixel_color(7)",
            12: "set_current_pixel_color(8)",
            13: "set_current_pixel_color(9)",
            14: "fill_region_at_cursor()", # Flood fill from cursor (using existing color)
            # New action: Resize the working grid to a target output shape
            15: "resize_to_target_output_shape()" # This action will resize to current_output_grid_target's shape
        }
        self.cursor_pos = (0,0)
        self.max_episode_steps = 200 # Increased max steps for more complex tasks
        self.current_step_count = 0
        self.total_reward = 0

        logging.info(f"ARC-AGI-3 Task Environment initialized for: {os.path.basename(task_filepath)}")

    def _load_task_data(self):
        with open(self.task_filepath, 'r') as f:
            data = json.load(f)
        return data

    def reset(self):
        """
        Resets the environment to the beginning of a test task.
        Returns the initial input grid for the agent.
        """
        logging.info(f"Resetting ARC-AGI-3 task {os.path.basename(self.task_filepath)}")
        self.current_input_grid = np.array(self.task_data['test'][self.current_pair_idx]['input'], dtype=int)
        self.current_output_grid_target = np.array(self.task_data['test'][self.current_pair_idx]['output'], dtype=int)

        # Agent's working grid starts as a copy of the input grid size
        self.agent_working_grid = self.current_input_grid.copy()
        self.cursor_pos = (0,0)
        self.current_step_count = 0
        self.total_reward = 0

        logging.info(f"Loaded Task. Initial Input Grid Shape: {self.current_input_grid.shape}\n{self.current_input_grid}")
        logging.info(f"Target Output Grid Shape: {self.current_output_grid_target.shape}\n{self.current_output_grid_target}")
        self._update_agent_display_grid()

        # State is flattened current working grid
        return self.agent_working_grid.flatten() # Agent perceives its working grid

    def _update_agent_display_grid(self):
        self.display_grid = self.agent_working_grid.copy()

    def step(self, action_id: int):
        """
        Applies an action to the agent's working grid.
        Returns new_state, reward, done, info.
        """
        reward = -0.01 # Small penalty per step
        done = False
        info = {"action_name": self.available_actions_map.get(action_id, "unknown")}

        old_working_grid = self.agent_working_grid.copy() # Capture grid BEFORE modification
        current_r, current_c = self.cursor_pos
        grid_height, grid_width = self.agent_working_grid.shape

        if action_id == 0: # move_cursor_up
            self.cursor_pos = (max(0, current_r - 1), current_c)
        elif action_id == 1: # move_cursor_down
            self.cursor_pos = (min(grid_height - 1, current_r + 1), current_c)
        elif action_id == 2: # move_cursor_left
            self.cursor_pos = (current_r, max(0, current_c - 1))
        elif action_id == 3: # move_cursor_right
            self.cursor_pos = (current_r, min(grid_width - 1, current_c + 1))
        elif 4 <= action_id <= 13: # set_current_pixel_color(color_id)
            color_to_set = action_id - 4
            self.agent_working_grid[current_r, current_c] = color_to_set
            reward -= 0.05
        elif action_id == 14: # fill_region_at_cursor
            target_color = self.agent_working_grid[current_r, current_c]
            available_colors = [c for c in range(10) if c != target_color]
            if not available_colors:
                fill_color = target_color
            else:
                fill_color = random.choice(available_colors)

            if target_color != fill_color:
                q = [(current_r, current_c)]
                visited = set([(current_r, current_c)])
                height, width = self.agent_working_grid.shape
                while q:
                    r, c = q.pop(0)
                    if self.agent_working_grid[r,c] == target_color:
                        self.agent_working_grid[r,c] = fill_color
                        for dr, dc in [(-1,0),(1,0),(0,-1),(0,1)]:
                            nr, nc = r+dr, c+dc
                            if 0 <= nr < height and 0 <= nc < width and \
                                (nr,nc) not in visited and self.agent_working_grid[nr,nc] == target_color:
                                q.append((nr,nc))
                                visited.add((nr,nc))
                reward -= 0.2
        elif action_id == 15: # resize_to_target_output_shape() - NEW ACTION
            target_shape = self.current_output_grid_target.shape
            new_grid = np.zeros(target_shape, dtype=int) # Create a new grid of target size
            # Copy existing content from old grid to new grid (top-left aligned)
            min_rows = min(grid_height, target_shape[0])
            min_cols = min(grid_width, target_shape[1])
            new_grid[:min_rows, :min_cols] = self.agent_working_grid[:min_rows, :min_cols]
            self.agent_working_grid = new_grid
            self.cursor_pos = (0,0) # Reset cursor after resize
            reward -= 0.5 # Significant cost for a major transformation
            info['resized'] = True
        else:
            reward = -0.5 # Penalty for invalid action
            logging.warning(f"Unknown action_id: {action_id}")

        self.current_step_count += 1
        self.total_reward += reward

        # Check for goal achievement (exact match of working grid to target)
        goal_achieved_this_step = False
        if self.agent_working_grid.shape == self.current_output_grid_target.shape:
            if np.array_equal(self.agent_working_grid, self.current_output_grid_target):
                reward += 100 # Large positive reward for exact match
                goal_achieved_this_step = True
                done = True
                logging.info(f"Task completed successfully! Episode reward: {self.total_reward}")
        else:
            reward -= 0.005 # Small penalty for not being in the right shape yet

        # Calculate a pixel correctness reward (for more dense feedback)
        pixel_correctness = 0.0
        old_pixel_correctness = 0.0

        # ONLY calculate pixel correctness if BOTH the current and old grid match target shape
        if self.agent_working_grid.shape == self.current_output_grid_target.shape and \
           old_working_grid.shape == self.current_output_grid_target.shape:
            pixel_correctness = np.sum(self.agent_working_grid == self.current_output_grid_target) / self.agent_working_grid.size
            old_pixel_correctness = np.sum(old_working_grid == self.current_output_grid_target) / old_working_grid.size
            reward += (pixel_correctness - old_pixel_correctness) * 10
        elif goal_achieved_this_step:
            pass # No additional pixel reward if just resized to perfect match, +100 covers it.


        # Check for episode termination (after all rewards/penalties)
        if self.current_step_count >= self.max_episode_steps and not done:
            reward -= 20 # Penalty for not solving in time
            done = True
            logging.info(f"Max steps reached. Task not completed. Episode reward: {self.total_reward}")

        self._update_agent_display_grid()

        return self.agent_working_grid.flatten(), reward, done, info

    def render(self):
        logging.info(f"--- Agent Working Grid (Step {self.current_step_count}, Cursor: {self.cursor_pos}) ---")
        display_grid_with_cursor = self.agent_working_grid.copy()
        if 0 <= self.cursor_pos[0] < display_grid_with_cursor.shape[0] and \
           0 <= self.cursor_pos[1] < display_grid_with_cursor.shape[1]:
            original_color = display_grid_with_cursor[self.cursor_pos]
            display_grid_with_cursor[self.cursor_pos] = 99
            print(display_grid_with_cursor)
            display_grid_with_cursor[self.cursor_pos] = original_color
        else:
            print(display_grid_with_cursor)

        print("-" * (self.agent_working_grid.shape[1] * 2 + 1))

    def get_action_space_size(self):
        return len(self.available_actions_map)

    def get_state_representation(self):
        return tuple(self.agent_working_grid.flatten())

# --- Q-Learning Agent (Adapted for potentially larger state/action space) ---
class QLearningAgent:
    def __init__(self, actions, learning_rate=0.1, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.01, epsilon_decay_rate=0.001):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay_rate = epsilon_decay_rate
        self.q_table = defaultdict(lambda: np.zeros(len(actions)))

    def choose_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(list(self.actions.keys()))
        else:
            if state not in self.q_table:
                return random.choice(list(self.actions.keys()))
            return np.argmax(self.q_table[state])

    def learn(self, state, action, reward, next_state):
        if action < 0 or action >= len(self.actions):
            logging.warning(f"Invalid action {action} passed to learn. Skipping update.")
            return

        current_q = self.q_table[state][action]
        max_next_q = np.max(self.q_table[next_state])
        new_q = current_q + self.lr * (reward + self.gamma * max_next_q - current_q)
        self.q_table[state][action] = new_q

    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_end, self.epsilon - self.epsilon_decay_rate)

# --- Grok 4 LLM Integration ---
class GrokArcStrategist:
    def __init__(self, api_key: str, model_name: str):
        try:
            self.client = Client(api_host="api.x.ai", api_key=api_key)
            self.model_name = model_name
            print(f"GrokArcStrategist initialized with model: {model_name}")
        except Exception as e:
            logging.error(f"Failed to initialize GrokArcStrategist: {e}")
            self.client = None

    def _send_message_with_handling(self, system_prompt: str, user_prompt: str) -> str:
        if not self.client:
            return "Grok API is not initialized."

        try:
            chat_session = self.client.chat.create(model=self.model_name, temperature=0)
            chat_session.append(system(system_prompt))
            chat_session.append(user(user_prompt))
            response = chat_session.sample()

            if response and response.content:
                return response.content.strip()
            else:
                logging.warning(f"Grok response incomplete or empty. Response: {response}")
                return "Grok could not generate a response for this request (empty or malformed)."

        except Exception as e:
            logging.error(f"An error occurred during Grok API call: {e}")
            return "Grok API call failed due to an error."

    def propose_high_level_strategy(self, current_grid_state_description, task_description: str, available_actions_map: dict):
        system_prompt = """
        You are an AI assistant specialized in solving Abstract and Reasoning Corpus (ARC) tasks.
        Your goal is to propose high-level strategies based on grid states and available actions.
        The agent will interact with an environment that provides a grid.
        """
        user_prompt = f"""
        The current ARC-AGI-3 game environment presents a grid and allows modification using various actions.
        Current Grid State:
        {current_grid_state_description}

        Your overarching task goal (inferred by human, for your strategic planning): {task_description}

        Available actions for the agent (ID: Description). These actions modify the grid:
        {json.dumps(available_actions_map, indent=2)}

        Primitive actions like 'move_cursor_up/down/left/right' change the agent's implicit cursor position.
        Actions like 'set_current_pixel_color(X)' change the color of the pixel at the cursor.
        'fill_region_at_cursor' applies a flood fill from the cursor's position.
        'resize_to_target_output_shape' resizes the current grid to the target output size defined by the task.

        Given this, propose a high-level, step-by-step strategy to transform the 'Current Grid State' to achieve the 'overarching task goal'.
        Consider when it would be beneficial to use complex actions (like set_current_pixel_color, fill_region_at_cursor, resize_to_target_output_shape) versus simple cursor movements.
        Think methodically. Your strategy should outline a logical plan.
        """
        return self._send_message_with_handling(system_prompt, user_prompt)

    def interpret_failed_trajectory(self, initial_state_desc, trajectory_log, reason_for_failure="did not achieve the desired grid transformation"):
        system_prompt = """
        You are an AI assistant specialized in analyzing failures in interactive ARC tasks.
        Provide insights on why an agent failed to transform the grid as expected and suggest abstract adjustments to its strategy or learning process.
        """
        user_prompt = f"""
        An AI agent attempted an ARC-AGI-3 task and failed to reach the desired grid configuration.
        Initial Grid State:
        {initial_state_desc}

        Trajectory (State, Action, Reward, Next State, Macro Action Used (True/False)) examples:
        {trajectory_log}
        Reason for Failure: {reason_for_failure}

        Analyze this trajectory and provide insights on why the agent failed at a high level.
        Consider if the agent used available actions effectively to modify the grid, or when it should have used them.
        Suggest abstract adjustments to the agent's strategy or learning process for ARC-AGI-3.
        """
        return self._send_message_with_handling(system_prompt, user_prompt)

# --- Main Reinforcement Learning Loop with Conceptual Grok Integration ---
def main():
    # --- Setup for ARC-AGI-3 Task Files (in Colab) ---
    # This section now *correctly* reflects that you have cloned the ARC repo.

    # Command to clone the repo (if not already cloned in a previous cell):
    # !git clone https://github.com/fchollet/ARC.git

    # --- IMPORTANT: Ensure 'ARC' directory is present in your Colab files. ---
    # After cloning, the path to the task file is relative to the root of your Colab content directory.
    task_directory = "ARC/data/training/"
    task_id = "007bbfb7" # This task is about resizing and filling a 3x3 cross to a 5x5 square
    task_filepath = os.path.join(task_directory, f"{task_id}.json")

    # Double check if the ARC directory exists, and if the specific file exists within it.
    if not os.path.exists("ARC"):
        print("CRITICAL ERROR: 'ARC' directory not found. Please run '!git clone https://github.com/fchollet/ARC.git' in a cell above.")
        return
    if not os.path.exists(task_filepath):
        print(f"CRITICAL ERROR: ARC-AGI-3 task file '{task_filepath}' not found inside the cloned 'ARC' repo.")
        print(f"Please ensure '{task_id}.json' exists in 'ARC/data/training/'")
        return

    # --- Configure xAI API Key using Colab Secrets ---
    try:
        XAI_KEY = userdata.get('XAI_KEY')
        print("xAI API key configured successfully using Colab Secrets.")
    except Exception as e:
        logging.error(f"Failed to retrieve XAI API key from Colab Secrets: {e}")
        print("Please ensure you have added your API key to Colab Secrets named 'XAI_KEY'.")
        print("Grok integration will be skipped.")
        XAI_KEY = None

    grok_strategist = None
    if XAI_KEY:
        grok_strategist = GrokArcStrategist(XAI_KEY, AgentConfig.LLM_MODEL_NAME)
    else:
        logging.warning("Skipping Grok Strategist initialization due to missing API key.")

    # --- Environment and Agent Initialization ---
    env = ArcAGI3TaskEnv(task_filepath=task_filepath)

    # Q-learning agent uses the new env's action space
    agent = QLearningAgent(actions=env.available_actions_map, epsilon_decay_rate=0.005)

    num_episodes = 500 # Adjust number of episodes for this more complex env
    rewards_per_episode = []

    print(f"\n--- Starting RL Training for ARC-AGI-3 Task in EST ---")

    # The human-inferred task goal for Grok to reason about
    # This comes from understanding the input/output pairs in the task JSON.
    # For '007bbfb7.json', the task is to expand the central 3x3 blue cross into a 5x5 blue square.
    task_goal_description_for_llm = """
    The goal is to transform the input grid.
    Observe the 'train' examples to infer the rule: The central 3x3 blue cross (color 1) needs to be expanded into a solid 5x5 blue square.
    The output grid will always be 5x5.
    You need to fill the empty (color 0) cells surrounding the initial cross with blue (color 1) to form a larger square.
    This involves resizing the grid from 3x3 to 5x5, then filling the appropriate cells.
    """


    for episode in range(num_episodes):
        initial_flat_state = env.reset()
        current_state_tuple = env.get_state_representation()
        done = False
        total_reward = 0
        episode_trajectory = []

        if grok_strategist and episode == 0:
            print(f"\n--- Grok 4 Proposing Initial High-Level Strategy (Model: {AgentConfig.LLM_MODEL_NAME}) ---")
            grid_desc = str(initial_flat_state.reshape(env.current_input_grid.shape).tolist()) # Convert to list for LLM context
            strategy = grok_strategist.propose_high_level_strategy(grid_desc, task_goal_description_for_llm, env.available_actions_map)
            print(f"Grok's initial strategy: {strategy}")
            print("--------------------------------------------------")

        while not done:
            # env.render() # Uncomment to see steps, but very verbose
            action = agent.choose_action(current_state_tuple)

            # --- Capture grid BEFORE the step for old_pixel_correctness and trajectory logging ---
            grid_before_step_for_logging = env.agent_working_grid.copy()

            next_state_flat, reward, done, info = env.step(action)
            # next_state_flat is the flattened NumPy array from env.step, it's NOT a tuple yet.
            # current_state_tuple IS a tuple of the flattened grid.

            next_state_tuple = env.get_state_representation() # This converts the new grid to a tuple

            agent.learn(current_state_tuple, action, reward, next_state_tuple)

            total_reward += reward
            # Log action name from the map for Grok's readability
            action_name_logged = env.available_actions_map.get(action, f"Unknown_Action_{action}")

            # Store states as lists for robust JSON serialization and LLM display
            episode_trajectory.append({
                'state_before_action': grid_before_step_for_logging.tolist(),
                'action': action_name_logged,
                'reward': reward,
                'state_after_action': env.agent_working_grid.tolist(), # current state after action
                'macro_used': info.get('macro_action_executed', False)
            })

            current_state_tuple = next_state_tuple

        rewards_per_episode.append(total_reward)
        agent.decay_epsilon()

        if episode % 50 == 0: # Check less frequently due to LLM calls
            print(f"Episode {episode}: Total Reward = {total_reward:.2f}, Epsilon = {agent.epsilon:.2f}")
            if grok_strategist and total_reward < 0: # If episode was a significant failure
                print(f"\n--- Grok 4 Analyzing Failed Trajectory (Model: {AgentConfig.LLM_MODEL_NAME}, Example) ---")
                initial_grid_desc = str(env.current_input_grid.copy().tolist()) # Initial grid for analysis as list
                # Log a few steps for brevity for LLM, including action names and grid changes
                log_snippet_lines = []
                for i, entry in enumerate(episode_trajectory[:5]):
                    # Convert list back to NumPy array for display, then back to string
                    log_snippet_lines.append(
                        f"Step {i}:\n"
                        f"  State Before Action:\n{np.array(entry['state_before_action'])}\n"
                        f"  Action: {entry['action']}, Reward: {entry['reward']:.2f}\n"
                        f"  State After Action:\n{np.array(entry['state_after_action'])}"
                    )
                log_snippet = "\n".join(log_snippet_lines)

                failure_reason = "Did not achieve the target grid transformation (exact match)."
                interpretation = grok_strategist.interpret_failed_trajectory(initial_grid_desc, log_snippet, failure_reason)
                print(f"Grok's analysis: {interpretation}")
                print("----------------------------------------------------")


    print("\n--- Training Complete ---")

    # --- Evaluate Agent Performance (Final Test) ---
    print(f"\n--- Testing Trained Agent on ARC-AGI-3 Task: {os.path.basename(task_filepath)} ---")
    initial_flat_state = env.reset()
    current_state_tuple = env.get_state_representation()
    done = False
    test_reward = 0
    test_steps = 0
    agent.epsilon = 0 # Turn off exploration for testing

    path_taken_actions = []

    while not done and test_steps < env.max_episode_steps * 2: # Give it more steps for testing
        action = np.argmax(agent.q_table[current_state_tuple])
        action_name = env.available_actions_map.get(action, action)
        path_taken_actions.append(f"Action: {action_name}")

        next_state_flat, reward, done, info = env.step(action)
        current_state_tuple = env.get_state_representation()
        test_reward += reward
        test_steps += 1

    print(f"Test Run: Total Reward = {test_reward:.2f}, Steps = {test_steps}")
    print("Path taken (actions):", " -> ".join(path_taken_actions))
    if np.array_equal(env.agent_working_grid, env.current_output_grid_target):
        print("Agent successfully transformed the grid to the target output!")
        env.render() # Show the final grid
        print("Target Output:")
        print(env.current_output_grid_target)
    else:
        print("Agent did not achieve the exact target grid transformation in testing.")
        print("Final Agent Grid:")
        env.render()
        print("Target Output:")
        print(env.current_output_grid_target)

if __name__ == "__main__":
    main()

xAI API key configured successfully using Colab Secrets.
GrokArcStrategist initialized with model: grok-4-0709

--- Starting RL Training for ARC-AGI-3 Task in EST ---

--- Grok 4 Proposing Initial High-Level Strategy (Model: grok-4-0709) ---
Grok's initial strategy: ### High-Level Step-by-Step Strategy for Transforming the Grid

Based on the current grid state ([[7, 0, 7], [7, 0, 7], [7, 7, 0]]), the inferred task goal (expanding a 3x3 cross into a solid 5x5 blue square of color 1 by resizing to 5x5 and filling empty cells with color 1 to form a larger square), and the available actions, I propose the following high-level strategy. This plan assumes:

- The target is a solid 5x5 grid where **all cells are color 1** (blue), as per the goal description. This requires filling all 0s with 1 (including internal 0s in the original cross) and changing the existing 7s to 1 to achieve uniformity.
- The resize action pads the original 3x3 grid in the center of the new 5x5 grid with 0s around it.

## Implement Deep Reinforcement Learning (DRL)

In [1]:
!nvidia-smi

Mon Jul 21 18:41:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   53C    P8             12W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install xai-sdk -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/109.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.5/109.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!git clone https://github.com/fchollet/ARC.git
#!ls ARC/data/training/

In [6]:
import numpy as np
import random
from collections import defaultdict, deque
import logging
import json
import os

# PyTorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# xAI SDK Imports
from xai_sdk import Client
from xai_sdk.chat import user, system
from google.colab import userdata

# Configure logging to see the full error messages
logging.basicConfig(level=logging.INFO)

# Set device for PyTorch (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Agent Configuration Class ---
class AgentConfig:
    LLM_MODEL_NAME: str = "grok-4-0709"
    # DQN Hyperparameters
    DQN_HIDDEN_SIZE = 128
    DQN_BATCH_SIZE = 64
    DQN_GAMMA = 0.99
    DQN_LR = 1e-4
    DQN_REPLAY_BUFFER_SIZE = 10000
    DQN_MIN_REPLAY_SIZE = 1000 # Minimum experiences before training starts
    DQN_TARGET_UPDATE_FREQ = 10 # Update target network every N episodes
    DQN_EPSILON_START = 1.0
    DQN_EPSILON_END = 0.01
    DQN_EPSILON_DECAY_RATE = 0.0005 # Decay per episode


# --- True ARC-AGI-3 Environment Wrapper (Conceptual) ---
class ArcAGI3TaskEnv:
    """
    A conceptual wrapper for an ARC-AGI-3 task, loading data from JSON.
    This simulates the environment's state and a simplified action space,
    focusing on transforming an input grid to an output.
    """
    def __init__(self, task_filepath: str):
        self.task_filepath = task_filepath
        self.task_data = self._load_task_data()
        self.current_pair_idx = 0 # Focus on the first test pair
        self.current_input_grid = None
        self.current_output_grid_target = None # The target grid for the current task
        self.agent_working_grid = None # The grid the agent actively modifies

        # Define a simplified set of ARC-like actions
        self.available_actions_map = {
            0: "move_cursor_up",
            1: "move_cursor_down",
            2: "move_cursor_left",
            3: "move_cursor_right",
            # Colors 0-9 for setting pixels
            4: "set_current_pixel_color(0)",
            5: "set_current_pixel_color(1)",
            6: "set_current_pixel_color(2)",
            7: "set_current_pixel_color(3)",
            8: "set_current_pixel_color(4)",
            9: "set_current_pixel_color(5)",
            10: "set_current_pixel_color(6)",
            11: "set_current_pixel_color(7)",
            12: "set_current_pixel_color(8)",
            13: "set_current_pixel_color(9)",
            14: "fill_region_at_cursor()",
            15: "resize_to_target_output_shape()"
        }
        self.cursor_pos = (0,0)
        self.max_episode_steps = 200
        self.current_step_count = 0
        self.total_reward = 0

        logging.info(f"ARC-AGI-3 Task Environment initialized for: {os.path.basename(task_filepath)}")

    def _load_task_data(self):
        with open(self.task_filepath, 'r') as f:
            data = json.load(f)
        return data

    def reset(self):
        logging.info(f"Resetting ARC-AGI-3 task {os.path.basename(self.task_filepath)}")
        self.current_input_grid = np.array(self.task_data['test'][self.current_pair_idx]['input'], dtype=int)
        self.current_output_grid_target = np.array(self.task_data['test'][self.current_pair_idx]['output'], dtype=int)

        self.agent_working_grid = self.current_input_grid.copy()
        self.cursor_pos = (0,0)
        self.current_step_count = 0
        self.total_reward = 0

        logging.info(f"Loaded Task. Initial Input Grid Shape: {self.current_input_grid.shape}\n{self.current_input_grid}")
        logging.info(f"Target Output Grid Shape: {self.current_output_grid_target.shape}\n{self.current_output_grid_target}")
        self._update_agent_display_grid()

        return self.agent_working_grid.flatten()

    def _update_agent_display_grid(self):
        self.display_grid = self.agent_working_grid.copy()

    def step(self, action_id: int):
        reward = -0.01
        done = False
        info = {"action_name": self.available_actions_map.get(action_id, "unknown")}

        old_working_grid = self.agent_working_grid.copy()
        current_r, current_c = self.cursor_pos
        grid_height, grid_width = self.agent_working_grid.shape

        next_grid_height, next_grid_width = grid_height, grid_width
        if action_id == 15:
             next_grid_height, next_grid_width = self.current_output_grid_target.shape

        if action_id == 0: # move_cursor_up
            self.cursor_pos = (max(0, current_r - 1), current_c)
        elif action_id == 1: # move_cursor_down
            self.cursor_pos = (min(grid_height - 1, current_r + 1), current_c)
        elif action_id == 2: # move_cursor_left
            self.cursor_pos = (current_r, max(0, current_c - 1))
        elif action_id == 3: # move_cursor_right
            self.cursor_pos = (current_r, min(grid_width - 1, current_c + 1))
        elif 4 <= action_id <= 13: # set_current_pixel_color(color_id)
            if 0 <= current_r < grid_height and 0 <= current_c < grid_width:
                color_to_set = action_id - 4
                self.agent_working_grid[current_r, current_c] = color_to_set
                reward -= 0.05
            else:
                reward -= 0.1
        elif action_id == 14: # fill_region_at_cursor
            if 0 <= current_r < grid_height and 0 <= current_c < grid_width:
                target_color = self.agent_working_grid[current_r, current_c]
                available_colors = [c for c in range(10) if c != target_color]
                if not available_colors:
                    fill_color = target_color
                else:
                    fill_color = random.choice(available_colors)

                if target_color != fill_color:
                    q = [(current_r, current_c)]
                    visited = set([(current_r, current_c)])
                    height, width = self.agent_working_grid.shape
                    while q:
                        r, c = q.pop(0)
                        if self.agent_working_grid[r,c] == target_color:
                            self.agent_working_grid[r,c] = fill_color
                            for dr, dc in [(-1,0),(1,0),(0,-1),(0,1)]:
                                nr, nc = r+dr, c+dc
                                if 0 <= nr < height and 0 <= nc < width and \
                                    (nr,nc) not in visited and self.agent_working_grid[nr,nc] == target_color:
                                    q.append((nr,nc))
                                    visited.add((nr,nc))
                    reward -= 0.2
            else:
                reward -= 0.2
        elif action_id == 15: # resize_to_target_output_shape()
            target_shape = self.current_output_grid_target.shape
            new_grid = np.zeros(target_shape, dtype=int)
            min_rows = min(grid_height, target_shape[0])
            min_cols = min(grid_width, target_shape[1])
            new_grid[:min_rows, :min_cols] = self.agent_working_grid[:min_rows, :min_cols]
            self.agent_working_grid = new_grid
            self.cursor_pos = (0,0)
            reward -= 0.5
            info['resized'] = True
        else:
            reward = -0.5
            logging.warning(f"Unknown action_id: {action_id}")

        self.current_step_count += 1
        self.total_reward += reward

        goal_achieved_this_step = False
        if self.agent_working_grid.shape == self.current_output_grid_target.shape:
            if np.array_equal(self.agent_working_grid, self.current_output_grid_target):
                reward += 100
                goal_achieved_this_step = True
                done = True
                logging.info(f"Task completed successfully! Episode reward: {self.total_reward}")
        else:
            reward -= 0.005

        pixel_correctness = 0.0
        old_pixel_correctness = 0.0

        if self.agent_working_grid.shape == self.current_output_grid_target.shape and \
           old_working_grid.shape == self.current_output_grid_target.shape:
            pixel_correctness = np.sum(self.agent_working_grid == self.current_output_grid_target) / self.agent_working_grid.size
            old_pixel_correctness = np.sum(old_working_grid == self.current_output_grid_target) / old_working_grid.size
            reward += (pixel_correctness - old_pixel_correctness) * 10
        elif goal_achieved_this_step:
            pass

        if self.current_step_count >= self.max_episode_steps and not done:
            reward -= 20
            done = True
            logging.info(f"Max steps reached. Task not completed. Episode reward: {self.total_reward}")

        self._update_agent_display_grid()

        return self.agent_working_grid.flatten(), reward, done, info

    def render(self):
        logging.info(f"--- Agent Working Grid (Step {self.current_step_count}, Cursor: {self.cursor_pos}) ---")
        display_grid_with_cursor = self.agent_working_grid.copy()
        if 0 <= self.cursor_pos[0] < display_grid_with_cursor.shape[0] and \
           0 <= self.cursor_pos[1] < display_grid_with_cursor.shape[1]:
            original_color = display_grid_with_cursor[self.cursor_pos]
            display_grid_with_cursor[self.cursor_pos] = 99
            print(display_grid_with_cursor)
            display_grid_with_cursor[self.cursor_pos] = original_color
        else:
            print(display_grid_with_cursor)

        print("-" * (self.agent_working_grid.shape[1] * 2 + 1))

    def get_action_space_size(self):
        return len(self.available_actions_map)

    def get_state_representation(self):
        return tuple(self.agent_working_grid.flatten())


# --- DQN Model (Neural Network for Q-function approximation) ---
class DQN(nn.Module):
    def __init__(self, input_dim, action_size, hidden_size):
        super(DQN, self).__init__()
        # Input: Flattened grid. For 007bbfb7, initially 3x3=9, after resize 9x9=81.
        # Max possible input size will be 9x9=81. We need to handle variable input size
        # Or, assume max input size and pad smaller ones.
        # This MLP will now expect `input_dim` features.
        self.fc1 = nn.Linear(input_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        # state is expected to be a flattened tensor [batch_size, input_dim]
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


# --- Replay Buffer ---
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # state and next_state should already be numpy arrays (padded if necessary)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        if batch_size > len(self.buffer):
            return random.sample(self.buffer, len(self.buffer))
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)


# --- DQNAgent ---
class DQNAgent:
    def __init__(self, state_dim, action_size, hyperparameters):
        self.action_size = action_size
        self.gamma = hyperparameters.DQN_GAMMA
        self.lr = hyperparameters.DQN_LR
        self.batch_size = hyperparameters.DQN_BATCH_SIZE
        self.epsilon = hyperparameters.DQN_EPSILON_START
        self.epsilon_end = hyperparameters.DQN_EPSILON_END
        self.epsilon_decay_rate = hyperparameters.DQN_EPSILON_DECAY_RATE
        self.target_update_freq = hyperparameters.DQN_TARGET_UPDATE_FREQ

        self.policy_net = DQN(state_dim, action_size, hyperparameters.DQN_HIDDEN_SIZE).to(device)
        self.target_net = DQN(state_dim, action_size, hyperparameters.DQN_HIDDEN_SIZE).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()

        self.memory = ReplayBuffer(hyperparameters.DQN_REPLAY_BUFFER_SIZE)
        self.steps_done = 0

        # Define the padding function here, as max_input_dim is needed.
        self.max_input_dim = state_dim # This is the fixed size the NN expects.

    def _pad_state_to_max(self, state_flat_np):
        current_dim = len(state_flat_np)
        if current_dim < self.max_input_dim:
            return np.pad(state_flat_np, (0, self.max_input_dim - current_dim), 'constant', constant_values=0)
        return state_flat_np

    def choose_action(self, state_tuple):
        # Pad the state_np *before* converting to tensor and passing to network
        state_np = np.array(state_tuple, dtype=np.float32)
        padded_state_np = self._pad_state_to_max(state_np) # Apply padding here
        state_tensor = torch.from_numpy(padded_state_np).float().unsqueeze(0).to(device)

        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        else:
            self.policy_net.eval()
            with torch.no_grad():
                q_values = self.policy_net(state_tensor)
            self.policy_net.train()
            return q_values.argmax(1).item()

    def learn(self):
        if len(self.memory) < AgentConfig.DQN_MIN_REPLAY_SIZE:
            return

        current_batch_size = min(self.batch_size, len(self.memory))
        transitions = self.memory.sample(current_batch_size)
        batch_state_np, batch_action, batch_reward, batch_next_state_np, batch_done = zip(*transitions)

        # Pad states in the batch
        batch_state_padded = torch.tensor([self._pad_state_to_max(s) for s in batch_state_np], dtype=torch.float32).to(device)
        batch_action = torch.tensor(batch_action, dtype=torch.long).unsqueeze(1).to(device)
        batch_reward = torch.tensor(batch_reward, dtype=torch.float32).unsqueeze(1).to(device)
        batch_next_state_padded = torch.tensor([self._pad_state_to_max(ns) for ns in batch_next_state_np], dtype=torch.float32).to(device)
        batch_done = torch.tensor(batch_done, dtype=torch.float32).unsqueeze(1).to(device)

        current_q_values = self.policy_net(batch_state_padded).gather(1, batch_action)

        next_q_values = self.target_net(batch_next_state_padded).max(1)[0].unsqueeze(1)
        expected_q_values = batch_reward + (self.gamma * next_q_values * (1 - batch_done))

        loss = self.criterion(current_q_values, expected_q_values.detach())

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
        self.optimizer.step()

        self.steps_done += 1

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

    def decay_epsilon(self, episode):
        self.epsilon = AgentConfig.DQN_EPSILON_END + (AgentConfig.DQN_EPSILON_START - AgentConfig.DQN_EPSILON_END) * \
                       np.exp(-AgentConfig.DQN_EPSILON_DECAY_RATE * episode)


# --- Grok 4 LLM Integration (remains mostly same) ---
class GrokArcStrategist:
    def __init__(self, api_key: str, model_name: str):
        try:
            self.client = Client(api_host="api.x.ai", api_key=api_key)
            self.model_name = model_name
            print(f"GrokArcStrategist initialized with model: {model_name}")
        except Exception as e:
            logging.error(f"Failed to initialize GrokArcStrategist: {e}")
            self.client = None

    def _send_message_with_handling(self, system_prompt: str, user_prompt: str) -> str:
        if not self.client:
            return "Grok API is not initialized."

        try:
            chat_session = self.client.chat.create(model=self.model_name, temperature=0)
            chat_session.append(system(system_prompt))
            chat_session.append(user(user_prompt))
            response = chat_session.sample()

            if response and response.content:
                return response.content.strip()
            else:
                logging.warning(f"Grok response incomplete or empty. Response: {response}")
                return "Grok could not generate a response for this request (empty or malformed)."

        except Exception as e:
            logging.error(f"An error occurred during Grok API call: {e}")
            return "Grok API call failed due to an error."

    def propose_high_level_strategy(self, current_grid_state_description, task_description: str, available_actions_map: dict):
        system_prompt = """
        You are an AI assistant specialized in solving Abstract and Reasoning Corpus (ARC) tasks.
        Your goal is to propose high-level strategies based on grid states and available actions.
        The agent will interact with an environment that provides a grid.
        """
        user_prompt = f"""
        The current ARC-AGI-3 game environment presents a grid and allows modification using various actions.
        Current Grid State:
        {current_grid_state_description}

        Your overarching task goal (inferred by human, for your strategic planning): {task_description}

        Available actions for the agent (ID: Description). These actions modify the grid:
        {json.dumps(available_actions_map, indent=2)}

        Primitive actions like 'move_cursor_up/down/left/right' change the agent's implicit cursor position.
        Actions like 'set_current_pixel_color(X)' change the color of the pixel at the cursor.
        'fill_region_at_cursor' applies a flood fill from the cursor's position.
        'resize_to_target_output_shape' resizes the current grid to the target output size defined by the task.

        Given this, propose a high-level, step-by-step strategy to transform the 'Current Grid State' to achieve the 'overarching task goal'.
        Consider when it would be beneficial to use complex actions (like set_current_pixel_color, fill_region_at_cursor, resize_to_target_output_shape) versus simple cursor movements.
        Think methodically. Your strategy should outline a logical plan.
        """
        return self._send_message_with_handling(system_prompt, user_prompt)

    def interpret_failed_trajectory(self, initial_state_desc, trajectory_log, reason_for_failure="did not achieve the desired grid transformation"):
        system_prompt = """
        You are an AI assistant specialized in analyzing failures in interactive ARC tasks.
        Provide insights on why an agent failed to transform the grid as expected and suggest abstract adjustments to its strategy or learning process.
        """
        user_prompt = f"""
        An AI agent attempted an ARC-AGI-3 task and failed to reach the desired grid configuration.
        Initial Grid State:
        {initial_state_desc}

        Trajectory (State, Action, Reward, Next State, Macro Action Used (True/False)) examples:
        {trajectory_log}
        Reason for Failure: {reason_for_failure}

        Analyze this trajectory and provide insights on why the agent failed at a high level.
        Consider if the agent used available actions effectively to modify the grid, or when it should have used them.
        Suggest abstract adjustments to the agent's strategy or learning process for ARC-AGI-3.
        """
        return self._send_message_with_handling(system_prompt, user_prompt)

# --- Main Reinforcement Learning Loop with Conceptual Grok Integration ---
def main():
    # --- Setup for ARC-AGI-3 Task Files (in Colab) ---
    # Command to clone the repo (if not already cloned in a previous cell):
    # !git clone https://github.com/fchollet/ARC.git

    # --- IMPORTANT: Ensure 'ARC' directory is present in your Colab files. ---
    task_directory = "ARC/data/training/"
    task_id = "007bbfb7" # This task is about resizing and filling a 3x3 cross to a 5x5 square
    task_filepath = os.path.join(task_directory, f"{task_id}.json")

    if not os.path.exists("ARC"):
        print("CRITICAL ERROR: 'ARC' directory not found. Please run '!git clone https://github.com/fchollet/ARC.git' in a cell above.")
        return
    if not os.path.exists(task_filepath):
        print(f"CRITICAL ERROR: ARC-AGI-3 task file '{task_filepath}' not found inside the cloned 'ARC' repo.")
        print(f"Please ensure '{task_id}.json' exists in 'ARC/data/training/'")
        return

    # --- Configure xAI API Key using Colab Secrets ---
    try:
        XAI_KEY = userdata.get('XAI_KEY')
        print("xAI API key configured successfully using Colab Secrets.")
    except Exception as e:
        logging.error(f"Failed to retrieve XAI API key from Colab Secrets: {e}")
        print("Please ensure you have added your API key to Colab Secrets named 'XAI_KEY'.")
        print("Grok integration will be skipped.")
        XAI_KEY = None

    grok_strategist = None
    if XAI_KEY:
        grok_strategist = GrokArcStrategist(XAI_KEY, AgentConfig.LLM_MODEL_NAME)
    else:
        logging.warning("Skipping Grok Strategist initialization due to missing API key.")

    # --- Environment and Agent Initialization ---
    env = ArcAGI3TaskEnv(task_filepath=task_filepath)
    # --- FIX: Call env.reset() BEFORE accessing env.current_output_grid_target ---
    initial_observation_flat_np = env.reset() # This populates current_output_grid_target

    # State dimension for DQN: max possible flattened grid size (9x9=81 for this task)
    state_dim = env.current_output_grid_target.size # Now this will work
    action_size = env.get_action_space_size()

    # Replace QLearningAgent with DQNAgent
    agent = DQNAgent(state_dim, action_size, AgentConfig)

    num_episodes = 1000
    rewards_per_episode = []

    print(f"\n--- Starting DRL Training for ARC-AGI-3 Task in EST ---")

    task_goal_description_for_llm = """
    The goal is to transform the input grid.
    Observe the 'train' examples to infer the rule: The central 3x3 blue cross (color 1) needs to be expanded into a solid 5x5 blue square.
    The output grid will always be 5x5.
    You need to fill the empty (color 0) cells surrounding the initial cross with blue (color 1) to form a larger square.
    This involves resizing the grid from 3x3 to 5x5, then filling the appropriate cells.
    """

    for episode in range(num_episodes):
        initial_flat_state_np = env.reset() # Reset for each episode
        # Get the initial state as a tuple for consistency with how it's stored and passed around.
        current_state_tuple = env.get_state_representation()

        done = False
        total_reward = 0
        episode_trajectory = []

        if grok_strategist and episode == 0:
            print(f"\n--- Grok 4 Proposing Initial High-Level Strategy (Model: {AgentConfig.LLM_MODEL_NAME}) ---")
            grid_desc = str(initial_flat_state_np.reshape(env.current_input_grid.shape).tolist())
            strategy = grok_strategist.propose_high_level_strategy(grid_desc, task_goal_description_for_llm, env.available_actions_map)
            print(f"Grok's initial strategy: {strategy}")
            print("--------------------------------------------------")

        while not done:
            # env.render() # Uncomment to see steps, but very verbose

            grid_before_step_for_logging = env.agent_working_grid.copy()

            action = agent.choose_action(current_state_tuple) # Agent picks action
            next_state_flat_np, reward, done, info = env.step(action) # Env returns numpy array

            next_state_tuple = env.get_state_representation() # Get tuple for next state

            # Store experience in replay buffer (states should be padded NumPy arrays or original)
            # The agent.learn() will handle padding the states from the buffer when they are sampled.
            # So, store the states *as they are returned by get_state_representation (tuples)*,
            # and _pad_state_to_max will handle converting from tuple to padded np array when needed by NN.
            agent.memory.push(current_state_tuple, action, reward, next_state_tuple, done)

            agent.learn()

            total_reward += reward
            action_name_logged = env.available_actions_map.get(action, f"Unknown_Action_{action}")

            episode_trajectory.append({
                'state_before_action': grid_before_step_for_logging.tolist(),
                'action': action_name_logged,
                'reward': reward,
                'state_after_action': env.agent_working_grid.tolist(),
                'macro_used': info.get('macro_action_executed', False)
            })

            current_state_tuple = next_state_tuple

        rewards_per_episode.append(total_reward)
        agent.decay_epsilon(episode)

        if episode % AgentConfig.DQN_TARGET_UPDATE_FREQ == 0:
            agent.update_target_network()

        if episode % 50 == 0:
            print(f"Episode {episode}: Total Reward = {total_reward:.2f}, Epsilon = {agent.epsilon:.2f}")
            if grok_strategist and total_reward < 0 and episode > 0:
                print(f"\n--- Grok 4 Analyzing Failed Trajectory (Model: {AgentConfig.LLM_MODEL_NAME}, Example) ---")
                initial_grid_desc = str(env.current_input_grid.copy().tolist())
                log_snippet_lines = []
                for i, entry in enumerate(episode_trajectory[:5]):
                    log_snippet_lines.append(
                        f"Step {i}:\n"
                        f"  State Before Action:\n{np.array(entry['state_before_action'])}\n"
                        f"  Action: {entry['action']}, Reward: {entry['reward']:.2f}\n"
                        f"  State After Action:\n{np.array(entry['state_after_action'])}"
                    )
                log_snippet = "\n".join(log_snippet_lines)

                failure_reason = "Did not achieve the target grid transformation (exact match)."
                interpretation = grok_strategist.interpret_failed_trajectory(initial_grid_desc, log_snippet, failure_reason)
                print(f"Grok's analysis: {interpretation}")
                print("----------------------------------------------------")


    print("\n--- Training Complete ---")

    print(f"\n--- Testing Trained Agent on ARC-AGI-3 Task: {os.path.basename(task_filepath)} ---")
    initial_flat_state_np = env.reset()
    current_state_tuple = env.get_state_representation()
    done = False
    test_reward = 0
    test_steps = 0
    agent.epsilon = 0 # Turn off exploration for testing

    path_taken_actions = []
    agent.policy_net.eval()

    while not done and test_steps < env.max_episode_steps * 2:
        action = agent.choose_action(current_state_tuple)
        action_name = env.available_actions_map.get(action, action)
        path_taken_actions.append(f"Action: {action_name}")

        next_state_flat_np, reward, done, info = env.step(action)
        current_state_tuple = env.get_state_representation()
        test_reward += reward
        test_steps += 1

    print(f"Test Run: Total Reward = {test_reward:.2f}, Steps = {test_steps}")
    print("Path taken (actions):", " -> ".join(path_taken_actions))
    if np.array_equal(env.agent_working_grid, env.current_output_grid_target):
        print("Agent successfully transformed the grid to the target output!")
        env.render()
        print("Target Output:")
        print(env.current_output_grid_target)
    else:
        print("Agent did not achieve the exact target grid transformation in testing.")
        print("Final Agent Grid:")
        env.render()
        print("Target Output:")
        print(env.current_output_grid_target)

if __name__ == "__main__":
    main()

Using device: cuda
xAI API key configured successfully using Colab Secrets.
GrokArcStrategist initialized with model: grok-4-0709

--- Starting DRL Training for ARC-AGI-3 Task in EST ---

--- Grok 4 Proposing Initial High-Level Strategy (Model: grok-4-0709) ---
Grok's initial strategy: ### High-Level Step-by-Step Strategy for Transforming the Grid

Based on the current grid state ([[7, 0, 7], [7, 0, 7], [7, 7, 0]]), the inferred task goal (expanding a central 3x3 cross into a solid 5x5 square of color 1 by resizing and filling 0s with color 1), and the available actions, I have devised a logical, efficient strategy. The strategy prioritizes complex actions (resize, fill_region_at_cursor, set_current_pixel_color) for broad transformations and efficiency, falling back to simple cursor movements only for precise targeting or when complex actions can't cover everything in one step. 

I assume the following about the environment based on typical ARC-like tasks and action names:
- The cursor