# PIP

In [3]:
pip install gymnasium ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
   ---------------------------------------- 2.2/2.2 MB 13.7 MB/s eta 0:00:00
Installing collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets
Successfully installed ipywidgets-8.1.7 jupyterlab_widgets-3.0.15 widgetsnbextension-4.0.14
Note: you may need to restart the kernel to use updated packages.


# LOCKANDKEY

## 1 IMPORTS

In [72]:
# CELL 1: Imports & helpers
# Paste in first cell
import os
import time
import math
import pickle
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, deque

# Import your external env (must be in same folder)
from lock_key_env import LockKeyEnv

# Pygame used for menu + visualization
import pygame

# Small safe initializer used before creating fonts/displays
def pygame_safe_init():
    """Bring up pygame and its font module, skipping whichever is already initialised.

    Safe to call repeatedly, e.g. again after a previous window called
    ``pygame.quit()``.
    """
    for module in (pygame, pygame.font):
        if not module.get_init():
            module.init()

# small helper: moving average
def moving_avg(arr, window):
    """Return the simple moving average of ``arr`` over ``window`` samples.

    When ``arr`` holds fewer than ``window`` values there is nothing to
    average, so the input is returned unchanged as a NumPy array. Otherwise
    the result has ``len(arr) - window + 1`` entries ('valid' convolution).
    """
    if window > len(arr):
        return np.array(arr)
    kernel = np.ones(window) / window  # uniform weights summing to 1
    return np.convolve(arr, kernel, mode='valid')


## 2 RL AGENTS

In [73]:
# ----------------- CELL 2: RL Agents (Q-Learning, Monte Carlo, Actor-Critic) -----------------
import numpy as np
from collections import defaultdict

# --- Utility to make states hashable ---
def hashable_state(s):
    """
    Turn an observation into a value that can be used as a dictionary key.

    - dict     -> tuple of (key, converted value) pairs, sorted
    - ndarray  -> flat tuple of Python floats (the array shape is not kept)
    - list/tuple -> tuple with every element converted recursively
    - anything else -> returned as-is when hashable, otherwise its ``str()``
    """
    if isinstance(s, dict):
        pairs = [(key, hashable_state(value)) for key, value in s.items()]
        pairs.sort()
        return tuple(pairs)

    if isinstance(s, np.ndarray):
        return tuple(float(x) for x in s.flatten())

    if isinstance(s, (list, tuple)):
        return tuple(hashable_state(item) for item in s)

    # Scalars and other objects: probe hashability, fall back to the string form.
    try:
        hash(s)
    except TypeError:
        return str(s)
    return s


# ----------------- Q-Learning Agent -----------------
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q-table is a ``defaultdict`` keyed by ``hashable_state(observation)``;
    unseen states start with an all-zero row of ``n_actions`` values.
    """

    def __init__(self, n_states=None, n_actions=4, alpha=0.1, gamma=0.9, epsilon=0.1):
        # n_states is only stored; the table grows lazily as states are seen.
        self.n_states = n_states
        self.n_actions = n_actions
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.epsilon = epsilon  # exploration probability
        self.Q = defaultdict(lambda: np.zeros(self.n_actions))

    def select_action(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        key = hashable_state(state)
        explore = np.random.rand() < self.epsilon
        if explore:
            return np.random.randint(self.n_actions)
        greedy = np.argmax(self.Q[key])
        return int(greedy)

    def update(self, state, action, reward, next_state, done):
        """Apply one Q-learning backup for the transition (state, action, reward, next_state).

        ``done`` switches off bootstrapping from ``next_state``.
        """
        s_key = hashable_state(state)
        ns_key = hashable_state(next_state)
        target = reward + self.gamma * np.max(self.Q[ns_key]) * (1 - done)
        q_row = self.Q[s_key]
        old_value = q_row[action]
        q_row[action] = old_value + self.alpha * (target - old_value)


# ----------------- Monte Carlo Agent -----------------
class MonteCarloAgent:
    """Tabular first-visit Monte Carlo control agent with epsilon-greedy actions.

    ``Q`` maps ``hashable_state(observation)`` to a row of action values and
    ``returns`` keeps every sampled return per (state, action) pair; each
    Q-value is the mean of the returns recorded for that pair.
    """

    def __init__(self, n_states=None, n_actions=4, gamma=0.9, epsilon=0.1):
        # n_states is only stored; the tables grow lazily as states are seen.
        self.n_states = n_states
        self.n_actions = n_actions
        self.gamma = gamma      # discount factor
        self.epsilon = epsilon  # exploration probability
        self.Q = defaultdict(lambda: np.zeros(self.n_actions))
        self.returns = defaultdict(list)

    def select_action(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        state = hashable_state(state)
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.Q[state]))

    def policy(self, state):
        """Alias of :meth:`select_action`.

        The training loop in this notebook calls ``agent.policy(...)`` for
        Monte Carlo agents, so the name must exist.
        """
        return self.select_action(state)

    def update(self, episode):
        """Update Q from one finished episode using first-visit returns.

        Args:
            episode: list of ``(state, action, reward)`` tuples in the order
                they were experienced. An empty list is a no-op.
        """
        # Hash every state once up front.
        steps = [(hashable_state(s), a, r) for s, a, r in episode]

        # Index of the FIRST occurrence of each (state, action) pair. Walking
        # the episode backwards with a plain "visited" set would instead pick
        # the LAST occurrence, which is not first-visit Monte Carlo.
        first_visit = {}
        for t, (s, a, _) in enumerate(steps):
            first_visit.setdefault((s, a), t)

        G = 0
        for t in range(len(steps) - 1, -1, -1):
            s, a, r = steps[t]
            G = self.gamma * G + r  # discounted return from step t onward
            if first_visit[(s, a)] == t:
                self.returns[(s, a)].append(G)
                self.Q[s][a] = np.mean(self.returns[(s, a)])


# ----------------- Actor-Critic Agent -----------------
class ActorCriticAgent:
    """Tabular one-step actor-critic agent.

    ``V`` is the critic's state-value table and ``pi`` holds one probability
    vector per state (uniform for unseen states). Both are keyed by
    ``hashable_state(observation)``.
    """

    def __init__(self, n_states=None, n_actions=4, alpha=0.1, beta=0.01, gamma=0.9,
                 actor_lr=None, critic_lr=None):
        """
        Args:
            n_states: stored only; tables grow lazily.
            n_actions: number of discrete actions.
            alpha: critic learning rate.
            beta: actor learning rate.
            gamma: discount factor.
            actor_lr: optional alias for ``beta``; takes precedence when given.
            critic_lr: optional alias for ``alpha``; takes precedence when given.
        """
        self.n_states = n_states
        self.n_actions = n_actions
        # Accept the actor_lr/critic_lr spelling used by the notebook's
        # training loop while keeping alpha/beta working for existing callers.
        self.alpha = alpha if critic_lr is None else critic_lr  # critic lr
        self.beta = beta if actor_lr is None else actor_lr      # actor lr
        self.gamma = gamma

        self.V = defaultdict(float)
        self.pi = defaultdict(lambda: np.ones(self.n_actions) / self.n_actions)

    def select_action(self, state):
        """Sample an action from the current policy for ``state``."""
        state = hashable_state(state)
        probs = self.pi[state]
        # Cast to a plain int so the return type matches the other agents.
        return int(np.random.choice(self.n_actions, p=probs))

    def update(self, state, action, reward, next_state, done):
        """Run one TD(0) critic update followed by an actor update.

        ``done`` switches off bootstrapping from ``next_state``.
        """
        state = hashable_state(state)
        next_state = hashable_state(next_state)

        # TD error (computed with the critic's value before it is updated)
        td_target = reward + self.gamma * self.V[next_state] * (1 - done)
        td_error = td_target - self.V[state]
        self.V[state] += self.alpha * td_error

        # Actor update: shift probability mass toward (or away from) the
        # chosen action in proportion to the TD error.
        probs = self.pi[state]
        one_hot = np.zeros_like(probs)
        one_hot[action] = 1.0
        self.pi[state] += self.beta * td_error * (one_hot - probs)

        # Keep a valid distribution: floor tiny/negative entries, renormalise.
        self.pi[state] = np.clip(self.pi[state], 1e-5, 1.0)
        self.pi[state] /= np.sum(self.pi[state])


## 3 PYGAME MENU

In [74]:
# CELL 3: Pygame menu & parameter UI (paste into third cell)
# Usage: algo, params = menu_and_params()
def draw_button(screen, rect, text, font, color=(100,200,150), border=2):
    """Draw a filled, black-outlined rectangle with ``text`` centred inside it.

    Args:
        screen: surface to draw on.
        rect: ``pygame.Rect`` giving the button area.
        text: label to render.
        font: pygame font used for the label.
        color: fill colour of the button.
        border: outline thickness in pixels.
    """
    black = (0, 0, 0)
    pygame.draw.rect(screen, color, rect)
    pygame.draw.rect(screen, black, rect, border)

    label = font.render(text, True, black)
    offset_x = (rect.width - label.get_width()) // 2
    offset_y = (rect.height - label.get_height()) // 2
    screen.blit(label, (rect.x + offset_x, rect.y + offset_y))

def _parse_menu_number(text):
    """Parse text typed into a parameter box as an int, else as a finite float.

    Raises:
        ValueError: if ``text`` is not a number or is NaN/infinite.
    """
    import math  # local import keeps this helper self-contained

    text = text.strip()
    try:
        return int(text)
    except ValueError:
        pass
    value = float(text)  # also accepts scientific notation such as "1e-3"
    if not math.isfinite(value):
        raise ValueError(f'non-finite number: {text!r}')
    return value


def menu_and_params(initial_params=None):
    """Show the pygame start menu and return the user's choices.

    LEFT/RIGHT cycles the algorithm, clicking a parameter box starts editing
    it (ENTER commits, ESC cancels the edit), and ENTER outside of editing
    starts the run. Closing the window or pressing ESC outside of editing
    quits pygame and raises ``SystemExit``.

    Args:
        initial_params: optional dict whose entries override the defaults.

    Returns:
        tuple ``(algorithm_name, params)`` where ``algorithm_name`` is one of
        'Q-Learning', 'Monte Carlo', 'Actor-Critic' and ``params`` is a dict.
    """
    pygame_safe_init()
    w,h = 980, 620
    screen = pygame.display.set_mode((w,h))
    pygame.display.set_caption('Locke N Key — Choose Algorithm & Parameters')
    font = pygame.font.SysFont('arial', 22)
    small = pygame.font.SysFont('arial', 16)

    algo_choices = ['Q-Learning', 'Monte Carlo', 'Actor-Critic']
    algo_idx = 0
    params = {
        'episodes': 800,
        'gamma': 0.9,
        'epsilon': 0.2,
        'alpha': 0.1,
        'actor_lr': 0.001,
        'critic_lr': 0.005,
        'phase': 5
    }
    if initial_params:
        params.update(initial_params)

    input_active = None   # name of the parameter being edited, or None
    input_text = ''       # current contents of the edit buffer
    clock = pygame.time.Clock()
    start_sim = False

    # Parameter boxes as (x, y, width, height).
    boxes = {
        'episodes': (50,220,220,44),
        'gamma':    (300,220,160,44),
        'epsilon':  (480,220,160,44),
        'alpha':    (50,300,160,44),
        'phase':    (240,300,120,44),
        'actor_lr': (380,300,160,44),
        'critic_lr':(560,300,160,44)
    }
    # Caption drawn 20 px above each box.
    labels = {
        'episodes': 'Episodes (int)',
        'gamma': 'Gamma (γ)',
        'epsilon': 'Epsilon (ε)',
        'alpha': 'Alpha (Q lr)',
        'phase': 'Phase (1-5)',
        'actor_lr': 'Actor LR',
        'critic_lr': 'Critic LR'
    }

    # placeholder style vars (easy to change)
    bg_color = (245,245,245)
    box_color = (255,255,255)
    title_color = (30,30,30)
    black = (0,0,0)

    while True:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit(); raise SystemExit()
            elif event.type == pygame.KEYDOWN:
                if input_active is not None:
                    if event.key == pygame.K_RETURN:
                        try:
                            params[input_active] = _parse_menu_number(input_text)
                        except ValueError:
                            pass  # invalid entry: keep the previous value
                        input_active = None; input_text = ''
                    elif event.key == pygame.K_ESCAPE:
                        # Cancel the edit instead of typing a control char.
                        input_active = None; input_text = ''
                    elif event.key == pygame.K_BACKSPACE:
                        input_text = input_text[:-1]
                    elif event.unicode and event.unicode.isprintable():
                        # Ignore modifier/control keys (empty or non-printable).
                        input_text += event.unicode
                else:
                    if event.key == pygame.K_RIGHT:
                        algo_idx = (algo_idx + 1) % len(algo_choices)
                    elif event.key == pygame.K_LEFT:
                        algo_idx = (algo_idx - 1) % len(algo_choices)
                    elif event.key == pygame.K_RETURN:
                        start_sim = True
                    elif event.key == pygame.K_ESCAPE:
                        pygame.quit(); raise SystemExit()

            elif event.type == pygame.MOUSEBUTTONDOWN:
                # Use the position recorded with the click itself; the live
                # cursor may already have moved by the time we handle it.
                mx,my = event.pos
                for k,(x,y,wbox,hbox) in boxes.items():
                    if x <= mx <= x+wbox and y <= my <= y+hbox:
                        input_active = k
                        input_text = str(params.get(k, ''))
                        break

        screen.fill(bg_color)
        title = font.render('Locke N Key — Choose Algorithm and Parameters', True, title_color)
        screen.blit(title, (40,30))

        # algorithm selector as big button
        draw_button(screen, pygame.Rect(50,80,520,80), f'<<  {algo_choices[algo_idx]}  >>', font)

        info = small.render('LEFT/RIGHT to change algorithm. Click parameter boxes to edit. ENTER to start training. ESC to quit.', True, (10,10,10))
        screen.blit(info, (50,180))

        # parameter boxes: outline, caption above, current value inside
        for k,(x,y,wbox,hbox) in boxes.items():
            pygame.draw.rect(screen, box_color, (x,y,wbox,hbox))
            pygame.draw.rect(screen, black, (x,y,wbox,hbox), 1)
            screen.blit(small.render(labels[k], True, black), (x, y-20))
            screen.blit(font.render(str(params[k]), True, black), (x+10, y+6))

        # input editing indicator
        if input_active is not None:
            pygame.draw.rect(screen, (200,220,255), (40,520,900,72), 3)
            screen.blit(small.render(f'Editing: {input_active} -> {input_text}', True, black), (50,540))

        start_hint = font.render('Press ENTER to START SIMULATION (training)', True, (180,30,30))
        screen.blit(start_hint, (50,420))

        pygame.display.update()
        clock.tick(30)

        if start_sim:
            pygame.quit()
            chosen_algo = algo_choices[algo_idx]
            return chosen_algo, params


## 4 TRAINING LOOP

In [75]:
# ----------------- CELL 4: Training loop + visualization with HUD -----------------
# ✅ CELL 4: Training loop + visualization (final fixed version)

def _unpack_reset(reset_result):
    """Return the observation from ``env.reset()`` output.

    Gymnasium-style environments return ``(obs, info)`` with ``info`` a dict;
    older ones return the observation alone. Both are accepted here.
    NOTE(review): assumes an observation is never itself a 2-tuple whose second
    element is a dict — confirm against LockKeyEnv.
    """
    if (isinstance(reset_result, tuple) and len(reset_result) == 2
            and isinstance(reset_result[1], dict)):
        return reset_result[0]
    return reset_result


def _make_agent(algo_name, params, n_states, n_actions, gamma):
    """Build the agent for ``algo_name``; any unknown name yields Actor-Critic."""
    if algo_name == 'Q-Learning':
        return QLearningAgent(n_states, n_actions,
                              alpha=float(params.get('alpha', 0.1)),
                              gamma=gamma,
                              epsilon=float(params.get('epsilon', 0.2)))
    if algo_name == 'Monte Carlo':
        return MonteCarloAgent(n_states, n_actions,
                               gamma=gamma,
                               epsilon=float(params.get('epsilon', 0.1)))
    # Actor-Critic: the constructor names the critic step size `alpha` and the
    # actor step size `beta`; the menu calls them critic_lr / actor_lr.
    return ActorCriticAgent(n_states, n_actions,
                            alpha=float(params.get('critic_lr', 5e-3)),
                            beta=float(params.get('actor_lr', 1e-3)),
                            gamma=gamma)


def _draw_training_hud(env, font, lines, reward_fraction, panel_width=220):
    """Paint the right-hand info panel (text lines plus a reward bar) on env.window."""
    screen = getattr(env, 'window', None)
    if not screen:
        return
    grid_px = env.size * env.cell_size
    total_width = grid_px + panel_width

    # Widen the window once so the panel fits beside the grid.
    if screen.get_width() < total_width:
        env.window = pygame.display.set_mode((total_width, grid_px))
        screen = env.window  # draw on the surface that is actually displayed

    panel_rect = pygame.Rect(grid_px, 0, panel_width, grid_px)
    pygame.draw.rect(screen, (20, 20, 30), panel_rect)      # dark background
    pygame.draw.rect(screen, (80, 80, 120), panel_rect, 2)  # subtle border

    y = 30
    for line in lines:
        screen.blit(font.render(line, True, (255, 255, 255)), (grid_px + 20, y))
        y += 30

    bar_width = int(reward_fraction * (panel_width - 40))
    pygame.draw.rect(screen, (0, 200, 0), (grid_px + 20, y + 10, bar_width, 20))


def train_and_visualize(algo_name, params, save_path="lockkey_policy.pkl"):
    """Train the chosen agent on LockKeyEnv with live rendering, then save and plot.

    Args:
        algo_name: 'Q-Learning', 'Monte Carlo', or anything else for Actor-Critic.
        params: dict of hyper-parameters. Keys read: 'phase', 'episodes',
            'gamma', 'alpha', 'epsilon', 'actor_lr', 'critic_lr', plus the
            optional 'max_steps' (default 400), 'epsilon_decay' (default 0.995)
            and 'epsilon_min' (default 0.01).
        save_path: file the learned tables are pickled to.

    Returns:
        ``(episode_rewards, successes)`` — per-episode total reward and a 0/1
        flag that is 1 when the episode terminated with positive total reward.

    Raises:
        SystemExit: when the user closes the window or presses ESC.

    Side effects: opens a pygame window, writes ``save_path`` and
    'training_results.png', and shows a matplotlib figure.
    """
    pygame_safe_init()

    # --- Environment setup ---
    env = LockKeyEnv(render_mode='human', size=6, phase=int(params.get('phase', 5)))
    # Only Discrete spaces expose `.n`; the agents merely store this value.
    n_states = getattr(env.observation_space, 'n', None)
    n_actions = env.action_space.n
    episodes = int(params.get('episodes', 800))
    gamma = float(params.get('gamma', 0.9))
    max_steps = int(params.get('max_steps', 400))
    eps_decay = float(params.get('epsilon_decay', 0.995))
    eps_min = float(params.get('epsilon_min', 0.01))

    # --- Agent creation ---
    agent = _make_agent(algo_name, params, n_states, n_actions, gamma)
    is_monte_carlo = isinstance(agent, MonteCarloAgent)

    episode_rewards, successes = [], []

    pygame_safe_init()  # cheap and idempotent; fonts need the font module up
    font = pygame.font.SysFont('arial', 18)
    start_time = time.time()
    hud_enabled = True  # turned off after the first HUD failure

    try:
        if hasattr(env, 'start_pygame'):
            try:
                env.start_pygame()
            except Exception as exc:
                # Best effort: rendering may still work without this call.
                print(f"⚠️ Warning: env.start_pygame() failed: {exc}")

        # --- Main training loop ---
        for ep in range(1, episodes + 1):
            obs = _unpack_reset(env.reset())
            env.current_episode = ep
            done = False
            terminated = False
            total_r = 0
            steps = 0
            episode_hist = []

            while not done and steps < max_steps:
                # Allow window events
                for event in pygame.event.get():
                    if event.type == pygame.QUIT:
                        env.close(); pygame.quit(); raise SystemExit()
                    elif event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE:
                        env.close(); pygame.quit(); raise SystemExit()

                # Agent selects an action (every agent exposes select_action)
                s = obs
                a = agent.select_action(s)
                obs2, r, terminated, truncated, info = env.step(a)
                terminated = bool(terminated)
                # The episode ends on either flag, but only a true terminal
                # state should stop value bootstrapping in the update below.
                done = terminated or bool(truncated)
                total_r += r
                steps += 1

                if is_monte_carlo:
                    episode_hist.append((s, a, r))
                else:
                    agent.update(s, a, r, obs2, terminated)

                obs = obs2

                # Right-side overlay panel (live info); purely cosmetic.
                if hud_enabled:
                    try:
                        hud_lines = [
                            f"Episode: {ep}/{episodes}",
                            f"Steps: {steps}",
                            f"Reward: {total_r:.2f}",
                            f"Epsilon: {getattr(agent, 'epsilon', 0):.2f}",
                            f"Elapsed: {time.time() - start_time:.1f}s"
                        ]
                        reward_fraction = min(1.0, max(0.0, total_r / 20))
                        _draw_training_hud(env, font, hud_lines, reward_fraction)
                    except Exception as exc:
                        # Never let the overlay abort training; report once.
                        hud_enabled = False
                        print(f"⚠️ Warning: HUD overlay disabled: {exc}")

                # Render environment; fall back if render() takes no keywords.
                try:
                    env.render(font=font, episode=ep, ep_reward=total_r)
                except TypeError:
                    env.render()

                # Terminal live updates every few steps
                if steps % 40 == 0:
                    print(f"Ep {ep} | Step {steps:03d} | Reward={total_r:.2f}", end='\r')

            # End of episode updates
            if is_monte_carlo:
                agent.update(episode_hist)

            if hasattr(agent, 'epsilon'):
                agent.epsilon = max(eps_min, agent.epsilon * eps_decay)

            episode_rewards.append(total_r)
            successes.append(1 if terminated and total_r > 0 else 0)
            print(f"\n✅ Episode {ep}/{episodes} finished | Reward={total_r:.2f} | Steps={steps}")

    finally:
        try:
            env.close()
        except Exception:
            pass  # already closed (e.g. by the quit handlers above)

    # --- Save policy safely ---
    try:
        def safe_convert(obj):
            """Recursively convert defaultdicts with lambdas to plain dicts."""
            if isinstance(obj, defaultdict):
                obj = {k: safe_convert(v) for k, v in obj.items()}
            return obj

        if algo_name in ('Q-Learning', 'Monte Carlo'):
            data_to_save = safe_convert(agent.Q)
        else:  # Actor-Critic
            data_to_save = {
                'V': safe_convert(agent.V),
                'pi': safe_convert(agent.pi)
            }

        with open(save_path, 'wb') as f:
            pickle.dump(data_to_save, f)

        print(f"✅ Saved {algo_name} policy successfully to {save_path}")

    except Exception as e:
        print(f"⚠️ Warning: Could not save policy due to {e}")

    # --- Plot training results ---
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(episode_rewards)
    plt.title('Episode Rewards')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.grid(True)

    plt.subplot(1, 2, 2)
    window = min(50, max(1, len(successes)))
    if len(successes) >= window:
        mov = moving_avg(successes, window)
        # Align each average with the last episode of its window.
        x = np.arange(window - 1, window - 1 + len(mov))
        plt.plot(x, mov)
    else:
        plt.plot(successes)
    plt.title(f'Success Rate (moving avg window={window})')
    plt.xlabel('Episode')
    plt.ylabel('Success Rate')
    plt.ylim(-0.05, 1.05)
    plt.tight_layout()
    plt.savefig('training_results.png')
    plt.show()

    return episode_rewards, successes


## 5 RUN WHOLE FLOW

In [76]:
# CELL 5: end-to-end run (keep this as the last cell)
# Opens the pygame menu, trains with the selected algorithm/parameters, then plots.

# Colour palette placeholder. Nothing below consumes it: drawing is delegated to
# env.render, so pass this mapping along only if your env accepts one.
visuals = dict(
    agent_color=(50,100,255),
    key_color=(255,215,0),
    lock_color=(0,180,0),
    wall_color=(80,80,80),
    enemy_color=(180,0,180),
    bg_color=(230,230,230),
)

# Step 1: let the user pick the algorithm and hyper-parameters.
algo, params = menu_and_params()
print("Chosen algorithm:", algo)
print("Parameters:", params)

# Step 2: train with live rendering (opens the env.render pygame window).
rewards, successes = train_and_visualize(algo, params, save_path='lockkey_policy.pkl')
print("Training complete. Saved policy and plotted results.")


Chosen algorithm: Q-Learning
Parameters: {'episodes': 800, 'gamma': 0.9, 'epsilon': 0.2, 'alpha': 0.1, 'actor_lr': 0.001, 'critic_lr': 0.005, 'phase': 5}

✅ Episode 1/800 finished | Reward=-51.00 | Steps=23
Ep 2 | Step 120 | Reward=-263.00
✅ Episode 2/800 finished | Reward=-420.00 | Steps=159

✅ Episode 3/800 finished | Reward=-12.00 | Steps=12
Ep 4 | Step 080 | Reward=-221.00
✅ Episode 4/800 finished | Reward=-221.00 | Steps=80
Ep 5 | Step 040 | Reward=-94.00
✅ Episode 5/800 finished | Reward=-161.00 | Steps=61


SystemExit: 