#yourNAME, yourID

# Imports

In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

# LuffyDodge (3)

In [7]:
class LuffyDodgeEnv(gym.Env):
    """
    Custom environment where Luffy must dodge cannonballs.

    Luffy stands on the bottom row (y = 0) of a 10x10 grid and can move left,
    right, or stay still. A single cannonball spawns on the top row (y = 9)
    in a random column and falls one row per step.

    Observation: ``[luffy_x, cannon_x, cannon_y]`` as float32, each in [0, 9].
    Actions: 0 = left, 1 = stay, 2 = right.
    Reward: +1 for every step survived, -10 when Luffy is hit (episode ends).
    """

    metadata = {"render_modes": ["human"]}

    def __init__(self, render_mode=None):
        """
        Initialize environment parameters.

        Parameters
        ----------
        render_mode : str or None
            Stored on the instance; ``render()`` always prints text.
        """
        super().__init__()

        # Actions: 0 = left, 1 = stay, 2 = right
        self.action_space = spaces.Discrete(3)

        # Observation: [Luffy_x, Cannonball_x, Cannonball_y]
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0], dtype=np.float32),
            high=np.array([9, 9, 9], dtype=np.float32),
            dtype=np.float32,
        )

        self.render_mode = render_mode

        # Populate the state attributes so render()/step() work even if the
        # caller forgets to call reset() first.
        self.reset()

    def _spawn_column(self):
        """Draw a random column (0..9) for a new cannonball.

        Uses the environment's own ``self.np_random`` generator so that
        ``reset(seed=...)`` makes episodes reproducible.
        """
        return int(self.np_random.integers(0, 10))

    def _get_obs(self):
        """Pack the current state into the float32 observation vector."""
        return np.array(
            [self.luffy_x, self.cannon_x, self.cannon_y], dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        """
        Reset the environment to its initial state.

        Parameters
        ----------
        seed : int or None
            Forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
        options : dict or None
            Accepted for API compatibility; not used.

        Returns
        -------
        obs : np.ndarray
            Initial observation ``[luffy_x, cannon_x, cannon_y]``.
        info : dict
            Always empty.
        """
        super().reset(seed=seed)

        # Luffy starts in the middle bottom
        self.luffy_x = 5.0

        # Cannonball starts at a random x and top y = 9
        self.cannon_x = self._spawn_column()
        self.cannon_y = 9.0

        return self._get_obs(), {}

    def step(self, action):
        """
        Advance the environment by one time step.

        Parameters
        ----------
        action : int
            0 = move left, 1 = stay, 2 = move right. Moves are clamped to
            the grid, so walking into a wall leaves Luffy in place.

        Returns
        -------
        obs : np.ndarray
            Observation after the step.
        reward : float
            -10.0 if the cannonball hit Luffy, otherwise +1.0.
        done : bool
            True only when Luffy was hit.
        truncated : bool
            Always False; this environment has no time limit of its own.
        info : dict
            Always empty.
        """
        # Step 1: Update Luffy's position based on action
        if action == 0:  # Move left
            self.luffy_x = max(0.0, self.luffy_x - 1.0)
        elif action == 2:  # Move right
            self.luffy_x = min(9.0, self.luffy_x + 1.0)
        # action == 1 means stay still, so no change to luffy_x

        # Step 2: Move cannonball down by one step
        self.cannon_y -= 1.0

        # Step 3: Check for collision
        # Collision happens when cannonball reaches y=0 and has same x as Luffy
        done = False
        truncated = False
        if self.cannon_y == 0.0 and int(self.luffy_x) == int(self.cannon_x):
            # Hit! Game over
            reward = -10.0
            done = True
        else:
            if self.cannon_y < 0.0:
                # Cannonball went past the bottom without hitting Luffy:
                # respawn it at the top in a new random column.
                self.cannon_x = self._spawn_column()
                self.cannon_y = 9.0
            # Luffy survived this step
            reward = +1.0

        # Step 4: Return observation and other info
        return self._get_obs(), reward, done, truncated, {}

    def render(self):
        """
        Render the environment as simple text output.

        Prints the 10x10 grid with the top row (y = 9) first, "O" for the
        cannonball and "L" for Luffy, followed by a separator line. Luffy is
        drawn last, so "L" wins if both share a cell.
        """
        grid = np.full((10, 10), " ", dtype=str)
        grid[int(self.cannon_y), int(self.cannon_x)] = "O"
        grid[0, int(self.luffy_x)] = "L"
        # Rows are indexed by y, so flip vertically to print the top first
        print("\n".join(["".join(row) for row in grid[::-1]]))
        print("-" * 10)

    def close(self):
        """Close the environment (nothing to release)."""
        pass

In [8]:
# Play one episode with uniformly random actions, printing the grid each step.
env = LuffyDodgeEnv()
obs, _ = env.reset()

done = False
truncated = False
# An episode ends on termination OR truncation; checking both keeps this loop
# finite if the environment is ever wrapped with a time limit.
while not (done or truncated):
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)
    env.render()
    print(f"Action: {action}, Reward: {reward}")

env.close()

          
 O        
          
          
          
          
          
          
          
      L   
----------
Action: 2, Reward: 1.0
          
          
 O        
          
          
          
          
          
          
       L  
----------
Action: 2, Reward: 1.0
          
          
          
 O        
          
          
          
          
          
       L  
----------
Action: 1, Reward: 1.0
          
          
          
          
 O        
          
          
          
          
      L   
----------
Action: 0, Reward: 1.0
          
          
          
          
          
 O        
          
          
          
      L   
----------
Action: 1, Reward: 1.0
          
          
          
          
          
          
 O        
          
          
     L    
----------
Action: 0, Reward: 1.0
          
          
          
          
          
          
          
 O        
          
    L     
----------
Action: 0, Rewa

# Policy Evaluation (2)

In [9]:
# Side length of the simplified square grid used for planning
GRID_SIZE = 5

# Action encoding: 0 = left, 1 = stay, 2 = right
ACTIONS = [0, 1, 2]
NUM_ACTIONS = len(ACTIONS)

# Enumerate every (Luffy_x, Cannon_x, Cannon_y) triple.
# The last coordinate (Cannon_y) varies fastest.
states = list(np.ndindex(GRID_SIZE, GRID_SIZE, GRID_SIZE))
NUM_STATES = len(states)

# Start from an arbitrary policy: one random action per state
policy = np.random.choice(ACTIONS, size=NUM_STATES)

# State-value estimates start at zero
V = np.zeros(NUM_STATES)

# Discount factor applied to future rewards
gamma = 0.9


def state_to_index(state):
    """Return the position of a (luffy_x, cannon_x, cannon_y) tuple in the state list.

    The tuple is read as a three-digit number in base GRID_SIZE, with
    luffy_x as the most significant digit and cannon_y as the least.
    """
    luffy, ball_x, ball_y = state
    return (luffy * GRID_SIZE + ball_x) * GRID_SIZE + ball_y


def transition(state, action):
    """
    Deterministic one-step model of the dodge game.

    Luffy moves first (clamped to the grid), then the cannonball drops one
    row. Once the cannonball falls below row 0 it either hits Luffy (same
    column) or is put back on the top row in the same column.

    Parameters
    ----------
    state : tuple (luffy_x, cannon_x, cannon_y)
    action : int (0=left, 1=stay, 2=right)

    Returns
    -------
    next_state : tuple or None
        State reached after the action; None when the episode ends.
    reward : float
        -10.0 on a hit, +1.0 otherwise.
    done : bool
        True only when Luffy is hit by the cannonball.
    """
    luffy, ball_x, ball_y = state

    # Luffy's move; action 1 (stay) leaves the position untouched
    if action == 0:
        luffy = max(0, luffy - 1)
    elif action == 2:
        luffy = min(GRID_SIZE - 1, luffy + 1)

    # Gravity: the cannonball drops one row
    ball_y -= 1

    # Still in the air: just one step closer to the bottom
    if ball_y >= 0:
        return (luffy, ball_x, ball_y), +1.0, False

    # Fell past the bottom row in Luffy's column: hit, episode over
    if luffy == ball_x:
        return None, -10.0, True

    # Miss: cannonball restarts from the top row
    return (luffy, ball_x, GRID_SIZE - 1), +1.0, False


def policy_evaluation(policy, V, theta=1e-4):
    """
    Iteratively evaluate a deterministic policy, updating V in place.

    Sweeps over all states, replacing each value with the one-step backup
    under the policy's action, until the largest change in a sweep falls
    below ``theta``. Updated values are used immediately within a sweep.

    Parameters
    ----------
    policy : array-like of int
        Action chosen in each state, indexed like ``states``.
    V : np.ndarray
        Current value estimates; modified in place.
    theta : float
        Convergence threshold on the largest per-sweep value change.

    Returns
    -------
    V : np.ndarray
        The same array that was passed in, now holding the evaluated values.
    """
    converged = False
    while not converged:
        largest_change = 0.0

        for idx, current in enumerate(states):
            previous = V[idx]

            # Follow the policy's action through the model
            successor, r, terminal = transition(current, policy[idx])

            # Terminal transitions carry no future value
            if terminal:
                V[idx] = r
            else:
                V[idx] = r + gamma * V[state_to_index(successor)]

            largest_change = max(largest_change, abs(previous - V[idx]))

        converged = largest_change < theta

    return V


def policy_improvement(V, policy):
    """
    Greedy policy improvement based on the current value function.

    For every state, computes the one-step backup of each action and makes
    the policy pick the best one (ties go to the earliest action in
    ``ACTIONS``).

    Parameters
    ----------
    V : np.ndarray
        Current state-value estimates, indexed like ``states``.
    policy : np.ndarray
        Current policy; modified in place.

    Returns
    -------
    policy : np.ndarray
        The same array that was passed in, now greedy with respect to V.
    policy_stable : bool
        True if no state's action changed.
    """
    policy_stable = True
    for s, state in enumerate(states):
        old_action = policy[s]
        action_values = []

        # Try all actions and record each one-step backup
        for a in ACTIONS:
            next_state, reward, done = transition(state, a)
            if done:
                action_values.append(reward)
            else:
                action_values.append(reward + gamma * V[state_to_index(next_state)])

        # argmax yields a position in action_values, not an action; map it
        # back through ACTIONS so this stays correct if the action encoding
        # ever stops being 0..N-1.
        best_action = ACTIONS[int(np.argmax(action_values))]
        policy[s] = best_action

        # Check if policy changed
        if old_action != best_action:
            policy_stable = False
    return policy, policy_stable


def policy_iteration():
    """
    Alternate policy evaluation and greedy improvement until the policy stops changing.

    Works on the module-level ``V`` and ``policy``, rebinding both, and
    prints a progress line after every round.

    Returns
    -------
    policy, V
        The final policy and its value function.
    """
    global V, policy
    iteration = 0
    stable = False
    while not stable:
        iteration += 1
        V = policy_evaluation(policy, V)
        policy, stable = policy_improvement(V, policy)
        print(f"Iteration {iteration} completed.")
    print("✅ Policy converged!")
    return policy, V


if __name__ == "__main__":
    # Solve the planning problem, then show the actions for the first states
    optimal_policy, optimal_V = policy_iteration()

    print("\nOptimal policy (sample of 10 states):")
    for i, state in enumerate(states[:10]):
        print(f"State {state} → Action {optimal_policy[i]}")


Iteration 1 completed.
Iteration 2 completed.
Iteration 3 completed.
Iteration 4 completed.
Iteration 5 completed.
Iteration 6 completed.
Iteration 7 completed.
Iteration 8 completed.
Iteration 9 completed.
Iteration 10 completed.
Iteration 11 completed.
Iteration 12 completed.
Iteration 13 completed.
Iteration 14 completed.
Iteration 15 completed.
Iteration 16 completed.
Iteration 17 completed.
Iteration 18 completed.
Iteration 19 completed.
Iteration 20 completed.
Iteration 21 completed.
Iteration 22 completed.
Iteration 23 completed.
Iteration 24 completed.
Iteration 25 completed.
Iteration 26 completed.
Iteration 27 completed.
Iteration 28 completed.
Iteration 29 completed.
Iteration 30 completed.
Iteration 31 completed.
Iteration 32 completed.
Iteration 33 completed.
Iteration 34 completed.
Iteration 35 completed.
Iteration 36 completed.
✅ Policy converged!

Optimal policy (sample of 10 states):
State (0, 0, 0) → Action 2
State (0, 0, 1) → Action 0
State (0, 0, 2) → Action 0
State