In [13]:
import numpy as np

In [14]:
class BananaGridWorld:
    def __init__(self):
        self.grid_size = 3
        self.start_position = (2, 0)  # Bottom-left corner
        self.goal_position = (0, 2)  # Top-right corner (Banana)
        self.agent_position = self.start_position

    def reset(self):
        """ Reset the environment to the initial state. """
        self.agent_position = self.start_position
        return self.agent_position

    def step(self, action):
        """Take a step in the environment.

        Args:
            action (str): ['up', 'down', 'left', 'right']

        Returns:
            tuple: (new_position, reward, done)
        """

        # Decompose into x and y
        x, y = self.agent_position

        # This can be confusing (x left/right vs y up/down, but I did this only because of
        # drawing the state in the render() function below. You can change this to the more
        # intuitive version and then have a better render() function
        if action == 'up' and x > 0:
            x -= 1
        elif action == 'down' and x < self.grid_size - 1:
            x += 1
        elif action == 'left' and y > 0:
            y -= 1
        elif action == 'right' and y < self.grid_size - 1:
            y += 1

        # Update the agent's position
        self.agent_position = (x, y)

        # Return 
        # ... TODO

    def render(self):
        """Print the grid with the agent's current position."""
        # Make a grid_size x grid_size array filled with '-'
        grid = [['-' for _ in range(self.grid_size)] for _ in range(self.grid_size)]

        # Replace the - at the monkey's position with an M
        x, y = self.agent_position
        grid[x][y] = 'M'  # M for Monkeuy

        # And same for the Banana
        x, y = self.goal_position
        grid[x][y] = 'B'

        # Print the grid
        for row in grid:
            print(' '.join(row))
        print() # With an extra newline


## Testing the environment

In [15]:
testenv = BananaGridWorld()
testenv.reset()
testenv.render()

_, r, done = testenv.step('right')
testenv.render()
print(f"We got reward {r} and the episode is {'' if done else 'not'} done")

_, r, done = testenv.step('right')
testenv.render()
_, r, done = testenv.step('up')
testenv.render()
_, r, done = testenv.step('up')
testenv.render()

testenv.render()
print(f"We got reward {r} and the episode is {'' if done else 'not'} done")

- - B
- - -
M - -



TypeError: cannot unpack non-iterable NoneType object

In [17]:
# Actions the agent can take
actions = ['up', 'down', 'left', 'right']

# Initialize the environment
env = BananaGridWorld()

# Q-learning parameters
q_table = {}  # Q-table, which we will fill with *tuples* of the form ( state, action )
alpha = 0.1  # Learning rate
epsilon = 0.1

def get_q_value(state, action):
    # Implement returning the value at key '(state,action)', and include a default of 0 if (state,action) is not in the dictionary
    # using dict.get( key, default_value )
    return 0

def update_q_value(state, action, reward):
    # Implement the update rule to update the value at q_table[(state,action)] 
    current_q = get_q_value(state, action)
    #q_table[(state, action)] = ...

def choose_action(state):
    # Implement an epsilon-greedy policy (so w/ prob epsilon choose randomly from actions, otherwise pick according to Q
    if np.random.rand() < epsilon:
        # Explore, pick randomly
        return 0
    else:
        # Exploit, pick action with highest q_value
        return 0

# Run Q-learning agent
num_episodes = 100
for episode in range(num_episodes):

    # Restart at each episode
    observation = env.reset()
    done = False
    total_reward = 0

    print(f"Episode {episode + 1}")
    env.render()

    while not done:
        action = choose_action(observation)
        next_observation, reward, done = env.step(action)
        total_reward += reward

        # Update Q-value
        update_q_value(observation, action, reward, next_observation)
        observation = next_observation

        env.render()

    print(f"Episode finished with total reward: {total_reward}\n")

Episode 1
- - B
- - -
M - -



TypeError: cannot unpack non-iterable NoneType object

## Todo's:

* 1) Make plots of the total reward over time (episodes)
  2) See how changing the hyperparameters influences this (e.g. gamma = 0 should do what?)
  3) Compare SARSA and Q-learning in the case with cliffs