In [1]:
import numpy as np

In [12]:
class BananaGridWorld:
    def __init__(self):
        self.grid_size = 3
        self.start_position = (2, 0)  # Bottom-left corner
        self.goal_position = (0, 2)  # Top-right corner (Banana)
        self.agent_position = self.start_position

    def reset(self):
        """ Reset the environment to the initial state. """
        self.agent_position = self.start_position
        return self.agent_position

    def step(self, action):
        """Take a step in the environment.

        Args:
            action (str): ['up', 'down', 'left', 'right']

        Returns:
            tuple: (new_position, reward, done)
        """

        # Decompose into x and y
        x, y = self.agent_position

        # This can be confusing (x left/right vs y up/down, but I did this only because of
        # drawing the state in the render() function below. You can change this to the more
        # intuitive version and then have a better render() function
        if action == 'up' and x > 0:
            x -= 1
        elif action == 'down' and x < self.grid_size - 1:
            x += 1
        elif action == 'left' and y > 0:
            y -= 1
        elif action == 'right' and y < self.grid_size - 1:
            y += 1

        # Update the agent's position
        self.agent_position = (x, y)

        # Return 
        if self.agent_position == self.goal_position:
            return self.agent_position, 10, True
        else:
            return self.agent_position, -1, False

    def render(self):
        """Print the grid with the agent's current position."""
        # Make a grid_size x grid_size array filled with '-'
        grid = [['-' for _ in range(self.grid_size)] for _ in range(self.grid_size)]

        # Replace the - at the monkey's position with an M
        x, y = self.agent_position
        grid[x][y] = 'M'  # M for Monkeuy

        # And same for the Banana
        x, y = self.goal_position
        grid[x][y] = 'B'

        # Print the grid
        for row in grid:
            print(' '.join(row))
        print() # With an extra newline


## Testing the environment

In [13]:
testenv = BananaGridWorld()
testenv.reset()
testenv.render()

_, r, done = testenv.step('right')
testenv.render()
print(f"We got reward {r} and the episode is {'' if done else 'not'} done")

_, r, done = testenv.step('right')
testenv.render()
_, r, done = testenv.step('up')
testenv.render()
_, r, done = testenv.step('up')
testenv.render()

testenv.render()
print(f"We got reward {r} and the episode is {'' if done else 'not'} done")

- - B
- - -
M - -

- - B
- - -
- M -

We got reward -1 and the episode is not done
- - B
- - -
- - M

- - B
- - M
- - -

- - B
- - -
- - -

- - B
- - -
- - -

We got reward 10 and the episode is  done


In [15]:
# Actions the agent can take
actions = ['up', 'down', 'left', 'right']

# Initialize the environment
env = BananaGridWorld()

# Q-learning parameters
q_table = {}  # Q-table, which we will fill with *tuples* of the form ( state, action )
alpha = 0.1  # Learning rate
epsilon = 0.1

def get_q_value(state, action):
    # Implement returning the value at key '(state,action)', and include a default of 0 if (state,action) is not in the dictionary
    # using dict.get( key, default_value )
    return q_table.get((state, action), 0.0)

def update_q_value(state, action, reward, next_state, next_action):
    # Implement the update rule to update the value at q_table[(state,action)] 
    current_q = get_q_value(state, action)
    #next_q = get_q_value(next_state, next_action)
    next_q = max([get_q_value(next_state, a) for a in actions])
    q_table[(state, action)] = (1-alpha)*current_q + alpha * (reward + next_q)

def choose_action(state):
    if np.random.rand() < epsilon:
        return np.random.choice(actions)  # Explore
    else:
        q_values = {action: get_q_value(state, action) for action in actions}
        return max(q_values, key=q_values.get)  # Exploit

# Run Q-learning agent
num_episodes = 1000
for episode in range(num_episodes):

    # Restart at each episode
    observation = env.reset()
    done = False
    total_reward = 0

    print(f"Episode {episode + 1}")
    env.render()

    while not done:
        action = choose_action(observation)
        
        next_observation, reward, done = env.step(action)
        total_reward += reward

        next_action = choose_action(next_observation)

        # Update Q-value
        update_q_value(observation, action, reward, next_observation, next_action)
        
        observation = next_observation
        #env.render()

    print(f"Episode finished with total reward: {total_reward}\n")

Episode 1
- - B
- - -
M - -

Episode finished with total reward: -15

Episode 2
- - B
- - -
M - -

Episode finished with total reward: 2

Episode 3
- - B
- - -
M - -

Episode finished with total reward: 7

Episode 4
- - B
- - -
M - -

Episode finished with total reward: 4

Episode 5
- - B
- - -
M - -

Episode finished with total reward: -1

Episode 6
- - B
- - -
M - -

Episode finished with total reward: 4

Episode 7
- - B
- - -
M - -

Episode finished with total reward: 4

Episode 8
- - B
- - -
M - -

Episode finished with total reward: 3

Episode 9
- - B
- - -
M - -

Episode finished with total reward: 6

Episode 10
- - B
- - -
M - -

Episode finished with total reward: 0

Episode 11
- - B
- - -
M - -

Episode finished with total reward: 7

Episode 12
- - B
- - -
M - -

Episode finished with total reward: 7

Episode 13
- - B
- - -
M - -

Episode finished with total reward: 5

Episode 14
- - B
- - -
M - -

Episode finished with total reward: 7

Episode 15
- - B
- - -
M - -

Episode fi

## Todo's:

* 1) Make plots of the total reward over time (episodes)
  2) See how changing the hyperparameters influences this (e.g. gamma = 0 should do what?)
  3) Compare SARSA and Q-learning in the 'cliffwalking' game
