# Reinforcement Learning practice
## Implementing the SARSA on-policy algorithm for learning the optimal policy
Let's consider a simple 3x3 grid world scenario

In [51]:
import random
import numpy as np
import matplotlib.pyplot as plt
import time

In [52]:
player = -1
goal = 1
initial_state = 0
final_state = 8

states = [0 for i in range(0,9)]
states[initial_state] = player
states[final_state] = goal
visited_states = [initial_state]
learning_history = [states.copy()]
print(f"states: {states}")
print(f"learning history: {learning_history}")

states: [-1, 0, 0, 0, 0, 0, 0, 0, 1]
learning history: [[-1, 0, 0, 0, 0, 0, 0, 0, 1]]


In [53]:
actions = ["up", "down", "left", "right"]
goal_reward = 1
wall_reward = -1
repeated_state_reward = -1
step_reward = -0.1

gamma = 0.9
alpha = 0.5

In [54]:
def reset_position(states, visited_states, learning_history):
    states = [0 for i in range(0,9)]
    states[initial_state] = player
    states[final_state] = goal
    visited_states = [initial_state]
    learning_history = [states.copy()]
    return states, visited_states, learning_history

In [55]:
transition_model = {
    0: {
        "up": 0,
        "down": 3,
        "right": 1,
        "left": 0
    },
    1: {
        "up": 1,
        "down": 4,
        "right": 2,
        "left": 0
    },
    2: {
        "up": 2,
        "down": 5,
        "right": 2,
        "left": 1
    },
    3: {
        "up": 0,
        "down": 6,
        "right": 4,
        "left": 3
    },
    4: {
        "up": 1,
        "down": 7,
        "right": 5,
        "left": 3
    },
    5: {
        "up": 2,
        "down": 8,
        "right": 5,
        "left": 4
    },
    6: {
        "up": 3,
        "down": 6,
        "right": 7,
        "left": 6
    },
    7: {
        "up": 4,
        "down": 7,
        "right": 8,
        "left": 6
    },
    8: {
        "up": 5,
        "down": 8,
        "right": 8,
        "left": 7
    }
}

In [56]:
q_table = {
    0: {
        "up": 0,
        "down": 0,
        "right": 0,
        "left": 0
    },
    1: {
        "up": 0,
        "down": 0,
        "right": 0,
        "left": 0
    },
    2: {
        "up": 0,
        "down": 0,
        "right": 0,
        "left": 0
    },
    3: {
        "up": 0,
        "down": 0,
        "right": 0,
        "left": 0
    },
    4: {
        "up": 0,
        "down": 0,
        "right": 0,
        "left": 0
    },
    5: {
        "up": 0,
        "down": 0,
        "right": 0,
        "left": 0
    },
    6: {
        "up": 0,
        "down": 0,
        "right": 0,
        "left": 0
    },
    7: {
        "up": 0,
        "down": 0,
        "right": 0,
        "left": 0
    },
    8: {
        "up": 0,
        "down": 0,
        "right": 0,
        "left": 0
    }
}

In [57]:
def choose_action(current_state):
    # let's choose an action with ε greedy
    print(f"q scores: {q_table.get(current_state)}")
    
    epsilon = 0.05
    sample = random.uniform(0,1)
 
    next_action = "up" # setting an initial arbitrary value
    best_q_value = q_table.get(current_state).get(next_action) # setting an initial value 

    for action in actions:
        if q_table.get(current_state).get(action) > best_q_value:
            next_action = action
            best_q_value = q_table.get(current_state).get(next_action)
    
    # for now my next_action is the best according to greedy selection.
    
    if sample > epsilon:
        # greedy case
        next_action = next_action
        print(f"greedy action: {next_action}")
        
    else:
        actions_without_the_best = [action for action in actions if action != next_action]
        next_action = random.choice(actions_without_the_best)
        best_q_value = q_table.get(current_state).get(next_action)
        print(f"random action: {next_action}")
    

    return next_action, best_q_value
    

def get_reward(current_state, new_state):
    if current_state == final_state:
        reward = goal_reward
        print("I reached the goal!")

    elif current_state == new_state:
        reward = wall_reward
        print("I hit the wall!")

    elif new_state in visited_states:
        print("I have already visited this state!")
        reward = repeated_state_reward
    
    elif current_state != new_state:
        print("I made a step!")
        reward = step_reward

    else:
        print("what is going on???")
        exit(0)
    
    return reward
    

def move_player(current_state, new_state):
    # print(f"current state: {current_state}")
    # print(f"new state: {new_state}")

    states[current_state] = 0
    states[new_state] = -1
    learning_history.append(states.copy())
    if new_state not in visited_states:
        visited_states.append(new_state)
    

In [58]:
def print_maze(states):
    print(states[:3])
    print(states[3:6])
    print(states[6:])

In [103]:
states, visited_states, learning_history = reset_position(states, visited_states, learning_history)
print_maze(states)

training_time = range(0,1000)
current_state = initial_state
current_action, old_q_estimate = choose_action(current_state) # return a string containing the action and the current q_value?

print(f"I'm in position {current_state}. I will select {current_action}")

for t in training_time: # for each episode
    # choose the action according to the q_table
    
    new_state = transition_model.get(current_state).get(current_action)
    print(f"I'm moving to the position {new_state}")
    reward = get_reward(current_state, new_state)
    print(f"reward: {reward}")
    
    move_player(current_state, new_state)
    print_maze(states)

    new_action, next_q_estimate = choose_action(new_state)
    print(f"From {new_state}, i select {new_action} as the next action")

    updated_q_estimate = old_q_estimate + alpha*(reward + gamma*next_q_estimate - old_q_estimate)
    q_table[current_state][current_action] = updated_q_estimate

    current_state = new_state
    current_action, old_q_estimate = new_action, next_q_estimate

    if current_state == 8:
        break

print(f"Training time (iterations): {t}")

[-1, 0, 0]
[0, 0, 0]
[0, 0, 1]
q scores: {'up': -0.975, 'down': -0.343899998589838, 'right': -0.7047098046875, 'left': -1.2290905687093736}
greedy action: down
I'm in position 0. I will select down
I'm moving to the position 3
I made a step!
reward: -0.1
[0, 0, 0]
[-1, 0, 0]
[0, 0, 1]
q scores: {'up': -0.7725, 'down': -0.525, 'right': -0.27099999821966053, 'left': -0.975}
greedy action: right
From 3, i select right as the next action
I'm moving to the position 4
I made a step!
reward: -0.1
[0, 0, 0]
[0, -1, 0]
[0, 0, 1]
q scores: {'up': -0.5, 'down': -0.1899999998137355, 'right': -0.344912109375, 'left': -0.5225}
greedy action: down
From 4, i select down as the next action
I'm moving to the position 7
I made a step!
reward: -0.1
[0, 0, 0]
[0, 0, 0]
[0, -1, 1]
q scores: {'up': -0.5, 'down': -0.5, 'right': -0.09999999999417925, 'left': -0.725}
greedy action: right
From 7, i select right as the next action
I'm moving to the position 8
I made a step!
reward: -0.1
[0, 0, 0]
[0, 0, 0]
[0, 0,