In [1]:
using PyPlot
ioff()

In [2]:
# All parameters below are declared `const`: they are read inside the inner
# loops of the learning functions, and non-constant globals are untyped, which
# prevents the compiler from specializing that code.

# world height (number of rows)
const WORLD_HEIGHT = 4

# world width (number of columns)
const WORLD_WIDTH = 12

# probability for exploration
const EPSILON = 0.5

# step size
const ALPHA = 0.5

# gamma for Q-Learning and Expected Sarsa
const GAMMA = 1

# all possible actions; the integer codes double as indices into the third
# dimension of the Q-value arrays
const ACTION_UP = 1
const ACTION_DOWN = 2
const ACTION_LEFT = 3
const ACTION_RIGHT = 4
const ACTIONS = [ACTION_UP, ACTION_DOWN, ACTION_LEFT, ACTION_RIGHT]

# start and goal states, as [row, column] pairs.
# They stay arrays (not tuples) because states are compared with `==` against
# `[i, j]` arrays, and an array never compares equal to a tuple.
const START = [4, 1]
const GOAL = [4, 12]

2-element Array{Int64,1}:
  4
 12

In [3]:
"""
    step(state, action)

Apply `action` to `state` (a `[row, column]` pair) and return the tuple
`(next_state, reward)`.

Moves are clamped to the grid: rows stay within `1:WORLD_HEIGHT` and columns
within `1:WORLD_WIDTH`. Every move yields a reward of `-1`, except the two
cases below, which yield `-100` and send the agent back to `START`:

- moving down from row 3 in columns 2 through 11;
- moving right from `START`.

# Throws
- `ArgumentError` if `action` is not one of the four `ACTION_*` codes.
"""
function step(state, action)
    i, j = state
    if action == ACTION_UP
        next_state = [max(i - 1, 1), j]
    elseif action == ACTION_LEFT
        next_state = [i, max(j - 1, 1)]
    elseif action == ACTION_RIGHT
        next_state = [i, min(j + 1, WORLD_WIDTH)]
    elseif action == ACTION_DOWN
        next_state = [min(i + 1, WORLD_HEIGHT), j]
    else
        # Callers destructure the result as `(next_state, reward)`, so a
        # sentinel return value would only fail later with an unrelated error.
        throw(ArgumentError("unknown action: $action"))
    end

    reward = -1
    stepped_off_cliff = action == ACTION_DOWN && i == 3 && 2 <= j <= 11
    walked_into_cliff = action == ACTION_RIGHT && state == START
    if stepped_off_cliff || walked_into_cliff
        reward = -100
        next_state = START
    end
    return next_state, reward
end

step

step (generic function with 1 method)

In [4]:
"""
    choose_action(state, q_value)

Pick an action for `state` with an epsilon-greedy rule over `q_value`.

With probability `EPSILON` a uniformly random element of `ACTIONS` is
returned. Otherwise the function returns the index of a maximal entry of
`q_value[state[1], state[2], :]`, breaking ties uniformly at random.
"""
function choose_action(state, q_value)
    if rand() < EPSILON
        return rand(ACTIONS)
    end
    values_ = q_value[state[1], state[2], :]
    # Compute the maximum once rather than on every comparison.
    best_value = maximum(values_)
    best_actions = findall(values_ .== best_value)
    return rand(best_actions)
end

choose_action (generic function with 1 method)

In [5]:
"""
    sarsa(q_value, expected = false, step_size = ALPHA)

Run one episode from `START` until `GOAL` is reached, updating `q_value` in
place, and return the sum of the rewards collected during the episode.

# Arguments
- `q_value`: array indexed as `q_value[row, column, action]`; mutated.
- `expected`: when `false`, the update target is the Q-value of the next
  action actually chosen (Sarsa). When `true`, the target is the expectation
  of the next state's Q-values under the epsilon-greedy policy
  (Expected Sarsa).
- `step_size`: step size applied to the update.
"""
function sarsa(q_value, expected = false, step_size = ALPHA)
    state = START
    action = choose_action(state, q_value)
    rewards = 0.0
    while state != GOAL
        next_state, reward = step(state, action)
        next_action = choose_action(next_state, q_value)
        rewards += reward
        if !expected
            target = q_value[next_state[1], next_state[2], next_action]
        else
            # Expected value of the next state under the epsilon-greedy policy:
            # every action gets EPSILON / |ACTIONS|, and the greedy actions
            # share the remaining (1 - EPSILON) equally.
            q_next = q_value[next_state[1], next_state[2], :]
            best_actions = findall(q_next .== maximum(q_next))
            target = 0.0
            for action_ in ACTIONS
                prob = EPSILON / length(ACTIONS)
                if action_ in best_actions
                    prob += (1.0 - EPSILON) / length(best_actions)
                end
                target += prob * q_next[action_]
            end
        end
        target = target * GAMMA
        q_value[state[1], state[2], action] +=
            step_size * (reward + target - q_value[state[1], state[2], action])
        state = next_state
        action = next_action
    end
    return rewards
end

sarsa (generic function with 3 methods)

In [6]:
"""
    q_learning(q_value, step_size = ALPHA)

Run one Q-Learning episode from `START` until `GOAL` is reached.

`q_value` (indexed as `q_value[row, column, action]`) is updated in place
after every move. Returns the sum of the rewards collected in the episode.
"""
function q_learning(q_value, step_size = ALPHA)
    total_reward = 0.0
    current = START
    while current != GOAL
        chosen = choose_action(current, q_value)
        successor, r = step(current, chosen)
        total_reward += r

        # Q-Learning update: bootstrap from the best action value in the
        # successor state, regardless of which action is taken next.
        row, col = current
        best_next = maximum(q_value[successor[1], successor[2], :])
        td_target = r + GAMMA * best_next
        q_value[row, col, chosen] += step_size * (td_target - q_value[row, col, chosen])

        current = successor
    end
    return total_reward
end

q_learning (generic function with 2 methods)

In [7]:
# Function to print the policy (illegally stolen from Ricardo).
#
# Prints one line per grid row. Each cell shows the greedy action for that
# state according to `q_value` ('U', 'D', 'L' or 'R'), or 'G' for the goal.
function print_optimal_policy(q_value)
    glyphs = Dict(
        ACTION_UP => 'U',
        ACTION_DOWN => 'D',
        ACTION_LEFT => 'L',
        ACTION_RIGHT => 'R',
    )
    grid = []
    for r in 1:WORLD_HEIGHT
        # Rows are deliberately untyped so the printed form is unchanged.
        cells = []
        for c in 1:WORLD_WIDTH
            if [r, c] == GOAL
                push!(cells, 'G')
            else
                best = argmax(q_value[r, c, :])
                # An index that is not one of the four actions prints nothing.
                haskey(glyphs, best) && push!(cells, glyphs[best])
            end
        end
        push!(grid, cells)
    end
    foreach(println, grid)
end

print_optimal_policy (generic function with 1 method)

In [None]:
"""
    run_cliff_walking(episodes, runs)

Train Q-Learning and Sarsa for `runs` independent runs of `episodes` episodes
each, starting every run from all-zero Q-values.

Returns `(rewards_q_learning, rewards_sarsa, q_q_learning, q_sarsa)`: the
per-episode reward sums averaged over the runs, followed by the Q-value
arrays left by the last run.

The loops live in a function so that the Q-value arrays are ordinary locals.
Assigning them inside a top-level `for` loop only updates the globals under
the interactive (notebook/REPL) scoping rules; run as a script, the loop
would assign new locals instead and the globals would keep their initial
values.
"""
function run_cliff_walking(episodes, runs)
    rewards_q_learning = zeros(episodes)
    rewards_sarsa = zeros(episodes)
    q_q_learning = zeros(WORLD_HEIGHT, WORLD_WIDTH, length(ACTIONS))
    q_sarsa = zeros(WORLD_HEIGHT, WORLD_WIDTH, length(ACTIONS))
    for _ in 1:runs
        # Fresh Q-values for every run.
        q_q_learning = zeros(WORLD_HEIGHT, WORLD_WIDTH, length(ACTIONS))
        q_sarsa = zeros(WORLD_HEIGHT, WORLD_WIDTH, length(ACTIONS))
        for i in 1:episodes
            rewards_q_learning[i] += q_learning(q_q_learning)
            rewards_sarsa[i] += sarsa(q_sarsa)
        end
    end

    # Average the accumulated reward sums over the runs.
    rewards_q_learning ./= runs
    rewards_sarsa ./= runs
    return rewards_q_learning, rewards_sarsa, q_q_learning, q_sarsa
end

episodes = 500
runs = 500
rewards_q_learning, rewards_sarsa, q_q_learning, q_sarsa =
    run_cliff_walking(episodes, runs)

println("Q-Learning Optimal Policy:")
print_optimal_policy(q_q_learning)
println("SARSA Optimal Policy:")
print_optimal_policy(q_sarsa)


In [None]:
# draw reward curves
# Only PyPlot is loaded by this file (there is no `Plots` module in scope), so
# the curves are drawn with the PyPlot API. Interactive mode was switched off
# with `ioff()`, hence the figure is displayed explicitly at the end.
fig = figure()
episode_axis = collect(1:length(rewards_sarsa))
plot(episode_axis, rewards_sarsa, label="Sarsa")
plot(episode_axis, rewards_q_learning, label="Q-Learning")
xlabel("Episodes")
ylabel("Sum of rewards during episode")
legend()
display(fig)