# Let's race!

## Can you learn to turn?

We will implement a Reinforcement Learning Framework according to class. We need:

1. An **environment** 
2. An **agent**
3. A method for their **interaction**

We will try to keep it as simple as possible sacrificing generality. For this tashs we will create `types` which are Julias `classes` (Julia's class system is more similar to `C` rather than `Python` classes) and `modules` (functions that work on those classes).

In [1]:
# OUR REINFORCEMENT LEARNING FRAMEWORK MODULE
type RLEnv # reinforcement learning environment, should be immutable for efficiency
    state_space::Array{Any, 1} # 
    trans_fun::Function # (state, action) -> (new_state, reward)
    action_set::Function # (state) -> (array of available actions from state_space)
end
type RLAgent
    policy::Function # (state) -> (action)
    state::Any # current
end
function interact!(agent::RLAgent, env::RLEnv)
    # la convencion (opcional) de julia es incluir '!' al final de una funcion si modifica sus argumentos 
    new_state, reward = env.trans_fun(agent.state, agent.policy(agent.state))
    agent.state = new_state
    return new_state, reward # a veces es conveniente regresar los rewards de cada iteracion
end
# multiple distpatch at work! What if we want to interact fixing an action????
function interact!(agent::RLAgent, env::RLEnv, action)
    # la convencion (opcional) de julia es incluir '!' al final de una funcion si modifica sus argumentos 
    new_state, reward = env.trans_fun(agent.state, action)
    agent.state = new_state
    return new_state, reward # a veces es conveniente regresar los rewards de cada iteracion
end

interact! (generic function with 2 methods)

Esta estructura es muy flexible y vamos a demostrar como podemos resolver problemas con ella

# Are you ready to race!!!!!!! brum brum!

In [26]:
race_track = readcsv("data/race_track.csv")
function print_race_track(track, pos)
    print("Voici our race track!\n")
    for i in 1:size(track, 1)  
        for j in 1:size(track, 2)
            x = track[i, j]
            if pos == [i, j] x = "+"; color=:red end
            if x == "*" color = :green elseif x=="" x=" ";color=:white elseif x=="F" color=:red end
                print_with_color(color, x, " ")
        end
        print("\n")
    end
    print("F: Finish line\n*: Outside the track\n+: Racer position\n")
end
print_race_track(race_track, [16, 4])    

Voici our race track!
[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[31mF [0m[1m[31mF [0m[1m[31mF [0m[1m[31mF [0m[1m[32m* [0m[1m[32m* [0m
[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[37m  [0m[1m[37m  [0m[1m[37m  [0m[1m[37m  [0m[1m[37m  [0m[1m[32m* [0m[1m[32m* [0m
[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[37m  [0m[1m[37m  [0m[1m[37m  [0m[1m[37m  [0m[1m[37m  [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m
[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[32m* [0m[1m[37m  [0m[1m[37m  [0m[1m[37m  [0m[1m[37m  [0m[1m[37m  [0m[1m[37m  [0m[1m[32m* [0m



In [3]:
state_space = [[i,j] for i in 1:size(race_track, 1) for j in 1:size(race_track, 2)] # list comprenhensions cool
println(join(state_space,",")) # to save space

[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7],[1,8],[1,9],[1,10],[1,11],[1,12],[1,13],[1,14],[1,15],[1,16],[1,17],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7],[2,8],[2,9],[2,10],[2,11],[2,12],[2,13],[2,14],[2,15],[2,16],[2,17],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7],[3,8],[3,9],[3,10],[3,11],[3,12],[3,13],[3,14],[3,15],[3,16],[3,17],[4,1],[4,2],[4,3],[4,4],[4,5],[4,6],[4,7],[4,8],[4,9],[4,10],[4,11],[4,12],[4,13],[4,14],[4,15],[4,16],[4,17],[5,1],[5,2],[5,3],[5,4],[5,5],[5,6],[5,7],[5,8],[5,9],[5,10],[5,11],[5,12],[5,13],[5,14],[5,15],[5,16],[5,17],[6,1],[6,2],[6,3],[6,4],[6,5],[6,6],[6,7],[6,8],[6,9],[6,10],[6,11],[6,12],[6,13],[6,14],[6,15],[6,16],[6,17],[7,1],[7,2],[7,3],[7,4],[7,5],[7,6],[7,7],[7,8],[7,9],[7,10],[7,11],[7,12],[7,13],[7,14],[7,15],[7,16],[7,17],[8,1],[8,2],[8,3],[8,4],[8,5],[8,6],[8,7],[8,8],[8,9],[8,10],[8,11],[8,12],[8,13],[8,14],[8,15],[8,16],[8,17],[9,1],[9,2],[9,3],[9,4],[9,5],[9,6],[9,7],[9,8],[9,9],[9,10],[9,11],[9,12],[9,13],[9,14],[9,15],[9,16],[9,17],[10,1],[10

In [4]:
action_set(state) = ["up", "left", "down", "right"] 
# available actions do NOT depend on current state.... for now...

action_set (generic function with 1 method)

In [5]:
# CAN'T GET OFF THE GRID!
function trans_fun(state, action)
    i, j = state
    if action == "up" && i > 1
        i = i - 1
    elseif action == "right" && j < size(race_track, 2)
        j = j + 1
    elseif action == "left" && j > 1
        j = j - 1
    elseif action == "down" && i < size(race_track, 1)
        i = i + 1
    end
    new_state = [i, j]
    if race_track[i, j] == "*"    
        reward = -5
    elseif race_track[i, j] == "F"
        reward = 0
    else
        reward = -1
    end
    return new_state, reward
end
trans_fun([18, 3], "down")

([18,3],-1)

In [6]:
# THAT'S ALL WE NEED FOR A TASK
race_task = RLEnv(state_space, trans_fun, action_set)

RLEnv(Any[[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7],[1,8],[1,9],[1,10]  …  [18,8],[18,9],[18,10],[18,11],[18,12],[18,13],[18,14],[18,15],[18,16],[18,17]],trans_fun,action_set)

### Who is racing?

Let's create an agent

In [7]:
# random directions (for a start...)
policy(state) = rand(race_task.action_set(state))
state = [18, 3] # we give it an initial state
agent = RLAgent(policy, state)
print_race_track(race_track, state)

Voici our race track!
* * * * * * * * * * * F F F F * *
* * * * * * * * * *           * *
* * * * * * * * *           * * *
* * * * * * *             * * * *
* * * * * *             * * * * *
* * * * *           * * * * * * *
* * * *           * * * * * * * *
* * *           * * * * * * * * *
* *           * * * * * * * * * *
* *           * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* * +       * * * * * * * * * * *
F: Finish line
*: Outside the track
+: Racer position


In [8]:
agent.state = [18, 3]
s, r = interact!(agent, race_task, "up")
println("Move to position $s obtaining reward $r")

Move to position [17,3] obtaining reward -1


In [9]:
# Let's simulate some moves!
for i in 1:10
    s, r = interact!(agent, race_task)
    println("Move to position $s obtaining reward $r")
end

Move to position [17,4] obtaining reward -1
Move to position [17,3] obtaining reward -1
Move to position [17,2] obtaining reward -5
Move to position [17,1] obtaining reward -5
Move to position [17,1] obtaining reward -5
Move to position [16,1] obtaining reward -5
Move to position [16,2] obtaining reward -5
Move to position [16,1] obtaining reward -5
Move to position [16,1] obtaining reward -5
Move to position [16,1] obtaining reward -5


In [12]:
# SOME PRINTING (WILL IMPROVE FOR NEXT CLASS USING ESCHER)
for i in 1:100
    s, r = interact!(agent, race_task)
    IJulia.clear_output(true)
    print_race_track(race_track, s)
    sleep(0.2)
end

Voici our race track!
* * * * * * * * * * * F F F F * *
* * * * * * * * * *           * *
* * * * * * * * *           * * *
* * * * * * * +           * * * *
* * * * * *             * * * * *
* * * * *           * * * * * * *
* * * *           * * * * * * * *
* * *           * * * * * * * * *
* *           * * * * * * * * * *
* *           * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
F: Finish line
*: Outside the track
+: Racer position


## Let's talk about complete races!

Each episodic task has **episodes** (hence the name...). Episodes for different tasks are very different, that's why having a common method was not very practical. But for each task we can create it's episode easily!!! Here's our racing episode!

In [11]:
# the racing episode its fixed for our racing task, but different agent's have different policies
# we can run en episode by fixing a first action or not!! MULTIPLE DISPATCH AT WORK
# Starting fixing an action!
terminal_states = [x for x in race_task.state_space if race_track[x...] == "F"] # list comprenhensions cool!
# optional arguments and keyword arguments
function race_episode(agent, state, action; max_steps = 10000)
    agent.state = state
    new_state, reward = interact!(agent, race_task, action)
    terminate = (new_state in terminal_states)
    step = 0
    rewards = Real[]
    states = Any[]
    while !terminate && step < max_steps 
        new_state, reward  = interact!(agent, race_task)
        step =+ 1
        push!(states, new_state)
        push!(rewards, reward)
        terminate = (new_state in terminal_states)
    end
    return states, rewards
end
function race_episode(agent, state; max_steps = 10000)
    action = agent.policy(state)
    race_episode(agent, state, action, max_steps = max_steps)
end

race_episode (generic function with 2 methods)

In [12]:
states, rewards = race_episode(agent, [18, 3], "up")
println("Terminated episode in $(size(states, 1)) steps with total reward $(sum(rewards)), first step is $(states[1])")

Terminated episode in 1596 steps with total reward -6487, first step is [17,4]


states, rewards = race_episode(agent, [18, 3])
println("Terminated episode in $(size(states, 1)) steps with total reward $(sum(rewards))")

# Let's find the optimal strategy: MCMC with Exploring Starts

This code should work for other environments and agents!

In [14]:
function train_mcmc_es(agent::RLAgent, env::RLEnv, episode_fun; iter = 10000) 
    # Double dictionary, but could be a normal list...
    q = Dict([(s, Dict([(x, 0) for x in env.action_set(s)])) for s in env.state_space])
    count = Dict([(x ,0) for x in env.state_space])
    for i in 1:iter
        state = rand(env.state_space)
        action = rand(env.action_set(state))
        states, rewards = episode_fun(agent, state, action)
        count[state] =+ 1
        q[state][action] += (sum(rewards) - q[state][action]) / count[state]
    end
    return q
end

train_mcmc_es (generic function with 1 method)

In [15]:
q_estim = train_mcmc_es(agent, race_task, race_episode, iter = 1000)

Dict{Array{Int64,1},Dict{String,Int64}} with 306 entries:
  [5,10]  => Dict("left"=>-4733,"right"=>0,"down"=>-63,"up"=>-4173)
  [6,9]   => Dict("left"=>-1258,"right"=>0,"down"=>0,"up"=>0)
  [17,13] => Dict("left"=>0,"right"=>-5443,"down"=>-21591,"up"=>0)
  [3,2]   => Dict("left"=>-1757,"right"=>-5051,"down"=>-6755,"up"=>-3146)
  [9,8]   => Dict("left"=>-1084,"right"=>-22301,"down"=>0,"up"=>0)
  [1,7]   => Dict("left"=>0,"right"=>0,"down"=>-447,"up"=>-484)
  [11,12] => Dict("left"=>-6672,"right"=>0,"down"=>-6722,"up"=>-7869)
  [5,17]  => Dict("left"=>0,"right"=>-852,"down"=>0,"up"=>0)
  [8,3]   => Dict("left"=>0,"right"=>-1780,"down"=>0,"up"=>-6397)
  [1,15]  => Dict("left"=>0,"right"=>-47,"down"=>-9051,"up"=>0)
  [12,5]  => Dict("left"=>0,"right"=>-4855,"down"=>-8285,"up"=>0)
  [14,4]  => Dict("left"=>-4084,"right"=>-2826,"down"=>0,"up"=>-435)
  [1,17]  => Dict("left"=>0,"right"=>-5,"down"=>-26,"up"=>0)
  [3,4]   => Dict("left"=>0,"right"=>0,"down"=>-99,"up"=>-2832)
  [16,14] => Dict("

In [16]:
function improve_policy(q)
    function new_policy(s)
        max, idx = findmax(collect(values(q[s])))
        action = collect(keys(q[s]))[idx]
        return action
    end
    return new_policy
end

improve_policy (generic function with 1 method)

In [17]:
new_pol = improve_policy(q_estim)
for s in race_task.state_space 
    println("$s will now go $(new_pol(s))") 
end

[1,1] will now go right
[1,2] will now go left
[1,3] will now go left
[1,4] will now go left
[1,5] will now go left
[1,6] will now go left
[1,7] will now go left
[1,8] will now go left
[1,9] will now go left
[1,10] will now go left
[1,11] will now go right
[1,12] will now go left
[1,13] will now go left
[1,14] will now go left
[1,15] will now go left
[1,16] will now go left
[1,17] will now go left
[2,1] will now go left
[2,2] will now go right
[2,3] will now go right
[2,4] will now go right
[2,5] will now go left
[2,6] will now go left
[2,7] will now go left
[2,8] will now go right
[2,9] will now go down
[2,10] will now go left
[2,11] will now go left
[2,12] will now go down
[2,13] will now go left
[2,14] will now go left
[2,15] will now go left
[2,16] will now go left
[2,17] will now go down
[3,1] will now go left
[3,2] will now go left
[3,3] will now go left
[3,4] will now go left
[3,5] will now go right
[3,6] will now go left
[3,7] will now go right
[3,8] will now go right
[3,9] wil

Now train a lot

In [20]:
agent.state = [18, 3]
for i in 1:100
    q_estim = train_mcmc_es(agent, race_task, race_episode, iter = 100)
    agent.policy = improve_policy(q_estim)
    print("finished iter $i")
end

finished iter 1

LoadError: InterruptException:

In [19]:
new_pol = improve_policy(q_estim)
for s in race_task.state_space 
    println("$s will now go $(new_pol(s))") 
end

[1,1] will now go right
[1,2] will now go left
[1,3] will now go left
[1,4] will now go left
[1,5] will now go left
[1,6] will now go left
[1,7] will now go left
[1,8] will now go left
[1,9] will now go left
[1,10] will now go left
[1,11] will now go right
[1,12] will now go left
[1,13] will now go left
[1,14] will now go left
[1,15] will now go left
[1,16] will now go left
[1,17] will now go left
[2,1] will now go left
[2,2] will now go right
[2,3] will now go right
[2,4] will now go right
[2,5] will now go left
[2,6] will now go left
[2,7] will now go left
[2,8] will now go right
[2,9] will now go down
[2,10] will now go left
[2,11] will now go left
[2,12] will now go down
[2,13] will now go left
[2,14] will now go left
[2,15] will now go left
[2,16] will now go left
[2,17] will now go down
[3,1] will now go left
[3,2] will now go left
[3,3] will now go left
[3,4] will now go left
[3,5] will now go right
[3,6] will now go left
[3,7] will now go right
[3,8] will now go right
[3,9] wil