### Un framework de RL

Queremos implementar un framework para trabajar con RL. Necesitamos tres cosas:

1. Un **ambiente** que tiene 
    + Un conjunto de estados
    + Determina qué acciones están disponibles en cada estado
    + Y, dada una acción que un agente puede tomar en un estado, determina una transición que trae consigo un pago
2. Un **agente** que quiere maximizar su utilidad en ese ambiente
    + Está en un estado dentro de un ambiente
    + Tiene una política que dicta qué acción toma según su estado
3. Una forma en que **interactuen**
    + Interactúan cuando el agente decide una acción: por ello recibe un pago y cambia de estado

In [4]:
# Reinforcement-learning environment: the state space together with the two
# functions that say which actions are available and what an action leads to.
# NOTE(review): `type` is the pre-0.6 keyword for what later became `mutable struct`.
type RLEnv
    state_space::Array{Any, 1} # every state of the environment
    action_set::Function # (state)->(available action set)
    trans_fun::Function # (state, action) -> (new_state, reward)
end
# Agent acting in an RLEnv: its current state plus the policy it follows.
type RLAgent
    state::Any # current state; overwritten by interact! and interact_given_action!
    policy::Function # (state) -> (action)
end
function interact!(agent::RLAgent, env::RLEnv)
    # One step of interaction: the agent picks an action with its own policy and
    # the environment answers with the next state and a reward.
    # The trailing ! follows the Julia convention for functions that modify
    # their arguments: agent.state is overwritten with the next state.
    current = agent.state
    chosen_action = agent.policy(current)
    next_state, payoff = env.trans_fun(current, chosen_action)
    agent.state = next_state
    return (next_state, payoff) # handed back for the caller's convenience
end
function interact_given_action!(agent::RLAgent, env::RLEnv, action)
    # One step of interaction in which the caller imposes `action` instead of
    # asking the agent's policy. Raises an error when `action` is not in the
    # action set of the agent's current state.
    # The trailing ! signals that agent.state is overwritten.
    allowed = env.action_set(agent.state)
    action in allowed || error("Not valid")
    next_state, payoff = env.trans_fun(agent.state, action)
    agent.state = next_state
    return (next_state, payoff) # handed back for the caller's convenience
end



interact_given_action! (generic function with 1 method)

# Let's race!!!

In [1]:
# Load the track layout; cells are later compared against "" (free), "*" (off track)
# and "F" (finish line).
# NOTE(review): `readcsv` only exists in pre-1.0 Julia — confirm the target version.
race_track = readcsv("data/race_track.csv")
function print_race_track(race_track, pos)
    # Print the track row by row with the racer drawn as "+" at pos = [row, column],
    # followed by a legend. Drawing happens on a copy, so the race_track argument
    # itself is not modified.
    board = map(race_track) do cell
        cell == "" ? " " : cell # empty cells are shown as a single space
    end
    print("Voici the race track!!!\n")
    board[pos[1], pos[2]] = "+"
    for row in 1:size(board, 1)
        line = join(board[row, :], " ")
        print(line, "\n")
    end
    print("F: finishing line\n*: out of the track\n+: racer\n")
end
# Draw the track with the racer at row 18, column 3.
print_race_track(race_track, [18, 3])

Voici the race track!!!
* * * * * * * * * * * F F F F * *
* * * * * * * * * *      * *
* * * * * * * * *      * * *
* * * * * * *       * * * *
* * * * * *       * * * * *
* * * * *      * * * * * * *
* * * *      * * * * * * * *
* * *      * * * * * * * * *
* *      * * * * * * * * * *
* *      * * * * * * * * * *
* *     * * * * * * * * * * *
* *     * * * * * * * * * * *
* *     * * * * * * * * * * *
* *     * * * * * * * * * * *
* *     * * * * * * * * * * *
* *     * * * * * * * * * * *
* *     * * * * * * * * * * *
* * +    * * * * * * * * * * *
F: finishing line
*: out of the track
+: racer


In [32]:
# Julia trick: list comprehensions
# Every [row, column] index pair of the track; rows vary slowest, columns fastest.
state_space= [[i, j] for i in 1:size(race_track,1) for j in 1:size(race_track,2)]
# Julia trick 2: one-line anonymous functions are easy to write
# The same four moves are offered in every state (the argument is ignored).
action_set = state -> ["up", "left", "right", "down"]
# Third thing we need: a transition function
function trans_fun(state, action)
    # Move one cell in the requested direction, staying put when the move would
    # leave the grid, and return (new_state, reward) for the cell that is reached.
    row, col = state # destructure [row, column]
    n_rows, n_cols = size(race_track, 1), size(race_track, 2)
    if action == "up" && row > 1
        row -= 1
    elseif action == "down" && row < n_rows
        row += 1
    elseif action == "left" && col > 1
        col -= 1
    elseif action == "right" && col < n_cols
        col += 1
    end
    cell = race_track[row, col]
    # finish line: 0, off the track: -10, any other cell: -1
    reward = cell == "F" ? 0 : (cell == "*" ? -10 : -1)
    return [row, col], reward
end



trans_fun (generic function with 1 method)

In [35]:
# Try the transition function by hand: one step "up" from row 18, column 1.
state = [18, 1]
action = "up"
new_state, reward = trans_fun(state, action)
# The message reports `state`, the variable set above (no `start` variable is
# defined anywhere in this notebook).
println("Starting from $state choosing $action, moving to $new_state and receiving reward $reward")

Starting from [18,1] choosing up, moving to [17,1] and receiving reward -10


In [37]:
# Bundle the three ingredients into an environment and repeat the same manual
# step through the environment's own transition function.
race_task = RLEnv(state_space, action_set, trans_fun)
state = [18, 1]
action = "up"
new_state, reward = race_task.trans_fun(state, action)
# The message reports `state`, the variable set above (no `start` variable is
# defined anywhere in this notebook).
println("Starting from $state choosing $action, moving to $new_state and receiving reward $reward")

Starting from [18,1] choosing up, moving to [17,1] and receiving reward -10


### ¿Quién está corriendo?

Necesitamos el corredor: el agente

In [38]:
# First policy: pick one of the four moves at random, ignoring the state
policy(state) = rand(["up", "left", "right", "down"])
init_state = [18, 3] # cell where the racer starts
agent = RLAgent(init_state, policy)

RLAgent([18,3],policy)

In [39]:
# Let the agent take 20 steps under its policy and log every transition.
for i in 1:20
    s, r = interact!(agent, race_task) # s: new state, r: reward of the step
    println("Moving to position $s receiving $r")
end

Moving to position [18,4] receiving -1
Moving to position [18,3] receiving -1
Moving to position [18,4] receiving -1
Moving to position [18,4] receiving -1
Moving to position [18,3] receiving -1
Moving to position [17,3] receiving -1
Moving to position [17,2] receiving -10
Moving to position [16,2] receiving -10
Moving to position [17,2] receiving -10
Moving to position [16,2] receiving -10
Moving to position [17,2] receiving -10
Moving to position [17,1] receiving -10
Moving to position [18,1] receiving -10
Moving to position [18,1] receiving -10
Moving to position [18,1] receiving -10
Moving to position [18,2] receiving -10
Moving to position [17,2] receiving -10
Moving to position [17,1] receiving -10
Moving to position [16,1] receiving -10
Moving to position [15,1] receiving -10


In [41]:
# Animate 100 policy-driven steps, redrawing the track after each one.
agent.state = [18, 3] # put the racer back on its starting cell
for i in 1:100
    s, r = interact!(agent, race_task)
    IJulia.clear_output() # presumably clears the previous frame from the cell output
    print_race_track(race_track, s)
    sleep(0.3) # pause so each frame stays visible
end

Voici the race track!!!
* * * * * * * * * * * F F F F * *
* * * * * * * * * *           * *
* * * * * * * * *           * * *
* * * * * * *             * * * *
* * * * * *             * * * * *
* * * * *           * + * * * * *
* * * *           * * * * * * * *
* * *           * * * * * * * * *
* *           * * * * * * * * * *
* *           * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
* *         * * * * * * * * * * *
F: finishing line
*: out of the track
+: racer


## Ahora sí las carreras!!!!!!

In [44]:
# Collect the terminal states: every cell of the track marked "F".
states = race_task.state_space
finish_line = [x for x in states if race_track[x...] == "F"] # x... splats [row, col] into two indices

4-element Array{Array{Int64,1},1}:
 [1,12]
 [1,13]
 [1,14]
 [1,15]

In [122]:
# Shared tail of both race_episode methods: keep following the agent's policy
# until the agent stands on a goal state or `max_steps` steps have been counted.
# `step` and `total_reward` are the counts accumulated so far; the updated pair
# is returned.
function _follow_policy!(agent, env, goal_states, step, total_reward, max_steps)
    while !(agent.state in goal_states) && step < max_steps
        _, reward = interact!(agent, env) # the new state is already stored in agent.state
        step += 1
        total_reward += reward
    end
    return step, total_reward
end

# Run one episode: place the agent on `state` and let it follow its policy until
# it reaches a goal state or `max_steps` steps have been taken.
# Keywords: `max_steps` caps the episode length; `env` is the environment to
# interact with (default: the global race_task); `goal_states` are the terminal
# states (default: the global finish_line).
# Returns (number of steps, sum of rewards). Starting on a goal state gives (0, 0).
# Side effect: agent.state is left at the last state visited.
function race_episode(agent, state;
                      max_steps = 10000, env = race_task, goal_states = finish_line)
    agent.state = state
    return _follow_policy!(agent, env, goal_states, 0, 0, max_steps)
end

# Same as the two-argument method, except that the first move is forced to be
# `action`; the policy decides every later move. The forced move is taken whenever
# `state` is not a goal state, even if `max_steps` is 0, and it counts as one step.
# Raises an error (from interact_given_action!) when `action` is not allowed.
function race_episode(agent, state, action;
                      max_steps = 10000, env = race_task, goal_states = finish_line)
    agent.state = state
    step, total_reward = 0, 0
    if !(state in goal_states)
        _, reward = interact_given_action!(agent, env, action)
        step += 1
        total_reward += reward
    end
    return _follow_policy!(agent, env, goal_states, step, total_reward, max_steps)
end



race_episode (generic function with 3 methods)

In [123]:
# Run one full episode from the starting cell under the agent's policy.
steps, reward = race_episode(agent, [18, 3])
println("Termino la carrera despues de $steps pasos con reward $reward")

Termino la carrera despues de 713 pasos con reward -5428


In [129]:
# Same episode, but with the first move forced to "up".
steps, reward = race_episode(agent, [18, 3], "up")
println("Termino la carrera despues de $steps pasos con reward $reward empezando con up")

Termino la carrera despues de 3490 pasos con reward -22857 empezando con up


# Let's learn!!

In [133]:
# Monte Carlo estimate of the value of one (state, action) pair: average the total
# reward of 1000 episodes that start in `state` with `action` and then follow the
# agent's policy.
state = [2, 16]
action = "up"
qsa = 0
for i in 1:1000
    steps, total_reward = race_episode(agent, state, action)
    qsa += (total_reward - qsa) / i # incremental update of the running mean
end
println("El valor de empezar en la posicion $state con accion $action es $qsa")

El valor de empezar en la posicion [2,16] con accion up es -1060.8700000000022


In [140]:
# using NamedArrays #Pkg.add("NamedArrays")
# Estimate the whole action-value table: Q[i, j] is the mean total reward of 100
# episodes that start in state_space[i] with that state's j-th action and then
# follow the agent's policy.
n_states = size(state_space, 1)
n_actions = 4 # action_set returns four moves for every state
Q = zeros(n_states, n_actions)
for i in 1:n_states
    for j in 1:n_actions
        s = state_space[i]
        a = race_task.action_set(s)[j]
        qsa = 0
        for k in 1:100
            steps, total_reward = race_episode(agent, s, a)
            qsa += (total_reward - qsa) / k # incremental update of the running mean
        end 
        Q[i, j] = qsa
    end
end

In [141]:
# Display the estimated action-value table.
Q

306×4 Array{Float64,2}:
  -7477.83  -7810.18  -6733.12   -9058.55
  -6660.32  -7248.03  -7637.39   -6406.63
  -5763.84  -7723.01  -5888.19   -7161.78
  -6920.83  -6250.54  -5413.99   -6804.23
  -6291.67  -7023.96  -5830.46   -6648.4 
  -4795.43  -6660.65  -5790.87   -5657.02
  -5891.97  -5719.41  -5990.87   -5424.97
  -4574.76  -6203.56  -3680.84   -5012.16
  -4713.78  -5638.07  -3479.87   -6134.13
  -4965.81  -3919.55  -1652.55   -4058.66
  -2446.05  -3983.56      0.0    -2454.89
      0.0       0.0       0.0        0.0 
      0.0       0.0       0.0        0.0 
      ⋮                                  
  -7966.1   -8645.99  -7823.74   -7979.24
 -10298.6   -7366.03  -8710.59   -8309.73
  -7162.51  -9171.87  -8837.74  -10024.8 
  -8676.65  -8560.73  -8134.07   -8044.47
  -8382.81  -8330.86  -8667.11   -7772.7 
  -8330.15  -9698.92  -9007.81   -9362.5 
  -8922.93  -8603.48  -8092.33   -8025.64
  -7126.28  -8644.43  -8851.09   -7436.26
  -8410.19  -8525.67  -8584.88   -8941.08
  -8824.19

In [145]:

# For every state, report the action with the largest estimated value in Q.
for i in 1:n_states
    state = race_task.state_space[i]
    best_idx = findmax(Q[i,:])[2] # findmax returns (value, index); only the index is needed
    action = race_task.action_set(state)[best_idx]
    println("Best choice in state $state is action $action")
end
function improved_policy(state)
    # Greedy policy with respect to the estimated action values: return the action
    # of `state` whose entry in Q is largest.
    # Q has one row per entry of the state space, in the same order, so the row
    # belonging to `state` is its position in race_task.state_space.
    row = findfirst(s -> s == state, race_task.state_space)
    # "not found" is reported as 0 by older Julia versions and as `nothing` by newer ones
    (row === nothing || row == 0) && error("Unknown state $state")
    best_idx = findmax(Q[row, :])[2] # findmax returns (value, index)
    return race_task.action_set(state)[best_idx]
end

Best choice in state [1,1] is action right
Best choice in state [1,2] is action down
Best choice in state [1,3] is action up
Best choice in state [1,4] is action right
Best choice in state [1,5] is action right
Best choice in state [1,6] is action up
Best choice in state [1,7] is action down
Best choice in state [1,8] is action right
Best choice in state [1,9] is action right
Best choice in state [1,10] is action right
Best choice in state [1,11] is action right
Best choice in state [1,12] is action up
Best choice in state [1,13] is action up
Best choice in state [1,14] is action up
Best choice in state [1,15] is action up
Best choice in state [1,16] is action left
Best choice in state [1,17] is action left
Best choice in state [2,1] is action down
Best choice in state [2,2] is action up
Best choice in state [2,3] is action right
Best choice in state [2,4] is action up
Best choice in state [2,5] is action down
Best choice in state [2,6] is action right
Best choice in state [2,7] is act

In [142]:
# Mejorar agent
# From here on the agent follows improved_policy instead of the random policy.
agent.policy = improved_policy

(3,1)

In [None]:
# NOTE(review): scratch cell that inspects the global `i` — presumably left over
# from debugging improved_policy; confirm before removing.
i