In [1]:
using PyCall
using PyPlot
ioff()

In [2]:
# Bring the Python modules into scope through PyCall: NumPy as `np`, and `gym`.
@pyimport numpy as np
@pyimport gym

# Look up the `Rectangle` class in Python's `matplotlib.patches` module.
Rectangle = pyimport("matplotlib.patches")["Rectangle"]

# Select the "ggplot" plotting style.
plt[:style][:use]("ggplot")
# Bare expression so the notebook cell displays the imported class.
Rectangle

PyObject <class 'matplotlib.patches.Rectangle'>

In [3]:
# Create the Acrobot-v1 environment.
env = gym.make("Acrobot-v1")
 
# Seed the environment with 505; the trailing `;` suppresses the cell's echoed value.
env[:seed](505);
# Print the observation space followed by its lower and upper bounds.
println("State space: ", env[:observation_space])

println("-  low: ", env[:observation_space][:low])

println("- high: ", env[:observation_space][:high])
# Print the action space.
println("Action space: ", env[:action_space])

State space: PyObject Box(6,)
-  low: Float32[-1.0, -1.0, -1.0, -1.0, -12.5664, -28.2743]
- high: Float32[1.0, 1.0, 1.0, 1.0, 12.5664, 28.2743]
Action space: PyObject Discrete(3)


In [4]:
"""
    create_tiling_grid(low, high, bins = (10, 10), offsets = (0.0, 0.0))

Build the split points of a single tiling.

For every dimension `dim` in `1:length(bins)`, the interval `[low[dim], high[dim]]` is
divided into `bins[dim]` equal-width cells. The `bins[dim] - 1` interior boundaries are
kept and each one is shifted by `offsets[dim]`.

# Arguments
- `low`, `high`: indexable lower / upper bounds, one entry per dimension.
- `bins`: number of cells per dimension.
- `offsets`: shift applied to the split points of each dimension.

# Returns
A vector holding one vector of split points per dimension.

A human-readable summary of the tiling is printed to standard output.
"""
function create_tiling_grid(low, high, bins = (10, 10), offsets = (0.0, 0.0))
    # Interior boundaries only: drop the first (low) and last (high) point of each range.
    grid = [
        collect(range(low[dim], stop = high[dim], length = bins[dim] + 1))[2:end-1]
        for dim in 1:length(bins)
    ]

    # Shift every split of a dimension by that dimension's offset. Broadcasting over the
    # whole vector handles any number of splits, so dimensions may use different bin
    # counts (the split count is bins[dim] - 1, not bins[1] - 1).
    for dim in eachindex(grid)
        splits = grid[dim]
        splits .+= offsets[dim]
    end

    println("Tiling: [<low>, <high>] / <bins> + (<offset>) => <splits>")
    for (l, h, b, o, splits) in zip(low, high, bins, offsets, grid)
        println("    [$l, $h]/$b + ($o) => $splits")
    end
    return grid
end
# Bounds of a two-dimensional example space; these globals are reused by later cells.
low = [-1.0, -5.0]
high = [1.0, 5.0]

# Sanity check: 10 bins per dimension with offsets -0.1 and 0.5 (prints the splits).
create_tiling_grid(low, high, (10, 10), (-0.1, 0.5));  # [test]

Tiling: [<low>, <high>] / <bins> + (<offset>) => <splits>
    [-1.0, 1.0]/10 + (-0.1) => [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7]
    [-5.0, 5.0]/10 + (0.5) => [-3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5]


In [7]:
"""
    discretize(sample, grid)

Map every coordinate of `sample` to a 1-based cell index within `grid`.

`grid` holds one sorted vector of split points per dimension. For each coordinate the
index is that of the first split that is greater than or equal to the coordinate, or
`length(splits) + 1` when the coordinate lies above every split. A coordinate equal to a
split therefore lands in the cell below that split.

Returns a `Vector{Any}` with one index per `(coordinate, splits)` pair.
"""
function discretize(sample, grid)
    return Any[searchsortedfirst(splits, coord) for (coord, splits) in zip(sample, grid)]
end
"""
    tile_encode(sample, tilings)

Encode `sample` once per tiling.

Each element of `tilings` is a grid accepted by `discretize`. The result is a
`Vector{Any}` whose `k`-th entry is `discretize(sample, tilings[k])`.
"""
function tile_encode(sample, tilings)
    return Any[discretize(sample, grid) for grid in tilings]
end
# Example points, including values outside the [-1, 1] x [-5, 5] bounds and on the edges.
samples = [(-1.2 , -5.1 ),
           (-0.75,  3.25),
           (-0.5 ,  0.0 ),
           ( 0.25, -1.9 ),
           ( 0.15, -1.75),
           ( 0.75,  2.5 ),
           ( 0.7 , -3.7 ),
           ( 1.0 ,  5.0 )]
# Encode every sample against the global `tilings` (defined by the create_tilings cell).
encoded_samples = Any[tile_encode(sample, tilings) for sample in samples]
# Bare expression so the notebook displays the encodings.
encoded_samples

8-element Array{Any,1}:
 Any[Any[1, 1], Any[1, 1], Any[1, 1]]      
 Any[Any[2, 9], Any[2, 9], Any[1, 8]]      
 Any[Any[3, 6], Any[3, 5], Any[3, 5]]      
 Any[Any[7, 4], Any[7, 4], Any[6, 3]]      
 Any[Any[7, 4], Any[6, 4], Any[6, 3]]      
 Any[Any[10, 8], Any[9, 8], Any[9, 8]]     
 Any[Any[9, 2], Any[9, 2], Any[9, 1]]      
 Any[Any[10, 10], Any[10, 10], Any[10, 10]]

In [8]:
"""
    create_tilings(low, high, tiling_specs)

Build one tiling grid per entry of `tiling_specs`.

`low` and `high` are first converted to `Vector{Float64}`. Every spec is a
`(bins, offsets)` pair that is forwarded to `create_tiling_grid`.

Returns a vector with the grids in the same order as `tiling_specs`.
"""
function create_tilings(low, high, tiling_specs)
    lower = convert(Vector{Float64}, low)
    upper = convert(Vector{Float64}, high)
    return [
        create_tiling_grid(lower, upper, bins, offsets) for (bins, offsets) in tiling_specs
    ]
end
# Three 10x10 tilings: one shifted down, one unshifted, one shifted up.
# Each spec is a (bins, offsets) pair.
tiling_specs = [((10, 10), (-0.066, -0.33)),
                ((10, 10), (0.0, 0.0)),
                ((10, 10), (0.066, 0.33))]
# Build the grids (prints one summary per tiling); `tilings` is used by other cells.
tilings = create_tilings(low, high, tiling_specs);

Tiling: [<low>, <high>] / <bins> + (<offset>) => <splits>
    [-1.0, 1.0]/10 + (-0.066) => [-0.866, -0.666, -0.466, -0.266, -0.066, 0.134, 0.334, 0.534, 0.734]
    [-5.0, 5.0]/10 + (-0.33) => [-4.33, -3.33, -2.33, -1.33, -0.33, 0.67, 1.67, 2.67, 3.67]
Tiling: [<low>, <high>] / <bins> + (<offset>) => <splits>
    [-1.0, 1.0]/10 + (0.0) => [-0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8]
    [-5.0, 5.0]/10 + (0.0) => [-4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0]
Tiling: [<low>, <high>] / <bins> + (<offset>) => <splits>
    [-1.0, 1.0]/10 + (0.066) => [-0.734, -0.534, -0.334, -0.134, 0.066, 0.266, 0.466, 0.666, 0.866]
    [-5.0, 5.0]/10 + (0.33) => [-3.67, -2.67, -1.67, -0.67, 0.33, 1.33, 2.33, 3.33, 4.33]


In [11]:
"""
    QTable(state_size, action_size)

Dense table of Q-values initialised to zero.

The table's dimensions are the entries of `state_size` followed by the entries of
`action_size`. Constructing a table prints its size.

# Fields
- `state_size`: the state dimensions as passed in.
- `action_size`: the action dimension(s) as passed in.
- `q_table`: zero-filled array of size `(state_size..., action_size...)`.
"""
mutable struct QTable
    state_size
    action_size
    q_table
    function QTable(state_size, action_size)
        dims = (state_size..., action_size...)
        println("QTable(): size = ", dims)
        return new(state_size, action_size, zeros(dims))
    end
end
    
"""
    TiledQTable(low, high, tiling_specs, action_size)

Collection of `QTable`s, one per tiling built from `tiling_specs`.

# Fields
- `tilings`: grids returned by `create_tilings(low, high, tiling_specs)`.
- `state_sizes`: per tiling, a tuple with the number of cells in each dimension
  (number of splits plus one).
- `action_size`: the action dimension passed to every internal `QTable`.
- `q_tables`: the internal `QTable`s, in the same order as `tilings`.

Construction prints the tiling summaries, each table's size and the table count.
"""
mutable struct TiledQTable
    tilings
    state_sizes
    action_size
    q_tables
    function TiledQTable(low, high, tiling_specs, action_size)
        tilings = create_tilings(low, high, tiling_specs)
        # n splits delimit n + 1 cells along a dimension.
        state_sizes = [Tuple(length.(tiling_grid) .+ 1) for tiling_grid in tilings]
        q_tables = map(sz -> QTable(sz, action_size), state_sizes)
        println("TiledQTable(): no. of internal tables = $(length(q_tables))")
        return new(tilings, state_sizes, action_size, q_tables)
    end
end

"""
    getTiledQTable(self, state, action)

Return the Q-value of `(state, action)` averaged over the internal tables.

`state` is tile-encoded with `self.tilings`; each encoding indexes the matching table in
`self.q_tables`, and the looked-up values are summed and divided by the number of tables.
"""
function getTiledQTable(self, state, action)
    tables = self.q_tables
    cells = tile_encode(state, self.tilings)
    # Left-to-right accumulation starting from 0.0.
    total = foldl(zip(cells, tables); init = 0.0) do acc, (cell, table)
        acc + table.q_table[cell..., action]
    end
    return total / length(tables)
end




"""
    updateTiledQTable(self::TiledQTable, state, action, value, alpha = 0.1)

Move the stored Q-value of `(state, action)` towards `value` in every internal table.

For each tiling the entry selected by the tile-encoded `state` and `action` is replaced
by `alpha * value + (1 - alpha) * old`. The tables are modified in place.
"""
function updateTiledQTable(self::TiledQTable, state, action, value, alpha = 0.1)
    cells = tile_encode(state, self.tilings)
    retain = 1 - alpha

    for (cell, table) in zip(cells, self.q_tables)
        slot = (cell..., action)
        previous = table.q_table[slot...]
        table.q_table[slot...] = (alpha * value) + retain * previous
    end
end


# Demo: build a tiled Q-table over the example space with 2 actions.
tq = TiledQTable(low, high, tiling_specs, 2);
# The statements below must run in this order: read, update, read again.

s1 = 4; s2 = 5; a = 1; q = 1.0



println("[GET]    Q( $(samples[s1]) , $a) = $(getTiledQTable(tq, samples[s1], a))") # check value at sample = s1
println("[UPDATE] Q( $(samples[s2]) , $a) = $q") 
updateTiledQTable(tq,samples[s2], a, q)  # update value for sample with some common tile(s)
println("[GET]    Q( $(samples[s1]) , $a) = $(getTiledQTable(tq, samples[s1], a))")


Tiling: [<low>, <high>] / <bins> + (<offset>) => <splits>
    [-1.0, 1.0]/10 + (-0.066) => [-0.866, -0.666, -0.466, -0.266, -0.066, 0.134, 0.334, 0.534, 0.734]
    [-5.0, 5.0]/10 + (-0.33) => [-4.33, -3.33, -2.33, -1.33, -0.33, 0.67, 1.67, 2.67, 3.67]
Tiling: [<low>, <high>] / <bins> + (<offset>) => <splits>
    [-1.0, 1.0]/10 + (0.0) => [-0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8]
    [-5.0, 5.0]/10 + (0.0) => [-4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0]
Tiling: [<low>, <high>] / <bins> + (<offset>) => <splits>
    [-1.0, 1.0]/10 + (0.066) => [-0.734, -0.534, -0.334, -0.134, 0.066, 0.266, 0.466, 0.666, 0.866]
    [-5.0, 5.0]/10 + (0.33) => [-3.67, -2.67, -1.67, -0.67, 0.33, 1.33, 2.33, 3.33, 4.33]
QTable(): size = (10, 10, 2)
QTable(): size = (10, 10, 2)
QTable(): size = (10, 10, 2)
TiledQTable(): no. of internal tables = 3
[GET]    Q( (0.25, -1.9) , 1) = 0.0
[UPDATE] Q( (0.15, -1.75) , 1) = 1.0
[GET]    Q( (0.25, -1.9) , 1) = 0.06666666666666667


In [17]:
"""
    QLearningAgent(env, tq, alpha = 0.02, gamma = 0.99,
                   epsilon = 1.0, epsilon_decay_rate = 1, min_epsilon = 0.01)

Agent state for Q-learning over a tiled Q-table.

# Fields
- `env`: the environment; `env[:action_space][:n]` supplies `action_size`.
- `tq`: the tiled Q-table; `tq.state_sizes` supplies `state_sizes`.
- `alpha`, `gamma`: stored learning parameters.
- `epsilon`: current exploration rate; `initial_epsilon` keeps the starting value.
- `epsilon_decay_rate`: factor `epsilon` is multiplied by in `reset_episode`
  (with the default of 1 that multiplication leaves `epsilon` unchanged).
- `min_epsilon`: lower bound applied to `epsilon` in `reset_episode`.
- `last_state`, `last_action`: start as `nothing`.

Construction prints the environment, the state-space sizes and the action-space size.
"""
mutable struct QLearningAgent
    env
    tq
    state_sizes
    action_size
    alpha
    gamma
    epsilon
    epsilon_decay_rate
    min_epsilon
    initial_epsilon
    last_state
    last_action
    function QLearningAgent(env, tq, alpha = 0.02, gamma = 0.99,
        epsilon = 1.0, epsilon_decay_rate = 1, min_epsilon = 0.01)

        # Environment info
        state_sizes = tq.state_sizes
        action_size = env[:action_space][:n]

        println("Environment: ", env)
        println("State space sizes: ", state_sizes)
        println("Action space size: ", action_size)

        # `epsilon` is stored twice: as the live value and as `initial_epsilon`.
        return new(
            env, tq, state_sizes, action_size,
            alpha, gamma, epsilon, epsilon_decay_rate, min_epsilon, epsilon,
            nothing, nothing,
        )
    end
end
"""
    reset_episode(self::QLearningAgent, state)

Prepare the agent for a new episode and return its first action.

`self.epsilon` is multiplied by `self.epsilon_decay_rate` and clamped from below at
`self.min_epsilon`. `state` is stored as `self.last_state`, and the action with the
highest tiled Q-value for `state` (1-based, first one on ties) is stored as
`self.last_action` and returned.
"""
function reset_episode(self::QLearningAgent, state)
    # Gradually reduce exploration, never dropping below the configured minimum.
    self.epsilon = max(self.epsilon * self.epsilon_decay_rate, self.min_epsilon)

    self.last_state = state
    # Evaluate every action: `action_size` is a count, so iterate `1:action_size`
    # (`length` of an integer is always 1 and would only ever look at action 1).
    Q_s = [getTiledQTable(self.tq, state, action) for action in 1:self.action_size]
    # Julia's `argmax` is already 1-based, matching the action indices used above.
    self.last_action = argmax(Q_s)
    return self.last_action
end

"""
    reset_exploration(self::QLearningAgent, epsilon = nothing)

Reset the exploration rate.

Sets `self.epsilon` to `epsilon`, or to `self.initial_epsilon` when `epsilon` is
`nothing`, and returns the new value.
"""
function reset_exploration(self::QLearningAgent, epsilon = nothing)
    # Identity comparison is the reliable way to test for `nothing`.
    self.epsilon = epsilon === nothing ? self.initial_epsilon : epsilon
    return self.epsilon
end
    

"""
    act(self::QLearningAgent, state, reward, mode)

Choose the next action for `state` and, outside test mode, learn from `reward`.

The greedy action is the 1-based index of the largest tiled Q-value for `state`.

- When `mode == "test"` the greedy action is chosen and the Q-table is not modified.
- Otherwise the entry for `(self.last_state, self.last_action)` is updated towards
  `reward + self.gamma * maximum(Q_s)` using `self.alpha`. Then, with probability
  `self.epsilon`, a uniformly random action in `1:self.action_size` replaces the greedy
  one.

In both modes `state` and the chosen action are stored as `self.last_state` and
`self.last_action`, and the action is returned.
"""
function act(self::QLearningAgent, state, reward, mode)
    Q_s = [getTiledQTable(self.tq, state, candidate) for candidate in 1:self.action_size]
    # np.argmax is 0-based; shift to the 1-based action indices used here.
    greedy_action = np.argmax(Q_s) + 1

    action = greedy_action
    if mode != "test"
        # Q-learning target for the previous state/action pair.
        target = reward + self.gamma * maximum(Q_s)
        updateTiledQTable(self.tq, self.last_state, self.last_action, target, self.alpha)

        # Epsilon-greedy exploration; NumPy's randint excludes the upper bound.
        if np.random[:uniform](0,1) < self.epsilon
            action = np.random[:randint](1, self.action_size + 1)
        end
    end

    self.last_state = state
    self.last_action = action
    return action
end

act (generic function with 1 method)

In [18]:
# Number of bins used for every observation dimension.
n_bins = 5

# One bin count per observation dimension.
bins = tuple([n_bins for i in 1:env[:observation_space][:shape][1]]...)



# Per-dimension offset: the observation range divided by 15
# (presumably 3 * n_bins; confirm before changing n_bins).
offset_pos = (env[:observation_space][:high] - env[:observation_space][:low])/15


# Convert the offsets to a Float64 vector.
offset_pos = convert(Array{Float64,1}, offset_pos)

# Three tilings: shifted down by the offset, unshifted, shifted up by the offset.
tiling_specs = [(bins, -offset_pos), 
                (bins, tuple([0.0 for i in 1:env[:observation_space][:shape][1]]...)),
                (bins, offset_pos)]


# Tiled Q-table spanning the observation bounds, with one slot per discrete action.
tq = TiledQTable(env[:observation_space][:low], 
                 env[:observation_space][:high], 
                 tiling_specs, 
                 env[:action_space][:n])


# Agent with the default learning parameters.
agent = QLearningAgent(env, tq);


Tiling: [<low>, <high>] / <bins> + (<offset>) => <splits>
    [-1.0, 1.0]/5 + (-0.13333334028720856) => [-0.733333, -0.333333, 0.0666667, 0.466667]
    [-1.0, 1.0]/5 + (-0.13333334028720856) => [-0.733333, -0.333333, 0.0666667, 0.466667]
    [-1.0, 1.0]/5 + (-0.13333334028720856) => [-0.733333, -0.333333, 0.0666667, 0.466667]
    [-1.0, 1.0]/5 + (-0.13333334028720856) => [-0.733333, -0.333333, 0.0666667, 0.466667]
    [-12.566370964050293, 12.566370964050293]/5 + (-1.675516128540039) => [-9.21534, -4.18879, 0.837758, 5.86431]
    [-28.274333953857422, 28.274333953857422]/5 + (-3.769911289215088) => [-20.7345, -9.42478, 1.88496, 13.1947]
Tiling: [<low>, <high>] / <bins> + (<offset>) => <splits>
    [-1.0, 1.0]/5 + (0.0) => [-0.6, -0.2, 0.2, 0.6]
    [-1.0, 1.0]/5 + (0.0) => [-0.6, -0.2, 0.2, 0.6]
    [-1.0, 1.0]/5 + (0.0) => [-0.6, -0.2, 0.2, 0.6]
    [-1.0, 1.0]/5 + (0.0) => [-0.6, -0.2, 0.2, 0.6]
    [-12.566370964050293, 12.566370964050293]/5 + (0.0) => [-7.53982, -2.51327, 2.51327, 

In [19]:
"""
    run(agent, env, num_episodes = 10000, mode = "train")

Run `num_episodes` episodes of `agent` interacting with `env`.

Each episode resets the environment, asks `reset_episode` for the first action, and then
alternates `env[:step]` (which receives the action shifted to 0-based) with `act` until
the environment reports it is done. A progress line is printed after every episode.

Returns a vector with the total reward of each episode.
"""
function run(agent, env, num_episodes = 10000, mode = "train")
    scores = []
    for i_episode in 1:num_episodes
        total_reward = 0
        action = reset_episode(agent, env[:reset]())

        # At least one step is always taken; stop once the environment signals done.
        while true
            # Agent actions are 1-based, the environment's are 0-based.
            observation, reward, finished, _ = env[:step](action - 1)
            total_reward += reward
            action = act(agent, observation, reward, mode)
            finished && break
        end

        push!(scores, total_reward)
        print("\r Episode $i_episode/$num_episodes | reward: $total_reward")
    end
    return scores
end
# Train with the defaults (10000 episodes, "train" mode); `;` hides the echoed result.
scores = run(agent, env);

 Episode 10000/10000 | reward: -500.0

In [36]:
# Watch the trained agent for up to 500 rendered steps in a fresh environment.
env = gym.make("Acrobot-v1")

state = env[:reset]()
score = 0
# Use "test" mode throughout: the agent acts greedily and its Q-table is left untouched.
# ("train" mode here would keep exploring and overwrite learned values.)
action = act(agent, state, 0, "test")
for t in 1:500
    env[:render]()
    # Agent actions are 1-based, the environment's are 0-based.
    state, reward, done, _ = env[:step](action-1)
    action = act(agent, state, reward, "test")
    score += reward
    if t % 100 == 0
        # The leading carriage return rewrites the progress line in place.
        print("\rEpisode $t / 500 | Max Average Score: $score")
    end
    # Do not keep stepping an environment that has reported the episode is over.
    done && break
end
env[:close]()

Episode 500 / 500 | Max Average Score: -500.0 /r