In [1]:
using Gym
using TensorFlow
using Distributions
include("utils.jl");
include("ReplayMemory.jl");

# Constants

In [38]:
# Hyperparameters read as globals by trainNetwork.
ALPHA = 0.001 # learning rate handed to the Adam optimizer
GAMMA = 0.99 # discount factor applied to the target network's best next-state Q-value
OBSERVE = 100. # while t <= OBSERVE actions are always random and epsilon is not annealed
EXPLORE = 500. # number of annealing steps taking epsilon from INITIAL_EPSILON to FINAL_EPSILON
FINAL_EPSILON = 0.05 # epsilon stops being annealed once it is no longer above this value
INITIAL_EPSILON = 1.0 # starting value of epsilon
REPLAY_MEM_SIZE = 590000 # size passed to ReplayMemory; presumably the max number of stored transitions (confirm in ReplayMemory.jl)
BATCHSIZE = 32 # size of minibatch; NOTE(review): trainNetwork samples with its own hard-coded size of 6
K = 1 # number of consecutive frames each selected action is applied for
TARGET_UPDATE_FREQ = 10 # target network weights are synced whenever t is a multiple of this

10

# ENV setup

In [41]:
# Scratch variables; none of these five are referenced by the functions
# defined later in the visible part of this notebook.
step = 1
done = false
r_tot = 0.0
nsteps = 10

obs = 0

# Shared environment; frame_step and runDQN read this global directly.
env = GymEnvironment("CartPole-v0")
# State returned by this first reset; trainNetwork reuses it as the state
# after every terminal step.
@show INITIAL_STATE = reset(env)
@show ACTIONS = n_actions(env)         # number of valid actions
# First entry of the observation dimensions, used as the network input width.
@show STATE_DIMS = obs_dimensions(env)[1];

INITIAL_STATE = reset(env) = [-0.0483421,-0.038216,0.0310807,-0.028438]
ACTIONS = n_actions(env) = 2
STATE_DIMS = (obs_dimensions(env))[1] = 4


[2016-11-28 02:54:45,118] Making new env: CartPole-v0


# Network

In [32]:
"""
    createNetwork(ACTIONS, input_dim, hidden_dim=2)

Build a two-layer network: a `tanh` hidden layer of width `hidden_dim`
followed by a linear readout with one output per action.

Returns a tuple `(s, readout, weights)` holding the input placeholder, the
readout tensor, and the list `[W1, b1, W2, b2]` of trainable variables.
"""
function createNetwork(ACTIONS, input_dim, hidden_dim=2)
    # trainable parameters, created in the order hidden layer -> readout layer
    hidden_weights = weight_variable([input_dim, hidden_dim])
    hidden_bias = bias_variable([hidden_dim])
    output_weights = weight_variable([hidden_dim, ACTIONS])
    output_bias = bias_variable([ACTIONS])

    # batch of input states; the first (batch) dimension is left unspecified
    state_input = placeholder(Float32, shape=[nothing, input_dim])

    # forward pass: tanh hidden activation, then a linear readout per action
    hidden = nn.tanh(state_input*hidden_weights + hidden_bias)
    q_values = hidden*output_weights + output_bias

    params = [hidden_weights, hidden_bias, output_weights, output_bias]
    return state_input, q_values, params
end



createNetwork (generic function with 2 methods)

# Frame step

In [42]:
# closure
"""
    frame_step(action)

Apply `action` to the global `env` and return `(state, reward, is_terminal)`
as reported by `step!`. When the step is terminal the environment is reset,
but the state returned is still the one produced by `step!`; the state
returned by `reset` is discarded.
"""
function frame_step(action)
    observation, reward, finished = step!(env, action)
    # no preprocessing is applied; the raw observation is used as the state
    if finished
        reset(env)
    end
    return observation, reward, finished
end



frame_step (generic function with 1 method)

# Unpack memory

In [23]:
"""
    unpack_memory(minibatch, BATCHSIZE)

Split the first `BATCHSIZE` transitions of `minibatch` into separate arrays.
Each transition is indexed positionally: 1 = state, 2 = action, 3 = reward,
4 = next state.

Returns `(states, actions, rewards, next_states)`, where the state arrays
are `BATCHSIZE x STATE_DIMS` `Float64` matrices (one transition per row),
`actions` is an `Int` vector and `rewards` a `Float64` vector.
"""
function unpack_memory(minibatch, BATCHSIZE)
    states = zeros(Float64, BATCHSIZE, STATE_DIMS)
    next_states = zeros(Float64, BATCHSIZE, STATE_DIMS)
    actions = zeros(Int, BATCHSIZE)
    rewards = zeros(Float64, BATCHSIZE)

    for row in 1:BATCHSIZE
        transition = minibatch[row]
        states[row, :] = transition[1]
        actions[row] = transition[2]
        rewards[row] = transition[3]
        next_states[row, :] = transition[4]
    end

    return states, actions, rewards, next_states
end



unpack_memory (generic function with 1 method)

# Train network

In [43]:
"""
    trainNetwork(frame_step, s, readout, wgts, s_target, readout_target, wgts_target, sess)

Run 1000 steps of epsilon-greedy interaction through `frame_step`, storing
every transition in a replay memory and, once `t > OBSERVE`, fitting the main
network (`s`, `readout`, `wgts`) to one-step targets computed with the target
network (`s_target`, `readout_target`, `wgts_target`).

# Arguments
- `frame_step`: function `action -> (state, reward, is_terminal)`.
- `s`, `readout`, `wgts`: input placeholder, Q-value tensor and variables of
  the network being trained.
- `s_target`, `readout_target`, `wgts_target`: the same three items for the
  target network, which is only changed by copying `wgts` into it.
- `sess`: TensorFlow session used for every `run` call.

Reads the globals `ACTIONS`, `ALPHA`, `GAMMA`, `OBSERVE`, `EXPLORE`,
`INITIAL_EPSILON`, `FINAL_EPSILON`, `REPLAY_MEM_SIZE`, `K`,
`TARGET_UPDATE_FREQ` and `INITIAL_STATE`.
"""
function trainNetwork(frame_step, s, readout, wgts, s_target, readout_target, wgts_target, sess)
    # index of the action taken for each sample (one-hot encoded below)
    a = placeholder(Int32, shape=[nothing])
    # scalar for r + gamma max_a' Q(s',a';theta_i^') from target
    y = placeholder(Float32, shape=[nothing])
    # mask the readout with the one-hot action to get Q(s,a;theta_i) from main
    readout_action = reduce_sum(readout.*one_hot(a, ACTIONS), reduction_indices=[2])
    # mean over the batch of [ (r + gamma max_a' Q(s',a';theta_i^'))  -  Q(s,a;theta_i) ]^2
    loss = reduce_mean((y - readout_action)^2)
    # use adam update rule; only the main network's variables are trained
    train_step = train.minimize(train.AdamOptimizer(ALPHA), loss, var_list=wgts)

    # store the previous observations in replay memory
    D = ReplayMemory(REPLAY_MEM_SIZE)

    # initialize state by taking action 0 once
    s_t, r_0, is_terminal = frame_step(0)

    # must initialize tf vars before accessing
    run(sess, initialize_all_variables())

    # start training
    epsilon = INITIAL_EPSILON
    t = 0
    while t < 1000
        ## choose an action epsilon greedily
        a_t = 0
        if rand() <= epsilon || t <= OBSERVE
            # random action in 0:ACTIONS-1
            a_t = (rand(UInt) % ACTIONS)
        else
            # readout_t = [Q(s,a;theta_i) for all a in ACTIONS], a 1 x ACTIONS array.
            # Running a single tensor returns the array itself, so it must not be
            # indexed with [1] (that would pick one scalar and make indmax always 1).
            readout_t = run(sess, readout, Dict(s=>s_t'))
            # indmax is 1-based; actions passed to frame_step are 0-based
            a_t = indmax(readout_t) - 1
        end

        # scale down epsilon
        if epsilon > FINAL_EPSILON && t > OBSERVE
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
        end

        # apply the same action for K consecutive frames
        for _=1:K
            # run the selected action and observe next state and reward
            s_t1, r_t, is_terminal = frame_step(a_t)

            # store the transition in D
            push_memory!(D, [s_t, a_t, r_t, s_t1, is_terminal])

            if is_terminal
                # NOTE(review): frame_step resets the environment but does not
                # return the new start state, so the state captured at setup
                # time is used in its place.
                s_t = INITIAL_STATE
            else
                # advance the current state; without this every decision and
                # every stored transition would start from the first state
                s_t = s_t1
            end
        end

        # only train if done observing
        if t > OBSERVE
            # sample a minibatch to train on
            # NOTE(review): hard-coded to 6; the global BATCHSIZE is not used here
            batchsize = 6
            minibatch = sample(D, batchsize)
            s_j_batch, a_batch, r_batch, s_j1_batch = unpack_memory(minibatch, batchsize)

            y_batch = Float64[]
            # readout_j1_batch[i, :] = [Q(s',a'; theta_i^') for all a in ACTIONS]
            readout_j1_batch = run(sess, readout_target, Dict(s_target=>s_j1_batch))
            for i=1:batchsize
                # minibatch[i][5] = is_terminal, if is_terminal, only expect reward
                if minibatch[i][5]
                    push!(y_batch, r_batch[i])
                # otherwise, add the discounted best Q-value of the next state:
                # the maximum over row i, not a single linearly indexed element
                else
                    push!(y_batch, r_batch[i] + GAMMA * maximum(readout_j1_batch[i, :]))
                end
            end

            ## perform gradient step
            dic = Dict(y=>y_batch, a=>a_batch, s=>s_j_batch)

            # run is where we compute [y - Q(s,a;theta_i)]^2 and do the update
            run(sess, train_step, dic)

            # update target weights to match main weights
            if t % TARGET_UPDATE_FREQ == 0
                # vars = (main, target): copy main into target
                run(sess, [assign(vars[2], vars[1]) for vars=zip(wgts, wgts_target)])
            end
        end

        t += 1
    end
end



trainNetwork (generic function with 1 method)

# RunDQN

In [44]:
"""
    runDQN(frame_step, k=0)

Record one DQN training run on the global `env`. A monitor is started in a
directory named `exp-<env.name>_<k>`, a main and a target network are built
with `createNetwork`, and `trainNetwork` is run on them.

The monitor is closed whether or not training succeeds; if anything throws
after the monitor was started, the exception is rethrown once the monitor
has been closed. Returns whatever `close_monitor(env)` returns.
"""
function runDQN(frame_step, k=0)
    start_monitor(env, string("exp-", env.name, "_", k));
    try
        reset(env) # reset the environment
        # create tf session
        sess = Session()
        # training DQN
        s, readout, wgts = createNetwork(ACTIONS, STATE_DIMS)
        # checkpoint DQN, only gets updated occasionally to preserve stability
        s_target, readout_target, wgts_target = createNetwork(ACTIONS, STATE_DIMS)
        trainNetwork(frame_step, s, readout, wgts, s_target, readout_target, wgts_target, sess)
    catch
        # do not leave the monitor (and its video recorder) open on failure
        close_monitor(env)
        rethrow()
    end
    return close_monitor(env)
end



runDQN (generic function with 2 methods)

# Tests

In [30]:
# Run counter; runDQN appends it to the monitor directory name.
k=0

0

In [45]:
# Launch one training run tagged with the current counter, then advance the
# counter for the next run.
runDQN(frame_step,k)
k+=1

[2016-11-28 02:55:22,677] Creating monitor directory exp-CartPole-v0_3
[2016-11-28 02:55:22,681] Starting new video recorder writing to /home/carol/Documents/dqn_julia/exp-CartPole-v0_3/openaigym.video.3.26010.video000000.mp4
[2016-11-28 02:55:34,251] Starting new video recorder writing to /home/carol/Documents/dqn_julia/exp-CartPole-v0_3/openaigym.video.3.26010.video000001.mp4
[2016-11-28 02:55:35,755] Starting new video recorder writing to /home/carol/Documents/dqn_julia/exp-CartPole-v0_3/openaigym.video.3.26010.video000008.mp4
[2016-11-28 02:55:40,130] Starting new video recorder writing to /home/carol/Documents/dqn_julia/exp-CartPole-v0_3/openaigym.video.3.26010.video000027.mp4
[2016-11-28 02:55:46,208] Starting new video recorder writing to /home/carol/Documents/dqn_julia/exp-CartPole-v0_3/openaigym.video.3.26010.video000064.mp4
[2016-11-28 02:55:48,939] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/carol/Documents/dqn_julia/exp-CartPole-v0_

4