## Trust Region Fitted-Q Iteration

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.environ["CGT_COMPAT_MODE"] = "cgt"
#os.environ["THEANO_FLAGS"] = "device=gpu"
from sampler import parallel_sampler
parallel_sampler.reset()
parallel_sampler.init_pool(1)
from mdp import AtariMDP
from policy import EpsilonGreedyPolicy
from qfunc import AtariRAMQFunction
import cPickle as pickle

Couldn't import dot_parser, loading of dot files will not be possible.
Using CGT for CGT compatibility mode


Using gpu device 0: GeForce GTX 770 (CNMeM is disabled)


In [7]:
import numpy as np
import tensorfuse as theano
import tensorfuse.tensor as T
import scipy
from misc.tensor_utils import flatten_tensors
import sys

In [18]:
def setup_experiment():
    mdp = AtariMDP(rom_path="vendor/atari_roms/pong.bin", obs_type="ram")
    qfunc = AtariRAMQFunction(mdp, hidden_sizes=[512, 256, 128])
    eps_policy = EpsilonGreedyPolicy(qfunc, epsilon=1)
    parallel_sampler.populate_task(mdp, eps_policy)
    return mdp, qfunc, eps_policy

In [16]:
def request_samples(eps_policy, epsilon, n_samples):
    eps_policy.epsilon = epsilon
    return parallel_sampler.request_samples(eps_policy.get_param_values(), n_samples)

Trust region formulation:

At each iteration, we solve the optimization problem

\begin{eqnarray*}
\min &&\frac{1}{N} \sum_{n=1}^N \left[Q_\theta(s_n, a_n) - y_n \right]^2 \cr
s.t. && \frac{1}{N} \sum_{n=1}^N \frac{\left[Q_\theta(s_n, a_n) - Q_{\theta_{old}}(s_n, a_n)\right]^2}{2\sigma^2}  < \delta
\end{eqnarray*}

where $y_n = r_n + \gamma\max_{a'} Q_{\theta_{old}}(s'_{n}, a')$ and $\sigma^2 = \frac{1}{N} \sum_{n=1}^N \left[Q_{\theta_{old}}(s_n, a_n) - y_n\right]^2$.

In [19]:
np.random.seed(0)
def new_train_vars(qfunc):
    obs = qfunc.input_var
    actions = T.ivector("actions")
    rewards = T.vector("rewards")
    terminate = T.vector("terminate")
    penalty = T.scalar("penalty")
    prev_qval = T.matrix("prev_qval")
    return dict(
        obs=obs,
        actions=actions,
        rewards=rewards,
        terminate=terminate,
        prev_qval=prev_qval,
        penalty=penalty,
    )

def to_train_var_list(obs, actions, rewards, terminate, prev_qval, penalty):
    return [obs, actions, rewards, terminate, prev_qval, penalty]

def new_loss(qfunc, discount, obs, actions, rewards, terminate, prev_qval, penalty):
    qval = qfunc.qval_var
    N = obs.shape[0]
    qsa = qval[T.arange(N), actions]
    y = rewards + (1 - terminate) * discount * T.concatenate([T.max(prev_qval[1:], axis=1), np.array([0.0]).astype(theano.config.floatX)])
    prev_qsa = prev_qval[T.arange(N), actions]
    
    loss = T.mean(T.square(qsa - y))
    sigmasq = T.mean(T.square(prev_qsa - y))
    reg = T.mean(T.square(qsa - prev_qsa))
    reg_normalized = reg / (2*T.square(sigmasq))
    reg_loss = loss + penalty * reg
    return dict(
        loss=loss,
        reg=reg_normalized,
        reg_loss=reg_loss,
        sigmasq=sigmasq
    )
    

samples_per_itr = 10000
max_epsilon = 1
min_epsilon = 0.1
epsilon_decay_range = 20
discount = 0.99

mdp, qfunc, eps_policy = setup_experiment()

train_vars = new_train_vars(qfunc)
result_vars = new_loss(qfunc, discount, **train_vars)
reg_loss_var = result_vars["reg_loss"]
loss_var = result_vars["loss"]
reg_var = result_vars["reg"]

grads_var = T.grad(reg_loss_var, qfunc.params)

train_var_list = to_train_var_list(**train_vars)
print "compiling functions..."
f_loss = theano.function(train_var_list, [loss_var, reg_var, reg_loss_var], allow_input_downcast=True, on_unused_input='ignore')
f_grads = theano.function(train_var_list, grads_var, allow_input_downcast=True, on_unused_input='ignore')

def evaluate_loss(train_vals):
    def evaluate(params):
        qfunc.set_param_values(params)
        loss, reg, reg_loss = f_loss(*train_vals)
        return reg_loss.astype(np.float64)
    return evaluate

def evaluate_grad(train_vals):
    def evaluate(params):
        qfunc.set_param_values(params)
        grad = f_grads(*train_vals)
        flattened_grad = flatten_tensors(map(np.asarray, grad))
        return flattened_grad.astype(np.float64)
    return evaluate

for itr in range(100):
    cur_epsilon = max(min_epsilon, max_epsilon - (max_epsilon - min_epsilon) * itr / epsilon_decay_range)
    print "epsilon: ", cur_epsilon
    paths = request_samples(eps_policy, cur_epsilon, samples_per_itr)

    print "forming samples..."
    observations = np.vstack([path["observations"][:-1] for path in paths])
    actions = np.concatenate([path["actions"].reshape(-1) for path in paths])
    rewards = np.concatenate([path["rewards"].reshape(-1) for path in paths])
    terminate = np.concatenate([np.append(np.zeros(len(path["rewards"]) - 1), 1) for path in paths]).astype(int)

    prev_qval = qfunc.compute_qval(observations)
    train_vals = [observations, actions, rewards, terminate, prev_qval]
    
    loss_before, reg_before, _ = f_loss(*(train_vals + [0]))

    cur_params = qfunc.get_param_values()
    
    print "optimizing..."
    for penalty in [0]:
        result = scipy.optimize.fmin_l_bfgs_b(
            func=evaluate_loss(train_vals + [penalty]),
            x0=qfunc.get_param_values(),
            fprime=evaluate_grad(train_vals + [penalty]),
            maxiter=20
        )

        loss_after, reg_after, _ = f_loss(*(train_vals + [penalty]))
        print "penalty:", penalty
        print "loss before:", loss_before, "after:", loss_after
        print "reg before:", reg_before, "after:", reg_after
        
    sys.stdout.flush()
        
    opt_params = qfunc.get_param_values()
    
    # test performance
    test_paths = request_samples(eps_policy, 0, samples_per_itr)
    avg_reward = np.mean([sum(path["rewards"]) for path in test_paths])
    print "EpRewMean:", avg_reward
    sys.stdout.flush()

0%                          100%
[##############################] | ETA[sec]: 0.000 

compiling functions...
epsilon:  1.0
forming samples...
optimizing...
penalty: 0
loss before: 0.215750336647 after: 0.0261024385691
reg before: 0.0 after: 2.03084945679



Total time elapsed: 10.244 sec
0%                          100%
[##############################] | ETA[sec]: 0.000 

EpRewMean: -19.5714285714



Total time elapsed: 10.275 sec
0%                          100%
[##############################] | ETA[sec]: 0.000 

epsilon:  0.955
forming samples...
optimizing...
penalty: 0
loss before: 0.0236016716808 after: 0.0216105543077
reg before: 0.0 after: 2.0114300251



Total time elapsed: 10.271 sec
0%                          100%
[##############################] | ETA[sec]: 0.000 

EpRewMean: -19.6153846154



Total time elapsed: 10.279 sec
0%                          100%
[##############################] | ETA[sec]: 0.000 

epsilon:  0.91
forming samples...
optimizing...
penalty: 0
loss before: 0.0221971087158 after: 0.0209395941347
reg before: 0.0 after: 1.39678692818



Total time elapsed: 10.227 sec
0%                          100%
[##############################] | ETA[sec]: 0.000 

EpRewMean: -20.5833333333



Total time elapsed: 10.229 sec
0%                          100%
[##############################] | ETA[sec]: 0.000 

epsilon:  0.865
forming samples...
optimizing...



Total time elapsed: 10.282 sec


KeyboardInterrupt: 

In [None]:
print itr

In [None]:
# discount = 0.99
# obs_var = qfunc.input_var
# N = obs_var.shape[0]
# actions_var = T.ivector("actions")
# rewards_var = T.vector("rewards")
# terminate_var = T.ivector("terminate")
# qval_var = qfunc.qval_var
                  
# qa = qval_var[T.arange(N), actions_var]

# ys = rewards_var + discount * T.concatenate([qa[1:], np.array([0])])
# loss = T.sum(T.square((1 - terminate_var) * (qa - ys))) / T.sum(1 - terminate_var)


# all_vars = [obs_var, actions_var, rewards_var, terminate_var]
# all_vals = [batch_observations, batch_actions, batch_rewards, batch_terminate]

# grads = T.grad(loss, qfunc.params)


# f_loss = theano.function(all_vars, loss, allow_input_downcast=True, on_unused_input='ignore')
# f_grads = theano.function(all_vars, grads, allow_input_downcast=True, on_unused_input='ignore')
# print f_loss(*all_vals)


# def evaluate_cost(params):
#     qfunc.set_param_values(params)
#     return f_loss(*all_vals).astype(np.float64)

# def evaluate_grad(params):
#     qfunc.set_param_values(params)
#     grads = f_grads(*all_vals)
#     flattened_grads = flatten_tensors(map(np.asarray, grads))
#     return flattened_grads.astype(np.float64)


# result = scipy.optimize.fmin_l_bfgs_b(func=evaluate_cost, x0=qfunc.get_param_values(), fprime=evaluate_grad, maxiter=20)
# opt_params = qfunc.get_param_values()

In [None]:
result

For each iteration, we sample a bunch of trajectories, and fit the q function according to the sampled trajectories