In [13]:
%matplotlib inline

import gym
import itertools
import matplotlib
import numpy as np
import sys
import tensorflow as tf
import os
import collections
import multiprocessing
import threading
import time
import shutil
import unittest

# Make the parent directory and the local a3c directory importable.
# Each entry is checked on its own: gating both appends on "../" alone means
# "./a3c" is never added when "../" is already on the path (e.g. left over
# from an earlier run in the same kernel), and re-running the cell must not
# create duplicate entries.
for extra_path in ("../", "./a3c"):
    if extra_path not in sys.path:
        sys.path.append(extra_path)
from lib.envs.cliff_walking import CliffWalkingEnv
from lib.atari.state_processor import StateProcessor
from lib.atari import helpers as atari_helpers
from lib import plotting

matplotlib.style.use('ggplot')

In [None]:
# Scratch check: register a scalar summary from inside a variable scope and
# then list the names of the ops found in the SUMMARIES collection.
#### tf.reset_default_graph()

with tf.variable_scope("bb"):
    tf.scalar_summary("aa", tf.constant(5.0))

# Fetch everything stored under tf.GraphKeys.SUMMARIES
summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
# Last expression of the cell: the collected op names are shown as the cell output
[s.name for s in summary_ops]


In [3]:
def make_env():
    """Build and return a new "Breakout-v0" gym environment."""
    breakout_env = gym.envs.make("Breakout-v0")
    return breakout_env

# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions
VALID_ACTIONS = [0, 1, 2, 3]

# observation_space = make_env().observation_space
# action_space = make_env().action_space

In [4]:
# Module-level environment instance created through make_env()
env = make_env()

[2016-11-01 15:47:29,020] Making new env: Breakout-v0


In [9]:
from a3c.worker import make_copy_params_op

class PolicyMonitor(object):
    """Evaluates the global policy network by playing complete episodes.

    A separate PolicyEstimator is created under the "policy_eval" variable
    scope. Before every evaluation episode the trainable variables of the
    "global" scope are copied into it, so the evaluation runs on a snapshot
    of the current parameters.

    Args:
        env: Environment to evaluate on; must provide reset() and step(action).
        policy_net: The global policy estimator. Only `num_outputs` is read
            from it to size the local copy.
        summary_writer: Summary writer that receives the episode statistics;
            its log directory also hosts the "videos" sub-directory.
    """

    def __init__(self, env, policy_net, summary_writer):
        self.env = env
        self.global_policy_net = policy_net
        self.summary_writer = summary_writer
        self.sp = StateProcessor()

        # Local copy of the policy network used only for evaluation
        with tf.variable_scope("policy_eval"):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)

        self.video_dir = os.path.join(summary_writer.get_logdir(), "videos")
        # exist_ok: creating a second monitor (or re-running the cell) must not
        # fail just because the directory is already there
        os.makedirs(self.video_dir, exist_ok=True)

        # Op to copy params from global policy/valuenets
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    def _policy_net_predict(self, state, sess):
        """Return the "probs" prediction of the local policy net for one state.

        The state is fed as a batch of size one and the first (only) row of
        the result is returned.
        """
        feed_dict = { self.policy_net.states: [state] }
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def eval_once(self, sess):
        """Copy the global parameters, play one episode and log its statistics.

        Args:
            sess: Session used to run the copy op and the policy network.

        Returns:
            A tuple (total_reward, episode_length) for the played episode.
        """
        # Copy params to local model
        global_step, _ = sess.run([tf.contrib.framework.get_global_step(), self.copy_params_op])

        # Run an episode
        done = False
        state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
        total_reward = 0.0
        episode_length = 0

        while not done:
            # Use the local `state`; the monitor has no `self.state` attribute
            action_probs = self._policy_net_predict(state, sess)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = self.env.step(action)
            total_reward += reward
            episode_length += 1
            # NOTE(review): next_state is the raw value returned by env.step, while
            # the initial state went through self.sp.process and
            # atari_make_initial_state. The follow-up states presumably need the
            # same preprocessing — confirm against lib.atari.helpers.
            state = next_state

        # Add summary
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=total_reward, tag="total_reward")
        episode_summary.value.add(simple_value=episode_length, tag="episode_length")
        self.summary_writer.add_summary(episode_summary, global_step)
        self.summary_writer.flush()

        return total_reward, episode_length

    def continuous_eval(self, eval_every, sess):
        """Run eval_once in an endless loop, sleeping `eval_every` seconds in between.

        This method never returns on its own.
        """
        while True:
            self.eval_once(sess)
            # Sleep until next evaluation cycle
            time.sleep(eval_every)

ImportError: No module named 'estimators'

In [12]:
# Display the log directory reported by the summary writer
summary_writer.get_logdir()

'/tmp/a3c'

In [10]:
# Value handed to every Worker.run call as its last argument
T_MAX = 7
# Number of Worker objects (and threads) to create
NUM_WORKERS = 3

# Model Directory (wiped and recreated on every run of this cell)
model_dir = "/tmp/a3c"
shutil.rmtree(model_dir, ignore_errors=True)
os.makedirs(model_dir)
# Bound to `summary_writer` because that is the name passed to each Worker
# below; binding it to any other name leaves `summary_writer` undefined on a
# fresh kernel.
summary_writer = tf.train.SummaryWriter(model_dir)

tf.reset_default_graph()

global_step = tf.Variable(0, name="global_step", trainable=False)

# NOTE(review): ValueEstimator, PolicyEstimator and Worker are not imported
# in the visible part of this notebook (the recorded output of this cell is a
# NameError for ValueEstimator) — add the import once the module path is
# confirmed.
with tf.variable_scope("global") as vs:
    value_net = ValueEstimator()
    policy_net = PolicyEstimator(reuse=True)

# Counter object shared by all workers
global_counter = itertools.count()

workers = []
for worker_id in range(NUM_WORKERS):
    worker = Worker(
        name="worker_{}".format(worker_id),
        policy_net=policy_net,
        value_net=value_net,
        global_counter=global_counter,
        discount_factor = 0.99,
        summary_writer=summary_writer)
    workers.append(worker)


with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())

    coord = tf.train.Coordinator()

    # Start workers
    worker_threads = []
    for worker in workers:
        # Hand the bound method and its arguments to the thread directly.
        # A `lambda: worker.run(...)` looks up `worker` only when the thread
        # actually runs, so several threads could end up driving the same
        # (last) worker.
        t = threading.Thread(target=worker.run, args=(sess, coord, T_MAX))
        t.start()
        worker_threads.append(t)

    # Start update
    # updater.run(sess, coord, MAX_UPDATE_STEPS)

    # Wait for all workers to finish
    coord.join(worker_threads)

    # Close the queues
    # policy_net_batcher.queue.close()
    # value_net_batcher.queue.close()

    # update_fn = lambda: updater.run(sess, coord, MAX_UPDATE_STEPS)
    # threading.Thread(target=update_fn).start()

    

NameError: name 'ValueEstimator' is not defined

In [None]:
# def actor_critic(env,
#                  sess,
#                  policy_net,
#                  policy_net_batcher,
#                  value_net,
#                  value_net_batcher,
#                  num_episodes,
#                  discount_factor=0.99):
#     """
#     Q-Learning algorithm for off-policy TD control using Function Approximation.
#     Finds the optimal greedy policy while following an epsilon-greedy policy.
    
#     Args:
#         env: OpenAI environment.
#         estimator: Action-Value function estimator
#         num_episodes: Number of episodes to run for.
#         discount_factor: Lambda time discount factor.
#         epsilon: Chance to sample a random action. Float between 0 and 1.
#         epsilon_decay: Each episode, epsilon is decayed by this factor
    
#     Returns:
#         An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
#     """

#     # Keeps track of useful statistics
#     stats = plotting.EpisodeStats(
#         episode_lengths=np.zeros(num_episodes),
#         episode_rewards=np.zeros(num_episodes))    
    
#     Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    
#     for i_episode in range(num_episodes):
#         # print("Starting episode {}".format(i_episode))
#         # sys.stdout.flush()
        
#         # Reset the environment and pick the first action
#         state = env.reset()
        
#         episode = []
        
#         # One step in the environment
#         for t in itertools.count():
            
#             # Take a step
#             action_probs = policy_net.predict(state, sess=sess)
#             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
#             next_state, reward, done, _ = env.step(action)
            
#             # Keep track of the transition
# #             episode.append(Transition(
# #               state=state, action=action, reward=reward, next_state=next_state, done=done))
            
#             # Update statistics
#             stats.episode_rewards[i_episode] += reward
#             stats.episode_lengths[i_episode] = t
            
#             # Calculate TD Target
#             value_next = value_net.predict(next_state, sess=sess)
#             td_target = reward + discount_factor * value_next
#             td_error = td_target - value_net.predict(state, sess=sess)
            
#             # Update the value estimator
#             feed_dict = { value_net.states: [state], value_net.targets: [td_target] }
#             value_net_batcher.add_grads(feed_dict, sess=sess)
            
#             # Update the policy estimator
#             # using the td error as our advantage estimate
#             feed_dict = { policy_net.states: [state], policy_net.targets: [td_error], policy_net.actions: [action]  }
#             policy_net_batcher.add_grads(feed_dict, sess=sess)
            
#             if done:
#                 break
                
#             state = next_state
            
#     return stats

In [None]:
def test_policy(env, sess, coord, estimator_policy, test_every):
    global_step = tf.contrib.framework.get_global_step(graph=sess.graph)
    try:
        while not coord.should_stop():
            time.sleep(test_every)
            state = env.reset()
            done = False
            total_reward = 0.0
            num_steps = 0
            while not done:
                action_probs = estimator_policy.predict(state, sess=sess)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                state, reward, done, _ = env.step(action)
                num_steps += 1
                total_reward += reward
            
            global_step_ = sess.run(global_step)
            print("{}: Epsiode Finished. length: {}, reward: {}".format(global_step_, num_steps, total_reward))
            sys.stdout.flush()
            
    except tf.errors.CancelledError:
        return

In [None]:
# tf.reset_default_graph()

# global_step = tf.Variable(0, name="global_step", trainable=False)

# policy_net = PolicyEstimator()
# policy_net_batcher = GradientBatcher(policy_net.optimizer, policy_net.loss, "policy")

# value_net = ValueEstimator()
# value_net_batcher = GradientBatcher(value_net.optimizer, value_net.loss, "value")

# NOTE(review): actor_critic, policy_net, value_net, policy_net_batcher and
# value_net_batcher are only defined in commented-out code in the visible part
# of this notebook, so this cell cannot run until those definitions are restored.
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())

    # threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    def gradient_worker_fn(coord):
        """Apply batches of 8 policy and value gradients until `coord` requests a stop.

        A tf.errors.CancelledError ends the loop silently; presumably it is
        raised when the batcher queues are closed — TODO confirm.
        """
        try:
            while not coord.should_stop():
                policy_net_batcher.apply_grads_batch(8, sess=sess)
                value_net_batcher.apply_grads_batch(8, sess=sess)
        except tf.errors.CancelledError:
            return

    # Start a new worker thread for each CPU
    worker_coord = tf.train.Coordinator()
    worker_fn = lambda: actor_critic(make_env(), sess, policy_net, policy_net_batcher, value_net, value_net_batcher, 100)
    worker_threads = []
    for _ in range(multiprocessing.cpu_count()):
        t = threading.Thread(target=worker_fn)
        t.start()
        worker_threads.append(t)

    # Start a thread that applies gradients
    gradient_thread = threading.Thread(target=lambda: gradient_worker_fn(worker_coord))
    gradient_thread.start()

    # Start a thread that evaluates
    eval_thread = threading.Thread(target=lambda: test_policy(make_env(), sess, worker_coord, policy_net, 1))
    eval_thread.start()

    # Wait for all workers to finish
    worker_coord.join(worker_threads)

    # Tell the gradient and evaluation loops to exit. Both only poll
    # worker_coord.should_stop(), so without this request they keep running
    # after the workers are done and outlive the session.
    worker_coord.request_stop()

    # Close the queues
    # NOTE(review): if `queue` is a TensorFlow queue, close() only builds an op
    # that still has to be run in the session — confirm against GradientBatcher.
    policy_net_batcher.queue.close()
    value_net_batcher.queue.close()

    # Let the evaluation thread finish its current episode while the session
    # is still open; it leaves its loop at the next should_stop() check.
    eval_thread.join()

In [None]:
# plotting.plot_episode_stats(stats, smoothing_window=10)