In [1]:
import os
import sys
import logging
import time
import numpy as np
import tensorflow as tf
import gym
import scipy.signal
import os
import time
import inspect
from general import get_logger, Progbar, export_plot
from config import config
from test_env import EnvAdd, EnvAdd2



In [2]:
def build_mlp(
          mlp_input, 
          output_size,
          scope, 
          n_layers=config.n_layers, 
          size=config.layer_size, 
          output_activation=None):
  with tf.variable_scope(scope, reuse = False):
    cur_input = mlp_input
    #cur_input = tf.contrib.layers.fully_connected(cur_input, size)
    for i in range(n_layers):
      cur_input = tf.contrib.layers.fully_connected(cur_input,size)
    output = tf.contrib.layers.fully_connected(cur_input,output_size, activation_fn=output_activation)
  return output

class PG(object):
  def __init__(self, env, config, logger=None):
    
    if not os.path.exists(config.output_path):
      os.makedirs(config.output_path)
    self.config = config
    self.env = env
    self.logger = logger
    if logger is None:
      self.logger = get_logger(config.log_path)
    self.env = env
  
    self.discrete = True
    #self.observation_dim = self.env.observation_space.shape[0]
    self.observation_dim = 4
    #self.action_dim = self.env.action_space.n if self.discrete else self.env.action_space.shape[0]
    self.action_dim = 4
  
    self.lr = self.config.learning_rate
  
    # build model
    self.build()
  
  
  def add_placeholders_op(self):
    self.observation_placeholder = tf.placeholder(tf.float32,(None,self.observation_dim))
    if self.discrete:
      self.action_placeholder = tf.placeholder(tf.int32)
    else:
      self.action_placeholder = tf.placeholder(tf.float32)
  
    # Define a placeholder for advantages
    self.advantage_placeholder = tf.placeholder(tf.float32)

  
  
  def build_policy_network_op(self, scope = "policy_network"):

    if self.discrete:
      action_logits = build_mlp(mlp_input=self.observation_placeholder, output_size=self.action_dim, scope= scope) 
      self.sampled_action = tf.squeeze(tf.multinomial(action_logits,1),axis=[1])
      #self.sampled_action = tf.multinomial(action_logits,1)
      #action_taken = tf.one_hot(self.action_placeholder, depth=self.action_dim, axis=-1, dtype = tf.int32)
      cross_ent_neg = (-1)*tf.nn.sparse_softmax_cross_entropy_with_logits(logits= action_logits, labels= self.action_placeholder)
      #action_taken = tf.cast(action_taken,tf.float32)
      self.logprob = cross_ent_neg
    else:
      action_means =  build_mlp(self.observation_placeholder, self.action_dim, scope=scope)
      #with tf.variable_scope("log_std", reuse=tf.AUTO_REUSE):
      log_std = tf.get_variable("log_std",[self.action_dim])
      std = tf.exp(log_std)
      self.sampled_action = action_means+ std*tf.random_normal([self.action_dim])
      density = tf.contrib.distributions.MultivariateNormalDiag(loc=action_means, scale_diag=std)
      self.logprob = tf.log(density.prob(self.action_placeholder))
            
  
  
  def add_loss_op(self):

    self.loss = -tf.reduce_mean(tf.multiply(self.logprob,self.advantage_placeholder))

  
  
  def add_optimizer_op(self):

    optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
    self.train_op = optimizer.minimize(self.loss)

  
  
  def add_baseline_op(self, scope = "baseline"):

    self.baseline = build_mlp(self.observation_placeholder,1,scope=scope)
    self.baseline_target_placeholder = tf.placeholder(tf.float32)
    baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder, self.baseline)
    optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
    self.update_baseline_op = optimizer.minimize(baseline_loss)

  
  def build(self):

    # add placeholders
    self.add_placeholders_op()
    # create policy net
    self.build_policy_network_op()
    # add square loss
    self.add_loss_op()
    # add optmizer for the main networks
    self.add_optimizer_op()
  
    if self.config.use_baseline:
      self.add_baseline_op()
  
  def initialize(self):

    # create tf session
    self.sess = tf.Session()
    # tensorboard stuff
    self.add_summary()
    # initiliaze all variables
    init = tf.global_variables_initializer()
    self.sess.run(init)
  
  
  def add_summary(self):

    # extra placeholders to log stuff from python
    self.avg_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="avg_reward")
    self.max_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="max_reward")
    self.std_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="std_reward")
  
    self.eval_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="eval_reward")
  
    # extra summaries from python -> placeholders
    tf.summary.scalar("Avg Reward", self.avg_reward_placeholder)
    tf.summary.scalar("Max Reward", self.max_reward_placeholder)
    tf.summary.scalar("Std Reward", self.std_reward_placeholder)
    tf.summary.scalar("Eval Reward", self.eval_reward_placeholder)
            
    # logging
    self.merged = tf.summary.merge_all()
    self.file_writer = tf.summary.FileWriter(self.config.output_path,self.sess.graph) 

  def init_averages(self):

    self.avg_reward = 0.
    self.max_reward = 0.
    self.std_reward = 0.
    self.eval_reward = 0.
  

  def update_averages(self, rewards, scores_eval):

    self.avg_reward = np.mean(rewards)
    self.max_reward = np.max(rewards)
    self.std_reward = np.sqrt(np.var(rewards) / (len(rewards)+0.000000001))
  
    if len(scores_eval) > 0:
      self.eval_reward = scores_eval[-1]
  
  
  def record_summary(self, t):

    fd = {
      self.avg_reward_placeholder: self.avg_reward, 
      self.max_reward_placeholder: self.max_reward, 
      self.std_reward_placeholder: self.std_reward, 
      self.eval_reward_placeholder: self.eval_reward, 
    }
    summary = self.sess.run(self.merged, feed_dict=fd)
    # tensorboard stuff
    self.file_writer.add_summary(summary, t)
  
  
  def sample_path(self, env, num_episodes = None):

    episode = 0
    episode_rewards = []
    paths = []
    t = 0
  
    while (num_episodes or t < self.config.batch_size):
      state = env.reset()
      states, actions, rewards = [], [], []
      episode_reward = 0
  
      for step in range(self.config.max_ep_len):
        states.append(state)
        action = self.sess.run(self.sampled_action, feed_dict={self.observation_placeholder : states[-1][None]})[0]
        state, reward, done, info = env.step(action)
        actions.append(action)
        rewards.append(reward)
        episode_reward += reward
        t += 1
        if (done or step == self.config.max_ep_len-1):
          episode_rewards.append(episode_reward)  
          break
        if (not num_episodes) and t == self.config.batch_size:
          break
  
      path = {"observation" : np.array(states), 
                      "reward" : np.array(rewards), 
                      "action" : np.array(actions)}
      paths.append(path)
      episode += 1
      if num_episodes and episode >= num_episodes:
        break        
  
    return paths, episode_rewards
  
  
  def get_returns(self, paths):


    all_returns = []
    for path in paths:
      rewards = path["reward"]
      path_returns = np.zeros(len(rewards))
      T = len(rewards)
      for i in range(T):
        G_t = 0
        for j in range(T-i):
          G_t += np.power(self.config.gamma,j)*rewards[i+j]
        path_returns[i] = G_t
      all_returns.append(path_returns)
    returns = np.concatenate(all_returns)
  
    return returns
  
  
  def calculate_advantage(self, returns, observations):

    adv = returns
    if self.config.use_baseline:
      baseline = self.sess.run(self.baseline, feed_dict={self.observation_placeholder : observations})
      adv = adv - baseline
    if self.config.normalize_advantage:
      mean=np.mean(adv)
      std = np.std(adv)
      adv = (adv - mean)/(std+0.0000001)
    return adv
  
  
  def update_baseline(self, returns, observations):

    self.sess.run(self.update_baseline_op, feed_dict={self.observation_placeholder : observations, self.baseline_target_placeholder : returns})

  
  
  def train(self):

    last_eval = 0 
    last_record = 0
    scores_eval = []
    
    self.init_averages()
    scores_eval = [] # list of scores computed at iteration time
  
    for t in range(self.config.num_batches):
  
      # collect a minibatch of samples
      paths, total_rewards = self.sample_path(self.env) 
      scores_eval = scores_eval + total_rewards
      observations = np.concatenate([path["observation"] for path in paths])
      actions = np.concatenate([path["action"] for path in paths])
      rewards = np.concatenate([path["reward"] for path in paths])
      # compute Q-val estimates (discounted future returns) for each time step
      returns = self.get_returns(paths)
      advantages = self.calculate_advantage(returns, observations)

      # run training operations
      if self.config.use_baseline:
        self.update_baseline(returns, observations)
      self.sess.run(self.train_op, feed_dict={
                    self.observation_placeholder : observations, 
                    self.action_placeholder : actions, 
                    self.advantage_placeholder : advantages})
  
      # tf stuff
      if (t % self.config.summary_freq == 0):
        self.update_averages(total_rewards, scores_eval)
        #self.record_summary(t)

      # compute reward statistics for this batch and log
      avg_reward = np.mean(total_rewards)
      sigma_reward = np.sqrt(np.var(total_rewards) / (len(total_rewards)+0.0000001))
      msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward)
      self.logger.info(msg)
  
      if  self.config.record and (last_record > self.config.record_freq):
        self.logger.info("Recording...")
        last_record =0
        self.record()
  
    self.logger.info("- Training done.")
    export_plot(scores_eval, "Score", config.env_name, self.config.plot_output)


  def evaluate(self, env=None, num_episodes=1):
    """
    Evaluates the return for num_episodes episodes.
    Not used right now, all evaluation statistics are computed during training 
    episodes.
    """
    if env==None: env = self.env
    paths, rewards = self.sample_path(env, num_episodes)
    avg_reward = np.mean(rewards)
    sigma_reward = np.sqrt(np.var(rewards) / len(rewards))
    msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward)
    self.logger.info(msg)
    return avg_reward
     
  def produce_example(self):
    env = self.env
    state = env.reset()
    states = []
    for step in range(self.config.max_ep_len):
      states.append(state)
      print("At step "+ str(step)+ " the state is :" + str(state))
      action = self.sess.run(self.sampled_action, feed_dict={self.observation_placeholder : states[-1][None]})[0]
      actions = ["Done","Give A", "Give B", "Count on"]
      print("the action is " + actions[action])
      state, reward, done, info = env.step(action)
      if (done or step == self.config.max_ep_len-1):
        break

  

  def run(self):
        
    self.initialize()

    self.train()

  def change_env(self,env):
    self.env=env

In [3]:
tf.reset_default_graph()

In [4]:
env = EnvAdd(1,2)
# train model
model = PG(env, config)
model.run()
model.produce_example()

INFO:tensorflow:Summary name Avg Reward is illegal; using Avg_Reward instead.


[2018-03-20 16:55:56,894] Summary name Avg Reward is illegal; using Avg_Reward instead.


INFO:tensorflow:Summary name Max Reward is illegal; using Max_Reward instead.


[2018-03-20 16:55:56,902] Summary name Max Reward is illegal; using Max_Reward instead.


INFO:tensorflow:Summary name Std Reward is illegal; using Std_Reward instead.


[2018-03-20 16:55:56,908] Summary name Std Reward is illegal; using Std_Reward instead.


INFO:tensorflow:Summary name Eval Reward is illegal; using Eval_Reward instead.


[2018-03-20 16:55:56,913] Summary name Eval Reward is illegal; using Eval_Reward instead.
[2018-03-20 16:55:58,978] Average reward: 1.14 +/- 0.39
[2018-03-20 16:55:59,313] Average reward: 3.06 +/- 0.47
[2018-03-20 16:55:59,669] Average reward: 3.65 +/- 0.42
[2018-03-20 16:56:00,027] Average reward: 2.50 +/- 0.33
[2018-03-20 16:56:00,398] Average reward: 3.00 +/- 0.36
[2018-03-20 16:56:00,748] Average reward: 3.98 +/- 0.43
[2018-03-20 16:56:01,091] Average reward: 4.25 +/- 0.44
[2018-03-20 16:56:01,433] Average reward: 5.90 +/- 0.52
[2018-03-20 16:56:01,764] Average reward: 5.97 +/- 0.52
[2018-03-20 16:56:02,103] Average reward: 6.58 +/- 0.54
[2018-03-20 16:56:02,445] Average reward: 7.61 +/- 0.56
[2018-03-20 16:56:02,797] Average reward: 6.88 +/- 0.56
[2018-03-20 16:56:03,141] Average reward: 9.02 +/- 0.59
[2018-03-20 16:56:03,491] Average reward: 10.32 +/- 0.60
[2018-03-20 16:56:03,828] Average reward: 10.66 +/- 0.61
[2018-03-20 16:56:04,164] Average reward: 10.46 +/- 0.59
[2018-03-20

At step 0 the state is :[ 1.  2.  0.  0.]
the action is Count on
At step 1 the state is :[ 1.  2.  1.  1.]
the action is Count on
At step 2 the state is :[ 1.  2.  2.  2.]
the action is Count on
At step 3 the state is :[ 1.  2.  3.  3.]
the action is Done


In [11]:
model.produce_example()

At step 0 the state is :[ 1.  2.  0.  0.]
the action is Count on
At step 1 the state is :[ 1.  2.  1.  1.]
the action is Count on
At step 2 the state is :[ 1.  2.  2.  2.]
the action is Count on
At step 3 the state is :[ 1.  2.  3.  3.]
the action is Done


In [61]:
model.change_env(EnvAdd(5,1))

In [112]:
model.train()

[2018-03-20 16:32:06,950] Average reward: 12.08 +/- 0.80
[2018-03-20 16:32:07,283] Average reward: 13.02 +/- 0.78
[2018-03-20 16:32:07,620] Average reward: 15.31 +/- 0.70
[2018-03-20 16:32:07,952] Average reward: 9.87 +/- 0.81
[2018-03-20 16:32:08,298] Average reward: 10.59 +/- 0.81
[2018-03-20 16:32:08,663] Average reward: 12.80 +/- 0.78
[2018-03-20 16:32:09,067] Average reward: 14.86 +/- 0.73
[2018-03-20 16:32:09,438] Average reward: 14.31 +/- 0.75
[2018-03-20 16:32:09,790] Average reward: 9.93 +/- 0.81
[2018-03-20 16:32:10,130] Average reward: 8.96 +/- 0.80
[2018-03-20 16:32:10,458] Average reward: 9.93 +/- 0.81
[2018-03-20 16:32:10,779] Average reward: 13.15 +/- 0.78
[2018-03-20 16:32:11,124] Average reward: 12.05 +/- 0.80
[2018-03-20 16:32:11,471] Average reward: 14.42 +/- 0.74
[2018-03-20 16:32:11,812] Average reward: 14.76 +/- 0.73
[2018-03-20 16:32:12,156] Average reward: 15.62 +/- 0.68
[2018-03-20 16:32:12,481] Average reward: 12.80 +/- 0.78
[2018-03-20 16:32:12,821] Average r

In [113]:
model.produce_example()

At step 0 the state is :[ 5.  1.  0.  0.]
the action is Count on
At step 1 the state is :[ 5.  1.  1.  1.]
the action is Count on
At step 2 the state is :[ 5.  1.  2.  2.]
the action is Count on
At step 3 the state is :[ 5.  1.  3.  3.]
the action is Count on
At step 4 the state is :[ 5.  1.  4.  4.]
the action is Count on
At step 5 the state is :[ 5.  1.  5.  5.]
the action is Count on
At step 6 the state is :[ 5.  1.  6.  6.]
the action is Done


In [80]:
model.change_env(EnvAdd(1,3))

In [90]:
model.produce_example()

At step 0 the state is :[ 1.  3.  0.  0.]
the action is Count on
At step 1 the state is :[ 1.  3.  1.  1.]
the action is Count on
At step 2 the state is :[ 1.  3.  2.  2.]
the action is Count on
At step 3 the state is :[ 1.  3.  3.  3.]
the action is Done


In [73]:
model.change_env(EnvAdd2(1,4))

In [91]:
model.produce_example()

At step 0 the state is :[ 1.  4.  0.  0.]
the action is Count on
At step 1 the state is :[ 1.  4.  1.  1.]
the action is Count on
At step 2 the state is :[ 1.  4.  2.  2.]
the action is Count on
At step 3 the state is :[ 1.  4.  3.  3.]
the action is Count on
At step 4 the state is :[ 1.  4.  4.  4.]
the action is Count on
At step 5 the state is :[ 1.  4.  5.  5.]
the action is Done
