In [7]:
import tensorflow as tf
from tensorflow.layers import dense, flatten
import gym
import numpy as np
from rewards import discount_rewards
from memory import MemoryBuffer

In [8]:
class RLNetwork():
    """
    Wrapper that attaches a policy-gradient training op and an action sampler
    to an existing TensorFlow 1.x graph.

    Only discrete action spaces are implemented. Selecting a continuous action
    space raises NotImplementedError.
    """

    def __init__(self, in_op, out_op, act_type='discrete', sess=None):
        """
        Create a wrapper for RL networks for easy training.
        Args:
            in_op (tf.Placeholder): Observation input to architecture
            out_op (tf.Tensor): Action output of architecture. For discrete
                                actions it is read as a [batch, n_actions]
                                tensor of action probabilities.
            act_type (string): 'discrete' (or 'd') for a discrete actions space or
                               'continuous' (or 'c') for a continuous actions space
            sess (tf.Session): A session if you would like to use a custom session,
                               if left none it will be automatically created
        Raises:
            ValueError: If act_type is not one of the accepted values.
            NotImplementedError: If a continuous action space is requested.
        """
        # Validate first so that a bad act_type never opens a session.
        if act_type in ('discrete', 'd'):
            self.act_type = 'discrete'
            make_trainer = self._create_discrete_trainer
        elif act_type in ('continuous', 'c'):
            self.act_type = 'continuous'
            make_trainer = self._create_continuous_trainer
        else:
            raise ValueError('act_type must be \'discrete\' or \'continuous\'')

        self.in_op = in_op
        self.out_op = out_op

        # Keep a caller-supplied session instead of silently dropping it.
        if sess is None:
            self.renew_sess()
        else:
            self.sess = sess

        # `train` is installed per instance and takes one `train_data` argument;
        # it shadows the class-level placeholder method of the same name.
        self.train = make_trainer()

    def renew_sess(self):
        """Replace self.sess with a newly created tf.Session."""
        self.sess = tf.Session()

    def end_sess(self):
        """Close the current session."""
        self.sess.close()

    def _create_discrete_trainer(self, optimizer=None):
        """
        Build the loss and update op for a discrete policy and return a
        training callable.

        Args:
            optimizer (tf.train.Optimizer): Optimizer used to minimise the
                loss. A new tf.train.AdamOptimizer is created when omitted.
        Returns:
            A function `update_func(train_data)` that runs one update step.
            `train_data` is indexed as a 2-D array whose columns are
            (observation, action, reward).
        """
        # Created per call: a default built in the signature would be evaluated
        # once at definition time and shared by every RLNetwork instance.
        if optimizer is None:
            optimizer = tf.train.AdamOptimizer()

        self.act_holders = tf.placeholder(tf.int64, shape=[None])
        self.reward_holders = tf.placeholder(tf.float64, shape=[None])

        # One-hot mask that selects the log-probability of the action taken.
        n_actions = self.out_op.shape[1].value
        self.act_masks = tf.one_hot(self.act_holders, n_actions, dtype=tf.float64)
        self.log_probs = tf.log(self.out_op)

        self.resp_acts = tf.reduce_sum(self.act_masks * self.log_probs, axis=1)
        # Negated because the optimizer minimises.
        self.loss = -tf.reduce_mean(self.resp_acts * self.reward_holders)

        self.optimizer = optimizer
        self.update = self.optimizer.minimize(self.loss)

        def update_func(train_data):
            return self.sess.run(self.update,
                                 feed_dict={self.in_op: np.vstack(train_data[:, 0]),
                                            self.act_holders: train_data[:, 1],
                                            self.reward_holders: train_data[:, 2]})

        # NOTE(review): this initialises every global variable in the graph,
        # which would also reset variables of a caller-supplied session.
        self.sess.run(tf.global_variables_initializer())

        return update_func

    def _create_continuous_trainer(self, optimizer=None):
        """
        Placeholder for a continuous-action trainer.

        The previous body was a copy of the discrete trainer that passed a
        float placeholder to tf.one_hot, which only accepts integer indices,
        so it could never build a working graph.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('Continuous action spaces are not implemented')

    def _gen_discrete_act(self, obs):
        """
        Sample one action index from the policy for a single observation.

        Args:
            obs: Batch of observations fed to in_op; only the first row of the
                 resulting probabilities is used.
        Returns:
            Index of the sampled action.
        """
        act_probs = self.sess.run(self.out_op, feed_dict={self.in_op: obs})
        probs = act_probs[0]
        # The number of choices is the number of actions, not the batch size.
        act = np.random.choice(np.arange(len(probs)), p=probs)

        return act

    def _gen_continuous_act(self, obs):
        """Continuous action sampling is not implemented."""
        raise NotImplementedError('Unimplemented')

    def gen_act(self, obs):
        """Return an action for `obs` using the sampler for self.act_type."""
        if self.act_type == 'discrete':
            return self._gen_discrete_act(obs)
        else:
            return self._gen_continuous_act(obs)

    def train(self, obs, rewards, acts):
        """
        Class-level fallback, reached only if __init__ did not install a
        per-instance trainer.
        """
        raise RuntimeError('The train method was not properly created')

In [3]:
def make_cart_pole():
    """Create and return a new gym environment for 'CartPole-v1'."""
    env_id = 'CartPole-v1'
    return gym.make(env_id)

def make_car_race():
    """Create and return a new gym environment for 'CarRacing-v0'."""
    env_id = 'CarRacing-v0'
    return gym.make(env_id)

In [23]:
# Observation input: a batch of 4-element float64 vectors.
obs = tf.placeholder(dtype=tf.float64, shape=[None, 4])

# Two hidden layers of 128 tanh units each.
dense1 = dense(inputs=obs, units=128, activation=tf.tanh)
dense2 = dense(inputs=dense1, units=128, activation=tf.tanh)

# `act_probs` holds the un-normalised 2-unit output; the softmax below turns
# it into the probabilities handed to RLNetwork.
act_probs = dense(inputs=dense2, units=2)
softmax_probs = tf.nn.softmax(logits=act_probs)

network = RLNetwork(in_op=obs, out_op=softmax_probs)

In [24]:
# Training schedule.
# Number of episodes the training loop iterates over.
n_episodes = 10000
# Upper bound on steps taken within a single episode.
max_steps = 200
# Number of episodes between training updates.
update_freq = 10

# Environment and rollout storage used by the training loop.
env = make_cart_pole()
# NOTE(review): MemoryBuffer is defined outside this notebook; the meaning of
# its argument is not visible here -- confirm against memory.py.
mb = MemoryBuffer(update_freq)

In [1]:
import time  # only used if the optional rendering described below is enabled

# Running totals across the whole training run.
total_steps = 0
total_episodes = 0
all_rewards = []

for episode in range(n_episodes):
    obs = env.reset()
    episode_reward = 0
    mb.start_rollout()

    for step in range(max_steps):
        # The network takes a batch, so the single observation is wrapped in a list.
        act = network.gen_act([obs])

        # To watch the agent, call env.render() and time.sleep(0.02) here.
        obs_next, rew, done, info = env.step(act)

        # Store the transition under the observation the action was chosen from.
        mb.record(obs, act, rew)
        episode_reward += rew
        total_steps += 1
        obs = obs_next

        if done:
            break

    all_rewards.append(episode_reward)
    total_episodes += 1

    # Report and train only once every `update_freq` episodes.
    if total_episodes % update_freq != 0:
        continue

    print('Recent Reward:', np.mean(all_rewards[-update_freq:]))
    print()

    train_data = mb.to_data()
    network.train(train_data)

NameError: name 'n_episodes' is not defined