In [1]:
import tensorflow as tf
from tensorflow.layers import dense
import gym
import numpy as np

In [2]:
def make_cart_pole():
    """Build and return a new Gym environment for the ``CartPole-v1`` task."""
    env_id = "CartPole-v1"
    return gym.make(env_id)

def discount_rewards(rewards, gamma=0.99):
    """Convert per-step rewards into discounted returns-to-go.

    Element ``i`` of the result is
    ``rewards[i] + gamma * rewards[i+1] + gamma**2 * rewards[i+2] + ...``.

    Args:
        rewards: Sequence (list or 1-D array) of per-step rewards, in time order.
        gamma: Discount factor applied once per step into the future.

    Returns:
        A new list with the same length as ``rewards``; ``[]`` for empty input.
    """
    # `len(...) == 0` rather than truthiness so NumPy arrays are accepted too.
    if len(rewards) == 0:
        return []

    # Accumulate from the final step backwards, then flip into time order.
    new_rewards = [rewards[-1]]
    for i in reversed(range(len(rewards) - 1)):
        new_rewards.append(rewards[i] + gamma * new_rewards[-1])
    return new_rewards[::-1]

class MemoryBuffer():
    """Fixed-capacity ring buffer of rollouts.

    Each rollout is a list of ``[obs, act, rew]`` steps. At most ``max_size``
    rollouts are kept; once every slot is used, starting a new rollout
    overwrites the slot that comes next in ring order.
    """

    def __init__(self, max_size):
        """Create an empty buffer holding up to ``max_size`` rollouts."""
        self.max_size = max_size
        self.rollouts = []
        # -1 means "no rollout started yet"; start_rollout() advances it to 0.
        self.rollout_idx = -1

    def start_rollout(self):
        """Advance to the next slot (wrapping at ``max_size``) and clear it."""
        self.rollout_idx = (self.rollout_idx + 1) % self.max_size
        if self.rollout_idx >= len(self.rollouts):
            self.rollouts.append([])
        else:
            self.rollouts[self.rollout_idx] = []

    def end_rollout(self):
        """Finish the current rollout.

        NOTE: this simply calls ``start_rollout()``, so it leaves a fresh,
        empty rollout in the next slot. ``to_data()`` skips empty rollouts.
        """
        self.start_rollout()

    def record(self, obs, act, rew):
        """Append one ``[obs, act, rew]`` step to the current rollout.

        Raises:
            IndexError: if no rollout has been started yet.
        """
        if not self.rollouts:
            raise IndexError("record() called before start_rollout()")
        self.rollouts[self.rollout_idx].append([obs, act, rew])

    def to_data(self, reset=True):
        """Flatten all stored rollouts into one ``(n_steps, 3)`` object array.

        Columns are ``[obs, act, discounted_return]``; rewards are discounted
        independently within each rollout via ``discount_rewards``. Empty
        rollouts are skipped instead of causing the whole call to give up.

        Args:
            reset: When true (the default) the buffer is emptied afterwards.

        Returns:
            An object-dtype array of shape ``(n_steps, 3)``, or an empty 1-D
            array when no steps have been recorded.
        """
        per_rollout = []
        for rollout in self.rollouts:
            if len(rollout) == 0:
                continue

            # Fill an explicit object array: observations are themselves
            # arrays, so letting NumPy infer the shape of the ragged rows is
            # unreliable (and rejected outright by recent NumPy versions).
            steps = np.empty((len(rollout), 3), dtype=object)
            for row, (obs, act, rew) in enumerate(rollout):
                steps[row, 0] = obs
                steps[row, 1] = act
                steps[row, 2] = rew

            # Discount the rewards for every rollout
            steps[:, 2] = discount_rewards(steps[:, 2])
            per_rollout.append(steps)

        if reset:
            self.reset()

        if not per_rollout:
            return np.array([])
        return np.concatenate(per_rollout, axis=0)

    def reset(self):
        """Drop every stored rollout and return to the initial state."""
        self.rollouts = []
        self.rollout_idx = -1

In [3]:
class Network():
    """Softmax policy network trained with a REINFORCE-style policy-gradient loss.

    The graph (TensorFlow 1.x) maps an observation batch through two
    128-unit tanh layers to one logit per action. Actions can be sampled
    either inside the graph (``choose_action1``) or in NumPy from the
    softmax probabilities (``choose_action2``).
    """

    def __init__(self, obs_shape, act_space, sess=None):
        """Store the dimensions and build the graph.

        Args:
            obs_shape: Number of features in a single observation.
            act_space: Number of discrete actions.
            sess: Optional session to use by default for all ``run`` calls.
        """
        self.obs_shape = obs_shape
        self.act_space = act_space
        self.sess = sess

        self.build_network()

    def build_network(self):
        """Create placeholders, the policy layers, the loss and the train op."""
        self.obs = tf.placeholder(tf.float64, shape=[None, self.obs_shape])
        self.dense1 = dense(self.obs, 128, activation=tf.tanh)
        self.dense2 = dense(self.dense1, 128, activation=tf.tanh)
        # Despite the attribute name, this layer has no activation, so it
        # holds unnormalised logits; `softmax_probs` are the probabilities.
        self.act_probs = dense(self.dense2, self.act_space)
        self.softmax_probs = tf.nn.softmax(self.act_probs)
        # tf.multinomial expects logits. Feeding it the softmax output would
        # sample from softmax(softmax(x)), a much flatter distribution than
        # the policy actually being trained.
        self.chosen_act = tf.squeeze(tf.multinomial(logits=self.act_probs, num_samples=1), axis=1)
        self.actions = tf.placeholder(tf.int64, shape=[None])
        self.rewards = tf.placeholder(tf.float64, shape=[None])

        self.action_masks = tf.one_hot(self.actions, self.act_space, dtype=tf.float64)
        # log_softmax on the logits avoids log(0) when a probability underflows.
        self.log_probs = tf.nn.log_softmax(self.act_probs)

        # Log-probability of the action that was actually taken at each step.
        self.resp_actions = tf.reduce_sum(self.action_masks * self.log_probs, axis=1)
        self.loss = -tf.reduce_mean(self.resp_actions * self.rewards)

        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        self.update = self.optimizer.minimize(self.loss)

    def init_session(self):
        """Create a session, initialise all variables and make it the default.

        Returns:
            The newly created session (also stored on ``self.sess``).
        """
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        # Entered without a matching `with` block so the session stays
        # installed as the default one across notebook cells.
        self.sess.__enter__()
        return self.sess

    def end_session(self):
        """Close the stored session, if there is one."""
        if self.sess is not None:
            self.sess.close()

    def _resolve_session(self, sess=None):
        """Pick the session to run in: explicit argument, then ``self.sess``, then TF's default."""
        if sess is None:
            sess = self.sess
        if sess is None:
            sess = tf.get_default_session()
        assert sess is not None, "no TensorFlow session available"
        return sess

    def choose_action1(self, obs, sess=None):
        """Sample one action per observation inside the TensorFlow graph.

        Args:
            obs: Batch of observations matching the ``obs`` placeholder.
            sess: Optional session overriding the stored/default one.

        Returns:
            1-D array of sampled action indices, one per observation.
        """
        sess = self._resolve_session(sess)
        act = sess.run(self.chosen_act, feed_dict={self.obs: obs})
        return act

    def choose_action2(self, obs, sess=None):
        """Sample an action for the first observation using NumPy.

        Args:
            obs: Batch of observations; only row 0 is used for sampling.
            sess: Optional session overriding the stored/default one.

        Returns:
            Tuple ``(act, probs)``: the sampled action index as a Python int
            and the full batch of action probabilities.
        """
        sess = self._resolve_session(sess)
        probs = sess.run(self.softmax_probs, feed_dict={self.obs: obs})
        # The number of choices is the number of actions (columns), not the
        # batch size; the two only coincided for a batch of 1 and 2 actions.
        act = int(np.random.choice(len(probs[0]), p=probs[0]))

        return act, probs

    def train(self, train_data, sess=None):
        """Run one optimiser step on a batch of recorded steps.

        Args:
            train_data: Two-dimensional array whose rows have the format
                ``[obs, act, rew]``. An empty array is ignored.
            sess: Optional session overriding the stored/default one.
        """
        sess = self._resolve_session(sess)

        train_data = np.asarray(train_data)
        if train_data.size == 0:
            return

        # Only the update op is fetched. Building extra sampling ops here
        # would add new nodes to the graph on every call.
        sess.run(self.update, feed_dict={
            self.obs: np.vstack(list(train_data[:, 0])),
            self.actions: train_data[:, 1].astype(np.int64),
            self.rewards: train_data[:, 2].astype(np.float64),
        })
#         print(a[:50])
#         print(b[:50])

In [4]:
# Run settings, kept in one place so they are easy to tune.
update_freq = 200  # In episodes; also passed to MemoryBuffer as its capacity
max_steps = 200    # upper bound on environment steps per episode
n_episodes = 10000 # total number of episodes to play

In [6]:
# Build the environment first so the policy can be sized from its spaces.
env = make_cart_pole()
mb = MemoryBuffer(max_size=update_freq)
network = Network(
    obs_shape=env.observation_space.shape[0],
    act_space=env.action_space.n,
)
# Trailing semicolon keeps the returned session object out of the cell output.
network.init_session();

In [10]:
import time
total_steps = 0
total_episodes = 0
all_rewards = []

# Reporting cadences (previously inline magic numbers).
debug_print_every = 500   # in environment steps
report_every = 100        # in episodes; training runs on the same cadence

for episode in range(n_episodes):
    obs = env.reset()

    episode_reward = 0
    mb.start_rollout()
    for step in range(max_steps):
        # choose_action2 returns (action, probabilities). Only the action may
        # be handed to the environment and stored in the buffer.
        act, act_probs = network.choose_action2([obs])

        if total_steps % debug_print_every == 0:
            # Debug aid: compare with the in-graph sampler. Sampled only when
            # it is printed, so it does not cost a session run on every step.
            act2 = network.choose_action1([obs])[0]
            print(act, act2)

        # To watch the agent, uncomment:
        # env.render()
        # time.sleep(0.01)
        obs_next, rew, d, _ = env.step(act)
        episode_reward += rew

        mb.record(obs, act, rew)
        obs = obs_next

        total_steps += 1
        if d:
            break
    all_rewards.append(episode_reward)
    total_episodes += 1

    if total_episodes % report_every == 0:
        print('Recent Reward:', np.mean(all_rewards[-100:]))
        print('Total Episodes:', total_episodes)
        print('Total Steps:', total_steps)
        print('\n-----------')

        train_data = mb.to_data()
        # Optional: standardise the returns (column 2) before training.
        # Skip the update entirely if nothing was recorded.
        if train_data.size:
            network.train(train_data)

1 1
0 0
1 1
1 1
0 0
1 1
1 1
0 1
0 1
0 1
0 0
1 0
Recent Reward: 59.76
Total Episodes: 100
Total Steps: 5976

-----------
0 1
0 0
0 1
1 0
0 1
0 1
0 0
0 0
1 1
1 0
0 0
0 1
1 1
Recent Reward: 60.46
Total Episodes: 200
Total Steps: 12022

-----------
1 0
1 1
1 0
1 1
0 0
0 0
0 0
1 1
1 1
1 0
0 1
1 1
1 1
Recent Reward: 66.05
Total Episodes: 300
Total Steps: 18627

-----------
1 1
0 0
1 0
0 0
1 1
1 1
1 1
1 1
0 1
0 1
1 1
1 1
0 0
1 0
Recent Reward: 73.39
Total Episodes: 400
Total Steps: 25966

-----------
1 1
1 0
0 0
1 1
0 1
1 1
0 0
0 0
0 1
0 1
1 1
0 0
1 1
1 0
1 0
Recent Reward: 72.38
Total Episodes: 500
Total Steps: 33204

-----------
0 1
0 1
0 0
0 1
0 0
1 1
1 1
0 0
1 0
1 0
1 0
0 1
1 0
1 0
0 1
0 1
Recent Reward: 78.94
Total Episodes: 600
Total Steps: 41098

-----------
0 1
1 0
1 0
1 1
1 0
1 1
1 1


KeyboardInterrupt: 

In [None]:
# Release the TensorFlow session held by the network once training is done.
network.end_session()

In [None]:
# Inspect only: reset=False so that looking at the shape does not wipe the buffer.
mb.to_data(reset=False).shape