In [186]:
import gym

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model

import numpy as np
import pandas as pd

import time

In [187]:
def discounted_rewards(rewards, gamma):
    """Compute the discounted reward-to-go for every step of one trajectory.

    The value at step ``i`` is ``rewards[i] + gamma * rtg[i + 1]``, with the
    last step's value equal to its own reward.

    Args:
        rewards: Sequence (list or 1-D array, object dtype allowed) holding the
            per-step rewards of a single trajectory, in time order.
        gamma: Discount factor applied to the following step's reward-to-go.

    Returns:
        A float32 NumPy array shaped like ``rewards``. An empty input yields
        an empty array instead of raising ``IndexError``.
    """
    rtg = np.zeros_like(rewards, dtype=np.float32)

    # Nothing to accumulate for an empty trajectory; rtg[-1] would raise.
    if len(rewards) == 0:
        return rtg

    # Fill backwards so each step can reuse the value already computed for i+1.
    rtg[-1] = rewards[-1]
    for i in reversed(range(len(rewards) - 1)):
        rtg[i] = rewards[i] + gamma * rtg[i + 1]

    return rtg

In [188]:
class Policy(Model):
    """Feed-forward network: a stack of Dense hidden layers followed by a Dense output layer."""

    def __init__(self, hidden_layers, hidden_size, output_size, activation, output_activation):
        """Create the layers.

        Args:
            hidden_layers: Number of hidden Dense layers to create.
            hidden_size: Indexable of layer widths; entry ``i`` is the width of
                hidden layer ``i`` (needs at least ``hidden_layers`` entries).
            output_size: Number of units in the output layer.
            activation: Activation used by every hidden layer.
            output_activation: Activation used by the output layer.
        """
        super().__init__()

        stack = []
        for idx in range(hidden_layers):
            stack.append(Dense(hidden_size[idx], activation=activation))
        self.hidden_layers = stack

        self.output_layer = Dense(output_size, activation=output_activation)

    def call(self, state):
        """Pass ``state`` through each hidden layer in order, then the output layer."""
        out = state
        for dense in self.hidden_layers:
            out = dense(out)
        return self.output_layer(out)

In [189]:
class Buffer():
    """Accumulates observations, actions and discounted returns of stored trajectories."""

    def __init__(self, gamma):
        """Start with empty storage; ``gamma`` is the discount passed to ``discounted_rewards``."""
        self.gamma = gamma
        self.obs, self.actions, self.returns = [], [], []

    def store(self, temp_traj):
        """Append one trajectory to the buffer.

        ``temp_traj`` is indexed as a 2-D array whose columns are, in order,
        observation, reward and action. An empty trajectory is ignored.
        """
        if len(temp_traj) == 0:
            return

        observations = temp_traj[:, 0]
        self.obs.extend(observations)

        # Rewards are converted to discounted reward-to-go before being stored.
        rewards_to_go = discounted_rewards(temp_traj[:, 1], self.gamma)
        self.returns.extend(rewards_to_go)

        taken_actions = temp_traj[:, 2]
        self.actions.extend(taken_actions)

    def get_batch(self):
        """Return ``(observations as a float32 array, actions list, returns list)``."""
        obs_array = np.array(self.obs, dtype=np.float32)
        return obs_array, self.actions, self.returns

    def __len__(self):
        """Number of stored steps; the three internal lists must have equal length."""
        n_obs = len(self.obs)
        assert n_obs == len(self.actions) == len(self.returns)
        return n_obs

In [190]:
# Sanity check for Buffer: store a hand-made two-step trajectory and inspect the batch.
env_test = gym.make('LunarLander-v2')
buffer_test = Buffer(0.95)
buffer_list = []

# Wrapping each reset result in a list adds a leading axis of size 1.
# NOTE(review): assumes env.reset() returns the bare observation array - confirm for the installed gym version.
obs = np.array([env_test.reset()])
obs_2 = np.array([env_test.reset()])

# Rows are [observation, reward, action], the column order Buffer.store reads.
buffer_list.append([obs.copy(),10,1])
buffer_list.append([obs_2.copy(),2,3])

buffer_test.store(np.array(buffer_list))

obs_buffer, action_buffer, return_buffer = buffer_test.get_batch()

# The size-1 axis of every stored observation is kept when the batch is stacked.
print(obs_buffer.shape)
print(obs_buffer[1])
obs_buffer[1].shape

(2, 1, 8)
[[ 0.00248661  1.4141665   0.2518405   0.14427547 -0.00287447 -0.05704566
   0.          0.        ]]


(1, 8)

In [223]:
def REINFORCE(env_name, hidden_layers, hidden_size, activation, output_activation,
              alpha, num_epochs, gamma, steps_per_epoch):
    """Train a categorical policy with the REINFORCE policy-gradient method.

    Every epoch plays exactly one episode, converts its rewards into
    discounted reward-to-go values and takes one Adam step on
    ``-mean(log pi(a_t | s_t) * return_t)``.

    Args:
        env_name: Id passed to ``gym.make``; the action space must expose ``.n``.
        hidden_layers: Number of hidden layers of the policy network.
        hidden_size: Per-layer hidden widths (indexable, one per hidden layer).
        activation: Activation of the hidden layers.
        output_activation: Activation of the output layer. The outputs are fed
            to ``tf.random.categorical`` and ``tf.nn.log_softmax`` as logits.
        alpha: Learning rate of the Adam optimizer.
        num_epochs: Number of episodes / gradient steps to run.
        gamma: Discount factor for the reward-to-go.
        steps_per_epoch: Accepted for interface compatibility but currently
            unused; an epoch always ends when its single episode ends.

    Returns:
        ``(env, policy)``: the environment (left open, the caller is
        responsible for closing it) and the trained policy model.
    """
    env = gym.make(env_name)
    act_dim = env.action_space.n
    policy = Policy(hidden_layers, hidden_size, act_dim, activation, output_activation)

    # One forward pass on a real observation creates the network weights.
    obs = env.reset()
    _ = policy.predict(np.array([obs]))

    policy.compile(optimizer = tf.keras.optimizers.Adam(alpha))

    step_count = 0
    train_rewards = []
    train_ep_len = []

    timer = time.time()

    for epoch in range(num_epochs):

        # Observations carry a leading batch axis of size 1 so they can be fed to the policy.
        obs = np.array([env.reset()])
        buffer = Buffer(gamma)
        env_buffer = []
        epoch_rewards = []

        # Roll out a single episode, sampling actions from the current policy.
        done = False
        while not done:

            policy_actions = policy.predict(obs)
            # Convert the sampled action to a plain int so it can be stepped and stored safely.
            action = int(tf.squeeze(tf.random.categorical(policy_actions, 1)).numpy())
            next_obs, reward, done, _ = env.step(action)

            env_buffer.append([obs.copy(), reward, action])

            obs = np.array([next_obs.copy()])
            step_count += 1
            epoch_rewards.append(reward)

        # Rows mix an array with scalars, so an object array is required;
        # recent NumPy versions refuse to build ragged arrays implicitly.
        buffer.store(np.array(env_buffer, dtype=object))

        train_rewards.append(np.sum(epoch_rewards))
        train_ep_len.append(len(epoch_rewards))

        # Policy Optimization

        obs_batch, action_batch, return_batch = buffer.get_batch()
        # Drop the per-step size-1 axis: (N, 1, ...) -> (N, ...).
        obs_batch = np.squeeze(obs_batch, axis=1)
        one_hot_actions = tf.keras.utils.to_categorical(action_batch, act_dim, dtype=np.float32)
        returns = np.asarray(return_batch, dtype=np.float32)

        with tf.GradientTape() as tape:

            pi_logits = policy(obs_batch)
            # Log-probability of the action actually taken, shape (N,).
            pi_log = tf.reduce_sum(one_hot_actions * tf.nn.log_softmax(pi_logits), axis=1)

            # Both factors are shape (N,), so each step is weighted by its own
            # return. Multiplying an (N, 1) tensor by an (N,) one would
            # broadcast to (N, N) and average over all step pairs instead.
            pi_loss = -tf.reduce_mean(pi_log * returns)

        # Gradients are taken after leaving the tape context so the update itself is not recorded.
        model_gradients = tape.gradient(pi_loss, policy.trainable_variables)
        policy.optimizer.apply_gradients(zip(model_gradients, policy.trainable_variables))

        # Statistics

        if epoch % 100 == 0:

            print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' %
                  (epoch, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len),
                   len(buffer), step_count,time.time()-timer))

            # Reported statistics cover only the episodes since the previous report.
            train_rewards = []
            train_ep_len = []

    return env, policy

In [224]:
#one_hot_actions.reshape([110,1,4])

In [225]:
#tf.reduce_sum(tf.multiply(one_hot_actions.reshape(one_hot_actions.shape[0],1,one_hot_actions.shape[1]),
                          #tf.nn.log_softmax(pi_logits)),axis=2)

In [226]:
# Signature reminder: REINFORCE(env_name, hidden_layers, hidden_size, activation, output_activation,
#                               alpha, num_epochs, gamma, steps_per_epoch)
# Train on LunarLander-v2 with one 64-unit tanh hidden layer. steps_per_epoch is passed but
# REINFORCE's rollout loop runs one episode per epoch and does not read it.
    
env,policy = REINFORCE('LunarLander-v2', 1, [64], activation=tf.tanh, output_activation=None, 
              alpha=8e-3, num_epochs=1000, gamma=0.99, steps_per_epoch=1000)

Ep:0 MnRew:-357.07 MxRew:-357.1 EpLen:98.0 Buffer:98 -- Step:98 -- Time:1
Ep:10 MnRew:-178.06 MxRew:-105.7 EpLen:84.4 Buffer:89 -- Step:942 -- Time:13
Ep:20 MnRew:-114.64 MxRew:-72.0 EpLen:87.3 Buffer:78 -- Step:1815 -- Time:25
Ep:30 MnRew:-158.02 MxRew:-63.0 EpLen:98.8 Buffer:108 -- Step:2803 -- Time:38
Ep:40 MnRew:-161.47 MxRew:-48.3 EpLen:132.0 Buffer:257 -- Step:4123 -- Time:56
Ep:50 MnRew:-340.19 MxRew:-5.4 EpLen:123.2 Buffer:94 -- Step:5355 -- Time:74
Ep:60 MnRew:-218.40 MxRew:-14.7 EpLen:115.4 Buffer:105 -- Step:6509 -- Time:89
Ep:70 MnRew:-222.05 MxRew:-1.7 EpLen:102.3 Buffer:81 -- Step:7532 -- Time:104
Ep:80 MnRew:-100.43 MxRew:-13.7 EpLen:102.0 Buffer:72 -- Step:8552 -- Time:118
Ep:90 MnRew:-97.97 MxRew:-62.8 EpLen:95.6 Buffer:78 -- Step:9508 -- Time:131
Ep:100 MnRew:-105.37 MxRew:-26.4 EpLen:81.4 Buffer:70 -- Step:10322 -- Time:142
Ep:110 MnRew:-91.61 MxRew:-60.0 EpLen:80.9 Buffer:80 -- Step:11131 -- Time:153
Ep:120 MnRew:-105.84 MxRew:-67.4 EpLen:85.0 Buffer:66 -- Step:1198

In [240]:
# Current wall-clock time in whole milliseconds; only used to build a unique video folder name below.
current_milli_time = lambda: int(round(time.time() * 1000))
env_name = 'LunarLander-v2'
env_test = gym.make(env_name)

# Disabled: wrap the test env in gym's Monitor to write videos into a timestamped folder
# (presumably one recording every 20th episode, per video_callable - confirm against gym docs).
#env_test = gym.wrappers.Monitor(env_test, "VIDEOS/TEST_VIDEOS"+env_name+str(current_milli_time()), force=True,
 #                                   video_callable=lambda x: x%20==0)

In [254]:
# Play one rendered episode with the trained policy, sampling actions from its logits.
obs = env_test.reset()
obs = np.array([obs])  # add the batch axis the policy expects
done = False

try:
    while not done:
        policy_actions = policy.predict(obs)
        action = tf.squeeze(tf.random.categorical(policy_actions,1))
        next_obs, reward, done, _ = env_test.step(np.squeeze(action))
        env_test.render()

        obs = np.array([next_obs.copy()])
finally:
    # Close the render window once the episode is over, even if the rollout is
    # interrupted, instead of closing the env before it is used.
    env_test.close()
    