In [1]:
import gym

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model

import numpy as np
import pandas as pd

import time


In [2]:
def discounted_rewards(rewards, gamma):
    
    rtg = np.zeros_like(rewards , dtype=np.float32)
    rtg[-1] = rewards[-1]
    for i in reversed(range(len(rewards)-1)):
        
        rtg[i] = rewards[i] + gamma * rtg[i+1]
        
    return rtg


In [3]:
class Policy(Model):
    
    def __init__(self, hidden_layers, hidden_size, output_size, activation, output_activation):
        
        super(Policy, self).__init__()
        self.hidden_layers = [Dense(hidden_size[i], activation=activation) for i in range(hidden_layers)]
        self.output_layer = Dense(output_size, activation=output_activation)
        
    def call(self, state):
        
        x = state
        
        for layer in self.hidden_layers:
            
            x = layer(x)
        
        return self.output_layer(x)

In [4]:
class Buffer():
    
    def __init__(self, gamma):
        
        self.gamma = gamma
        self.obs = []
        self.actions = []
        self.returns = []
    
    def store(self, temp_traj):
        
        if len(temp_traj) > 0:
            self.obs.extend(temp_traj[:,0])
            ret = discounted_rewards(temp_traj[:,1], self.gamma)
            self.returns.extend(ret)
            self.actions.extend(temp_traj[:,2])
        
    def get_batch(self):
        
        return np.array(self.obs,dtype=np.float32), self.actions, self.returns
    
    def __len__(self):
        
        assert(len(self.obs) == len(self.actions) == len(self.returns))
        return len(self.obs)

In [35]:
def REINFORCE(env_name, hidden_layers, hidden_size, activation, output_activation, 
              alpha, num_epochs, gamma, steps_per_epoch):
    
    env = gym.make(env_name)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.n
    policy = Policy(hidden_layers, hidden_size, act_dim, activation, output_activation)
    
    obs = env.reset()
    _ = policy.predict(np.array([obs]))
    
    policy.compile(optimizer = tf.keras.optimizers.Adam(alpha))
    
    step_count = 0
    train_rewards = []
    train_ep_len = []
    
    timer = time.time()
    
    for epoch in range(num_epochs):
        
        obs = np.array([env.reset()])
        buffer = Buffer(gamma)
        env_buffer = []
        epoch_rewards = []
        
        done = False
        #while len(buffer) < steps_per_epoch:
        while len(buffer) < steps_per_epoch or not done:
            
            policy_actions = policy.predict(obs)
            action = tf.squeeze(tf.random.categorical(policy_actions,1))
            next_obs, reward, done, _ = env.step(np.squeeze(action))
            
            env_buffer.append([obs.copy(), reward, action])
            
            obs = np.array([next_obs.copy()])
            step_count += 1
            epoch_rewards.append(reward)
            
            if done: 
                
                buffer.store(np.array(env_buffer))
                env_buffer = []
                
                train_rewards.append(np.sum(epoch_rewards))
                train_ep_len.append(len(epoch_rewards))
                
                obs = np.array([env.reset()])
                epoch_rewards = []
                
        # Policy Optimization
        
        obs_batch, action_batch, return_batch = buffer.get_batch()
        with tf.GradientTape() as tape:
            
            one_hot_actions = tf.keras.utils.to_categorical(action_batch, act_dim, dtype=np.float32)
            pi_logits = policy(obs_batch)
            pi_log = tf.reduce_sum(tf.multiply(one_hot_actions.reshape(one_hot_actions.shape[0],1,one_hot_actions.shape[1]),
                                               tf.nn.log_softmax(pi_logits)), axis=2)
            
            pi_loss = -tf.reduce_mean(pi_log * return_batch)
            
            model_gradients = tape.gradient(pi_loss, policy.trainable_variables)
            policy.optimizer.apply_gradients(zip(model_gradients, policy.trainable_variables))
        
        # Statistics
        
        if epoch % 10 == 0:
            
            print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % 
                  (epoch, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len), 
                   len(buffer), step_count,time.time()-timer))
            
            train_rewards = []
            train_ep_len = []
    
    return env, policy
    env.close

In [8]:
env,policy = REINFORCE('LunarLander-v2', 1, [64], activation=tf.tanh, output_activation=None, 
              alpha=8e-3, num_epochs=1000, gamma=0.99, steps_per_epoch=1000)



Ep:0 MnRew:-99.77 MxRew:-99.8 EpLen:67.0 Buffer:67 -- Step:67 -- Time:0
Ep:10 MnRew:-293.05 MxRew:28.2 EpLen:100.1 Buffer:92 -- Step:1068 -- Time:14
Ep:20 MnRew:-232.50 MxRew:-104.8 EpLen:95.5 Buffer:89 -- Step:2023 -- Time:27
Ep:30 MnRew:-157.74 MxRew:-9.0 EpLen:123.4 Buffer:103 -- Step:3257 -- Time:44
Ep:40 MnRew:-250.59 MxRew:-62.3 EpLen:149.7 Buffer:123 -- Step:4754 -- Time:65
Ep:50 MnRew:-196.51 MxRew:-40.5 EpLen:111.6 Buffer:107 -- Step:5870 -- Time:80
Ep:60 MnRew:-310.93 MxRew:-164.1 EpLen:92.8 Buffer:88 -- Step:6798 -- Time:92
Ep:70 MnRew:-334.11 MxRew:-221.0 EpLen:82.4 Buffer:91 -- Step:7622 -- Time:103
Ep:80 MnRew:-218.36 MxRew:-130.4 EpLen:78.0 Buffer:94 -- Step:8402 -- Time:114
Ep:90 MnRew:-197.66 MxRew:-54.1 EpLen:80.3 Buffer:59 -- Step:9205 -- Time:125
Ep:100 MnRew:-217.11 MxRew:-128.0 EpLen:82.1 Buffer:77 -- Step:10026 -- Time:136
Ep:110 MnRew:-136.35 MxRew:-59.7 EpLen:104.0 Buffer:96 -- Step:11066 -- Time:150
Ep:120 MnRew:-163.63 MxRew:-93.7 EpLen:141.8 Buffer:151 -- St

In [36]:
env,policy = REINFORCE('LunarLander-v2', 1, [64], activation=tf.tanh, output_activation=None, 
              alpha=8e-3, num_epochs=1000, gamma=0.99, steps_per_epoch=250)


Ep:0 MnRew:-156.71 MxRew:-55.6 EpLen:94.0 Buffer:282 -- Step:282 -- Time:4
Ep:10 MnRew:-312.37 MxRew:-20.2 EpLen:80.2 Buffer:286 -- Step:3168 -- Time:45
Ep:20 MnRew:-321.72 MxRew:2.4 EpLen:76.4 Buffer:299 -- Step:6070 -- Time:88
Ep:30 MnRew:-169.68 MxRew:-21.2 EpLen:76.5 Buffer:296 -- Step:8978 -- Time:129
Ep:40 MnRew:-154.93 MxRew:6.9 EpLen:73.4 Buffer:252 -- Step:11840 -- Time:170
Ep:50 MnRew:-160.91 MxRew:47.3 EpLen:74.1 Buffer:260 -- Step:14728 -- Time:211
Ep:60 MnRew:-125.84 MxRew:15.3 EpLen:70.2 Buffer:269 -- Step:17466 -- Time:250
Ep:70 MnRew:-138.05 MxRew:45.1 EpLen:72.4 Buffer:259 -- Step:20290 -- Time:290
Ep:80 MnRew:-141.82 MxRew:-85.8 EpLen:67.0 Buffer:263 -- Step:22972 -- Time:328
Ep:90 MnRew:-131.11 MxRew:37.5 EpLen:73.0 Buffer:272 -- Step:25747 -- Time:367
Ep:100 MnRew:-146.37 MxRew:-27.2 EpLen:75.4 Buffer:282 -- Step:28763 -- Time:410
Ep:110 MnRew:-121.30 MxRew:7.1 EpLen:68.6 Buffer:331 -- Step:31711 -- Time:452
Ep:120 MnRew:-138.34 MxRew:-49.5 EpLen:76.8 Buffer:296 -- 

KeyboardInterrupt: 

In [34]:
current_milli_time = lambda: int(round(time.time() * 1000))
env_name = 'LunarLander-v2'
env_test = gym.make(env_name)


obs = env_test.reset()
obs = np.array([obs])
done = False

while not done:
    policy_actions = policy.predict(obs)
    action = tf.squeeze(tf.random.categorical(policy_actions,1))
    next_obs, reward, done, _ = env_test.step(np.squeeze(action))
    env_test.render()
            
    obs = np.array([next_obs.copy()])
    
env_test.close()