In [1]:
import gym

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model
import tensorflow_probability as tfp

import numpy as np
import pandas as pd

import time

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

In [3]:
def discounted_rewards(rewards, gamma):
    
    rtg = np.zeros_like(rewards , dtype=np.float32)
    rtg[-1] = rewards[-1]
    for i in reversed(range(len(rewards)-1)):
        
        rtg[i] = rewards[i] + gamma * rtg[i+1]
        
    return rtg

In [4]:
class Policy(Model):
    
    def __init__(self, hidden_layers, hidden_size, output_size, activation, output_activation):
        
        super(Policy, self).__init__()
        self.hidden_layers = [Dense(hidden_size[i], activation=activation[i]) for i in range(hidden_layers)]
        self.output_layer = Dense(output_size, activation=output_activation)
        
    def call(self, state):
        
        x = state
        
        for layer in self.hidden_layers:
            
            x = layer(x)
        
        return self.output_layer(x)

In [5]:
class Buffer():
    
    def __init__(self, gamma):
        
        self.gamma = gamma
        self.obs = []
        self.actions = []
        self.returns = []
    
    def store(self, temp_traj):
        
        if len(temp_traj) > 0:
            self.obs.extend(temp_traj[:,0])
            ret = discounted_rewards(temp_traj[:,1], self.gamma)
            self.returns.extend(ret)
            self.actions.extend(temp_traj[:,2])
        
    def get_batch(self):
        
        return np.array(self.obs,dtype=np.float32), self.actions, self.returns
    
    def __len__(self):
        
        assert(len(self.obs) == len(self.actions) == len(self.returns))
        return len(self.obs)

In [6]:
def REINFORCE(env_name, hidden_layers, hidden_size, activation, output_activation, 
              alpha, num_epochs, gamma, steps_per_epoch):
    
    env = gym.make(env_name)
    
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.n
    policy = Policy(hidden_layers, hidden_size, act_dim, activation, output_activation)
    
    obs = env.reset()
    _ = policy.predict(np.array([obs]))
    
    policy.compile(optimizer = tf.keras.optimizers.Adam(alpha))
    
    step_count = 0
    train_rewards = []
    train_ep_len = []
    plot_mean_rew = []
    plot_steps = []
    plot_std = []
    
    timer = time.time()
    
    for epoch in range(num_epochs):
        
        
        obs = np.array([env.reset()])
        
        buffer = Buffer(gamma)
        env_buffer = []
        epoch_rewards = []
        
        done = False
        while len(buffer) < steps_per_epoch:
           
            
            #policy_actions = policy.predict(obs)
            actions_prob = policy.predict(obs)
            actions_dist = tfp.distributions.Categorical(probs=actions_prob, dtype=tf.float32)
            action = int(actions_dist.sample().numpy()[0])
            
            #action = tf.squeeze(tf.random.categorical(policy_actions,1))
            
            #next_obs, reward, done, _ = env.step(np.squeeze(action))
            next_obs, reward, done, _ = env.step(action)
            
            env_buffer.append([obs.copy(), reward, action])
            
            obs = np.array([next_obs.copy()])
            step_count += 1
            epoch_rewards.append(reward)
            
            if done: 
                
                buffer.store(np.array(env_buffer))
                env_buffer = []
                
                train_rewards.append((np.sum(epoch_rewards)))
                train_ep_len.append(len(epoch_rewards))
                
                obs = np.array([env.reset()])
                epoch_rewards = []
                
        # Policy Optimization
        
        obs_batch, action_batch, return_batch = buffer.get_batch()
        
        with tf.GradientTape() as tape:
            
            one_hot_actions = tf.keras.utils.to_categorical(action_batch, act_dim, dtype=np.float32)
            
            pi_logits = policy(obs_batch, training=True)
            
            pi_log = tf.reduce_sum(tf.multiply(one_hot_actions.reshape(one_hot_actions.shape[0],1,one_hot_actions.shape[1]),
                                               tf.math.log(pi_logits)), axis=2)
            
            return_batch_array = np.array(return_batch).reshape(len(return_batch),1)
            pi_loss = -tf.reduce_mean(pi_log * return_batch_array)
            
            
            model_gradients = tape.gradient(pi_loss, policy.trainable_variables)
            policy.optimizer.apply_gradients(zip(model_gradients, policy.trainable_variables))
        
        # Statistics
        
        if epoch % 10 == 0:
            
            print('Ep:%d MnRew:%.2f StdRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % 
                  (epoch, np.mean(train_rewards), np.std(train_rewards), np.mean(train_ep_len), 
                   len(buffer), step_count,time.time()-timer))
            
            plot_mean_rew.append(np.mean(train_rewards))
            plot_steps.append(step_count)
            plot_std.append(np.std(train_rewards))
            
            train_rewards = []
            train_ep_len = []
            
            policy.save_weights('./saved_models/enforce_nn')
    
    env.close
    return policy, plot_mean_rew, plot_steps,plot_std
    

In [58]:
#policy,plot_mean_rew,plot_steps,plot_std= REINFORCE('LunarLander-v2', 2, [64,64], activation=tf.tanh, 
        #                                        output_activation=None, 
        #                                        alpha=8e-3, num_epochs=2000, gamma=0.99, 
         #                                       steps_per_epoch=1000)

In [59]:
np.random.seed(100)
tf.random.set_seed(100)
policy, plot_mean_rew, plot_steps,plot_std= REINFORCE('LunarLander-v2', 1,[64],activation=[tf.tanh], output_activation=tf.nn.softmax,
                      alpha=8e-3, num_epochs=1000, gamma=0.99, steps_per_epoch=1000)

Ep:0 MnRew:-290.53 StdRew:138.4 EpLen:95.2 Buffer:1047 -- Step:1047 -- Time:22
Ep:10 MnRew:-139.54 StdRew:83.0 EpLen:95.5 Buffer:1046 -- Step:11647 -- Time:256
Ep:20 MnRew:-114.74 StdRew:52.5 EpLen:92.2 Buffer:1073 -- Step:22157 -- Time:490
Ep:30 MnRew:-115.74 StdRew:73.2 EpLen:108.6 Buffer:1003 -- Step:32693 -- Time:731
Ep:40 MnRew:-94.01 StdRew:52.2 EpLen:107.3 Buffer:1107 -- Step:43208 -- Time:982
Ep:50 MnRew:-78.04 StdRew:51.8 EpLen:106.2 Buffer:1032 -- Step:53823 -- Time:1234
Ep:60 MnRew:-69.17 StdRew:40.5 EpLen:103.0 Buffer:1014 -- Step:64230 -- Time:1469
Ep:70 MnRew:-43.70 StdRew:26.4 EpLen:117.2 Buffer:1006 -- Step:74896 -- Time:1705
Ep:80 MnRew:-36.36 StdRew:41.3 EpLen:159.4 Buffer:1025 -- Step:85573 -- Time:1941
Ep:90 MnRew:-37.61 StdRew:68.3 EpLen:350.2 Buffer:1393 -- Step:97829 -- Time:2217
Ep:100 MnRew:-13.73 StdRew:59.9 EpLen:446.8 Buffer:1652 -- Step:110787 -- Time:2511
Ep:110 MnRew:0.09 StdRew:37.8 EpLen:415.0 Buffer:1926 -- Step:124482 -- Time:2820
Ep:120 MnRew:9.03 St

KeyboardInterrupt: 

In [7]:
np.random.seed(100)
tf.random.set_seed(100)
policy, plot_mean_rew, plot_steps,plot_std= REINFORCE('LunarLander-v2', 2,[64,64],activation=[tf.nn.relu,tf.tanh], output_activation=tf.nn.softmax,
                      alpha=8e-3, num_epochs=1000, gamma=0.99, steps_per_epoch=1000)

Ep:0 MnRew:-215.47 StdRew:98.5 EpLen:91.9 Buffer:1011 -- Step:1011 -- Time:21
Ep:10 MnRew:-297.44 StdRew:243.5 EpLen:97.0 Buffer:1053 -- Step:11584 -- Time:248
Ep:20 MnRew:-342.59 StdRew:318.6 EpLen:113.9 Buffer:1066 -- Step:22405 -- Time:480
Ep:30 MnRew:-156.12 StdRew:85.8 EpLen:92.9 Buffer:1008 -- Step:32718 -- Time:703
Ep:40 MnRew:-129.10 StdRew:34.2 EpLen:75.2 Buffer:1031 -- Step:43097 -- Time:931
Ep:50 MnRew:-124.55 StdRew:26.5 EpLen:73.6 Buffer:1038 -- Step:53470 -- Time:1157
Ep:60 MnRew:-124.60 StdRew:32.4 EpLen:75.7 Buffer:1047 -- Step:63923 -- Time:1385
Ep:70 MnRew:-115.67 StdRew:18.2 EpLen:78.4 Buffer:1055 -- Step:74422 -- Time:1614
Ep:80 MnRew:-96.86 StdRew:24.5 EpLen:80.9 Buffer:1005 -- Step:84944 -- Time:1843
Ep:90 MnRew:-71.99 StdRew:35.5 EpLen:106.4 Buffer:1087 -- Step:95584 -- Time:2075
Ep:100 MnRew:-63.69 StdRew:31.6 EpLen:103.7 Buffer:1082 -- Step:106062 -- Time:2303
Ep:110 MnRew:-41.99 StdRew:40.3 EpLen:131.9 Buffer:1026 -- Step:116743 -- Time:2536
Ep:120 MnRew:-18.2

KeyboardInterrupt: 