In [7]:
import gym

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model
import tensorflow_probability as tfp

import numpy as np
import pandas as pd

import time

from reinforce import discounted_rewards, Policy

In [8]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

In [9]:
class Buffer_baseline():
    
    def __init__(self, gamma):
        
        self.gamma = gamma
        self.obs = []
        self.actions = []
        self.returns = []
        self.returns_g = []
    
    def store(self, temp_traj):
        
        if len(temp_traj) > 0:
            self.obs.extend(temp_traj[:,0])
            ret = discounted_rewards(temp_traj[:,1], self.gamma)
            
            self.returns.extend(ret - temp_traj[:,3])
        
            self.returns_g.extend(ret)
            self.actions.extend(temp_traj[:,2])
            
    def get_batch(self):
        
        return np.array(self.obs,dtype=np.float32), self.actions, self.returns, self.returns_g
    
    def __len__(self):
        
        assert(len(self.obs) == len(self.actions) == len(self.returns))
        return len(self.obs)

In [23]:
#hidden_layers = [policy_hidden_layers,value_hidden_layers]
#hidden_size = [policy_hidden_size,value_hidden_size]
#activation = [policy_activation,value_activation]
#output_activation = [policy_activation,value_activation]
#alpha = [policy_alpha,value_alpha]

def REINFORCE_baseline(env_name, hidden_layers, hidden_size, activation, output_activation, 
              alpha, num_epochs, gamma, steps_per_epoch):
    
    env = gym.make(env_name)
    
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.n
    policy = Policy(hidden_layers[0], hidden_size[0], act_dim, activation[0], output_activation[0])
    value_function = Policy(hidden_layers[1], hidden_size[1], 1, activation[1], output_activation[1])
    
    obs = env.reset()
    
    _ = policy.predict(np.array([obs]))
    
    _ = value_function(np.array([obs]))
    
    policy.compile(optimizer = tf.keras.optimizers.Adam(alpha[0]))
    value_function.compile(optimizer = tf.keras.optimizers.Adam(alpha[1]))
    
    step_count = 0
    train_rewards = []
    train_ep_len = []
    plot_mean_rew = []
    plot_steps = []
    plot_std = []
    
    timer = time.time()
    
    for epoch in range(num_epochs):
        
        
        obs = np.array([env.reset()])
        
        buffer = Buffer_baseline(gamma)
        env_buffer = []
        epoch_rewards = []
        
        done = False
        while len(buffer) < steps_per_epoch:
           
            
            actions_prob = policy.predict(obs)
            state_value = value_function.predict(obs) 
            
            actions_dist = tfp.distributions.Categorical(probs=actions_prob, dtype=tf.float32)
            action = int(actions_dist.sample().numpy()[0])
            
            next_obs, reward, done, _ = env.step(action)
            
            env_buffer.append([obs.copy(), reward, action, np.squeeze(state_value)])
            
            obs = np.array([next_obs.copy()])
            step_count += 1
            epoch_rewards.append(reward)
            
            if done: 
                
                buffer.store(np.array(env_buffer))
                env_buffer = []
                
                train_rewards.append((np.sum(epoch_rewards)))
                train_ep_len.append(len(epoch_rewards))
                
                obs = np.array([env.reset()])
                epoch_rewards = []
                
        # Policy Optimization
        
        obs_batch, action_batch, return_batch, return_g_batch = buffer.get_batch()
        
        with tf.GradientTape() as tape_v:
            
            #Value Function
            
            value_states = tf.squeeze(value_function(obs_batch, training=True))
            value_loss = tf.reduce_mean((return_g_batch - value_states)**2)
            
            model_gradients_value = tape_v.gradient(value_loss, value_function.trainable_variables)
            value_function.optimizer.apply_gradients(zip(model_gradients_value,value_function.trainable_variables))
            
        with tf.GradientTape() as tape:
            
            #Policy 
            one_hot_actions = tf.keras.utils.to_categorical(action_batch, act_dim, dtype=np.float32)
            
            pi_logits = policy(obs_batch, training=True)
            
            pi_log = tf.reduce_sum(tf.multiply(one_hot_actions.reshape(one_hot_actions.shape[0],1,one_hot_actions.shape[1]),
                                               tf.math.log(pi_logits)), axis=2)
            
            return_batch_array = np.array(return_batch).reshape(len(return_batch),1)
            pi_loss = -tf.reduce_mean(pi_log * return_batch_array)
            
            
            model_gradients = tape.gradient(pi_loss, policy.trainable_variables)
            policy.optimizer.apply_gradients(zip(model_gradients, policy.trainable_variables))
        
        # Statistics
        
        if epoch % 10 == 0:
            
            print('Ep:%d MnRew:%.2f StdRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % 
                  (epoch, np.mean(train_rewards), np.std(train_rewards), np.mean(train_ep_len), 
                   len(buffer), step_count,time.time()-timer))
            
            plot_mean_rew.append(np.mean(train_rewards))
            plot_steps.append(step_count)
            plot_std.append(np.std(train_rewards))
            
            train_rewards = []
            train_ep_len = []
            
            policy.save_weights('./saved_models/enforce_nn')
    
    env.close
    return policy, plot_mean_rew, plot_steps,plot_std

In [24]:
#hidden_layers = [policy_hidden_layers,value_hidden_layers]
#hidden_size = [policy_hidden_size,value_hidden_size]
#activation = [policy_activation,value_activation]
#output_activation = [policy_activation,value_activation]
#alpha = [policy_alpha,value_alpha]

policy, plot_mean_rew, plot_steps,plot_std = REINFORCE_baseline('LunarLander-v2', hidden_layers=[1,2], hidden_size=[[64],[32,32]], 
                           activation=[[tf.tanh],[tf.nn.relu,tf.nn.relu]], output_activation=[tf.nn.softmax,None], 
                           alpha=[8e-3,8e-3], num_epochs=1000, gamma=0.99, steps_per_epoch=1000)

Ep:0 MnRew:-380.63 StdRew:170.8 EpLen:87.5 Buffer:1050 -- Step:1050 -- Time:43
Ep:10 MnRew:-152.00 StdRew:77.7 EpLen:82.0 Buffer:1024 -- Step:11463 -- Time:475
Ep:20 MnRew:-121.17 StdRew:37.1 EpLen:77.3 Buffer:1026 -- Step:21824 -- Time:914
Ep:30 MnRew:-114.96 StdRew:35.5 EpLen:77.7 Buffer:1037 -- Step:32240 -- Time:1338
Ep:40 MnRew:-114.26 StdRew:23.0 EpLen:78.6 Buffer:1080 -- Step:42779 -- Time:1756
Ep:50 MnRew:-105.56 StdRew:27.8 EpLen:80.2 Buffer:1002 -- Step:53043 -- Time:2165
Ep:60 MnRew:-97.74 StdRew:41.3 EpLen:89.9 Buffer:1066 -- Step:63474 -- Time:2582
Ep:70 MnRew:-70.82 StdRew:40.7 EpLen:101.7 Buffer:1024 -- Step:74154 -- Time:3011
Ep:80 MnRew:-73.45 StdRew:58.1 EpLen:161.4 Buffer:1455 -- Step:85775 -- Time:3483
Ep:90 MnRew:-85.90 StdRew:85.8 EpLen:314.4 Buffer:1027 -- Step:97722 -- Time:4005
Ep:100 MnRew:-59.80 StdRew:74.0 EpLen:298.8 Buffer:1253 -- Step:109076 -- Time:4645
Ep:110 MnRew:-42.24 StdRew:76.3 EpLen:445.4 Buffer:1534 -- Step:121101 -- Time:5317
Ep:120 MnRew:-32.2

KeyboardInterrupt: 

In [18]:
value

<tf.Tensor: shape=(1005,), dtype=float32, numpy=
array([-0.2372631 , -0.24137191, -0.24068499, ..., -0.31203863,
       -0.32794836, -0.29087335], dtype=float32)>

In [15]:
value

array([[-0.23116519]], dtype=float32)