In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model
import tensorflow_probability as tfp

import gym

import numpy as np
import pandas as pd
import time

from reinforce import Policy

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

In [3]:
def discounted_rewards(rewards, last_sv, gamma, n_step):
    
    rewards_g = np.zeros_like(rewards, dtype=np.float32)
    rewards_g[-1] = rewards[-1] + gamma * last_sv
    
    for i in reversed(range(len(rewards) - 1)):
        
        rewards_g[i] = rewards[i] + gamma * rewards_g[i+1]
        
    return rewards_g

In [4]:
class Buffer_AC():
    
    def __init__(self,gamma):
        
        self.gamma = gamma
        self.obs = []
        self.actions = []
        self.returns = []
        self.returns_g = []
        
    def store(self, temp_traj, last_sv, n_step):
        
        if len(temp_traj) > 0:
            
            self.obs.extend(temp_traj[:,0])
            return_g = discounted_rewards(temp_traj[:,1], last_sv, self.gamma, n_step)
            self.returns.extend(return_g - temp_traj[:,3])
            self.returns_g.extend(return_g)
            self.actions.extend(temp_traj[:,2])
            
    def get_batch(self):
        
        return np.array(self.obs, dtype=np.float32), self.actions, self.returns, self.returns_g
    
    def __len__(self):
        
        assert(len(self.obs) == len(self.actions) == len(self.returns))
        return len(self.obs)

In [24]:
#hidden_layers = [policy_hidden_layers,value_hidden_layers]
#hidden_size = [policy_hidden_size,value_hidden_size]
#activation = [policy_activation,value_activation]
#output_activation = [policy_activation,value_activation]
#alpha = [policy_alpha,value_alpha]

def AC(env_name, hidden_layers, hidden_size, activation, output_activation, alpha, num_epochs, 
       gamma, steps_per_epoch, n_step):
    
    env = gym.make(env_name)
    
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.n
    policy = Policy(hidden_layers[0], hidden_size[0], act_dim, activation[0], output_activation[0])
    value_function = Policy(hidden_layers[1], hidden_size[1], 1, activation[1], output_activation[1])
    
    obs = env.reset()
    
    _ = policy.predict(np.array([obs]))
    _ = value_function.predict(np.array([obs]))
    
    policy.compile(optimizer = tf.keras.optimizers.Adam(alpha[0]))
    value_function.compile(optimizer = tf.keras.optimizers.Adam(alpha[1]))
    
    step_count = 0
    train_rewards = []
    train_ep_len = []
    plot_mean_rew = []
    plot_steps = []
    plot_std = []
    
    timer = time.time()
    
    for epoch in range(num_epochs):
        
        obs = np.array([env.reset()])
        
        buffer = Buffer_AC(gamma)
        env_buffer = []
        epoch_rewards = []
        
        done = False
        
        while (len(buffer) < steps_per_epoch) or not done:
            
            actions_prob = policy.predict(obs)
            state_value = value_function.predict(obs)
            
            actions_dist = tfp.distributions.Categorical(probs=actions_prob, dtype=tf.float32)
            action = int(actions_dist.sample().numpy()[0])
            
            next_obs, reward, done, _ = env.step(action)
            
            env_buffer.append([obs.copy(), reward, action, np.squeeze(state_value)])
            
            obs = np.array([next_obs.copy()])
            step_count += 1
            epoch_rewards.append(reward)
            
            if done:
                
                buffer.store(np.array(env_buffer),0, n_step)
                env_buffer = []
                
                train_rewards.append(np.sum(epoch_rewards))
                train_ep_len.append(len(epoch_rewards))
                
                obs = np.array([env.reset()])
                epoch_rewards = []
                
        obs_batch, action_batch, return_batch, return_g_batch = buffer.get_batch()
        
        with tf.GradientTape() as tape_v:
            
            #Value Function
            
            value_states = tf.squeeze(value_function(obs_batch, training=True))
            value_loss = tf.reduce_mean((return_g_batch - value_states)**2)
            
            model_gradients_value = tape_v.gradient(value_loss, value_function.trainable_variables)
            value_function.optimizer.apply_gradients(zip(model_gradients_value, value_function.trainable_variables))
        
        with tf.GradientTape() as tape:
            
            #Policy
            
            one_hot_actions = tf.keras.utils.to_categorical(action_batch, act_dim, dtype=np.float32)
            pi_logits = policy(obs_batch, training=True)
            
            pi_log = tf.reduce_sum(tf.multiply(one_hot_actions.reshape(one_hot_actions.shape[0],1,one_hot_actions.shape[1]),tf.math.log(pi_logits)), axis=2)
            return_batch_array = np.array(return_batch).reshape(len(return_batch),1)
            
            pi_loss = -tf.reduce_mean(pi_log * return_batch_array)
            
            model_gradients = tape.gradient(pi_loss, policy.trainable_variables)
            policy.optimizer.apply_gradients(zip(model_gradients, policy.trainable_variables))
            
        
        # Statistics
        
        if epoch % 10 == 0:
            
            print('Ep:%d MnRew:%.2f StdRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % 
                  (epoch, np.mean(train_rewards), np.std(train_rewards), np.mean(train_ep_len), 
                   len(buffer), step_count,time.time()-timer))
            
            plot_mean_rew.append(np.mean(train_rewards))
            plot_steps.append(step_count)
            plot_std.append(np.std(train_rewards))
            
            train_rewards = []
            train_ep_len = []
            
            policy.save_weights('./saved_models/enforce_nn')
    
    env.close
    return policy, plot_mean_rew, plot_steps,plot_std

In [26]:
policy, plot_mean_rew, plot_steps,plot_std = AC('LunarLander-v2', hidden_layers=[1,2], hidden_size=[[64],[32,32]], 
                           activation=[[tf.tanh],[tf.nn.relu,tf.nn.relu]], output_activation=[tf.nn.softmax,None], 
                           alpha=[8e-3,8e-3], num_epochs=2000, gamma=0.99, steps_per_epoch=1000, n_step=1)

Ep:0 MnRew:-164.26 StdRew:85.5 EpLen:77.2 Buffer:1003 -- Step:1003 -- Time:43
Ep:10 MnRew:-159.01 StdRew:102.9 EpLen:103.9 Buffer:1001 -- Step:12330 -- Time:533
Ep:20 MnRew:-143.91 StdRew:78.8 EpLen:97.5 Buffer:1075 -- Step:22961 -- Time:992
Ep:30 MnRew:-103.74 StdRew:53.6 EpLen:114.5 Buffer:1079 -- Step:34867 -- Time:1510
Ep:40 MnRew:-92.74 StdRew:54.4 EpLen:101.5 Buffer:1011 -- Step:45631 -- Time:1977
Ep:50 MnRew:-72.70 StdRew:48.6 EpLen:103.4 Buffer:1083 -- Step:56282 -- Time:2436
Ep:60 MnRew:-74.82 StdRew:87.7 EpLen:183.2 Buffer:1092 -- Step:67276 -- Time:2911
Ep:70 MnRew:-72.96 StdRew:81.3 EpLen:231.2 Buffer:1066 -- Step:78834 -- Time:3413
Ep:80 MnRew:-51.05 StdRew:55.4 EpLen:164.8 Buffer:1340 -- Step:91032 -- Time:4047
Ep:90 MnRew:-30.11 StdRew:59.1 EpLen:211.3 Buffer:1686 -- Step:104980 -- Time:4727
Ep:100 MnRew:-21.64 StdRew:55.2 EpLen:241.1 Buffer:1199 -- Step:116795 -- Time:5306
Ep:110 MnRew:0.57 StdRew:51.1 EpLen:284.9 Buffer:1000 -- Step:129046 -- Time:5908
Ep:120 MnRew:10.

KeyboardInterrupt: 

In [22]:
#pi_alog = tf.reduce_sum(tf.multiply(one_hot_actions.reshape(one_hot_actions.shape[0],1,one_hot_actions[1]),tf.math.log(pi_logits)), axis=2)