In [4]:
import gym

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model

import numpy as np
import pandas as pd

In [5]:
def discounted_rewards(rewards, gamma):
    
    rtg = np.zeros_like(rewards , dtype=np.float32)
    rtg[-1] = rewards[-1]
    for i in reversed(range(len(rewards)-1)):
        
        rtg[i] = rewards[i] + gamma * rtg[i+1]
        
    return rtg

In [6]:
class Policy(Model):
    
    def __init__(self, hidden_layers, hidden_size, output_size, activation, output_activation):
        
        super(Policy, self).__init__()
        self.hidden_layers = [Dense(hidden_size[i], activation=activation) for i in range(hidden_layers)]
        self.output_layer = Dense(output_size, activation=output_activation)
        
    def call(self, state):
        
        x = state
        
        for layer in self.hidden_layers:
            
            x = layer(x)
        
        return self.output_layer(x)

In [16]:
class Buffer():
    
    def __init__(self, gamma):
        
        self.gamma = gamma
        self.obs = []
        self.actions = []
        self.returns = []
    
    def store(self, temp_traj):
        
        if len(temp_traj) > 0:
            self.obs.extend(temp_traj[:,0])
            ret = discounted_rewards(temp_traj[:,1], self.gamma)
            self.returns.extend(ret)
            self.actions.extend(temp_traj[:,2])
        
    def get_batch(self):
        
        return np.array(self.obs,dtype=np.float32), self.actions, self.returns
    
    def __len__(self):
        
        assert(len(self.obs) == len(self.actions) == len(self.returns))
        return len(self.obs)

In [19]:
def REINFORCE(env_name, hidden_layers, hidden_size, activation, output_activation, 
              alpha, num_epochs, gamma, steps_per_epoch):
    
    env = gym.make(env_name)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.n
    policy = Policy(hidden_layers, hidden_size, act_dim, activation, output_activation)
    
    obs = env.reset()
    _ = policy.predict(np.array([obs]))
    
    policy.compile(optimizer = tf.keras.optimizers.Adam(alpha))
    
    step_count = 0
    train_rewards = []
    train_ep_len = []
    
    for epoch in range(num_epochs):
        
        obs = np.array([env.reset()])
        
        buffer = Buffer(gamma)
        env_buffer = []
        epoch_rewards = []
        
        done = False
        #while len(buffer) < steps_per_epoch:
        while not done:
            
            policy_actions = policy.predict(obs)
            action = tf.squeeze(tf.random.categorical(policy_actions,1))
            next_obs, reward, done, _ = env.step(np.squeeze(action))
            
            env_buffer.append([obs.copy(), reward, action])
            
            obs = np.array([next_obs.copy()])
            step_count += 1
            epoch_rewards.append(reward)
            
            if done: 
                
                buffer.store(np.array(env_buffer))
                env_buffer = []
                
                train_rewards.append((np.sum(epoch_rewards)))
                train_ep_len.append(len(epoch_rewards))
                
                obs = env.reset()
                epoch_rewards = []
                
        # Policy Optimization
        
        obs_batch, action_batch, return_batch = buffer.get_batch()
        
        with tf.GradientTape() as tape:
            
            one_hot_actions = tf.keras.utils.to_categorical(action_batch, act_dim, dtype=np.float32)
            pi_logits = policy.predict(obs_batch)
            pi_log = tf.reduce_sum(tf.multiply(one_hot_actions, tf.nn.log_softmax(pi_logits)), axis=1)
            
            pi_loss = -tf.reduce_mean(pi_log * return_batch)
            
            model_gradients = tape.gradient(pi_loss, policy.trainable_variables)
            policy.optimizer.apply_gradients(zip(model_gradients, policy.trainable_variables))
        
        # Statistics
        
        if epoch % 10 == 0:
            
            print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % 
                  (ep, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len), 
                   len(buffer), step_count,time.time()-timer))
            
            train_rewards = []
            train_ep_len = []
            
    env.close

In [20]:
REINFORCE('LunarLander-v2', 1, [32], activation=tf.tanh, output_activation=None, 
              alpha=8e-3, num_epochs=1, gamma=0.99, steps_per_epoch=1000)

LookupError: No gradient defined for operation 'IteratorGetNext' (op type: IteratorGetNext)

In [6]:
all_envs = gym.envs.registry.all()
env_ids = [env_spec.id for env_spec in all_envs]
print('Total number of enviroments: ',len(env_ids),'\n')
print('Lunar enviroments: ',sorted([word for word in env_ids if word.startswith('Lunar')]))

Total number of enviroments:  859 

Lunar enviroments:  ['LunarLander-v2', 'LunarLanderContinuous-v2']


In [8]:
teste_env = gym.make('LunarLander-v2')
obs = teste_env.reset()
obs

array([ 0.00591211,  1.4067276 ,  0.5988082 , -0.18634862, -0.00684377,
       -0.13563897,  0.        ,  0.        ], dtype=float32)

In [28]:
teste_env.action_space.n

4

In [41]:
teste_env.step(0)

(array([ 0.01182432,  1.4019583 ,  0.59799904, -0.2120081 , -0.01354632,
        -0.13406302,  0.        ,  0.        ], dtype=float32),
 -0.9305277305408879,
 False,
 {})

In [72]:
teste_env.observation_space.shape[0]

8

In [117]:
obs.reshape(1,8)

array([[ 0.00591211,  1.4067276 ,  0.5988082 , -0.18634862, -0.00684377,
        -0.13563897,  0.        ,  0.        ]], dtype=float32)

In [122]:
pd.DataFrame(obs)

Unnamed: 0,0
0,0.005912
1,1.406728
2,0.598808
3,-0.186349
4,-0.006844
5,-0.135639
6,0.0
7,0.0
