In [None]:
import gym
env = gym.make('LunarLander-v2')

# Smoke test: drive the environment with uniformly random actions for 1000
# steps, rendering every frame, and start a new episode whenever one ends.
# The try/finally guarantees the environment is closed even if rendering or
# stepping raises part-way through.
try:
    observation = env.reset()
    for _ in range(1000):
        env.render()
        action = env.action_space.sample() # your agent here (this takes random actions)
        observation, reward, done, info = env.step(action)

        if done:
            observation = env.reset()
finally:
    env.close()

In [None]:
from keras.layers import Dense, Activation, Input
from keras.models import Model, load_model
from keras.optimizers import Adam
import numpy as np
import keras.backend as K
class Agent(object):
    """Policy-gradient agent trained on whole-episode discounted returns.

    Two Keras models are built over the *same* layers:

    * ``self.policy``  - takes ``[state, advantage]`` and is the model that is
      trained (the advantage input is only consumed by the custom loss).
    * ``self.predict`` - takes ``state`` only and is used to sample actions.

    Transitions are accumulated with :meth:`store_transition` and consumed
    (then cleared) by :meth:`learn`.
    """

    def __init__(self, alpha, gamma=0.99, n_actions=4, layer1_size=16,layer2_size=16, ip_dims =120, fname="ppo_rl.h5"):
        """Create the agent and build its networks.

        Args:
            alpha: learning rate handed to the Adam optimizer.
            gamma: discount factor applied to future rewards.
            n_actions: number of discrete actions (size of the softmax output).
            layer1_size: units in the first hidden layer.
            layer2_size: units in the second hidden layer.
            ip_dims: length of the flat observation vector.
            fname: path used by :meth:`save_model` / :meth:`load_model`.
        """
        self.gamma = gamma
        self.lr = alpha
        self.G = 0  # most recent normalized returns; overwritten by learn()
        self.ip_dims = ip_dims
        self.fc1_dims = layer1_size
        self.fc2_dims = layer2_size
        self.n_actions = n_actions
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

        self.policy, self.predict = self.build_policy_network(alpha, n_actions, ip_dims, layer1_size, layer2_size)

        self.action_space = list(range(n_actions))
        self.model_file = fname

    def build_policy_network(self, lr, n_actions, ip_dims, l1_size, l2_size):
        """Build the trainable policy model and the inference-only model.

        NOTE: the arguments are accepted for interface compatibility, but the
        network is sized from the instance attributes set in ``__init__``
        (``self.ip_dims``, ``self.fc1_dims``, ``self.fc2_dims``,
        ``self.n_actions``, ``self.lr``), exactly as before.

        Returns:
            ``(policy, predict)`` - two models sharing the same layers.
        """
        ip = Input(shape=(self.ip_dims,))
        advantages = Input(shape=[1])
        dense1 = Dense(self.fc1_dims, activation='relu')(ip)
        dense2 = Dense(self.fc2_dims, activation='relu')(dense1)
        probs = Dense(self.n_actions, activation='softmax')(dense2)

        def loss(y_true, y_pred):
            # y_true is the one-hot encoding of the action taken (see learn()),
            # so y_true * log(p) keeps only the log-probability of that action.
            # Clipping keeps log() away from 0 and 1.
            out = K.clip(y_pred, 1e-8, 1-1e-8)
            log_k = y_true*K.log(out)

            # Weight by the per-step advantage, which is captured from the
            # enclosing scope rather than passed through y_true/y_pred.
            return K.sum(-log_k* advantages)

        policy = Model(inputs=[ip, advantages], outputs=[probs])

        policy.compile(optimizer=Adam(lr = self.lr), loss=loss)

        predict = Model(inputs=[ip], outputs=[probs])

        return policy, predict

    def choose_action(self, observation):
        """Sample an action index from the policy's distribution for ``observation``."""
        state = observation[np.newaxis, :]  # add the batch dimension
        probabilities = self.predict.predict(state)[0]
        action = np.random.choice(self.action_space, p=probabilities)
        return action

    def store_transition(self, observation, action, reward):
        """Append one step (state, action taken, reward received) to the episode memory."""
        self.action_memory.append(action)
        self.state_memory.append(observation)
        self.reward_memory.append(reward)

    def learn(self):
        """Run one training step on the stored transitions, then clear the memory.

        Computes the discounted return ``G[t] = sum_k gamma**(k-t) * r[k]`` for
        every stored step, standardizes it (zero mean, unit std; std falls back
        to 1 when all returns are equal), stores the result in ``self.G`` and
        trains ``self.policy`` on it. Does nothing if no transition is stored.
        """
        if len(self.reward_memory) == 0:
            # Nothing to learn from; avoids NaN statistics and an empty batch.
            return

        state_memory = np.array(self.state_memory)
        action_memory = np.array(self.action_memory)
        # Force a float dtype: with integer rewards an integer return buffer
        # would silently truncate the discounted sums.
        reward_memory = np.array(self.reward_memory, dtype=np.float64)

        # One-hot encode the actions that were actually taken.
        action = np.zeros([len(action_memory), self.n_actions])
        action[np.arange(len(action_memory)), action_memory] = 1

        # Backward recurrence G[t] = r[t] + gamma * G[t+1]: O(n) instead of
        # re-summing the tail of the episode for every t.
        G = np.zeros_like(reward_memory)
        running_return = 0.0
        for t in reversed(range(len(reward_memory))):
            running_return = reward_memory[t] + self.gamma*running_return
            G[t] = running_return

        mean = np.mean(G)
        std = np.std(G)
        if not std > 0:
            std = 1
        self.G = (G-mean)/std

        self.policy.train_on_batch([state_memory, self.G], action)

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

    def save_model(self):
        """Save the trainable policy model to ``self.model_file``."""
        self.policy.save(self.model_file)

    def load_model(self):
        """Restore the weights stored in ``self.model_file`` into the current networks.

        The weights are loaded into the existing ``self.policy`` instead of
        replacing it with a deserialized model, for two reasons: the model is
        compiled with a closure-based custom loss that a plain ``load_model``
        call cannot resolve, and ``self.predict`` shares its layers with
        ``self.policy``, so loading in place keeps action selection in sync
        with the restored weights.
        """
        self.policy.load_weights(self.model_file)

In [None]:
import gym 
import matplotlib.pyplot as plt
import numpy as np

if __name__ == '__main__':
    # Train the policy-gradient agent on LunarLander without rendering.
    agent = Agent(alpha=0.0005, gamma=0.99, n_actions=4, layer1_size=64, layer2_size=64, ip_dims=8)

    env = gym.make('LunarLander-v2')
    score_history = []

    n_episodes = 2000

    try:
        for i in range(n_episodes):
            done = False
            score = 0
            observation = env.reset()

            while not done:
                action = agent.choose_action(observation)
                observation_, reward, done, info = env.step(action)
                # Store the state the action was chosen in, not the successor.
                agent.store_transition(observation, action, reward)

                # Advance to the next state. The same variable must be used
                # here and in choose_action, otherwise the agent keeps acting
                # on the reset observation for the whole episode.
                observation = observation_

                score += reward

            score_history.append(score)

            # One policy update per finished episode.
            agent.learn()
            # Running average over the last (up to) 100 episodes.
            print('episode', i, 'score %.1f' % score, 'average_score %.1f' % np.mean(score_history[-100:]))
    finally:
        # Release the environment even if training is interrupted.
        env.close()

In [None]:
import gym 
import matplotlib.pyplot as plt
import numpy as np

if __name__ == '__main__':
    # Train the policy-gradient agent on LunarLander, rendering every step.
    agent = Agent(alpha=0.0005, gamma=0.99, n_actions=4, layer1_size=64, layer2_size=64, ip_dims=8)

    env = gym.make('LunarLander-v2')
    score_history = []

    n_episodes = 2000

    try:
        for i in range(n_episodes):
            done = False
            score = 0
            observation = env.reset()

            while not done:
                env.render()
                action = agent.choose_action(observation)
                observation_, reward, done, info = env.step(action)
                # Store the state the action was chosen in, not the successor.
                agent.store_transition(observation, action, reward)

                # Advance to the next state. The same variable must be used
                # here and in choose_action, otherwise the agent keeps acting
                # on the reset observation for the whole episode.
                observation = observation_

                score += reward

            score_history.append(score)

            # One policy update per finished episode.
            agent.learn()
            # Running average over the last (up to) 100 episodes.
            print('episode', i, 'score %.1f' % score, 'average_score %.1f' % np.mean(score_history[-100:]))
    finally:
        # Close the render window / environment even if training is interrupted.
        env.close()