<a href="https://colab.research.google.com/github/ericgao96/Reinforcement-Learning-2nd-Edition-by-Sutton-Exercise-Solutions/blob/master/Project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%pip install ribs[all] gym~=0.17.0 Box2D~=2.3.10 tqdm
import gym
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

Collecting ribs[all]
  Downloading ribs-0.4.0-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 2.1 MB/s 
Collecting Box2D~=2.3.10
  Downloading Box2D-2.3.10-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 6.5 MB/s 
Collecting toml>=0.10.0
  Downloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: toml, ribs, Box2D
Successfully installed Box2D-2.3.10 ribs-0.4.0 toml-0.10.2


In [None]:
%%time

# --- Environment and seeding -------------------------------------------------
SEED = 1
env = gym.make('LunarLander-v2')
env.seed(SEED)
# Seeding the environment alone does not cover the other random sources this
# notebook draws from: the action space's own sampler (random exploratory
# actions), NumPy's global RNG (epsilon-greedy draw and replay sampling) and
# PyTorch (network weight initialisation). Seed each of them as well so that a
# run can be repeated on the same software/hardware stack.
env.action_space.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Hyperparameters ---------------------------------------------------------
N_EPISODE = 1000            # number of training episodes
LR = 1e-3                   # Adam learning rate
                            # (author's note: LR = 1E-4, EPSILON_END = 1E-2, BATCH_SIZE = 32 would work)
GAMMA = 0.99                # discount factor used in the TD target
EPSILON = 1                 # initial exploration probability
EPSILON_END = 0.01          # decay stops once epsilon is no longer above this value
MEMORY_CAPACITY = 100000    # number of transitions kept in replay memory
BATCH_SIZE = 32             # transitions sampled per gradient step
N_ACTION = env.action_space.n
N_STATE_VAR = env.observation_space.shape[0]
TARGET_UPDATE_FREQ = 10     # copy policy weights into the target net every N gradient steps
EPSILON_DECAY = 0.999       # multiplicative epsilon decay applied per environment step
LAYER1_NEURON = 128         # width of the first hidden layer
LAYER2_NEURON = 64          # width of the second hidden layer


# Get cpu or gpu device for training.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

def creat_nn_model(layer1_neuron=128, layer2_neuron=64, device=device):
    """Build a NeuralNetwork with the given hidden sizes and move it to `device`.

    Args:
        layer1_neuron: size forwarded as the first hidden-layer width.
        layer2_neuron: size forwarded as the second hidden-layer width.
        device: target handed to `.to()`; the default is the module-level
            `device` as it was bound when this function was defined.

    Returns:
        The network returned by `.to(device)`.
    """
    return NeuralNetwork(layer1_neuron, layer2_neuron).to(device)

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected network mapping a state vector to one value per action.

    Architecture: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        layer1_neuron: width of the first hidden layer.
        layer2_neuron: width of the second hidden layer.
        n_state_var: input width. When None (the default) the notebook-level
            `N_STATE_VAR` is used, so existing two-argument calls are unchanged.
        n_action: output width. When None (the default) the notebook-level
            `N_ACTION` is used.
    """

    def __init__(self, layer1_neuron, layer2_neuron, n_state_var=None, n_action=None):
        super().__init__()
        # Resolve the fallbacks at construction time rather than in the
        # signature so the globals only need to exist when they are used.
        if n_state_var is None:
            n_state_var = N_STATE_VAR
        if n_action is None:
            n_action = N_ACTION
        # Submodule names are kept as-is so state_dict keys stay compatible.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(n_state_var, layer1_neuron),
            nn.ReLU(),
            nn.Linear(layer1_neuron, layer2_neuron),
            nn.ReLU(),
            nn.Linear(layer2_neuron, n_action),
        )

    def forward(self, x):
        """Return the network output for a batch `x` (flattened per sample first)."""
        return self.linear_relu_stack(self.flatten(x))

# Replay memory: one row per transition, laid out as
# [S (N_STATE_VAR values), action, reward, is_terminal, S_prime (N_STATE_VAR values)].
D = np.zeros((MEMORY_CAPACITY, 2 * N_STATE_VAR + 3))

# Running counters and per-episode reward history used by the training loop.
param_update_counter, memory_counter = 0, 0
reward_list = list()
epsilon = EPSILON

# Policy network first, then the target network (same construction order as
# before), after which the target starts as an exact copy of the policy.
policy_net, target_net = (
    creat_nn_model(LAYER1_NEURON, LAYER2_NEURON, device) for _ in range(2)
)
target_net.load_state_dict(policy_net.state_dict())

# Only the policy network's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(params=policy_net.parameters(), lr=LR)
loss_fn = nn.MSELoss()

# DQN training loop: act epsilon-greedily, store each transition in the replay
# memory D, and take one gradient step per environment step once at least
# BATCH_SIZE transitions have been collected.
MAX_STEPS_PER_EPISODE = 1000  # hard cap on environment steps per episode

for episode in range(N_EPISODE):
    S = env.reset()
    reward_episode = 0
    for step in range(MAX_STEPS_PER_EPISODE):
        # epsilon-greedy policy
        is_greedy = np.random.random() > epsilon
        if is_greedy:
            # Picking an action needs no gradients; no_grad avoids building an
            # autograd graph on every environment step.
            with torch.no_grad():
                state_tensor = torch.tensor(S.reshape(1, -1), device=device, dtype=torch.float)
                action = torch.argmax(policy_net(state_tensor)).item()
        else:
            action = env.action_space.sample()
        S_prime, reward, done, info = env.step(action)

        # `done` is also raised when the episode is merely cut off by the time
        # limit. Such a state is not a true terminal state, so the TD target
        # must still bootstrap from S_prime. gym's TimeLimit wrapper reports
        # the cut-off via info['TimeLimit.truncated']; when that key is absent
        # the flag stored below is identical to `done`.
        is_terminal = done and not info.get('TimeLimit.truncated', False)

        # store experience in replay memory (ring buffer: oldest rows are overwritten)
        store_index = memory_counter % MEMORY_CAPACITY
        D[store_index] = np.concatenate([S, [action, reward, is_terminal], S_prime])
        memory_counter += 1
        S = S_prime
        reward_episode += reward

        # train only when have more than BATCH_SIZE samples
        if memory_counter >= BATCH_SIZE:
            # sample random batch of experience from memory (only rows filled so far)
            sample_index = np.random.choice(min(memory_counter, MEMORY_CAPACITY), size=BATCH_SIZE, replace=False)
            sample_memory = D[sample_index]
            S_memory = sample_memory[:, :N_STATE_VAR]
            S_prime_memory = sample_memory[:, -N_STATE_VAR:]
            # Indexing with None keeps these as column vectors of shape (BATCH_SIZE, 1).
            action_memory = sample_memory[:, N_STATE_VAR, None]
            reward_memory = sample_memory[:, N_STATE_VAR+1, None]
            done_memory = sample_memory[:, N_STATE_VAR+2, None]

            batch_S = torch.tensor(S_memory, device=device, dtype=torch.float)
            batch_S_prime = torch.tensor(S_prime_memory, device=device, dtype=torch.float)
            batch_action = torch.tensor(action_memory.astype(int), device=device, dtype=torch.long)
            batch_reward = torch.tensor(reward_memory, device=device, dtype=torch.float)
            batch_done = torch.tensor(done_memory.astype(int), device=device, dtype=torch.float)

            # set training target: r for terminal transitions,
            # r + GAMMA * max_a' Q_target(S', a') otherwise
            with torch.no_grad():
                Q_next = target_net(batch_S_prime)
            Q_pred = policy_net(batch_S).gather(1, batch_action)
            Q_target = batch_reward + GAMMA * Q_next.max(1)[0].reshape(-1, 1)
            Q_target = torch.where(batch_done == 1, batch_reward, Q_target)

            # perform gradient descent to update weights
            # (per-parameter gradient clamping to [-1, 1] is left disabled)
            loss = loss_fn(Q_pred, Q_target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update target network params
            if param_update_counter % TARGET_UPDATE_FREQ == 0:
                target_net.load_state_dict(policy_net.state_dict())
            param_update_counter += 1

        if done or step == MAX_STEPS_PER_EPISODE - 1:
            reward_list.append(reward_episode)
            break

        # epsilon decay (skipped on the step that ends the episode)
        if epsilon > EPSILON_END:
            epsilon = epsilon * EPSILON_DECAY
    if episode % 100 == 0 and episode != 0:
        print('Episode: {episode}, Reward: {reward:.2f}, Average Reward: {avg_reward:.2f}'.format(
            episode=episode, reward=reward_episode, avg_reward=np.mean(reward_list[-100:])))

Using cpu device
