In [4]:
import gym
from collections import deque
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
# Hyperparameters shared by the replay buffer, the TD update and the optimizer.
learning_rate = 5e-4     # step size handed to the Adam optimizer
gamma = 0.98             # discount factor applied to the bootstrapped next-state value
buffer_limit = 50_000    # maximum number of transitions kept in the replay buffer
batch_size = 32          # number of transitions drawn per gradient step

In [24]:
class ReplayBuffer():
    """Fixed-capacity FIFO store of transitions used for experience replay.

    A transition is a 5-tuple ``(state, action, reward, next_state, done_mask)``.
    The buffer is a ``deque`` with ``maxlen``, so once it is full every new
    transition silently evicts the oldest one.
    """

    def __init__(self, buffer_limit=buffer_limit):
        # NOTE: the default is bound to the module-level ``buffer_limit`` when
        # this class statement runs; re-run this cell after changing the
        # hyperparameter, or pass the capacity explicitly.
        self.buffer = deque(maxlen=buffer_limit)

    def put(self, transition):
        """Append one ``(state, action, reward, next_state, done_mask)`` tuple."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions uniformly at random and batch them.

        Args:
            n: number of transitions to draw; must satisfy ``1 <= n <= len(self)``.

        Returns:
            Tuple ``(states, actions, rewards, next_states, done_masks)`` of
            tensors. ``states``/``next_states`` are float tensors with the
            per-transition arrays stacked along a new first dimension;
            ``actions`` is an int64 ``(n, 1)`` tensor (usable as a ``gather``
            index); ``rewards`` and ``done_masks`` are float ``(n, 1)`` tensors.

        Raises:
            ValueError: if ``n`` is smaller than 1 or larger than the buffer.
        """
        if n < 1:
            raise ValueError("sample size must be at least 1")
        mini_batch = random.sample(self.buffer, n)
        states, actions, rewards, next_states, done_masks = zip(*mini_batch)

        # Convert each observation on its own and stack: handing a Python list
        # of NumPy arrays straight to torch.tensor() is the slow path and
        # triggers a UserWarning.
        state_batch = torch.stack(
            [torch.as_tensor(s, dtype=torch.float) for s in states])
        next_state_batch = torch.stack(
            [torch.as_tensor(s, dtype=torch.float) for s in next_states])

        # Explicit dtypes keep the batch independent of how the environment
        # types its scalars (e.g. a NumPy float64 reward would otherwise make
        # the reward tensor float64).
        action_batch = torch.tensor(actions, dtype=torch.long).unsqueeze(1)
        reward_batch = torch.tensor(rewards, dtype=torch.float).unsqueeze(1)
        done_batch = torch.tensor(done_masks, dtype=torch.float).unsqueeze(1)

        return state_batch, action_batch, reward_batch, next_state_batch, done_batch

    def __getitem__(self, idx):
        """Return the stored transition at position ``idx`` (0 is the oldest)."""
        return self.buffer[idx]

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [25]:
class QNet(nn.Module):
    """Three-layer MLP that maps an observation to one Q-value per action.

    Args:
        obs_dim: size of the observation vector (default 4).
        n_actions: number of discrete actions, i.e. the output width (default 2).
    """

    def __init__(self, obs_dim=4, n_actions=2):
        super(QNet, self).__init__()
        # Plain int attribute: it is not a parameter or buffer, so it does not
        # appear in state_dict() and load_state_dict() between nets still works.
        self.n_actions = n_actions
        self.fc1 = nn.Linear(obs_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, n_actions)

    def forward(self, x):
        """Return the Q-values for ``x`` (last dimension must be ``obs_dim``)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def sample_action(self, obs, epsilon):
        """Pick an action epsilon-greedily for a single observation.

        Args:
            obs: tensor holding one observation. ``argmax`` is taken over the
                flattened output, so a batched input would not yield a
                per-row action.
            epsilon: probability of returning a uniformly random action.

        Returns:
            The chosen action index as a Python ``int``.
        """
        # Draw the exploration coin first so the network is only evaluated
        # when its output is actually needed.
        if random.random() < epsilon:
            return random.randint(0, self.n_actions - 1)
        # Action selection never needs gradients; skip building the graph.
        with torch.no_grad():
            return self(obs).argmax().item()

In [26]:
def train(q, q_target, memory, optimizer, n_updates=10):
    """Run several one-step TD (DQN) updates on ``q`` using replayed transitions.

    Each update samples ``batch_size`` transitions from ``memory``, builds the
    target ``reward + gamma * max_a' q_target(next_state, a') * done_mask`` and
    minimises the smooth-L1 (Huber) loss between that target and
    ``q(state, action)``. ``batch_size`` and ``gamma`` are read from the
    module-level hyperparameters.

    Args:
        q: online network being optimised.
        q_target: network used only to evaluate next-state values.
        memory: object whose ``sample(n)`` returns the tensors
            ``(state, action, reward, next_state, done_mask)``.
        optimizer: optimizer that steps the parameters of ``q``.
        n_updates: number of gradient steps to take (default 10).
    """
    for _ in range(n_updates):
        state, action, reward, next_state, done_mask = memory.sample(batch_size)

        # Q-value of the action that was actually taken in each transition.
        q_a = q(state).gather(1, action)

        # The target is a constant for this update. Without no_grad, backward()
        # would also flow into q_target and accumulate gradients there that are
        # never used or cleared.
        with torch.no_grad():
            max_next_q = q_target(next_state).max(1)[0].unsqueeze(1)
            # done_mask is 0 for terminal transitions, which removes the bootstrap term.
            target = reward + gamma*max_next_q*done_mask

        loss = F.smooth_l1_loss(q_a, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [30]:
# Training-loop settings (named here instead of being inline magic numbers).
n_episodes = 2000
min_buffer_size = 2000      # learning starts once the buffer holds more than this
target_sync_interval = 20   # episodes between target-network syncs / log lines
reward_scale = 100.         # rewards are divided by this before being stored

env = gym.make("CartPole-v1")
q = QNet()
q_target = QNet()
q_target.load_state_dict(q.state_dict())
memory = ReplayBuffer()

optimizer = optim.Adam(q.parameters(), lr=learning_rate)

try:
    for n_epi in range(n_episodes):
        # Linear epsilon decay from 8% down to a 1% floor (reached at episode 1400).
        epsilon = max(0.01, 0.08 - 0.01*(n_epi/200))

        # gym >= 0.26 / gymnasium return (obs, info) from reset(); older
        # versions return the observation alone.
        reset_out = env.reset()
        state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
        done = False
        score = 0.

        while not done:
            action = q.sample_action(torch.from_numpy(state).float(), epsilon)

            step_out = env.step(action)
            if len(step_out) == 5:
                # Newer API: termination and time-limit truncation are separate.
                next_state, reward, terminated, truncated, info = step_out
                done = terminated or truncated
            else:
                # Older API: a single `done`; the TimeLimit wrapper flags
                # truncation in `info` when the step limit ended the episode.
                next_state, reward, done, info = step_out
                terminated = done and not info.get("TimeLimit.truncated", False)

            # Only a real terminal state removes the bootstrap term. An episode
            # cut off by the step limit is not a failure, so its next-state
            # value must still be bootstrapped.
            done_mask = 0. if terminated else 1.
            memory.put((state, action, reward/reward_scale, next_state, done_mask))
            state = next_state

            score += reward

        if len(memory) > min_buffer_size:
            train(q, q_target, memory, optimizer)

        if n_epi % target_sync_interval == 0 and n_epi != 0:
            q_target.load_state_dict(q.state_dict())
            print(f"[Episode {n_epi}] score: {score}, n_buffer: {len(memory)}, epsilon: {epsilon}")
finally:
    # Release the environment even if training is interrupted or raises.
    env.close()

[Episode 20] score: 14.0, n_buffer: 278, epsilon: 0.079
[Episode 40] score: 12.0, n_buffer: 531, epsilon: 0.078
[Episode 60] score: 10.0, n_buffer: 772, epsilon: 0.077
[Episode 80] score: 13.0, n_buffer: 1026, epsilon: 0.076
[Episode 100] score: 11.0, n_buffer: 1285, epsilon: 0.075
[Episode 120] score: 10.0, n_buffer: 1542, epsilon: 0.074
[Episode 140] score: 12.0, n_buffer: 1800, epsilon: 0.07300000000000001
[Episode 160] score: 14.0, n_buffer: 2054, epsilon: 0.07200000000000001
[Episode 180] score: 13.0, n_buffer: 2288, epsilon: 0.07100000000000001
[Episode 200] score: 12.0, n_buffer: 2481, epsilon: 0.07
[Episode 220] score: 8.0, n_buffer: 2685, epsilon: 0.069
[Episode 240] score: 21.0, n_buffer: 2905, epsilon: 0.068
[Episode 260] score: 271.0, n_buffer: 5155, epsilon: 0.067
[Episode 280] score: 500.0, n_buffer: 11502, epsilon: 0.066
[Episode 300] score: 500.0, n_buffer: 18799, epsilon: 0.065
[Episode 320] score: 251.0, n_buffer: 23839, epsilon: 0.064
[Episode 340] score: 161.0, n_bu