In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
# Training hyperparameters, read as module-level globals by PolicyNet.
gamma = 0.98          # discount factor applied when accumulating the return in train_net
learning_rate = 2e-4  # step size passed to the Adam optimizer

In [5]:
class PolicyNet(nn.Module):
    """Two-layer softmax policy trained with a REINFORCE-style update.

    The network maps a 4-feature input to a probability distribution over
    2 actions. It owns its Adam optimizer (step size taken from the
    module-level ``learning_rate``) and an episode buffer, ``self.data``,
    holding one ``(reward, prob_of_chosen_action)`` pair per step, which
    ``train_net`` consumes and clears.
    """

    def __init__(self):
        super().__init__()
        # Episode buffer filled by put_data() and emptied by train_net().
        self.data = []

        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 2)
        # Built after the layers so self.parameters() already contains them.
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has 4 features, either a
                single input of shape ``(4,)`` or a batch ``(..., 4)``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 2 whose entries sum to 1.
        """
        x = F.relu(self.fc1(x))
        # Normalize over the action axis (the last one). dim=0 is only correct
        # for a single 1-D input; on a batch it would normalize across samples
        # instead of across actions. For 1-D input dim=-1 and dim=0 coincide.
        return F.softmax(self.fc2(x), dim=-1)

    def put_data(self, item):
        """Append one ``(reward, prob)`` step record to the episode buffer.

        ``prob`` must be the differentiable probability the policy assigned
        to the action that was taken, since ``train_net`` backpropagates
        through it.
        """
        self.data.append(item)

    def train_net(self):
        """Run one policy-gradient step on the buffered episode, then clear it.

        Walks the buffer from the last step to the first, accumulating the
        discounted return ``R = reward + gamma * R`` (``gamma`` is the
        module-level discount factor), and minimizes the sum of
        ``-R * log(prob)`` over all steps. Does nothing when the buffer is
        empty.
        """
        if not self.data:
            # Nothing to learn from; skip the optimizer step entirely.
            return

        R = 0
        step_losses = []
        for reward, prob in reversed(self.data):
            R = reward + gamma * R
            step_losses.append(-R * torch.log(prob))

        self.optimizer.zero_grad()
        # One backward pass over the summed loss yields the same gradients as
        # calling backward() on every step's loss and letting them accumulate.
        torch.stack(step_losses).sum().backward()
        self.optimizer.step()
        self.data = []

In [9]:
def _reset_env(env):
    """Return the initial observation from ``env.reset()``.

    Accepts both reset conventions: the observation alone, or an
    ``(observation, info)`` tuple as returned by newer Gym / Gymnasium.
    """
    result = env.reset()
    return result[0] if isinstance(result, tuple) else result


def _step_env(env, action):
    """Advance ``env`` one step and return ``(next_state, reward, done)``.

    Accepts both step conventions: the 4-tuple ``(obs, reward, done, info)``
    and the 5-tuple ``(obs, reward, terminated, truncated, info)``; in the
    latter case the episode is over when either flag is set.
    """
    result = env.step(action)
    if len(result) == 5:
        next_state, reward, terminated, truncated, _ = result
        return next_state, reward, terminated or truncated
    next_state, reward, done, _ = result
    return next_state, reward, done


num_episodes = 10000   # total training episodes
print_interval = 100   # report the latest episode score every this many episodes

env = gym.make("CartPole-v1")
pi = PolicyNet()

try:
    for n_epi in range(num_episodes):
        score = 0.
        state = _reset_env(env)
        done = False

        # Roll out one full episode, recording (reward, prob of taken action).
        while not done:
            prob = pi(torch.from_numpy(state).float())
            m = Categorical(prob)
            action = m.sample()
            next_state, reward, done = _step_env(env, action.item())
            pi.put_data((reward, prob[action]))
            state = next_state
            score += reward

        # One policy update per episode, using the buffered steps.
        pi.train_net()
        if n_epi % print_interval == 0 and n_epi != 0:
            print(f"[Episode {n_epi}] score: {score}")
finally:
    # Release the environment even if training is interrupted or raises.
    env.close()

[Episode 100] score: 38.0
[Episode 200] score: 28.0
[Episode 300] score: 12.0
[Episode 400] score: 69.0
[Episode 500] score: 27.0
[Episode 600] score: 16.0
[Episode 700] score: 22.0
[Episode 800] score: 137.0
[Episode 900] score: 101.0
[Episode 1000] score: 63.0
[Episode 1100] score: 118.0
[Episode 1200] score: 110.0
[Episode 1300] score: 162.0
[Episode 1400] score: 204.0
[Episode 1500] score: 149.0
[Episode 1600] score: 149.0
[Episode 1700] score: 275.0
[Episode 1800] score: 337.0
[Episode 1900] score: 104.0
[Episode 2000] score: 194.0
[Episode 2100] score: 169.0
[Episode 2200] score: 249.0
[Episode 2300] score: 500.0
[Episode 2400] score: 366.0
[Episode 2500] score: 465.0
[Episode 2600] score: 321.0
[Episode 2700] score: 500.0
[Episode 2800] score: 484.0
[Episode 2900] score: 133.0
[Episode 3000] score: 500.0
[Episode 3100] score: 418.0
[Episode 3200] score: 424.0
[Episode 3300] score: 372.0
[Episode 3400] score: 145.0
[Episode 3500] score: 374.0
[Episode 3600] score: 285.0
[Episode 