In [2]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [4]:
# Hyperparameters
LR = 0.0002
GAMMA = 0.98
N_ROLLOUT = 10      # update networks for every 10 transitions

In [13]:
class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.data = []
        
        self.fc1 = nn.Linear(4, 256)
        self.fc_policy = nn.Linear(256, 2)
        self.fc_value = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=LR)
        
    def policy(self, x, softmax_dim=0):
        x = F.relu(self.fc1(x))
        x = self.fc_policy(x)
        prob = F.softmax(x, dim=softmax_dim)
        
        return prob
    
    def value(self, x):
        x = F.relu(self.fc1(x))
        value = self.fc_value(x)
        
        return value

    def put_data(self, transition):
        self.data.append(transition)
        
    def make_batch(self):
        states, actions, rewards, next_states, done_masks = [], [], [], [], []
        for transition in self.data:
            state, action, reward, next_state, done_mask = transition
            states.append(state)
            actions.append([action])
            rewards.append([reward/100.])
            next_states.append(next_state)
            done_masks.append([0. if done_mask else 1.])

        state_batch = torch.tensor(states, dtype=torch.float)
        action_batch = torch.tensor(actions)
        reward_batch = torch.tensor(rewards, dtype=torch.float)
        next_state_batch = torch.tensor(next_states, dtype=torch.float)
        done_mask_batch = torch.tensor(done_masks, dtype=torch.float)
        self.data = []
        return (state_batch, action_batch, reward_batch, next_state_batch, done_mask_batch)

    def train_net(self):
        state, action, reward, next_state, done_mask = self.make_batch()
        td_target = reward + GAMMA*self.value(next_state)*done_mask         # done_mask for episodic scenario
        delta = td_target - self.value(state)
        
        policy = self.policy(state, softmax_dim=1)
        policy_a = policy.gather(1, action)
        loss = -torch.log(policy_a) * delta.detach() + F.smooth_l1_loss(self.value(state), td_target.detach())
        
        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()
        

In [14]:
env = gym.make('CartPole-v1')
model = ActorCritic()
score = 0.

for n_epi in range(10000):
    done = False
    state = env.reset()
    
    while not done:
        for transition in range(N_ROLLOUT):
            prob = model.policy(torch.from_numpy(state).float())
            m = Categorical(prob)
            action = m.sample().item()
            next_state, reward, done, info = env.step(action)
            model.put_data((state, action, reward, next_state, done))
            
            state = next_state
            score += reward
            
            if done:
                break
            
        model.train_net()
    
    if n_epi%20==0 and n_epi != 0:
        print(f"[EPISODE {n_epi}] avg score: {score/n_epi}")
        score = 0.

env.close()

[EPISODE 20] avg score: 21.05
[EPISODE 40] avg score: 8.95
[EPISODE 60] avg score: 5.466666666666667
[EPISODE 80] avg score: 5.2625
[EPISODE 100] avg score: 3.52
[EPISODE 120] avg score: 3.716666666666667
[EPISODE 140] avg score: 3.3714285714285714
[EPISODE 160] avg score: 3.90625
[EPISODE 180] avg score: 3.588888888888889
[EPISODE 200] avg score: 3.095
[EPISODE 220] avg score: 3.2590909090909093
[EPISODE 240] avg score: 3.370833333333333
[EPISODE 260] avg score: 2.3076923076923075
[EPISODE 280] avg score: 1.9928571428571429
[EPISODE 300] avg score: 1.8033333333333332
[EPISODE 320] avg score: 2.0875
[EPISODE 340] avg score: 1.9676470588235293
[EPISODE 360] avg score: 1.625
[EPISODE 380] avg score: 1.7552631578947369
[EPISODE 400] avg score: 1.7625
[EPISODE 420] avg score: 1.3976190476190475
[EPISODE 440] avg score: 1.3386363636363636
[EPISODE 460] avg score: 1.1695652173913043
[EPISODE 480] avg score: 1.81875
[EPISODE 500] avg score: 2.43
[EPISODE 520] avg score: 2.498076923076923
[EPI

KeyboardInterrupt: 