In [1]:
import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

Published solutions predict a count(actions) vector with each value being the value of taking that action. this is equivalent to a state value but duplicate each state by the number of actions that can achieve it. Is that Q-Learning????


In [2]:
parser = argparse.ArgumentParser(description='PyTorch actor-critic example')
parser.add_argument('--gamma', type=float, default=0.999, metavar='G', help='discount factor (default: 0.99)')
parser.add_argument('--seed', type=int, default=543, metavar='N', help='random seed (default: 543)')
# parser.add_argument('--render', action='store_true', help='render the environment')
parser.add_argument('--render', type=bool,default=False, help='render the environment')
parser.add_argument('--log-interval', type=int, default=10, metavar='N', help='interval between training status logs (default: 10)')
parser.add_argument('-f','--file',help='Path for input file. (Dummy arg to enable execution in notebook.)' )
args = parser.parse_args() 
torch.manual_seed(args.seed)

#Get the smallest possible non-zero number on this machine
eps = np.finfo(np.float32).eps.item()

In [34]:
class World():
    def __init__(self):
        self.env = gym.make('CartPole-v0')
        self.env.seed(args.seed)
        self.reward = 0.0
        self.done = False
        self.reset()
    def reset(self):
        self.state = torch.tensor(self.env.reset(), requires_grad=False, dtype=torch.float)
    def action_count(self):
        return self.env.action_space.n
    def world_dimensions(self):
        return self.env.observation_space.shape[0]
    def step(self,action):
        self.state, self.reward, self.done, _ = self.env.step(action.item())
        self.state = torch.tensor(self.state, requires_grad=False, dtype=torch.float)
        if args.render:
            self.env.render()
        
class Actor(nn.Module):
    
    def __init__(self,world: World):
        super(Actor, self).__init__()
        self.l1 = nn.Linear(world.world_dimensions(),48)
        self.l2 = nn.Linear(48,24)
        self.head = nn.Linear(24, world.action_count())

    def forward(self, state):
        a1 = F.softplus(self.l1(state))
        a2 = F.softplus(self.l2(a1))
        head = self.head(a2)
        action_scores = F.softmax(head, dim=-1)
        return action_scores
    
    def choose_action(self,scores):
        self.categories = Categorical(scores)
        self.action = self.categories.sample()
        return self.action
    
    def advantage_loss(self,critic,world):
        advantage = world.reward + critic.value.data * args.gamma - critic.prev_value.data
        return self.categories.log_prob(self.action)*advantage
    
class Critic(nn.Module):
    
    def __init__(self,world: World):
        super(Critic, self).__init__()
        self.l1 = nn.Linear(world.world_dimensions(),48)
        self.l2 = nn.Linear(48,24)
        self.head = nn.Linear(24, 1)
        self.value = 10.

    def forward(self, state):
        self.prev_value = self.value
        a1 = F.softplus(self.l1(state))
        a2 = F.softplus(self.l2(a1))
        self.value = self.head(a2)
        return self.value
    
    def td_loss(self,world):
        hindsight_value = world.reward + self.value.data * args.gamma
        return hindsight_value - self.prev_value

world = World()
actor = Actor(world)
critic = Critic(world)

In [36]:
args.render = False
def main():
    action_optimizer = optim.Adagrad(actor.parameters(), lr=5e-3,lr_decay= 0.0,weight_decay=0.0001)
    critic_optimizer = optim.Adagrad(critic.parameters(), lr=5e-3,lr_decay= 0.0,weight_decay=0.0001)
    value = critic(world.state)
    total_r = 0.
    for i in range(10000):
        action_scores = actor(world.state)
        action = actor.choose_action(action_scores)
        world.step(action)
        total_r += world.reward
        critic(world.state)
        
        critic_optimizer.zero_grad()
        value_loss = critic.td_loss(world)
        value_loss.backward()                                      
        critic_optimizer.step()
        
        action_optimizer.zero_grad() 
        action_loss = actor.advantage_loss(critic,world)
        action_loss.backward()                                      
        action_optimizer.step()
        if world.done:
            if (total_r < 30):
                args.render = False
            if (total_r > 120):
                args.render = True
            print(total_r, " ", end ="")
            world.reset()
            value = critic(world.state)
            total_r = 0.
    print("DONE")
main()      

0
1


RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.