In [39]:
import argparse
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
print(torch.cuda.is_available())
if torch.cuda.is_available() and False:
    print ("cuda in use")
    device = torch.device('cuda') 
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    print ("cuda not used")
    device = torch.device('cpu')
    torch.set_default_tensor_type('torch.FloatTensor')
    dtype = torch.float32

True
cuda not used


In [40]:
parser = argparse.ArgumentParser(description='PyTorch actor-critic example')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G', help='discount factor (default: 0.99)')
parser.add_argument('--seed', type=int, default=543, metavar='N', help='random seed (default: 543)')
# parser.add_argument('--render', action='store_true', help='render the environment')
parser.add_argument('--render', type=bool,default=False, help='render the environment')
parser.add_argument('--trace', type=bool,default=False, help='render the environment')
parser.add_argument('--log-interval', type=int, default=100, metavar='N', help='interval between training status logs (default: 10)')
parser.add_argument('-f','--file',help='Path for input file. (Dummy arg to enable execution in notebook.)' )
args = parser.parse_args() 

In [41]:

class World():
    
    def __init__(self):
        self.env = gym.make('CartPole-v0')
#         self.env = gym.make('CartPole-v1')
#         self.env = gym.make('Acrobot-v1')
        self.reset()
        
    def reset(self):
        self.reward = 0.0
        self.done = False
        self.actions = 0
        self.state = torch.tensor(self.env.reset(), requires_grad=False, dtype=dtype, device=device)
        
    def action_count(self):
        return self.env.action_space.n
    
    def dimension_count(self):
        return self.env.observation_space.shape[0]
    
    def step(self,action):
        self.state, self.reward, self.done, _ = self.env.step(action.item())
        self.state = torch.tensor(self.state, requires_grad=False, dtype=dtype, device=device)
        self.actions += 1
        if args.render: self.env.render()

def selu(x):
    alpha = 1.6732632423543772848170429916717
    scale = 1.0507009873554804934193349852946
    return scale * F.elu(x, alpha)

class Critic(nn.Module):
    
    def __init__(self,world: World,hidden_nodes=32):
        super(Critic, self).__init__()
        self.world = world
        self.one = torch.ones([1], requires_grad=False, dtype=dtype, device=device)
        self.zero = torch.zeros([1], requires_grad=False, dtype=dtype, device=device)
        self.l1 = nn.Linear(world.dimension_count(),hidden_nodes)
        self.l1.weight.data.normal_(0.0, np.sqrt(1./(self.world.dimension_count())))
        self.head = nn.Linear(hidden_nodes, 1)
        self.head.weight.data.normal_(0.0, np.sqrt(1./(hidden_nodes)))
        self.prev_value = self.zero
        self.value = self.zero
        
    def forward(self, state):
        self.prev_value = self.value
        self.l1_out = selu(self.l1(state))
        self.value = self.head(self.l1_out)
        return self.value
    
    #What the previous value should have been knowing what we know after the last state transition
    def hindsight_value(self):
        #Do not include gradient of the critic value here, just the data.
        return self.world.reward * self.zero if self.world.done and self.world.actions < 199 else self.world.reward + args.gamma * self.value.data
         
    #Temporal Difference Loss is for the previous state!
    def get_loss(self):
        self.loss = F.mse_loss(self.prev_value,self.hindsight_value())
        if args.trace: print("Critic value and loss:",self.prev_value,self.loss)
        return self.loss
    
class Actor(nn.Module):
    
    def __init__(self, critic: Critic,hidden_nodes=64):
        super(Actor, self).__init__()  
        self.episode_discount = 1.
        self.critic = critic
        self.l1_zeros = torch.zeros([critic.world.dimension_count()], requires_grad=False, dtype=dtype, device=device)
        self.l1 = nn.Linear(critic.world.dimension_count(),hidden_nodes)
        self.l1.weight.data.normal_(0.0, np.sqrt(1./(critic.world.dimension_count())))
        self.head = nn.Linear(hidden_nodes, critic.world.action_count())
        self.head.weight.data.normal_(0.0, np.sqrt(1./(hidden_nodes)))
                
    def forward(self, state):
        self.l1_out = F.selu(self.l1(state))
        self.value = F.softmax(selu(self.head(self.l1_out)),dim=0)
        return self.value
    
    def randomize(self):
        self.l1_out = self.l1_zeros
        self.value = F.softmax(torch.rand([self.head.out_features], requires_grad=False, dtype=dtype, device=device), dim=0)
        return self.value
    
    def choose_action(self):
        self.categories = Categorical(self.value)
        self.action = self.categories.sample()
        if args.trace: print("action scores:",self.categories.probs,"Action:",self.action.item())
        return self.action

    #The "advantage" is how much better the state is after the action than we expected it would be
    def get_loss(self):
        #Do not include gradient from critic.
        advantage = self.critic.hindsight_value() - self.critic.prev_value.data
        self.loss = -self.categories.log_prob(self.action)*advantage*self.episode_discount
        if args.trace: print("actor loss:", self.loss)
        return self.loss


In [42]:
def train(episodes=1000):
    mave_reward = 10
    mave_value = 10.
    action_preferences = np.array([0.5,0.5])
    
    for i_episode in range(1,episodes+1):
        ep_reward = 0
        ep_value = 0.
        ep_action_preferences = np.array([0.,0.])
        world.reset()
        critic.forward(world.state)     
        I = 1.
        for moves in range(10000):
            
            #Take an action and evaluate it.
            actor.episode_discount = I
            I *= args.gamma
            actor.forward(world.state)
            world.step(actor.choose_action())
            critic.forward(world.state)
            ep_reward += world.reward
            
            #Train the critic's value forecast
            ep_value += critic.value.item()
            critic_optimizer.zero_grad()
            loss = critic.get_loss()
            loss.backward()
            critic_optimizer.step()
            
            #Train the action policy
            ep_action_preferences += actor.categories.probs.detach().cpu().numpy()
            actor_optimizer.zero_grad()
            loss = actor.get_loss()
            loss.backward()    
            actor_optimizer.step()
            
            if(world.done):
                if args.trace: print("DONE")
                break

        ep_action_preferences /= moves
        action_preferences =  0.05 * ep_action_preferences + (1 - 0.05) * action_preferences
        mave_value /= moves
        mave_reward = 0.05 * ep_reward + (1 - 0.05) * mave_reward
        mave_value = 0.05 * ep_value + (1 - 0.05) * mave_value
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tMoving average reward: {:.2f}\tMoving average critic value: {:.2f}\tAction Preferences: {:.2f},{:.2f}'.format(
                  i_episode, ep_reward, mave_reward, mave_value,action_preferences[0],action_preferences[1]))
        if mave_reward > world.env.spec.reward_threshold:
            print("Episode {}\tSolved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(i_episode,mave_reward, moves))
            break

In [43]:
#pretrain critic on random action policy
def prime_critic(episodes=1000):
    for i_episode in range(1,episodes+1):
        world.reset()
        critic.forward(world.state)
        for t in range(1000):
            actor.randomize()
            world.step(actor.choose_action())
            critic.forward(world.state)
            critic_optimizer.zero_grad()
            critic.get_loss().backward()                          
            critic_optimizer.step()
            if(world.done):
                break
                
#pretrain policy on current critic evaluation
def prime_actor(episodes=1000):
    for t in range(episodes):
        world.reset()
        critic.forward(world.state)  
        actor.forward(world.state)
        world.step(actor.choose_action())
        critic.forward(world.state)
        actor_optimizer.zero_grad() 
        actor.get_loss().backward()                                 
        actor_optimizer.step()

In [44]:
def reset_trainer():
    args.trace = False
    args.render = False
    global world
    global actor
    global critic
    global actor_optimizer
    global critic_optimizer
    world = World()
    critic = Critic(world,32)
    actor = Actor(critic,64)
    world.env.seed(args.seed)
    torch.manual_seed(args.seed)
    actor_optimizer = optim.Adam(actor.parameters(), lr=4e-5,weight_decay=0.01)#lr=4e-5,weight_decay=0.00001)
    critic_optimizer = optim.Adam(critic.parameters(), lr=5e-3,weight_decay=0.01)#lr=5e-3,weight_decay=0.00001)
    torch.cuda.empty_cache()

In [45]:

args.trace = False
args.render = False
args.log_interval = 100
for i in range(3):
    print("New training test")
    reset_trainer()
    prime_critic(1)
    prime_actor(1)
    train(1000)

New training test
Episode 100	Last reward: 28.00	Moving average reward: 38.37	Moving average critic value: 36.71	Action Preferences: 0.52,0.51
Episode 200	Last reward: 39.00	Moving average reward: 51.81	Moving average critic value: 57.87	Action Preferences: 0.50,0.52
Episode 300	Last reward: 80.00	Moving average reward: 80.67	Moving average critic value: 177.52	Action Preferences: 0.51,0.51
Episode 400	Last reward: 200.00	Moving average reward: 186.24	Moving average critic value: 978.45	Action Preferences: 0.50,0.51
Episode 500	Last reward: 190.00	Moving average reward: 172.87	Moving average critic value: 769.32	Action Preferences: 0.51,0.49
Episode 600	Last reward: 115.00	Moving average reward: 135.51	Moving average critic value: 437.68	Action Preferences: 0.50,0.51
Episode 700	Last reward: 200.00	Moving average reward: 157.64	Moving average critic value: 785.80	Action Preferences: 0.50,0.51
Episode 800	Last reward: 161.00	Moving average reward: 110.50	Moving average critic value: 358

In [None]:
args.log_interval = 10
args.trace = False
args.render = True
train(100) 