In [6]:
import gym
import numpy as np


In [1]:
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class ActorCriticNetwork(nn.Module):
    def __init__(self, lr, input_dims, n_actions, fc1_dims=256, fc2_dims=256):
        super(ActorCriticNetwork, self).__init__()
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.pi = nn.Linear(fc2_dims, n_actions)
        self.v = nn.Linear(fc2_dims, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = self.pi(x)
        v = self.v(x)

        return (pi, v)

In [3]:
class Agent():
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions, 
                 gamma=0.99):
        self.gamma = gamma
        self.lr = lr
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.actor_critic = ActorCriticNetwork(lr, input_dims, n_actions, 
                                               fc1_dims, fc2_dims)
        self.log_prob = None

    def choose_action(self, observation):
        state = T.tensor([observation], dtype=T.float).to(self.actor_critic.device)
        probabilities, _ = self.actor_critic.forward(state)
        probabilities = F.softmax(probabilities, dim=1)
        action_probs = T.distributions.Categorical(probabilities)
        action = action_probs.sample()
        log_prob = action_probs.log_prob(action)
        self.log_prob = log_prob

        return action.item()

    def learn(self, state, reward, state_, done):
        self.actor_critic.optimizer.zero_grad()

        state = T.tensor([state], dtype=T.float).to(self.actor_critic.device)
        state_ = T.tensor([state_], dtype=T.float).to(self.actor_critic.device)
        reward = T.tensor(reward, dtype=T.float).to(self.actor_critic.device)

        _, critic_value = self.actor_critic.forward(state)
        _, critic_value_ = self.actor_critic.forward(state_)

        delta = reward + self.gamma*critic_value_*(1-int(done)) - critic_value

        actor_loss = -self.log_prob*delta
        critic_loss = delta**2

        (actor_loss + critic_loss).backward()
        self.actor_critic.optimizer.step()

In [7]:
if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99, lr=5e-6, input_dims=[8], n_actions=4,
                  fc1_dims=2048, fc2_dims=1536)
    n_games = 500

    fname = 'ACTOR_CRITIC_' + 'lunar_lander_' + str(agent.fc1_dims) + \
            '_fc1_dims_' + str(agent.fc2_dims) + '_fc2_dims_lr' + str(agent.lr) +\
            '_' + str(n_games) + 'games'
    figure_file = 'plots/' + fname + '.png'

    scores = []
    for i in range(n_games):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.learn(observation, reward, observation_, done)
            observation = observation_
        scores.append(score)

        avg_score = np.mean(scores[-100:])
        print('episode ', i, 'score %.1f' % score,
                'average score %.1f' % avg_score)

    x = [i+1 for i in range(n_games)]


episode  0 score -143.5 average score -143.5
episode  1 score -96.8 average score -120.2
episode  2 score -447.8 average score -229.4
episode  3 score -244.6 average score -233.2
episode  4 score -201.8 average score -226.9
episode  5 score -91.4 average score -204.3
episode  6 score -79.6 average score -186.5
episode  7 score -153.8 average score -182.4
episode  8 score -81.6 average score -171.2
episode  9 score -401.6 average score -194.3
episode  10 score -368.9 average score -210.1
episode  11 score -193.9 average score -208.8
episode  12 score -103.8 average score -200.7
episode  13 score -85.7 average score -192.5
episode  14 score -68.7 average score -184.2
episode  15 score -45.6 average score -175.6
episode  16 score -197.8 average score -176.9
episode  17 score -101.5 average score -172.7
episode  18 score -260.6 average score -177.3
episode  19 score -217.7 average score -179.3
episode  20 score -43.9 average score -172.9
episode  21 score -133.5 average score -171.1
episod

episode  178 score -225.6 average score -185.3
episode  179 score -186.2 average score -184.7
episode  180 score -239.9 average score -184.2
episode  181 score -90.8 average score -182.6
episode  182 score -54.8 average score -180.8
episode  183 score 50.9 average score -179.7
episode  184 score -245.5 average score -177.4
episode  185 score -125.8 average score -178.3
episode  186 score -277.7 average score -180.4
episode  187 score 23.1 average score -178.3
episode  188 score -161.3 average score -178.8
episode  189 score -124.7 average score -176.7
episode  190 score -51.5 average score -174.4
episode  191 score -261.9 average score -176.1
episode  192 score -238.4 average score -175.9
episode  193 score -43.6 average score -174.3
episode  194 score -126.4 average score -174.8
episode  195 score -243.4 average score -175.7
episode  196 score -186.4 average score -176.5
episode  197 score -135.1 average score -175.2
episode  198 score -91.9 average score -175.4
episode  199 score -15

episode  356 score -232.8 average score -89.4
episode  357 score -88.0 average score -89.7
episode  358 score -22.3 average score -86.9
episode  359 score -165.7 average score -88.0
episode  360 score -130.5 average score -88.8
episode  361 score 4.1 average score -87.1
episode  362 score -43.8 average score -86.8
episode  363 score -224.8 average score -88.6
episode  364 score -235.3 average score -90.0
episode  365 score -10.8 average score -89.6
episode  366 score -251.8 average score -91.5
episode  367 score -146.7 average score -92.7
episode  368 score -62.9 average score -92.8
episode  369 score -235.2 average score -93.1
episode  370 score -233.0 average score -92.9
episode  371 score -189.6 average score -93.8
episode  372 score 56.2 average score -94.2
episode  373 score -96.1 average score -95.6
episode  374 score -3.4 average score -93.6
episode  375 score -14.6 average score -93.5
episode  376 score -26.8 average score -92.9
episode  377 score 49.3 average score -90.4
episo

KeyboardInterrupt: 