In [1]:
import numpy as np
import gym
import gym_game
import random

In [2]:
# Build the environment registered under the id "MemTask-v0".
# NOTE(review): presumably importing `gym_game` is what registers this id
# with gym — confirm against the gym_game package.
env = gym.make("MemTask-v0")

pygame 2.1.0 (SDL 2.0.16, Python 3.8.5)
Hello from the pygame community. https://www.pygame.org/contribute.html


## A2C with DND

In [3]:
from model import DNDLSTM as Agent
from model import compute_a2c_loss, compute_returns
import torch
import torch.nn as nn

In [4]:
# Training configuration, read as notebook globals by simulate().
# Number of episodes simulate() iterates over.
n_trials = 9999
# Upper bound on the number of environment steps per episode.
trial_length = 1000
# Step size passed to the Adam optimizer below.
learning_rate = 5e-4
# Second constructor argument of the agent; presumably the LSTM hidden
# size — TODO confirm against model.DNDLSTM.
dim_hidden = 1280
# init agent / optimizer
# The agent is sized from the environment: observation length as input and
# number of discrete actions as output.
# NOTE(review): the trailing literal 5 is unexplained here; presumably the
# DND memory capacity (dictionary length) — confirm against model.DNDLSTM.
agent = Agent(env.observation_space.shape[0], dim_hidden, env.action_space.n, 5)
optimizer = torch.optim.Adam(agent.parameters(), lr=learning_rate)

In [5]:
# Display the size of the first observation dimension (used as the agent's input size).
env.observation_space.shape[0]

4500

In [6]:
# Display the number of discrete actions (used as the agent's output size).
env.action_space.n

4

In [9]:
def simulate():
    """Train the agent with A2C for `n_trials` episodes on `env`.

    Relies on the notebook globals `env`, `agent`, `optimizer`, `n_trials`
    and `trial_length`, and on `compute_returns` / `compute_a2c_loss`
    imported from `model`.

    Each episode rolls the agent out for at most `trial_length` steps (or
    until the environment reports `done`), collecting the action
    probabilities, rewards and value estimates. One optimizer step is then
    taken on the summed policy and value losses. An episode summary and the
    loss are printed for every episode.

    Returns:
        None
    """
    for episode in range(n_trials):

        # Init environment
        state = env.reset()
        probs, rewards, values = [], [], []
        h_t, c_t = agent.get_init_states()

        # Roll out a single episode of at most trial_length steps
        for t in range(trial_length):
            # only save memory at the last time point
            # NOTE(review): encoding is only enabled at t == trial_length-1,
            # so an episode that ends earlier via `done` never reaches this
            # branch (the logged runs end after ~90-140 steps with
            # trial_length = 1000) — confirm whether memory should instead
            # be written on the episode's actual final step.
            agent.turn_off_encoding()
            if t == trial_length-1:
                agent.turn_on_encoding()

            # A2C agent picks action
            # .view(1, 1, -1) gives the LSTM input layout (seq_len, batch, input_size)
            output_t, _ = agent(torch.tensor(state).float().view(1, 1, -1), h_t, c_t)
            action, action_prob, value, h_t, c_t = output_t

            # Do action and get result
            next_state, reward, done, _ = env.step(action)
            probs.append(action_prob)
            rewards.append(reward)
            values.append(value)

            # Set up for the next iteration
            state = next_state

            # Draw games
            # env.render()

            # When episode is done, print reward
            if done or t >= trial_length - 1:
                print("Episode %d finished after %i time steps with total reward = %f." % (episode, t, sum(rewards)))
                break

        # presumably gamma=0.0 reduces each return to the immediate reward;
        # verify against model.compute_returns. The 0.001 offset is kept
        # from the original code.
        returns = compute_returns(rewards, gamma=0.0, normalize=False) + 0.001
        loss_policy, loss_value = compute_a2c_loss(probs, values, returns)
        loss = loss_policy + loss_value
        print('loss = %f' % (loss.item()))

        # Gradient clipping must act on the gradients of *this* loss: clear
        # the old gradients, backpropagate, clip, and only then step.
        # Clipping before zero_grad()/backward() would only rescale the
        # previous iteration's gradients, which are discarded right after.
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(agent.parameters(), 0.5)
        optimizer.step()

In [10]:
simulate()

Episode 0 finished after 87 time steps with total reward = -110.000000.
loss = 268.365479
Episode 1 finished after 115 time steps with total reward = -90.000000.
loss = 300.667999
Episode 2 finished after 125 time steps with total reward = -90.000000.
loss = -224.907257
Episode 3 finished after 129 time steps with total reward = 80.000000.
loss = 414.764465
Episode 4 finished after 126 time steps with total reward = -70.000000.
loss = -166.942932
Episode 5 finished after 125 time steps with total reward = 50.000000.
loss = 322.983337
Episode 6 finished after 139 time steps with total reward = 60.000000.
loss = 348.609802
Episode 7 finished after 139 time steps with total reward = -60.000000.
loss = -137.808212
Episode 8 finished after 137 time steps with total reward = -70.000000.
loss = -156.426010
