In [1]:
import random

import numpy as np
import gym
import gym_game


In [2]:
# Instantiate the custom environment by its gym id.
# NOTE(review): the "Pygame-v0" id is presumably registered as a side effect of
# `import gym_game` -- confirm against that package.
env = gym.make("Pygame-v0")

pygame 2.1.0 (SDL 2.0.16, Python 3.8.5)
Hello from the pygame community. https://www.pygame.org/contribute.html


## Q-Learning

In [3]:
# --- Q-learning hyperparameters --------------------------------------------
n_trials = 9999        # number of episodes to run
trial_length = 1000    # maximum number of steps per episode
epsilon = 1            # initial exploration rate (epsilon-greedy)
epsilon_decay = 0.999  # factor applied to epsilon after each episode
# NOTE: the A2C section further down assigns `learning_rate` again; re-run this
# cell before Q-learning training if that cell has already been executed.
learning_rate = 0.1
gamma = 0.6            # discount factor used in the Q-value update

# --- Q-table ----------------------------------------------------------------
# One axis per observation dimension, sized (high + 1) per dimension, followed
# by one trailing axis with an entry per action.
num_box = tuple(
    (
        env.observation_space.high
        + np.ones(env.observation_space.shape)
    ).astype(int)
)
q_table = np.zeros((*num_box, env.action_space.n))


In [4]:
def simulate():
    """Train the tabular Q-learning agent on ``env`` with epsilon-greedy exploration.

    Runs ``n_trials`` episodes of at most ``trial_length`` steps each, updates the
    module-level ``q_table`` in place and prints one summary line per episode.
    After every episode ``epsilon`` is multiplied by ``epsilon_decay`` as long as
    it is still >= 0.005.

    Globals read: ``env``, ``q_table``, ``n_trials``, ``trial_length``,
    ``learning_rate``, ``gamma``, ``epsilon_decay``.
    Globals rebound: ``epsilon``.

    NOTE: ``learning_rate`` is a shared global that the A2C section of this
    notebook also assigns; re-run the Q-learning hyperparameter cell first if
    that section has already been executed.
    """
    global epsilon  # the only global this function rebinds
    for episode in range(n_trials):

        # Init environment. tuple(...) guarantees that q_table[state] addresses
        # a single cell; a list/ndarray would trigger NumPy fancy indexing.
        # Assumes observations are sequences of integer bin indices.
        state = tuple(env.reset())
        total_reward = 0

        # The agent acts for at most trial_length steps per episode
        for t in range(trial_length):

            # Epsilon-greedy: explore with probability epsilon, else exploit
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state])

            # Do action and get result
            next_state, reward, done, _ = env.step(action)
            next_state = tuple(next_state)
            total_reward += reward

            # Current estimate for (state, action) and best estimate for next_state
            q_value = q_table[state][action]
            best_q = np.max(q_table[next_state])

            # Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + gamma * max_a' Q(s', a'))
            q_table[state][action] = (1 - learning_rate) * q_value + learning_rate * (reward + gamma * best_q)

            # Set up for the next iteration
            state = next_state

            # Uncomment to draw the game while training
            # env.render()

            # When the episode ends (or the step budget is used up), report it
            if done or t >= trial_length - 1:
                print("Episode %d finished after %i time steps with total reward = %f., epsilon = %f" % (episode, t, total_reward, epsilon))
                break

        # Exploration rate decay
        if epsilon >= 0.005:
            epsilon *= epsilon_decay

In [5]:
# Uncomment to train the tabular Q-learning agent (runs up to n_trials episodes):
# simulate()

## A2C

In [26]:
from model import A2C as Agent
from model import compute_a2c_loss, compute_returns
import torch
import torch.nn as nn

In [27]:
# --- A2C hyperparameters ----------------------------------------------------
# NOTE: this rebinds the global `learning_rate` that the Q-learning `simulate()`
# defined earlier in the notebook also reads.
dim_hidden = 128
learning_rate = 5e-4

# Build the agent and its Adam optimizer.
# NOTE(review): the three positional arguments look like (observation size,
# hidden size, number of actions) -- confirm against model.A2C.
agent = Agent(
    env.observation_space.shape[0],
    dim_hidden,
    env.action_space.n,
)
optimizer = torch.optim.Adam(agent.parameters(), lr=learning_rate)

In [34]:
def simulate():
    """Train the A2C ``agent`` on ``env`` with one gradient update per episode.

    Runs ``n_trials`` episodes of at most ``trial_length`` steps. Each step the
    agent is queried for an action distribution and a value estimate, the
    chosen action is applied, and the (shaped, scaled) reward is recorded.
    After the episode the combined policy + value loss is back-propagated, the
    gradient norm is clipped to 0.5, and the optimizer takes one step.

    Globals read: ``env``, ``agent``, ``optimizer``, ``n_trials``,
    ``trial_length``, ``compute_returns``, ``compute_a2c_loss``.

    NOTE: this definition shadows the Q-learning ``simulate`` defined earlier
    in the notebook.
    """
    for episode in range(n_trials):

        # Init environment and per-episode buffers
        state = env.reset()
        probs, rewards, values = [], [], []

        # The agent acts for at most trial_length steps per episode
        for t in range(trial_length):

            # A2C agent picks action
            a_dist, value = agent(torch.tensor(state).float())  ### .view(1,1,-1) for LSTM
            action, action_prob = agent.pick_action(a_dist)

            # Do action and get result
            next_state, reward, done, _ = env.step(action)
            # Reward shaping: bonus proportional to average_speed / time_spent.
            # NOTE(review): assumes time_spent is non-zero after a step -- confirm.
            reward += env.pygame.car.average_speed/env.pygame.car.time_spent * 10  ### encourage to go faster
            probs.append(action_prob)
            rewards.append(reward/200)  # scaled-down reward is what gets logged and learned from
            values.append(value)

            # Set up for the next iteration
            state = next_state

            # Draw games
            env.render()

            # When the episode ends (or the step budget is used up), report it
            if done or t >= trial_length - 1:
                print("Episode %d finished after %i time steps with total reward = %f." % (episode, t, sum(rewards)))
                break

        returns = compute_returns(rewards) + 0.01
        loss_policy, loss_value = compute_a2c_loss(probs, values, returns)
        loss = loss_policy + loss_value
        print('loss = %f' % (loss.item()))

        # Order matters: clear old gradients, compute fresh ones, clip those,
        # then step. Clipping before backward() would only touch stale
        # gradients that zero_grad() is about to discard.
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(agent.parameters(), 0.5)
        optimizer.step()

In [35]:
# Start training. `simulate` resolves to the most recently executed definition;
# in notebook order that is the A2C version defined just above.
simulate()

Episode 0 finished after 652 time steps with total reward = -25.236710.
loss = 14.258533
Episode 1 finished after 644 time steps with total reward = 125.090152.
loss = 133.279388
Episode 2 finished after 198 time steps with total reward = -40.375058.
loss = -32.345512
Episode 3 finished after 201 time steps with total reward = -39.319001.
loss = 25.136169
Episode 4 finished after 678 time steps with total reward = 125.376783.
loss = 467.224365
Episode 5 finished after 606 time steps with total reward = 125.445632.
loss = 142.285522


KeyboardInterrupt: 