In [2]:
import numpy as np
import gym
import gym_game
import random

In [3]:
# Build the game environment used by every training loop in this notebook.
# NOTE(review): the "Pygame-v0" id is presumably registered by `import gym_game` — confirm.
env = gym.make("Pygame-v0")

pygame 2.1.0 (SDL 2.0.16, Python 3.8.5)
Hello from the pygame community. https://www.pygame.org/contribute.html


### Q-Learning

In [4]:
# --- Episode budget -------------------------------------------------------
n_trials = 9999        # number of training episodes
trial_length = 1000    # maximum number of steps per episode

# --- Q-learning hyperparameters -------------------------------------------
learning_rate = 0.1    # weight of the new estimate in the Q-value update
gamma = 0.6            # discount applied to the best Q-value of the next state
epsilon = 1            # initial probability of taking a random action
epsilon_decay = 0.999  # factor applied to epsilon after each episode

# --- Q-table --------------------------------------------------------------
# One axis per observation dimension, each of length (upper bound + 1),
# followed by a final axis with one entry per action.
# NOTE(review): this assumes observations are integer bins in [0, high] — confirm.
num_box = tuple(
    (env.observation_space.high + np.ones(env.observation_space.shape)).astype(int)
)
q_table = np.zeros((*num_box, env.action_space.n))


In [5]:
def simulate():
    """Train the tabular Q-learning agent on ``env`` with an epsilon-greedy policy.

    Runs ``n_trials`` episodes of at most ``trial_length`` steps each and
    updates the module-level ``q_table`` in place. After every episode the
    module-level exploration rate ``epsilon`` is multiplied by
    ``epsilon_decay`` as long as it is still at or above 0.005.

    Reads the globals ``env``, ``q_table``, ``n_trials``, ``trial_length``,
    ``learning_rate``, ``gamma`` and ``epsilon_decay``. Prints one summary
    line per episode and returns nothing.

    NOTE: ``learning_rate`` is rebound further down the notebook for the A2C
    agent; run this function before that cell to train with the Q-learning
    value.
    """
    global epsilon  # the only global this function rebinds
    for episode in range(n_trials):

        # Start a new episode. The observation is converted to a tuple so it
        # addresses exactly one cell per q_table dimension; a list or ndarray
        # index would trigger NumPy advanced indexing along the first axis.
        state = tuple(env.reset())
        total_reward = 0

        # The agent gets at most trial_length steps per episode
        for t in range(trial_length):

            # Epsilon-greedy: explore with probability epsilon, otherwise
            # take the action with the highest current Q-value
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state])

            # Apply the action and observe the outcome
            next_state, reward, done, _ = env.step(action)
            next_state = tuple(next_state)
            total_reward += reward

            # Current estimate for the (state, action) pair
            q_value = q_table[state][action]

            # A terminal transition has no future return, so do not bootstrap
            # from the Q-values stored for the terminal observation.
            best_q = 0.0 if done else np.max(q_table[next_state])

            # Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + gamma * max_a' Q(s', a'))
            q_table[state][action] = (1 - learning_rate) * q_value + learning_rate * (reward + gamma * best_q)

            # Set up for the next iteration
            state = next_state

            # Draw games (disabled to keep training fast)
            # env.render()

            # When the episode ends or the step budget is used up, report and stop
            if done or t >= trial_length - 1:
                print("Episode %d finished after %i time steps with total reward = %f., epsilon = %f" % (episode, t, total_reward, epsilon))
                break

        # Exploration rate decay, stopped once epsilon drops below 0.005
        if epsilon >= 0.005:
            epsilon *= epsilon_decay

In [6]:
# simulate()

## A2C

In [7]:
from model import A2C as Agent
from model import compute_a2c_loss, compute_returns
import torch
import torch.nn as nn

In [8]:
# Hyperparameters for the A2C agent.
# NOTE: this rebinds `learning_rate`, replacing the Q-learning value (0.1) set earlier in the notebook.
dim_hidden = 128
learning_rate = 5e-4

# Actor-critic network: constructed from the observation size, the hidden width and the number of actions.
agent = Agent(env.observation_space.shape[0], dim_hidden, env.action_space.n)

# Adam optimizer over all of the agent's parameters.
optimizer = torch.optim.Adam(params=agent.parameters(), lr=learning_rate)

In [12]:
def simulate():
    """Train the A2C agent on ``env`` with one gradient update per episode.

    Runs ``n_trials`` episodes of at most ``trial_length`` steps each. During
    an episode the value returned by ``agent.pick_action`` alongside the
    action, the shaped reward and the critic value are collected per step.
    After the episode they are turned into returns and a combined
    policy + value loss, and ``optimizer`` takes a single step with the
    gradient norm clipped to 0.5.

    Reads the globals ``env``, ``agent``, ``optimizer``, ``n_trials`` and
    ``trial_length``. Renders every step, prints the episode summary and the
    loss, and returns nothing.

    NOTE: this definition shadows the earlier Q-learning ``simulate``.
    """
    for episode in range(n_trials):

        # Init environment and the per-episode rollout buffers
        state = env.reset()
        probs, rewards, values = [], [], []

        # The agent gets at most trial_length steps per episode
        for t in range(trial_length):

            # A2C agent picks action
            a_dist, value = agent(torch.tensor(state).float())  ### .view(1,1,-1) for LSTM
            action, action_prob = agent.pick_action(a_dist)

            # Do action and get result
            next_state, reward, done, _ = env.step(action)
            # Reward shaping: encourage the car to go faster.
            # NOTE(review): assumes car.time_spent is non-zero after a step — confirm.
            reward += env.pygame.car.average_speed/env.pygame.car.time_spent * 10
            probs.append(action_prob)
            rewards.append(reward/200)  # scale rewards down before computing returns
            values.append(value)

            # Set up for the next iteration
            state = next_state

            # Draw games
            env.render()

            # When the episode ends or the step budget is used up, report and stop
            if done or t >= trial_length - 1:
                print("Episode %d finished after %i time steps with total reward = %f." % (episode, t, sum(rewards)))
                break

        returns = compute_returns(rewards, gamma=0.1, normalize=True) + 0.01
        loss_policy, loss_value = compute_a2c_loss(probs, values, returns)
        loss = loss_policy + loss_value
        print('loss = %f' % (loss.item()))

        # Order matters: clear stale gradients, compute this episode's
        # gradients, clip them, then apply the update. Clipping before
        # backward() would act on the previous gradients, which zero_grad()
        # discards, leaving the gradients actually used by step() unclipped.
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(agent.parameters(), 0.5)
        optimizer.step()

In [13]:
# Run A2C training. This calls the most recent `simulate` definition (the A2C
# loop), which shadows the Q-learning function of the same name.
simulate()

Episode 0 finished after 769 time steps with total reward = -30.005407.
loss = -4.153461
Episode 1 finished after 678 time steps with total reward = -29.206710.
loss = 30.597309
Episode 2 finished after 997 time steps with total reward = 123.867324.
loss = 34.681225
Episode 3 finished after 999 time steps with total reward = 1.590694.
loss = 222.150101
Episode 4 finished after 44 time steps with total reward = -47.629128.
loss = -22.393373
Episode 5 finished after 695 time steps with total reward = -33.581553.
loss = 23.598576
Episode 6 finished after 407 time steps with total reward = -39.286484.
loss = 1.499432
Episode 7 finished after 762 time steps with total reward = -33.548111.
loss = 78.042816
Episode 8 finished after 758 time steps with total reward = -33.900783.
loss = 60.079620
Episode 9 finished after 999 time steps with total reward = 1.402344.
loss = 253.494324
Episode 10 finished after 818 time steps with total reward = -33.608066.
loss = 39.672077


KeyboardInterrupt: 