In [1]:
import numpy as np
import sys
import os
import random
import time
import imageio
import cv2
from PIL import Image
import torch
from collections import deque
from typing import Optional

In [2]:
# Select the CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Add the relative "game/" directory to the module search path before importing
# the Flappy Bird wrapper; the path is relative to the kernel's working directory.
sys.path.append("game/")
# The wrapper module is bound to the name `game` for the rest of the notebook.
from game import wrapped_flappy_bird as game

# sys.path.append(os.path.join(os.path.dirname(__file__), "../utils"))

pygame 2.1.0 (SDL 2.0.16, Python 3.9.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [5]:
# Image dimension constants. None of the visible cells reads them.
# NOTE(review): presumably the intended preprocessed frame size and stack depth
# -- confirm whether Agent / QNetwork actually use them.
IMG_WIDTH  = 80
IMG_HEIGHT = 80
IMG_DEPTH  = 4

In [6]:
# Hyperparameters. BUFFER_SIZE, BATCH_SIZE and LEARNING_RATE are passed to the
# Agent constructor; BATCH_SIZE, UPDATE_EVERY and TARGET_UPDATE_EVERY are read
# by train(). GAMMA and TAU are not referenced by any visible cell.
BUFFER_SIZE         = 100   # presumably the replay memory capacity -- confirm in Agent
BATCH_SIZE          = 64    # train() only learns once len(agent.memory) exceeds this
GAMMA               = 0.90  # discount factor
TAU                 = 1e-3  # soft update of target parameter
LEARNING_RATE       = 5e-4
UPDATE_EVERY        = 3     # train() calls learn_from_experience every N episodes
TARGET_UPDATE_EVERY = 9     # train() calls soft_update every N episodes

In [7]:
from Agent import Agent

In [8]:
def action_to_array(action: int):
    """Turn a discrete action into the two-element array passed to ``env.frame_step``.

    Args:
        action: the chosen action; only its truthiness is inspected.

    Returns:
        ``np.array([0, 1])`` when ``action`` is truthy, ``np.array([1, 0])``
        otherwise.
    """
    encoded = [0, 1] if action else [1, 0]
    return np.array(encoded)

In [10]:
def train(agent: Agent, n_episodes, max_time_step, eps_start, eps_end, eps_decay):
    """Play ``n_episodes`` games with ``agent`` and run its periodic updates.

    Each episode creates a fresh ``game.GameState``, advances one priming frame
    with the action array ``[0, 1]``, and then lets the agent act for at most
    ``max_time_step`` frames, handing every transition to ``agent.step``.
    After the episode, ``eps`` is reduced by ``eps_decay`` (never below
    ``eps_end``) and the periodic jobs run. Those jobs are keyed on the
    episode index, not on individual frames.

    Args:
        agent: object exposing ``q_value``, ``decide``, ``step``, ``memory``,
            ``learn_from_experience``, ``soft_update`` and ``qnetwork_local``.
        n_episodes: number of episodes to play.
        max_time_step: upper bound on agent-controlled frames per episode.
        eps_start: value of ``eps`` handed to the agent during the first episode.
        eps_end: lower bound for ``eps``.
        eps_decay: amount subtracted from ``eps`` after every episode.

    Returns:
        A ``(scores, num_rounds)`` tuple of lists holding, per episode, the
        summed reward and the number of agent-controlled frames played.
    """
    scores, num_rounds = [], []
    scores_window = deque(maxlen=100)  # only the most recent 100 episode scores
    eps = eps_start

    for episode in range(n_episodes):
        env = game.GameState()
        # Priming frame: its reward and done flag are overwritten before use.
        state, reward, done = env.frame_step(np.array([0, 1]))
        episode_return, frames_played = 0, 0

        for _ in range(max_time_step):
            q_values = agent.q_value(state, eps)
            action = agent.decide(q_values, eps)
            next_state, reward, done = env.frame_step(action_to_array(action))
            agent.step(state, action, reward, next_state, done)
            # --- this step has finished ---
            # wandb.log({'action':action, 'reward': reward, 'eps': eps})
            # wandb.log({f'action_values[{i}]':q for i, q in enumerate(action_values.cpu().numpy().flatten()) })
            # --- next iteration ---
            state = next_state
            episode_return += reward
            frames_played += 1
            if not done:
                continue
            # wandb.log({'rounds':rounds,'accumulate_reward':accumulate_reward, 'max_number':np.max(state)})
            print({'rounds': frames_played, 'accumulate_reward': episode_return})
            break

        scores_window.append(episode_return)
        scores.append(episode_return)
        num_rounds.append(frames_played)
        eps = max(eps_end, eps - eps_decay)

        # Learn only on every UPDATE_EVERY-th episode, and only once the memory
        # holds more than one batch of transitions.
        on_learn_episode = episode % UPDATE_EVERY == 0
        if on_learn_episode and len(agent.memory) > BATCH_SIZE:
            print('update local')
            agent.learn_from_experience()  # returned loss is not used here
            # wandb.log({'loss':loss})

        if episode % TARGET_UPDATE_EVERY == 0:
            print('update target')
            agent.soft_update()

        # Every 100th episode: report the windowed mean score and checkpoint.
        if episode % 100 == 0:
            print(episode, np.mean(scores_window))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pt')

    return scores, num_rounds

In [11]:
# Build the agent used by the cells below.
# NOTE(review): positional argument meanings live in Agent.py, which is not
# visible here; 'not_used' and 2 presumably are a state-size placeholder and the
# number of actions -- confirm against the Agent constructor.
agent = Agent('not_used',2, LEARNING_RATE, BUFFER_SIZE, BATCH_SIZE)

In [12]:
# Show the repr of the agent's local Q-network (layer listing in the output below).
agent.qnetwork_local

QNetwork(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(16, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=294912, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=2, bias=True)
)

In [13]:
# mod = flappy()
# mod.play("random")

In [14]:

# Smoke test: create a game and advance one frame with the action array [0, 1]
# (the same array action_to_array returns for a truthy action), then display
# the returned observation.
env = game.GameState()
state, reward, done = env.frame_step(np.array([0,1]))
state

array([[[  0, 135, 147],
        [  0, 135, 147],
        [  0, 135, 147],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       [[  0, 135, 147],
        [  0, 135, 147],
        [  0, 135, 147],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       [[  0, 135, 147],
        [  0, 135, 147],
        [  0, 135, 147],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       ...,

       [[  0, 135, 147],
        [  0, 135, 147],
        [  0, 135, 147],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       [[  0, 135, 147],
        [  0, 135, 147],
        [  0, 135, 147],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       [[ 84,  56,  71],
        [ 84,  56,  71],
        [ 84,  56,  71],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]]

In [15]:
# Convert the frame to a float tensor on `device`, add a leading batch axis,
# then reverse the order of the remaining three axes with permute(0, 3, 2, 1).
single_batch = torch.tensor(state, device=device).float().unsqueeze(0).permute(0, 3, 2, 1)
single_batch

tensor([[[[  0.,   0.,   0.,  ...,   0.,   0.,  84.],
          [  0.,   0.,   0.,  ...,   0.,   0.,  84.],
          [  0.,   0.,   0.,  ...,   0.,   0.,  84.],
          ...,
          [222., 222., 222.,  ..., 222., 222., 222.],
          [222., 222., 222.,  ..., 222., 222., 222.],
          [222., 222., 222.,  ..., 222., 222., 222.]],

         [[135., 135., 135.,  ..., 135., 135.,  56.],
          [135., 135., 135.,  ..., 135., 135.,  56.],
          [135., 135., 135.,  ..., 135., 135.,  56.],
          ...,
          [216., 216., 216.,  ..., 216., 216., 216.],
          [216., 216., 216.,  ..., 216., 216., 216.],
          [216., 216., 216.,  ..., 216., 216., 216.]],

         [[147., 147., 147.,  ..., 147., 147.,  71.],
          [147., 147., 147.,  ..., 147., 147.,  71.],
          [147., 147., 147.,  ..., 147., 147.,  71.],
          ...,
          [149., 149., 149.,  ..., 149., 149., 149.],
          [149., 149., 149.,  ..., 149., 149., 149.],
          [149., 149., 149.,  ...

In [16]:
# Print the shape of the permuted batch, then run one forward pass.
# NOTE(review): the tensor passed to the network is rebuilt from `state` WITHOUT
# the permute, so it is not `single_batch` and its axis order differs from the
# shape printed here. The call succeeds in the recorded output, so QNetwork
# presumably reorders axes itself -- confirm in its forward().
print(single_batch.shape)
agent.qnetwork_local(torch.tensor(state, device=device).float().unsqueeze(0))

torch.Size([1, 3, 512, 288])


tensor([[0.0000, 0.4424]], grad_fn=<ReluBackward0>)

In [17]:
# Train for 100 episodes of at most 1000 steps each; eps starts at 1.0 and is
# lowered by 0.01 after every episode until it reaches the 0.05 floor.
train(agent, n_episodes=100, max_time_step=1000, eps_start=1.0, eps_end=0.05, eps_decay=0.01)

{'rounds': 66, 'accumulate_reward': 5.499999999999993}
update local
update target
0 5.499999999999993
{'rounds': 66, 'accumulate_reward': 5.499999999999993}
{'rounds': 66, 'accumulate_reward': 5.499999999999993}
{'rounds': 66, 'accumulate_reward': 5.499999999999993}
update local
{'rounds': 66, 'accumulate_reward': 5.499999999999993}
{'rounds': 66, 'accumulate_reward': 5.499999999999993}
{'rounds': 66, 'accumulate_reward': 5.499999999999993}
update local
{'rounds': 66, 'accumulate_reward': 5.499999999999993}
{'rounds': 66, 'accumulate_reward': 5.499999999999993}


KeyboardInterrupt: 