In [1]:
import numpy as np
import sys
import os
import random
import time
# import imageio
# import cv2
from PIL import Image
import torch
from collections import deque
from typing import Optional
import wandb


In [2]:
# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Put the local "game/" directory on the import path, then load the Flappy
# Bird environment module under the short alias `game`.
# NOTE(review): presumably the extra path entry lets modules inside game/
# import their siblings — confirm. The path is relative, so it only resolves
# when the kernel's working directory is the one that contains game/.
sys.path.append("game/")
from game import wrapped_flappy_bird as game

# sys.path.append(os.path.join(os.path.dirname(__file__), "../utils"))

pygame 2.1.3 (SDL 2.0.22, Python 3.10.9)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [4]:
from getpass import getpass

# Resolve the W&B API key without ever echoing it into the notebook output:
#   1. reuse `key` if an earlier run of this cell left one in the kernel,
#   2. otherwise read the WANDB_API_KEY environment variable,
#   3. otherwise prompt with getpass, which hides the typed characters.
try:
    key = key or os.environ.get("WANDB_API_KEY")
except NameError:
    # First run in a fresh kernel: `key` does not exist yet.
    key = os.environ.get("WANDB_API_KEY")
if not key:
    key = getpass('type your wandb api key: ')

In [5]:
# Authenticate this session with Weights & Biases using the `key` resolved in the previous cell.
wandb.login(key=key)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfelissi[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\user/.netrc


True

In [6]:
# Start a W&B run under the "flappy_bird_dqn" project; the wandb.log calls in train() report to it.
wandb.init(project="flappy_bird_dqn")


In [7]:
# Frame geometry constants.
# NOTE(review): none of these are referenced in the visible cells, and the
# frames returned by the game in this notebook have shape (288, 512, 3) —
# confirm whether they are still needed or are leftovers.
IMG_WIDTH  = 80
IMG_HEIGHT = 80
IMG_DEPTH  = 4

In [8]:
# DQN hyperparameters.
BUFFER_SIZE         = 200   # passed to Agent (presumably the replay-memory capacity — confirm)
BATCH_SIZE          = 100   # passed to Agent; train() also waits for len(agent.memory) > BATCH_SIZE before learning
GAMMA               = 0.98  # discount factor (NOTE(review): not referenced in the visible cells)
TAU                 = 1e-3  # soft-update rate for the target parameters (NOTE(review): not referenced in the visible cells)
LEARNING_RATE       = 0.01  # passed to Agent; matches the lr reported by agent.optimizer
UPDATE_EVERY        = 2     # train() runs a learning update on the local network every this many episodes
TARGET_UPDATE_EVERY = 20     # train() calls agent.soft_update() every this many episodes

In [9]:
from Agent import Agent

In [10]:
# Build the DQN agent. Positional arguments: a placeholder string, 2, then the
# learning rate, buffer size and batch size defined above.
# NOTE(review): 2 is presumably the number of actions (the game takes a
# two-element action array) — confirm against Agent.__init__.
agent = Agent('not_used',2, LEARNING_RATE, BUFFER_SIZE, BATCH_SIZE)

In [11]:
# Ask W&B to track the local Q-network, with a logging frequency of 10.
wandb.watch(agent.qnetwork_local,log_freq=10)


[]

In [12]:
def action_to_array(action: int):
    """Encode an action as the two-element array handed to `frame_step`.

    A falsy action yields [1, 0]; any truthy action yields [0, 1].
    """
    one_hot = [0, 1] if action else [1, 0]
    return np.array(one_hot)

In [13]:
def train(agent: Agent, n_episodes, max_time_step, eps_start, eps_end, eps_decay):
    """Run epsilon-greedy training episodes and report progress to W&B.

    Every episode creates a fresh game, lets the agent act for at most
    `max_time_step` frames and hands each transition to `agent.step`.
    After each episode epsilon is lowered linearly, the local network learns
    every `UPDATE_EVERY` episodes (once `agent.memory` holds more than
    `BATCH_SIZE` entries) and `agent.soft_update()` is called every
    `TARGET_UPDATE_EVERY` episodes.

    Args:
        agent: Agent providing `q_value`, `decide`, `step`,
            `learn_from_experience`, `soft_update`, `memory` and
            `qnetwork_local`.
        n_episodes: Number of episodes to play.
        max_time_step: Maximum number of frames per episode.
        eps_start: Epsilon used for the first episode.
        eps_end: Lower bound for epsilon.
        eps_decay: Amount subtracted from epsilon after every episode.

    Returns:
        A `(scores, num_rounds)` tuple of lists with one entry per episode:
        the accumulated reward and the number of frames played.

    Side effects:
        Logs metrics through `wandb.log` and writes the local network's
        state dict to 'checkpoint.pt' every 100 episodes and after the
        final episode.
    """
    checkpoint_every = 100  # episodes between periodic weight snapshots
    scores = []
    num_rounds = []
    eps = eps_start
    for episode in range(n_episodes):
        env = game.GameState()
        # Prime the environment with one [0, 1] action to obtain the first frame.
        state, _, _ = env.frame_step(np.array([0,1]))
        accumulate_reward = 0
        rounds = 0
        for _ in range(max_time_step):
            action_values = agent.q_value(state, eps)
            action = agent.decide(action_values, eps)
            next_state, reward, done = env.frame_step(action_to_array(action))
            agent.step(state, action, reward, next_state, done)

            # Log everything for this frame in a single call: every wandb.log
            # call advances the W&B step, so two calls per frame would put the
            # Q-values on a different step than the action/reward they belong to.
            step_metrics = {'action':action, 'reward': reward, 'eps': eps}
            # detach() keeps the conversion valid even if the tensor tracks gradients.
            q_values = action_values.detach().cpu().numpy().flatten()
            step_metrics.update({f'action_values[{i}]':q for i, q in enumerate(q_values)})
            wandb.log(step_metrics)

            # Advance to the next frame.
            state = next_state
            accumulate_reward += reward
            rounds += 1
            if done:
                break

        # Report the episode summary whether the game ended or the frame
        # budget ran out; logging only on `done` dropped truncated episodes.
        wandb.log({'rounds':rounds,'accumulate_reward':accumulate_reward})
        scores.append(accumulate_reward)
        num_rounds.append(rounds)

        # Linear epsilon decay, clamped at eps_end.
        eps = max(eps_end, eps-eps_decay)

        if episode % UPDATE_EVERY == 0 and len(agent.memory) > BATCH_SIZE:
            loss = agent.learn_from_experience()
            wandb.log({'loss':loss})
        if episode % TARGET_UPDATE_EVERY == 0:
            agent.soft_update()

        # Also checkpoint after the last episode: with only the periodic rule,
        # a run of 100 episodes would save nothing but the episode-0 weights.
        is_last_episode = episode == n_episodes - 1
        if episode % checkpoint_every == 0 or is_last_episode:
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pt')
    return scores, num_rounds

In [14]:
# Display the layer layout of the agent's local Q-network.
agent.qnetwork_local

QNetwork(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(16, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=294912, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=2, bias=True)
)

In [15]:
# mod = flappy()
# mod.play("random")

In [16]:

# Create a standalone game instance and take one step with the [0, 1] action
# to obtain a sample frame for inspection.
# NOTE(review): the bare `state` below prints the whole frame array;
# `state.shape` or `state.dtype` would be a lighter check.
env = game.GameState()
state, reward, done = env.frame_step(np.array([0,1]))
state

array([[[  0, 135, 147],
        [  0, 135, 147],
        [  0, 135, 147],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       [[  0, 135, 147],
        [  0, 135, 147],
        [  0, 135, 147],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       [[  0, 135, 147],
        [  0, 135, 147],
        [  0, 135, 147],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       ...,

       [[ 84,  56,  71],
        [ 84,  56,  71],
        [ 84,  56,  71],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       [[ 84,  56,  71],
        [ 84,  56,  71],
        [ 84,  56,  71],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       [[ 85, 128,  34],
        [ 85, 128,  34],
        [ 85, 128,  34],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]]

In [17]:
# Dimensions of the raw frame returned by frame_step.
state.shape

(288, 512, 3)

In [18]:
# Turn the sample frame into a float batch of one with the frame axes reversed.
single_batch = (
    torch.tensor(state, device=device)
    .float()
    .unsqueeze(0)          # add a leading batch axis
    .permute(0, 3, 2, 1)   # (1, 288, 512, 3) -> (1, 3, 512, 288)
)
single_batch

tensor([[[[  0.,   0.,   0.,  ...,  84.,  84.,  85.],
          [  0.,   0.,   0.,  ...,  84.,  84.,  85.],
          [  0.,   0.,   0.,  ...,  84.,  84.,  85.],
          ...,
          [222., 222., 222.,  ..., 222., 222., 222.],
          [222., 222., 222.,  ..., 222., 222., 222.],
          [222., 222., 222.,  ..., 222., 222., 222.]],

         [[135., 135., 135.,  ...,  56.,  56., 128.],
          [135., 135., 135.,  ...,  56.,  56., 128.],
          [135., 135., 135.,  ...,  56.,  56., 128.],
          ...,
          [216., 216., 216.,  ..., 216., 216., 216.],
          [216., 216., 216.,  ..., 216., 216., 216.],
          [216., 216., 216.,  ..., 216., 216., 216.]],

         [[147., 147., 147.,  ...,  71.,  71.,  34.],
          [147., 147., 147.,  ...,  71.,  71.,  34.],
          [147., 147., 147.,  ...,  71.,  71.,  34.],
          ...,
          [149., 149., 149.,  ..., 149., 149., 149.],
          [149., 149., 149.,  ..., 149., 149., 149.],
          [149., 149., 149.,  ...

In [19]:
# Print the shape of the permuted batch, then run the local Q-network on the sample frame.
# NOTE(review): the network is fed a freshly built, un-permuted tensor
# (batch axis added, channels still last) rather than `single_batch` —
# presumably QNetwork.forward rearranges the axes itself; confirm.
print(single_batch.shape)
agent.qnetwork_local(torch.tensor(state, device=device).float().unsqueeze(0))

torch.Size([1, 3, 512, 288])


tensor([[1.2414, 0.1160]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [21]:
# Display the optimizer configuration held by the agent.
agent.optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: False
    lr: 0.01
    maximize: False
    weight_decay: 0
)

In [20]:
# Train for 100 episodes of at most 1000 frames each. Epsilon starts at 1.0 and
# drops by 0.01 per episode down to a floor of 0.05.
# NOTE(review): the returned (scores, num_rounds) tuple is not assigned, so the
# notebook echoes both lists in full.
train(agent, n_episodes=100, max_time_step=1000, eps_start=1.0, eps_end=0.05, eps_decay=0.01)

([2.800000000000002,
  3.0000000000000018,
  2.900000000000002,
  2.800000000000002,
  5.299999999999997,
  2.800000000000002,
  6.5999999999999925,
  6.5999999999999925,
  2.900000000000002,
  3.5,
  3.6999999999999993,
  2.800000000000002,
  2.800000000000002,
  6.5999999999999925,
  2.800000000000002,
  3.200000000000001,
  5.299999999999997,
  6.5999999999999925,
  2.900000000000002,
  2.800000000000002,
  3.1000000000000014,
  2.800000000000002,
  6.8999999999999915,
  2.800000000000002,
  4.899999999999999,
  2.800000000000002,
  2.900000000000002,
  2.800000000000002,
  2.900000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  2.800000000000002,
  1.1000000000000005,
  2.800000000000002,
  2.700000000000002,
  2.800000000000002,
  2.80000000000000