In [1]:
# import gym
import gymnasium as gym
import numpy as np
from PIL import Image
import cv2 as cv
from collections import deque
from typing import Optional, Iterable
import random
import torch
import wandb
import yaml
import sys, os

In [2]:
# Make the parent directory importable; the project modules imported below
# (Recorder, utils, replay_buffer, Model, Agent) are presumably located there.
sys.path.extend(['../'])

In [3]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [4]:
# Module-level environment. The render mode is fixed here, at construction,
# so that env.render() returns RGB frames as arrays.
env = gym.make(id='CartPole-v1', render_mode='rgb_array')

In [5]:
from Recorder import Recorder

## Preprocess

In [6]:
from utils import preprocess_image

In [7]:
from replay_buffer import ReplayBuffer


## Import NN Model

In [8]:
from Model import QNetwork

## Import agent

In [9]:
# --- Replay memory and optimisation (forwarded to Agent) ---
BUFFER_SIZE = 100
BATCH_SIZE = 64
GAMMA = 0.99          # discount factor
TAU = 0.05            # soft-update coefficient for the target parameters
LEARNING_RATE = 1e-2

# --- Update schedules; train() counts both of these in episodes ---
UPDATE_EVERY = 10         # learn from the replay memory every N episodes
TARGET_UPDATE_EVERY = 50  # soft-update the target network every N episodes

# --- Observation: a stack of resized rendered frames ---
FRAME_LENGTH = 3      # number of consecutive frames kept by the Recorder per state
RESIZED_WIDTH = 240
RESIZED_HEIGHT = 160

In [10]:
from Agent import Agent

# Arguments are positional; their meaning is defined by Agent.__init__,
# which is not shown in this notebook.
agent = Agent(
    FRAME_LENGTH,   # the printed network has Conv2d(3, ...), matching FRAME_LENGTH input channels
    2,              # presumably the number of discrete CartPole actions (final layer has 2 outputs)
    LEARNING_RATE,
    BUFFER_SIZE,
    BATCH_SIZE,
    GAMMA,
)

In [11]:
# Show the local Q-network's module repr (its layer summary) as the cell output.
agent.qnetwork_local

QNetwork(
  (conv1): Conv2d(3, 64, kernel_size=(5, 5), stride=(3, 3))
  (conv2): Conv2d(64, 64, kernel_size=(4, 4), stride=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=52992, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=2, bias=True)
)

In [12]:
# The API key is read from a separate YAML file rather than being written
# into the notebook; the file must contain a top-level `key` entry.
with open('../.wandb.yaml', 'r') as f:
    wandb_settings = yaml.safe_load(f)
key = wandb_settings['key']

wandb.login(key=key)
wandb.init(project="cart-pole")


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfelissi[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\user/.netrc


In [13]:
# Record every hyperparameter of this run in the wandb run config.
wandb.config.update({
    'BUFFER_SIZE': BUFFER_SIZE,
    'BATCH_SIZE': BATCH_SIZE,
    'GAMMA': GAMMA,
    'TAU': TAU,
    'LEARNING_RATE': LEARNING_RATE,
    'UPDATE_EVERY': UPDATE_EVERY,
    'TARGET_UPDATE_EVERY': TARGET_UPDATE_EVERY,
    'FRAME_LENGTH': FRAME_LENGTH,
    'RESIZED_WIDTH': RESIZED_WIDTH,
    'RESIZED_HEIGHT': RESIZED_HEIGHT,
})

In [14]:
# Have wandb track the local Q-network (log='all'), sampling every 100 steps.
wandb.watch(
    agent.qnetwork_local,
    log='all',
    log_freq=100,
)

[]

## Training

In [15]:
def train(agent: Agent, n_episodes, max_time_step, eps_start, eps_end, eps_decay):
    """Train ``agent`` on CartPole-v1 from rendered frames with a decaying epsilon.

    A state is whatever ``Recorder.numpy()`` returns after the last
    ``FRAME_LENGTH`` preprocessed frames have been appended. Each episode starts
    with ``FRAME_LENGTH`` random actions to fill the recorder, then the agent
    acts for at most ``max_time_step`` steps. Per-step and per-episode metrics
    are sent to wandb, and a checkpoint of ``agent.qnetwork_local`` is written
    to ``checkpoint.pt`` every 100 episodes.

    Args:
        agent: Agent providing ``q_value``, ``decide``, ``step``,
            ``learn_from_experience``, ``soft_update``, ``memory``,
            ``time_step`` and ``qnetwork_local``.
        n_episodes: Number of episodes to run.
        max_time_step: Upper bound on agent steps within one episode.
        eps_start: Initial epsilon.
        eps_end: Lower bound for epsilon.
        eps_decay: Factor epsilon is multiplied by after every episode.

    Returns:
        ``(scores, num_rounds)``: per-episode accumulated reward and per-episode
        number of agent steps.
    """
    scores = []
    num_rounds = []
    scores_window = deque(maxlen=100)
    eps = eps_start
    env = gym.make('CartPole-v1', render_mode='rgb_array')
    recorder = Recorder(maxlen=FRAME_LENGTH)

    def observe():
        """Render the current frame, push it onto the recorder and display it."""
        image: np.ndarray = env.render()
        recorder.append(preprocess_image(image, RESIZED_WIDTH, RESIZED_HEIGHT))
        cv.imshow('', image)
        # waitKey lets the HighGUI window process events and repaint.
        cv.waitKey(25)

    try:
        for episode in range(n_episodes):
            env.reset()
            # Fill the recorder with the first FRAME_LENGTH frames using random actions.
            for _ in range(FRAME_LENGTH):
                env.step(random.choice([0, 1]))
                observe()
            state = recorder.numpy()
            accumulate_reward = 0
            rounds = 0

            for _ in range(max_time_step):
                action_values = agent.q_value(state, eps)
                action = agent.decide(action_values, eps)

                # Gymnasium reports two distinct end-of-episode flags:
                # `terminated` (the pole fell / cart left the track) and
                # `truncated` (time limit reached). Both end the episode.
                _, reward, terminated, truncated, _ = env.step(action)
                if terminated:
                    # Penalise failure only; hitting the time limit is not a failure.
                    reward = -1
                observe()
                next_state = recorder.numpy()

                # Only a real termination is passed as `done`, exactly as before.
                agent.step(state, action, reward, next_state, terminated)

                # One log call per step keeps all per-step metrics on the same wandb step.
                step_metrics = {'action': action, 'reward': reward, 'eps': eps}
                q_values = action_values.detach().cpu().numpy().flatten()
                step_metrics.update(
                    {f'action_values[{i}]': q for i, q in enumerate(q_values)}
                )
                wandb.log(step_metrics)

                state = next_state
                accumulate_reward += reward
                rounds += 1
                if terminated or truncated:
                    break

            # Episode summary, reported however the episode ended.
            wandb.log({'rounds': rounds, 'accumulate_reward': accumulate_reward})
            print({'episode': episode, 'agent.time_step': agent.time_step, 'rounds': rounds, 'accumulate_reward': accumulate_reward})

            if episode % UPDATE_EVERY == 0 and len(agent.memory) > BATCH_SIZE:
                loss = agent.learn_from_experience()
                wandb.log({'loss': loss})
                print({'loss': loss})
            if episode % TARGET_UPDATE_EVERY == 0:
                agent.soft_update()

            scores_window.append(accumulate_reward)
            scores.append(accumulate_reward)
            num_rounds.append(rounds)
            eps = max(eps_end, eps * eps_decay)
            if episode % 100 == 0:
                print(episode, np.mean(scores_window))
                torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pt')
    finally:
        # Release the environment and the preview window even when training is
        # interrupted (e.g. KeyboardInterrupt from the notebook).
        env.close()
        cv.destroyAllWindows()
    return scores, num_rounds

In [16]:
# Run training: epsilon decays from 1.0 by a factor of 0.97 per episode down to a floor of 0.01.
scores, num_rounds = train(
    agent,
    n_episodes=1000,
    max_time_step=3000,
    eps_start=1.0,
    eps_end=0.01,
    eps_decay=0.97,
)

{'episode': 0, 'agent.time_step': 20, 'rounds': 20, 'accumulate_reward': 18.0}
0 18.0
{'episode': 1, 'agent.time_step': 36, 'rounds': 16, 'accumulate_reward': 14.0}
{'episode': 2, 'agent.time_step': 55, 'rounds': 19, 'accumulate_reward': 17.0}
{'episode': 3, 'agent.time_step': 64, 'rounds': 9, 'accumulate_reward': 7.0}
{'episode': 4, 'agent.time_step': 75, 'rounds': 11, 'accumulate_reward': 9.0}
{'episode': 5, 'agent.time_step': 85, 'rounds': 10, 'accumulate_reward': 8.0}
{'episode': 6, 'agent.time_step': 142, 'rounds': 57, 'accumulate_reward': 55.0}
{'episode': 7, 'agent.time_step': 148, 'rounds': 6, 'accumulate_reward': 4.0}
{'episode': 8, 'agent.time_step': 177, 'rounds': 29, 'accumulate_reward': 27.0}
{'episode': 9, 'agent.time_step': 193, 'rounds': 16, 'accumulate_reward': 14.0}
{'episode': 10, 'agent.time_step': 217, 'rounds': 24, 'accumulate_reward': 22.0}
(q_current, q_targets): (tensor([[0.1068],
        [0.0708],
        [0.0896],
        [0.6709],
        [0.6845],
        [

KeyboardInterrupt: 

In [None]:
# Grab a single RGB frame from the module-level environment.
# Gymnasium takes the render mode at gym.make() time (render_mode='rgb_array'
# above); render() itself accepts no `mode` argument. The environment must
# also have been reset before it can be rendered.
env.reset()
image : np.ndarray = env.render()
