In [1]:
# import gym
import gymnasium as gym
import numpy as np
from PIL import Image
import cv2 as cv
from collections import deque
from typing import Optional, Iterable
import random
import torch
import wandb
import yaml


In [2]:
device = torch.device('cuda'if torch.cuda.is_available() else 'cpu')


In [3]:
env = gym.make('CartPole-v1', render_mode='rgb_array')

In [4]:
class Recorder(deque):
    def __init__(self, iterable: Optional[Iterable]=None, maxlen: Optional[int]=None):
        if iterable is None:
            super(Recorder, self).__init__(maxlen=maxlen)
        else:
            super(Recorder, self).__init__(iterable, maxlen=maxlen)

    def tensor(self):
        """ Give [B, C, H, W] tensor for images 
        e.g. [H, W]x4 -> [4, H, W]
        """
        return torch.stack(tuple(torch.tensor(x, device=device) for x in self)).to(device)
    
    def numpy(self)->np.ndarray:
        return self.tensor().cpu().numpy()

## Preprocess

In [5]:

WIDTH  = 240
HEIGHT = 160
def preprocess_image(image: np.ndarray, width=WIDTH, height=HEIGHT):
    _, gray = cv.threshold(cv.cvtColor(image, cv.COLOR_BGR2GRAY), 200, 255,cv.THRESH_BINARY)
    # gray:np.ndarray = gray[(gray.shape[0]-WIDTH)//2:(gray.shape[0]-WIDTH)//2+WIDTH,(gray.shape[1]-HEIGHT)//2:(gray.shape[1]-HEIGHT)//2+HEIGHT]
    gray = cv.resize(gray, (width, height))
    return gray

In [6]:
env.reset()
recorder = Recorder(maxlen=4)
for _ in range(4):
    env.step(random.choice([0,1]))
    image : np.ndarray = env.render()
    recorder.append(preprocess_image(image))
i:np.ndarray = env.render()

In [7]:
preprocess_image(i).shape

(160, 240)

In [8]:
from replay_buffer import ReplayBuffer
replay = ReplayBuffer(buffer_size=100, batch_size=32)
for _ in range(100):
    replay.add(recorder.numpy(), 0, 0, recorder.numpy(), True)

In [9]:
recorder.tensor().shape

torch.Size([4, 160, 240])

In [10]:
replay.sample().state.shape

torch.Size([32, 4, 160, 240])

In [11]:
i.shape

(400, 600, 3)

## Import NN Model

In [12]:
from Model import QNetwork
model = QNetwork(action_size=2).to(device)
model(replay.sample().state)

tensor([[-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012],
        [-0.1865,  0.0012]], device='cuda:0', grad_fn=<AddmmBackward0>)

# Import agent

In [13]:
BUFFER_SIZE         = 100
BATCH_SIZE          = 32
GAMMA               = 0.99  # discount factor
TAU                 = 0.1   # soft update of target parameter
LEARNING_RATE       = 1e-2
UPDATE_EVERY        = 2    # how often to update the local
TARGET_UPDATE_EVERY = 8    # how often to update the target

In [14]:
from Agent import Agent
agent = Agent('not_used', 2, LEARNING_RATE, BUFFER_SIZE, BATCH_SIZE)

In [15]:
agent.qnetwork_local

QNetwork(
  (conv1): Conv2d(4, 64, kernel_size=(5, 5), stride=(3, 3))
  (conv2): Conv2d(64, 64, kernel_size=(4, 4), stride=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=52992, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=2, bias=True)
)

In [16]:
agent.act(replay.memory[0].state)

0

In [17]:
agent.qnetwork_local(replay.sample().state)

tensor([[ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627],
        [ 0.2144, -1.3627]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [18]:
agent.learn(replay.sample())

tensor(0.0460, device='cuda:0', grad_fn=<MseLossBackward0>)

In [19]:
agent.step(*replay.memory[0])

In [20]:
agent.q_value(replay.memory[0].state, 0)

tensor([[-279739.9062,  -24406.8633]], device='cuda:0')

In [21]:
with open('../.wandb.yaml', 'r') as f:
    key = yaml.safe_load(f.read())['key']
wandb.login(key=key)
wandb.init(project="cart-pole")


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfelissi[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\user/.netrc


In [22]:
wandb.watch(agent.qnetwork_local)

[]

## Training

In [23]:
def train(agent: Agent, n_episodes, max_time_step, eps_start, eps_end, eps_decay):
    scores = []
    num_rounds = []
    scores_window = deque(maxlen=100)
    eps = eps_start
    env = gym.make('CartPole-v1',render_mode='rgb_array')
    recorder = Recorder(maxlen=4)

    for episode in range(n_episodes):
        env.reset()
        # Get first 4 frames
        for _ in range(4):
            env.step(random.choice([0,1]))
            image : np.ndarray = env.render()
            recorder.append(preprocess_image(image))
            cv.waitKey(25)
            cv.imshow('',image)
        state = recorder.numpy()
        accumulate_reward = 0
        rounds = 0
        for time_step in range(max_time_step):
            cv.waitKey(25)
            cv.imshow('',image)
            action_values = agent.q_value(state, eps)
            action = agent.decide(action_values, eps)
            
            _, reward, done, _ , _ = env.step(action)
            if done:
                reward = -1
            image : np.ndarray = env.render()
            recorder.append(preprocess_image(image))
            next_state = recorder.numpy()


            agent.step(state, action, reward, next_state, done)
            """ === this step has finished === """
            wandb.log({'action':action, 'reward': reward, 'eps': eps})
            wandb.log({f'action_values[{i}]':q for i, q in enumerate(action_values.cpu().numpy().flatten()) })
            """ === next iteration === """
            state = next_state
            accumulate_reward += reward
            rounds += 1
            if done:
                wandb.log({'rounds':rounds,'accumulate_reward':accumulate_reward})
                print({'rounds':rounds,'accumulate_reward':accumulate_reward})
                break
        if episode % UPDATE_EVERY == 0 and len(agent.memory) > BATCH_SIZE:
            loss = agent.learn_from_experience()
            wandb.log({'loss':loss})
            print({'loss':loss})
        if episode % TARGET_UPDATE_EVERY == 0:
            agent.soft_update()
        scores_window.append(accumulate_reward)
        scores.append(accumulate_reward)
        num_rounds.append(rounds)
        eps = max(eps_end, eps*eps_decay)
        if episode % 100 == 0:
            print(episode, np.mean(scores_window))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pt')
    return scores, num_rounds

In [24]:
scores, num_rounds = train(agent, n_episodes=1000, max_time_step=3000, eps_start=1.0, eps_end=0.6, eps_decay=0.99)

{'rounds': 8, 'accumulate_reward': 6.0}
0 6.0
{'rounds': 27, 'accumulate_reward': 25.0}
{'rounds': 18, 'accumulate_reward': 16.0}
{'loss': tensor(3.9435e+10, device='cuda:0', grad_fn=<MseLossBackward0>)}
{'rounds': 16, 'accumulate_reward': 14.0}
{'rounds': 9, 'accumulate_reward': 7.0}
{'loss': tensor(1737441.2500, device='cuda:0', grad_fn=<MseLossBackward0>)}
{'rounds': 11, 'accumulate_reward': 9.0}
{'rounds': 12, 'accumulate_reward': 10.0}
{'loss': tensor(1356339.7500, device='cuda:0', grad_fn=<MseLossBackward0>)}
{'rounds': 12, 'accumulate_reward': 10.0}
{'rounds': 21, 'accumulate_reward': 19.0}
{'loss': tensor(24842.7461, device='cuda:0', grad_fn=<MseLossBackward0>)}
{'rounds': 8, 'accumulate_reward': 6.0}
{'rounds': 23, 'accumulate_reward': 21.0}
{'loss': tensor(60.8660, device='cuda:0', grad_fn=<MseLossBackward0>)}
{'rounds': 26, 'accumulate_reward': 24.0}
{'rounds': 19, 'accumulate_reward': 17.0}
{'loss': tensor(1.7943, device='cuda:0', grad_fn=<MseLossBackward0>)}
{'rounds': 8, 

KeyboardInterrupt: 

In [None]:
image : np.ndarray = env.render(mode='rgb_array')
