In [23]:
import gym
import numpy as np
from PIL import Image
import cv2 as cv
from collections import deque
from typing import Optional, Iterable
import random
import torch



In [24]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [25]:
# Notebook-level CartPole environment used by the exploratory cells below.
env = gym.make('CartPole-v1')

In [26]:
class Recorder(deque):
    """Bounded history of frames that can be exported as one stacked array.

    Behaves like ``collections.deque``: once ``maxlen`` items are stored,
    appending a new one drops the oldest.  The extra ``tensor``/``numpy``
    helpers stack the stored items along a new leading axis.
    """

    def __init__(self, iterable: Optional[Iterable]=None, maxlen: Optional[int]=None):
        """Create the history, optionally pre-filled from ``iterable``.

        Args:
            iterable: initial items; ``None`` starts with an empty history.
            maxlen: maximum number of items kept; ``None`` means unbounded.
        """
        super().__init__(() if iterable is None else iterable, maxlen=maxlen)

    def tensor(self, target_device: Optional[torch.device] = None) -> torch.Tensor:
        """Stack the stored items into a single tensor.

        N items of shape ``[H, W]`` become one ``[N, H, W]`` tensor
        (e.g. ``[H, W] x 4 -> [4, H, W]``); no batch axis is added.

        Args:
            target_device: device to build the tensor on.  When omitted, the
                notebook-level ``device`` is used.

        Raises:
            RuntimeError: from ``torch.stack`` when the history is empty.
        """
        # The global is only looked up when no explicit device was requested.
        target = device if target_device is None else target_device
        return torch.stack(tuple(torch.tensor(x, device=target) for x in self))

    def numpy(self) -> np.ndarray:
        """Return the stacked history as a NumPy array.

        The stack is always built on the CPU: ``Tensor.numpy()`` is only
        available for CPU tensors, so building it on a CUDA ``device`` first
        would raise.
        """
        return self.tensor(target_device=torch.device('cpu')).numpy()

## Preprocess

In [27]:

WIDTH  = 240
HEIGHT = 160
def preprocess_image(image: np.ndarray, width=WIDTH, height=HEIGHT):
    """Binarise a rendered RGB frame and resize it to ``(height, width)``.

    Args:
        image: colour frame in RGB channel order, as returned by
            ``env.render(mode='rgb_array')``.
        width: output width in pixels (columns).
        height: output height in pixels (rows).

    Returns:
        A single-channel array of shape ``(height, width)``.
    """
    # The frames fed to this function come from gym's 'rgb_array' renderer,
    # which is RGB, so the RGB->gray conversion is the matching one.
    gray = cv.cvtColor(image, cv.COLOR_RGB2GRAY)
    # Pixels brighter than 200 become 255, everything else 0.
    _, binary = cv.threshold(gray, 200, 255, cv.THRESH_BINARY)
    # cv.resize takes the target size as (width, height).
    return cv.resize(binary, (width, height))

In [28]:
# Prime a four-frame history by taking random actions from a fresh episode.
recorder = Recorder(maxlen=4)
env.reset()
for _ in range(recorder.maxlen):
    env.step(random.choice((0, 1)))
    image: np.ndarray = env.render(mode='rgb_array')
    recorder.append(preprocess_image(image))
# Keep one raw (unprocessed) frame around for the shape checks that follow.
i: np.ndarray = env.render(mode='rgb_array')

In [29]:
# Sanity check: a preprocessed frame is single-channel with shape (HEIGHT, WIDTH).
preprocess_image(i).shape

(160, 240)

In [30]:
from replay_buffer import ReplayBuffer
replay = ReplayBuffer(buffer_size=100, batch_size=32)
# Fill the buffer with 100 identical dummy entries built from the current frame stack.
# Positional order is presumably (state, action, reward, next_state, done) — confirm against ReplayBuffer.add.
for _ in range(100):
    replay.add(recorder.numpy(), 0, 0, recorder.numpy(), True)

In [31]:
# Sanity check: the stacked history has one leading entry per recorded frame.
recorder.tensor().shape

torch.Size([4, 160, 240])

In [32]:
# Sanity check: a sampled batch of states carries a leading batch axis.
replay.sample().state.shape

torch.Size([32, 4, 160, 240])

In [33]:
# Shape of the raw rendered frame before preprocessing.
i.shape

(400, 600, 3)

## Import NN Model

In [34]:
from Model import QNetwork
# Smoke test: a fresh network with two outputs (one per CartPole action) run on a sampled batch.
model = QNetwork(action_size=2)
# The buffer was filled with copies of one frame stack, so every sampled row is the same input.
model(replay.sample().state)

tensor([[ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407],
        [ 0.1227, -0.3407]], grad_fn=<AddmmBackward0>)

## Import Agent

In [35]:
# Hyper-parameters shared by the agent construction and the training loop.
BUFFER_SIZE = 100  # replay buffer size handed to Agent(...)
BATCH_SIZE = 32  # batch size handed to Agent(...); train() also compares len(agent.memory) against it
GAMMA = 0.99  # discount factor (not referenced elsewhere in this notebook)
TAU = 1e-3  # soft-update rate for the target parameters (not referenced elsewhere in this notebook)
LEARNING_RATE = 1e-2  # learning rate handed to Agent(...)
UPDATE_EVERY = 10  # train(): episodes between agent.learn_from_experience() calls
TARGET_UPDATE_EVERY = 50  # train(): episodes between agent.soft_update() calls

In [36]:
from Agent import Agent
# The first argument is a placeholder string; the rest are presumably
# (action count, learning rate, buffer size, batch size) — confirm against Agent.__init__.
agent = Agent('not_used', 2, LEARNING_RATE, BUFFER_SIZE, BATCH_SIZE)

In [37]:
# Display the layer layout of the agent's local Q-network.
agent.qnetwork_local

QNetwork(
  (conv1): Conv2d(4, 64, kernel_size=(5, 5), stride=(3, 3))
  (conv2): Conv2d(64, 64, kernel_size=(4, 4), stride=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=52992, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=2, bias=True)
)

In [38]:
# Ask the agent for an action given the state of the first stored buffer entry.
agent.act(replay.memory[0].state)

1

In [39]:
# Forward one sampled batch of states through the agent's local Q-network.
agent.qnetwork_local(replay.sample().state)

tensor([[0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075],
        [0.1505, 0.4075]], grad_fn=<AddmmBackward0>)

In [40]:
# Run a single learn() call on one sampled batch; the cell output is its return value.
agent.learn(replay.sample())

tensor(0.0227, grad_fn=<MseLossBackward0>)

In [41]:
# Pass the first stored buffer entry to agent.step, unpacking its fields positionally.
agent.step(*replay.memory[0])

In [42]:
# Inspect the Q-values for a stored state; the second argument is 0 here, whereas train() passes eps in that position.
# NOTE(review): the captured output is on the order of 1e5 after one learn() call and one step() call,
# which looks like the update diverged. Frames are fed in as raw 0/255 values and
# LEARNING_RATE is 1e-2 — check input scaling and the learning rate.
agent.q_value(replay.memory[0].state, 0)

tensor([[-183329.9062,   89266.1094]])

## Training

In [68]:
def train(agent: Agent, n_episodes, max_time_step, eps_start, eps_end, eps_decay, show_frames: bool = True):
    """Train ``agent`` on CartPole-v1 using stacked, preprocessed screen frames.

    Every episode begins with four random-action steps whose rendered frames
    fill a 4-frame history; the stacked history is the state given to the
    agent.  Each subsequent step appends the newest frame (dropping the
    oldest) to form the next state.

    Args:
        agent: the agent to train; its ``q_value``, ``decide``, ``step``,
            ``learn_from_experience``, ``soft_update``, ``memory`` and
            ``qnetwork_local`` members are used.
        n_episodes: number of episodes to run.
        max_time_step: maximum number of agent-controlled steps per episode.
        eps_start: initial epsilon handed to ``agent.q_value``/``agent.decide``.
        eps_end: lower bound for epsilon.
        eps_decay: factor epsilon is multiplied by after every episode.
        show_frames: when True (the default) every rendered frame is shown in
            an OpenCV window, waiting up to 25 ms per step.  Pass False for
            headless runs.

    Returns:
        ``(scores, num_rounds)``: per-episode accumulated reward and
        per-episode count of agent-controlled steps.

    Side effects:
        Prints progress, and writes ``checkpoint.pt`` (the local network's
        state dict) on every 100th episode, starting with episode 0.
    """
    scores = []
    num_rounds = []
    scores_window = deque(maxlen=100)
    eps = eps_start
    env = gym.make('CartPole-v1')
    recorder = Recorder(maxlen=4)

    try:
        for episode in range(n_episodes):
            env.reset()
            # Refill the whole 4-frame history so no frame from the previous
            # episode leaks into the first state of this one.
            for _ in range(4):
                env.step(random.choice([0,1]))
                image : np.ndarray = env.render(mode='rgb_array')
                recorder.append(preprocess_image(image))
            state = recorder.numpy()
            accumulate_reward = 0
            rounds = 0
            for _time_step in range(max_time_step):
                action_values = agent.q_value(state, eps)
                action = agent.decide(action_values, eps)

                _, reward, done, _ = env.step(action)
                image = env.render(mode='rgb_array')
                recorder.append(preprocess_image(image))
                next_state = recorder.numpy()

                if show_frames:
                    # imshow expects BGR while the renderer yields RGB; the
                    # window is only repainted by the waitKey that follows.
                    cv.imshow('', cv.cvtColor(image, cv.COLOR_RGB2BGR))
                    cv.waitKey(25)

                agent.step(state, action, reward, next_state, done)

                # Advance to the next iteration.
                state = next_state
                accumulate_reward += reward
                rounds += 1
                if done:
                    print({'rounds':rounds,'accumulate_reward':accumulate_reward})
                    break
            # Learning and target updates are scheduled per episode, not per step.
            if episode % UPDATE_EVERY == 0 and len(agent.memory) > BATCH_SIZE:
                loss = agent.learn_from_experience()
                print({'loss':loss})
            if episode % TARGET_UPDATE_EVERY == 0:
                agent.soft_update()
            scores_window.append(accumulate_reward)
            scores.append(accumulate_reward)
            num_rounds.append(rounds)
            eps = max(eps_end, eps*eps_decay)
            if episode % 100 == 0:
                print(episode, np.mean(scores_window))
                torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pt')
    finally:
        # Release the environment and the preview window even when training
        # is interrupted or raises.
        env.close()
        if show_frames:
            cv.destroyAllWindows()
    return scores, num_rounds

In [66]:
# 1000 episodes of at most 3000 steps; epsilon starts at 1.0 and shrinks by 1% per episode down to 0.01.
scores, num_rounds = train(agent, n_episodes=1000, max_time_step=3000, eps_start=1.0, eps_end=0.01, eps_decay=0.99)

{'rounds': 8, 'accumulate_reward': 8.0}
{'loss': tensor(0.0714, grad_fn=<MseLossBackward0>)}
0 8.0
{'rounds': 12, 'accumulate_reward': 12.0}
{'rounds': 44, 'accumulate_reward': 44.0}
{'rounds': 12, 'accumulate_reward': 12.0}
{'rounds': 16, 'accumulate_reward': 16.0}
{'rounds': 33, 'accumulate_reward': 33.0}
{'rounds': 18, 'accumulate_reward': 18.0}
{'rounds': 15, 'accumulate_reward': 15.0}
{'rounds': 11, 'accumulate_reward': 11.0}
{'rounds': 53, 'accumulate_reward': 53.0}
{'rounds': 27, 'accumulate_reward': 27.0}
{'loss': tensor(0.0401, grad_fn=<MseLossBackward0>)}
{'rounds': 21, 'accumulate_reward': 21.0}
{'rounds': 18, 'accumulate_reward': 18.0}
{'rounds': 19, 'accumulate_reward': 19.0}
{'rounds': 14, 'accumulate_reward': 14.0}
{'rounds': 8, 'accumulate_reward': 8.0}
{'rounds': 12, 'accumulate_reward': 12.0}
{'rounds': 25, 'accumulate_reward': 25.0}
{'rounds': 10, 'accumulate_reward': 10.0}
{'rounds': 13, 'accumulate_reward': 13.0}
{'rounds': 31, 'accumulate_reward': 31.0}
{'loss': t

In [67]:
# Render a frame from the notebook-level env (train() creates and uses its own separate env).
image : np.ndarray = env.render(mode='rgb_array')
