In [1]:
import os
import sys
import random
from torch import nn
import torch.nn.functional as F
import collections
import torch
import numpy as np

# Record the installed PyTorch version in the notebook output.
print("TORCH VERSION:", torch.__version__)

TORCH VERSION: 2.0.1


In [6]:
# Directory of the notebook as a relative path; os.path.dirname("./") evaluates to ".".
CURRENT_PATH = os.path.dirname("./")
# Parent of the working directory, resolved to an absolute path.
PROJECT_HOME = os.path.abspath("../")
# Register the project root on the import path only once, so that re-running
# this cell does not keep appending duplicate entries.
if sys.path.count(PROJECT_HOME) == 0:
    sys.path.append(PROJECT_HOME)

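### Setting paths
- `CURRENT_PATH` : the current directory (relative path `./`)
- `PROJECT_HOME` : the parent directory (relative path `../`), resolved to an absolute path and added to `sys.path`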

In [7]:
# Directory for model files: <PROJECT_HOME>/_03_DQN/models.
MODEL_DIR = os.path.join(PROJECT_HOME, "_03_DQN", "models")
# makedirs(..., exist_ok=True) also creates a missing "_03_DQN" parent and is a
# no-op when the directory already exists, so the cell can be re-run safely
# (os.mkdir would raise FileNotFoundError if the parent were absent).
os.makedirs(MODEL_DIR, exist_ok=True)

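### Setting the model path
- `MODEL_DIR` : `<PROJECT_HOME>/_03_DQN/models` (i.e. `../_03_DQN/models`, which is `./models` when this notebook lives in `_03_DQN`); the directory is created if it does not exist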

In [8]:
# Run on the CUDA GPU when PyTorch reports one as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

### Selecting the compute device
- `DEVICE` : `cuda` if a CUDA GPU is available, otherwise `cpu`

In [9]:
class QNet(nn.Module):
    """Fully connected Q-network mapping an observation to one Q-value per action.

    Architecture: Linear(n_features, 128) -> ReLU -> Linear(128, 128) -> ReLU
    -> Linear(128, n_actions). The module moves itself to the module-level
    ``DEVICE`` on construction.

    Args:
        n_features: Size of the observation vector (default 4).
        n_actions: Number of discrete actions, i.e. size of the output (default 2).
    """

    def __init__(self, n_features=4, n_actions=2):
        super().__init__()
        self.n_features = n_features
        self.n_actions = n_actions
        self.fc1 = nn.Linear(n_features, 128)  # fully connected
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, n_actions)
        self.to(DEVICE)

    def forward(self, x):
        """Compute Q-values for ``x``.

        Args:
            x: Observation(s) whose last dimension is ``n_features``. A NumPy
                array is converted to a float32 tensor on ``DEVICE``; any other
                input is passed to the first layer unchanged.

        Returns:
            Tensor of Q-values whose last dimension is ``n_actions``.
        """
        if isinstance(x, np.ndarray):
            x = torch.tensor(x, dtype=torch.float32, device=DEVICE)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the output layer: Q-values are unbounded.
        return self.fc3(x)

    def get_action(self, obs, epsilon=0.1):
        """Choose an action for a single observation with an epsilon-greedy policy.

        Args:
            obs: A single observation (the greedy branch calls ``.item()``, so
                the argmax must reduce to exactly one element).
            epsilon: Probability of picking a uniformly random action.

        Returns:
            The selected action index as a Python int.
        """
        # random.random() returns a value in [0.0, 1.0).
        if random.random() < epsilon:
            return random.randrange(0, self.n_actions)

        # Action selection is pure inference; disabling autograd avoids building
        # a computation graph that would never be back-propagated through.
        with torch.no_grad():
            q_values = self.forward(obs)
        # argmax returns the index of the largest Q-value.
        return torch.argmax(q_values, dim=-1).item()

### Q-Net
This network is a fully connected network (FCN).
#### Layer Info
- input layer : in(4) -> out(128)
    - uses the ReLU activation function
- hidden layer : in(128) -> out(128)
    - uses the ReLU activation function
- output layer : in(128) -> out(2)
#### get_action function
This function implements the epsilon-greedy policy:
- if a random value drawn from [0, 1) is smaller than epsilon, return a random action
- otherwise, return the greedy action (the one with the largest Q-value)

In [10]:
# Record type for a single environment step; field order is
# (observation, action, next_observation, reward, done).
Transition = collections.namedtuple(
    'Transition',
    ('observation', 'action', 'next_observation', 'reward', 'done'),
)

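- `Transition` : the record type stored in the `ReplayBuffer` — one environment step (observation, action, next observation, reward, done)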

In [12]:
class ReplayBuffer:
    """Bounded store of ``Transition`` records with uniform random sampling.

    Backed by a ``collections.deque`` with ``maxlen=capacity``, so appending to
    a full buffer drops the oldest record.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.buffer = collections.deque(maxlen=capacity)

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def append(self, transition: Transition) -> None:
        """Add ``transition`` at the newest end of the buffer."""
        self.buffer.append(transition)

    def pop(self):
        """Remove and return the most recently appended transition."""
        return self.buffer.pop()

    def clear(self):
        """Remove every stored transition."""
        self.buffer.clear()

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions at random and batch them.

        Args:
            batch_size: Number of transitions to draw (without replacement).

        Returns:
            Tuple ``(observations, actions, next_observations, rewards, dones)``
            of tensors on ``DEVICE`` with dtypes float32, int64, float32,
            float32 and bool. ``actions`` and ``rewards`` gain a trailing axis
            when they stack to 1-D arrays; ``dones`` is left as stacked.
        """
        # Unique random positions into the buffer.
        picked = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        rows = [self.buffer[i] for i in picked]
        # Transpose the list of transitions into one tuple per field.
        obs_col, act_col, next_obs_col, rew_col, done_col = zip(*rows)

        # Stack into ndarrays before building tensors (the original notes this
        # as a speed-up for the tensor conversion).
        # e.g. (batch, n_features) for both observation arrays.
        obs_arr = np.array(obs_col)
        next_obs_arr = np.array(next_obs_col)

        # Scalar actions/rewards stack to shape (batch,); give them a trailing
        # axis so they become (batch, 1). dones stays as stacked, e.g. (batch,).
        act_arr = np.array(act_col)
        if act_arr.ndim == 1:
            act_arr = np.expand_dims(act_arr, axis=-1)
        rew_arr = np.array(rew_col)
        if rew_arr.ndim == 1:
            rew_arr = np.expand_dims(rew_arr, axis=-1)
        done_arr = np.array(done_col, dtype=bool)

        # Move everything to the target device with the dtype each field needs.
        return (
            torch.tensor(obs_arr, dtype=torch.float32, device=DEVICE),
            torch.tensor(act_arr, dtype=torch.int64, device=DEVICE),
            torch.tensor(next_obs_arr, dtype=torch.float32, device=DEVICE),
            torch.tensor(rew_arr, dtype=torch.float32, device=DEVICE),
            torch.tensor(done_arr, dtype=torch.bool, device=DEVICE),
        )

### ReplayBuffer
This class is used for minibatch SGD.  
The buffer stores the step (transition) information of episodes.  
When performing gradient descent, it returns a random batch of `batch_size` steps.  
If the buffer is already at capacity, the oldest entry is discarded when a new one is appended.
#### Why do we have to use minibatch SGD?  
Without minibatch SGD, training can suffer from the state correlation problem.  
Therefore, we use minibatch SGD.
#### What is minibatch SGD?
Minibatch SGD means that each update uses a batch of `batch_size` states selected at random from the stored states.


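*별첨 (Appendix)*  
state correlation : 상태가 너무 가까운 것에만 최적화 하게 됨 즉 local minimum에서 빠져 나올 수 없음  
(State correlation: the model ends up optimizing only for states that are too close to each other, i.e. it cannot escape a local minimum.)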