In [93]:
import gym

# Build the CartPole environment and reset it with a fixed seed.
# reset() is the last expression of the cell, so its return value is what the
# notebook displays (the recorded output is a 4-element float32 observation).
env_name = "CartPole-v0"
env = gym.make(env_name)
env.reset(seed=1)

array([ 0.00118216,  0.04504637, -0.03558404,  0.04486495], dtype=float32)

In [129]:
class SumTree:
    """Binary sum tree stored in a flat array, used for priority-proportional sampling.

    ``tree`` holds ``2 * capacity - 1`` nodes: indices ``0 .. capacity - 2`` are
    internal nodes whose value is the sum of their two children, and indices
    ``capacity - 1 .. 2 * capacity - 2`` are leaves holding one priority each.
    ``data[i]`` is the payload that belongs to leaf ``i + capacity - 1``.
    Payloads are written in ring-buffer order: once ``capacity`` items have been
    added, the oldest slot is overwritten.
    """

    def __init__(self, capacity):
        """Create an empty tree with room for ``capacity`` payloads.

        Raises:
            ValueError: if ``capacity`` is smaller than 1.
        """
        if capacity < 1:
            raise ValueError("capacity must be a positive integer")
        self.capacity = capacity  # number of leaves
        self.tree = np.zeros(2 * capacity - 1)  # priorities (leaves) and partial sums (internal nodes)
        self.data = np.zeros(capacity, dtype=object)  # payloads; slots hold 0 until first written
        self.n_entries = 0  # number of slots written so far, capped at capacity
        self.data_pointer = 0  # next slot to write; per-instance state

    def add(self, priority, data):
        """Store ``data`` with ``priority`` in the next slot, overwriting the oldest when full."""
        tree_index = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        self.update(tree_index, priority)
        # Advance the write position and wrap around at the end of the buffer.
        self.data_pointer += 1
        if self.data_pointer >= self.capacity:
            self.data_pointer = 0
        if self.n_entries < self.capacity:
            self.n_entries += 1

    def _propagate(self, idx, change):
        """Add ``change`` to every ancestor of node ``idx`` up to and including the root.

        Written as a loop so that ``idx == 0`` (the root, which is also the only
        leaf when ``capacity == 1``) is a no-op instead of recursing forever.
        """
        while idx != 0:
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def update(self, tree_index, priority):
        """Set the priority of leaf ``tree_index`` and refresh the sums above it."""
        change = priority - self.tree[tree_index]
        self.tree[tree_index] = priority
        self._propagate(tree_index, change)

    def _retrieve(self, idx, s):
        """Descend from node ``idx`` to the leaf whose cumulative-priority range contains ``s``.

        A child whose stored sum is exactly zero has no priority mass (for
        example a region of leaves that was never written), so the walk never
        enters it while the sibling is non-zero. This keeps a value that
        overshoots the total by a rounding error, or a value of exactly 0,
        from landing on an unwritten slot.
        """
        tree_size = len(self.tree)
        while True:
            left = 2 * idx + 1
            if left >= tree_size:
                # No children: idx is a leaf.
                return idx
            right = left + 1
            left_sum = self.tree[left]
            right_sum = self.tree[right]
            if right_sum == 0 or (left_sum > 0 and s <= left_sum):
                idx = left
            else:
                # Skip the mass covered by the left subtree.
                s = s - left_sum
                idx = right

    def get_leaf(self, s):
        """Return ``(leaf_index, priority, data)`` for the cumulative priority value ``s``."""
        leaf_index = self._retrieve(0, s)
        data_index = leaf_index - self.capacity + 1
        return (leaf_index, self.tree[leaf_index], self.data[data_index])

    def total_priority(self):
        """Return the value stored at the root, i.e. the sum of all leaf priorities."""
        return self.tree[0]

class PrioritizedReplayBuffer(object):
    """Replay buffer that samples transitions in proportion to their priority.

    Priorities are kept in a ``SumTree``; ``sample`` performs stratified
    sampling over the total priority mass and returns importance-sampling
    weights normalised by the largest weight in the batch.
    """

    PER_e = 0.001  # added to every error so that no transition gets zero probability
    PER_a = 0.6  # exponent trading off prioritised (1) against uniform (0) sampling
    PER_b = 0.4  # importance-sampling exponent, annealed towards 1 by sample()
    PER_b_increment_per_sampling = 0.001  # amount added to PER_b on every sample() call

    def __init__(self, capacity):
        """Create a buffer backed by a ``SumTree`` with ``capacity`` slots."""
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _getPriority(self, error):
        """Convert a TD error into a priority: ``(|error| + PER_e) ** PER_a``.

        The absolute value keeps a negative error from producing NaN under the
        fractional exponent; non-negative errors give the same result as before.
        """
        return (abs(error) + self.PER_e) ** self.PER_a

    def store(self, error, sample):
        """Add ``sample`` with a priority derived from ``error``."""
        priority = self._getPriority(error)
        self.tree.add(priority, sample)

    def sample(self, n):
        """Draw ``n`` transitions by stratified priority sampling.

        The total priority is split into ``n`` equal segments and one value is
        drawn uniformly from each. Every call also moves ``PER_b`` one
        increment closer to 1.

        Returns:
            ``(minibatch, idxs, is_weight)``: the sampled payloads, their tree
            leaf indices (to be passed to ``batch_update``), and a NumPy array
            of importance-sampling weights whose maximum is 1.

        Raises:
            ValueError: if ``n`` is not positive or the buffer holds no priority mass.
        """
        if n <= 0:
            raise ValueError("n must be a positive integer")
        total = self.tree.total_priority()
        if self.tree.n_entries == 0 or total <= 0:
            raise ValueError("cannot sample from an empty buffer")

        self.PER_b = min(1.0, self.PER_b + self.PER_b_increment_per_sampling)

        minibatch = []
        idxs = []
        priorities = []
        priority_segment = total / n

        for i in range(n):
            a = priority_segment * i
            b = priority_segment * (i + 1)
            value = np.random.uniform(a, b)
            (idx, p, data) = self.tree.get_leaf(value)
            priorities.append(p)
            minibatch.append(data)
            idxs.append(idx)

        # Convert explicitly: dividing a plain list only worked while the total
        # happened to be a NumPy scalar.
        sampling_probabilities = np.asarray(priorities, dtype=np.float64) / total
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.PER_b)
        is_weight /= is_weight.max()

        return minibatch, idxs, is_weight

    def batch_update(self, idxs, errors):
        """Recompute the priority of each leaf in ``idxs`` from the matching entry of ``errors``."""
        for i, error in zip(idxs, errors):
            p = self._getPriority(error)
            self.tree.update(i, p)



  and should_run_async(code)


### Q-Network

In [130]:
# Initial Q-learning hyperparameters.
# NOTE: hidden_size, max_episodes, batch_size and all four exploration settings
# are assigned again in the training cell further down; the values set there
# are the ones the training loop actually runs with.
hidden_size = 128
max_episodes = 200  # Total number of episodes to train the agent on.
batch_size = 64

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability
decay_rate = 0.005            # Exponential decay rate for exploration prob

In [131]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):
    """Dueling Q-network: a shared two-layer trunk feeding a value head and an advantage head.

    The output is ``Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)``.

    Args:
        state_size: number of input features per state.
        action_size: number of discrete actions (size of the last output axis).
        hidden_size: width of both hidden layers.

    Note:
        The value head ``self.state`` produces one scalar per state. State
        dicts saved from a version whose value head had ``action_size`` outputs
        do not match this layout.
    """

    def __init__(self, state_size, action_size, hidden_size):
        super(Network, self).__init__()
        self.layer1 = nn.Linear(state_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        # V(s) is a single number per state; it is broadcast over the actions in forward().
        self.state = nn.Linear(hidden_size, 1)
        self.action = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        """Return Q-values with the same leading axes as ``state`` and ``action_size`` as last axis."""
        hidden = F.relu(self.layer1(state))
        hidden = F.relu(self.layer2(hidden))
        state_value = self.state(hidden)
        action_advantage = self.action(hidden)
        # Average over the action axis, which is always the last one. Using a
        # fixed dim=1 averages over the wrong axis for inputs shaped
        # (batch, 1, state_size) and cancels the advantage stream entirely.
        mean = torch.mean(action_advantage, dim=-1, keepdim=True)
        advantage = action_advantage - mean
        return state_value + advantage


In [134]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random

class DQNAgent:
    """DQN agent with an online network, a target network and a prioritized replay buffer.

    Args:
        env: environment exposing ``observation_space.shape`` and ``action_space.n``.
        batch_size: number of transitions drawn per ``train_step``.
        target_update: stored as ``self.target_update``; the caller decides when
            to invoke ``_target_hard_update`` based on it.
        hidden_size: hidden layer width passed to ``Network``.
    """

    def __init__(self, env, batch_size, target_update, hidden_size):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.batch_size = batch_size
        self.gamma = 0.99  # discount factor
        self.lr = 0.001  # Adam learning rate
        self.target_update = target_update
        self.soft_update = False  # when True, _target_hard_update blends instead of copying
        self.tau = 0.1  # blend factor used by the soft update

        self.dqn = Network(self.state_size, self.action_size, hidden_size)
        self.dqn_target = Network(self.state_size, self.action_size, hidden_size)
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=self.lr)
        self.memory = PrioritizedReplayBuffer(10000)

        # Start with the target network equal to the online network.
        self._target_hard_update()

    @staticmethod
    def _to_row(state):
        """Copy one observation into a float32 tensor of shape ``(1, state_size)``."""
        # torch.tensor copies, so the stored transition cannot alias a buffer
        # owned by the environment.
        return torch.tensor(np.asarray(state, dtype=np.float32)).reshape(1, -1)

    @staticmethod
    def _stack_states(states):
        """Stack stored observations into one float32 tensor of shape ``(batch, state_size)``."""
        # Each stored observation may be a (1, state_size) tensor or a plain
        # array; flattening each one first yields a 2-D batch either way.
        rows = [np.asarray(s, dtype=np.float32).reshape(-1) for s in states]
        return torch.from_numpy(np.stack(rows))

    def get_action(self, state, epsilon):
        """Pick an action epsilon-greedily: random with probability ``epsilon``, else greedy."""
        if random.random() <= epsilon:
            return random.choice(range(self.action_size))
        # Only the greedy branch needs the network, and it needs no gradients.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            q_value = self.dqn(state)
        return torch.argmax(q_value).item()

    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition, prioritised by its current absolute TD error.

        The target used here selects the next action with the online network
        and evaluates it with the target network.
        """
        state = self._to_row(state)
        next_state = self._to_row(next_state)

        self.dqn.eval()
        self.dqn_target.eval()
        # Nothing here is back-propagated; the TD error is only read as a float.
        with torch.no_grad():
            next_action = torch.argmax(self.dqn(next_state), dim=1)
            target_next_Q = self.dqn_target(next_state)
            target_value = target_next_Q.gather(1, next_action.unsqueeze(1)).squeeze(1)
            target_value = reward + self.gamma * target_value * (1.0 - float(done))
            curr_Q = self.dqn(state).gather(1, torch.tensor([[action]])).squeeze(1)
        self.dqn.train()

        td_error = torch.abs(target_value - curr_Q).item()

        self.memory.store(td_error, (state, action, reward, next_state, done))

    def train_step(self):
        """Run one optimisation step on a prioritised minibatch and return the loss value.

        Also writes the new absolute TD errors back to the replay buffer as
        updated priorities.
        """
        mini_batch, idxs, IS_weights = self.memory.sample(self.batch_size)

        # Build (batch, state_size) inputs. Feeding (batch, 1, state_size)
        # instead makes an axis-1 reduction inside the network act on the
        # singleton axis rather than on the actions.
        states = self._stack_states([x[0] for x in mini_batch])
        next_states = self._stack_states([x[3] for x in mini_batch])
        actions = torch.tensor([x[1] for x in mini_batch], dtype=torch.int64).view(-1, 1)
        rewards = torch.tensor([x[2] for x in mini_batch], dtype=torch.float32)
        dones = torch.tensor([x[4] for x in mini_batch], dtype=torch.float32)
        IS_weights = torch.as_tensor(IS_weights, dtype=torch.float32)

        # Q-values of the actions that were actually taken.
        curr_Q = self.dqn(states).gather(1, actions).squeeze(1)

        # Bootstrapped target from the target network; no gradient flows through it.
        # NOTE(review): this uses max over the target network, whereas
        # append_sample picks the action with the online network. Confirm which
        # target definition is intended.
        with torch.no_grad():
            next_Q = self.dqn_target(next_states).max(1)[0]
            expected_Q = rewards + self.gamma * next_Q * (1 - dones)

        # Importance-sampling-weighted squared error.
        loss = (IS_weights * F.mse_loss(curr_Q, expected_Q, reduction='none')).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Refresh priorities using TD errors from before the parameter step.
        errors = torch.abs(curr_Q.detach() - expected_Q)
        new_priorities = errors.numpy() + 1e-5
        self.memory.batch_update(idxs, new_priorities)

        return loss.item()

    def _target_hard_update(self):
        """Synchronise the target network with the online network.

        Despite the name, this blends the parameters with factor ``self.tau``
        when ``self.soft_update`` is True; otherwise it copies them outright.
        """
        if self.soft_update:
            with torch.no_grad():
                for target_param, local_param in zip(self.dqn_target.parameters(), self.dqn.parameters()):
                    target_param.copy_(self.tau * local_param + (1.0 - self.tau) * target_param)
        else:
            self.dqn_target.load_state_dict(self.dqn.state_dict())

    def save(self, filepath):
        """Save the online network's state dict to ``filepath``."""
        torch.save(self.dqn.state_dict(), filepath)

    def load(self, filepath):
        """Load a state dict from ``filepath`` into the online network and mirror it to the target.

        The online network is left in evaluation mode.
        """
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from a trusted source (or pass weights_only=True where the installed
        # torch version supports it).
        self.dqn.load_state_dict(torch.load(filepath))
        # Keep the target network consistent with the restored weights;
        # otherwise it would keep whatever parameters it held before.
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn.eval()

In [136]:
# Training cell: builds a fresh environment and agent, then runs the
# epsilon-greedy training loop, printing one line per finished episode.
env_name = "CartPole-v0"
env = gym.make(env_name)

# Training hyperparameters
target_update = 20  # the target network is synced when update_cnt is a multiple of this
hidden_size = 64
max_episodes = 300
batch_size = 64

# Exploration parameters
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.025

# Initialise the agent
agent = DQNAgent(env, batch_size, target_update, hidden_size)

update_cnt = 0  # total environment steps taken across all episodes
scores = []  # total reward of each finished episode

for episode in range(max_episodes):
    # reset() is used as returning the observation only, and step() below is
    # unpacked into four values.
    state = env.reset()
    episode_reward = 0
    done = False

    while not done:
        update_cnt += 1
        action = agent.get_action(np.array(state, dtype=np.float32), epsilon)
        next_state, reward, done, _ = env.step(action)

        # Every transition goes into the replay buffer, including the final one.
        agent.append_sample(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            scores.append(episode_reward)
            print(f"episode: {episode+1}/{max_episodes}, score: {episode_reward}, e: {epsilon:.4}")
            # Leaving the loop here means no training step runs on the last step of an episode.
            break

        # Start training once at least batch_size steps have been taken
        # (update_cnt stands in for the number of stored transitions).
        if update_cnt >= agent.batch_size:
            agent.train_step()

            if update_cnt % agent.target_update == 0:
                agent._target_hard_update()

    # Exponential decay evaluated with the index of the episode that just
    # finished, so the value after episode 0 is still max_epsilon (the log
    # below shows e: 1.0 for both of the first two episodes).
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

episode: 1/300, score: 45.0, e: 1.0
episode: 2/300, score: 12.0, e: 1.0
episode: 3/300, score: 23.0, e: 0.9756
episode: 4/300, score: 29.0, e: 0.9517
episode: 5/300, score: 25.0, e: 0.9285
episode: 6/300, score: 16.0, e: 0.9058
episode: 7/300, score: 10.0, e: 0.8837
episode: 8/300, score: 41.0, e: 0.8621
episode: 9/300, score: 14.0, e: 0.8411
episode: 10/300, score: 13.0, e: 0.8205
episode: 11/300, score: 20.0, e: 0.8005
episode: 12/300, score: 16.0, e: 0.781
episode: 13/300, score: 12.0, e: 0.762
episode: 14/300, score: 36.0, e: 0.7434
episode: 15/300, score: 17.0, e: 0.7253
episode: 16/300, score: 26.0, e: 0.7076
episode: 17/300, score: 20.0, e: 0.6904
episode: 18/300, score: 27.0, e: 0.6736
episode: 19/300, score: 23.0, e: 0.6572
episode: 20/300, score: 19.0, e: 0.6413
episode: 21/300, score: 48.0, e: 0.6257
episode: 22/300, score: 64.0, e: 0.6105
episode: 23/300, score: 46.0, e: 0.5956
episode: 24/300, score: 74.0, e: 0.5812
episode: 25/300, score: 25.0, e: 0.5671
episode: 26/300, 