In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gym
import numpy as np
import random
from collections import deque

# Factorized Gaussian Noise Layer
class NoisyLinear(nn.Module):
    def __init__(self, in_features, out_features, sigma_init=0.5, bias=True):
        super(NoisyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.sigma_init = sigma_init

        self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features))
        self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features))

        if bias:
            self.bias_mu = nn.Parameter(torch.FloatTensor(out_features))
            self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features))
            self.register_buffer('bias_epsilon', torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias_mu', None)
            self.register_parameter('bias_sigma', None)

        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        mu_range = 1 / np.sqrt(self.in_features)
        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.weight_sigma.data.fill_(self.sigma_init / np.sqrt(self.in_features))

        if self.bias_mu is not None:
            self.bias_mu.data.uniform_(-mu_range, mu_range)
            self.bias_sigma.data.fill_(self.sigma_init / np.sqrt(self.out_features))

    def reset_noise(self):
        epsilon_in = self._scale_noise(self.in_features)
        epsilon_out = self._scale_noise(self.out_features)

        self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))
        if self.bias_mu is not None:
            self.bias_epsilon.copy_(epsilon_out)

    def _scale_noise(self, size):
        x = torch.randn(size)
        return x.sign().mul_(x.abs().sqrt_())

    def forward(self, input):
        if self.training:
            return F.linear(input, self.weight_mu + self.weight_sigma * self.weight_epsilon,
                            self.bias_mu + self.bias_sigma * self.bias_epsilon)
        else:
            return F.linear(input, self.weight_mu, self.bias_mu)

# Neural Network Model Defined at Here.
class Network(nn.Module):
    def __init__(self, state_size, action_size, hidden_size):
        super(Network, self).__init__()
        self.fc1 = NoisyLinear(state_size, hidden_size)
        self.fc2 = NoisyLinear(hidden_size, hidden_size)
        self.fc3 = NoisyLinear(hidden_size, action_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class DQNAgent:
    def __init__(self, env, batch_size, target_update, hidden_size):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = 0.99

        self.dqn = Network(self.state_size, self.action_size, hidden_size)
        self.dqn_target = Network(self.state_size, self.action_size, hidden_size)
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=0.001)

        self.memory = deque(maxlen=10000)
        self._target_hard_update()

    def get_action(self, state, epsilon):
        if random.random() > epsilon:
            with torch.no_grad():
                state = torch.tensor([state], dtype=torch.float32)
                q_value = self.dqn(state)
                action = q_value.max(1)[1].item()
        else:
            action = self.env.action_space.sample()
        return action

    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train_step(self):
        if len(self.memory) < self.batch_size:
            return

        mini_batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*mini_batch)

        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        curr_Qs = self.dqn(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        next_Qs = self.dqn_target(next_states).max(1)[0]
        expected_Qs = rewards + self.gamma * next_Qs * (1 - dones)

        loss = F.mse_loss(curr_Qs, expected_Qs.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def _target_hard_update(self):
        self.dqn_target.load_state_dict(self.dqn.state_dict())


In [None]:
from tqdm import tqdm

# Main training loop
env_name = "CartPole-v0"
env = gym.make(env_name)

hidden_size = 128
max_episodes = 500
batch_size = 64
target_update = 100

epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005

agent = DQNAgent(env, batch_size, target_update, hidden_size)

scores = []

with tqdm(total=max_episodes, desc="에피소드 진행") as pbar:
    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0
        done = False

        while not done:
            action = agent.get_action(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            agent.append_sample(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward

            agent.train_step()

            if episode % agent.target_update == 0:
                agent._target_hard_update()

        scores.append(episode_reward)
        pbar.update(1)
        pbar.set_postfix({'episode_reward': episode_reward})
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

print(f"평균 점수: {sum(scores) / max_episodes}")

  and should_run_async(code)
  logger.warn(
  deprecation(
  deprecation(
  states = torch.tensor(states, dtype=torch.float32)
에피소드 진행:  74%|███████▎  | 368/500 [00:50<00:54,  2.40it/s, episode_reward=71]