In [1]:
import gym
import numpy as np
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Neural Network Model Defined using PyTorch
class Network(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=128):
        super(Network, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, 1)
        self.advantage = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        value = self.value(x)
        advantage = self.advantage(x)

        # Q value calculation
        return value + advantage - advantage.mean(dim=1, keepdim=True)

class DQNAgent:
    def __init__(self, env, batch_size, target_update):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.batch_size = batch_size
        self.gamma = 0.99
        self.lr = 0.001
        self.target_update = target_update

        self.dqn = Network(self.state_size, self.action_size)
        self.dqn_target = Network(self.state_size, self.action_size)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=self.lr)

        self.memory = deque(maxlen=2000)

    def get_action(self, state, epsilon):
        if np.random.rand() <= epsilon:
            return self.env.action_space.sample()
        else:
            state = torch.FloatTensor(state).unsqueeze(0)
            with torch.no_grad():
                q_values = self.dqn(state)
            return q_values.max(1)[1].item()

    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train_step(self):
        if len(self.memory) < self.batch_size:
            return

        mini_batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*mini_batch)

        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        curr_Q = self.dqn(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        next_Q = self.dqn_target(next_states).max(1)[0]
        expected_Q = rewards + self.gamma * next_Q * (1 - dones)

        loss = nn.MSELoss()(curr_Q, expected_Q.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target(self):
        self.dqn_target.load_state_dict(self.dqn.state_dict())

# Environment Setup
env = gym.make("CartPole-v0")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Hyperparameters
batch_size = 64
target_update = 100
max_episodes = 200
epsilon = 1.0
min_epsilon = 0.01
max_epsilon = 1.0
decay_rate = 0.005

# Agent Initialization
agent = DQNAgent(env, batch_size, target_update)

# Training Loop
scores = []
update_cnt = 0
with tqdm(total=max_episodes, desc="에피소드 진행") as pbar:
    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0
        done = False

        while not done:
            action = agent.get_action(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            agent.append_sample(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward

            if len(agent.memory) >= agent.batch_size:
                agent.train_step()
                update_cnt += 1

                # Target Network Update
                if update_cnt % agent.target_update == 0:
                    agent.update_target()

        scores.append(episode_reward)
        epsilon = max(min_epsilon, epsilon * np.exp(-decay_rate))
        pbar.update(1)
        pbar.set_postfix({'episode_reward': episode_reward})

print(f"평균 점수: {sum(scores) / max_episodes}")

  logger.warn(
  deprecation(
  deprecation(
  states = torch.FloatTensor(states)
에피소드 진행: 100%|██████████| 200/200 [00:43<00:00,  4.60it/s, episode_reward=97]

평균 점수: 52.815



