In [3]:
import gym
import numpy as np
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim

class Network(nn.Module):
    """Fully connected Q-network.

    Two ReLU-activated hidden layers of equal width followed by a linear
    output layer that produces one value per action.

    Args:
        state_size: Number of input features per state.
        action_size: Number of outputs (one per action).
        hidden_size: Width of both hidden layers. When omitted, a
            module-level ``hidden_size`` variable is used if one exists
            (this keeps the notebook's existing configuration variable
            working); otherwise the width defaults to 128.
    """

    def __init__(self, state_size: int, action_size: int, hidden_size=None):
        super().__init__()
        if hidden_size is None:
            # Previously this name was read straight from the notebook's
            # globals, which raised NameError unless the configuration cell
            # had already run. Keep honouring that global when it exists,
            # but fall back to a sane default instead of crashing.
            hidden_size = globals().get("hidden_size", 128)

        self.layer1 = nn.Linear(state_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        """Return the network output for ``state`` (last dim = ``state_size``)."""
        layer1 = torch.relu(self.layer1(state))
        layer2 = torch.relu(self.layer2(layer1))
        value = self.value(layer2)
        return value

class DQNAgent:
    """DQN agent with an experience-replay buffer and a single Q-network.

    The same network is used both for the current Q-values and for the
    bootstrapped targets (there is no separate target network).

    Args:
        env: Environment exposing ``observation_space.shape`` and
            ``action_space.n``.
        batch_size: Number of transitions sampled per training step.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate of the Adam optimizer.
        memory_size: Maximum number of transitions kept in the replay buffer;
            the oldest transitions are dropped once it is full.
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, env: "gym.Env", batch_size: int, gamma: float = 0.99,
                 lr: float = 0.001, memory_size: int = 2000):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n

        self.batch_size = batch_size
        self.gamma = gamma
        self.dqn = Network(self.state_size, self.action_size)
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=lr)
        self.memory = deque(maxlen=memory_size)

    def get_action(self, state, epsilon):
        """Choose an action epsilon-greedily.

        Args:
            state: A single observation (array-like of length ``state_size``).
            epsilon: Probability threshold for taking a uniformly random action.

        Returns:
            The chosen action index as a plain Python ``int``.
        """
        if np.random.rand() <= epsilon:
            # Cast so both branches return the same type (the greedy branch
            # returns a Python int via ``.item()``).
            return int(np.random.choice(self.action_size))

        # Only the greedy branch needs the network, and action selection
        # never needs gradients.
        state = torch.as_tensor(np.asarray(state, dtype=np.float32)).unsqueeze(0)
        with torch.no_grad():
            q_value = self.dqn(state)[0]
        return torch.argmax(q_value).item()

    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_step(self):
        """Run one gradient step on a random mini-batch from the replay buffer.

        Does nothing while the buffer holds fewer than ``batch_size``
        transitions.
        """
        if len(self.memory) < self.batch_size:
            return

        mini_batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*mini_batch)

        # Stack each field into one NumPy array first, then hand it to torch
        # once (no second copy through torch.tensor).
        states = torch.as_tensor(np.array(states, dtype=np.float32))
        actions = torch.as_tensor(np.array(actions, dtype=np.int64))
        rewards = torch.as_tensor(np.array(rewards, dtype=np.float32))
        next_states = torch.as_tensor(np.array(next_states, dtype=np.float32))
        dones = torch.as_tensor(np.array(dones, dtype=np.float32))

        # Q(s, a) for the actions that were actually taken.
        curr_Qs = self.dqn(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Targets are treated as constants: no gradient flows through them.
        with torch.no_grad():
            next_Qs = self.dqn(next_states).max(1)[0]
            # (1 - dones) removes the bootstrap term for transitions stored
            # with done=True.
            target_Qs = rewards + self.gamma * next_Qs * (1 - dones)

        loss = torch.nn.functional.mse_loss(curr_Qs, target_Qs)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


In [7]:
import gym
import numpy as np
import torch
import random
from collections import deque
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim

# The PyTorch-based Network and DQNAgent classes must be defined before this
# cell runs.

# Reproducibility: seed every random number generator the run draws from,
# not only the environment. Action selection uses np.random, replay sampling
# uses the `random` module, and the network weights are initialised by torch.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Environment setup
env_name = "CartPole-v0"
env = gym.make(env_name)
env.reset(seed=SEED)  # seed the environment itself

# Hyperparameters
hidden_size = 128    # hidden-layer width; read by Network when it is instantiated
max_episodes = 200   # total number of training episodes
batch_size = 64

# Exploration parameters
max_epsilon = 1.0    # exploration probability at the start
min_epsilon = 0.01   # lower bound on the exploration probability
decay_rate = 0.005   # exponential decay rate of the exploration probability
epsilon = max_epsilon  # current exploration probability

# Build the agent (after seeding, so its weight initialisation is repeatable)
agent = DQNAgent(env, batch_size)

def reset_env(env):
    """Reset ``env`` and return only the initial observation.

    Accepts both Gym reset conventions: a bare observation, or an
    ``(observation, info)`` pair.
    """
    out = env.reset()
    if isinstance(out, tuple) and len(out) == 2 and isinstance(out[1], dict):
        return out[0]
    return out


def step_env(env, action):
    """Step ``env`` and return ``(next_state, reward, done)``.

    Accepts both Gym step conventions: the 4-tuple
    ``(obs, reward, done, info)`` and the 5-tuple
    ``(obs, reward, terminated, truncated, info)``. For the 5-tuple form the
    episode counts as done when either flag is set.
    """
    out = env.step(action)
    if len(out) == 5:
        next_state, reward, terminated, truncated, _ = out
        return next_state, reward, bool(terminated or truncated)
    next_state, reward, done, _ = out
    return next_state, reward, done


if __name__ == "__main__":
    scores = []  # total reward of every finished episode

    with tqdm(total=max_episodes, desc="에피소드 진행") as pbar:
        for episode in range(max_episodes):
            state = reset_env(agent.env)
            episode_reward = 0
            done = False

            while not done:
                action = agent.get_action(state, epsilon)
                next_state, reward, done = step_env(agent.env, action)
                agent.append_sample(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward

                # Learn after every non-terminal step. train_step() returns
                # immediately while the replay buffer holds fewer than
                # batch_size transitions, so no extra guard is needed here.
                if not done:
                    agent.train_step()

            scores.append(episode_reward)
            pbar.set_postfix({'episode_reward': episode_reward})
            pbar.update(1)

            # Exponentially decay the exploration rate towards min_epsilon.
            epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    # Average over the episodes actually recorded.
    if scores:
        print(f"평균 점수: {sum(scores) / len(scores)}")


에피소드 진행: 100%|██████████| 200/200 [00:39<00:00,  5.04it/s, episode_reward=108]

평균 점수: 72.57



