In [3]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

class Network(nn.Module):
    """Fully connected network mapping a state vector to one value per action.

    The architecture is two ReLU-activated hidden layers of equal width
    followed by a linear output layer.

    Args:
        state_size: Number of input features.
        action_size: Number of outputs (one per action).
        hidden_size: Width of both hidden layers. This used to be read from a
            module-level global that was only assigned in a later notebook
            cell; it is now an explicit argument so the class works on a
            fresh kernel regardless of cell execution order.
    """

    def __init__(self, state_size: int, action_size: int, hidden_size: int = 128):
        super().__init__()
        self.layer1 = nn.Linear(state_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        """Return the output-layer values for ``state``.

        Args:
            state: A tensor, NumPy array or nested sequence whose last
                dimension equals ``state_size``.

        Returns:
            A float32 tensor whose last dimension equals ``action_size``.
        """
        # as_tensor shares memory where it can and converts everything
        # (including float64 arrays/tensors) to the float32 the layers expect.
        state = torch.as_tensor(state, dtype=torch.float32)
        hidden = torch.relu(self.layer1(state))
        hidden = torch.relu(self.layer2(hidden))
        return self.value(hidden)

class DQNAgent:
    """Epsilon-greedy agent that trains a Q-network one transition at a time.

    The bootstrap target in ``train_step`` uses the Q-value of the
    ``next_action`` supplied by the caller (a SARSA-style target) rather than
    the maximum over next-state Q-values.

    Args:
        env: Environment exposing ``observation_space.shape`` and a discrete
            ``action_space.n``.
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
    """

    def __init__(self, env: "gym.Env", lr: float = 0.001, gamma: float = 0.99):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.lr = lr
        self.gamma = gamma
        self.dqn = Network(self.state_size, self.action_size)
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=self.lr)

    def get_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: 1-D NumPy array holding the current observation.
            epsilon: Probability threshold for taking a uniformly random action.

        Returns:
            The index of the chosen action.
        """
        # Explore first: the network output is not needed for a random action.
        if np.random.rand() <= epsilon:
            return np.random.choice(self.action_size)

        state = torch.from_numpy(state).float().unsqueeze(0)
        # Action selection never back-propagates, so skip graph construction.
        with torch.no_grad():
            q_value = self.dqn(state)[0]
        return torch.argmax(q_value).item()

    def train_step(self, state, action, reward, next_state, done, next_action):
        """Run one gradient step on a single transition.

        Args:
            state: 1-D NumPy array, observation before the action.
            action: Index of the action that was taken.
            reward: Scalar reward received (int or float).
            next_state: 1-D NumPy array, observation after the action.
            done: True when the transition ended the episode; the target is
                then the reward alone.
            next_action: Action index used to read the bootstrap value from
                the next state's Q-values.

        Returns:
            The squared TD error for this transition as a Python float.
        """
        state = torch.from_numpy(state).float()
        next_state = torch.from_numpy(next_state).float()
        action = torch.tensor(action)
        # Force float32: an integer reward would otherwise yield an int64
        # target on terminal steps and a dtype mismatch inside mse_loss.
        reward = torch.tensor(reward, dtype=torch.float32)

        curr_Q = self.dqn(state)[action]

        if done:
            target_Q = reward
        else:
            # The target is treated as a constant, so no graph is needed.
            with torch.no_grad():
                next_Q = self.dqn(next_state)
            target_Q = reward + self.gamma * next_Q[next_action]

        loss = torch.nn.functional.mse_loss(curr_Q, target_Q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()


In [4]:
import gym
import numpy as np
import torch
from tqdm import tqdm

# Add the PyTorch-based DQNAgent class and any additional module definitions here.

# Seed every random source the run touches. Seeding the environment alone is
# not enough: exploration draws from np.random and the network weights are
# initialised from torch's generator, so both must be seeded before the agent
# is constructed.
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)

# Environment setup
env_name = "CartPole-v0"
env = gym.make(env_name)
env.seed(seed)  # for reproducibility

# Hyperparameter initialisation
hidden_size = 128    # width of the hidden layers
max_episodes = 2500  # total number of episodes

# Exploration parameters
epsilon = 1.0        # current exploration rate
max_epsilon = 1.0    # exploration probability at the start
min_epsilon = 0.01   # minimum exploration probability
decay_rate = 0.005   # exponential decay rate of the exploration probability

# Create the agent to be trained
agent = DQNAgent(env)

if __name__ == "__main__":
    # Number of consecutive episodes averaged into each line of the report.
    report_window = 500
    scores = []

    with tqdm(total=max_episodes, desc="에피소드 진행") as pbar:
        for episode in range(max_episodes):
            state = agent.env.reset()
            episode_reward = 0
            done = False

            while not done:
                action = agent.get_action(state, epsilon)
                next_state, reward, done, _ = agent.env.step(action)
                # NOTE: this action is sampled only to form the bootstrap
                # target; the action actually executed on the next step is
                # sampled again at the top of the loop.
                next_action = agent.get_action(next_state, epsilon)

                agent.train_step(state, action, reward, next_state, done, next_action)

                state = next_state
                episode_reward += reward

            # The while-loop exits exactly when the episode has finished.
            scores.append(episode_reward)
            pbar.set_postfix({'episode_reward': episode_reward})
            pbar.update(1)

            # Exponentially anneal exploration from max_epsilon towards min_epsilon.
            epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    # Guard the denominator so an empty run reports 0 instead of dividing by zero.
    print(f"평균 점수: {sum(scores) / max(len(scores), 1)}")

    # Slice into fixed-size windows rather than using np.split, so a
    # max_episodes that is not a multiple of the window still works and each
    # average is divided by the real number of episodes in that window.
    # The variable name is kept for compatibility although the window is
    # `report_window` episodes, not one thousand.
    rewards_per_thousand_episodes = [
        np.array(scores[start:start + report_window])
        for start in range(0, len(scores), report_window)
    ]

    # The heading now states the actual window size.
    print(f"********{report_window}개 에피소드당 평균 보상********\n")
    count = 0
    for r in rewards_per_thousand_episodes:
        count += len(r)  # label each line with the last episode it covers
        print(f"{count}: {sum(r) / len(r)}")

에피소드 진행: 100%|██████████| 2500/2500 [17:18<00:00,  2.41it/s, episode_reward=161]

평균 점수: 160.8664
********천 개 에피소드당 평균 보상********

500: 119.652
1000: 170.21
1500: 179.35
2000: 170.81
2500: 164.31



