In [3]:
import gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

# 3.2 Python function for one hot encoding
def to_one_hot(i, n_classes=None):
    a = np.zeros(n_classes, 'uint8')
    a[i] = 1
    return a

# 3.3 CREATING THE Q-Network
# Neural Network Model Defined at Here.
class Network(nn.Module):
    def __init__(self, state_size: int, action_size: int):
        """Initialization."""
        super(Network, self).__init__()

        self.layer1 = nn.Linear(state_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        if not isinstance(state, torch.Tensor):
            state = torch.from_numpy(state).float()
        layer1 = torch.relu(self.layer1(state))
        layer2 = torch.relu(self.layer2(layer1))
        value = self.value(layer2)
        return value


class DQNAgent:
    def __init__(self, env: gym.Env):
        """Initialization."""
        self.env = env

        self.state_size  = env.observation_space.n
        self.action_size = env.action_space.n

        self.lr = 0.001
        self.gamma = 0.99

        self.dqn = Network(self.state_size, self.action_size)
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=self.lr)

    # 3.4.1 EXPLORATION VS EXPLOITATION
    def get_action(self, state, epsilon):
        state = to_one_hot(state, self.state_size)
        if random.random() <= epsilon:
            return random.choice(range(self.action_size))
        else:
            state = torch.from_numpy(state).float().unsqueeze(0)
            q_values = self.dqn(state)
            return torch.argmax(q_values).item()

    # 3.4.2 UPDATING THE Q-VALUE
    def train_step(self, state, action, reward, next_state, done):
        state = torch.from_numpy(to_one_hot(state, self.state_size)).float()
        next_state = torch.from_numpy(to_one_hot(next_state, self.state_size)).float()
        action = torch.tensor(action)
        reward = torch.tensor(reward)

        if done:
            target = reward
        else:
            next_state_values = self.dqn(next_state).detach()
            target = reward + self.gamma * torch.max(next_state_values)

        predicted_value = self.dqn(state)[action]

        loss = torch.nn.functional.mse_loss(predicted_value, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


In [5]:
import gym
import numpy as np
import torch
from tqdm import tqdm

# DQNAgent 클래스와 필요한 모듈들을 정의하는 부분은 여기에 추가해야 합니다.

# 2.2 환경 생성
env_name = "FrozenLake-v1"
env = gym.make(env_name)
env.seed(1)  # 재현 가능성을 위해

# 2.4 하이퍼파라미터 초기화
hidden_size = 128
max_episodes = 2500  # 총 에피소드 수
max_steps = 99       # 에피소드당 최대 스텝 수
gamma = 0.95         # 할인율
render = False       # 게임 환경 표시 여부

# 탐험 파라미터
epsilon = 1.0        # 탐험률
max_epsilon = 1.0    # 시작시 탐험 확률
min_epsilon = 0.01   # 최소 탐험 확률
decay_rate = 0.005   # 탐험 확률의 지수 감소율

# 에이전트 훈련
agent = DQNAgent(env)

if __name__ == "__main__":
    scores = []

    with tqdm(total=max_episodes, desc="에피소드 진행") as pbar:
        for episode in range(max_episodes):
            state = agent.env.reset()
            episode_reward = 0
            done = False

            if render: env.render()

            while not done:
                action = agent.get_action(state, epsilon)
                next_state, reward, done, _ = agent.env.step(action)

                if render: env.render()

                agent.train_step(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward

                if done:
                    scores.append(episode_reward)
                    pbar.set_postfix({'episode_reward': episode_reward})
                    pbar.update(1)
                    break

            epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    print(f"평균 점수: {sum(scores) / max_episodes}")

    count = 500
    rewards_per_thousand_episodes = np.split(np.array(scores), int(max_episodes / 500))

    print("********천 개 에피소드당 평균 보상********\n")
    for r in rewards_per_thousand_episodes:
        print(f"{count}: {sum(r) / 500}")
        count += 500


에피소드 진행: 100%|██████████| 2500/2500 [03:53<00:00, 10.71it/s, episode_reward=0]

평균 점수: 0.4356
********천 개 에피소드당 평균 보상********

500: 0.112
1000: 0.366
1500: 0.57
2000: 0.504
2500: 0.626



