In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
from tqdm import tqdm

class Network(nn.Module):
    """Fully connected Q-network: state vector in, one value per action out.

    Two hidden ReLU layers of equal width are followed by a linear output
    layer with ``action_size`` units.
    """

    # Width used when neither the constructor argument nor a module-level
    # ``hidden_size`` variable is available.
    DEFAULT_HIDDEN_SIZE = 128

    def __init__(self, state_size, action_size, hidden_size=None):
        """Create the three linear layers.

        Args:
            state_size: Number of input features per state.
            action_size: Number of outputs (one per action).
            hidden_size: Width of both hidden layers. When omitted, a
                module-level ``hidden_size`` variable is used if one exists
                (the implicit dependency this class originally had);
                otherwise ``DEFAULT_HIDDEN_SIZE`` is used.
        """
        super().__init__()
        if hidden_size is None:
            # Keep honouring the script-level setting, but no longer fail
            # with NameError when it has not been defined.
            hidden_size = globals().get("hidden_size", self.DEFAULT_HIDDEN_SIZE)
        self.layer1 = nn.Linear(state_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        """Return the output-layer values for ``state``.

        Args:
            state: Tensor whose last dimension has ``state_size`` features.

        Returns:
            Tensor whose last dimension has ``action_size`` entries.
        """
        x = torch.relu(self.layer1(state))
        x = torch.relu(self.layer2(x))
        return self.output(x)

class DQNAgent:
    """DQN agent with an experience-replay buffer and a target network.

    The online network (``model``) is trained on mini-batches sampled from
    ``memory``; the target network (``target_model``) supplies the bootstrap
    values and is overwritten with the online weights every
    ``target_update`` training steps.
    """

    def __init__(self, env, batch_size, target_update,
                 gamma=0.99, lr=0.001, memory_size=2000):
        """Build both networks, the optimizer and the replay buffer.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.n``.
            batch_size: Number of transitions sampled per training step.
            target_update: Number of training steps between target syncs.
            gamma: Discount factor applied to the bootstrapped value.
            lr: Learning rate of the Adam optimizer.
            memory_size: Maximum number of transitions kept in the buffer.
        """
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        self.model = Network(self.state_size, self.action_size)
        self.target_model = Network(self.state_size, self.action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.memory = deque(maxlen=memory_size)
        self.update_cnt = 0  # number of completed training steps
        # update_cnt is 0 here, so this performs the initial weight copy.
        self.update_target_model()

    def get_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: A single state (sequence or array of ``state_size`` values).
            epsilon: Probability of choosing a uniformly random action.

        Returns:
            The chosen action index as a Python int.
        """
        if random.random() < epsilon:
            return random.randrange(self.action_size)

        state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        # Action selection never needs gradients; skip building the graph.
        with torch.no_grad():
            q_values = self.model(state_tensor)
        return q_values.argmax().item()

    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_step(self):
        """Run one gradient step on a random mini-batch.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. Increments ``update_cnt`` and syncs the target network
        when the counter reaches a multiple of ``target_update``.
        """
        if len(self.memory) < self.batch_size:
            return

        mini_batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*mini_batch)

        # Stack into numpy arrays first, then wrap them as torch tensors.
        states = torch.from_numpy(np.array(states, dtype=np.float32))
        actions = torch.from_numpy(np.array(actions, dtype=np.int64))
        rewards = torch.from_numpy(np.array(rewards, dtype=np.float32))
        next_states = torch.from_numpy(np.array(next_states, dtype=np.float32))
        dones = torch.from_numpy(np.array(dones, dtype=np.float32))

        self.update_cnt += 1  # one more training step performed
        self.update_target_model()  # copies only on multiples of target_update

        curr_q = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # The target is a constant for the loss: without no_grad, backward()
        # would accumulate gradients on the target network's parameters.
        with torch.no_grad():
            max_next_q = self.target_model(next_states).max(1)[0]
        # (1 - dones) removes the bootstrap term for terminal transitions.
        expected_q = rewards + self.gamma * max_next_q * (1 - dones)

        loss = nn.MSELoss()(curr_q, expected_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_model(self):
        """Copy the online weights into the target network when due.

        The copy happens only if ``update_cnt`` is a multiple of
        ``target_update``; otherwise the call is a no-op, so callers may
        invoke it after every step.
        """
        if self.update_cnt % self.target_update == 0:
            self.target_model.load_state_dict(self.model.state_dict())

# Environment setup and hyperparameters
env_name = "CartPole-v0"
env = gym.make(env_name)
target_update = 100
hidden_size = 128
max_episodes = 200
batch_size = 64
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005

# Create the agent
agent = DQNAgent(env, batch_size, target_update)

# Total reward collected in each episode; must exist before the loop
# appends to it.
scores = []

# Training loop
# NOTE: this uses the gym API in which reset() returns only the observation
# and step() returns a 4-tuple (observation, reward, done, info).
try:
    with tqdm(total=max_episodes, desc="에피소드 진행") as pbar:
        for episode in range(max_episodes):
            state = env.reset()
            episode_reward = 0
            done = False

            while not done:
                action = agent.get_action(state, epsilon)
                next_state, reward, done, _ = env.step(action)
                agent.append_sample(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward

                # train_step() returns immediately while the buffer is
                # smaller than a batch and syncs the target network itself,
                # so no extra size check or sync call is needed here.
                agent.train_step()

            scores.append(episode_reward)
            # Exponential decay of the exploration rate, floored at
            # min_epsilon.
            epsilon = max(min_epsilon, epsilon * np.exp(-decay_rate))
            pbar.update(1)
            pbar.set_postfix({'episode_reward': episode_reward})
finally:
    # Release the environment's resources even if training is interrupted.
    env.close()

print(f"평균 점수: {sum(scores) / len(scores)}")


에피소드 진행: 100%|██████████| 200/200 [00:35<00:00,  5.58it/s, episode_reward=12]

평균 점수: 56.105



