# In [1]:
import gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque


# DQN model definition
class DQN(nn.Module):
    """Fully connected network mapping a state vector to one value per action.

    The network is two hidden layers of 128 units with ReLU activations,
    followed by a linear output layer of width ``action_size``.
    """

    def __init__(self, state_size, action_size):
        """Create the three linear layers.

        Args:
            state_size: Number of input features.
            action_size: Number of outputs (one per action).
        """
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, state):
        """Return the output-layer activations for ``state``."""
        hidden = self.fc1(state)
        hidden = torch.relu(hidden)
        hidden = self.fc2(hidden)
        hidden = torch.relu(hidden)
        q_values = self.fc3(hidden)
        return q_values


# Reinforcement-learning hyperparameters (all consumed by train_dqn)
BATCH_SIZE = 64  # experiences sampled per optimization step; also the minimum buffer fill before training
GAMMA = 0.99  # discount factor applied to the target network's next-state value
EPSILON_START = 1.0  # initial probability of taking a random action
EPSILON_END = 0.1  # lower bound on the random-action probability
EPSILON_DECAY = 1000  # number of per-episode decrements that take epsilon from START to END
LEARNING_RATE = 0.0005  # learning rate passed to the Adam optimizer
MEMORY_SIZE = 10000  # capacity of the replay buffer
TARGET_UPDATE = 10  # copy policy weights into the target network every this many episodes


# Experience replay buffer
class ReplayBuffer:
    """Bounded FIFO store of experiences with uniform random sampling.

    Once ``capacity`` items are held, adding another discards the oldest.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` items."""
        self.buffer = deque([], capacity)

    def add(self, experience):
        """Append ``experience`` to the newest end of the buffer."""
        storage = self.buffer
        storage.append(experience)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` distinct stored items chosen at random."""
        picked = random.sample(self.buffer, k=batch_size)
        return picked

    def size(self):
        """Return the number of items currently stored."""
        stored = len(self.buffer)
        return stored


# Training function
def _reset_env(env):
    """Reset ``env`` and return only the initial observation.

    Older Gym releases return the observation alone; Gym >= 0.26 returns an
    ``(observation, info)`` pair. Both forms are accepted.
    """
    result = env.reset()
    is_obs_info_pair = (
        isinstance(result, tuple)
        and len(result) == 2
        and isinstance(result[1], dict)
    )
    return result[0] if is_obs_info_pair else result


def _step_env(env, action):
    """Step ``env`` and return ``(next_obs, reward, terminated, episode_over)``.

    Supports both the 4-tuple step API (``done``) and the 5-tuple step API
    (``terminated``, ``truncated``). With the 4-tuple API the single ``done``
    flag is used for both returned booleans.
    """
    result = env.step(action)
    if len(result) == 5:
        next_obs, reward, terminated, truncated, _ = result
        return next_obs, reward, bool(terminated), bool(terminated or truncated)
    next_obs, reward, done, _ = result
    return next_obs, reward, bool(done), bool(done)


def _optimize_model(policy_net, target_net, optimizer, experiences):
    """Run one gradient step of the policy network on a batch of experiences."""
    states, actions, rewards, next_states, dones = zip(*experiences)

    batch_state = torch.stack(states)
    # Explicit dtypes: gather() needs int64 indices, and the loss needs the
    # target to match the float32 network output.
    batch_action = torch.tensor(actions, dtype=torch.int64)
    batch_reward = torch.tensor(rewards, dtype=torch.float32)
    batch_next_state = torch.stack(next_states)
    # Float mask: PyTorch does not support `1 - tensor` on bool tensors.
    batch_done = torch.tensor(dones, dtype=torch.float32)

    # Q-value of the action that was actually taken in each sampled state.
    current_q = (
        policy_net(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)
    )

    # The TD target is a constant for this update; do not track gradients
    # through the target network.
    with torch.no_grad():
        next_q = target_net(batch_next_state).max(1)[0]
        target_q = batch_reward + GAMMA * next_q * (1.0 - batch_done)

    loss = nn.functional.mse_loss(current_q, target_q)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


def train_dqn():
    """Train a DQN agent on ``CartPole-v1`` and print per-episode progress.

    Uses an epsilon-greedy policy whose epsilon decreases linearly once per
    episode, a replay buffer sampled after every environment step, and a
    target network refreshed every ``TARGET_UPDATE`` episodes. Returns None.
    """
    # Environment setup. For autonomous-driving work, use an advanced
    # simulator such as 'CARLA' instead.
    env = gym.make("CartPole-v1")

    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        # Policy network, target network and optimizer.
        policy_net = DQN(state_size, action_size)
        target_net = DQN(state_size, action_size)
        target_net.load_state_dict(policy_net.state_dict())
        target_net.eval()

        optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)

        # Experience replay buffer.
        memory = ReplayBuffer(MEMORY_SIZE)

        epsilon = EPSILON_START
        epsilon_step = (EPSILON_START - EPSILON_END) / EPSILON_DECAY
        episodes = 1000
        for episode in range(episodes):
            state = torch.tensor(_reset_env(env), dtype=torch.float32)
            episode_over = False
            total_reward = 0

            while not episode_over:
                # Explore with probability epsilon, otherwise act greedily.
                if random.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    with torch.no_grad():
                        action = policy_net(state).argmax().item()

                # Advance the environment by one step.
                next_state, reward, terminated, episode_over = _step_env(
                    env, action
                )
                next_state = torch.tensor(next_state, dtype=torch.float32)

                # Store plain Python scalars so batching is dtype-stable.
                # Only `terminated` masks bootstrapping in the TD target.
                memory.add(
                    (state, int(action), float(reward), next_state, terminated)
                )
                state = next_state
                total_reward += reward

                # Learn from a random batch once enough experience exists.
                if memory.size() >= BATCH_SIZE:
                    _optimize_model(
                        policy_net,
                        target_net,
                        optimizer,
                        memory.sample(BATCH_SIZE),
                    )

            # Decay the exploration rate once per episode.
            epsilon = max(EPSILON_END, epsilon - epsilon_step)

            # Periodically sync the target network with the policy network.
            if episode % TARGET_UPDATE == 0:
                target_net.load_state_dict(policy_net.state_dict())

            print(
                f"Episode {episode+1}/{episodes}, Total Reward: {total_reward}, Epsilon: {epsilon:.2f}"
            )
    finally:
        # Release environment resources even if training raises.
        env.close()


# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    train_dqn()

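# Recorded notebook output: ModuleNotFoundError: No module named 'torch'
# (PyTorch was not installed in the environment where this cell was run.)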