In [1]:
# Google Colab에서 노트북을 실행하실 때에는 
# https://tutorials.pytorch.kr/beginner/colab 를 참고하세요.
%matplotlib inline


# 강화 학습 (DQN) 튜토리얼

**Author**: [Adam Paszke](https://github.com/apaszke), [Mark Towers](https://github.com/pseudo-rnd-thoughts)
  **번역**: [황성수](https://github.com/adonisues), [박정환](https://github.com/9bow)


**태스크**

에이전트는 연결된 막대가 똑바로 서 있도록 카트를 왼쪽이나 오른쪽으로 움직이는 두 가지 동작 중 하나를 선택. 

In [2]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

env = gym.make("CartPole-v1")

# matplotlib 설정
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# GPU를 사용할 경우
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 재현 메모리(Replay Memory)

우리는 DQN 학습을 위해 경험 재현 메모리를 사용할 것입니다.
에이전트가 관찰한 전환(transition)을 저장하고 나중에 이 데이터를
재사용할 수 있습니다. 무작위로 샘플링하면 배치를 구성하는 전환들이
비상관(decorrelated)하게 됩니다. 이것이 DQN 학습 절차를 크게 안정시키고
향상시키는 것으로 나타났습니다.

이를 위해서 두개의 클래스가 필요합니다:

-  ``Transition`` - 우리 환경에서 단일 전환을 나타내도록 명명된 튜플.
   그것은 화면의 차이인 state로 (state, action) 쌍을 (next_state, reward) 결과로 매핑합니다.
-  ``ReplayMemory`` - 최근 관찰된 전이를 보관 유지하는 제한된 크기의 순환 버퍼.
   또한 학습을 위한 전환의 무작위 배치를 선택하기위한
   ``.sample ()`` 메소드를 구현합니다.



In [3]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """transition 저장"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [4]:
#print(expert.memory[500])

import pickle

# with open('expert.pkl', 'wb') as f:
#     pickle.dump(expert, f)


expert = ReplayMemory(10000)

with open('expert.pkl', 'rb') as f:
    expert = pickle.load(f)

print(expert.memory[0])


Transition(state=tensor([[ 0.0170,  0.2429, -0.0353, -0.3176]], device='cuda:0'), action=tensor([[0]], device='cuda:0'), next_state=tensor([[ 0.0219,  0.0483, -0.0417, -0.0363]], device='cuda:0'), reward=tensor([1.], device='cuda:0'))


## ql_diffusion.py 

In [5]:
# Copyright 2022 Twitter, Inc and Zhendong Wang.
# SPDX-License-Identifier: Apache-2.0

import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR
from utils.logger import logger

from agents.diffusion import Diffusion
from agents.model import MLP
from agents.helpers import EMA


class Critic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super(Critic, self).__init__()
        self.q1_model = nn.Sequential(nn.Linear(state_dim + action_dim, hidden_dim),
                                      nn.Mish(),
                                      nn.Linear(hidden_dim, hidden_dim),
                                      nn.Mish(),
                                      nn.Linear(hidden_dim, hidden_dim),
                                      nn.Mish(),
                                      nn.Linear(hidden_dim, 1))

        self.q2_model = nn.Sequential(nn.Linear(state_dim + action_dim, hidden_dim),
                                      nn.Mish(),
                                      nn.Linear(hidden_dim, hidden_dim),
                                      nn.Mish(),
                                      nn.Linear(hidden_dim, hidden_dim),
                                      nn.Mish(),
                                      nn.Linear(hidden_dim, 1))

    def forward(self, state, action):
        x = torch.cat([state, action], dim=-1)
        return self.q1_model(x), self.q2_model(x)

    def q1(self, state, action):
        x = torch.cat([state, action], dim=-1)
        return self.q1_model(x)

    def q_min(self, state, action):
        q1, q2 = self.forward(state, action)
        return torch.min(q1, q2)


class Diffusion_QL(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 device,
                 discount,
                 tau,
                 max_q_backup=False,
                 eta=1.0,
                 beta_schedule='linear',
                 n_timesteps=100,
                 ema_decay=0.995,
                 step_start_ema=1000,
                 update_ema_every=5,
                 lr=3e-4,
                 lr_decay=False,
                 lr_maxt=1000,
                 grad_norm=1.0,
                 ):

        self.model = MLP(state_dim=state_dim, action_dim=action_dim, device=device)

        self.actor = Diffusion(state_dim=state_dim, action_dim=action_dim, model=self.model,
                               beta_schedule=beta_schedule, n_timesteps=n_timesteps,).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.lr_decay = lr_decay
        self.grad_norm = grad_norm

        self.step = 0
        self.step_start_ema = step_start_ema
        self.ema = EMA(ema_decay)
        self.ema_model = copy.deepcopy(self.actor)
        self.update_ema_every = update_ema_every

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        if lr_decay:
            self.actor_lr_scheduler = CosineAnnealingLR(self.actor_optimizer, T_max=lr_maxt, eta_min=0.)
            self.critic_lr_scheduler = CosineAnnealingLR(self.critic_optimizer, T_max=lr_maxt, eta_min=0.)

        self.state_dim = state_dim
        # self.max_action = max_action
        self.action_dim = action_dim
        self.discount = discount
        self.tau = tau
        self.eta = eta  # q_learning weight
        self.device = device
        self.max_q_backup = max_q_backup

    def step_ema(self):
        if self.step < self.step_start_ema:
            return
        self.ema.update_model_average(self.ema_model, self.actor)

    def train(self, replay_buffer, iterations, batch_size=100, log_writer=None):
        metric = {'bc_loss': [], 'ql_loss': [], 'actor_loss': [], 'critic_loss': []} 
        for _ in range(iterations):
            # Sample replay buffer as minbatch size
            temp_buffer = replay_buffer.sample(batch_size)
            state = temp_buffer[0].state
            action = temp_buffer[0].action
            next_state = temp_buffer[0].next_state
            reward = temp_buffer[0].reward
            for i in range(1, len(temp_buffer)):
                state = torch.cat((state, temp_buffer[i].state), 0)
                action = torch.cat((action, temp_buffer[i].action), 0)
                next_state = torch.cat((next_state, temp_buffer[i].next_state), 0)
                reward = torch.cat((reward, temp_buffer[i].reward), 0)

        
        # for _ in range(iterations):
        #     # Sample replay buffer / batch
        #     state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

            """ Q Training """
            current_q1, current_q2 = self.critic(state, action)

            if self.max_q_backup:
                next_state_rpt = torch.repeat_interleave(next_state, repeats=10, dim=0)
                next_action_rpt = self.ema_model(next_state_rpt)
                target_q1, target_q2 = self.critic_target(next_state_rpt, next_action_rpt)
                target_q1 = target_q1.view(batch_size, 10).max(dim=1, keepdim=True)[0]
                target_q2 = target_q2.view(batch_size, 10).max(dim=1, keepdim=True)[0]
                target_q = torch.min(target_q1, target_q2)
            else:
                next_action = self.ema_model(next_state)
                target_q1, target_q2 = self.critic_target(next_state, next_action)
                target_q = torch.min(target_q1, target_q2)

            target_q = (reward + self.discount * target_q).detach()

            critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)

            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            if self.grad_norm > 0:
                critic_grad_norms = nn.utils.clip_grad_norm_(self.critic.parameters(), max_norm=self.grad_norm, norm_type=2)
            self.critic_optimizer.step()

            """ Policy Training """
            bc_loss = self.actor.loss(action, state)
            new_action = self.actor(state)

            q1_new_action, q2_new_action = self.critic(state, new_action)
            if np.random.uniform() > 0.5:
                q_loss = - q1_new_action.mean() / q2_new_action.abs().mean().detach()
            else:
                q_loss = - q2_new_action.mean() / q1_new_action.abs().mean().detach()
            actor_loss = bc_loss + self.eta * q_loss

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            if self.grad_norm > 0: 
                actor_grad_norms = nn.utils.clip_grad_norm_(self.actor.parameters(), max_norm=self.grad_norm, norm_type=2)
            self.actor_optimizer.step()


            """ Step Target network """
            if self.step % self.update_ema_every == 0:
                self.step_ema()

            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            self.step += 1

            """ Log """
            if log_writer is not None:
                if self.grad_norm > 0:
                    log_writer.add_scalar('Actor Grad Norm', actor_grad_norms.max().item(), self.step)
                    log_writer.add_scalar('Critic Grad Norm', critic_grad_norms.max().item(), self.step)
                log_writer.add_scalar('BC Loss', bc_loss.item(), self.step)
                log_writer.add_scalar('QL Loss', q_loss.item(), self.step)
                log_writer.add_scalar('Critic Loss', critic_loss.item(), self.step)
                log_writer.add_scalar('Target_Q Mean', target_q.mean().item(), self.step)

            metric['actor_loss'].append(actor_loss.item())
            metric['bc_loss'].append(bc_loss.item())
            metric['ql_loss'].append(q_loss.item())
            metric['critic_loss'].append(critic_loss.item())

        if self.lr_decay: 
            self.actor_lr_scheduler.step()
            self.critic_lr_scheduler.step()

        return metric

    def sample_action(self, state):
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        state_rpt = torch.repeat_interleave(state, repeats=50, dim=0)
        with torch.no_grad():
            action = self.actor.sample(state_rpt)
            q_value = self.critic_target.q_min(state_rpt, action).flatten()
            idx = torch.multinomial(F.softmax(q_value), 1)
        return action[idx].cpu().data.numpy().flatten()

    def save_model(self, dir, id=None):
        if id is not None:
            torch.save(self.actor.state_dict(), f'{dir}/actor_{id}.pth')
            torch.save(self.critic.state_dict(), f'{dir}/critic_{id}.pth')
        else:
            torch.save(self.actor.state_dict(), f'{dir}/actor.pth')
            torch.save(self.critic.state_dict(), f'{dir}/critic.pth')

    def load_model(self, dir, id=None):
        if id is not None:
            self.actor.load_state_dict(torch.load(f'{dir}/actor_{id}.pth'))
            self.critic.load_state_dict(torch.load(f'{dir}/critic_{id}.pth'))
        else:
            self.actor.load_state_dict(torch.load(f'{dir}/actor.pth'))
            self.critic.load_state_dict(torch.load(f'{dir}/critic.pth'))



In [6]:
import numpy as np
from utils import utils

def eval_policy(policy, env_name, eval_episodes=10):
    eval_env = gym.make(env_name)

    scores = []
    for _ in range(eval_episodes):
        traj_return = 0.0
        state, done = eval_env.reset(), False
        while not done:
            action = policy.sample_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            traj_return += reward
        scores.append(traj_return)

    avg_reward = np.mean(scores)
    std_reward = np.std(scores)

    normalized_scores = [eval_env.get_normalized_score(s) for s in scores]
    avg_norm_score = eval_env.get_normalized_score(avg_reward)
    std_norm_score = np.std(normalized_scores)

    utils.print_banner(
        f"Evaluation over {eval_episodes} episodes: {avg_reward:.2f} {avg_norm_score:.2f}"
    )
    return avg_reward, std_reward, avg_norm_score, std_norm_score

### main.py 

In [7]:
import argparse
import gym
import numpy as np 
import os
import torch 
import json 

from utils import utils 
from utils.logger import logger, setup_logger 

device = "cuda:0" if torch.cuda.is_available() else "cpu"

dataset = expert 
data_sampler = dataset.sample(10)

state_dim = 4
action_dim = 1

states = [sample[0] for sample in data_sampler]
# states = torch.tensor([sample[0] for sample in data_sample], dtype=torch.float32, device=device)
actions = torch.tensor([sample[1] for sample in data_sampler], dtype=torch.float32, device=device)
next_states = [sample[2] for sample in data_sampler]
rewards = [sample[3] for sample in data_sampler]

for transition in data_sampler:
    state = transition.state
    action = transition.action 
    next_state = transition.next_state
    reward = transition.reward 

    print(f"State: {state}, Action: {action}, Next State: {next_state}, Reward: {reward}") 

agent = Diffusion_QL(state_dim=state_dim,
                      action_dim=action_dim,
                      device=device,
                      discount=0.99,
                      tau=0.005,
                      max_q_backup=False,
                      beta_schedule="vp",
                      n_timesteps=100,
                      eta=1.0,
                      lr=3e-4,
                      lr_decay=False,
                      lr_maxt=2000,
                      grad_norm=4.0)

early_stop = False
stop_check = utils.EarlyStopping(tolerance=1, min_delta=0.)
writer = None 

batch_size = 256 
evaluations = []
num_epochs = 5
num_steps_per_epoch = 2
max_timesteps = num_epochs * num_steps_per_epoch 
training_iters = 0
metric = 100
eval_freq = 50

utils.print_banner(f"Training Start", separator="*", num_star=90)


# while (training_iters < max_timesteps) and (not early_stop):
#     iterations = int(eval_freq * num_steps_per_epoch)
#     loss_metric = agent.train(dataset, iterations, batch_size, writer)
#     training_iters += iterations
#     curr_epoch = int(training_iters // int(num_steps_per_epoch))

#     utils.print_banner(f"Epoch {epoch} End", separator="*", num_star=90)
#     print("Iteration: ", epoch*num_steps_per_epoch)
#     print(f"BC Loss: ", np.mean(loss_metric["bc_loss"]))
#     print(f"QL Loss: ", np.mean(loss_metric["ql_loss"]))
#     print(f"Actor Loss: ", np.mean(loss_metric["actor_loss"]))
#     print(f"Critic Loss: ", np.mean(loss_metric["critic_loss"]))



for epoch in range(num_epochs):
    for step in range(num_steps_per_epoch):
        ############################### Training ######################################
        loss_metric = agent.train(dataset, eval_freq*num_steps_per_epoch, batch_size, writer)

        ############################### Logging #######################################
        utils.print_banner(f"Epoch {epoch} End", separator="*", num_star=90)
        print("Iteration: ", epoch*num_steps_per_epoch)
        print(f"BC Loss: ", np.mean(loss_metric["bc_loss"]))
        print(f"QL Loss: ", np.mean(loss_metric["ql_loss"]))
        print(f"Actor Loss: ", np.mean(loss_metric["actor_loss"]))
        print(f"Critic Loss: ", np.mean(loss_metric["critic_loss"]))


# eval_res, eval_res_std, eval_norm_res, eval_norm_res_std = eval_policy(agent, args.env_name, args.seed,
#                                                                                eval_episodes=args.eval_episodes)



State: tensor([[-0.9180, -0.8974, -0.1123, -0.4871]], device='cuda:0'), Action: tensor([[1]], device='cuda:0'), Next State: tensor([[-0.9359, -0.7009, -0.1221, -0.8130]], device='cuda:0'), Reward: tensor([1.], device='cuda:0')
State: tensor([[ 0.0029, -0.1866,  0.0079,  0.1625]], device='cuda:0'), Action: tensor([[0]], device='cuda:0'), Next State: tensor([[-0.0008, -0.3819,  0.0111,  0.4576]], device='cuda:0'), Reward: tensor([1.], device='cuda:0')
State: tensor([[ 0.7881, -0.2419, -0.0166,  0.9280]], device='cuda:0'), Action: tensor([[1]], device='cuda:0'), Next State: tensor([[ 0.7833, -0.0465,  0.0019,  0.6301]], device='cuda:0'), Reward: tensor([1.], device='cuda:0')
State: tensor([[ 1.9581,  0.9179,  0.0723, -0.0332]], device='cuda:0'), Action: tensor([[1]], device='cuda:0'), Next State: tensor([[ 1.9764,  1.1119,  0.0716, -0.3022]], device='cuda:0'), Reward: tensor([1.], device='cuda:0')
State: tensor([[ 0.0030, -0.4209,  0.0808,  0.4851]], device='cuda:0'), Action: tensor([[1]]



******************************************************************************************
Training Start
******************************************************************************************


  critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)


******************************************************************************************
Epoch 0 End
******************************************************************************************
Iteration:  0
BC Loss:  0.3605037303268909
QL Loss:  -1.0522946539521216
Actor Loss:  -0.6917909184098243
Critic Loss:  0.14799598822544793
******************************************************************************************
Epoch 0 End
******************************************************************************************
Iteration:  0
BC Loss:  0.23865554884076118
QL Loss:  -0.9995956754684449
Actor Loss:  -0.7609401267766952
Critic Loss:  0.0006400656956247985
******************************************************************************************
Epoch 1 End
******************************************************************************************
Iteration:  2
BC Loss:  0.23117459818720817
QL Loss:  -1.000018515586853
Actor Loss:  -0.7688439166545868
Critic Loss:  0.0004963229832

1. modify DQN Replaymemory code(push code)
2. modify diffusion.py ---------> 

In [8]:
import pygame
import gymnasium as gym

env = gym.make("CartPole-v1", render_mode="human", max_episode_steps=300)

observation, _ = env.reset()
terminated = False
while not terminated:
    action = agent.sample_action(observation)
    action = round(action.item() > 0.5)      # int(value > 0.5)
    next_state, reward, terminated, truncated, _ = env.step(action)
    observation = next_state

env.close()

  idx = torch.multinomial(F.softmax(q_value), 1)
