In [2]:
import time
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

import gymnasium as gym
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
import wandb
import import_ipynb
from datetime import datetime
from shutil import copyfile

from c_qnet import QNet, ReplayBuffer, Transition, MODEL_DIR

importing Jupyter notebook from c_qnet.ipynb
TORCH VERSION: 2.0.1


In [3]:
class DQN:
    """DQN agent with a replay buffer and a periodically synced target network.

    The agent collects transitions from ``env`` using the action chosen by
    ``QNet.get_action``, stores them in a ``ReplayBuffer``, trains the online
    Q-network on sampled mini-batches and periodically evaluates it on
    ``test_env``. Training stops early once the average validation reward
    exceeds ``config["episode_reward_avg_solved"]``.
    """

    def __init__(self, env, test_env, config, use_wandb):
        """Store the environments, read hyperparameters and build the networks.

        Args:
            env: Environment used for collecting training transitions.
            test_env: Environment used by ``validate``.
            config: Dict of hyperparameters; every key read below is required.
            use_wandb: When true, a wandb run is created and metrics are logged.
        """
        self.env = env
        self.test_env = test_env
        self.use_wandb = use_wandb

        self.env_name = config["env_name"]

        # Timestamp used as the wandb run name and inside the saved model filename.
        self.current_time = datetime.now().astimezone().strftime('%Y-%m-%d_%H-%M-%S')

        # NOTE: self.wandb only exists when use_wandb is true; guard every use of it.
        if self.use_wandb:
            self.wandb = wandb.init(
                project="DQN_{0}".format(self.env_name),
                name=self.current_time,
                config=config
            )

        self.max_num_episodes = config["max_num_episodes"]
        self.batch_size = config["batch_size"]
        self.learning_rate = config["learning_rate"]
        self.gamma = config["gamma"]
        self.steps_between_train = config["steps_between_train"]
        self.target_sync_step_interval = config["target_sync_step_interval"]
        self.replay_buffer_size = config["replay_buffer_size"]
        self.epsilon_start = config["epsilon_start"]
        self.epsilon_end = config["epsilon_end"]
        self.epsilon_final_scheduled_percent = config["epsilon_final_scheduled_percent"]
        self.print_episode_interval = config["print_episode_interval"]
        self.train_num_episodes_before_next_test = config["train_num_episodes_before_next_test"]
        self.validation_num_episodes = config["validation_num_episodes"]
        self.episode_reward_avg_solved = config["episode_reward_avg_solved"]

        # Episode index at which epsilon reaches epsilon_end.
        self.epsilon_scheduled_last_episode = self.max_num_episodes * self.epsilon_final_scheduled_percent

        # network: the sizes are hard-coded (4 observation features, 2 actions),
        # which matches the CartPole-v1 setup this notebook is run with.
        self.q = QNet(n_features=4, n_actions=2)
        self.target_q = QNet(n_features=4, n_actions=2)
        # The target network starts as an exact copy of the online network.
        self.target_q.load_state_dict(self.q.state_dict())

        self.optimizer = optim.Adam(self.q.parameters(), lr=self.learning_rate)

        # agent
        self.replay_buffer = ReplayBuffer(self.replay_buffer_size)

        self.time_steps = 0            # environment steps taken so far
        self.total_time_steps = 0      # incremented together with time_steps
        self.training_time_steps = 0   # number of train() calls so far

    def epsilon_scheduled(self, current_episode):
        """Return the exploration rate for ``current_episode``.

        Epsilon moves linearly from ``epsilon_start`` to ``epsilon_end`` over
        the first ``epsilon_scheduled_last_episode`` episodes and stays at
        ``epsilon_end`` afterwards. A schedule length of zero (or less) means
        the schedule is already finished, so ``epsilon_end`` is returned
        instead of dividing by zero.
        """
        if self.epsilon_scheduled_last_episode <= 0:
            fraction = 1.0
        else:
            fraction = min(current_episode / self.epsilon_scheduled_last_episode, 1.0)

        epsilon = min(
            self.epsilon_start + fraction * (self.epsilon_end - self.epsilon_start),
            self.epsilon_start
        )
        return epsilon

    @staticmethod
    def _format_elapsed(start_time):
        """Return the wall-clock time elapsed since ``start_time`` as HH:MM:SS."""
        return time.strftime('%H:%M:%S', time.gmtime(time.time() - start_time))

    def train_loop(self):
        """Run training episodes until solved or ``max_num_episodes`` is reached.

        Per episode: act in ``env``, store transitions, train on the configured
        step interval, optionally print statistics, optionally validate, and
        log to wandb when enabled. The loop stops early when the average
        validation reward exceeds ``episode_reward_avg_solved``.
        """
        loss = 0.0
        validation_episode_reward_avg = 0.0
        is_terminated = False

        total_train_start_time = time.time()

        for n_episode in range(1, self.max_num_episodes + 1):
            epsilon = self.epsilon_scheduled(n_episode)

            episode_reward = 0

            observation, _ = self.env.reset()

            done = False

            while not done:
                self.time_steps += 1
                self.total_time_steps += 1

                action = self.q.get_action(observation, epsilon)

                next_observation, reward, terminated, truncated, _ = self.env.step(action)

                # Only `terminated` is stored as the done flag: a truncated
                # episode ends the rollout but is not marked done in the buffer.
                transition = Transition(observation, action, next_observation, reward, terminated)

                self.replay_buffer.append(transition)

                episode_reward += reward
                observation = next_observation
                done = terminated or truncated

                # Train on the configured step interval, but only after more
                # than batch_size environment steps have been taken.
                if self.total_time_steps % self.steps_between_train == 0 and self.time_steps > self.batch_size:
                    loss = self.train()

            if n_episode % self.print_episode_interval == 0:
                print(
                    "[Episode {:3,}, Time Steps {:6,}]".format(n_episode, self.time_steps),
                    "Episode Reward: {:>5},".format(episode_reward),
                    "Replay buffer: {:>6,},".format(self.replay_buffer.size()),
                    "Loss: {:6.3f},".format(loss),
                    "Epsilon: {:4.2f},".format(epsilon),
                    "Training Steps: {:5,},".format(self.training_time_steps),
                    "Elapsed Time: {}".format(self._format_elapsed(total_train_start_time))
                )

            if n_episode % self.train_num_episodes_before_next_test == 0:
                validation_episode_reward_lst, validation_episode_reward_avg = self.validate()

                print("[Validation Episode Reward: {0}] Average: {1:.3f}".format(
                    validation_episode_reward_lst, validation_episode_reward_avg
                ))

                if validation_episode_reward_avg > self.episode_reward_avg_solved:
                    print("Solved in {0:,} steps ({1:,} training steps)!".format(
                        self.time_steps, self.training_time_steps
                    ))
                    self.model_save(validation_episode_reward_avg)
                    is_terminated = True

            if self.use_wandb:
                # The validation average is the most recent one; it is 0.0
                # until the first validation has run.
                self.wandb.log({
                    "[VALIDATION] Mean Episode Reward ({0} Episodes)".format(self.validation_num_episodes): validation_episode_reward_avg,
                    "[TRAIN] Episode Reward": episode_reward,
                    "[TRAIN] Loss": loss,
                    "[TRAIN] Epsilon": epsilon,
                    "[TRAIN] Replay buffer": self.replay_buffer.size(),
                    "Training Episode": n_episode,
                    "Training Steps": self.training_time_steps
                })

            if is_terminated:
                break

        print("Total Training End : {}".format(self._format_elapsed(total_train_start_time)))

        # The wandb run exists only when logging was enabled in __init__.
        if self.use_wandb:
            self.wandb.finish()

    def train(self):
        """Run one gradient step on a sampled mini-batch and return the loss.

        Returns:
            float: The scalar MSE loss of this update.
        """
        self.training_time_steps += 1

        batch = self.replay_buffer.sample(self.batch_size)

        # Shapes noted by the original author for batch_size 32 (not verifiable
        # from this cell): observations [32, 4], actions [32, 1],
        # next_observations [32, 4], rewards [32, 1], dones [32].
        observations, actions, next_observations, rewards, dones = batch

        # Q-value of the action that was actually taken in each transition.
        q_out = self.q(observations)
        q_values = q_out.gather(dim=-1, index=actions)

        with torch.no_grad():
            q_prime_out = self.target_q(next_observations)
            # Largest target-network Q-value over the actions of each next observation.
            max_q_prime = q_prime_out.max(dim=1, keepdim=True).values
            # No bootstrapped future value for transitions flagged as done.
            max_q_prime[dones] = 0.0

            targets = rewards + self.gamma * max_q_prime

        # targets were built under no_grad, so gradients flow through q_values only.
        loss = F.mse_loss(q_values, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # sync: copy the online weights into the target network.
        # NOTE(review): this check uses the environment step counter and only
        # runs on steps where train() is called, so with steps_between_train > 1
        # syncs happen only at common multiples of both intervals - confirm intent.
        if self.time_steps % self.target_sync_step_interval == 0:
            self.target_q.load_state_dict(self.q.state_dict())

        return loss.item()

    def model_save(self, validation_episode_reward_avg):
        """Save the online network weights and refresh the ``latest`` copy.

        Args:
            validation_episode_reward_avg: Average validation reward, embedded
                in the checkpoint filename.
        """
        filename = "dqn_{0}_{1:4.1f}_{2}.pth".format(
            self.env_name, validation_episode_reward_avg, self.current_time
        )
        # Make sure the target directory exists before writing into it.
        os.makedirs(MODEL_DIR, exist_ok=True)
        checkpoint_path = os.path.join(MODEL_DIR, filename)
        torch.save(self.q.state_dict(), checkpoint_path)

        copyfile(
            src=checkpoint_path,
            dst=os.path.join(MODEL_DIR, "dqn_{0}_latest.pth".format(self.env_name))
        )

    def validate(self):
        """Run ``validation_num_episodes`` episodes on ``test_env`` with epsilon 0.0.

        Returns:
            tuple: ``(episode_reward_lst, average)`` where ``episode_reward_lst``
            is a float numpy array with one total reward per episode and
            ``average`` is its mean.
        """
        episode_reward_lst = np.zeros(shape=(self.validation_num_episodes,), dtype=float)

        for i in range(self.validation_num_episodes):
            episode_reward = 0

            observation, _ = self.test_env.reset()

            done = False

            while not done:
                action = self.q.get_action(observation, epsilon=0.0)

                next_observation, reward, terminated, truncated, _ = self.test_env.step(action)

                episode_reward += reward
                observation = next_observation
                done = terminated or truncated

            episode_reward_lst[i] = episode_reward

        return episode_reward_lst, np.average(episode_reward_lst)

## DQN

### `__init__` function
This function initializes the DQN agent.

```python
def __init__(self, env, test_env, config, use_wandb):
    self.env = env
    self.test_env = test_env
    self.use_wandb = use_wandb

    self.env_name = config["env_name"]

    self.current_time = datetime.now().astimezone().strftime('%Y-%m-%d_%H-%M-%S')

    if self.use_wandb:
        self.wandb = wandb.init(
            project="DQN_{0}".format(self.env_name),
            name=self.current_time,
            config=config
        )

    self.max_num_episodes = config["max_num_episodes"]
    self.batch_size = config["batch_size"]
    self.learning_rate = config["learning_rate"]
    self.gamma = config["gamma"]
    self.steps_between_train = config["steps_between_train"]
    self.target_sync_step_interval = config["target_sync_step_interval"]
    self.replay_buffer_size = config["replay_buffer_size"]
    self.epsilon_start = config["epsilon_start"]
    self.epsilon_end = config["epsilon_end"]
    self.epsilon_final_scheduled_percent = config["epsilon_final_scheduled_percent"]
    self.print_episode_interval = config["print_episode_interval"]
    self.train_num_episodes_before_next_test = config["train_num_episodes_before_next_test"]
    self.validation_num_episodes = config["validation_num_episodes"]
    self.episode_reward_avg_solved = config["episode_reward_avg_solved"]

    self.epsilon_scheduled_last_episode = self.max_num_episodes * self.epsilon_final_scheduled_percent

    # network
    self.q = QNet(n_features=4, n_actions=2)
    self.target_q = QNet(n_features=4, n_actions=2)
    self.target_q.load_state_dict(self.q.state_dict())

    self.optimizer = optim.Adam(self.q.parameters(), lr=self.learning_rate)

    # agent
    self.replay_buffer = ReplayBuffer(self.replay_buffer_size)

    self.time_steps = 0
    self.total_time_steps = 0
    self.training_time_steps = 0
```

### environment initialization

- env : the training environment
- test_env : the validation environment
- use_wandb : a boolean flag (true or false)
    - if the value is true, metrics are logged to wandb
    - otherwise wandb is not used
- env_name : name of the environment (CartPole-v1 in this case)
- current_time : timestamp taken when the agent is created; it is used as the wandb run name and in the saved model filename, so each run's episode info can be identified

### hyperparameter initialization

- max_num_episodes : maximum number of training episodes
- batch_size : number of transitions sampled from the replay buffer for each gradient descent update
- learning_rate : step size of the optimizer, i.e. how strongly new information changes the network weights
- gamma : discount factor, i.e. to what extent future values are reflected in the target
- steps_between_train : number of environment steps between two training updates
- epsilon_start : exploration rate (epsilon) at the start of training
- epsilon_end : exploration rate (epsilon) at the end of the schedule
- epsilon_final_scheduled_percent : fraction of max_num_episodes after which epsilon reaches its final value
- print_episode_interval : episode interval for printing statistics
- train_num_episodes_before_next_test : number of training episodes between two validations
- validation_num_episodes : number of episodes performed for each validation
- episode_reward_avg_solved : average validation episode reward that must be exceeded to finish training

### QNet initialization

- self.q = QNet(n_features=4, n_actions=2) : current q-net
    - the observation has 4 features and the action space has 2 actions
- self.target_q = QNet(n_features=4, n_actions=2) : target q-net
- self.target_q.load_state_dict(self.q.state_dict())
    - set the initial weights of the target Q-net to be the same as the current Q-net
- self.optimizer = optim.Adam(self.q.parameters(), lr=self.learning_rate) : optimizer is adam optimizer

### agent initialization

- self.replay_buffer = ReplayBuffer(self.replay_buffer_size) : set replay_buffer

## epsilon_scheduled function
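This function computes the epsilon used by the decaying epsilon-greedy method: epsilon decreases linearly from epsilon_start to epsilon_end until episode epsilon_scheduled_last_episode and stays at epsilon_end afterwards.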

```python
def epsilon_scheduled(self, current_episode):
    fraction = min(current_episode / self.epsilon_scheduled_last_episode, 1.0)

    epsilon = min(
        self.epsilon_start + fraction * (self.epsilon_end - self.epsilon_start),
        self.epsilon_start
    )
    return epsilon
```

## train_loop function

``` python
def train_loop(self):
    loss = 0.0
    validation_episode_reward_avg = 0.0
    is_terminated = False

    for n_episode in range(1, self.max_num_episodes + 1):
        epsilon = self.epsilon_scheduled(n_episode)
        episode_reward = 0
        observation, _ = self.env.reset()
        done = False

        while not done:
            self.time_steps += 1
            self.total_time_steps += 1

            action = self.q.get_action(observation, epsilon)
            next_observation, reward, terminated, truncated, _ = self.env.step(action)
            transition = Transition(observation, action, next_observation, reward, terminated)
            self.replay_buffer.append(transition)

            episode_reward += reward
            observation = next_observation
            done = terminated or truncated

            if self.total_time_steps % self.steps_between_train == 0 and self.time_steps > self.batch_size:
                loss = self.train()

        if n_episode % self.train_num_episodes_before_next_test == 0:
            validation_episode_reward_lst, validation_episode_reward_avg = self.validate()

            if validation_episode_reward_avg > self.episode_reward_avg_solved:
                self.model_save(validation_episode_reward_avg)
                is_terminated = True

        if is_terminated:
            break
```

### initialization train loop

- loss = 0.0
- validation_episode_reward_avg = 0.0
    - if validation_episode_reward_avg becomes greater than episode_reward_avg_solved, the train loop is done
- is_terminated = False
    - if this value is true, the train loop is done

### doing episode
#### initialization episode
- epsilon = self.epsilon_scheduled(n_episode) : this episode epsilon
- episode_reward = 0 : this episode reward
- observation, _ = self.env.reset() : start observation
- done = False : done is false
#### doing step
- self.time_steps += 1 : counting step
- action = self.q.get_action(observation, epsilon) : get action
- next_observation, reward, terminated, truncated, _ = self.env.step(action) : get next_observation
- transition = Transition(observation, action, next_observation, reward, terminated) : make transition
- self.replay_buffer.append(transition) : insert transition in replay_buffer
- episode_reward += reward : cumulative sum reward in this episode
- observation = next_observation : set current observation to next observation
- done = terminated or truncated : done is (terminated | truncated)
- if self.total_time_steps % self.steps_between_train == 0 and self.time_steps > self.batch_size -> loss = self.train()
    - Training is performed every steps_between_train steps, but it is skipped while the number of collected steps is not larger than batch_size
#### doing validation 
- validation_episode_reward_lst, validation_episode_reward_avg = self.validate()
    - validation is performed every train_num_episodes_before_next_test episodes
    - if the resulting validation_episode_reward_avg is greater than episode_reward_avg_solved
        - save the model and terminate training

## train function
This function updates the Q-network with one gradient step on a mini-batch sampled from the replay buffer.

``` python
def train(self):
    self.training_time_steps += 1

    batch = self.replay_buffer.sample(self.batch_size)
    observations, actions, next_observations, rewards, dones = batch

    q_out = self.q(observations)
    q_values = q_out.gather(dim=-1, index=actions)

    with torch.no_grad():
        q_prime_out = self.target_q(next_observations)
        max_q_prime = q_prime_out.max(dim=1, keepdim=True).values
        max_q_prime[dones] = 0.0

        targets = rewards + self.gamma * max_q_prime

    loss = F.mse_loss(targets.detach(), q_values)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.time_steps % self.target_sync_step_interval == 0:
        self.target_q.load_state_dict(self.q.state_dict())

    return loss.item()
```

### get current policy batch
- self.training_time_steps += 1 : count the training step
- batch = self.replay_buffer.sample(self.batch_size) : sample a batch from the replay_buffer
- observations, actions, next_observations, rewards, dones = batch : unpack the batch into its components (each holds batch_size entries)
- q_out = self.q(observations) : get the q-values of every action for each observation in the batch
- q_values = q_out.gather(dim=-1, index=actions) : select the q-value of the action that was actually taken
### get target policy batch
- q_prime_out = self.target_q(next_observations) : get the target-network q-values of every action for each next observation in the batch
- max_q_prime = q_prime_out.max(dim=1, keepdim=True).values : get the largest q-value over the actions (the value of the best next action)
- targets = rewards + self.gamma * max_q_prime : compute the target value
### update current policy
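- loss = F.mse_loss(targets.detach(), q_values) : compute the loss
- self.optimizer.zero_grad() : reset the gradients of the q-net parameters
- loss.backward() : backpropagation (compute the gradients of the loss)
- self.optimizer.step() : update the q-net with one gradient descent step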
### sync
- if self.time_steps % self.target_sync_step_interval == 0 -> self.target_q.load_state_dict(self.q.state_dict())
    - target_q is synced with the current q-net every target_sync_step_interval steps

## model_save function
This function saves the model weights to a timestamped file and copies that file to the `latest` checkpoint.

``` python
def model_save(self, validation_episode_reward_avg):
    filename = "dqn_{0}_{1:4.1f}_{2}.pth".format(
        self.env_name, validation_episode_reward_avg, self.current_time
    )
    torch.save(self.q.state_dict(), os.path.join(MODEL_DIR, filename))

    copyfile(
        src=os.path.join(MODEL_DIR, filename),
        dst=os.path.join(MODEL_DIR, "dqn_{0}_latest.pth".format(self.env_name))
    )
```

## validate function
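This function validates the model: it runs validation_num_episodes episodes on the validation environment with epsilon=0.0 and returns the episode rewards together with their average.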

```python
def validate(self):
    episode_reward_lst = np.zeros(shape=(self.validation_num_episodes,), dtype=float)

    for i in range(self.validation_num_episodes):
        episode_reward = 0

        observation, _ = self.test_env.reset()

        done = False

        while not done:
            action = self.q.get_action(observation, epsilon=0.0)

            next_observation, reward, terminated, truncated, _ = self.test_env.step(action)

            episode_reward += reward
            observation = next_observation
            done = terminated or truncated

        episode_reward_lst[i] = episode_reward

    return episode_reward_lst, np.average(episode_reward_lst)
```

In [4]:
def main():
    """Create the CartPole-v1 environments, build the DQN agent and train it."""
    env_id = "CartPole-v1"

    # Two separate instances: one for training rollouts, one for validation.
    train_env = gym.make(env_id)
    eval_env = gym.make(env_id)

    hyperparameters = dict(
        env_name=env_id,                            # name of the environment
        max_num_episodes=1_500,                     # maximum number of training episodes
        batch_size=32,                              # size of the random mini-batch sampled per training step
        learning_rate=0.0001,                       # learning rate
        gamma=0.99,                                 # discount factor
        steps_between_train=1,                      # number of environment steps between training updates
        target_sync_step_interval=500,              # step interval for syncing the Q model into the target Q model
        replay_buffer_size=30_000,                  # replay buffer size
        epsilon_start=0.95,                         # initial epsilon
        epsilon_end=0.01,                           # final epsilon
        epsilon_final_scheduled_percent=0.75,       # fraction of episodes after which epsilon reaches its final value
        print_episode_interval=10,                  # episode interval for printing statistics
        train_num_episodes_before_next_test=50,     # number of training episodes between validations
        validation_num_episodes=3,                  # number of episodes run per validation
        episode_reward_avg_solved=490,              # average validation episode reward that ends training
    )

    agent = DQN(
        env=train_env,
        test_env=eval_env,
        config=hyperparameters,
        use_wandb=True,
    )
    agent.train_loop()

In [5]:
# Entry point: start training only when this cell/module is executed directly.
if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Currently logged in as: [33mgihwan319[0m ([33mgihwanjang[0m). Use [1m`wandb login --relogin`[0m to force relogin


[Episode  10, Time Steps    156] Episode Reward:  13.0, Replay buffer:    156, Loss:  0.158, Epsilon: 0.94, Training Steps:   124, Elapsed Time: 00:00:00
[Episode  20, Time Steps    380] Episode Reward:  32.0, Replay buffer:    380, Loss:  0.011, Epsilon: 0.93, Training Steps:   348, Elapsed Time: 00:00:00
[Episode  30, Time Steps    554] Episode Reward:   9.0, Replay buffer:    554, Loss:  0.016, Epsilon: 0.92, Training Steps:   522, Elapsed Time: 00:00:00
[Episode  40, Time Steps    754] Episode Reward:  11.0, Replay buffer:    754, Loss:  0.008, Epsilon: 0.92, Training Steps:   722, Elapsed Time: 00:00:00
[Episode  50, Time Steps    971] Episode Reward:  22.0, Replay buffer:    971, Loss:  0.011, Epsilon: 0.91, Training Steps:   939, Elapsed Time: 00:00:00
[Validation Episode Reward: [11. 10. 11.]] Average: 10.667
[Episode  60, Time Steps  1,158] Episode Reward:  16.0, Replay buffer:  1,158, Loss:  0.329, Epsilon: 0.90, Training Steps: 1,126, Elapsed Time: 00:00:00
[Episode  70, Tim

0,1
Training Episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Training Steps,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▆▆▇▇▇██
[TRAIN] Episode Reward,▁▁▁▁▁▁▂▁▂▂▂▁▁▁▁▂▁▃▃▂▄▁▁▃▆▂▅██▆▁▅▇▅▆▆▆▆▇▆
[TRAIN] Epsilon,███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
[TRAIN] Loss,▁▁▁▁▁▁▁▂▁▁▂▁▂▁▁▁▂▁██▂▃▃▂▂▂▁▁▁▃▂▁▁▁▁▁▁▁▁▁
[TRAIN] Replay buffer,▁▁▁▁▂▂▂▂▂▃▃▃▄▄▅▅▆▆▇█████████████████████
[VALIDATION] Mean Episode Reward (3 Episodes),▁▁▁▁▃▃▂██▆▆▆▆▄▄▄▄▆▆▅▅▅▆▆▇▇▆▆▆▅▅▅▅▅▅▅▅▅▅▅

0,1
Training Episode,1100.0
Training Steps,170870.0
[TRAIN] Episode Reward,500.0
[TRAIN] Epsilon,0.03089
[TRAIN] Loss,0.04112
[TRAIN] Replay buffer,30000.0
[VALIDATION] Mean Episode Reward (3 Episodes),500.0
