In [None]:
import sys
import torch
import gymnasium as gym
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from pathlib import Path
from torch.distributions import Categorical
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from itertools import count
import pandas as pd
import math
import random
import cv2

In [None]:
# Shared CartPole environment; 'rgb_array' makes env.render() return image frames.
env = gym.make("CartPole-v1", render_mode="rgb_array")

In [None]:
# True when matplotlib is using an inline (notebook) backend; only then are
# IPython's display helpers imported.
is_ipython = "inline" in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Switch matplotlib to interactive mode.
plt.ion()

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# One replay-buffer record: (state, action, reward, next_state, done).
Transition = namedtuple(
    "Transition",
    ["state", "action", "reward", "next_state", "done"],
)

In [None]:
class ReplayMemory(object):
    """Bounded FIFO store of ``Transition`` records with uniform random sampling."""

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most ``capacity`` transitions."""
        self.capacity = capacity
        # A deque with maxlen silently drops the oldest entry once full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Wrap the positional arguments in a ``Transition`` and append it."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and collate them into tensors.

        Returns:
            A 5-tuple ``(states, actions, rewards, next_states, dones)`` of
            tensors moved to ``device``. States and actions are stacked with
            ``torch.vstack`` (so they must already be tensors); rewards,
            next states and dones are converted through NumPy to float
            tensors, with rewards and dones given a trailing dimension of 1.
        """
        picked = random.sample(self.memory, batch_size)

        state_list = [tr.state for tr in picked]
        action_list = [tr.action for tr in picked]
        reward_list = [tr.reward for tr in picked]
        next_state_list = [tr.next_state for tr in picked]
        done_list = [tr.done for tr in picked]

        state_batch = torch.vstack(state_list).to(device)
        action_batch = torch.vstack(action_list).to(device)
        reward_batch = torch.tensor(np.array(reward_list), dtype=torch.float).to(device).unsqueeze(1)
        next_state_batch = torch.tensor(np.array(next_state_list), dtype=torch.float).to(device)
        done_batch = torch.tensor(np.array(done_list), dtype=torch.float).to(device).unsqueeze(1)

        return state_batch, action_batch, reward_batch, next_state_batch, done_batch

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
class DDQN(nn.Module):
    """Fully connected network: observation vector in, one output per action out.

    Two hidden layers of 128 units with ReLU activations, followed by a
    linear output layer with no activation.
    """

    def __init__(self, n_observations, n_actions):
        super().__init__()
        hidden_units = 128
        self.layer1 = nn.Linear(n_observations, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, n_actions)

    def forward(self, x):
        """Return the raw (un-activated) outputs of the last layer for input ``x``."""
        hidden = F.relu(self.layer1(x))
        hidden = F.relu(self.layer2(hidden))
        return self.layer3(hidden)

In [None]:
class Agent:
    """DQN-style agent: epsilon-greedy acting, replay-buffer learning, soft target updates.

    Args:
        env: Environment exposing ``action_space.n`` and ``observation_space.shape``.
        BATCH_SIZE: Number of transitions sampled for each learning step.
        GAMMA: Discount factor applied to the bootstrapped next-state value.
        EPSILON: Initial exploration rate of the epsilon-greedy policy.
        MIN_EPSILON: Floor that the exploration rate decays towards.
        EPSILON_DECAY: Time constant of the exponential decay, measured in
            calls to ``step``.
        TAU: Interpolation factor of the soft target-network update.
        LR: Learning rate handed to AdamW.
        update_every: Learn once every this many calls to ``step``.
    """

    def __init__(self, env, BATCH_SIZE = 512,
                GAMMA = 0.99,
                EPSILON = 1,
                MIN_EPSILON = 0.05,
                EPSILON_DECAY = 10000,
                TAU = 0.005,
                LR = 1e-4,
                update_every = 1):
        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.EPSILON = EPSILON
        # self.EPSILON is overwritten on every schedule update, so the starting
        # value is kept separately for the decay formula.
        self.EPSILON_START = EPSILON
        self.MIN_EPSILON = MIN_EPSILON
        self.EPSILON_DECAY = EPSILON_DECAY
        self.TAU = TAU
        self.LR = LR
        self.update_every = update_every

        self.Transition = namedtuple('Transition',
                                     ('state', 'action', 'reward', 'next_state', 'done'))
        self.memory = ReplayMemory(10000)
        self.n_actions = env.action_space.n
        self.n_observations = env.observation_space.shape[0]

        self.input_shape = self.n_observations
        self.q_eval = DDQN(self.input_shape, self.n_actions).to(device)
        self.q_target = DDQN(self.input_shape, self.n_actions).to(device)
        # Start the target network as an exact copy of the online network;
        # otherwise early targets come from unrelated random weights.
        self.q_target.load_state_dict(self.q_eval.state_dict())

        self.optimizer = optim.AdamW(self.q_eval.parameters(),
                                     lr=self.LR, amsgrad=True)

        self.steps_done = 0        # environment steps recorded through step()
        self.num_train_steps = 0   # gradient updates performed by learn()
        self.episode_durations = []
        self.rewards = []
        self.mean_rewards = []

    def get_exploration_rate(self, update = True):
        """Return the current epsilon, optionally recomputing it first.

        With ``update=True`` epsilon is set to
        ``MIN_EPSILON + (EPSILON_START - MIN_EPSILON) * exp(-steps_done / EPSILON_DECAY)``.
        With ``update=False`` the last stored value is returned unchanged.
        """
        if update:
            decay = math.exp(-1.0 * self.steps_done / self.EPSILON_DECAY)
            self.EPSILON = self.MIN_EPSILON + (self.EPSILON_START - self.MIN_EPSILON) * decay
        return self.EPSILON

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn when the schedule and buffer allow.

        This is the only place ``steps_done`` advances, so each environment
        step is counted exactly once for both the epsilon schedule and
        ``update_every``.
        """
        self.memory.push(state, action, reward, next_state, done)

        self.steps_done += 1
        if self.steps_done % self.update_every != 0:
            return
        # Wait until the buffer holds more than one batch before sampling.
        if len(self.memory) > self.BATCH_SIZE:
            experiences = self.memory.sample(self.BATCH_SIZE)
            self.learn(experiences)

    def learn(self, experiences):
        """Run one gradient step on ``q_eval`` and soft-update ``q_target``.

        Args:
            experiences: ``(states, actions, rewards, next_states, dones)`` as
                produced by ``ReplayMemory.sample``.
        """
        states, actions, rewards, next_states, dones = experiences

        # NOTE(review): this is the plain DQN target (max over the target
        # network). If Double DQN is intended, as the network's name may
        # suggest, the next action should be chosen by q_eval - confirm.
        with torch.no_grad():
            q_target_next = self.q_target(next_states).max(1)[0].unsqueeze(1)
        # (1 - dones) removes the bootstrap term for transitions flagged done.
        q_targets = rewards + (self.GAMMA * q_target_next * (1 - dones))

        q_expected = self.q_eval(states).gather(1, actions.view(-1, 1))

        loss = F.mse_loss(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.num_train_steps += 1

        # Soft update: target <- TAU * eval + (1 - TAU) * target.
        with torch.no_grad():
            for target_param, eval_param in zip(self.q_target.parameters(), self.q_eval.parameters()):
                target_param.copy_(self.TAU * eval_param + (1.0 - self.TAU) * target_param)

    def select_action(self, state, explore=True):
        """Pick an action for ``state``.

        Args:
            state: Input accepted by ``q_eval``; the greedy branch takes the
                arg-max over dimension 1, so a leading batch dimension is expected.
            explore: When False the greedy action is always returned and the
                epsilon schedule is left untouched (useful for evaluation).

        Returns:
            A ``(1, 1)`` tensor holding the chosen action index.
        """
        if explore:
            eps_thresh = self.get_exploration_rate(update = True)
            if random.random() <= eps_thresh:
                return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

        with torch.no_grad():
            return self.q_eval(state).max(1)[1].view(1, 1)

    def plot_durations(self):
        """Plot episode lengths (with a 100-episode running mean) and rewards."""
        plt.figure(1)
        # Clear the figure so repeated calls do not stack curves on backends
        # where the figure survives plt.show().
        plt.clf()
        durations_t = torch.tensor(self.episode_durations, dtype=torch.float)
        plt.title('Episode_Lengths')
        plt.xlabel('Episode')
        plt.ylabel('Duration')
        plt.plot(durations_t.numpy())

        if len(durations_t) >= 100:
            # Sliding-window mean, left-padded with zeros to align with the episode index.
            means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
            means = torch.cat((torch.zeros(99), means))
            plt.plot(means.numpy())

        plt.show()

        plt.figure(2)
        plt.clf()
        rewards_t = torch.tensor(self.rewards, dtype=torch.float)
        mean_rewards_t = torch.tensor(self.mean_rewards, dtype=torch.float)
        plt.title('Rewards')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.plot(rewards_t.numpy())
        plt.plot(mean_rewards_t.numpy())
        plt.show()


In [None]:
def show_game(agent, num_games = 3):
    """Play ``num_games`` greedy episodes on the module-level ``env`` and render them.

    Each game runs for at most 500 steps. After a game ends, its total
    reward and length (number of steps taken) are printed.

    Args:
        agent: Object providing ``select_action(state, explore=...)`` that
            returns a tensor whose ``.item()`` is the action to take.
        num_games: Number of episodes to play.
    """
    for i in range(num_games):
        total_reward = 0
        # Gymnasium's reset() returns (observation, info); only the observation is needed.
        state, _ = env.reset()
        img = plt.imshow(env.render())
        plt.axis('off')
        for j in range(500):
            state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
            # Act greedily: a demonstration should show the learned policy,
            # not random exploratory moves.
            action = agent.select_action(state, explore = False)
            img.set_data(env.render())
            if is_ipython:
                # `display` is only imported when an inline backend is active.
                display.display(plt.gcf())
                display.clear_output(wait = True)
            else:
                plt.pause(0.001)
            state, reward, terminated, truncated, _ = env.step(action.item())
            total_reward += reward
            if terminated or truncated:
                # j is zero-based, so the number of steps played is j + 1.
                print(f"Game: {i} | Total_reward: {total_reward} | Game_Lenght: {j + 1}\n")
                break


In [None]:
# Build the agent with explicit hyperparameters; BATCH_SIZE and EPSILON_DECAY
# differ from the Agent constructor defaults.
agent = Agent(
    env,
    BATCH_SIZE=128,
    GAMMA=0.99,
    EPSILON=1,
    MIN_EPSILON=0.05,
    EPSILON_DECAY=1000,
    TAU=0.005,
    LR=1e-4,
    update_every=1,
)

In [None]:
# Train for more episodes when a GPU is available.
if torch.cuda.is_available():
    num_episodes = 600
else:
    num_episodes = 50

for i_episode in range(num_episodes):
    print(f"Episode: {i_episode}")
    state, info = env.reset()
    score = 0
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    for t in count():
        action = agent.select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        # Store only true termination as the done flag: a time-limit truncation
        # is not a terminal state, so its next-state value must still be
        # bootstrapped when learning.
        agent.step(state, action, reward, observation, terminated)
        score += reward
        done = terminated or truncated

        if done:
            agent.episode_durations.append(t + 1)
            agent.rewards.append(score)
            agent.mean_rewards.append(np.mean(agent.rewards[-100:]))
            print(f"\tScore: {score} | Mean: {np.mean(agent.rewards[-100:])} | Epsilon: {agent.get_exploration_rate(update=False)} | Duration: {t+1}")
            if i_episode % 100 == 0:
                # Best-effort visualisation: report the failure instead of
                # hiding it, and let KeyboardInterrupt propagate.
                try:
                    agent.plot_durations()
                    show_game(agent, num_games=3)
                except Exception as exc:
                    print(f'Game_Show_Error!!!... {exc!r}')
                print("Continuing")
            break

        state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

print('Complete')
agent.plot_durations()
plt.ioff()
plt.show()


In [None]:
# Re-draw the episode-length and reward curves recorded on the agent.
agent.plot_durations()