<a href="https://colab.research.google.com/github/diegomrodrigues/deep_rl/blob/main/DQN_CartPole_Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
!pip install --upgrade tensorboardX

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [59]:
# Authenticate with the Hugging Face Hub via the interactive login widget.
# Presumably required so the HFSummaryWriter used during training can push
# logs to the Hub repo -- confirm against the repo's access settings.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [63]:
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gymnasium as gym
from gymnasium.vector import SyncVectorEnv
from dataclasses import dataclass
from datetime import datetime
from huggingface_hub import HFSummaryWriter

# Length of one observation vector; passed to create_model as the input width.
NUM_FEATURES = 4
# Number of environment copies bundled into the vector environment.
NUM_ENVS = 100

@dataclass
class TrainConfig:
    """Hyperparameters read by ``train``.

    Attributes:
        epochs: Number of collect-then-update iterations to run.
        gamma: Discount factor applied to the bootstrapped next-state value.
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    epochs: int
    gamma: float
    learning_rate: float

@dataclass
class Experiences:
    """Transitions gathered by one call to ``collect_experiences``.

    Every list holds one entry per environment step; each entry covers all
    sub-environments of the vector environment for that step.

    Attributes:
        states: Observations the policy acted on.
        actions: Actions chosen for those observations.
        rewards: Rewards returned by the step.
        next_states: Observations returned by the step.
        dones: Per-step flags, true where the step terminated or truncated.
        total_reward: Sum of every reward seen during the rollout.
    """

    states: list
    actions: list
    rewards: list
    next_states: list
    dones: list
    total_reward: float

def create_env():
    """Return a zero-argument factory that builds a fresh CartPole-v1 environment.

    A factory is returned instead of an environment instance so that it can be
    handed to a vector-environment constructor, which calls it itself.
    """
    def _make_env():
        return gym.make('CartPole-v1')

    return _make_env

def create_envs(num_envs):
    """Bundle ``num_envs`` environment factories into one ``SyncVectorEnv``."""
    env_factories = []
    for _ in range(num_envs):
        env_factories.append(create_env())
    return SyncVectorEnv(env_factories)

def create_model(num_features, num_actions):
    """Build the Q-network: one 512-unit ReLU hidden layer and a linear output.

    Args:
        num_features: Width of the input observation vector.
        num_actions: Number of output units (one Q-value per action).

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    input_layer = layers.Input(shape=(num_features,))
    hidden_layer = layers.Dense(512, activation='relu')
    output_layer = layers.Dense(num_actions, activation='linear')
    return keras.Sequential([input_layer, hidden_layer, output_layer])

def agent(model, state):
    """Greedy policy: return, per row of ``state``, the index of the largest model output."""
    return tf.argmax(model(state), axis=1)

def collect_experiences(model, env, num_envs):
    """Step the greedy policy through a vector environment and record every transition.

    Stepping continues until each of the ``num_envs`` sub-environments has
    reported termination or truncation at least once. Every step records the
    whole batch, including sub-environments that already finished earlier in
    the rollout (what those later steps contain depends on how the vector
    environment handles finished episodes).

    Args:
        model: Callable mapping a batch of observations to per-action values.
        env: Vector environment providing ``reset()`` and ``step(actions)``.
        num_envs: Number of sub-environments in ``env``.

    Returns:
        An ``Experiences`` holding one entry per step in each list, plus the
        sum of all rewards observed.
    """
    state_log, action_log, reward_log, next_state_log, done_log = [], [], [], [], []
    reward_sum = 0

    obs, _ = env.reset()
    finished = np.array([False] * num_envs)

    while not finished.all():
        current = tf.convert_to_tensor(obs, dtype=tf.float32)
        chosen = agent(model, current)
        obs, step_reward, terminated, truncated, _ = env.step(chosen.numpy())
        # A sub-environment counts as ended on either signal.
        ended_now = terminated | truncated

        state_log.append(current)
        action_log.append(chosen)
        reward_log.append(tf.cast(step_reward, tf.float32))
        next_state_log.append(tf.convert_to_tensor(obs, dtype=tf.float32))
        done_log.append(ended_now)

        reward_sum += np.sum(step_reward)
        # Sticky flag: once a sub-environment has ended it stays marked.
        finished = finished | ended_now

    return Experiences(state_log, action_log, reward_log, next_state_log, done_log, reward_sum)

def prepare_experiences(experiences):
    """Concatenate the per-step lists of an ``Experiences`` into flat tensors.

    Returns:
        ``(states, actions, rewards, next_states, dones)``, each concatenated
        along axis 0; ``dones`` is additionally cast to float32.
    """
    fields = (
        experiences.states,
        experiences.actions,
        experiences.rewards,
        experiences.next_states,
        experiences.dones,
    )
    states, actions, rewards, next_states, dones = (
        tf.concat(field, axis=0) for field in fields
    )
    # Float done flags can be used directly as a multiplicative mask.
    return states, actions, rewards, next_states, tf.cast(dones, dtype=tf.float32)

def compute_targets(model, next_states, rewards, dones, gamma):
    """Compute one-step targets ``rewards + gamma * (1 - dones) * max_a model(next_states)``.

    The ``(1 - dones)`` factor zeroes the bootstrapped term wherever ``dones`` is 1.
    """
    best_next_q = tf.reduce_max(model(next_states), axis=1)
    bootstrap = gamma * (1 - dones) * best_next_q
    return rewards + bootstrap

def mean_squared_error_loss(q_action, target_q_values):
    """Return the mean of the squared differences between targets and predictions."""
    td_error = target_q_values - q_action
    return tf.reduce_mean(tf.square(td_error))

def compute_loss(model, states, actions, target_q_values, num_actions):
    """Mean squared error between the model's value for each taken action and its target.

    Args:
        model: Callable mapping ``states`` to per-action values.
        states: Batch of observations.
        actions: Index of the action taken for each row of ``states``.
        target_q_values: Regression target for each row.
        num_actions: Number of actions (depth of the one-hot mask).
    """
    predicted = model(states)
    # One-hot mask keeps only the column of the action that was taken.
    selector = tf.one_hot(actions, depth=num_actions)
    chosen_q = tf.reduce_sum(predicted * selector, axis=-1)
    return mean_squared_error_loss(chosen_q, target_q_values)

def train_step(model, optimizer, experiences, gamma, num_actions, writer, step):
    """Run one gradient update on a collected rollout and log its metrics.

    Args:
        model: Q-network being trained.
        optimizer: Optimizer used to apply the gradients.
        experiences: Rollout produced by ``collect_experiences``.
        gamma: Discount factor for the targets.
        num_actions: Number of discrete actions.
        writer: Summary writer exposing ``add_scalar(tag, value, step)``.
        step: Step index attached to the logged scalars.

    Returns:
        The loss tensor for this update.
    """
    batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones = (
        prepare_experiences(experiences)
    )
    # Targets are evaluated before the tape opens, so no gradient flows through them.
    td_targets = compute_targets(model, batch_next_states, batch_rewards, batch_dones, gamma)

    with tf.GradientTape() as tape:
        loss = compute_loss(model, batch_states, batch_actions, td_targets, num_actions)

    weights = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, weights), weights))

    # Log metrics to TensorBoard
    metrics = (('loss', loss.numpy()), ('total_reward', experiences.total_reward))
    for tag, value in metrics:
        writer.add_scalar(tag, value, step)

    return loss

def train(model, env, config):
    """Train ``model`` on ``env`` for ``config.epochs`` collect-then-update iterations.

    Each epoch collects one rollout with the current model, applies a single
    gradient update on it, logs the metrics and prints a progress line.

    Args:
        model: Q-network to train (updated in place).
        env: Vector environment exposing ``single_action_space`` and ``num_envs``.
        config: ``TrainConfig`` providing ``epochs``, ``gamma`` and ``learning_rate``.
    """
    num_actions = env.single_action_space.n
    optimizer = keras.optimizers.Adam(learning_rate=config.learning_rate)

    repo_id = "diegomrodrigues/CartPole-DQN"
    writer = HFSummaryWriter(repo_id=repo_id, commit_every=1)

    # try/finally guarantees the writer is closed even if an epoch raises
    # (e.g. the run is interrupted from the notebook).
    try:
        for epoch in range(config.epochs):
            # Use the size of the environment actually passed in rather than a
            # module-level constant, so the two can never disagree.
            experiences = collect_experiences(model, env, env.num_envs)
            loss = train_step(model, optimizer, experiences, config.gamma, num_actions, writer, epoch)
            print(f"Epoch: {epoch+1}, Loss: {loss.numpy()}, Total Reward: {experiences.total_reward}")
    finally:
        # Close the writer
        writer.close()

# main code

# Seed every RNG involved so repeated runs start from the same weights and
# environment states (up to any nondeterminism in the compute backend).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

env = create_envs(NUM_ENVS)
# Seeding one reset seeds the sub-environments' RNGs; the unseeded resets
# performed during training then continue from those generators.
env.reset(seed=SEED)
model = create_model(NUM_FEATURES, env.single_action_space.n)

train_config = TrainConfig(
    epochs=1000,
    gamma=0.99,
    learning_rate=0.001
)

train(model, env, train_config)



Epoch: 1, Loss: 1.0010871887207031, Total Reward: 1900.0
Epoch: 2, Loss: 1.0016552209854126, Total Reward: 1000.0
Epoch: 3, Loss: 1.0060248374938965, Total Reward: 1000.0
Epoch: 4, Loss: 1.020082950592041, Total Reward: 1100.0
Epoch: 5, Loss: 1.0325711965560913, Total Reward: 1100.0
Epoch: 6, Loss: 1.0374184846878052, Total Reward: 1000.0
Epoch: 7, Loss: 1.0629668235778809, Total Reward: 1100.0
Epoch: 8, Loss: 1.0822651386260986, Total Reward: 1100.0
Epoch: 9, Loss: 1.0930697917938232, Total Reward: 1000.0
Epoch: 10, Loss: 1.115274429321289, Total Reward: 1000.0
Epoch: 11, Loss: 1.1528236865997314, Total Reward: 1100.0
Epoch: 12, Loss: 1.1797800064086914, Total Reward: 1100.0
Epoch: 13, Loss: 1.2096812725067139, Total Reward: 1100.0
Epoch: 14, Loss: 1.2425553798675537, Total Reward: 1100.0
Epoch: 15, Loss: 1.276976466178894, Total Reward: 1100.0
Epoch: 16, Loss: 1.3135631084442139, Total Reward: 1100.0
Epoch: 17, Loss: 1.3542265892028809, Total Reward: 1100.0
Epoch: 18, Loss: 1.3950917