# 🌈 Rainbow DQN — Rezumat

**Rainbow DQN** combină cele mai eficiente 6 îmbunătățiri ale lui DQN într-un singur algoritm stabil și performant.

---

## 🧩 Componentele Rainbow

### **1. Double DQN**
Reduce supraestimarea Q-values separând selecția acțiunii de evaluare:

$$
y = r + \gamma \, Q_{\text{target}}\big(s', \arg\max_a Q_{\text{online}}(s',a)\big)
$$

---

### **2. Dueling Network Architecture**
Descompune estimarea Q în:

$$
Q(s,a) = V(s) + A(s,a) - \frac{1}{|\mathcal{A}|}\sum_{a'} A(s,a')
$$

---

### **3. Prioritized Experience Replay (PER)**
Selectează mai des tranzițiile cu TD-error mare:

$$
P(i) = \frac{p_i^\alpha}{\sum_j p_j^\alpha}
$$

---

### **4. Multi-Step Returns**

$$
R^{(n)} = 
\sum_{k=0}^{n-1} \gamma^k r_k
\;+\;
\gamma^n V(s_n)
$$

---

### **5. Noisy Networks**
Explorare învățabilă prin zgomot parametric:

$$
y = (\mu_W + \sigma_W \odot \epsilon_W)\, x
$$

---

### **6. C51 Distributional RL**
Modelează distribuția întreagă a valorii viitoare:

- 51 atomi \((z_i)\)
- Softmax → probabilități
- Proiecție distribuită pe suport

---

## ⭐ Pe scurt

**Rainbow = Double DQN + Dueling + PER + Multi-Step + Noisy Nets + C51**

→ una dintre cele mai robuste și avansate metode value-based.


In [None]:
import gymnasium as gym
import tensorflow as tf
import numpy as np
import random
import collections
import matplotlib.pyplot as plt
from IPython.display import clear_output
from tensorflow.keras import Model, layers, optimizers

# ============================================================
# Prioritized Multi-Step Replay Buffer (PER + N-step)
# ============================================================

class PrioritizedNStepBuffer:
    """Ring-buffer replay memory with n-step returns and proportional priorities.

    Incoming one-step transitions are collected in a sliding window of
    ``n_step`` entries. Once the window is full, every call to ``store``
    writes one aggregated transition: the state/action of the oldest entry,
    the discounted reward sum, and the next-state/done flag of the entry at
    which accumulation stopped.

    Args:
        capacity: Maximum number of aggregated transitions kept.
        state_dim: Length of a flat state vector.
        n_step: Size of the sliding window (number of rewards summed).
        gamma: Per-step discount applied inside the window.
        alpha: Exponent applied to priorities when building sampling
            probabilities.
        beta: Exponent of the importance-sampling weights.
    """

    def __init__(self, capacity, state_dim, n_step=3, gamma=0.99, alpha=0.6, beta=0.4):
        self.capacity = capacity
        self.ptr = 0          # next slot to overwrite
        self.full = False     # True once the ring has wrapped at least once

        self.states = np.zeros((capacity, state_dim), dtype=np.float32)
        self.next_states = np.zeros((capacity, state_dim), dtype=np.float32)
        self.actions = np.zeros(capacity, dtype=np.int32)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.dones = np.zeros(capacity, dtype=np.float32)
        self.priorities = np.ones(capacity, dtype=np.float32)

        self.n_step = n_step
        self.gamma = gamma
        # Sliding window of the most recent one-step transitions.
        self.buffer = collections.deque(maxlen=n_step)

        self.alpha = alpha
        self.beta = beta

    def __len__(self):
        """Return the number of aggregated transitions currently stored."""
        return self.capacity if self.full else self.ptr

    def store(self, s, s2, a, r, done):
        """Push a one-step transition and, if the window is full, store one n-step transition.

        Args:
            s: State before the action.
            s2: State after the action.
            a: Action index.
            r: Immediate reward.
            done: Truthy when the episode ended with this transition.
        """
        self.buffer.append((s, s2, a, r, done))
        if len(self.buffer) < self.n_step:
            return

        n_step_return = 0.0
        discount = 1.0
        last_next_state = None
        last_done = False
        for _, s2_i, _, r_i, d_i in self.buffer:
            n_step_return += r_i * discount
            discount *= self.gamma
            # Remember where accumulation stops: the stored next-state and
            # done flag must belong to this step, not to the newest window
            # entry, which may already be part of the following episode.
            last_next_state, last_done = s2_i, d_i
            if d_i:
                break

        first_state, _, first_action, _, _ = self.buffer[0]

        idx = self.ptr
        self.states[idx] = first_state
        self.next_states[idx] = last_next_state
        self.actions[idx] = first_action
        self.rewards[idx] = n_step_return
        self.dones[idx] = float(last_done)
        # New transitions get the current maximum priority so they are
        # likely to be sampled at least once.
        self.priorities[idx] = self.priorities.max()

        self.ptr = (self.ptr + 1) % self.capacity
        if self.ptr == 0:
            self.full = True

    def sample(self, batch_size):
        """Draw a prioritized batch (with replacement).

        Returns:
            Tuple ``(states, next_states, actions, rewards, dones, indices,
            weights)`` where ``weights`` are float32 importance-sampling
            weights scaled so that the largest one in the batch equals 1.

        Raises:
            ValueError: If no transition has been stored yet.
        """
        size = len(self)
        if size == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        # float64 keeps the probabilities summing to 1 within the tolerance
        # np.random.choice checks, even for large buffers.
        scaled = self.priorities[:size].astype(np.float64) ** self.alpha
        probs = scaled / scaled.sum()

        idx = np.random.choice(size, batch_size, p=probs)
        weights = (size * probs[idx]) ** (-self.beta)
        weights /= weights.max()

        return (
            self.states[idx], self.next_states[idx], self.actions[idx],
            self.rewards[idx], self.dones[idx], idx, weights.astype(np.float32)
        )

    def update_priorities(self, indexes, p):
        """Overwrite the priorities at ``indexes`` with ``p`` (scalar or array).

        Values are clamped to a tiny positive floor because a zero or
        negative priority would produce invalid sampling probabilities.
        """
        self.priorities[indexes] = np.maximum(np.asarray(p, dtype=np.float32), 1e-8)


# ============================================================
# Noisy Dense Layer (Factorized Noisy Nets)
# ============================================================

class NoisyDense(layers.Layer):
    """Dense layer whose kernel and bias are perturbed by factorised Gaussian noise.

    On every call a fresh noise vector is drawn for the inputs and for the
    outputs; their outer product perturbs the kernel and the output noise
    perturbs the bias. The ``mu_*`` weights are the noise-free parameters and
    the ``sigma_*`` weights scale the noise.

    Args:
        units: Number of output units.
        sigma_init: Constant used to initialise ``sigma_w`` and ``sigma_b``.
        **kwargs: Forwarded to ``keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, units, sigma_init=0.017, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.sigma_init = sigma_init

    def build(self, input_shape):
        in_dim = int(input_shape[-1])
        # Kept so call() does not depend on the static shape of its input.
        self.in_dim = in_dim

        self.mu_w = self.add_weight(
            name="mu_w",
            shape=(in_dim, self.units),
            initializer="glorot_uniform",
        )
        self.sigma_w = self.add_weight(
            name="sigma_w",
            shape=(in_dim, self.units),
            initializer=tf.constant_initializer(self.sigma_init),
        )

        self.mu_b = self.add_weight(
            name="mu_b",
            shape=(self.units,),
            initializer="zeros",
        )
        self.sigma_b = self.add_weight(
            name="sigma_b",
            shape=(self.units,),
            initializer=tf.constant_initializer(self.sigma_init),
        )

    @staticmethod
    def _scale_noise(eps):
        """Apply f(x) = sign(x) * sqrt(|x|) to a noise sample."""
        return tf.sign(eps) * tf.sqrt(tf.abs(eps))

    def call(self, x):
        # Factorised noise: in_dim + units samples instead of in_dim * units.
        f_in = self._scale_noise(tf.random.normal((self.in_dim,)))
        f_out = self._scale_noise(tf.random.normal((self.units,)))

        # Outer product -> (in_dim, units) kernel noise.
        w_noise = tf.expand_dims(f_in, -1) * tf.expand_dims(f_out, 0)
        b_noise = f_out

        w = self.mu_w + self.sigma_w * w_noise
        b = self.mu_b + self.sigma_b * b_noise

        return tf.matmul(x, w) + b

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({"units": self.units, "sigma_init": self.sigma_init})
        return config

# ============================================================
# Rainbow Network (Noisy + Dueling + C51)
# ============================================================

class RainbowNetwork(Model):
    """Dueling categorical Q-network built from NoisyDense layers.

    For a batch of states the network returns, per action, a probability
    distribution over ``n_atoms`` support points evenly spaced between
    ``v_min`` and ``v_max``.

    Args:
        n_actions: Number of discrete actions.
        n_atoms: Number of support atoms per action.
        v_min: Smallest support value.
        v_max: Largest support value.
    """

    def __init__(self, n_actions, n_atoms=51, v_min=-10, v_max=10):
        super().__init__()
        self.n_actions, self.n_atoms = n_actions, n_atoms
        self.v_min, self.v_max = v_min, v_max

        # Fixed atom locations shared by every action's distribution.
        atoms = tf.linspace(v_min, v_max, n_atoms)
        self.support = tf.cast(atoms, tf.float32)

        hidden_units = 128

        # Shared trunk.
        self.fc1 = NoisyDense(hidden_units)
        self.fc2 = NoisyDense(hidden_units)

        # Value stream: one set of atom logits per state.
        self.val1 = NoisyDense(hidden_units)
        self.val2 = NoisyDense(n_atoms)

        # Advantage stream: one set of atom logits per action.
        self.adv1 = NoisyDense(hidden_units)
        self.adv2 = NoisyDense(n_actions * n_atoms)

    def call(self, x):
        """Return action-wise atom probabilities of shape (batch, n_actions, n_atoms)."""
        features = tf.convert_to_tensor(x, dtype=tf.float32)
        for trunk_layer in (self.fc1, self.fc2):
            features = tf.nn.relu(trunk_layer(features))

        value_hidden = tf.nn.relu(self.val1(features))
        value_logits = tf.reshape(self.val2(value_hidden), (-1, 1, self.n_atoms))

        adv_hidden = tf.nn.relu(self.adv1(features))
        adv_logits = tf.reshape(
            self.adv2(adv_hidden), (-1, self.n_actions, self.n_atoms)
        )

        # Dueling aggregation: subtract the mean advantage over actions.
        adv_mean = tf.reduce_mean(adv_logits, axis=1, keepdims=True)
        combined = value_logits + (adv_logits - adv_mean)

        # Normalise over the atom axis.
        return tf.nn.softmax(combined, axis=2)

    def act(self, state):
        """Return the index of the action with the largest expected value for one state."""
        batched_state = state[np.newaxis, :]
        atom_probs = self(batched_state)
        expected_q = tf.reduce_sum(atom_probs * self.support, axis=2)
        best_action = tf.argmax(expected_q[0])
        return int(best_action.numpy())


# ============================================================
# C51 Projection
# ============================================================

def c51_projection(next_dist, rewards, dones, gamma, support, v_min, v_max):
    """Project a shifted categorical distribution back onto the fixed support.

    Every atom ``z_j`` is moved to ``r + gamma * (1 - done) * z_j``, clipped to
    ``[v_min, v_max]``, and its probability is split linearly between the two
    neighbouring support atoms.

    Args:
        next_dist: ``(batch, n_atoms)`` probabilities of the selected next
            action; a NumPy array or any object exposing ``.numpy()``.
        rewards: ``(batch,)`` rewards.
        dones: ``(batch,)`` terminal flags (1 disables bootstrapping).
        gamma: Discount applied to the support (use ``gamma ** n`` for
            n-step returns).
        support: ``(n_atoms,)`` atom locations, evenly spaced from ``v_min``
            to ``v_max``.
        v_min: Lower bound of the support.
        v_max: Upper bound of the support.

    Returns:
        ``(batch, n_atoms)`` float32 array with the projected probabilities.
    """
    # Accept framework tensors as well as plain arrays.
    if hasattr(next_dist, "numpy"):
        next_dist = next_dist.numpy()
    next_dist = np.asarray(next_dist)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    support = np.asarray(support)

    batch = rewards.shape[0]
    n_atoms = support.shape[0]
    delta = (v_max - v_min) / (n_atoms - 1)

    tz = rewards[:, None] + gamma * (1 - dones[:, None]) * support[None, :]
    tz = np.clip(tz, v_min, v_max)

    # Fractional atom index; clipped so rounding error can never push the
    # neighbour indices outside the support.
    b = np.clip((tz - v_min) / delta, 0, n_atoms - 1)
    lower = np.floor(b).astype(np.int64)
    upper = np.ceil(b).astype(np.int64)

    # When b falls exactly on an atom, both linear weights would be zero;
    # give that atom the whole probability instead.
    lower_w = np.where(lower == upper, 1.0, upper - b)
    upper_w = b - lower

    proj = np.zeros((batch, n_atoms), dtype=np.float32)
    rows = np.broadcast_to(np.arange(batch)[:, None], (batch, n_atoms))
    # np.add.at accumulates correctly when several atoms hit the same index.
    np.add.at(proj, (rows, lower), (next_dist * lower_w).astype(np.float32))
    np.add.at(proj, (rows, upper), (next_dist * upper_w).astype(np.float32))

    return proj


# ============================================================
# Hyperparameters
# ============================================================

GAMMA = 0.99  # discount factor handed to the replay buffer and to the C51 projection
LR = 1e-3  # learning rate of the Adam optimizer
BATCH = 32  # number of transitions sampled per training step
MEMORY = 50000  # capacity of the replay buffer
N_STEP = 3  # number of rewards accumulated into each stored multi-step return
NUM_EPISODES = 600  # number of training episodes

N_ATOMS = 51  # atom count of the categorical (C51) distribution; equals RainbowNetwork's default n_atoms
V_MIN, V_MAX = -10, 10  # bounds of the value support passed to the C51 projection
TAU = 0.005  # default interpolation factor of soft_update


# ============================================================
# Soft Update
# ============================================================

def soft_update(target, online, tau=TAU):
    """Move every target weight a fraction ``tau`` of the way toward the online weight.

    Args:
        target: Model whose weights are overwritten with the blend.
        online: Model providing the weights to blend in.
        tau: Interpolation factor; 0 keeps the target, 1 copies the online weights.
    """
    target_weights = target.get_weights()
    online_weights = online.get_weights()

    blended = []
    for target_w, online_w in zip(target_weights, online_weights):
        blended.append(target_w * (1 - tau) + online_w * tau)

    target.set_weights(blended)


# ============================================================
# Training Loop — FULL RAINBOW
# ============================================================

# Environment and problem dimensions.
env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

# Hard cap on the number of steps played per episode.
MAX_STEPS_PER_EPISODE = 200

buffer = PrioritizedNStepBuffer(MEMORY, state_dim, n_step=N_STEP, gamma=GAMMA)
online = RainbowNetwork(n_actions, n_atoms=N_ATOMS, v_min=V_MIN, v_max=V_MAX)
target = RainbowNetwork(n_actions, n_atoms=N_ATOMS, v_min=V_MIN, v_max=V_MAX)

# Subclassed Keras models create their variables on the first call. Run both
# networks once so that get_weights()/set_weights() actually copy something;
# on unbuilt models the copy is a silent no-op.
dummy_obs = np.zeros((1, state_dim), dtype=np.float32)
online(dummy_obs)
target(dummy_obs)
target.set_weights(online.get_weights())

optimizer = optimizers.Adam(LR)
reward_history = []

for episode in range(NUM_EPISODES):
    state, _ = env.reset()
    ep_reward = 0

    for step in range(MAX_STEPS_PER_EPISODE):
        action = online.act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Hitting the step cap ends the episode as well; flag it so the
        # n-step window of the buffer does not run into the next episode.
        hit_step_cap = step == MAX_STEPS_PER_EPISODE - 1
        done = terminated or truncated or hit_step_cap
        ep_reward += reward

        buffer.store(state, next_state, action, reward, float(done))
        state = next_state

        # TRAIN (after a warm-up of 1000 stored transitions)
        if buffer.ptr > 1000 or buffer.full:
            s, s2, a, r, d, idx, weights = buffer.sample(BATCH)
            batch_rows = tf.range(BATCH)

            # Double DQN: the online network selects the next action,
            # the target network evaluates it.
            online_next_q = tf.reduce_sum(online(s2) * online.support, axis=2)
            next_actions = tf.argmax(online_next_q, axis=1, output_type=tf.int32)

            next_dist = target(s2)
            next_sel = tf.gather_nd(next_dist,
                                    tf.stack([batch_rows, next_actions], axis=1))

            # The buffer stores N_STEP-step returns, so the bootstrap term
            # is discounted by GAMMA ** N_STEP.
            proj = c51_projection(next_sel, r, d,
                                  GAMMA ** N_STEP, online.support.numpy(),
                                  V_MIN, V_MAX)

            with tf.GradientTape() as tape:
                dist = online(s)
                chosen = tf.gather_nd(dist,
                                      tf.stack([batch_rows, a], axis=1))

                # Cross-entropy between the projected target distribution and
                # the predicted distribution, one value per sample.
                per_sample_loss = tf.nn.softmax_cross_entropy_with_logits(
                    labels=proj,
                    logits=tf.math.log(chosen + 1e-8)
                )
                # Importance-sampling weights correct the prioritized sampling bias.
                loss = tf.reduce_mean(weights * per_sample_loss)

            grads = tape.gradient(loss, online.trainable_variables)
            grads = [tf.clip_by_norm(g, 5.0) for g in grads]
            optimizer.apply_gradients(zip(grads, online.trainable_variables))

            # Each sampled transition gets its own priority; the reduced
            # scalar would assign the same value to the whole batch.
            buffer.update_priorities(idx, per_sample_loss.numpy() + 1e-6)
            soft_update(target, online)

        if done:
            break

    reward_history.append(ep_reward)

    # ----------------------------------------------------
    # LIVE PLOT
    # ----------------------------------------------------
    if episode % 5 == 0:
        clear_output(wait=True)
        fig = plt.figure(figsize=(10, 4))

        # raw rewards
        plt.plot(
            reward_history,
            label="Reward",
            color="blue",
            alpha=0.3,
            linewidth=1
        )

        # moving average
        if len(reward_history) > 20:
            ma = np.convolve(reward_history, np.ones(20) / 20, mode='valid')
            plt.plot(
                range(19, len(reward_history)),
                ma,
                label="Moving Avg (20 eps)",
                color="orange",
                linewidth=2.5
            )

        plt.title("Rainbow DQN — Training Progress")
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.show()
        # Release the figure so repeated redraws do not accumulate open figures.
        plt.close(fig)

env.close()
