# RNN

The aim of this notebook is getting the concepts and key feature of the Recurencet Neural Net

In [2]:
"""
==============================================================================
VANILLA RNN — From Scratch in NumPy
==============================================================================

Architecture:
                    y_t
                     ↑
                  [W_hy]
                     ↑
   h_{t-1} --→ [  h_t  ] --→ h_t  (passed to next step)
                  ↑    ↑
              [W_hh] [W_xh]
                ↑      ↑
           h_{t-1}    x_t

Core equations (forward pass):
    h_t = activation( W_xh · x_t  +  W_hh · h_{t-1}  +  b_h )
    y_t = softmax(    W_hy · h_t  +  b_y )

Core equations (backward pass — BPTT):
    dL/dW = sum over time of gradients flowing back through the unrolled graph
"""

import numpy as np
import matplotlib.pyplot as plt
np.random.seed(8)


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# =============================================================================
# 1. ACTIVATION FUNCTIONS
# =============================================================================

def tanh(x):
    return np.tanh(x)


def tanh_derivative(x):
    """d/dx tanh(x) = 1 - tanh²(x)"""
    return 1.0 - np.tanh(x) ** 2


def softmax(x):
    """
    Converts raw scores (logits) into probabilities that sum to 1.
    Subtract max for numerical stability (avoids overflow in exp).
    """
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

In [None]:
x = np.linspace(-3,3,100)
plt.plot(x, tanh(x), label="tanh")
plt.plot(x, tanh_derivative(x), label="tanh derivative")
plt.legend()
plt.title("Tanh Activation and Its Derivative")

In [None]:
class VanillaRNN:
    """
    A character-level RNN that learns to predict the next character.

    Parameters
    ----------
    input_size  : int — number of unique input tokens (vocabulary size)
    hidden_size : int — number of hidden units (the "memory" capacity)
    output_size : int — number of unique output tokens (often = input_size)
    lr          : float — learning rate

    Weight matrices
    ---------------
    W_xh : (hidden_size, input_size)  — input  → hidden
    W_hh : (hidden_size, hidden_size) — hidden → hidden (recurrence!)
    W_hy : (output_size, hidden_size) — hidden → output
    b_h  : (hidden_size, 1)           — hidden bias
    b_y  : (output_size, 1)           — output bias
    """

    def __init__(self, input_size, hidden_size, output_size, lr=0.01):
        self.hidden_size = hidden_size
        self.lr = lr

        # --- Weight initialization (Xavier / sqrt scale) ---
        scale = lambda fan_in: 1.0 / np.sqrt(fan_in)

        self.W_xh = np.random.randn(hidden_size, input_size)  * scale(input_size)
        self.W_hh = np.random.randn(hidden_size, hidden_size) * scale(hidden_size)
        self.W_hy = np.random.randn(output_size, hidden_size) * scale(hidden_size)
        self.b_h  = np.zeros((hidden_size, 1))
        self.b_y  = np.zeros((output_size, 1))

        # --- Adagrad memory (for adaptive learning rate) ---
        self.mW_xh = np.zeros_like(self.W_xh)
        self.mW_hh = np.zeros_like(self.W_hh)
        self.mW_hy = np.zeros_like(self.W_hy)
        self.mb_h  = np.zeros_like(self.b_h)
        self.mb_y  = np.zeros_like(self.b_y)

    # -----------------------------------------------------------------
    # 2a. FORWARD PASS  (one full sequence)
    # -----------------------------------------------------------------
    def forward(self, inputs, targets, h_prev):
        """
        Run the RNN forward through the sequence and compute loss.

        Parameters
        ----------
        inputs  : list of int — input token indices  [x_0, x_1, ..., x_{T-1}]
        targets : list of int — target token indices  [x_1, x_2, ..., x_T]
        h_prev  : (hidden_size, 1) — initial hidden state

        Returns
        -------
        loss       : scalar — cross-entropy loss over the sequence
        cache      : dict   — everything needed for backward pass

        Step-by-step for each time step t:
        ┌─────────────────────────────────────────────────┐
        │  1. One-hot encode x_t                          │
        │  2. Compute raw hidden: z_t = W_xh·x_t          │
        │                              + W_hh·h_{t-1}     │
        │                              + b_h               │
        │  3. Apply activation:  h_t = tanh(z_t)          │
        │  4. Compute output:    o_t = W_hy·h_t + b_y     │
        │  5. Compute probs:     p_t = softmax(o_t)       │
        │  6. Accumulate loss:   L -= log(p_t[target_t])   │
        └─────────────────────────────────────────────────┘
        """
        xs, hs, zs, os, ps = {}, {}, {}, {}, {}
        hs[-1] = np.copy(h_prev)  # h at t=-1 is the initial state
        loss = 0.0

        for t in range(len(inputs)):
            # --- One-hot encode input ---
            xs[t] = np.zeros((self.W_xh.shape[1], 1))
            xs[t][inputs[t]] = 1.0

            # --- Raw hidden state (pre-activation) ---
            #       ┌── from input ──┐  ┌── recurrence ──┐  ┌ bias ┐
            zs[t] = self.W_xh @ xs[t] + self.W_hh @ hs[t-1] + self.b_h

            # --- Activated hidden state ---
            hs[t] = tanh(zs[t])

            # --- Output logits ---
            os[t] = self.W_hy @ hs[t] + self.b_y

            # --- Probabilities ---
            ps[t] = softmax(os[t])

            # --- Cross-entropy loss ---
            loss -= np.log(ps[t][targets[t], 0] + 1e-12)

        cache = {'xs': xs, 'hs': hs, 'zs': zs, 'ps': ps}
        return loss, cache

    # -----------------------------------------------------------------
    # 2b. BACKWARD PASS  (Backpropagation Through Time — BPTT)
    # -----------------------------------------------------------------
    def backward(self, inputs, targets, cache):
        """
        Compute gradients via BPTT.

        The key insight: since h_t depends on h_{t-1}, which depends on
        h_{t-2}, etc., gradients flow backward through ALL previous time
        steps. This is what makes RNNs powerful (and prone to vanishing
        gradients).

        Gradient flow diagram (backward):

        dL/dy_t → dL/do_t → dL/dh_t ─→ dL/dW_hy
                                │
                    ┌───────────┴───────────┐
                    ↓                       ↓
               dL/dz_t                 dL/dh_{t-1}  ← RECURRENT GRADIENT!
                 │  │                       │
                 ↓  ↓                       ↓
          dL/dW_xh  dL/dW_hh          (flows to t-1)
        """
        xs, hs, zs, ps = cache['xs'], cache['hs'], cache['zs'], cache['ps']

        # Initialize gradients
        dW_xh = np.zeros_like(self.W_xh)
        dW_hh = np.zeros_like(self.W_hh)
        dW_hy = np.zeros_like(self.W_hy)
        db_h  = np.zeros_like(self.b_h)
        db_y  = np.zeros_like(self.b_y)

        dh_next = np.zeros_like(hs[0])  # gradient flowing from future

        # Walk BACKWARD through time
        for t in reversed(range(len(inputs))):
            # --- Output gradient ---
            # For cross-entropy + softmax: dL/do_t = p_t - one_hot(target)
            do = np.copy(ps[t])
            do[targets[t]] = do[targets[t]] - 1.0  # This elegant formula is softmax + CE combined
            # ∂L/∂o = p - one_hot(c)

            # --- Gradients for W_hy, b_y ---
            dW_hy += do @ hs[t].T    # (output_size, hidden_size)
            db_y  += do

            # --- Hidden state gradient ---
            # dL/dh_t comes from TWO sources:
            #   1. The output at time t:     W_hy^T · do
            #   2. The hidden state at t+1:  dh_next (from the FUTURE!)
            dh = self.W_hy.T @ do + dh_next

            # --- Through tanh activation ---
            # dL/dz_t = dL/dh_t * tanh'(z_t)
            dz = dh * tanh_derivative(zs[t])

            # --- Gradients for W_xh, W_hh, b_h ---
            dW_xh += dz @ xs[t].T     # input contribution
            dW_hh += dz @ hs[t-1].T   # recurrent contribution
            db_h  += dz

            # --- Pass gradient to previous time step ---
            # This is the RECURRENT gradient: dL/dh_{t-1} = W_hh^T · dz
            # ⚠️ If W_hh has small eigenvalues, this shrinks → VANISHING GRADIENT
            # ⚠️ If W_hh has large eigenvalues, this grows  → EXPLODING GRADIENT
            dh_next = self.W_hh.T @ dz

        # --- Gradient clipping (prevent exploding gradients) ---
        for grad in [dW_xh, dW_hh, dW_hy, db_h, db_y]:
            np.clip(grad, -5, 5, out=grad)

        return dW_xh, dW_hh, dW_hy, db_h, db_y

    # -----------------------------------------------------------------
    # 2c. PARAMETER UPDATE (Adagrad)
    # -----------------------------------------------------------------
    def update(self, grads):
        """
        Adagrad update: lr_effective = lr / sqrt(sum_of_squared_grads)

        This adaptively reduces the learning rate for frequently updated
        parameters, which is helpful for RNNs where some characters
        appear much more often than others.
        """
        params = [self.W_xh,  self.W_hh,  self.W_hy,  self.b_h,  self.b_y]
        mems   = [self.mW_xh, self.mW_hh, self.mW_hy, self.mb_h, self.mb_y]

        for param, grad, mem in zip(params, grads, mems):
            mem += grad * grad                              # accumulate squared gradient
            param -= self.lr * grad / (np.sqrt(mem) + 1e-8) # adaptive update

    # -----------------------------------------------------------------
    # 2d. SAMPLING (generate text from the model)
    # -----------------------------------------------------------------
    def sample(self, seed_idx, h, length):
        """
        Generate a sequence of `length` tokens.

        Uses the RNN autoregressively: feed the output back as the next input.

        seed_idx : int — starting token index
        h        : (hidden_size, 1) — starting hidden state
        length   : int — number of tokens to generate
        """
        x = np.zeros((self.W_xh.shape[1], 1))
        x[seed_idx] = 1.0
        indices = []

        for _ in range(length):
            # Forward one step
            h = tanh(self.W_xh @ x + self.W_hh @ h + self.b_h)
            o = self.W_hy @ h + self.b_y
            p = softmax(o)

            # Sample from probability distribution
            idx = np.random.choice(range(p.shape[0]), p=p.ravel())
            indices.append(idx)

            # Prepare next input
            x = np.zeros_like(x)
            x[idx] = 1.0

        return indices



In [None]:
# =============================================================================
# 3. TRAINING LOOP — Character-level language model
# =============================================================================

def train_char_rnn():
    """
    Train the RNN on a small text to learn character-level patterns.

    Data flow overview:
    ┌────────────────────────────────────────────────────────┐
    │  "hello" → ['h','e','l','l','o']                       │
    │                                                        │
    │  Input:   h → e → l → l       (characters 0 to T-1)   │
    │  Target:  e → l → l → o       (characters 1 to T)     │
    │                                                        │
    │  The RNN learns: given this character, predict the next │
    └────────────────────────────────────────────────────────┘
    """

    # --- Dataset ---
    text = "hello world. hello neural network. hello recurrent neural network. "
    chars = sorted(list(set(text)))
    vocab_size = len(chars)

    # Character ↔ Index mappings
    char_to_idx = {ch: i for i, ch in enumerate(chars)}
    idx_to_char = {i: ch for i, ch in enumerate(chars)}

    print(f"Vocabulary ({vocab_size} chars): {chars}")
    print(f"Text length: {len(text)} characters\n")

    # --- Model ---
    hidden_size = 64*2
    seq_length = 25  # number of steps to unroll for BPTT
    rnn = VanillaRNN(vocab_size, hidden_size, vocab_size, lr=0.01)

    # --- Training ---
    num_iterations = 10001
    pointer = 0          # position in the text
    h_prev = np.zeros((hidden_size, 1))
    smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # initial loss estimate

    print("=" * 60)
    print("TRAINING")
    print("=" * 60)

    for iteration in range(num_iterations):

        # --- Reset if we've reached end of text ---
        if pointer + seq_length + 1 >= len(text):
            pointer = 0
            h_prev = np.zeros((hidden_size, 1))

        # --- Prepare input/target sequences ---
        inputs  = [char_to_idx[ch] for ch in text[pointer:pointer + seq_length]]
        targets = [char_to_idx[ch] for ch in text[pointer + 1:pointer + seq_length + 1]]

        # --- Forward ---
        loss, cache = rnn.forward(inputs, targets, h_prev)

        # --- Backward ---
        grads = rnn.backward(inputs, targets, cache)

        # --- Update ---
        rnn.update(grads)

        # --- Carry hidden state forward (truncated BPTT) ---
        # We keep h from the end of this chunk as the start of the next.
        # This gives the RNN some "memory" across chunks, but we don't
        # backpropagate across chunk boundaries (that's the "truncated" part).
        h_prev = cache['hs'][len(inputs) - 1]

        # --- Logging ---
        smooth_loss = 0.999 * smooth_loss + 0.001 * loss
        if iteration % 500 == 0:
            print(f"\n--- Iteration {iteration}, Loss: {smooth_loss:.4f} ---")

            # Sample from the model
            sample_indices = rnn.sample(inputs[0], h_prev, 60)
            sample_text = ''.join([idx_to_char[i] for i in sample_indices])
            print(f"Sample: {sample_text}")

        pointer += seq_length

    # --- Final visualization ---
    print("\n" + "=" * 60)
    print("WHAT THE WEIGHTS LEARNED")
    print("=" * 60)
    print(f"\nW_xh shape: {rnn.W_xh.shape}  — Maps {vocab_size} input chars → {hidden_size} hidden units")
    print(f"W_hh shape: {rnn.W_hh.shape}  — Recurrence: {hidden_size} hidden → {hidden_size} hidden")
    print(f"W_hy shape: {rnn.W_hy.shape}  — Maps {hidden_size} hidden units → {vocab_size} output chars")
    print(f"\nTotal parameters: {sum(p.size for p in [rnn.W_xh, rnn.W_hh, rnn.W_hy, rnn.b_h, rnn.b_y])}")

In [None]:
train_char_rnn()

# Predicting Time Series

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from IPython.display import display, clear_output
import time

np.random.seed(42)

In [None]:
# ===== AIRLINE PASSENGERS (1949-1960) =====
airline_data = {
    'name': 'Airline Passengers (monthly, 1949-1960)',
    'values': np.array([
        112,118,132,129,121,135,148,148,136,119,104,118,
        115,126,141,135,125,149,170,170,158,133,114,140,
        145,150,178,163,172,178,199,199,184,162,146,166,
        171,180,193,181,183,218,230,242,209,191,172,194,
        196,196,236,235,229,243,264,272,237,211,180,201,
        204,188,235,227,234,264,302,293,259,229,203,229,
        242,233,267,269,270,315,364,347,312,274,237,278,
        284,277,317,313,318,374,413,405,355,306,271,306,
        315,301,356,348,355,422,465,467,404,347,305,336,
        340,318,362,348,363,435,491,505,404,359,310,337,
        360,342,406,396,420,472,548,559,463,407,362,405,
        417,391,419,461,472,535,622,606,508,461,390,432
    ], dtype=float),
    'xlabel': 'Month',
    'ylabel': 'Passengers (thousands)'
}

# ===== GLOBAL TEMPERATURE ANOMALY (1880-2023) =====
temp_data = {
    'name': 'Global Temperature Anomaly (annual, 1880-2023)',
    'values': np.array([
        -0.16,-0.08,-0.10,-0.17,-0.28,-0.32,-0.31,-0.36,-0.17,-0.10,
        -0.35,-0.25,-0.30,-0.32,-0.32,-0.23,-0.11,-0.11,-0.27,-0.17,
        -0.09,-0.16,-0.28,-0.37,-0.47,-0.29,-0.22,-0.39,-0.43,-0.48,
        -0.43,-0.44,-0.36,-0.34,-0.15,-0.14,-0.36,-0.46,-0.29,-0.47,
        -0.43,-0.44,-0.23,-0.23,-0.22,-0.28,-0.13,-0.21,-0.36,-0.23,
        -0.27,-0.32,-0.29,-0.31,-0.12,-0.15,-0.16,-0.33,-0.43,-0.37,
        -0.35,-0.24,-0.28,-0.27,-0.15,-0.09,-0.18,-0.20,-0.22,-0.25,
        -0.11,-0.06,-0.09,-0.14,-0.12,-0.07,-0.01,-0.04,0.05,-0.07,
        -0.01,0.08,0.04,0.07,0.12,-0.02,0.06,-0.02,0.06,0.03,
        -0.03,-0.07,-0.02,0.01,0.13,-0.14,-0.07,-0.01,-0.05,0.05,
        0.03,0.06,0.02,-0.03,-0.20,0.04,0.06,-0.07,0.03,0.02,
        -0.08,0.05,0.03,0.23,0.08,0.10,0.15,0.12,0.17,0.32,
        0.14,0.17,0.25,0.23,0.15,0.21,0.38,0.32,0.36,0.24,
        0.28,0.32,0.41,0.32,0.46,0.42,0.55,0.62,0.63,0.54,
        0.62,0.64,0.56,0.65
    ], dtype=float),
    'xlabel': 'Year (1880-2023)',
    'ylabel': 'Temp Anomaly (°C)'
}

In [None]:
# All datasets
ALL_DATASETS = {
    'airline': airline_data,
    'temperature': temp_data,
}

# Quick preview
fig, axes = plt.subplots(1, 2, figsize=(18, 5.5))
for ax, (key, ds) in zip(axes, ALL_DATASETS.items()):
    ax.plot(ds['values'], color='#38bdf8', linewidth=1.2)
    ax.set_title(ds['name'], fontsize=8, pad=6)
    ax.set_xlabel(ds['xlabel'], fontsize=7)
fig.suptitle('Available Datasets', fontsize=11, y=1.05)
plt.tight_layout()
plt.show()

In [None]:
class VanillaRNN:
    """
    Vanilla RNN for time series prediction.
    Input:  scalar x_t (one value per time step)
    Output: scalar ŷ_t (predicted next value)
    """
    
    def __init__(self, hidden_size, lr=0.005):
        self.hs = hidden_size
        self.lr = lr
        
        # Xavier initialization
        s = lambda n: 1.0 / np.sqrt(n)
        self.W_xh = np.random.randn(hidden_size, 1) * s(1)
        self.W_hh = np.random.randn(hidden_size, hidden_size) * s(hidden_size)
        self.W_hy = np.random.randn(1, hidden_size) * s(hidden_size)
        self.b_h  = np.zeros((hidden_size, 1))
        self.b_y  = np.zeros((1, 1))
        
        # Adagrad memory
        self.mW_xh = np.zeros_like(self.W_xh)
        self.mW_hh = np.zeros_like(self.W_hh)
        self.mW_hy = np.zeros_like(self.W_hy)
        self.mb_h  = np.zeros_like(self.b_h)
        self.mb_y  = np.zeros_like(self.b_y)
    
    def forward(self, inputs, targets, h_prev):
        """Forward pass through the sequence."""
        T = len(inputs)
        xs, hs, zs, ys = {}, {}, {}, {}
        hs[-1] = h_prev.copy()
        loss = 0.0
        
        for t in range(T):
            xs[t] = np.array([[inputs[t]]])
            zs[t] = self.W_xh @ xs[t] + self.W_hh @ hs[t-1] + self.b_h
            hs[t] = np.tanh(zs[t])
            ys[t] = self.W_hy @ hs[t] + self.b_y
            err = ys[t][0,0] - targets[t]
            loss += 0.5 * err * err
        
        return loss, {'xs': xs, 'hs': hs, 'zs': zs, 'ys': ys}
    
    def backward(self, inputs, targets, cache):
        """BPTT — Backpropagation Through Time."""
        xs, hs, zs, ys = cache['xs'], cache['hs'], cache['zs'], cache['ys']
        T = len(inputs)
        
        dW_xh = np.zeros_like(self.W_xh)
        dW_hh = np.zeros_like(self.W_hh)
        dW_hy = np.zeros_like(self.W_hy)
        db_h  = np.zeros_like(self.b_h)
        db_y  = np.zeros_like(self.b_y)
        dh_next = np.zeros_like(hs[0])
        
        for t in reversed(range(T)):
            # MSE gradient: dL/dy = y - target
            dy = np.array([[ys[t][0,0] - targets[t]]])
            
            dW_hy += dy @ hs[t].T
            db_y  += dy
            
            dh = self.W_hy.T @ dy + dh_next
            dz = dh * (1 - np.tanh(zs[t])**2)
            
            dW_xh += dz @ xs[t].T
            dW_hh += dz @ hs[t-1].T
            db_h  += dz
            
            dh_next = self.W_hh.T @ dz
        
        # Clip gradients
        for g in [dW_xh, dW_hh, dW_hy, db_h, db_y]:
            np.clip(g, -5, 5, out=g)
        
        return dW_xh, dW_hh, dW_hy, db_h, db_y
    
    def update(self, grads):
        """Adagrad update."""
        params = [self.W_xh, self.W_hh, self.W_hy, self.b_h, self.b_y]
        mems   = [self.mW_xh, self.mW_hh, self.mW_hy, self.mb_h, self.mb_y]
        for p, g, m in zip(params, grads, mems):
            m += g * g
            p -= self.lr * g / (np.sqrt(m) + 1e-8)
    
    def predict(self, inputs, h_prev):
        """Run forward pass and return predictions + hidden states."""
        h = h_prev.copy()
        predictions = []
        hidden_states = []
        
        for x in inputs:
            z = self.W_xh @ np.array([[x]]) + self.W_hh @ h + self.b_h
            h = np.tanh(z)
            y = self.W_hy @ h + self.b_y
            predictions.append(y[0,0])
            hidden_states.append(h.flatten().copy())
        
        return np.array(predictions), np.array(hidden_states), h


In [None]:
DATASET     = 'airline'    # 'airline', 'sunspots', 'temperature', 'co2', 'ecg'
HIDDEN_SIZE = 32           # 8, 16, 32, 64 — more = more capacity
SEQ_LENGTH  = 40           # 10-60 — how many steps to unroll BPTT
LR          = 0.05        # 0.001-0.02
EPOCHS      = 500          # total training epochs
TRAIN_SPLIT = 0.8          # 80% train, 20% test
PLOT_EVERY  = 10           # update the live plot every N epochs

# ============================================

# Load and normalize
ds = ALL_DATASETS[DATASET]
raw = ds['values']
v_min, v_max = raw.min(), raw.max()
data = (raw - v_min) / (v_max - v_min)  # normalize to [0, 1]

split = int(len(data) * TRAIN_SPLIT)
train_data = data[:split]
test_data  = data[split:]

def denorm(v):
    return v * (v_max - v_min) + v_min

print(f'Dataset:     {ds["name"]}')
print(f'Data points: {len(data)} (train: {len(train_data)}, test: {len(test_data)})')
print(f'Hidden size: {HIDDEN_SIZE}')
print(f'Seq length:  {SEQ_LENGTH}')
print(f'LR:          {LR}')

In [None]:
# Initialize
rnn = VanillaRNN(HIDDEN_SIZE, lr=LR)
loss_history = []
test_loss_history = []

print(f'Training on: {ds["name"]}')
print(f'Parameters: {HIDDEN_SIZE + HIDDEN_SIZE**2 + HIDDEN_SIZE + HIDDEN_SIZE + 1:,}')
print(f'Training for {EPOCHS} epochs, plotting every {PLOT_EVERY}...\n')

fig = plt.figure(figsize=(14, 10))
gs = GridSpec(3, 2, figure=fig, hspace=0.35, wspace=0.25)

for epoch in range(1, EPOCHS + 1):
    
    # === TRAIN ONE EPOCH ===
    h = np.zeros((HIDDEN_SIZE, 1))
    epoch_loss = 0
    steps = 0
    
    for p in range(0, len(train_data) - SEQ_LENGTH - 1, SEQ_LENGTH):
        inputs  = train_data[p : p + SEQ_LENGTH]
        targets = train_data[p+1 : p + SEQ_LENGTH + 1]
        
        loss, cache = rnn.forward(inputs, targets, h)
        grads = rnn.backward(inputs, targets, cache)
        rnn.update(grads)
        
        h = cache['hs'][SEQ_LENGTH - 1]
        epoch_loss += loss
        steps += 1
    
    avg_loss = epoch_loss / (steps * SEQ_LENGTH) if steps > 0 else 0
    loss_history.append(avg_loss)
    
    # Test loss
    if len(test_data) > SEQ_LENGTH + 1:
        h_test = np.zeros((HIDDEN_SIZE, 1))
        # Feed train data to build hidden state
        _, _, h_test = rnn.predict(train_data, h_test)
        t_loss = 0
        t_steps = 0
        for p in range(0, len(test_data) - SEQ_LENGTH - 1, SEQ_LENGTH):
            inp = test_data[p : p + SEQ_LENGTH]
            tgt = test_data[p+1 : p + SEQ_LENGTH + 1]
            l, _ = rnn.forward(inp, tgt, h_test)
            t_loss += l
            t_steps += 1
        test_loss_history.append(t_loss / (t_steps * SEQ_LENGTH) if t_steps > 0 else 0)
    
    # === LIVE PLOT ===
    if epoch % PLOT_EVERY == 0 or epoch == 1 or epoch == EPOCHS:
        clear_output(wait=True)
        fig.clear()
        
        # Get full predictions
        h0 = np.zeros((HIDDEN_SIZE, 1))
        all_preds, all_hidden, _ = rnn.predict(data[:-1], h0)
        true_vals = data[1:]
        
        # --- Plot 1: Prediction vs Truth (FULL WIDTH) ---
        ax1 = fig.add_subplot(gs[0, :])
        x_axis = np.arange(len(true_vals))
        
        # Train/test background
        ax1.axvspan(0, split-1, alpha=0.05, color='#38bdf8')
        ax1.axvspan(split-1, len(true_vals), alpha=0.05, color='#f97316')
        ax1.axvline(x=split-1, color='#ffffff', alpha=0.15, linestyle='--', linewidth=1)
        
        # Fill error area
        preds_clipped = np.clip(all_preds, 0, 1)
        ax1.fill_between(x_axis, denorm(true_vals), denorm(preds_clipped),
                         alpha=0.1, color='#ef4444')
        
        ax1.plot(x_axis, denorm(true_vals), color='#38bdf8', linewidth=1.5,
                 label='Actual', alpha=0.9)
        ax1.plot(x_axis, denorm(preds_clipped), color='#f97316', linewidth=2,
                 label='RNN Prediction', alpha=0.9)
        
        # Labels
        ax1.text(split*0.4, ax1.get_ylim()[1]*0.95, 'TRAIN',
                 color='#38bdf8', alpha=0.4, fontsize=10, ha='center')
        ax1.text(split + (len(true_vals)-split)*0.5, ax1.get_ylim()[1]*0.95, 'TEST',
                 color='#f97316', alpha=0.4, fontsize=10, ha='center')
        
        ax1.set_title(f'{ds["name"]}  —  Epoch {epoch}/{EPOCHS}  |  Loss: {avg_loss:.6f}',
                      fontsize=11, pad=8)
        ax1.set_ylabel(ds['ylabel'], fontsize=8)
        ax1.legend(loc='upper left', fontsize=8)
        ax1.grid(True, alpha=0.3)
        
        # --- Plot 2: Loss Curve ---
        ax2 = fig.add_subplot(gs[1, 0])
        ax2.plot(loss_history, color='#f97316', linewidth=1.5, label='Train Loss')
        if test_loss_history:
            ax2.plot(test_loss_history, color='#38bdf8', linewidth=1.5,
                     label='Test Loss', alpha=0.7)
        ax2.fill_between(range(len(loss_history)), loss_history,
                         alpha=0.08, color='#f97316')
        ax2.set_title('Loss Curve', fontsize=10, pad=6)
        ax2.set_xlabel('Epoch', fontsize=8)
        ax2.set_ylabel('MSE Loss', fontsize=8)
        ax2.legend(fontsize=7)
        ax2.grid(True, alpha=0.3)
        
        # --- Plot 3: Hidden State Heatmap ---
        ax3 = fig.add_subplot(gs[1, 1])
        # Show last 100 time steps
        n_show = min(100, all_hidden.shape[0])
        heatmap_data = all_hidden[-n_show:].T  # (hidden_size, time)
        im = ax3.imshow(heatmap_data, aspect='auto', cmap='RdBu_r',
                        vmin=-1, vmax=1, interpolation='nearest')
        ax3.set_title('Hidden State Activations (last 100 steps)', fontsize=10, pad=6)
        ax3.set_xlabel('Time Step', fontsize=8)
        ax3.set_ylabel('Hidden Unit', fontsize=8)
        plt.colorbar(im, ax=ax3, fraction=0.046, pad=0.04)
        
        # --- Plot 4: Weight Histograms ---
        ax4 = fig.add_subplot(gs[2, 0])
        ax4.hist(rnn.W_xh.flatten(), bins=20, alpha=0.6, color='#f97316', label='W_xh')
        ax4.hist(rnn.W_hh.flatten(), bins=20, alpha=0.6, color='#38bdf8', label='W_hh')
        ax4.hist(rnn.W_hy.flatten(), bins=20, alpha=0.6, color='#22c55e', label='W_hy')
        ax4.set_title('Weight Distributions', fontsize=10, pad=6)
        ax4.legend(fontsize=7)
        ax4.grid(True, alpha=0.3)
        
        # --- Plot 5: Per-step Error ---
        ax5 = fig.add_subplot(gs[2, 1])
        errors = np.abs(denorm(true_vals) - denorm(preds_clipped))
        ax5.bar(x_axis[:split-1], errors[:split-1], color='#38bdf8', alpha=0.5, width=1.0, label='Train')
        ax5.bar(x_axis[split-1:], errors[split-1:], color='#f97316', alpha=0.5, width=1.0, label='Test')
        ax5.set_title('Absolute Error Per Step', fontsize=10, pad=6)
        ax5.set_ylabel('|Error|', fontsize=8)
        ax5.legend(fontsize=7)
        ax5.grid(True, alpha=0.3)
        
        plt.tight_layout()
        display(fig)
        
print(f'\n✅ Done! Final train loss: {loss_history[-1]:.6f}')
if test_loss_history:
    print(f'   Final test loss:  {test_loss_history[-1]:.6f}')

In [3]:
DATASET     = 'temperature'    # 'airline', 'sunspots', 'temperature', 'co2', 'ecg'
HIDDEN_SIZE = 32*2           # 8, 16, 32, 64 — more = more capacity
SEQ_LENGTH  = 40           # 10-60 — how many steps to unroll BPTT
LR          = 0.001        # 0.001-0.02
EPOCHS      = 10          # total training epochs
TRAIN_SPLIT = 0.8          # 80% train, 20% test
PLOT_EVERY  = 10           # update the live plot every N epochs

# ============================================

# Load and normalize
ds = ALL_DATASETS[DATASET]
raw = ds['values']
v_min, v_max = raw.min(), raw.max()
data = (raw - v_min) / (v_max - v_min)  # normalize to [0, 1]

split = int(len(data) * TRAIN_SPLIT)
train_data = data[:split]
test_data  = data[split:]

def denorm(v):
    return v * (v_max - v_min) + v_min

print(f'Dataset:     {ds["name"]}')
print(f'Data points: {len(data)} (train: {len(train_data)}, test: {len(test_data)})')
print(f'Hidden size: {HIDDEN_SIZE}')
print(f'Seq length:  {SEQ_LENGTH}')
print(f'LR:          {LR}')

NameError: name 'ALL_DATASETS' is not defined

In [None]:
# Initialize
rnn = VanillaRNN(HIDDEN_SIZE, lr=LR)
loss_history = []
test_loss_history = []

print(f'Training on: {ds["name"]}')
print(f'Parameters: {HIDDEN_SIZE + HIDDEN_SIZE**2 + HIDDEN_SIZE + HIDDEN_SIZE + 1:,}')
print(f'Training for {EPOCHS} epochs, plotting every {PLOT_EVERY}...\n')

fig = plt.figure(figsize=(14, 10))
gs = GridSpec(3, 2, figure=fig, hspace=0.35, wspace=0.25)

for epoch in range(1, EPOCHS + 1):
    
    # === TRAIN ONE EPOCH ===
    h = np.zeros((HIDDEN_SIZE, 1))
    epoch_loss = 0
    steps = 0
    
    for p in range(0, len(train_data) - SEQ_LENGTH - 1, SEQ_LENGTH):
        inputs  = train_data[p : p + SEQ_LENGTH]
        targets = train_data[p+1 : p + SEQ_LENGTH + 1]
        
        loss, cache = rnn.forward(inputs, targets, h)
        grads = rnn.backward(inputs, targets, cache)
        rnn.update(grads)
        
        h = cache['hs'][SEQ_LENGTH - 1]
        epoch_loss += loss
        steps += 1
    
    avg_loss = epoch_loss / (steps * SEQ_LENGTH) if steps > 0 else 0
    loss_history.append(avg_loss)
    
    # Test loss
    if len(test_data) > SEQ_LENGTH + 1:
        h_test = np.zeros((HIDDEN_SIZE, 1))
        # Feed train data to build hidden state
        _, _, h_test = rnn.predict(train_data, h_test)
        t_loss = 0
        t_steps = 0
        for p in range(0, len(test_data) - SEQ_LENGTH - 1, SEQ_LENGTH):
            inp = test_data[p : p + SEQ_LENGTH]
            tgt = test_data[p+1 : p + SEQ_LENGTH + 1]
            l, _ = rnn.forward(inp, tgt, h_test)
            t_loss += l
            t_steps += 1
        test_loss_history.append(t_loss / (t_steps * SEQ_LENGTH) if t_steps > 0 else 0)
    
    # === LIVE PLOT ===
    if epoch % PLOT_EVERY == 0 or epoch == 1 or epoch == EPOCHS:
        clear_output(wait=True)
        fig.clear()
        
        # Get full predictions
        h0 = np.zeros((HIDDEN_SIZE, 1))
        all_preds, all_hidden, _ = rnn.predict(data[:-1], h0)
        true_vals = data[1:]
        
        # --- Plot 1: Prediction vs Truth (FULL WIDTH) ---
        ax1 = fig.add_subplot(gs[0, :])
        x_axis = np.arange(len(true_vals))
        
        # Train/test background
        ax1.axvspan(0, split-1, alpha=0.05, color='#38bdf8')
        ax1.axvspan(split-1, len(true_vals), alpha=0.05, color='#f97316')
        ax1.axvline(x=split-1, color='#ffffff', alpha=0.15, linestyle='--', linewidth=1)
        
        # Fill error area
        preds_clipped = np.clip(all_preds, 0, 1)
        ax1.fill_between(x_axis, denorm(true_vals), denorm(preds_clipped),
                         alpha=0.1, color='#ef4444')
        
        ax1.plot(x_axis, denorm(true_vals), color='#38bdf8', linewidth=1.5,
                 label='Actual', alpha=0.9)
        ax1.plot(x_axis, denorm(preds_clipped), color='#f97316', linewidth=2,
                 label='RNN Prediction', alpha=0.9)
        
        # Labels
        ax1.text(split*0.4, ax1.get_ylim()[1]*0.95, 'TRAIN',
                 color='#38bdf8', alpha=0.4, fontsize=10, ha='center')
        ax1.text(split + (len(true_vals)-split)*0.5, ax1.get_ylim()[1]*0.95, 'TEST',
                 color='#f97316', alpha=0.4, fontsize=10, ha='center')
        
        ax1.set_title(f'{ds["name"]}  —  Epoch {epoch}/{EPOCHS}  |  Loss: {avg_loss:.6f}',
                      fontsize=11, pad=8)
        ax1.set_ylabel(ds['ylabel'], fontsize=8)
        ax1.legend(loc='upper left', fontsize=8)
        ax1.grid(True, alpha=0.3)
        
        # --- Plot 2: Loss Curve ---
        ax2 = fig.add_subplot(gs[1, 0])
        ax2.plot(loss_history, color='#f97316', linewidth=1.5, label='Train Loss')
        if test_loss_history:
            ax2.plot(test_loss_history, color='#38bdf8', linewidth=1.5,
                     label='Test Loss', alpha=0.7)
        ax2.fill_between(range(len(loss_history)), loss_history,
                         alpha=0.08, color='#f97316')
        ax2.set_title('Loss Curve', fontsize=10, pad=6)
        ax2.set_xlabel('Epoch', fontsize=8)
        ax2.set_ylabel('MSE Loss', fontsize=8)
        ax2.legend(fontsize=7)
        ax2.grid(True, alpha=0.3)
        
        # --- Plot 3: Hidden State Heatmap ---
        ax3 = fig.add_subplot(gs[1, 1])
        # Show last 100 time steps
        n_show = min(100, all_hidden.shape[0])
        heatmap_data = all_hidden[-n_show:].T  # (hidden_size, time)
        im = ax3.imshow(heatmap_data, aspect='auto', cmap='RdBu_r',
                        vmin=-1, vmax=1, interpolation='nearest')
        ax3.set_title('Hidden State Activations (last 100 steps)', fontsize=10, pad=6)
        ax3.set_xlabel('Time Step', fontsize=8)
        ax3.set_ylabel('Hidden Unit', fontsize=8)
        plt.colorbar(im, ax=ax3, fraction=0.046, pad=0.04)
        
        # --- Plot 4: Weight Histograms ---
        ax4 = fig.add_subplot(gs[2, 0])
        ax4.hist(rnn.W_xh.flatten(), bins=20, alpha=0.6, color='#f97316', label='W_xh')
        ax4.hist(rnn.W_hh.flatten(), bins=20, alpha=0.6, color='#38bdf8', label='W_hh')
        ax4.hist(rnn.W_hy.flatten(), bins=20, alpha=0.6, color='#22c55e', label='W_hy')
        ax4.set_title('Weight Distributions', fontsize=10, pad=6)
        ax4.legend(fontsize=7)
        ax4.grid(True, alpha=0.3)
        
        # --- Plot 5: Per-step Error ---
        ax5 = fig.add_subplot(gs[2, 1])
        errors = np.abs(denorm(true_vals) - denorm(preds_clipped))
        ax5.bar(x_axis[:split-1], errors[:split-1], color='#38bdf8', alpha=0.5, width=1.0, label='Train')
        ax5.bar(x_axis[split-1:], errors[split-1:], color='#f97316', alpha=0.5, width=1.0, label='Test')
        ax5.set_title('Absolute Error Per Step', fontsize=10, pad=6)
        ax5.set_ylabel('|Error|', fontsize=8)
        ax5.legend(fontsize=7)
        ax5.grid(True, alpha=0.3)
        
        plt.tight_layout()
        display(fig)
        
print(f'\n✅ Done! Final train loss: {loss_history[-1]:.6f}')
if test_loss_history:
    print(f'   Final test loss:  {test_loss_history[-1]:.6f}')

# IMDB Sentiment Dataset

In [1]:
import torch.nn as nn
import torch
from datasets import load_dataset
from torch.utils.tensorboard import SummaryWriter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Database
ds = load_dataset("imdb")
print(ds["train"][0]["text"][:200], ds["train"][0]["label"])  # label: 0/1


I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ev 0


In [3]:
example = ds["train"][0]
example

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [4]:
# Tokenization
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



In [5]:
text = "Hello, how are you?"
encoded_input = tokenizer(text, return_tensors='pt')
print(encoded_input)

print(encoded_input["input_ids"])

print(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0]))

{'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102]])
['[CLS]', 'hello', ',', 'how', 'are', 'you', '?', '[SEP]']


In [6]:
seq_length = 128 
tokens = tokenizer(example["text"], truncation=True, padding='max_length', max_length=seq_length, return_tensors='pt')
tokens.keys()

KeysView({'input_ids': tensor([[  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,  2026,
          2678,  3573,  2138,  1997,  2035,  1996,  6704,  2008,  5129,  2009,
          2043,  2009,  2001,  2034,  2207,  1999,  3476,  1012,  1045,  2036,
          2657,  2008,  2012,  2034,  2009,  2001,  8243,  2011,  1057,  1012,
          1055,  1012,  8205,  2065,  2009,  2412,  2699,  2000,  4607,  2023,
          2406,  1010,  3568,  2108,  1037,  5470,  1997,  3152,  2641,  1000,
          6801,  1000,  1045,  2428,  2018,  2000,  2156,  2023,  2005,  2870,
          1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,
          5436,  2003,  8857,  2105,  1037,  2402,  4467,  3689,  3076,  2315,
         14229,  2040,  4122,  2000,  4553,  2673,  2016,  2064,  2055,  2166,
          1012,  1999,  3327,  2016,  4122,  2000,  3579,  2014,  3086,  2015,
          2000,  2437,  2070,  4066,  1997,  4516,  2006,  2054,  1996,  2779,
         25430, 14728,  2245,

In [7]:
# Embeddings
from transformers import AutoModel
model = AutoModel.from_pretrained("bert-base-uncased")

Loading weights: 100%|███████████████████████████████████████████████████████████████| 199/199 [00:00<00:00, 361.10it/s, Materializing param=pooler.dense.weight]
BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [8]:
# Pass inputs through the model
with torch.no_grad():
    outputs = model(**tokens)

In [9]:
# Get last hidden states
embeddings = outputs.last_hidden_state
print(embeddings.shape)  # (1, seq_length, hidden_size)


torch.Size([1, 128, 768])


In [10]:
# RRN Layers
import torch
import torch.nn as nn

In [11]:
class TrainableRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.1, num_layers=1, bidirectional=False):
        super(TrainableRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, 
                        num_layers=num_layers, 
                        batch_first=True,
                        nonlinearity='tanh',
                        dropout=dropout if num_layers > 1 else 0,
                        bidirectional=bidirectional
                        )
        direction_factor = 2 if bidirectional else 1
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size * direction_factor, output_size)
    
    def forward(self, x):
        # x.shape: (batch_size, seq_length, input_size)
        rnn_out, _ = self.rnn(x)
        rnn_out = self.dropout(rnn_out)
        # Take last time step
        out = self.fc(rnn_out[:, -1, :])
        return out



In [12]:
input_size = embeddings.shape[-1]  # hidden_size from BERT
hidden_size = 128
output_size = 2 # Binary classification

model = TrainableRNN(input_size, hidden_size, output_size)

In [13]:
logits = model(embeddings)
prob = torch.sigmoid(logits)

print("Logit:", logits)
print("Probability:", prob)


Logit: tensor([[-0.4815,  0.5061]], grad_fn=<AddmmBackward0>)
Probability: tensor([[0.3819, 0.6239]], grad_fn=<SigmoidBackward0>)


In [None]:
%load_ext tensorboard
%tensorboard --logdir runs

import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, AutoModel


# =============================================================================
# 1. PRECOMPUTE BERT EMBEDDINGS (runs once, saves to disk)
# =============================================================================

seq_length = 128
batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if os.path.exists('data/imdb_embeddings.pt'):
    print("Loading precomputed embeddings...")
    all_embeddings = torch.load('data/imdb_embeddings.pt')
    all_labels = torch.load('data/imdb_labels.pt')
else:
    print("Precomputing BERT embeddings (one time only)...")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    bert_model = AutoModel.from_pretrained("bert-base-uncased").to(device)
    bert_model.eval()

    all_embeddings = []
    with torch.no_grad():
        for i, text in enumerate(ds["train"]["text"]):
            tokens = tokenizer(text, truncation=True, padding='max_length',
                             max_length=seq_length, return_tensors='pt').to(device)
            emb = bert_model(**tokens).last_hidden_state.squeeze(0).cpu()
            all_embeddings.append(emb)
            if i % 500 == 0:
                print(f"  {i}/{len(ds['train']['text'])}")

    all_embeddings = torch.stack(all_embeddings)
    all_labels = torch.tensor(ds["train"]["label"], dtype=torch.float)

    os.makedirs('data', exist_ok=True)
    torch.save(all_embeddings, 'data/imdb_embeddings.pt')
    torch.save(all_labels, 'data/imdb_labels.pt')
    print("Saved to data/")

    del bert_model, tokenizer  # free GPU memory

print(f"Embeddings: {all_embeddings.shape}")  # (25000, 128, 768)


# =============================================================================
# 2. RNN CLASSIFIER
# =============================================================================

class SentimentRNN(nn.Module):
    def __init__(self, input_size=768, hidden_size=128):
        super().__init__()
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size,
                         batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        h_seq, h_last = self.rnn(x)
        out = h_last.squeeze(0)
        return self.fc(out)


# =============================================================================
# 3. SETUP
# =============================================================================

train_dataset = TensorDataset(all_embeddings, all_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

rnn_model = SentimentRNN(input_size=768, hidden_size=128).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(rnn_model.parameters(), lr=0.001)


# =============================================================================
# 4. TRAINING LOOP
# =============================================================================

num_epochs = 10
writer = SummaryWriter('runs/rnn_imdb')

for epoch in range(num_epochs):
    rnn_model.train()
    total_loss = 0

    for batch_idx, (embeddings, labels) in enumerate(train_loader):
        embeddings = embeddings.to(device)
        labels = labels.to(device)

        logits = rnn_model(embeddings)
        loss = criterion(logits.squeeze(), labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        global_step = epoch * len(train_loader) + batch_idx
        writer.add_scalar('Loss/batch', loss.item(), global_step)

    avg_loss = total_loss / len(train_loader)
    writer.add_scalar('Loss/epoch', avg_loss, epoch)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

writer.close()

# Save model
os.makedirs('checkpoints', exist_ok=True)
torch.save(rnn_model.state_dict(), 'checkpoints/rnn_imdb.pt')
print("Model saved!")


Precomputing BERT embeddings (one time only)...


Loading weights: 100%|███████████████████████████████████████████████████████████████| 199/199 [00:00<00:00, 466.39it/s, Materializing param=pooler.dense.weight]
BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


  0/25000
  500/25000
  1000/25000
  1500/25000
  2000/25000
  2500/25000
  3000/25000
  3500/25000
  4000/25000
