# Processing Sequential Data Using GRU Networks

In [1]:
# ========================================================================
#         Deep Learning For Sequential Data and Computer Vision
# ========================================================================
#    Module: Advanced Time Series Models
#    Topic: Processing Sequential Data Using GRU Networks
#    
#    Description:
#    -----------
#    This notebook demonstrates how the Gated Recurrent Unit (GRU) processes
#    sequential data by tracking internal states at each time step. We create
#    a synthetic time series dataset to visualize how inputs flow through 
#    a GRU network, focusing on the mechanics of update gates, reset gates, 
#    and candidate hidden states.
#    
#    Contents:
#    1. Creating a synthetic time series dataset (10 samples, 3 features)
#    2. Implementing GRU components step-by-step
#    3. Visualizing hidden states and activation values at each time step
#    4. Analyzing how different gates affect information flow
#    5. Building intuition for GRU memory mechanisms
#    
#    Objective:
#    - Create a small, manageable time series dataset for our GRU example
#    - Have 10 samples with 3 features each to visualize how inputs flow through a GRU
#    - Track hidden states and activation values at each time step
#    - Build intuition about how GRU components work with real data
#    
#    Author: Dr. Saad Laouadi
#    Version: 1.0
#    
# ========================================================================
#  ®Copyright Dr. Saad Laouadi, 2025. All rights reserved.
# ========================================================================

In [2]:
# ============================================= #
#           Setting Up Our Environment
# ============================================= #

import numpy as np
import pandas as pd
from tabulate import tabulate

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

from banner import banner   # this is a user defined module not a package

%reload_ext watermark

print("*"*52)
%watermark -a "Dr. Saad Laouadi"
%watermark -ud

print("-"*52)
print("The loaded packages".center(52))
print("-"*52)

%watermark -iv
%reload_ext autoreload
%autoreload
print("*"*52)

****************************************************
Author: Dr. Saad Laouadi

Last updated: 2025-03-06

----------------------------------------------------
                The loaded packages                 
----------------------------------------------------
numpy     : 1.26.4
matplotlib: 3.10.0
sklearn   : 1.5.2
seaborn   : 0.13.2
pandas    : 1.5.3
tabulate  : 0.8.10

****************************************************


In [3]:
# Generate 3 input features (10 daily observations each):
# - Temperature: values between 15-25°C
# - Humidity: values between 40-80%
# - Wind speed: values between 0-15 km/h

# Seed the global NumPy generator BEFORE drawing the noise used for the target.
# Without this the targets depend on whatever random state the kernel happens
# to be in, so a fresh "Restart Kernel -> Run All" would not reproduce them.
np.random.seed(123)

# Generate time series data: 10 time steps, 3 features
n_samples = 10
n_features = 3

temperature = np.array([18.2, 19.5, 20.1, 22.4, 23.8, 25.0, 23.2, 21.5, 19.8, 17.5])
humidity = np.array([65.2, 62.8, 58.5, 55.0, 45.2, 42.1, 48.5, 52.3, 60.5, 67.8])
wind_speed = np.array([5.2, 6.8, 8.5, 10.2, 12.5, 14.8, 13.2, 11.5, 9.2, 6.5])

# Create input features array: one row per time step, columns in the order
# (temperature, humidity, wind_speed)
X = np.column_stack((temperature, humidity, wind_speed))

# Generate target variable: power consumption in kWh
# Linear combination of the features plus Gaussian noise (mean 0, std 5):
# positive weights on temperature and wind speed, negative weight on humidity
y = 2.5 * temperature - 0.5 * humidity + 1.2 * wind_speed + np.random.normal(0, 5, n_samples)

In [4]:
# Report the dimensions of the feature matrix and of the target vector
banner(52, title="Data Shape")
for shape_label, array in (("Input data (X) shape:", X), ("Target data (y) shape:", y)):
    print(shape_label, array.shape)

# Preview the first three rows of each array
banner(52, title="Peek at our Data")
for peek_label, array in (
    ("\nFirst 3 samples of input data:", X),
    ("\nFirst 3 samples of target data:", y),
):
    print(peek_label)
    print(array[:3])

banner(52)

                     Data Shape                     
Input data (X) shape: (10, 3)
Target data (y) shape: (10,)
                  Peek at our Data                  

First 3 samples of input data:
[[18.2 65.2  5.2]
 [19.5 62.8  6.8]
 [20.1 58.5  8.5]]

First 3 samples of target data:
[13.71184698 30.49672723 32.61489249]


In [5]:
banner(72, title = "Complete dataset")

# One line per day. Zero padding is delegated to the format specification:
#   {:02d}   -> two-digit day number ("01" ... "10")
#   {:05.2f} -> two decimals, zero-padded to a total width of 5 ("05.20", "13.20")
# The format spec pads AFTER rounding, so a value such as 9.996 prints as
# "10.00"; prepending "0" by hand to every value below 10 would give "010.00".
for day, ((temp, hum, wind), power) in enumerate(zip(X, y), start=1):
    print(f"Day {day:02d}: Temp={temp:.1f}°C, Humidity={hum:.1f}%,"
          f" Wind={wind:05.2f}km/h → Power={power:05.2f}kWh")

banner(72)

                            Complete dataset                            
Day 01: Temp=18.2°C, Humidity=65.2%, Wind=05.20km/h → Power=13.71kWh
Day 02: Temp=19.5°C, Humidity=62.8%, Wind=06.80km/h → Power=30.50kWh
Day 03: Temp=20.1°C, Humidity=58.5%, Wind=08.50km/h → Power=32.61kWh
Day 04: Temp=22.4°C, Humidity=55.0%, Wind=10.20km/h → Power=33.21kWh
Day 05: Temp=23.8°C, Humidity=45.2%, Wind=12.50km/h → Power=49.01kWh
Day 06: Temp=25.0°C, Humidity=42.1%, Wind=14.80km/h → Power=67.47kWh
Day 07: Temp=23.2°C, Humidity=48.5%, Wind=13.20km/h → Power=37.46kWh
Day 08: Temp=21.5°C, Humidity=52.3%, Wind=11.50km/h → Power=39.26kWh
Day 09: Temp=19.8°C, Humidity=60.5%, Wind=09.20km/h → Power=36.62kWh
Day 10: Temp=17.5°C, Humidity=67.8%, Wind=06.50km/h → Power=13.32kWh


In [6]:
# Print the whole feature matrix X between two banner() calls. Columns follow the
# np.column_stack order used when X was built: temperature, humidity, wind speed.
banner(52, title = "Dataset values")
print(X)
banner(52)

                   Dataset values                   
[[18.2 65.2  5.2]
 [19.5 62.8  6.8]
 [20.1 58.5  8.5]
 [22.4 55.  10.2]
 [23.8 45.2 12.5]
 [25.  42.1 14.8]
 [23.2 48.5 13.2]
 [21.5 52.3 11.5]
 [19.8 60.5  9.2]
 [17.5 67.8  6.5]]


In [7]:
# Put the three features and the target side by side in one labelled table
column_names = ['Temp', "Humidity", "WindSpeed", "PowerConsumption"]
data = pd.DataFrame(np.column_stack([X, y]), columns=column_names)

data

Unnamed: 0,Temp,Humidity,WindSpeed,PowerConsumption
0,18.2,65.2,5.2,13.711847
1,19.5,62.8,6.8,30.496727
2,20.1,58.5,8.5,32.614892
3,22.4,55.0,10.2,33.208526
4,23.8,45.2,12.5,49.006999
5,25.0,42.1,14.8,67.467183
6,23.2,48.5,13.2,37.456604
7,21.5,52.3,11.5,39.255437
8,19.8,60.5,9.2,36.619681
9,17.5,67.8,6.5,13.316298


## The First Step in GRU

The first step of setting up a GRU involves defining its architecture and initializing its states. In this section we will walk through this process step by step:

### Determine the GRU architecture dimensions:
1. **Input dimension (n_features)**: The number of features in our input data (3 in our example - temperature, humidity, wind speed)
2. **Hidden state dimension (n_hidden)**: The number of GRU units/neurons in our layer
3. **Output dimension:** Depends on our prediction task (1 for our power consumption example)

### Initialize the weight matrices and biases:
- Weight matrices for update gate $\left(z_{t}\right)$
- Weight matrices for reset gate $\left(r_{t}\right)$
- Weight matrices for candidate hidden state $\left(\tilde h_{t}\right)$
- Bias vectors for each component

### Initialize the initial state:
- **Hidden state $\left(h_{0}\right)$**: Usually initialized as zeros, with shape `[batch_size, n_hidden]`
 - Unlike LSTM, GRU only has one state vector (no separate cell state)


We will implement these steps using our example data manually to show how this would look in code:

In [8]:
# Fix the global RNG so the weight initialisation below is reproducible
np.random.seed(123)

# GRU dimensions: inputs (temperature, humidity, wind speed), GRU units, outputs (power consumption)
n_features, n_hidden, n_output = 3, 4, 1

# Every gate acts on the concatenation of an input vector and a hidden vector,
# so each gate matrix has shape [n_hidden, n_features + n_hidden] and each
# gate bias has shape [n_hidden].
gate_shape = (n_hidden, n_features + n_hidden)

# Small random weights, zero biases.
# NOTE: the draw order W_z -> W_r -> W_h -> W_y fixes which numbers each matrix
# receives from the seeded generator; reordering these lines changes the weights.
W_z, b_z = np.random.randn(*gate_shape) * 0.01, np.zeros(n_hidden)            # update gate
W_r, b_r = np.random.randn(*gate_shape) * 0.01, np.zeros(n_hidden)            # reset gate
W_h, b_h = np.random.randn(*gate_shape) * 0.01, np.zeros(n_hidden)            # candidate hidden state
W_y, b_y = np.random.randn(n_output, n_hidden) * 0.01, np.zeros(n_output)     # output layer

# A GRU carries a single state vector (hidden state only, no cell state)
h_0 = np.zeros(n_hidden)

banner(52, title = "GRU Architecture Configuration:")
for label, value in (
    ("Input features:", n_features),
    ("Hidden units:", n_hidden),
    ("Output dimension:", n_output),
):
    print(f"{label:<30} {value}")

print("\nWeight matrix shapes:")
print()

for label, matrix in (
    ("Update gate (W_z):", W_z),
    ("Reset gate (W_r):", W_r),
    ("Candidate hidden state (W_h):", W_h),
    ("Output layer (W_y):", W_y),
):
    print(f"{label:<30} {matrix.shape}")

print("\nInitial state:")
print(f"{'Hidden state (h_0):':<30} {h_0}")
banner(52)

          GRU Architecture Configuration:           
Input features:                3
Hidden units:                  4
Output dimension:              1

Weight matrix shapes:

Update gate (W_z):             (4, 7)
Reset gate (W_r):              (4, 7)
Candidate hidden state (W_h):  (4, 7)
Output layer (W_y):            (1, 4)

Initial state:
Hidden state (h_0):            [0. 0. 0. 0.]


## Implementing GRU with a Lookback Window of 2

When implementing a GRU model with a lookback window of 2, each prediction is based on the two most recent time steps: the network reads two consecutive observations and then predicts the target for the time step that immediately follows them. This temporal context enhances the model's ability to capture short-term dependencies in sequential data.

### Sequence Preparation for Lookback Implementation

To properly structure input data for a lookback window of 2:

1. **Data Reshaping**: The input tensor must be restructured into a three-dimensional array with dimensions:
   - Batch size
   - Sequence length (2 for our lookback window)
   - Feature count (3 for our weather variables)

2. **Temporal Organization**: Each sequence should contain consecutive time steps, maintaining the chronological order of observations.

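3. **Target Alignment**: Ensure target values align properly with their corresponding input sequences. In this notebook, the target for a sequence is the value at the time step immediately after the last observation in that sequence (one-step-ahead prediction).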

This reformatting transforms our standard time series into overlapping segments, where each segment provides the GRU with sufficient historical context to generate accurate predictions while maintaining the temporal dynamics of the data.

In [9]:
# Create sequences with lookback of 2
def create_sequences(X, y, lookback=2):
    """
    Slice a time series into overlapping input windows with next-step targets.

    Window ``i`` holds the rows ``X[i : i + lookback]`` and its target is
    ``y[i + lookback]``, i.e. the value immediately AFTER the window.

    Parameters:
        X: Input features array of shape (n_samples, n_features)
        y: Target values array of shape (n_samples,)
        lookback: Number of time steps per window (must be >= 1)

    Returns:
        X_seq: Float array of shape (n_sequences, lookback, n_features)
        y_seq: Float array of shape (n_sequences,)
        with n_sequences = max(n_samples - lookback, 0); a series that is too
        short for even one window yields empty arrays.

    Raises:
        ValueError: If lookback is smaller than 1, or if X and y do not
            contain the same number of samples.
    """
    # Convert up front so indexing below is always positional
    X = np.asarray(X)
    y = np.asarray(y)

    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
        )

    n_samples = len(X)
    n_features = X.shape[1]
    # Clamp at zero: a negative count would make np.zeros fail with an
    # unrelated "negative dimensions are not allowed" error
    n_sequences = max(n_samples - lookback, 0)

    # Initialize arrays
    X_seq = np.zeros((n_sequences, lookback, n_features))
    y_seq = np.zeros(n_sequences)

    # Fill the arrays
    for i in range(n_sequences):
        X_seq[i] = X[i:i+lookback]
        y_seq[i] = y[i+lookback]  # Target is the NEXT value after the sequence

    return X_seq, y_seq

In [10]:
# Scale features and target into [0, 1] before feeding them to the GRU
# ====================================================================

# One scaler per array: the features and the target are scaled independently
X_scaler, y_scaler = (MinMaxScaler(feature_range=(0, 1)) for _ in range(2))

# The scaler is given 2-D input: X is passed as is, while y is lifted to a
# single column, scaled, and flattened back to 1-D
X_normalized = X_scaler.fit_transform(X)
y_column = y.reshape(-1, 1)
y_normalized = y_scaler.fit_transform(y_column).flatten()

banner(52, title="Normalized input data (first 3 samples)")
print(X_normalized[:3])
banner(52)

      Normalized input data (first 3 samples)       
[[0.09333333 0.89883268 0.        ]
 [0.26666667 0.80544747 0.16666667]
 [0.34666667 0.6381323  0.34375   ]]


In [11]:
# Build the (window, next-step target) pairs from the normalized arrays
X_seq, y_seq = create_sequences(X_normalized, y_normalized, lookback=2)

banner(52, title = "Sequence data shapes")
# 10 samples with a lookback of 2 give 8 windows: X is (8, 2, 3), y is (8,)
for name, arr in (("X", X_seq), ("y", y_seq)):
    print(f"{name} sequences shape: {arr.shape}")

# Show the first window together with the target it is paired with
first_window, first_target = X_seq[0], y_seq[0]
print("\nExample sequence (first sequence):")
print(f"Input sequence:\n{first_window}")
print(f"Target value: {first_target} (This is the value at t=2, predicted using t=0 and t=1)")
banner(52)

                Sequence data shapes                
X sequences shape: (8, 2, 3)
y sequences shape: (8,)

Example sequence (first sequence):
Input sequence:
[[0.09333333 0.89883268 0.        ]
 [0.26666667 0.80544747 0.16666667]]
Target value: 0.35638558094918593 (This is the value at t=2, predicted using t=0 and t=1)


In [12]:
# Show every lookback window; array layout is (n_sequences, lookback, n_features)
print(X_seq)

[[[0.09333333 0.89883268 0.        ]
  [0.26666667 0.80544747 0.16666667]]

 [[0.26666667 0.80544747 0.16666667]
  [0.34666667 0.6381323  0.34375   ]]

 [[0.34666667 0.6381323  0.34375   ]
  [0.65333333 0.50194553 0.52083333]]

 [[0.65333333 0.50194553 0.52083333]
  [0.84       0.12062257 0.76041667]]

 [[0.84       0.12062257 0.76041667]
  [1.         0.         1.        ]]

 [[1.         0.         1.        ]
  [0.76       0.24902724 0.83333333]]

 [[0.76       0.24902724 0.83333333]
  [0.53333333 0.39688716 0.65625   ]]

 [[0.53333333 0.39688716 0.65625   ]
  [0.30666667 0.71595331 0.41666667]]]


In [13]:
# Display the target variable: one normalized value per sequence, where
# y_seq[i] is the observation immediately after the window X_seq[i]
print(y_seq)

[0.35638558 0.36734817 0.65909728 1.         0.44579707 0.47901598
 0.43034169 0.        ]


## GRU Forward Pass Computation

The Gated Recurrent Unit (GRU) processes sequential data through a series of carefully designed operations. Each time step involves the following calculations:

### 1. Update Gate Computation

The update gate ($z_t$) determines how much of the hidden state is overwritten by the new candidate state; the complementary fraction $(1 - z_t)$ of the previous hidden state is retained:

$$z_t = \sigma(W_z \cdot [h_{t-1}, x_t] + b_z)$$

Where:
- $\sigma$ is the sigmoid activation function (output range [0,1])
- $W_z$ is the update gate weight matrix
- $h_{t-1}$ is the previous hidden state
- $x_t$ is the current input
- $b_z$ is the update gate bias vector

### 2. Reset Gate Computation

The reset gate ($r_t$) controls how much of the previous hidden state to consider when computing the candidate hidden state:

$$r_t = \sigma(W_r \cdot [h_{t-1}, x_t] + b_r)$$

Where:
- $W_r$ is the reset gate weight matrix
- $b_r$ is the reset gate bias vector

### 3. Candidate Hidden State Calculation
The candidate hidden state ($\tilde{h}_t$) proposes new content to potentially incorporate:

$$\tilde{h}_t = \tanh(W_h \cdot [r_t \odot h_{t-1}, x_t] + b_h)$$

Where:
- $\tanh$ is the hyperbolic tangent activation function (output range [-1,1])
- $\odot$ represents element-wise multiplication
- $W_h$ is the candidate hidden state weight matrix
- $b_h$ is the candidate hidden state bias vector

### 4. Hidden State Update
The final hidden state ($h_t$) is computed as a weighted combination of the previous hidden state and the candidate hidden state:

$$h_t = (1 - z_t) \odot h_{t-1} + z_t \odot \tilde{h}_t$$

This intuitive interpolation mechanism allows the GRU to adaptively:
- Retain important long-term dependencies when $z_t$ approaches 0
- Update with relevant new information when $z_t$ approaches 1
- Selectively utilize specific components from the previous state through the reset gate

The GRU's elegant design with just two gates achieves comparable performance to the LSTM while requiring fewer parameters, resulting in computational efficiency without sacrificing modeling capacity.

### Forward Computation for One Lookback Window

In [14]:
# Define activation functions
def sigmoid(x):
    """
    Logistic sigmoid 1 / (1 + exp(-x)), applied element-wise.

    Evaluated in a numerically stable form: only exp(-|x|) is ever computed,
    which lies in (0, 1], so large-magnitude inputs cannot overflow (the
    direct formula overflows in exp(-x) for very negative x).

    Parameters:
        x: Scalar or array-like of real numbers

    Returns:
        Values in [0, 1] with the same shape as x (a NumPy scalar when x is
        a scalar)
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x)   |   x < 0: e^x / (1 + e^x)  (algebraically equal)
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # [()] unwraps a 0-d array into a scalar and leaves n-d arrays unchanged
    return result[()]

def tanh(x):
    """Element-wise hyperbolic tangent of x (thin wrapper around np.tanh)."""
    squashed = np.tanh(x)
    return squashed

# Work through the first lookback window as a concrete example:
# the inputs are the weather data from days 1 and 2, the target is the power consumption on day 3
first_sequence, target = X_seq[0], y_seq[0]

print("Processing sequence:", first_sequence, sep="\n")
print("Target:", target)

# Zero-initialise the state and the gate placeholders (this is t=0, before any input is processed).
# Order: hidden state, update gate, reset gate, candidate hidden state.
# Each name receives its own array, so they never alias one another.
h_t, z_t, r_t, h_candidate = (np.zeros(n_hidden) for _ in range(4))

# Note: GRU does not have a cell state like LSTM

Processing sequence:
[[0.09333333 0.89883268 0.        ]
 [0.26666667 0.80544747 0.16666667]]
Target: 0.35638558094918593


In [15]:
# Show the hidden state before any time step has been processed
# (h_t was set to a zero vector of length n_hidden in the previous cell)
print("Initial state:")
print("h_0:", h_t)

Initial state:
h_0: [0. 0. 0. 0.]


In [16]:
# Process each time step in the sequence
for t in range(len(first_sequence)):
    step = t + 1                 # 1-based time step, used in every printed label
    x_t = first_sequence[t]      # Current input at time step t

    # Step 1: Concatenate input with previous hidden state.
    # The input comes FIRST, so the first n_features columns of every gate
    # matrix act on x_t and the remaining n_hidden columns act on the state.
    combined = np.concatenate([x_t, h_t])

    # Step 2: Calculate update gate
    z_t = sigmoid(np.dot(W_z, combined) + b_z)

    # Step 3: Calculate reset gate
    r_t = sigmoid(np.dot(W_r, combined) + b_r)

    # Step 4: Create reset hidden state (element-wise gating of the previous state)
    reset_h_t = r_t * h_t
    combined_reset = np.concatenate([x_t, reset_h_t])

    # Step 5: Calculate candidate hidden state
    h_tilde = tanh(np.dot(W_h, combined_reset) + b_h)

    # Step 6: Update hidden state: blend of previous state and candidate,
    # weighted element-wise by the update gate
    h_t_prev = h_t.copy()  # Save previous hidden state for visualization
    h_t = (1 - z_t) * h_t + z_t * h_tilde

    banner(title = f"--- Time step {step} (Day {step}) ---")

    # The labels are built as separate f-strings so the step number is really
    # substituted: a "{t+1}" written inside a quoted label that is itself only
    # the *value* being formatted is plain text and would be printed literally.
    report = [
        (f"Input x_{{{step}}}:", x_t),
        (f"Update gate z_{{{step}}}:", z_t),
        (f"Reset gate r_{{{step}}}:", r_t),
        (f"Candidate hidden state h_tilde_{{{step}}}:", h_tilde),
        (f"Hidden state h_{{{step}}}:", h_t),
        ("Change in hidden state:", h_t - h_t_prev),
    ]
    for label, value in report:
        print(f"{label:<40} {value}")

# Make prediction using the final hidden state (linear read-out, no activation)
y_pred = np.dot(W_y, h_t) + b_y
banner(title = "--- Final Prediction ---")
print(f"{'Predicted power consumption:':<40} {y_pred[0]}")
print(f"{'Actual power consumption:':<40} {target}")
print(f"{'Prediction error:':<40} {y_pred[0] - target}")
banner()

                          --- Time step 1 (Day 1) ---                           
Input x_{t+1}:                           [0.09333333 0.89883268 0.        ]
Update gate z_{t+1}:                     [0.50198779 0.50274456 0.49892038 0.49824495]
Reset gate r_{t+1}:                      [0.49803089 0.49996588 0.50119818 0.49761434]
Candidate hidden state h_tilde_{t+1}:    [ 0.01660329  0.01150079 -0.00703337 -0.00764535]
Hidden state h_{t+1}:                    [ 0.00833465  0.00578196 -0.00350909 -0.00380926]
Change in hidden state:                  [ 0.00833465  0.00578196 -0.00350909 -0.00380926]
                          --- Time step 2 (Day 2) ---                           
Input x_{t+1}:                           [0.26666667 0.80544747 0.16666667]
Update gate z_{t+1}:                     [0.5013713  0.50187951 0.4997982  0.49956964]
Reset gate r_{t+1}:                      [0.49797827 0.50018602 0.5010573  0.49846783]
Candidate hidden state h_tilde_{t+1}:    [ 0.01900245  0.00923645

## GRU State Extraction and Analysis

After implementing the GRU architecture and defining its parameters, we'll now examine how the hidden state evolves during the forward pass of sequential data. This analysis provides critical insights into how GRU cells maintain and update temporal information.

Let's extract the final hidden state after processing our first sequence, which becomes the initial state for the second sequence—demonstrating GRU's ability to carry relevant information forward in time:

In [17]:
# Initialize state once at the beginning (GRU only has hidden state)
h_t = np.zeros(n_hidden)                # Shape: [4]
print("Initial state before processing first sequence:")
print("h_0:", h_t)

# Process the first sequence
first_sequence = X_seq[0]           # Weather data from days 1 and 2
for t in range(len(first_sequence)):
    step = t + 1                    # 1-based index used in the printed labels
    x_t = first_sequence[t]

    # Concatenate input with previous hidden state (input first, state second)
    combined = np.concatenate([x_t, h_t])

    # Calculate update gate
    z_t = sigmoid(np.dot(W_z, combined) + b_z)

    # Calculate reset gate
    r_t = sigmoid(np.dot(W_r, combined) + b_r)

    # Apply reset gate to previous hidden state
    reset_h = r_t * h_t

    # Concatenate input with reset hidden state
    combined_reset = np.concatenate([x_t, reset_h])

    # Calculate candidate hidden state
    h_tilde = tanh(np.dot(W_h, combined_reset) + b_h)

    # Update hidden state
    h_t = (1 - z_t) * h_t + z_t * h_tilde

    banner(title = f"After processing Day {step}")
    # Build every label as its own f-string so the step number is substituted
    # in all four lines (a "{t+1}" inside a quoted label is printed literally).
    report = [
        (f"Update gate z_{{{step}}}:", z_t),
        (f"Reset gate r_{{{step}}}:", r_t),
        (f"Candidate hidden state h_tilde_{{{step}}}:", h_tilde),
        (f"Hidden state h_{{{step}}}:", h_t),
    ]
    for label, value in report:
        print(f"{label:<40} {value}")

banner()
print("\nFinal state after processing first sequence (Days 1-2):")
print("Final hidden state (h_2):", h_t)
print("\nThis value will be used as initial state for the second sequence (Days 2-3).")
banner()
# This final value would be used as initial state for the second sequence
# second_sequence = X_seq[1]  # Days 2-3
# The initial state for this sequence would be the final h_t from above

Initial state before processing first sequence:
h_0: [0. 0. 0. 0.]
                             After processing Day 1                             
Update gate z_{t+1}:                     [0.50198779 0.50274456 0.49892038 0.49824495]
Reset gate r_{t+1}:                      [0.49803089 0.49996588 0.50119818 0.49761434]
Candidate hidden state h_tilde_{1}:      [ 0.01660329  0.01150079 -0.00703337 -0.00764535]
Hidden state h_{1}:                      [ 0.00833465  0.00578196 -0.00350909 -0.00380926]
                             After processing Day 2                             
Update gate z_{t+1}:                     [0.5013713  0.50187951 0.4997982  0.49956964]
Reset gate r_{t+1}:                      [0.49797827 0.50018602 0.5010573  0.49846783]
Candidate hidden state h_tilde_{2}:      [ 0.01900245  0.00923645 -0.00362589 -0.00977049]
Hidden state h_{2}:                      [ 0.01368318  0.0075157  -0.00356746 -0.00678731]

Final state after processing first sequence (Days 1-2):
Fi

### Full Code to Process the Entire Data 

---

In [18]:
# Seed the global RNG so the noise term (and the weights drawn later) are reproducible
np.random.seed(42)

# Sample data: ten observations of each weather variable
temperature = np.array([18.2, 19.5, 20.1, 22.4, 23.8, 25.0, 23.2, 21.5, 19.8, 17.5])
humidity = np.array([65.2, 62.8, 58.5, 55.0, 45.2, 42.1, 48.5, 52.3, 60.5, 67.8])
wind_speed = np.array([5.2, 6.8, 8.5, 10.2, 12.5, 14.8, 13.2, 11.5, 9.2, 6.5])

# Feature matrix: one row per observation, columns = (temperature, humidity, wind speed)
X = np.column_stack([temperature, humidity, wind_speed])

# Target: power consumption (kWh) = linear combination of the features + Gaussian noise (std 5)
noise = np.random.normal(0, 5, len(temperature))
y = 2.5 * temperature - 0.5 * humidity + 1.2 * wind_speed + noise

# Create sequences with lookback of 2 (predict the step after the sequence)
def create_sequences(X, y, lookback=2):
    """
    Pair every run of `lookback` consecutive rows of X with the y value that follows it.

    Returns a tuple (windows, targets) of NumPy arrays: windows[i] is
    X[i : i + lookback] and targets[i] is y[i + lookback].
    """
    starts = range(len(X) - lookback)
    windows = [X[start:start + lookback] for start in starts]
    targets = [y[start + lookback] for start in starts]
    return np.array(windows), np.array(targets)

# Window the data exactly as generated above (no scaling is applied in this cell):
# two consecutive rows per window, target = the row that follows
X_sequences, y_sequences = create_sequences(X, y, lookback=2)

# GRU dimensions: inputs (temperature, humidity, wind speed), GRU units, outputs (power prediction)
n_features, n_hidden, n_output = 3, 4, 1

# Each gate matrix multiplies an [input, hidden] concatenation
gate_shape = (n_hidden, n_features + n_hidden)

# Small random weights, zero biases.
# NOTE: the draw order W_z -> W_r -> W_h -> W_y determines the values each
# matrix receives from the seeded generator; keep it unchanged.
W_z, b_z = np.random.randn(*gate_shape) * 0.01, np.zeros(n_hidden)            # update gate
W_r, b_r = np.random.randn(*gate_shape) * 0.01, np.zeros(n_hidden)            # reset gate
W_h, b_h = np.random.randn(*gate_shape) * 0.01, np.zeros(n_hidden)            # candidate hidden state
W_y, b_y = np.random.randn(n_output, n_hidden) * 0.01, np.zeros(n_output)     # output layer

# Define activation functions
def sigmoid(x):
    """
    Logistic sigmoid 1 / (1 + exp(-x)), applied element-wise.

    Evaluated in a numerically stable form: only exp(-|x|) is ever computed,
    which lies in (0, 1], so large-magnitude inputs cannot overflow (the
    direct formula overflows in exp(-x) for very negative x).

    Parameters:
        x: Scalar or array-like of real numbers

    Returns:
        Values in [0, 1] with the same shape as x (a NumPy scalar when x is
        a scalar)
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x)   |   x < 0: e^x / (1 + e^x)  (algebraically equal)
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # [()] unwraps a 0-d array into a scalar and leaves n-d arrays unchanged
    return result[()]

def tanh(x):
    """Element-wise hyperbolic tangent of x (thin wrapper around np.tanh)."""
    squashed = np.tanh(x)
    return squashed

# Process all sequences with state persistence
# The hidden state is created ONCE here and is never reset inside the loop, so
# the state left behind by one sequence is the starting state of the next one.
# Consecutive windows overlap, so a day shared by two windows is fed through
# the cell twice (last step of one window, first step of the next).
h_t = np.zeros(n_hidden)

# Window length is read from the data so the messages and the day arithmetic
# below stay correct if the sequences are rebuilt with another lookback
lookback = X_sequences.shape[1]

print("=" * 80)
print("GRU FORWARD PASS WITH STATE PERSISTENCE ACROSS SEQUENCES")
print("=" * 80)
print(f"Processing {len(X_sequences)} sequences with lookback={lookback}")
print(f"Initial state: h_0={h_t}")
print("=" * 80)

predictions = []   # one scalar prediction per sequence
all_states = []    # one list per sequence, holding one snapshot dict per time step

for seq_idx, sequence in enumerate(X_sequences):
    first_day = seq_idx + 1             # 1-based day of the first row in this window
    last_day = seq_idx + lookback       # 1-based day of the last row in this window
    target_day = last_day + 1           # the day being predicted

    print(f"\n{'=' * 30} SEQUENCE {seq_idx+1} {'=' * 30}")
    print(f"Days {first_day}-{last_day} → Predicting Day {target_day}")

    # Store states for this sequence
    sequence_states = []

    # Process each time step in the sequence
    for t in range(len(sequence)):
        x_t = sequence[t]
        day_num = seq_idx + t + 1

        # Concatenate input with previous hidden state (input first, state second)
        combined = np.concatenate([x_t, h_t])

        # Update gate
        z_t = sigmoid(np.dot(W_z, combined) + b_z)

        # Reset gate
        r_t = sigmoid(np.dot(W_r, combined) + b_r)

        # Reset hidden state
        reset_h = r_t * h_t

        # Candidate hidden state calculation
        # Use reset gate to control information from previous hidden state
        combined_reset = np.concatenate([x_t, reset_h])
        h_tilde = tanh(np.dot(W_h, combined_reset) + b_h)

        # Update hidden state (keep the old one for the "change" row below)
        h_t_prev = h_t.copy()
        h_t = (1 - z_t) * h_t + z_t * h_tilde

        # Store states; the hidden state is copied so the snapshot is
        # independent of the h_t name being rebound on later steps
        sequence_states.append({
            'day': day_num,
            'input': x_t,
            'update_gate': z_t,
            'reset_gate': r_t,
            'candidate_hidden': h_tilde,
            'hidden_state': h_t.copy()
        })

        # Print gate values and states for this time step
        print(f"\n--- Time step {t+1} (Day {day_num}) ---")
        print("Input Features:")
        input_table = [
            ["Temperature", f"{x_t[0]:.2f}°C"],
            ["Humidity", f"{x_t[1]:.2f}%"],
            ["Wind Speed", f"{x_t[2]:.2f} km/h"]
        ]
        print(tabulate(input_table, headers=["Feature", "Value"], tablefmt="grid"))

        print("\nGate Values:")
        gates_table = [
            ["Update Gate (z_t)", f"{z_t}"],
            ["Reset Gate (r_t)", f"{r_t}"],
            ["Candidate Hidden State (h_tilde)", f"{h_tilde}"]
        ]
        print(tabulate(gates_table, headers=["Gate", "Values"], tablefmt="grid"))

        print("\nState Updates:")
        state_table = [
            ["Previous Hidden State", f"{h_t_prev}"],
            ["New Hidden State", f"{h_t}"],
            ["Change in Hidden State", f"{h_t - h_t_prev}"]
        ]
        print(tabulate(state_table, headers=["State", "Values"], tablefmt="grid"))

    # Make prediction for this sequence (linear read-out of the last hidden state)
    y_pred = np.dot(W_y, h_t) + b_y
    predictions.append(y_pred[0])

    # Print prediction vs actual
    print(f"\n--- Prediction for Day {target_day} ---")
    pred_table = [
        ["Predicted Power Consumption", f"{y_pred[0]:.2f} kWh"],
        ["Actual Power Consumption", f"{y_sequences[seq_idx]:.2f} kWh"],
        ["Prediction Error", f"{y_pred[0] - y_sequences[seq_idx]:.2f} kWh"]
    ]
    print(tabulate(pred_table, headers=["Metric", "Value"], tablefmt="grid"))

    # Save states for this sequence
    all_states.append(sequence_states)

    print(f"\nFinal state after sequence {seq_idx+1}:")
    final_state_table = [
        ["Hidden State (h_t)", f"{h_t}"]
    ]
    print(tabulate(final_state_table, headers=["State", "Values"], tablefmt="grid"))

    if seq_idx < len(X_sequences) - 1:
        print("\n→ This state will be used as initial state for the next sequence")

# Print overall prediction performance
# (the weights are the random initial values; no training step appears in this cell)
print("\n" + "=" * 80)
print("OVERALL PREDICTION PERFORMANCE")
print("=" * 80)
errors = np.array(predictions) - y_sequences
mse = np.mean(errors ** 2)
mae = np.mean(np.abs(errors))

performance_table = [
    ["Mean Squared Error", f"{mse:.4f}"],
    ["Mean Absolute Error", f"{mae:.4f}"]
]
print(tabulate(performance_table, headers=["Metric", "Value"], tablefmt="grid"))

# Print a summary of how states evolved across all sequences
print("\n" + "=" * 80)
print("SUMMARY OF STATE EVOLUTION ACROSS ALL SEQUENCES")
print("=" * 80)

# Extract the day and the hidden state norm for every processed time step,
# in processing order (days shared by two windows therefore appear twice)
days = [
    state['day']
    for sequence_states in all_states
    for state in sequence_states
]
hidden_state_norms = [
    np.linalg.norm(state['hidden_state'])
    for sequence_states in all_states
    for state in sequence_states
]

state_evolution = [
    [day, f"{norm:.4f}"]
    for day, norm in zip(days, hidden_state_norms)
]

print(tabulate(state_evolution, 
               headers=["Day", "Hidden State Norm"], 
               tablefmt="grid"))

print("\nNote: The norm values show how the 'magnitude' of the hidden state changes over time")
print("Higher values indicate more information is being stored in the state")

GRU FORWARD PASS WITH STATE PERSISTENCE ACROSS SEQUENCES
Processing 8 sequences with lookback=2
Initial state: h_0=[0. 0. 0. 0.]

Days 1-2 → Predicting Day 3

--- Time step 1 (Day 1) ---
Input Features:
+-------------+-----------+
| Feature     | Value     |
| Temperature | 18.20°C   |
+-------------+-----------+
| Humidity    | 65.20%    |
+-------------+-----------+
| Wind Speed  | 5.20 km/h |
+-------------+-----------+

Gate Values:
+----------------------------------+---------------------------------------------------+
| Gate                             | Values                                            |
| Update Gate (z_t)                | [0.40723303 0.352452   0.47836158 0.56790618]     |
+----------------------------------+---------------------------------------------------+
| Reset Gate (r_t)                 | [0.48126476 0.40700083 0.58157082 0.4639311 ]     |
+----------------------------------+---------------------------------------------------+
| Candidate Hidden State 