## Understanding Lookback in RNN By Example

In [1]:
# =========================== # 
#     Environment Setup 
# =========================== # 
import numpy as np
import pandas as pd
import tensorflow as tf

---

In [2]:
# ==================================================#
#  Example: Simple temperature data (10 days)
# ==================================================#

# One reading per day, stored as a 1-D array so it can be sliced into windows.
temperatures = np.array([
    20, 22, 25, 23, 24,
    26, 25, 27, 28, 29,
])

# Pad each label to 35 characters so the two printed values line up.
print("The temperature data:".ljust(35), temperatures)
print("The temperature data shape:".ljust(35), temperatures.shape)

The temperature data:               [20 22 25 23 24 26 25 27 28 29]
The temperature data shape:         (10,)


In [8]:
# Let's set lookback = 3 (using 3 days to predict the next day)
lookback = 3

# Here's how sequences are created: a 3-day window slides over the 10 readings
# one day at a time, and the reading right after each window is its target.
# 10 readings - 3 lookback days = 7 (input, target) pairs.
# Input sequence (X)         | Target (y)
# [20, 22, 25]               | 23    (day 4)
# [22, 25, 23]               | 24    (day 5)
# [25, 23, 24]               | 26    (day 6)
# [23, 24, 26]               | 25    (day 7)
# [24, 26, 25]               | 27    (day 8)
# [26, 25, 27]               | 28    (day 9)
# [25, 27, 28]               | 29    (day 10)

[20 22 25 23 24 26 25 27 28 29]
(10,)


In [3]:
def create_sequences(data, lookback):
    """Slice ``data`` into overlapping windows and their next-step targets.

    Each window holds ``lookback`` consecutive items of ``data``; its target is
    the item immediately after the window. The window bounds and the lists
    collected so far are printed at every step so the construction can be
    followed line by line.

    Args:
        data: Indexable, sliceable sequence of observations (e.g. a 1-D array).
        lookback: Number of consecutive items in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays built from the collected windows and
        targets, one entry per window.
    """
    windows = []
    targets = []
    n_windows = len(data) - lookback
    for start in range(n_windows):
        stop = start + lookback  # index of the target, one past the window
        print(f"{start} {stop}")
        windows.append(data[start:stop])
        print(windows)
        targets.append(data[stop])
        print(targets)
    return np.array(windows), np.array(targets)

In [4]:
# Create sequences, reusing the notebook-level `lookback` instead of repeating the literal 3
X, y = create_sequences(temperatures, lookback=lookback)

0 3
[array([20, 22, 25])]
[23]
1 4
[array([20, 22, 25]), array([22, 25, 23])]
[23, 24]
2 5
[array([20, 22, 25]), array([22, 25, 23]), array([25, 23, 24])]
[23, 24, 26]
3 6
[array([20, 22, 25]), array([22, 25, 23]), array([25, 23, 24]), array([23, 24, 26])]
[23, 24, 26, 25]
4 7
[array([20, 22, 25]), array([22, 25, 23]), array([25, 23, 24]), array([23, 24, 26]), array([24, 26, 25])]
[23, 24, 26, 25, 27]
5 8
[array([20, 22, 25]), array([22, 25, 23]), array([25, 23, 24]), array([23, 24, 26]), array([24, 26, 25]), array([26, 25, 27])]
[23, 24, 26, 25, 27, 28]
6 9
[array([20, 22, 25]), array([22, 25, 23]), array([25, 23, 24]), array([23, 24, 26]), array([24, 26, 25]), array([26, 25, 27]), array([25, 27, 28])]
[23, 24, 26, 25, 27, 28, 29]


In [9]:
# Print each sequence and its target, numbering the pairs from 1
for seq_no, (window, target) in enumerate(zip(X, y), start=1):
    print(f"Sequence {seq_no}:")
    print(f"Input: {window} -> Target: {target}")

Sequence 1:
Input: [20 22 25] -> Target: 23
Sequence 2:
Input: [22 25 23] -> Target: 24
Sequence 3:
Input: [25 23 24] -> Target: 26
Sequence 4:
Input: [23 24 26] -> Target: 25
Sequence 5:
Input: [24 26 25] -> Target: 27
Sequence 6:
Input: [26 25 27] -> Target: 28
Sequence 7:
Input: [25 27 28] -> Target: 29


In [5]:
# Print the input sequences: each row is one window of consecutive temperatures
print(X)

[[20 22 25]
 [22 25 23]
 [25 23 24]
 [23 24 26]
 [24 26 25]
 [26 25 27]
 [25 27 28]]


In [6]:
# Print the shape of the input sequences: (number of sequences, lookback)
print(X.shape)

(7, 3)


In [7]:
# Print the target values and their shape (one target per input sequence)
print(y)
print(y.shape)

[23 24 26 25 27 28 29]
(7,)


In [8]:
# ======================================================== #
# Simulating a real dataset to enhance our understanding
# ======================================================== #

# We will create a sample data set: 10 hourly readings with two features and a target.

# One hourly timestamp per row. `periods=10` yields exactly 10 stamps, matching the
# 10 rows below (a start/end range of '2024-01-01'..'2024-01-10' would yield 217).
# The stamps are not attached to the frame, so it keeps its default 0..9 integer index.
dates = pd.date_range(start='2024-01-01', periods=10, freq='h')
data = pd.DataFrame({
    'temperature': [25, 24, 23, 26, 27, 28, 25, 24, 23, 22],
    'humidity': [60, 65, 70, 63, 58, 55, 57, 62, 68, 71],
    'target_temp': [24, 23, 26, 27, 28, 25, 24, 23, 22, 21]  # next hour's temperature
})

# Display information
print("Original Data:")
print(data.head(7))
print(data.shape)

Original Data:
   temperature  humidity  target_temp
0           25        60           24
1           24        65           23
2           23        70           26
3           26        63           27
4           27        58           28
5           28        55           25
6           25        57           24
(10, 3)


In [9]:
# Create sequences for a configurable lookback
# ---------------------------------------------
# NOTE: this reuses the name `create_sequences`, so it shadows the earlier
# array-based helper of the same name once this cell has run.

def create_sequences(data, lookback):
    """Build sliding-window inputs and targets from the sample DataFrame.

    Args:
        data: DataFrame with 'temperature', 'humidity' and 'target_temp' columns.
        lookback: Number of consecutive rows in each input window; must be >= 1.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays:
        ``X`` has shape (n_sequences, lookback, 2) and holds
        [temperature, humidity] for rows i .. i + lookback - 1;
        ``y`` has shape (n_sequences,) and holds 'target_temp' of row
        i + lookback, keeping that column's dtype.
        n_sequences is ``max(len(data) - lookback, 0)``.

    Raises:
        ValueError: If ``lookback`` is smaller than 1.

    NOTE(review): in the sample data 'target_temp'[k] equals 'temperature'[k + 1],
    so reading the target from row i + lookback points two rows past the last
    input row. The alignment is kept as is; confirm whether row
    i + lookback - 1 was intended.
    """
    if lookback < 1:
        raise ValueError(f"lookback must be at least 1, got {lookback}")

    # Extract the columns once instead of re-indexing the frame on every
    # iteration. Reading the target from its own column also avoids the dtype
    # upcast that row-wise `.iloc[row][col]` access applies to mixed-dtype frames.
    features = data[['temperature', 'humidity']].to_numpy()
    targets = data['target_temp'].to_numpy()

    # Pre-allocating keeps the 3-D shape even when no window fits.
    n_sequences = max(len(data) - lookback, 0)
    X = np.empty((n_sequences, lookback, features.shape[1]), dtype=features.dtype)
    for i in range(n_sequences):
        X[i] = features[i:i + lookback]

    # Targets are rows lookback .. len(data) - 1; copy so the result does not
    # alias the DataFrame's underlying buffer.
    y = targets[lookback:].copy()
    return X, y

In [10]:
# Set the lookback to 3 and create the sequences
lookback = 3
X, y = create_sequences(data, lookback)

# Print the input sequences: each block is one window of 3 rows of [temperature, humidity]

print(X)

[[[25 60]
  [24 65]
  [23 70]]

 [[24 65]
  [23 70]
  [26 63]]

 [[23 70]
  [26 63]
  [27 58]]

 [[26 63]
  [27 58]
  [28 55]]

 [[27 58]
  [28 55]
  [25 57]]

 [[28 55]
  [25 57]
  [24 62]]

 [[25 57]
  [24 62]
  [23 68]]]


In [11]:
# Show the shape of the input sequences: (sequences, lookback, features)
X.shape

(7, 3, 2)

In [13]:
# Print the target values and their shape (one target per sequence)
print(y)
print(y.shape)

[27 28 25 24 23 22 21]
(7,)


In [11]:
# Inspect the first (input window, target) pair on its own
print("\nFirst sequence (X[0]):")
print("Input features:")
print(X[0])
print("\nTarget (y[0]):", y[0])


First sequence (X[0]):
Input features:
[[25 60]
 [24 65]
 [23 70]]

Target (y[0]): 27


In [5]:
print("\nSequence shapes:")
print(f"X shape: {X.shape}")       # (samples, lookback, features)
print(f"y shape: {y.shape}")        # (samples,)

# Let's look at what each sequence represents
print("\nDetailed look at first few sequences:")
# Show at most three sequences; capping at len(X) avoids an IndexError when fewer exist.
for i in range(min(3, len(X))):
    print(f"\nSequence {i+1}:")
    print("Input:")
    print("Time    Temperature  Humidity")
    for j in range(lookback):
        # Row j of the window is labelled t-(lookback - j): the first row is
        # t-lookback and the last row is t-1.
        print(f"t-{lookback-j}    {X[i][j][0]:.1f}°C        {X[i][j][1]}%")
    print(f"Target: {y[i]}°C")


Sequence shapes:
X shape: (7, 3, 2)
y shape: (7,)

Detailed look at first few sequences:

Sequence 1:
Input:
Time    Temperature  Humidity
t-3    25.0°C        60%
t-2    24.0°C        65%
t-1    23.0°C        70%
Target: 27°C

Sequence 2:
Input:
Time    Temperature  Humidity
t-3    24.0°C        65%
t-2    23.0°C        70%
t-1    26.0°C        63%
Target: 28°C

Sequence 3:
Input:
Time    Temperature  Humidity
t-3    23.0°C        70%
t-2    26.0°C        63%
t-1    27.0°C        58%
Target: 25°C


## How Things are Done

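Looking at our example, with `lookback=3`, the first three target values (24, 23, 26) won't be used as targets because they correspond to time points where we don't have enough history to make a prediction. Note that `target_temp` already holds the *next* hour's temperature, so taking the target from row 3 means each window is paired with the temperature two hours after its last input row (row 3's `target_temp`, 27, is row 4's `temperature`).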

Let me show this visually with the data:

```python
data = pd.DataFrame({
    'temperature': [25, 24, 23, 26, 27, 28, 25, 24, 23, 22],
    'humidity':    [60, 65, 70, 63, 58, 55, 57, 62, 68, 71],
    'target_temp': [24, 23, 26, 27, 28, 25, 24, 23, 22, 21]
})

# With lookback=3:

# First possible prediction:
# Input: uses rows 0,1,2 (three timesteps)
X[0] = [
    [25, 60],  # t-3
    [24, 65],  # t-2
    [23, 70]   # t-1
]
y[0] = 27     # row 3's target_temp

# Second possible prediction:
X[1] = [
    [24, 65],  # t-3
    [23, 70],  # t-2
    [26, 63]   # t-1
]
y[1] = 28     # row 4's target_temp

```
So we can see:

  - target_temp values [24, 23, 26] aren't used as targets
  - First actual target used is 27 (row 3's target_temp)
  - This ensures we always have 3 complete timesteps of history before making a prediction

This is why our effective training data size will be smaller than our original dataset by the lookback period.

## Understand Size

Let's look at how the lookback affects the sizes of the train/test split:

In [14]:
# Sample data
data = pd.DataFrame({
    'temperature': [25, 24, 23, 26, 27, 28, 25, 24, 23, 22],
    'humidity': [60, 65, 70, 63, 58, 55, 57, 62, 68, 71],
    'target_temp': [24, 23, 26, 27, 28, 25, 24, 23, 22, 21]
})

# Original data length = 10
print(f"Original data length: {len(data)}")

lookback = 3
# After creating sequences, length will be (10 - 3) = 7
sequence_length = len(data) - lookback
print(f"After sequencing, length: {sequence_length}")

# If we do 80-20 split on sequences
# int() truncates: int(7 * 0.8) = int(5.6) = 5; the remaining 2 sequences form the test set.
# NOTE(review): train_size and test_size are computed here but not printed by this cell.
train_size = int(sequence_length * 0.8)  # 5 sequences for training
test_size = sequence_length - train_size  # 2 sequences for testing


Original data length: 10
After sequencing, length: 7


### Why X_train is 3D:

  1. **First dimension (samples)**: Number of sequences
  2. **Second dimension (timesteps)**: Lookback period
  3. **Third dimension (features)**: Number of features

```python
# Shape explanation
# X_train.shape == (5, 3, 2)
#   5 = number of training sequences
#   3 = lookback period (timesteps)
#   2 = number of features (temperature, humidity)
```