[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/danpele/Time-Series-Analysis/blob/main/chapter3_lecture_notebook.ipynb)

---

# Chapter 3: ARIMA Models for Non-Stationary Data

**Course:** Time Series Analysis and Forecasting  
**Program:** Bachelor program, Faculty of Cybernetics, Statistics and Economic Informatics, Bucharest University of Economic Studies, Romania  
**Academic Year:** 2025-2026

---

## Learning Objectives

By the end of this notebook, you will be able to:
1. Understand non-stationarity and its implications
2. Distinguish between deterministic and stochastic trends
3. Apply differencing to achieve stationarity
4. Perform unit root tests (ADF, KPSS)
5. Fit and interpret ARIMA(p,d,q) models
6. Generate forecasts with ARIMA models

## Setup and Imports

In [None]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Time series specific
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller, kpss, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from scipy import stats

# Plotting defaults applied to every figure in the notebook: transparent
# backgrounds, no grid, frameless legends, and only left/bottom spines.
plt.rcParams.update({
    'figure.figsize': (12, 5),
    'font.size': 11,
    'axes.facecolor': 'none',
    'figure.facecolor': 'none',
    'savefig.facecolor': 'none',
    'savefig.transparent': True,
    'legend.frameon': False,
    'axes.grid': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# Shared palette (IDA color scheme); cells below look colors up by name.
COLORS = dict(
    blue='#1A3A6E',
    red='#DC3545',
    green='#2E7D32',
    orange='#E67E22',
    gray='#666666',
)

print("All libraries loaded successfully!")

## 1. Non-Stationarity in Time Series

Many economic and financial time series are **non-stationary**:
- GDP, stock prices, exchange rates
- They exhibit trends, changing means, or growing variance

### Why Does It Matter?
- Standard ARMA models assume stationarity
- Regressing one non-stationary series on another can produce **spurious results** (apparently significant relationships between unrelated series)
- Statistical inference becomes invalid

In [None]:
# Simulate four processes that differ in how (non-)stationary they are
np.random.seed(42)
n = 200
t = np.arange(n)

# 1. Stationary AR(1) with coefficient 0.7; one shock is drawn per step
stationary = np.zeros(n)
for step in range(1, n):
    shock = np.random.randn()
    stationary[step] = 0.7 * stationary[step - 1] + shock

# 2. Deterministic linear trend plus white noise
det_trend = 0.5 + 0.1 * t + np.random.randn(n)

# 3. Random walk (stochastic trend): cumulative sum of shocks
random_walk = np.cumsum(np.random.randn(n))

# 4. Random walk with drift: each increment has mean 0.2
rw_drift = np.cumsum(0.2 + np.random.randn(n))

# One panel per process: (data, title, color)
processes = [
    (stationary, 'Stationary AR(1)', COLORS['blue']),
    (det_trend, 'Deterministic Trend', COLORS['green']),
    (random_walk, 'Random Walk (Stochastic Trend)', COLORS['red']),
    (rw_drift, 'Random Walk with Drift', COLORS['orange'])
]

fig, axes = plt.subplots(2, 2, figsize=(14, 8))
panels = axes.flatten()

for panel, (data, title, color) in zip(panels, processes):
    panel.plot(data, color=color, linewidth=1, label=title)
    panel.set_title(title, fontweight='bold')
    panel.set_xlabel('Time')

# Gather every panel's legend entries into one figure-level legend
handles, labels = [], []
for panel in panels:
    panel_handles, panel_labels = panel.get_legend_handles_labels()
    handles += panel_handles
    labels += panel_labels

plt.subplots_adjust(bottom=0.15)
fig.legend(handles, labels, loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.02))
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.show()

In [None]:
# Summary line printed after the simulation plots.
print("Key observation: Non-stationary series have time-varying properties")

In [None]:
# Show that the cross-sectional variance of a random walk grows with time
np.random.seed(123)
n_sims = 100
n_periods = 200

# Each row holds one simulated random walk (cumulative sum of shocks)
random_walks = np.zeros((n_sims, n_periods))
for sim in range(n_sims):
    random_walks[sim] = np.cumsum(np.random.randn(n_periods))

# Variance across simulations at each time point vs. the t * sigma^2 line
# (sigma^2 = 1, so the theoretical variance is simply 1, 2, ..., n_periods)
empirical_var = np.var(random_walks, axis=0)
theoretical_var = np.arange(1, n_periods + 1)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
paths_ax, variance_ax = axes

# Left panel: the first 20 sample paths
for path in random_walks[:20]:
    paths_ax.plot(path, alpha=0.3, linewidth=0.8)
paths_ax.set_title('Random Walk Sample Paths', fontweight='bold')
paths_ax.set_xlabel('Time')
paths_ax.set_ylabel('Value')

# Right panel: empirical variance against the theoretical line
variance_ax.plot(empirical_var, color=COLORS['blue'], linewidth=2, label='Empirical Variance')
variance_ax.plot(theoretical_var, color=COLORS['red'], linestyle='--', linewidth=2, label='Theoretical: t·σ²')
variance_ax.set_title('Variance Grows Linearly with Time', fontweight='bold')
variance_ax.set_xlabel('Time')
variance_ax.set_ylabel('Variance')

# Only the variance panel has labelled artists; reuse them for the figure legend
handles, labels = variance_ax.get_legend_handles_labels()

plt.subplots_adjust(bottom=0.15)
fig.legend(handles, labels, loc='lower center', ncol=2, bbox_to_anchor=(0.5, -0.02))
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.show()

In [None]:
# Compare empirical and theoretical variance at two checkpoints.
# Time point t sits at array position t - 1; the theoretical value equals t.
for horizon in (50, 200):
    print(f"Variance at t={horizon}: Empirical={empirical_var[horizon - 1]:.2f}, Theoretical={horizon:.2f}")

In [None]:
# Differencing demo: compare a random walk with its first difference
np.random.seed(42)
n = 300

# Random walk = running sum of white-noise shocks (an I(1) process)
eps = np.random.randn(n)
random_walk = np.cumsum(eps)

# First difference of the random walk
diff1 = np.diff(random_walk)

fig, axes = plt.subplots(2, 2, figsize=(14, 8))
(level_ax, level_acf_ax), (diff_ax, diff_acf_ax) = axes

# Top row: the level series and its ACF
level_ax.plot(random_walk, color=COLORS['blue'], linewidth=1, label='Random Walk')
level_ax.set_title('Original: Random Walk Y_t (Non-stationary)', fontweight='bold')
level_ax.set_xlabel('Time')

plot_acf(random_walk, ax=level_acf_ax, lags=30, color=COLORS['blue'])
level_acf_ax.set_title('ACF of Y_t (Slow decay = Non-stationary)', fontweight='bold')

# Bottom row: the differenced series (with a zero reference line) and its ACF
diff_ax.plot(diff1, color=COLORS['green'], linewidth=1, label='ΔY_t')
diff_ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)
diff_ax.set_title('First Difference: ΔY_t = ε_t (Stationary!)', fontweight='bold')
diff_ax.set_xlabel('Time')

plot_acf(diff1, ax=diff_acf_ax, lags=30, color=COLORS['green'])
diff_acf_ax.set_title('ACF of ΔY_t (White noise = Stationary)', fontweight='bold')

# Build one figure-level legend from the two series panels
handles, labels = [], []
for series_ax in (level_ax, diff_ax):
    ax_handles, ax_labels = series_ax.get_legend_handles_labels()
    handles += ax_handles
    labels += ax_labels

plt.subplots_adjust(bottom=0.12)
fig.legend(handles, labels, loc='lower center', ncol=2, bbox_to_anchor=(0.5, -0.02))
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.show()

In [None]:
# Text summary of the differencing demo (no interpolation needed, so plain strings)
print("\nKey insight: One difference transforms I(1) to I(0)")
print("ΔY_t = Y_t - Y_{t-1} = ε_t (white noise)")

## 3. Integrated Processes

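A time series $Y_t$ is **integrated of order d**, written $Y_t \sim I(d)$, if $d$ is the smallest number of differences needed to make it stationary:
- $\Delta^{d-1} Y_t$ is still non-stationary (for $d \geq 1$)
- $\Delta^d Y_t = (1-L)^d Y_t$ is stationary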

### Common Cases
- **I(0)**: Stationary (ARMA)
- **I(1)**: First difference is stationary (most common for economic data)
- **I(2)**: Second difference is stationary (rare)

In [None]:
# Differencing demo: compare a random walk with its first difference
np.random.seed(42)
n = 300

# Random walk = running sum of white-noise shocks (an I(1) process)
eps = np.random.randn(n)
random_walk = np.cumsum(eps)

# First difference of the random walk
diff1 = np.diff(random_walk)

fig, axes = plt.subplots(2, 2, figsize=(14, 8))
(level_ax, level_acf_ax), (diff_ax, diff_acf_ax) = axes

# Legends are placed just below each series panel
below_axes = dict(loc='upper center', bbox_to_anchor=(0.5, -0.12), frameon=False)

# Top row: the level series and its ACF
level_ax.plot(random_walk, color=COLORS['blue'], linewidth=1, label='Random Walk')
level_ax.set_title('Original: Random Walk Y_t (Non-stationary)', fontweight='bold')
level_ax.set_xlabel('Time')
level_ax.legend(**below_axes)

plot_acf(random_walk, ax=level_acf_ax, lags=30, color=COLORS['blue'])
level_acf_ax.set_title('ACF of Y_t (Slow decay = Non-stationary)', fontweight='bold')

# Bottom row: the differenced series (with a zero reference line) and its ACF
diff_ax.plot(diff1, color=COLORS['green'], linewidth=1, label='ΔY_t')
diff_ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)
diff_ax.set_title('First Difference: ΔY_t = ε_t (Stationary!)', fontweight='bold')
diff_ax.set_xlabel('Time')
diff_ax.legend(**below_axes)

plot_acf(diff1, ax=diff_acf_ax, lags=30, color=COLORS['green'])
diff_acf_ax.set_title('ACF of ΔY_t (White noise = Stationary)', fontweight='bold')

plt.tight_layout()
plt.show()

print("\nKey insight: One difference transforms I(1) to I(0)")
print("ΔY_t = Y_t - Y_{t-1} = ε_t (white noise)")

## 4. The Difference Operator

### First Difference
$$\Delta Y_t = Y_t - Y_{t-1} = (1-L)Y_t$$

### Second Difference
$$\Delta^2 Y_t = \Delta(\Delta Y_t) = (1-L)^2 Y_t = Y_t - 2Y_{t-1} + Y_{t-2}$$

In [None]:
# Worked example of the first and second difference on a short series
Y = pd.Series([100, 102, 105, 103, 108, 112, 110, 115])

# The second difference is the first difference applied twice
first_difference = Y.diff(1)
second_difference = first_difference.diff(1)

df = pd.DataFrame({
    'Y_t': Y,
    'ΔY_t = Y_t - Y_{t-1}': first_difference,
    'Δ²Y_t': second_difference,
})

print("Differencing Examples:")
print("=" * 60)
print(df.to_string())
print("\nNote: Each difference loses one observation")

## 5. Unit Root Tests

### Augmented Dickey-Fuller (ADF) Test
- $H_0$: Unit root exists (non-stationary)
- $H_1$: No unit root (stationary)
- Reject $H_0$ if test statistic < critical value (more negative)

### KPSS Test
- $H_0$: Series is stationary
- $H_1$: Series has unit root
- Opposite null hypothesis to ADF!

In [None]:
def run_unit_root_tests(series, name):
    """Run the ADF and KPSS tests on ``series`` and print a short report.

    Parameters
    ----------
    series : array-like
        Data passed unchanged to ``adfuller`` and ``kpss``.
    name : str
        Label shown in the report header.

    Returns
    -------
    tuple of (str, str)
        ``(adf_conclusion, kpss_conclusion)``; each is "STATIONARY" or
        "NON-STATIONARY", decided by comparing the test's p-value with 0.05.
    """
    def _print_critical_values(critical_values):
        # One line per significance level, four decimals each
        print(f"  Critical Values:")
        for level, threshold in critical_values.items():
            print(f"    {level}: {threshold:.4f}")

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Unit Root Tests for: {name}")
    print(banner)

    # ADF: the null is a unit root, so a small p-value points to stationarity
    adf_result = adfuller(series, autolag='AIC')
    adf_stat, adf_pvalue, adf_critical = adf_result[0], adf_result[1], adf_result[4]
    print(f"\nADF Test (H0: Unit root exists)")
    print(f"  Test Statistic: {adf_stat:.4f}")
    print(f"  p-value: {adf_pvalue:.6f}")
    _print_critical_values(adf_critical)
    if adf_pvalue < 0.05:
        adf_conclusion = "STATIONARY"
    else:
        adf_conclusion = "NON-STATIONARY"
    print(f"  Conclusion: {adf_conclusion}")

    # KPSS: the null is stationarity, so a large p-value points to stationarity
    kpss_result = kpss(series, regression='c', nlags='auto')
    kpss_stat, kpss_pvalue, kpss_critical = kpss_result[0], kpss_result[1], kpss_result[3]
    print(f"\nKPSS Test (H0: Series is stationary)")
    print(f"  Test Statistic: {kpss_stat:.4f}")
    print(f"  p-value: {kpss_pvalue:.4f}")
    _print_critical_values(kpss_critical)
    if kpss_pvalue > 0.05:
        kpss_conclusion = "STATIONARY"
    else:
        kpss_conclusion = "NON-STATIONARY"
    print(f"  Conclusion: {kpss_conclusion}")

    return adf_conclusion, kpss_conclusion

In [None]:
# Simulate an ARIMA(1,1,1) series by integrating an ARMA(1,1) process
np.random.seed(42)
n = 300
phi, theta = 0.6, 0.4

# ARMA(1,1) for the differences: AR term on the previous difference,
# MA term on the previous shock (the first value stays at 0)
eps = np.random.randn(n)
diff_y = np.zeros(n)
for t in range(1, n):
    ar_part = phi * diff_y[t-1]
    diff_y[t] = ar_part + eps[t] + theta * eps[t-1]

# Integrate once (cumulative sum) to obtain the I(1) level series
y_arima = np.cumsum(diff_y)

fig, axes = plt.subplots(2, 2, figsize=(14, 8))
(level_ax, level_acf_ax), (diff_ax, diff_acf_ax) = axes

# Top row: level series and its ACF
level_ax.plot(y_arima, color=COLORS['blue'], linewidth=1, label='ARIMA(1,1,1)')
level_ax.set_title('ARIMA(1,1,1) Process (Non-stationary)', fontweight='bold')
level_ax.set_xlabel('Time')

plot_acf(y_arima, ax=level_acf_ax, lags=30, color=COLORS['blue'])
level_acf_ax.set_title('ACF of Y_t (Slow decay)', fontweight='bold')

# Bottom row: differenced series (with a zero reference line) and its ACF
diff_ax.plot(diff_y, color=COLORS['green'], linewidth=1, label='ΔY_t')
diff_ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)
diff_ax.set_title('First Difference ΔY_t ~ ARMA(1,1)', fontweight='bold')
diff_ax.set_xlabel('Time')

plot_acf(diff_y, ax=diff_acf_ax, lags=30, color=COLORS['green'])
diff_acf_ax.set_title('ACF of ΔY_t (Stationary pattern)', fontweight='bold')

# Build one figure-level legend from the two series panels
handles, labels = [], []
for series_ax in (level_ax, diff_ax):
    ax_handles, ax_labels = series_ax.get_legend_handles_labels()
    handles += ax_handles
    labels += ax_labels

plt.subplots_adjust(bottom=0.12)
fig.legend(handles, labels, loc='lower center', ncol=2, bbox_to_anchor=(0.5, -0.02))
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.show()

In [None]:
# Report the phi and theta values used in the simulation, plus the differencing order.
print(f"True parameters: φ = {phi}, θ = {theta}, d = 1")

In [None]:
# Unit root tests on a simulated random walk and on its first difference
np.random.seed(42)
rw = np.cumsum(np.random.randn(200))
rw_diff = np.diff(rw)

# Level series first, then the differenced series
for series, label in ((rw, "Random Walk (Y_t)"), (rw_diff, "Differenced Random Walk")):
    run_unit_root_tests(series, label)

rule = "=" * 60
print()
print(rule)
print("Summary: Random walk is I(1) - one difference makes it stationary")
print(rule)

## 6. ARIMA(p,d,q) Models

An **ARIMA(p,d,q)** model combines:
- **AR(p)**: Autoregressive component
- **I(d)**: Integration (differencing)
- **MA(q)**: Moving average component

$$\phi(L)(1-L)^d Y_t = c + \theta(L)\varepsilon_t$$

### Special Cases
- ARIMA(p,0,q) = ARMA(p,q)
- ARIMA(0,1,0) = Random walk
- ARIMA(0,1,1) = Simple exponential smoothing

In [None]:
# Simulate an ARIMA(1,1,1) series by integrating an ARMA(1,1) process
np.random.seed(42)
n = 300
phi, theta = 0.6, 0.4

# ARMA(1,1) for the differences: AR term on the previous difference,
# MA term on the previous shock (the first value stays at 0)
eps = np.random.randn(n)
diff_y = np.zeros(n)
for t in range(1, n):
    ar_part = phi * diff_y[t-1]
    diff_y[t] = ar_part + eps[t] + theta * eps[t-1]

# Integrate once (cumulative sum) to obtain the I(1) level series
y_arima = np.cumsum(diff_y)

fig, axes = plt.subplots(2, 2, figsize=(14, 8))
(level_ax, level_acf_ax), (diff_ax, diff_acf_ax) = axes

# Legends are placed just below each series panel
below_axes = dict(loc='upper center', bbox_to_anchor=(0.5, -0.12), frameon=False)

# Top row: level series and its ACF
level_ax.plot(y_arima, color=COLORS['blue'], linewidth=1, label='ARIMA(1,1,1)')
level_ax.set_title('ARIMA(1,1,1) Process (Non-stationary)', fontweight='bold')
level_ax.set_xlabel('Time')
level_ax.legend(**below_axes)

plot_acf(y_arima, ax=level_acf_ax, lags=30, color=COLORS['blue'])
level_acf_ax.set_title('ACF of Y_t (Slow decay)', fontweight='bold')

# Bottom row: differenced series (with a zero reference line) and its ACF
diff_ax.plot(diff_y, color=COLORS['green'], linewidth=1, label='ΔY_t')
diff_ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)
diff_ax.set_title('First Difference ΔY_t ~ ARMA(1,1)', fontweight='bold')
diff_ax.set_xlabel('Time')
diff_ax.legend(**below_axes)

plot_acf(diff_y, ax=diff_acf_ax, lags=30, color=COLORS['green'])
diff_acf_ax.set_title('ACF of ΔY_t (Stationary pattern)', fontweight='bold')

plt.tight_layout()
plt.show()

print(f"True parameters: φ = {phi}, θ = {theta}, d = 1")

## 7. ARIMA Model Estimation

In [None]:
# Fit ARIMA(1,1,1) to our simulated data
model = ARIMA(y_arima, order=(1, 1, 1))
results = model.fit()

print("ARIMA(1,1,1) Estimation Results")
print("=" * 60)
print(results.summary())

In [None]:
# Compare estimated vs true parameters
print("\nParameter Comparison:")
print("=" * 40)
print(f"{'Parameter':<15} {'True':>10} {'Estimated':>12}")
print("-" * 40)
print(f"{'AR(1) φ':<15} {phi:>10.4f} {results.arparams[0]:>12.4f}")
print(f"{'MA(1) θ':<15} {theta:>10.4f} {results.maparams[0]:>12.4f}")
print("-" * 40)

## 8. Model Selection: Determining p, d, q

### Step 1: Determine d
- Use unit root tests (ADF, KPSS)
- Difference until stationary
- Typically d = 1 for economic data

### Step 2: Determine p and q
- Examine ACF/PACF of differenced series
- Use information criteria (AIC, BIC)

In [None]:
# Model selection using information criteria:
# fit each candidate order on y_arima and track the lowest AIC and BIC.
print("Model Comparison (comparing different ARIMA specifications):")
print("=" * 60)
print(f"{'Model':<20} {'AIC':>12} {'BIC':>12}")
print("-" * 60)

orders = [
    (0, 1, 0),  # Random walk
    (1, 1, 0),  # ARI(1,1)
    (0, 1, 1),  # IMA(1,1)
    (1, 1, 1),  # ARIMA(1,1,1)
    (2, 1, 0),  # ARI(2,1)
    (0, 1, 2),  # IMA(2,1)
    (2, 1, 1),  # ARIMA(2,1,1)
]

best_aic = float('inf')
best_bic = float('inf')
best_model_aic = None
best_model_bic = None

for order in orders:
    model_name = f"ARIMA{order}"
    try:
        model = ARIMA(y_arima, order=order)
        res = model.fit()
    except Exception as exc:
        # Keep comparing the remaining candidates, but say which one failed and why
        # instead of silently dropping it (a bare `except:` would also swallow
        # KeyboardInterrupt and hide genuine bugs).
        print(f"{model_name:<20} fit failed ({type(exc).__name__}: {exc})")
        continue

    print(f"{model_name:<20} {res.aic:>12.2f} {res.bic:>12.2f}")

    if res.aic < best_aic:
        best_aic = res.aic
        best_model_aic = model_name
    if res.bic < best_bic:
        best_bic = res.bic
        best_model_bic = model_name

print("-" * 60)
print(f"Best by AIC: {best_model_aic}")
print(f"Best by BIC: {best_model_bic}")

In [None]:
# Residual diagnostics for the fitted model held in `results`
residuals = results.resid

fig, axes = plt.subplots(2, 2, figsize=(14, 8))
(series_ax, hist_ax), (acf_ax, qq_ax) = axes

# Residuals against time, with a zero reference line
series_ax.plot(residuals, color=COLORS['blue'], linewidth=0.5, label='Residuals')
series_ax.axhline(y=0, color='red', linestyle='--')
series_ax.set_title('Residuals Over Time', fontweight='bold')
series_ax.set_xlabel('Time')

# Density histogram overlaid with a normal pdf using the residual mean and std
hist_ax.hist(residuals, bins=30, color=COLORS['blue'], edgecolor='black',
             alpha=0.7, density=True, label='Residuals')
x = np.linspace(residuals.min(), residuals.max(), 100)
normal_density = stats.norm.pdf(x, residuals.mean(), residuals.std())
hist_ax.plot(x, normal_density, color=COLORS['red'], linewidth=2, label='Normal')
hist_ax.set_title('Residual Distribution', fontweight='bold')

# Autocorrelation of the residuals
plot_acf(residuals, ax=acf_ax, lags=20, color=COLORS['blue'])
acf_ax.set_title('ACF of Residuals', fontweight='bold')

# Q-Q plot: ordered residuals against normal quantiles, plus the fitted line
(osm, osr), (slope, intercept, r) = stats.probplot(residuals, dist="norm")
qq_ax.scatter(osm, osr, color=COLORS['blue'], s=20, alpha=0.5, label='Sample')
qq_ax.plot(osm, slope*osm + intercept, color=COLORS['red'], linewidth=2, label='Theoretical')
qq_ax.set_title('Q-Q Plot', fontweight='bold')
qq_ax.set_xlabel('Theoretical Quantiles')
qq_ax.set_ylabel('Sample Quantiles')

# One figure-level legend built from the panels that have labelled artists
handles, labels = [], []
for labelled_ax in (series_ax, hist_ax, qq_ax):
    ax_handles, ax_labels = labelled_ax.get_legend_handles_labels()
    handles += ax_handles
    labels += ax_labels

plt.subplots_adjust(bottom=0.12)
fig.legend(handles, labels, loc='lower center', ncol=5, bbox_to_anchor=(0.5, -0.02))
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.show()

In [None]:
# Install pmdarima if not available
try:
    import pmdarima as pm
    print("pmdarima is available")
except ImportError:
    print("Installing pmdarima...")
    !pip install pmdarima -q
    import pmdarima as pm
    print("pmdarima installed successfully")

In [None]:
# Let auto_arima search for the best non-seasonal model on the simulated series
import pmdarima as pm

# Search settings: p and q start at 0 and go up to 3; d=None lets auto_arima
# choose the differencing order itself
search_options = dict(
    start_p=0, start_q=0,
    max_p=3, max_q=3,
    d=None,
    seasonal=False,
    stepwise=True,
    suppress_warnings=True,
    trace=True,
)
auto_model = pm.auto_arima(y_arima, **search_options)

rule = "=" * 60
print("\n" + rule)
print("Auto-ARIMA Selected Model:")
print(rule)
print(auto_model.summary())

In [None]:
# Generate forecasts from the fitted model in `results`
forecast_steps = 50
forecast = results.get_forecast(steps=forecast_steps)
forecast_mean = forecast.predicted_mean
forecast_ci = forecast.conf_int()

# Handle both DataFrame and numpy array formats
if hasattr(forecast_ci, 'iloc'):
    ci_lower = forecast_ci.iloc[:, 0]
    ci_upper = forecast_ci.iloc[:, 1]
else:
    ci_lower = forecast_ci[:, 0]
    ci_upper = forecast_ci[:, 1]

# Derive the x positions from the series length instead of hard-coding 200/300,
# so the plot stays aligned if the simulated sample size changes.
n_obs = len(y_arima)
history_start = max(n_obs - 100, 0)

# Plot
fig, ax = plt.subplots(figsize=(14, 6))

# Historical data (last 100 points, or the whole series if it is shorter)
ax.plot(range(history_start, n_obs), y_arima[history_start:],
        color=COLORS['blue'], linewidth=1, label='Historical')

# Forecasts start right after the last observation
forecast_index = range(n_obs, n_obs + forecast_steps)
ax.plot(forecast_index, forecast_mean, color=COLORS['red'], linewidth=2, label='Forecast')

# Confidence interval
ax.fill_between(forecast_index, ci_lower, ci_upper,
                color=COLORS['red'], alpha=0.2, label='95% CI')

# Vertical line marking where the forecast begins
ax.axvline(x=n_obs, color='black', linestyle='-', alpha=0.3)
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.set_title('ARIMA(1,1,1) Forecasts with 95% Confidence Interval', fontweight='bold')

# Get handles and labels for fig.legend
handles, labels = ax.get_legend_handles_labels()

plt.subplots_adjust(bottom=0.15)
fig.legend(handles, labels, loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.02))
plt.tight_layout(rect=[0, 0.08, 1, 1])
plt.show()

In [None]:
# Width of the forecast interval at the first and last horizon.
# pandas objects are indexed by position through .iloc; arrays are indexed directly.
positional = hasattr(ci_upper, 'iloc')
upper_values = ci_upper.iloc if positional else ci_upper
lower_values = ci_lower.iloc if positional else ci_lower

ci_width_1 = upper_values[0] - lower_values[0]
ci_width = upper_values[-1] - lower_values[-1]

print(f"\nForecast Properties:")
print(f"- 95% CI width at h=1: {ci_width_1:.4f}")
print(f"- 95% CI width at h={forecast_steps}: {ci_width:.4f}")
print(f"- CI grows because I(1) processes have unbounded forecast variance")

In [None]:
# Residual diagnostics for the fitted model held in `results`
residuals = results.resid

fig, axes = plt.subplots(2, 2, figsize=(14, 8))
(series_ax, hist_ax), (acf_ax, qq_ax) = axes

# Residuals against time, with a zero reference line
series_ax.plot(residuals, color=COLORS['blue'], linewidth=0.5, label='Residuals')
series_ax.axhline(y=0, color='red', linestyle='--')
series_ax.set_title('Residuals Over Time', fontweight='bold')
series_ax.set_xlabel('Time')
series_ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.12), frameon=False)

# Density histogram overlaid with a normal pdf using the residual mean and std
hist_ax.hist(residuals, bins=30, color=COLORS['blue'], edgecolor='black',
             alpha=0.7, density=True, label='Residuals')
x = np.linspace(residuals.min(), residuals.max(), 100)
normal_density = stats.norm.pdf(x, residuals.mean(), residuals.std())
hist_ax.plot(x, normal_density, color=COLORS['red'], linewidth=2, label='Normal')
hist_ax.set_title('Residual Distribution', fontweight='bold')
hist_ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.12), ncol=2, frameon=False)

# Autocorrelation of the residuals
plot_acf(residuals, ax=acf_ax, lags=20, color=COLORS['blue'])
acf_ax.set_title('ACF of Residuals', fontweight='bold')

# Q-Q plot: ordered residuals against normal quantiles, plus the fitted line
(osm, osr), (slope, intercept, r) = stats.probplot(residuals, dist="norm")
qq_ax.scatter(osm, osr, color=COLORS['blue'], s=20, alpha=0.5, label='Sample')
qq_ax.plot(osm, slope*osm + intercept, color=COLORS['red'], linewidth=2, label='Theoretical')
qq_ax.set_title('Q-Q Plot', fontweight='bold')
qq_ax.set_xlabel('Theoretical Quantiles')
qq_ax.set_ylabel('Sample Quantiles')
qq_ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.12), ncol=2, frameon=False)

plt.tight_layout()
plt.show()

In [None]:
# Load US GDP from FRED; if anything goes wrong, fall back to simulated data
try:
    import pandas_datareader.data as web
    gdp = web.DataReader("GDP", "fred", "2000-01-01", "2024-12-31")["GDP"]
    print(f"GDP data loaded: {len(gdp)} observations")
except Exception as e:
    print(f"Could not fetch GDP data: {e}")
    # Fallback: exponential trend plus noise whose scale is 2% of the trend,
    # indexed by 100 quarter-end dates starting in 2000
    import pandas as pd
    np.random.seed(42)
    n = 100
    trend = np.exp(0.005 * np.arange(n) + 9.5)
    noise = np.random.randn(n) * 0.02 * trend
    gdp_values = trend + noise
    dates = pd.date_range("2000-01-01", periods=n, freq="QE")
    gdp = pd.Series(gdp_values, index=dates)
    print(f"Using simulated GDP data: {len(gdp)} observations")

In [None]:
# Plot GDP in levels and in logs, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
level_ax, log_ax = axes

# Flatten to a 1-D array when the object exposes .values; otherwise use it as is
if hasattr(gdp, 'values'):
    gdp_values = gdp.values.flatten()
else:
    gdp_values = gdp

# Left panel: GDP level
level_ax.plot(gdp.index, gdp_values, color=COLORS['blue'], linewidth=1, label='GDP')
level_ax.set_title('US GDP (Billions of $)', fontweight='bold')
level_ax.set_xlabel('Date')

# Right panel: natural log of GDP
log_gdp = np.log(gdp_values)
log_ax.plot(gdp.index, log_gdp, color=COLORS['green'], linewidth=1, label='Log GDP')
log_ax.set_title('Log GDP (for modeling)', fontweight='bold')
log_ax.set_xlabel('Date')

# One figure-level legend built from both panels
handles, labels = [], []
for panel in (level_ax, log_ax):
    panel_handles, panel_labels = panel.get_legend_handles_labels()
    handles += panel_handles
    labels += panel_labels

plt.subplots_adjust(bottom=0.15)
fig.legend(handles, labels, loc='lower center', ncol=2, bbox_to_anchor=(0.5, -0.02))
plt.tight_layout(rect=[0, 0.08, 1, 1])
plt.show()

## 11. Forecasting with ARIMA

Key property for I(1) series: **Forecast intervals grow with horizon**

In [None]:
# Generate forecasts from the fitted model in `results`
forecast_steps = 50
forecast = results.get_forecast(steps=forecast_steps)
forecast_mean = forecast.predicted_mean
forecast_ci = forecast.conf_int()

# Handle both DataFrame and numpy array formats
if hasattr(forecast_ci, 'iloc'):
    ci_lower = forecast_ci.iloc[:, 0]
    ci_upper = forecast_ci.iloc[:, 1]
else:
    ci_lower = forecast_ci[:, 0]
    ci_upper = forecast_ci[:, 1]

# Derive the x positions from the series length instead of hard-coding 200/300,
# so the plot stays aligned if the simulated sample size changes.
n_obs = len(y_arima)
history_start = max(n_obs - 100, 0)

# Plot
fig, ax = plt.subplots(figsize=(14, 6))

# Historical data (last 100 points, or the whole series if it is shorter)
ax.plot(range(history_start, n_obs), y_arima[history_start:],
        color=COLORS['blue'], linewidth=1, label='Historical')

# Forecasts start right after the last observation
forecast_index = range(n_obs, n_obs + forecast_steps)
ax.plot(forecast_index, forecast_mean, color=COLORS['red'], linewidth=2, label='Forecast')

# Confidence interval
ax.fill_between(forecast_index, ci_lower, ci_upper,
                color=COLORS['red'], alpha=0.2, label='95% CI')

# Vertical line marking where the forecast begins
ax.axvline(x=n_obs, color='black', linestyle='-', alpha=0.3)
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.set_title('ARIMA(1,1,1) Forecasts with 95% Confidence Interval', fontweight='bold')
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=4, frameon=False)
plt.tight_layout()
plt.show()

# CI width analysis: interval width at the first and last forecast horizon
if hasattr(ci_upper, 'iloc'):
    ci_width = ci_upper.iloc[-1] - ci_lower.iloc[-1]
    ci_width_1 = ci_upper.iloc[0] - ci_lower.iloc[0]
else:
    ci_width = ci_upper[-1] - ci_lower[-1]
    ci_width_1 = ci_upper[0] - ci_lower[0]

print(f"\nForecast Properties:")
print(f"- 95% CI width at h=1: {ci_width_1:.4f}")
print(f"- 95% CI width at h={forecast_steps}: {ci_width:.4f}")
print(f"- CI grows because I(1) processes have unbounded forecast variance")

## 12. Real Data Example: GDP

In [None]:
# Forecast GDP
forecast_periods = 12  # 3 years quarterly

if 'auto_gdp' not in globals():
    # This cell sits before the cell that fits `auto_gdp`, so on a fresh
    # top-to-bottom run the model does not exist yet. Skip with a clear
    # message instead of stopping the notebook with a NameError.
    print("Skipping GDP forecast: `auto_gdp` is not defined yet. "
          "Run the cell that fits the ARIMA model to log GDP, then re-run this cell.")
else:
    fc, conf_int = auto_gdp.predict(n_periods=forecast_periods, return_conf_int=True)

    # Convert back from log
    gdp_forecast = np.exp(fc)
    gdp_ci_lower = np.exp(conf_int[:, 0])
    gdp_ci_upper = np.exp(conf_int[:, 1])

    # Plot
    fig, ax = plt.subplots(figsize=(14, 6))

    # Historical (last 40 observations)
    ax.plot(gdp.index[-40:], gdp_values[-40:], color=COLORS['blue'], linewidth=1.5, label='Historical GDP')

    # Forecast dates: quarter-end dates generated from the last observed date;
    # the first generated date is dropped by the [1:] slice
    last_date = gdp.index[-1]
    forecast_dates = pd.date_range(start=last_date, periods=forecast_periods+1, freq='QE')[1:]

    ax.plot(forecast_dates, gdp_forecast, color=COLORS['red'], linewidth=2, label='Forecast')
    ax.fill_between(forecast_dates, gdp_ci_lower, gdp_ci_upper,
                    color=COLORS['red'], alpha=0.2, label='95% CI')

    # Vertical line at the last observed date
    ax.axvline(x=last_date, color='black', linestyle='-', alpha=0.3)
    ax.set_xlabel('Date')
    ax.set_ylabel('GDP (Billions $)')
    ax.set_title('US GDP Forecast with ARIMA', fontweight='bold')

    # Get handles and labels for fig.legend
    handles, labels = ax.get_legend_handles_labels()

    plt.subplots_adjust(bottom=0.15)
    fig.legend(handles, labels, loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.02))
    plt.tight_layout(rect=[0, 0.08, 1, 1])
    plt.show()

In [None]:
# Plot GDP in levels and in logs, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
level_ax, log_ax = axes

# Each panel gets its own legend, placed just below the axes
below_axes = dict(loc='upper center', bbox_to_anchor=(0.5, -0.12), frameon=False)

# Flatten to a 1-D array when the object exposes .values; otherwise use it as is
if hasattr(gdp, 'values'):
    gdp_values = gdp.values.flatten()
else:
    gdp_values = gdp

# Left panel: GDP level
level_ax.plot(gdp.index, gdp_values, color=COLORS['blue'], linewidth=1, label='GDP')
level_ax.set_title('US GDP (Billions of $)', fontweight='bold')
level_ax.set_xlabel('Date')
level_ax.legend(**below_axes)

# Right panel: natural log of GDP
log_gdp = np.log(gdp_values)
log_ax.plot(gdp.index, log_gdp, color=COLORS['green'], linewidth=1, label='Log GDP')
log_ax.set_title('Log GDP (for modeling)', fontweight='bold')
log_ax.set_xlabel('Date')
log_ax.legend(**below_axes)

plt.tight_layout()
plt.show()

In [None]:
# Unit root tests on log GDP
# Drop NaN entries first, then run the ADF and KPSS report on the cleaned array.
log_gdp_clean = log_gdp[~np.isnan(log_gdp)]
run_unit_root_tests(log_gdp_clean, "Log GDP")

In [None]:
# Test on GDP growth (first difference of log GDP)
# np.diff yields one fewer observation than log_gdp_clean.
gdp_growth = np.diff(log_gdp_clean)
run_unit_root_tests(gdp_growth, "GDP Growth (Δ log GDP)")

In [None]:
# Let auto_arima pick a non-seasonal ARIMA model for log GDP
import pmdarima as pm

# Search settings: p and q start at 0 and go up to 3; d=None leaves the
# differencing order to auto_arima
gdp_search_options = dict(
    start_p=0, start_q=0,
    max_p=3, max_q=3,
    d=None,
    seasonal=False,
    stepwise=True,
    suppress_warnings=True,
    trace=True,
)
auto_gdp = pm.auto_arima(log_gdp_clean, **gdp_search_options)

print("\nBest ARIMA model for log GDP:")
print(auto_gdp.summary())

In [None]:
# Forecast log GDP with the fitted auto_gdp model and plot it in levels
forecast_periods = 12  # 3 years quarterly
fc, conf_int = auto_gdp.predict(n_periods=forecast_periods, return_conf_int=True)

# Exponentiate the point forecast and both interval bounds to undo the log
gdp_forecast = np.exp(fc)
gdp_ci_lower, gdp_ci_upper = np.exp(conf_int[:, 0]), np.exp(conf_int[:, 1])

# Quarter-end dates generated from the last observed date; the first
# generated date is dropped by the [1:] slice
last_date = gdp.index[-1]
forecast_dates = pd.date_range(start=last_date, periods=forecast_periods+1, freq='QE')[1:]

fig, ax = plt.subplots(figsize=(14, 6))

# Last 40 observations of the historical series
recent = slice(-40, None)
ax.plot(gdp.index[recent], gdp_values[recent], color=COLORS['blue'], linewidth=1.5, label='Historical GDP')

# Point forecast and its shaded interval
ax.plot(forecast_dates, gdp_forecast, color=COLORS['red'], linewidth=2, label='Forecast')
ax.fill_between(forecast_dates, gdp_ci_lower, gdp_ci_upper,
                color=COLORS['red'], alpha=0.2, label='95% CI')

# Vertical line at the last observed date
ax.axvline(x=last_date, color='black', linestyle='-', alpha=0.3)
ax.set_xlabel('Date')
ax.set_ylabel('GDP (Billions $)')
ax.set_title('US GDP Forecast with ARIMA', fontweight='bold')
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3, frameon=False)
plt.tight_layout()
plt.show()

## Summary

### Key Takeaways

1. **Non-stationarity** is common in economic data
   - Random walk: $Y_t = Y_{t-1} + \varepsilon_t$
   - Variance grows with time

2. **Differencing** transforms I(d) to I(0)
   - $\Delta Y_t = Y_t - Y_{t-1}$
   - Usually d=1 for economic data

3. **Unit root tests** determine d
   - ADF: H₀ = unit root
   - KPSS: H₀ = stationary

4. **ARIMA(p,d,q)** combines differencing with ARMA
   - Use AIC/BIC for model selection
   - Auto-ARIMA automates the process

5. **Forecasts** for I(1) processes have growing uncertainty

### Next Chapter
- Seasonal ARIMA (SARIMA) models
- Seasonal differencing
- Forecasting with seasonality