[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/danpele/Time-Series-Analysis/blob/main/EN/Course_Notebooks/chapter8_lecture_notebook.ipynb)

---

# Chapter 8: Modern Extensions - ARFIMA, Random Forest, LSTM

**Course:** Time Series Analysis and Forecasting  
**Program:** Bachelor program, Faculty of Cybernetics, Statistics and Economic Informatics, Bucharest University of Economic Studies, Romania  
**Academic Year:** 2025-2026

---

## Learning Objectives

By the end of this notebook, you will be able to:
1. Understand the concept of long memory in time series
2. Estimate and interpret ARFIMA models
3. Apply Random Forest for time series forecasting
4. Build LSTM networks for sequential data
5. Compare performance across different methods
6. Choose the appropriate method based on data characteristics

## Setup and Imports

In [None]:
# Install required packages (uncomment if needed in Colab)
# !pip install arch yfinance tensorflow scikit-learn statsmodels -q

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Statistical models
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Machine Learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

# For fetching real data
try:
    import yfinance as yf
    HAS_YF = True
except ImportError:
    HAS_YF = False
    print("yfinance not installed. Install with: pip install yfinance")

# Deep Learning (optional)
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout
    HAS_TF = True
except ImportError:
    HAS_TF = False
    print("TensorFlow not installed. Install with: pip install tensorflow")

# Plotting style: transparent backgrounds, no grid, no top/right spines,
# frameless legends. Applied globally through matplotlib's rcParams.
plt.rcParams.update({
    'figure.figsize': (12, 5),
    'font.size': 11,
    'axes.facecolor': 'none',
    'figure.facecolor': 'none',
    'axes.grid': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'legend.frameon': False,
})

# Shared colour palette (hex codes) looked up by name in the figures below
COLORS = dict(
    blue='#1A3A6E',
    red='#DC3545',
    green='#2E7D32',
    orange='#E67E22',
    gray='#666666',
)

print("Setup complete!")
# Report which optional dependencies were importable
for package_label, is_available in (("TensorFlow", HAS_TF), ("yfinance", HAS_YF)):
    print(f"{package_label} available: {is_available}")

## 1. Long Memory and ARFIMA

### Short Memory vs Long Memory

- **Short Memory (ARMA)**: Autocorrelations decay exponentially
- **Long Memory (ARFIMA)**: Autocorrelations decay hyperbolically (slowly)

In [None]:
# Demonstrate short vs long memory
# Seed the global NumPy generator so the simulated series are reproducible.
# The random draws below happen in a fixed order, so reordering these
# statements changes every series that follows.
np.random.seed(42)
n = 1000  # number of observations in each simulated series

# Short memory: White noise (H ≈ 0.5)
white_noise = np.random.randn(n)

# Short memory: AR(1) with moderate phi (for comparison)
# x_t = phi * x_{t-1} + eps_t with standard normal shocks, started at x_0 = 0
phi = 0.3
ar1_process = np.zeros(n)
for t in range(1, n):
    ar1_process[t] = phi * ar1_process[t-1] + np.random.randn()

# Long memory simulation: d is the fractional differencing parameter that is
# handed to generate_long_memory below
d = 0.4  # Long memory parameter (0 < d < 0.5)

def generate_long_memory(n, d):
    """
    Simulate a long-memory ARFIMA(0, d, 0) (fractionally integrated noise) series.

    The theoretical autocorrelations are built with the recursion
        rho(0) = 1,  rho(k) = rho(k-1) * (k - 1 + d) / (k - d)
    and the sample is drawn as L @ z, where L is the Cholesky factor of the
    n x n Toeplitz correlation matrix and z is standard normal noise.
    If the factorisation fails, a spectral (FFT) approximation with a
    |f|^(-2d) spectrum and random phases is used instead.

    Parameters:
    -----------
    n : number of observations to generate
    d : fractional differencing parameter (0 < d < 0.5 gives stationary
        long memory)

    Returns:
    --------
    1D numpy array of length n (empty when n == 0)

    Notes:
    ------
    Random numbers come from the global NumPy generator, so call
    np.random.seed(...) beforehand for reproducible output. The full
    n x n matrix is materialised, so memory use grows with n**2.
    """
    if n == 0:
        return np.zeros(0)

    # Theoretical autocorrelation function of ARFIMA(0, d, 0)
    gamma = np.zeros(n)
    gamma[0] = 1.0
    for k in range(1, n):
        gamma[k] = gamma[k-1] * (k - 1 + d) / (k - d)

    # Toeplitz correlation matrix: entry (i, j) is gamma[|i - j|].
    # Built with one fancy-indexing step instead of a Python double loop.
    idx = np.arange(n)
    cov = gamma[np.abs(idx[:, None] - idx[None, :])]

    # Add small regularization on the diagonal for numerical stability
    cov[idx, idx] += 1e-10

    try:
        L = np.linalg.cholesky(cov)
    except np.linalg.LinAlgError:
        # Fallback: use spectral method (matrix was not positive definite).
        # Only this specific failure is caught so that unrelated errors and
        # keyboard interrupts are not silently swallowed.
        freqs = np.fft.fftfreq(n)
        freqs[0] = 1e-10  # avoid 0 ** negative power at the zero frequency
        spectrum = np.abs(freqs) ** (-2 * d)
        spectrum[0] = 0  # drop the zero-frequency (mean) component
        phases = np.random.uniform(0, 2*np.pi, n)
        fft_vals = np.sqrt(spectrum) * np.exp(1j * phases)
        return np.real(np.fft.ifft(fft_vals))

    z = np.random.randn(n)
    return L @ z

long_memory = generate_long_memory(n, d)

# Sample autocorrelations (lags 0..50) of both series
acf_short = acf(white_noise, nlags=50)
acf_long = acf(long_memory, nlags=50)

# Plot comparison: top row shows the sample paths, bottom row their ACFs
fig, axes = plt.subplots(2, 2, figsize=(14, 8))

# Time series
path_panels = (
    (axes[0, 0], white_noise, 'blue', 'Short Memory: White Noise'),
    (axes[0, 1], long_memory, 'orange', f'Long Memory: Fractional Noise (d={d})'),
)
for panel, series, shade, heading in path_panels:
    panel.plot(series, color=COLORS[shade], linewidth=0.8)
    panel.set_title(heading, fontweight='bold')
    panel.set_xlabel('Time')

# ACF comparison; +/- 1.96 / sqrt(n) is the band drawn as "95% confidence"
acf_band = 1.96 / np.sqrt(n)
acf_panels = (
    (axes[1, 0], acf_short, 'blue', 'ACF: Short Memory (fast decay)'),
    (axes[1, 1], acf_long, 'orange', 'ACF: Long Memory (slow/hyperbolic decay)'),
)
for panel, rho, shade, heading in acf_panels:
    panel.bar(range(51), rho, color=COLORS[shade], alpha=0.7, label='ACF values')
    panel.axhline(y=0, color='black', linestyle='-')
    panel.axhline(y=acf_band, color='red', linestyle='--', alpha=0.5, label='95% confidence')
    panel.axhline(y=-acf_band, color='red', linestyle='--', alpha=0.5)
    panel.set_title(heading, fontweight='bold')
    panel.set_xlabel('Lag')
    panel.set_ylim(-0.2, 1.0)
    # Legend sits below the axes so it does not cover the bars
    panel.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2, frameon=False)

plt.tight_layout()
plt.subplots_adjust(bottom=0.12)
plt.show()

for summary_line in (
    "Key difference:",
    "- Short memory (white noise): ACF drops to zero immediately",
    "- Long memory: ACF remains significant at high lags (slow decay)",
):
    print(summary_line)

### Estimating the Long Memory Parameter d

The **Hurst exponent** H and fractional differencing parameter d are related:
$$d = H - 0.5$$

- H = 0.5: No memory (uncorrelated increments such as white noise, whose cumulative sum is a random walk)
- H > 0.5: Persistent (trending)
- H < 0.5: Anti-persistent (mean-reverting)

In [None]:
# R/S Analysis to estimate Hurst exponent
def hurst_rs(ts, min_window=10):
    """
    Estimate Hurst exponent using Rescaled Range (R/S) analysis.

    The idea: For a series with Hurst exponent H:
        R/S ~ n^H

    So in log-log space: log(R/S) = H * log(n) + c
    The slope H tells us about memory.

    The series is cut into non-overlapping windows whose sizes are powers
    of two. For each size the R/S statistic (range of the cumulative
    deviations from the window mean, divided by the window's sample
    standard deviation) is averaged over all windows, and H is the slope
    of a least-squares line through the (log size, log mean R/S) points.

    Parameters:
    -----------
    ts : 1D array-like of observations (list, numpy array or pandas Series)
    min_window : smallest window size to use; rounded up to the next power
        of two, so the default of 10 starts at windows of 16 points

    Returns:
    --------
    H : estimated Hurst exponent (slope of the log-log fit)
    n_list : list of window sizes used in the fit
    rs_list : list with the mean R/S statistic for each window size

    Raises:
    -------
    ValueError : if fewer than two window sizes produce a usable R/S value
        (series too short for the requested min_window, or constant)
    """
    ts = np.asarray(ts, dtype=float).ravel()
    n = len(ts)

    # floor(log2(n)). The loop below stops one power short of it, so every
    # window size fits into the series at least twice.
    max_k = n.bit_length() - 1 if n > 0 else 0
    # ceil(log2(min_window)); at least 1 because a sample standard deviation
    # needs two or more points per window.
    start_k = (max(int(min_window), 2) - 1).bit_length()

    rs_list = []
    n_list = []

    for k in range(start_k, max_k):
        size = 2**k

        rs_values = []
        for start in range(0, n - size + 1, size):
            segment = ts[start:start + size]
            # Cumulative deviation from the window mean
            cumdev = np.cumsum(segment - np.mean(segment))
            # Range of the cumulative deviations
            R = np.max(cumdev) - np.min(cumdev)
            # Sample standard deviation of the window
            S = np.std(segment, ddof=1)
            # Constant (S == 0) or NaN-containing windows are skipped
            if S > 0:
                rs_values.append(R / S)

        if rs_values:
            rs_list.append(np.mean(rs_values))
            n_list.append(size)

    if len(n_list) < 2:
        raise ValueError(
            f"hurst_rs needs at least two usable window sizes to fit a slope; "
            f"got {len(n_list)} from a series of length {n} with min_window={min_window}"
        )

    # Linear regression in log-log space gives H as slope
    H, c = np.polyfit(np.log(n_list), np.log(rs_list), 1)

    return H, n_list, rs_list

# Estimate Hurst for both series
H_short, n_short, rs_short = hurst_rs(white_noise)
H_long, n_long, rs_long = hurst_rs(long_memory)

print("Hurst Exponent Estimation (R/S Analysis)")
print("="*50)
print(f"White Noise (short memory): H = {H_short:.3f}")
print(f"Fractional Noise (long memory): H = {H_long:.3f}")
print(f"\nInterpretation:")
print(f"  H < 0.5: Anti-persistent (mean-reverting)")
print(f"  H = 0.5: Random walk (no memory)")
print(f"  H > 0.5: Persistent (trending/long memory)")

# Plot R/S analysis - clearer visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: Show the R/S values and fitted lines
ax = axes[0]
ax.scatter(n_short, rs_short, color=COLORS['blue'], s=80, zorder=3, label='White Noise data points')
ax.scatter(n_long, rs_long, color=COLORS['orange'], s=80, zorder=3, label='Long Memory data points')

# Fitted lines. hurst_rs returns only the slope H of its least-squares fit of
# log(R/S) on log(n), so the matching intercept is recovered here from the
# same points as mean(log R/S) - H * mean(log n). Each dashed line is then
# the actual regression line through its own scatter points rather than a
# line with the right slope but an arbitrary vertical offset.
for H_fit, sizes, rs_vals, shade, series_name in (
    (H_short, n_short, rs_short, 'blue', 'White Noise'),
    (H_long, n_long, rs_long, 'orange', 'Long Memory'),
):
    intercept = np.mean(np.log(rs_vals)) - H_fit * np.mean(np.log(sizes))
    n_fit = np.array([min(sizes), max(sizes)])
    ax.plot(n_fit, np.exp(intercept + H_fit * np.log(n_fit)), '--', color=COLORS[shade],
            linewidth=2, label=f'{series_name} fit (H={H_fit:.2f})')

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Window Size (n)', fontsize=11)
ax.set_ylabel('R/S Statistic', fontsize=11)
ax.set_title('R/S Analysis: Slope = Hurst Exponent', fontweight='bold')
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2, frameon=False)

# Right: Show interpretation with idealised curves R/S = n^H for several H
ax2 = axes[1]
h_values = [0.3, 0.5, 0.7, 0.9]
colors_h = [COLORS['green'], COLORS['gray'], COLORS['orange'], COLORS['red']]
labels_h = ['H=0.3 (mean-reverting)', 'H=0.5 (random walk)', 'H=0.7 (persistent)', 'H=0.9 (strong persistence)']

n_demo = np.linspace(16, 512, 100)
for h, c, lab in zip(h_values, colors_h, labels_h):
    ax2.plot(n_demo, n_demo**h, color=c, linewidth=2, label=lab)

# Reference line plus a text box reporting our estimated values
ax2.axhline(y=100, color='black', linestyle=':', alpha=0.3)
ax2.text(20, 120, f'Our estimates:\nWhite Noise H={H_short:.2f}\nLong Memory H={H_long:.2f}', 
         fontsize=10, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.set_xlabel('Window Size (n)', fontsize=11)
ax2.set_ylabel('R/S Statistic', fontsize=11)
ax2.set_title('Interpretation: Steeper Slope = More Persistence', fontweight='bold')
ax2.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2, frameon=False)

plt.tight_layout()
plt.subplots_adjust(bottom=0.25)
plt.show()

print("\nKey insight: The SLOPE in log-log space equals H.")
print("Steeper slope = higher H = more persistent/trending behavior.")

## 2. Real Data: Bitcoin Volatility (Long Memory)

Financial volatility is known to exhibit long memory properties.

In [None]:
# Fetch Bitcoin data. Falls back to simulated data when yfinance is not
# installed or the download comes back empty (e.g. no network access).
btc = pd.DataFrame()
if HAS_YF:
    downloaded = yf.download('BTC-USD', start='2020-01-01', end='2024-12-31', progress=False)
    if downloaded is not None:
        btc = downloaded
    # Handle MultiIndex columns from newer yfinance versions
    if isinstance(btc.columns, pd.MultiIndex):
        btc.columns = btc.columns.droplevel(1)

if not btc.empty:
    # Ensure we get a Series (not DataFrame) and it has proper index
    close_prices = btc['Close'].squeeze()
    btc_returns = close_prices.pct_change().dropna() * 100
    btc_volatility = btc_returns.abs()  # Absolute returns as volatility proxy
    DATA_SOURCE = "Yahoo Finance"
else:
    # Simulated data: i.i.d. normal daily returns with a 3% standard deviation
    np.random.seed(123)
    n = 1000
    btc_returns = pd.Series(np.random.randn(n) * 3, 
                           index=pd.date_range('2020-01-01', periods=n, freq='D'))
    btc_volatility = btc_returns.abs()
    # Build a price path from the simulated returns so that the later cells,
    # which read btc['Close'], also run without Yahoo Finance. The starting
    # level of 10,000 is arbitrary.
    close_prices = 10_000 * (1 + btc_returns / 100).cumprod()
    btc = pd.DataFrame({'Close': close_prices})
    DATA_SOURCE = "Simulated"

# Verify btc_returns is a Series with index
print(f"btc_returns type: {type(btc_returns)}")
print(f"btc_returns has index: {hasattr(btc_returns, 'index') and len(btc_returns.index) > 0}")

print(f"\nBitcoin Data ({DATA_SOURCE})")
print("="*50)
print(f"Period: {btc_returns.index[0].date()} to {btc_returns.index[-1].date()}")
print(f"Observations: {len(btc_returns)}")
print(f"\nReturns statistics:")
print(btc_returns.describe().round(2))

In [None]:
# Test for long memory in Bitcoin volatility (absolute returns)
H_btc, n_btc, rs_btc = hurst_rs(btc_volatility.values)

print("Long Memory Test: Bitcoin Absolute Returns")
print("="*50)
print(f"Hurst Exponent: H = {H_btc:.3f}")
print(f"Implied d: d = {H_btc - 0.5:.3f}")
print(f"\nInterpretation:")
# Only the branch that applies is formatted and printed
verdict_lines = (
    [f"  H = {H_btc:.2f} > 0.5 → Long memory/Persistence",
     "  Volatility clusters tend to persist over time"]
    if H_btc > 0.5
    else [f"  H = {H_btc:.2f} ≈ 0.5 → No significant long memory"]
)
for verdict_line in verdict_lines:
    print(verdict_line)

# Left panel: the volatility proxy over time; right panel: its ACF up to lag 100
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_series, ax_acf = axes

ax_series.plot(btc_volatility.index, btc_volatility, color=COLORS['blue'], linewidth=0.5, alpha=0.7, label='|Returns|')
ax_series.set_title(f'Bitcoin Absolute Returns ({DATA_SOURCE})', fontweight='bold')
ax_series.set_xlabel('Date')
ax_series.set_ylabel('|Return| (%)')
ax_series.legend(loc='upper center', bbox_to_anchor=(0.5, -0.12), ncol=1, frameon=False)

# ACF with the +/- 1.96 / sqrt(N) band drawn as "95% confidence"
acf_vol = acf(btc_volatility, nlags=100)
vol_band = 1.96 / np.sqrt(len(btc_volatility))
ax_acf.bar(range(101), acf_vol, color=COLORS['orange'], alpha=0.7, label='ACF of |Returns|')
ax_acf.axhline(y=0, color='black', linestyle='-')
ax_acf.axhline(y=vol_band, color='red', linestyle='--', alpha=0.5, label='95% confidence')
ax_acf.axhline(y=-vol_band, color='red', linestyle='--', alpha=0.5)
ax_acf.set_title('ACF of Absolute Returns (slow decay = long memory)', fontweight='bold')
ax_acf.set_xlabel('Lag')
ax_acf.legend(loc='upper center', bbox_to_anchor=(0.5, -0.12), ncol=2, frameon=False)

plt.tight_layout()
plt.subplots_adjust(bottom=0.18)
plt.show()

## 3. Random Forest for Time Series

### Feature Engineering

Key to using ML for time series: create meaningful features from lagged values.

In [None]:
def create_features(df, target_col, lags=5, rolling_windows=(5, 10, 20)):
    """
    Create features for time series forecasting.

    Every feature of a row is computed from strictly earlier observations
    of the target, so the row never sees its own target value. Rows that
    do not yet have a full history (NaN in any feature) are dropped.

    Parameters:
    -----------
    df : DataFrame with datetime index
    target_col : name of target column
    lags : number of lag features (lag_1 ... lag_<lags>)
    rolling_windows : iterable of window lengths for rolling statistics

    Returns:
    --------
    New DataFrame with the target column followed by the lag_i,
    rolling_mean_w and rolling_std_w columns; the input is not modified
    """
    data = df[[target_col]].copy()
    # Read the target once so the feature columns added below cannot
    # interfere with later lookups
    target = data[target_col]

    # Lag features
    for i in range(1, lags + 1):
        data[f'lag_{i}'] = target.shift(i)

    # Rolling statistics (calculated on past data only!): shift by one step
    # first so the window for row t ends at t-1
    past = target.shift(1)
    for w in rolling_windows:
        window = past.rolling(window=w)
        data[f'rolling_mean_{w}'] = window.mean()
        data[f'rolling_std_{w}'] = window.std()

    return data.dropna()

# Prepare Bitcoin PRICE data for Random Forest: 10 lags plus rolling
# mean/std over 5, 10 and 20 days
closing_prices = btc['Close'].squeeze()
btc_price_df = pd.DataFrame({'price': closing_prices})
btc_features = create_features(
    btc_price_df,
    'price',
    lags=10,
    rolling_windows=[5, 10, 20],
)

# Everything except the target column is a model input
created_features = [col for col in btc_features.columns if col != 'price']

print("Feature Engineering for Random Forest")
print("=" * 50)
print("Target: Bitcoin Price (USD)")
print(f"\nFeatures created: {created_features}")
print(f"\nDataset shape: {btc_features.shape}")
print("\nFirst few rows:")
print(btc_features.head().round(2))

In [None]:
# Train/Test split (TEMPORAL - not random!): the first 80% of the rows train
# the model, the remaining rows are held out for testing
train_size = int(len(btc_features) * 0.8)
train_data, test_data = btc_features.iloc[:train_size], btc_features.iloc[train_size:]

# Separate features and target ('price' is the target, the rest are inputs)
feature_cols = [name for name in btc_features.columns if name != 'price']
X_train, y_train = train_data[feature_cols], train_data['price']
X_test, y_test = test_data[feature_cols], test_data['price']

for split_label, split_X in (("Train", X_train), ("Test", X_test)):
    print(f"{split_label} set: {len(split_X)} samples")

train_start, train_end = train_data.index[0].date(), train_data.index[-1].date()
test_start, test_end = test_data.index[0].date(), test_data.index[-1].date()
print(f"\nTrain period: {train_start} to {train_end}")
print(f"Test period: {test_start} to {test_end}")

In [None]:
# Train Random Forest on the training window and predict the test window
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Error metrics in price units
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mae_rf = mean_absolute_error(y_test, y_pred_rf)

# Direction accuracy: share of days on which the sign of the change between
# consecutive predictions matches the sign of the change between consecutive
# actual prices
actual_direction = np.diff(y_test.values) > 0
pred_direction = np.diff(y_pred_rf) > 0
direction_accuracy = np.mean(actual_direction == pred_direction) * 100

# Naive benchmark (predict previous price, i.e. the lag_1 feature)
y_pred_naive = X_test['lag_1'].values
rmse_naive = np.sqrt(mean_squared_error(y_test, y_pred_naive))

# Relative RMSE reduction versus the naive forecast, in percent
rf_gain_pct = (1 - rmse_rf/rmse_naive)*100

print("Random Forest Results (Price Prediction)")
print("=" * 50)
print(f"RMSE: ${rmse_rf:.2f}")
print(f"MAE:  ${mae_rf:.2f}")
print(f"Direction Accuracy: {direction_accuracy:.1f}%")

print("\nNaive Benchmark (yesterday's price):")
print(f"RMSE: ${rmse_naive:.2f}")

print("\nRF improvement over naive:")
print(f"RMSE: {rf_gain_pct:.1f}%")

In [None]:
# Feature Importance: one row per input column, most important first
importance_table = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# Convert to numpy arrays
y_test_np = np.array(y_test)
y_pred_np = np.array(y_pred_rf)

# Pearson correlation between actual and predicted prices
corr = np.corrcoef(y_test_np, y_pred_np)[0, 1]

# Plot: importances on the left, actual-vs-predicted scatter on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_importance, ax_scatter = axes

# Feature importance (top 10, largest at the top of the chart)
top_features = feature_importance.head(10)
ax_importance.barh(top_features['feature'], top_features['importance'], color=COLORS['blue'], label='Feature importance')
ax_importance.set_xlabel('Importance')
ax_importance.set_title('Top 10 Feature Importances', fontweight='bold')
ax_importance.invert_yaxis()
ax_importance.legend(loc='upper center', bbox_to_anchor=(0.5, -0.12), ncol=1, frameon=False)

# Actual vs Predicted Price
ax_scatter.scatter(y_test_np, y_pred_np, alpha=0.5, s=15, color=COLORS['blue'], label='Predictions')

# Diagonal line (perfect prediction) spanning the joint range of both axes
min_val = min(y_test_np.min(), y_pred_np.min())
max_val = max(y_test_np.max(), y_pred_np.max())
diagonal = [min_val, max_val]
ax_scatter.plot(diagonal, diagonal, 'r-', linewidth=2, label='Perfect prediction')

# Add info box
info_text = f'Correlation: {corr:.3f}\nDirection Acc: {direction_accuracy:.1f}%'
ax_scatter.text(0.05, 0.95, info_text,
                transform=ax_scatter.transAxes, fontsize=10, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

ax_scatter.set_xlabel('Actual Price ($)')
ax_scatter.set_ylabel('Predicted Price ($)')
ax_scatter.set_title('Random Forest: Actual vs Predicted Price', fontweight='bold')
ax_scatter.legend(loc='upper center', bbox_to_anchor=(0.5, -0.12), ncol=2, frameon=False)

plt.tight_layout()
plt.subplots_adjust(bottom=0.18)
plt.show()

print("\nTop 5 Most Important Features:")
print(feature_importance.head().to_string(index=False))
print(f"\nCorrelation: {corr:.3f}")
print(f"Direction Accuracy: {direction_accuracy:.1f}%")
print("\nNote: High correlation but ~50% direction accuracy shows")
print("      model tracks price level but can't predict direction")

## 4. LSTM for Time Series

### Data Preparation for LSTM

LSTM requires:
1. **Scaled data** (usually MinMax or Standard scaling)
2. **3D input shape**: (samples, timesteps, features)

In [None]:
if HAS_TF:
    def create_sequences(data, n_steps):
        """
        Create sliding-window sequences for LSTM.
        
        Parameters:
        -----------
        data : 1D array of values
        n_steps : number of time steps in each sequence
        
        Returns:
        --------
        X : array of shape (samples, n_steps); row i holds data[i:i + n_steps]
            (the caller adds the trailing feature axis)
        y : array of shape (samples,); y[i] is data[i + n_steps], the value
            that directly follows window i
        """
        n_samples = len(data) - n_steps
        windows = [data[i:(i + n_steps)] for i in range(n_samples)]
        targets = [data[i + n_steps] for i in range(n_samples)]
        # Keep X two-dimensional even when the series is too short to form
        # a single window
        X = np.array(windows) if windows else np.empty((0, n_steps))
        return X, np.array(targets)
    
    # Prepare PRICE data
    price_values = btc['Close'].squeeze().values.reshape(-1, 1)
    
    n_steps = 20  # Use 20 days to predict next day
    
    # Train/Test split (temporal), expressed as a number of sequences
    n_sequences = max(len(price_values) - n_steps, 0)
    train_size = int(n_sequences * 0.8)
    
    # Scale data. The scaler is fitted ONLY on the observations covered by
    # the training sequences (their windows and targets: the first
    # train_size + n_steps prices), so the minimum and maximum of the test
    # period do not leak into training. The same transform is then applied
    # to the whole series; scaled test values may therefore lie outside
    # [0, 1].
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(price_values[:train_size + n_steps])
    scaled_data = scaler.transform(price_values)
    
    # Create sequences
    X_lstm, y_lstm = create_sequences(scaled_data.flatten(), n_steps)
    
    # Reshape for LSTM: (samples, timesteps, features)
    X_lstm = X_lstm.reshape((X_lstm.shape[0], X_lstm.shape[1], 1))
    
    # Apply the temporal split: earliest sequences train, latest ones test
    X_train_lstm = X_lstm[:train_size]
    y_train_lstm = y_lstm[:train_size]
    X_test_lstm = X_lstm[train_size:]
    y_test_lstm = y_lstm[train_size:]
    
    print("LSTM Data Preparation (Price Prediction)")
    print("="*50)
    print(f"Sequence length: {n_steps} days")
    print(f"X shape: {X_lstm.shape} (samples, timesteps, features)")
    print(f"y shape: {y_lstm.shape}")
    print(f"\nTrain samples: {len(X_train_lstm)}")
    print(f"Test samples: {len(X_test_lstm)}")
else:
    print("TensorFlow not available. Skipping LSTM section.")

In [None]:
if HAS_TF:
    # Build LSTM model: two stacked 50-unit LSTM layers, each followed by
    # 20% dropout, feeding a single-output dense layer. The first LSTM
    # returns the full sequence so the second one can consume it.
    lstm_layers = [
        LSTM(50, activation='relu', return_sequences=True, input_shape=(n_steps, 1)),
        Dropout(0.2),
        LSTM(50, activation='relu'),
        Dropout(0.2),
        Dense(1),
    ]
    model = Sequential(lstm_layers)
    model.compile(optimizer='adam', loss='mse')
    
    print("LSTM Model Architecture")
    print("=" * 50)
    model.summary()
    
    # Train model silently; 10% of the training samples are held out for
    # validation and their loss is recorded in `history`
    fit_options = dict(epochs=50, batch_size=32, validation_split=0.1, verbose=0)
    print("\nTraining LSTM (this may take a while)...")
    history = model.fit(X_train_lstm, y_train_lstm, **fit_options)
    print("Training complete!")

In [None]:
if HAS_TF:
    def _to_prices(scaled_values):
        """Map scaled values back to price units as a flat 1D array."""
        return scaler.inverse_transform(scaled_values.reshape(-1, 1)).flatten()
    
    # Make predictions on the held-out sequences (scaled units)
    y_pred_lstm_scaled = model.predict(X_test_lstm, verbose=0)
    
    # Inverse transform to get actual prices
    y_pred_lstm = _to_prices(y_pred_lstm_scaled)
    y_test_lstm_orig = _to_prices(y_test_lstm)
    
    # Error metrics in price units
    rmse_lstm = np.sqrt(mean_squared_error(y_test_lstm_orig, y_pred_lstm))
    mae_lstm = mean_absolute_error(y_test_lstm_orig, y_pred_lstm)
    
    # Direction accuracy: sign of consecutive prediction changes compared
    # with the sign of consecutive actual changes
    actual_dir = np.diff(y_test_lstm_orig) > 0
    pred_dir = np.diff(y_pred_lstm) > 0
    dir_acc_lstm = np.mean(actual_dir == pred_dir) * 100
    
    print("LSTM Results (Price Prediction)")
    print("=" * 50)
    print(f"RMSE: ${rmse_lstm:.2f}")
    print(f"MAE:  ${mae_lstm:.2f}")
    print(f"Direction Accuracy: {dir_acc_lstm:.1f}%")
    
    # Compare with Random Forest and the naive benchmark
    print("\nComparison:")
    for row_label, row_rmse in (
        ("Random Forest RMSE:", rmse_rf),
        ("LSTM RMSE:         ", rmse_lstm),
        ("Naive RMSE:        ", rmse_naive),
    ):
        print(f"{row_label} ${row_rmse:.2f}")

In [None]:
if HAS_TF:
    # Plot results on a 2x2 grid
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    ax_history, ax_recent = axes[0, 0], axes[0, 1]
    ax_scatter, ax_compare = axes[1, 0], axes[1, 1]
    below_axes = dict(loc='upper center', bbox_to_anchor=(0.5, -0.12), ncol=2, frameon=False)
    
    # Training history: loss per epoch on the training and validation samples
    ax_history.plot(history.history['loss'], label='Train Loss', color=COLORS['blue'])
    ax_history.plot(history.history['val_loss'], label='Validation Loss', color=COLORS['orange'])
    ax_history.set_title('LSTM Training History', fontweight='bold')
    ax_history.set_xlabel('Epoch')
    ax_history.set_ylabel('MSE Loss')
    ax_history.legend(**below_axes)
    
    # Actual vs Predicted (last 100 points)
    n_plot = 100
    days = range(n_plot)
    ax_recent.plot(days, y_test_lstm_orig[-n_plot:],
                   color=COLORS['blue'], label='Actual', linewidth=1)
    ax_recent.plot(days, y_pred_lstm[-n_plot:],
                   color=COLORS['red'], label='LSTM Prediction', linewidth=1, alpha=0.7)
    ax_recent.set_title('LSTM: Actual vs Predicted (last 100 days)', fontweight='bold')
    ax_recent.set_xlabel('Day')
    ax_recent.set_ylabel('Price ($)')
    ax_recent.legend(**below_axes)
    
    # Scatter plot with the perfect-prediction diagonal and an info box
    corr_lstm = np.corrcoef(y_test_lstm_orig, y_pred_lstm)[0, 1]
    ax_scatter.scatter(y_test_lstm_orig, y_pred_lstm, alpha=0.5, s=10, color=COLORS['green'], label='Predictions')
    min_p = min(y_test_lstm_orig.min(), y_pred_lstm.min())
    max_p = max(y_test_lstm_orig.max(), y_pred_lstm.max())
    ax_scatter.plot([min_p, max_p], [min_p, max_p], 'r--', linewidth=2, label='Perfect prediction')
    lstm_info = f'Correlation: {corr_lstm:.3f}\nDir. Acc: {dir_acc_lstm:.1f}%'
    ax_scatter.text(0.05, 0.95, lstm_info,
                    transform=ax_scatter.transAxes, fontsize=10, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    ax_scatter.set_xlabel('Actual Price ($)')
    ax_scatter.set_ylabel('Predicted Price ($)')
    ax_scatter.set_title('LSTM: Actual vs Predicted', fontweight='bold')
    ax_scatter.legend(**below_axes)
    
    # Model comparison: one RMSE bar per model, value written above each bar
    models = ['Naive', 'Random Forest', 'LSTM']
    rmses = [rmse_naive, rmse_rf, rmse_lstm]
    colors = [COLORS['gray'], COLORS['blue'], COLORS['green']]
    bars = ax_compare.bar(models, rmses, color=colors)
    ax_compare.set_ylabel('RMSE ($)')
    ax_compare.set_title('Model Comparison: RMSE (lower is better)', fontweight='bold')
    for bar_pos, bar_rmse in enumerate(rmses):
        ax_compare.text(bar_pos, bar_rmse + 50, f'${bar_rmse:.0f}', ha='center')
    
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.08, hspace=0.35)
    plt.show()

## 5. Time Series Cross-Validation

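**Important**: Never use standard k-fold CV for time series! Its folds let the model train on observations that come *after* the ones it is tested on, which leaks future information into the evaluation.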

Use **walk-forward validation** instead.

In [None]:
# Time Series Cross-Validation for Random Forest
# TimeSeriesSplit yields expanding training windows, each followed by the
# test block that comes directly after it in time.
tscv = TimeSeriesSplit(n_splits=5)

cv_scores = []
fold_results = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(btc_features), 1):
    fold_train = btc_features.iloc[train_idx]
    fold_test = btc_features.iloc[test_idx]
    X_train_cv = fold_train[feature_cols]
    y_train_cv = fold_train['price']
    X_test_cv = fold_test[feature_cols]
    y_test_cv = fold_test['price']
    
    # Same hyperparameters as the Random Forest trained on the single
    # train/test split (including min_samples_split=5), so the
    # cross-validation scores describe that same model configuration.
    rf_cv = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1
    )
    rf_cv.fit(X_train_cv, y_train_cv)
    y_pred_cv = rf_cv.predict(X_test_cv)
    
    rmse_cv = np.sqrt(mean_squared_error(y_test_cv, y_pred_cv))
    
    # Direction accuracy: sign of consecutive prediction changes compared
    # with the sign of consecutive actual changes
    actual_dir = np.diff(y_test_cv.values) > 0
    pred_dir = np.diff(y_pred_cv) > 0
    dir_acc = np.mean(actual_dir == pred_dir) * 100
    
    cv_scores.append(rmse_cv)
    fold_results.append({
        'fold': fold,
        'train_size': len(train_idx),
        'test_size': len(test_idx),
        'rmse': rmse_cv,
        'dir_acc': dir_acc
    })

print("Time Series Cross-Validation Results (Price Prediction)")
print("="*60)
for r in fold_results:
    print(f"Fold {r['fold']}: Train={r['train_size']:4d}, Test={r['test_size']:4d}, RMSE=${r['rmse']:,.0f}, Dir.Acc={r['dir_acc']:.1f}%")

print(f"\nMean RMSE: ${np.mean(cv_scores):,.0f} (+/- ${np.std(cv_scores):,.0f})")
print(f"Mean Direction Accuracy: {np.mean([r['dir_acc'] for r in fold_results]):.1f}%")

## Summary

### Key Takeaways

1. **ARFIMA** extends ARIMA for long memory (fractional d)
   - Useful when ACF decays slowly (hyperbolic)
   - Common in financial volatility

2. **Random Forest** for time series:
   - Feature engineering is crucial (lags, rolling stats)
   - Handles non-linear relationships
   - Provides feature importance (interpretability)

3. **LSTM** for sequential data:
   - Captures complex patterns and long dependencies
   - Requires lots of data and careful tuning
   - "Black box" - limited interpretability

4. **Always use temporal train/test split**
   - Never shuffle time series data
   - Use TimeSeriesSplit for cross-validation

### When to Use What?

| Situation | Recommendation |
|-----------|----------------|
| Small data, linear | ARIMA/ARFIMA |
| Non-linear, interpretability needed | Random Forest |
| Large data, complex patterns | LSTM |
| Long memory in volatility | ARFIMA or FIGARCH (plain GARCH is short-memory) |
| Multiple seasonalities | Prophet or TBATS |