In [83]:
import numpy as np
import pandas as pd
import keras
from statsmodels.tsa.arima.model import ARIMA
from arch import arch_model
from keras import models
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import statsmodels.tsa.seasonal as smt
from tqdm import tqdm


In [84]:
class DataPreprocessing:
    """
    Seasonal-decomposition helpers for a single time series.

    Parameters:
    - data: Time series handed to `seasonal_decompose`.
    - decomposition_model (str): "additive" or "multiplicative".
    - rolling_window (int): Passed to `seasonal_decompose` as `period`.
    """

    # Model names deseasonalize() knows how to invert.
    _SUPPORTED_MODELS = ("additive", "multiplicative")

    def __init__(self, data, decomposition_model="additive", rolling_window=12):
        self.data = data
        self.decomposition_model = decomposition_model
        self.rolling_window = rolling_window

    def decompose_data(self):
        """
        Decompose the series into trend, seasonal and residual components.

        Returns:
        - tuple: (trend, seasonal, residual), each with missing values removed
          via `dropna()`, so a component may be shorter than the input.
        """
        decomposition = smt.seasonal_decompose(
            self.data,
            model=self.decomposition_model,
            period=self.rolling_window,
        )
        trend = decomposition.trend.dropna()
        seasonal = decomposition.seasonal.dropna()
        residual = decomposition.resid.dropna()
        return trend, seasonal, residual

    def deseasonalize(self):
        """
        Remove the seasonal component from the series.

        The seasonal component is subtracted for the additive model and
        divided out for the multiplicative model.

        Returns:
        - The deseasonalized data with missing values dropped.

        Raises:
        - ValueError: If `decomposition_model` is not "additive" or
          "multiplicative".
        """
        # Validate before decomposing: otherwise an unsupported name would only
        # surface afterwards as an UnboundLocalError on the result variable.
        if self.decomposition_model not in self._SUPPORTED_MODELS:
            raise ValueError(
                "decomposition_model must be 'additive' or 'multiplicative', "
                f"got {self.decomposition_model!r}."
            )

        _, seasonal, _ = self.decompose_data()
        if self.decomposition_model == "additive":
            deseasonalized_data = self.data - seasonal
        else:
            deseasonalized_data = self.data / seasonal
        return deseasonalized_data.dropna()


In [96]:
def arima_forecast(train_data, test_data, order):
    """
    Fit an ARIMA model on the training data and forecast over the test span.

    Parameters:
    - train_data: Observations the ARIMA model is fitted on.
    - test_data: Only its length is used, as the number of steps to forecast.
    - order: Passed straight through as the `order` argument of `ARIMA`.

    Returns:
    - The output of the fitted model's `forecast` call.
    """
    fitted = ARIMA(train_data, order=order).fit()
    return fitted.forecast(steps=len(test_data))

def garch_forecast(residuals, test_length, p, q):
    """
    Fit a GARCH(p, q) model to residuals and forecast volatility.

    Parameters:
    - residuals: Series the GARCH model is fitted on.
    - test_length (int): Forecast horizon passed to the fitted model.
    - p (int): GARCH `p` order.
    - q (int): GARCH `q` order.

    Returns:
    - pd.Series: Square root of the last row of the forecast variance table,
      with a default integer index.
    """
    fitted = arch_model(residuals, vol='GARCH', p=p, q=q).fit(disp='off')
    variance_table = fitted.forecast(horizon=test_length).variance.values
    # Only the final row is used; the square root turns variance into volatility.
    volatility = np.sqrt(variance_table[-1, :])
    return pd.Series(volatility)

def lstm_forecast(train_data, test_data, hidden_units, epochs, look_back=5):
    """
    Train a single-layer LSTM on sliding windows of the training series and
    predict one value per test observation.

    Each test prediction is made from the `look_back` values that precede it,
    taken from the end of `train_data` and from `test_data` itself, so the
    predictions are one-step-ahead and use actual test values as inputs.

    Parameters:
    - train_data (pd.Series): Series used to fit the scaler and the network.
    - test_data (pd.Series): Series to predict; its index labels the output.
    - hidden_units (int): Number of units in the LSTM layer.
    - epochs (int): Number of training epochs.
    - look_back (int): Window length fed to the network (default 5, the value
      that was previously hard-coded).

    Returns:
    - pd.Series: Predictions on the original scale, indexed like `test_data`.

    Raises:
    - ValueError: If `look_back` is not positive, or `train_data` is too short
      to form a single training window.
    """
    if look_back < 1:
        raise ValueError("look_back must be a positive integer.")
    if len(train_data) <= look_back:
        raise ValueError(
            f"train_data needs more than {look_back} observations, "
            f"got {len(train_data)}."
        )
    if len(test_data) == 0:
        # Nothing to predict; skip the training cost entirely.
        return pd.Series([], index=test_data.index, dtype=float)

    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled_train = scaler.fit_transform(train_data.values.reshape(-1, 1))

    # Window i holds the look_back values preceding target i.
    X_train = np.array(
        [scaled_train[i - look_back:i, 0] for i in range(look_back, len(scaled_train))]
    ).reshape(-1, look_back, 1)
    y_train = scaled_train[look_back:, 0]

    # Sequential/LSTM/Dense are reached through the `keras` module the file
    # imports; the bare names were never imported and raised NameError.
    model = keras.models.Sequential()
    model.add(keras.layers.LSTM(hidden_units, input_shape=(look_back, 1)))
    model.add(keras.layers.Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(X_train, y_train, epochs=epochs, batch_size=1, verbose=0)

    # Prefix the test values with the tail of the training data so the first
    # test point also has a full window; scale with the train-fitted scaler.
    history = np.append(train_data.values[-look_back:], test_data.values).reshape(-1, 1)
    inputs_scaled = scaler.transform(history)

    X_test = np.array(
        [inputs_scaled[i - look_back:i, 0] for i in range(look_back, len(inputs_scaled))]
    ).reshape(-1, look_back, 1)

    lstm_pred = scaler.inverse_transform(model.predict(X_test))
    return pd.Series(lstm_pred.flatten(), index=test_data.index)

In [None]:
def forecast_models(data, model_choice, params):
    """
    Split the data 75/25 and forecast the test part with the chosen model.

    Parameters:
    - data (pd.Series): Full series; the first 75% is used for training.
    - model_choice (str): One of 'ARIMA', 'GARCH', 'LSTM' or 'Hybrid'.
    - params (dict or None): Optional settings read with these defaults:
      'arima_order' (1, 1, 1), 'garch_p' 1, 'garch_q' 1,
      'hidden_units' 50, 'epochs' 10. None is treated as an empty dict.

    Returns:
    - tuple: (forecast, test_data). For 'Hybrid' the forecast is a pd.Series
      indexed like `test_data`.

    Raises:
    - ValueError: If `model_choice` is not one of the supported names.
    """
    if params is None:
        params = {}

    train_size = int(len(data) * 0.75)
    train_data, test_data = data[:train_size], data[train_size:]

    if model_choice == 'ARIMA':
        order = params.get('arima_order', (1, 1, 1))
        forecast = arima_forecast(train_data, test_data, order)
    elif model_choice == 'GARCH':
        # GARCH is fitted on the residuals of an ARIMA fit, not on the raw data.
        arima_order = params.get('arima_order', (1, 1, 1))
        residuals = ARIMA(train_data, order=arima_order).fit().resid
        p = params.get('garch_p', 1)
        q = params.get('garch_q', 1)
        forecast = garch_forecast(residuals, len(test_data), p, q)
    elif model_choice == 'LSTM':
        hidden_units = params.get('hidden_units', 50)
        epochs = params.get('epochs', 10)
        forecast = lstm_forecast(train_data, test_data, hidden_units, epochs)
    elif model_choice == 'Hybrid':
        # Fit ARIMA once and reuse the fit for both the forecast and residuals.
        order = params.get('arima_order', (1, 1, 1))
        arima_fit = ARIMA(train_data, order=order).fit()
        arima_pred = arima_fit.forecast(steps=len(test_data))
        residuals = arima_fit.resid

        # GARCH forecast on the ARIMA residuals.
        p = params.get('garch_p', 1)
        q = params.get('garch_q', 1)
        garch_pred = garch_forecast(residuals, len(test_data), p, q)

        # LSTM trained on the ARIMA residuals.
        # NOTE(review): the network is trained on residuals but its test
        # windows come from raw test_data values - confirm this is intended.
        residual_series = pd.Series(residuals[-len(train_data):], index=train_data.index)
        hidden_units = params.get('hidden_units', 50)
        epochs = params.get('epochs', 10)
        lstm_pred = lstm_forecast(residual_series, test_data, hidden_units, epochs)

        # The three forecasts carry different indexes (ARIMA forecast index,
        # a fresh 0..h-1 index from garch_forecast, and the test index), so
        # label-aligned Series addition would produce NaNs. Add by position.
        combined = np.asarray(arima_pred) + np.asarray(garch_pred) + np.asarray(lstm_pred)
        forecast = pd.Series(combined, index=test_data.index)
    else:
        raise ValueError("Invalid model choice.")

    return forecast, test_data

In [87]:
class SCAOptimizer:
    """
    Population-based minimizer using sine/cosine position updates.

    Parameters:
    - pop_size (int): Number of candidate positions.
    - max_iters (int): Number of optimization iterations.
    - dim (int): Number of values in each position.
    - lb: Lower bound(s), scalar or array-like broadcastable to
      (pop_size, dim); used for the initial draw and for clipping.
    - ub: Upper bound(s), same conventions as `lb`.
    """

    def __init__(self, pop_size, max_iters, dim, lb, ub):
        self.pop_size = pop_size
        self.max_iters = max_iters
        self.dim = dim
        self.lb = lb
        self.ub = ub
        # Initial candidates are drawn uniformly inside the bounds.
        self.population = np.random.uniform(lb, ub, (pop_size, dim))

    def optimize(self, fitness_func):
        """
        Minimize `fitness_func` over the population.

        Parameters:
        - fitness_func (callable): Takes one position (1-D array of length
          `dim`) and returns a value to minimize.

        Returns:
        - tuple: (best_pos, best_fit) - the best evaluated position and its
          fitness. (None, inf) if no evaluation ever compared below infinity.
        """
        best_pos = None
        best_fit = float('inf')

        for t in tqdm(range(self.max_iters), desc="SCA Optimizer Progress"):
            for candidate in self.population:
                fitness = fitness_func(candidate)
                if fitness < best_fit:
                    best_fit = fitness
                    # Copy: a row view would be overwritten by the in-place
                    # update below, so the returned position would no longer
                    # be the one that achieved best_fit.
                    best_pos = candidate.copy()

            if best_pos is None:
                # Every fitness so far was NaN or inf, so there is nothing to
                # move towards; redraw the population instead of indexing None.
                self.population = np.random.uniform(
                    self.lb, self.ub, (self.pop_size, self.dim)
                )
                continue

            # Step-size factor; identical for every element in this iteration.
            # NOTE(review): `a` already shrinks with t and is shrunk again
            # here, giving a quadratic decay - confirm this is intended.
            a = 2 * (1 - t / self.max_iters)
            r1 = a - t * (a / self.max_iters)

            for i in range(self.pop_size):
                for j in range(self.dim):
                    r2 = 2 * np.pi * np.random.rand()
                    r3 = np.random.rand()
                    r4 = np.random.rand()

                    # r4 picks between the sine and the cosine update.
                    wave = np.sin(r2) if r4 < 0.5 else np.cos(r2)
                    current = self.population[i, j]
                    self.population[i, j] = current + r1 * wave * abs(r3 * best_pos[j] - current)

            # Make sure values stay within bounds
            self.population = np.clip(self.population, self.lb, self.ub)

        return best_pos, best_fit

def calculate_fitness(predictions, actual):
    """Return the root-mean-square error between predictions and actual values."""
    errors = predictions - actual
    mean_squared_error = np.mean(errors ** 2)
    return np.sqrt(mean_squared_error)


In [97]:
# Disabled draft of forecast_arima_garch_lstm. The triple-quoted string below
# is a bare expression statement, so nothing inside it is defined or executed.
"""
def forecast_arima_garch_lstm(data, best_params):
    long_term, seasonal, residual = DataPreprocessing(data).decompose_data()

    # Split long-term and residual data for train/test
    train_len = int(0.75 * len(long_term))
    test_len = len(long_term) - train_len

    # ARIMA-GARCH Forecast
    arima_garch_predictions = arima_forecast(long_term[:train_len],
                                             long_term[train_len:],
                                             order=(int(best_params[0]), 1, int(best_params[1])))

    # LSTM Forecast
    lstm_predictions = lstm_forecast(residual[:train_len],
                                     residual[train_len:],
                                     hidden_units=int(best_params[2]),
                                     epochs=int(best_params[3]))

    # Combine forecasts (adjusting lengths)
    min_len = min(len(arima_garch_predictions), len(lstm_predictions))
    arima_garch_predictions = arima_garch_predictions[-min_len:]
    lstm_predictions = lstm_predictions[-min_len:]

    combined_forecast = arima_garch_predictions + lstm_predictions

    return combined_forecast
"""

In [85]:
# Disabled draft of an arima_garch_forecast helper. The triple-quoted string
# below is a bare expression statement, so it does not define the function.
"""# ARIMA-GARCH Forecasting
def arima_garch_forecast(train_data, test_data, p, q):
    
    Forecast using ARIMA-GARCH model.
    
    Parameters:
    - train_data (pd.Series): Training data for the ARIMA-GARCH model.
    - test_data (pd.Series): Testing data for forecasting.
    - p (int): ARIMA p parameter (autoregressive term).
    - q (int): GARCH q parameter (lagged residual error term).
    
    Returns:
    - forecast (pd.Series): Combined forecast of ARIMA and GARCH models.
    
    # Fit ARIMA model
    arima_model = ARIMA(train_data, order=(p, 1, q))
    arima_fitted = arima_model.fit()

    # Fit GARCH model on ARIMA residuals
    residuals = arima_fitted.resid
    garch_model = arch_model(residuals, vol="GARCH", p=p, q=q)
    garch_fitted = garch_model.fit(disp="off")

    # Forecast for ARIMA and GARCH
    arima_forecast = arima_fitted.forecast(steps=len(test_data))

    # Forecast GARCH and get the mean forecast for the future time steps
    garch_forecast_result = garch_fitted.forecast(horizon=len(test_data), start=len(residuals)-1)

    # Extract GARCH forecast and ensure it's for multiple steps
    garch_forecast = garch_forecast_result.variance[-len(test_data):].values.flatten()

    print("ARIMA forecast length:", len(arima_forecast))
    print("GARCH forecast length:", len(garch_forecast))

    # Combine the ARIMA and GARCH forecasts
    forecast = arima_forecast + garch_forecast

    return forecast
"""


In [86]:
# Disabled earlier draft of lstm_forecast (it took `max_epochs` and returned a
# flat array). The triple-quoted string below is a bare expression statement,
# so nothing inside it is defined or executed.
"""
def lstm_forecast(train_data, test_data, hidden_units, max_epochs):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    train_scaled = scaler.fit_transform(train_data.values.reshape(-1, 1))

    X_train, y_train = [], []
    for i in range(5, len(train_scaled)):
        X_train.append(train_scaled[i-5:i, 0])
        y_train.append(train_scaled[i, 0])

    X_train, y_train = np.array(X_train), np.array(y_train)
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

    model = keras.models.Sequential()
    model.add(keras.layers.LSTM(units=hidden_units, input_shape=(X_train.shape[1], 1)))
    model.add(keras.layers.Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(X_train, y_train, epochs=max_epochs, batch_size=1, verbose=2)

    total_data = pd.concat((train_data, test_data), axis=0)
    inputs = total_data[len(total_data) - len(test_data) - 5:].values.reshape(-1, 1)
    inputs_scaled = scaler.transform(inputs)

    X_test = []
    for i in range(5, len(inputs_scaled)):
        X_test.append(inputs_scaled[i-5:i, 0])
    X_test = np.array(X_test)
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    lstm_predictions = model.predict(X_test)
    lstm_predictions = scaler.inverse_transform(lstm_predictions)

    return lstm_predictions.flatten()
"""

In [89]:
def forecast_arima_garch_lstm(data, best_params):
    """
    Forecast the last 25% of the decomposed series with ARIMA + GARCH on the
    trend component and an LSTM on the residual component.

    Parameters:
    - data: Series handed to `DataPreprocessing` for decomposition.
    - best_params (sequence): Four values, each truncated with int():
      [ARIMA/GARCH p, ARIMA/GARCH q, LSTM hidden units, LSTM epochs].

    Returns:
    - pd.Series: Sum of the ARIMA forecast, the GARCH volatility forecast and
      the LSTM forecast, indexed like the LSTM predictions.
    """
    long_term, _, residual = DataPreprocessing(data).decompose_data()

    # First 75% of each component trains the models; the rest is forecast.
    train_len = int(0.75 * len(long_term))
    trend_train, trend_test = long_term[:train_len], long_term[train_len:]
    horizon = len(trend_test)
    p, q = int(best_params[0]), int(best_params[1])

    # ARIMA-GARCH forecast, built from what this file actually defines: this
    # function used to call `arima_garch_forecast`, which exists only inside a
    # disabled string literal. The composition mirrors the 'Hybrid' branch of
    # forecast_models: ARIMA forecast plus GARCH volatility of its residuals.
    arima_fit = ARIMA(trend_train, order=(p, 1, q)).fit()
    arima_pred = np.asarray(arima_fit.forecast(steps=horizon))
    garch_pred = np.asarray(garch_forecast(arima_fit.resid, horizon, p, q))
    arima_garch_predictions = arima_pred + garch_pred

    # LSTM forecast; the active lstm_forecast takes `epochs`, not `max_epochs`.
    lstm_predictions = lstm_forecast(residual[:train_len],
                                     residual[train_len:],
                                     hidden_units=int(best_params[2]),
                                     epochs=int(best_params[3]))

    # Align on the shared tail by position. Explicit start offsets are used
    # because a `[-0:]` slice would select everything, not nothing.
    min_len = min(len(arima_garch_predictions), len(lstm_predictions))
    arima_tail = arima_garch_predictions[len(arima_garch_predictions) - min_len:]
    lstm_tail = lstm_predictions.iloc[len(lstm_predictions) - min_len:]

    # Combine predictions
    return pd.Series(arima_tail + lstm_tail.to_numpy(), index=lstm_tail.index)

In [90]:
# Convert differenced values back to stock prices
def reverse_differencing(actual_data, predicted_diff):
    """
    Turn predicted differences back into price levels.

    Parameters:
    - actual_data (pd.Series): Price series; only its last value is used, as
      the starting level.
    - predicted_diff (pd.Series or np.array): Predicted differenced values.

    Returns:
    - np.ndarray: Running sum of the differences on top of the starting
      level, one value per predicted difference.
    """
    anchor_price = actual_data.iloc[-1]
    # Prepend the starting level so the cumulative sum begins from it.
    levels = np.cumsum(np.r_[anchor_price, predicted_diff])
    # The first entry is just the starting level itself, so drop it.
    return levels[1:]


In [91]:
# Update the save_results_to_csv method to include both differenced and actual prices
def save_results_to_csv(original_data, actual_data, forecast_diff, filename='../data/processed/^SPX_Predicted.csv'):
    """
    Save the original data plus predicted differences and predicted prices
    to a CSV file.

    Parameters:
    - original_data (pd.DataFrame): Frame that is copied and extended with
      the two prediction columns.
    - actual_data (pd.Series): Actual stock prices; the last value is the
      starting level for rebuilding prices.
    - forecast_diff (pd.Series or np.array): Forecasted differenced values.
    - filename (str): Output filename for the CSV.

    Returns:
    - None

    Raises:
    - ValueError: If there are more predictions than rows in `original_data`.
    """
    predicted_stock_prices = reverse_differencing(actual_data, forecast_diff)
    # Plain arrays make the assignment below purely positional, independent
    # of whatever index the forecast carries.
    diff_values = np.asarray(forecast_diff)
    n_pred = len(predicted_stock_prices)

    if n_pred > len(original_data):
        raise ValueError(
            f"Got {n_pred} predictions but original_data has only "
            f"{len(original_data)} rows."
        )

    results_df = original_data.copy()
    results_df['Predicted_Difference_Adj_Close'] = np.nan
    results_df['Predicted_Adj_Close'] = np.nan

    # Fill the predictions only for the forecasted period (the last n_pred
    # rows). An explicit start offset is used because `iloc[-0:]` would
    # select every row when there are no predictions.
    if n_pred:
        start = len(results_df) - n_pred
        diff_col = results_df.columns.get_loc('Predicted_Difference_Adj_Close')
        price_col = results_df.columns.get_loc('Predicted_Adj_Close')
        results_df.iloc[start:, diff_col] = diff_values
        results_df.iloc[start:, price_col] = predicted_stock_prices

    results_df.to_csv(filename, index=False)
    print(f"Results saved to {filename}")

In [92]:
# Updating plot_results to show both predicted differenced values and stock prices
def plot_results(actual, forecast_diff):
    """
    Plot the actual series together with the prices rebuilt from the
    forecasted differences.

    Parameters:
    - actual (pd.Series): Actual stock prices.
    - forecast_diff (pd.Series): Forecasted differenced values from the model.

    Returns:
    - None
    """
    predicted_stock_prices = reverse_differencing(actual, forecast_diff)
    # The predictions are drawn over the last len(predictions) index labels.
    forecast_index = actual.index[-len(predicted_stock_prices):]

    curves = (
        (actual.index, actual,
         dict(label="Actual Data", color='steelblue', linewidth=2)),
        (forecast_index, predicted_stock_prices,
         dict(label="Predicted Stock Price", linestyle="--", color='orange', linewidth=2)),
    )

    plt.figure(figsize=(12, 6))
    for x_values, y_values, style in curves:
        plt.plot(x_values, y_values, **style)
    plt.title("ARIMA-GARCH-LSTM Predictions (Stock Prices)")
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.legend()
    plt.show()

In [93]:
def run_full_workflow(data, original_data, best_params=None, best_fitness=None, *, pop_size=1, max_iters=1):
    """
    Run the full ARIMA-GARCH-LSTM workflow. Optionally, provide best parameters and fitness.

    Parameters:
    - data: The input stock price series that is forecast, saved and plotted.
    - original_data (pd.DataFrame): Frame the predictions are appended to
      when the results are written to CSV.
    - best_params (list): Predefined best parameters for the model (optional),
      ordered as p, q, hidden_units, max_epochs.
    - best_fitness (float): Predefined best fitness value (optional).
    - pop_size (int): SCA population size, used only when optimizing
      (default 1, the previously hard-coded value).
    - max_iters (int): SCA iteration count, used only when optimizing
      (default 1, the previously hard-coded value).

    Returns:
    - None
    """
    # No decomposition here: forecast_arima_garch_lstm decomposes `data`
    # itself, and the components computed at this level were never used.

    # Check if best_params and best_fitness are provided
    if best_params is None or best_fitness is None:
        # If not provided, run SCA optimizer
        print("No predefined parameters provided. Running SCA optimization...")

        # Define bounds for SCA optimization
        lb = [1, 1, 10, 10]  # p, q, hidden_units, max_epochs
        ub = [5, 5, 100, 100]
        sca_optimizer = SCAOptimizer(pop_size=pop_size, max_iters=max_iters, dim=4, lb=lb, ub=ub)

        def fitness(params):
            # RMSE of the combined forecast for one candidate parameter set.
            return calculate_fitness(forecast_arima_garch_lstm(data, params), data)

        # Run SCA optimization
        best_params, best_fitness = sca_optimizer.optimize(fitness)

    print(f"Best Parameters: {best_params}, Best Fitness: {best_fitness}")

    forecast_diff = forecast_arima_garch_lstm(data, best_params)

    save_results_to_csv(original_data, data, forecast_diff)
    plot_results(data, forecast_diff)


