In [1]:
# Install necessary libraries
!pip install yfinance pandas numpy scikit-learn matplotlib lightgbm xgboost catboost statsmodels tensorflow

Collecting yfinance
  Obtaining dependency information for yfinance from https://files.pythonhosted.org/packages/c5/98/6fd94db428a9670d638d4469cedfea51671de0126b8f50de1b06e2245c97/yfinance-0.2.50-py2.py3-none-any.whl.metadata
  Using cached yfinance-0.2.50-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/aa/70/c853aec59839bceed032d52010ff5f1b8d87dc3114b762e4ba2727661a3b/pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl.metadata
  Using cached pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl.metadata (89 kB)
Collecting numpy
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/d1/bb/75b945874f931494891eac6ca06a1764d0e8208791f3addadb2963b83527/numpy-2.1.3-cp310-cp310-macosx_14_0_x86_64.whl.metadata
  Downloading numpy-2.1.3-cp310-cp310-macosx_14_0_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [

In [2]:
# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import tensorflow as tf
import warnings

In [None]:
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# 1. Data Acquisition
stock_symbol = 'SOXL'
start_date = '2010-03-11'
end_date = '2024-11-22'

# Download stock data
data = yf.download(stock_symbol, start=start_date, end=end_date)

# Check if data was downloaded successfully
if data.empty:
    raise ValueError(f"No data found for stock symbol {stock_symbol} between {start_date} and {end_date}.")

# Recent yfinance releases can return two-level columns (price field, ticker)
# even for a single symbol. Keep only the first level so that 'Close' is a plain
# column label and data['y'] below is a Series rather than a sub-frame.
# NOTE(review): assumes level 0 holds the price field names - confirm for the
# installed yfinance version.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# 2. Data Preprocessing
# We'll use the 'Close' price for prediction; 'Date' becomes a regular column.
data = data[['Close']].rename(columns={'Close': 'y'}).reset_index()

# Ensure data is sorted by date
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date')

# Feature Engineering: Create additional time-based features
data['DayOfWeek'] = data['Date'].dt.dayofweek
# isocalendar().week is a nullable UInt32 column; cast it to a plain integer so
# that DataFrame.values later produces a numeric (not object) array.
data['WeekOfYear'] = data['Date'].dt.isocalendar().week.astype('int64')
data['Month'] = data['Date'].dt.month
data['Quarter'] = data['Date'].dt.quarter
data['Year'] = data['Date'].dt.year

# Lag features (closing prices of the previous 7 rows)
for lag in range(1, 8):
    data[f'Lag_{lag}'] = data['y'].shift(lag)

# Rolling window features.
# The windows are computed on the series shifted by one row so that each row only
# sees closes strictly before it; rolling directly over 'y' would include the
# current row's close - the prediction target - in its own features.
previous_close = data['y'].shift(1)
data['RollingMean_7'] = previous_close.rolling(window=7).mean()
data['RollingStd_7'] = previous_close.rolling(window=7).std()

data['RollingMean_14'] = previous_close.rolling(window=14).mean()
data['RollingStd_14'] = previous_close.rolling(window=14).std()

# Drop rows with NaN values resulting from lag and rolling calculations
data = data.dropna()

# 3. Train-Test Split
# Chronological split: the first 80% of rows train, the last 20% test.
split_ratio = 0.8
split_index = int(len(data) * split_ratio)

# Explicit copies so later column assignments never write into a view of `data`.
train_data = data.iloc[:split_index].copy()
test_data = data.iloc[split_index:].copy()

print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

# 4. Feature Scaling
# Features to scale ('y' is deliberately first; its position is column 0 of the scaler)
features_to_scale = ['y'] + [f'Lag_{lag}' for lag in range(1, 8)] + \
                    ['RollingMean_7', 'RollingStd_7', 'RollingMean_14', 'RollingStd_14']

# Fit on the training rows only, then apply the same statistics to the test rows.
scaler = StandardScaler()
train_data_scaled = train_data.copy()
test_data_scaled = test_data.copy()

train_data_scaled[features_to_scale] = scaler.fit_transform(train_data[features_to_scale])
test_data_scaled[features_to_scale] = scaler.transform(test_data[features_to_scale])

# 5. Prepare Training and Testing Data
X_train = train_data_scaled.drop(columns=['Date', 'y'])
y_train = train_data_scaled['y']

X_test = test_data_scaled.drop(columns=['Date', 'y'])
y_test = test_data_scaled['y']

# 6. Model Training and Evaluation

# Function to evaluate model performance
def evaluate_model(true, predicted, model_name):
    """Print MSE, RMSE and MAE for a set of predictions and return the RMSE.

    Args:
        true: Observed target values.
        predicted: Model predictions, aligned positionally with ``true``.
        model_name: Label used in the printed report header.

    Returns:
        The root mean squared error, i.e. the square root of the MSE.
    """
    mean_sq_err = mean_squared_error(true, predicted)
    root_mean_sq_err = np.sqrt(mean_sq_err)
    mean_abs_err = mean_absolute_error(true, predicted)

    report_lines = (
        f"\n{model_name} Performance:",
        f"Mean Squared Error (MSE): {mean_sq_err:.4f}",
        f"Root Mean Squared Error (RMSE): {root_mean_sq_err:.4f}",
        f"Mean Absolute Error (MAE): {mean_abs_err:.4f}",
    )
    for line in report_lines:
        print(line)

    return root_mean_sq_err

# Function to plot predictions
def plot_predictions(true, predicted, model_name):
    """Plot actual and predicted values against their sample position.

    Args:
        true: Observed target values; any array-like (pandas Series, NumPy
            array or list) is accepted.
        predicted: Predicted values, aligned positionally with ``true``.
        model_name: Name shown in the figure title.
    """
    # Converting through NumPy drops any pandas index, so both curves are drawn
    # against 0..n-1. It also lets callers pass plain arrays, which have no
    # reset_index() method.
    actual_values = np.asarray(true).ravel()
    predicted_values = np.asarray(predicted).ravel()

    plt.figure(figsize=(14, 7))
    plt.plot(actual_values, label='Actual', color='blue')
    plt.plot(predicted_values, label='Predicted', color='red', alpha=0.7)
    plt.title(f'Actual vs Predicted Closing Prices ({model_name})')
    plt.xlabel('Sample Index')
    plt.ylabel('Closing Price (Scaled)')
    plt.legend()
    plt.show()

### **Model 1: LightGBM Regressor**

# Initialize the model
lgbm = LGBMRegressor(random_state=42)

# Train the model
lgbm.fit(X_train, y_train)

# Predict
y_pred_lgbm = lgbm.predict(X_test)

# Evaluate (targets and predictions are both in standardized units)
rmse_lgbm = evaluate_model(y_test, y_pred_lgbm, 'LightGBM Regressor')

# Plot predictions
plot_predictions(y_test, y_pred_lgbm, 'LightGBM Regressor')

### **Model 2: ARIMA**

# For ARIMA, we'll fit on the original (unscaled) closing prices
from statsmodels.tsa.stattools import adfuller

# Work on a standalone series with a clean 0..n-1 index; nothing is written
# back into train_data.
arima_train = train_data['y'].reset_index(drop=True)

# Augmented Dickey-Fuller test for stationarity
result = adfuller(arima_train)
print('ADF Statistic:', result[0])
print('p-value:', result[1])

# Let the test decide the differencing order: difference once when the
# unit-root null hypothesis is not rejected at the 5% level, otherwise fit the
# series as-is. ARIMA performs the differencing internally.
arima_d = 1 if result[1] > 0.05 else 0

# Fit ARIMA model
arima_order = (5, arima_d, 0)  # p and q can be adjusted or determined using AIC/BIC criteria
arima_model = ARIMA(arima_train, order=arima_order)
arima_model_fit = arima_model.fit()

# Forecast one value per test row
forecast_steps = len(test_data)
arima_forecast = arima_model_fit.forecast(steps=forecast_steps)
arima_forecast_values = np.asarray(arima_forecast)

# Evaluate.
# The other models are scored against the standardized target (y_test), so the
# ARIMA forecast is standardized with the same statistics for 'y' before
# scoring; otherwise its RMSE would be in price units and not comparable.
y_position = features_to_scale.index('y')
arima_forecast_scaled = (arima_forecast_values - scaler.mean_[y_position]) / scaler.scale_[y_position]
rmse_arima = evaluate_model(y_test, arima_forecast_scaled, 'ARIMA')

# Plot predictions in original price units
plt.figure(figsize=(14, 7))
plt.plot(test_data['y'].reset_index(drop=True), label='Actual', color='blue')
plt.plot(arima_forecast_values, label='Predicted', color='red', alpha=0.7)
plt.title('Actual vs Predicted Closing Prices (ARIMA)')
plt.xlabel('Sample Index')
plt.ylabel('Closing Price')
plt.legend()
plt.show()

### **Model 3: LSTM Neural Network**

# Prepare data for LSTM
def create_lstm_dataset(X, y, time_steps=1):
    """Turn row-wise features and targets into sliding windows.

    Each sample is ``time_steps`` consecutive rows of ``X``; its target is the
    value of ``y`` at the row immediately after that window.

    Args:
        X: Object supporting positional ``.iloc`` slicing whose slices expose
            ``.values`` (used here with a pandas DataFrame).
        y: Object supporting positional ``.iloc`` access (a pandas Series here).
        time_steps: Number of rows in each window.

    Returns:
        A tuple ``(windows, targets)`` of NumPy arrays, one entry per window.
    """
    window_starts = range(len(X) - time_steps)
    windows = [X.iloc[start:start + time_steps].values for start in window_starts]
    targets = [y.iloc[start + time_steps] for start in window_starts]
    return np.array(windows), np.array(targets)

time_steps = 10

# Stack the already-scaled train and test frames so that the first test windows
# can reach back into the last `time_steps` rows of the training period.
combined_data = pd.concat([train_data_scaled, test_data_scaled], axis=0)
# Cast to float so every window is a plain numeric array; this guards against a
# nullable-integer column (e.g. from isocalendar().week) turning `.values` into
# an object array that Keras cannot consume.
X_lstm = combined_data.drop(columns=['Date', 'y']).astype(float)
y_lstm = combined_data['y']

# Create datasets
X_lstm, y_lstm = create_lstm_dataset(X_lstm, y_lstm, time_steps)

# Split back into train and test.
# Window i targets row i + time_steps, so windows below split_index - time_steps
# have their target inside the training rows.
X_train_lstm = X_lstm[:split_index - time_steps]
y_train_lstm = y_lstm[:split_index - time_steps]

X_test_lstm = X_lstm[split_index - time_steps:]
y_test_lstm = y_lstm[split_index - time_steps:]

# Build LSTM Model
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), return_sequences=True))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(32, return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))

lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = lstm_model.fit(X_train_lstm, y_train_lstm, epochs=20, batch_size=32, validation_split=0.1, verbose=1)

# Predict
y_pred_lstm = lstm_model.predict(X_test_lstm)

# Evaluate
rmse_lstm = evaluate_model(y_test_lstm, y_pred_lstm.flatten(), 'LSTM')

# Plot predictions. y_test_lstm is a NumPy array; wrapping it in a Series keeps
# the call valid for a helper that expects pandas input.
plot_predictions(pd.Series(y_test_lstm), y_pred_lstm.flatten(), 'LSTM')

# 7. Model Comparison

print("\nModel RMSE Comparison:")
print(f"LightGBM Regressor RMSE: {rmse_lgbm:.4f}")
print(f"ARIMA RMSE: {rmse_arima:.4f}")
print(f"LSTM RMSE: {rmse_lstm:.4f}")

# 8. Final Model Selection and Prediction

# Assuming LightGBM performed the best based on RMSE
# We can retrain LightGBM on the entire dataset and make future predictions if needed

# Position of the target inside features_to_scale; the scalers store per-column
# statistics in that order.
y_position = features_to_scale.index('y')

# Retrain on entire dataset.
# A dedicated scaler is fitted here so that `scaler` keeps the training-set
# statistics that the test-set predictions were produced with.
final_scaler = StandardScaler()
full_data = data.copy()
full_data[features_to_scale] = final_scaler.fit_transform(data[features_to_scale])

X_full = full_data.drop(columns=['Date', 'y'])
y_full = full_data['y']

lgbm_final = LGBMRegressor(random_state=42)
lgbm_final.fit(X_full, y_full)

# Future Prediction (Next Day Closing Price)
# Build the feature row for the next session from the UNSCALED closes; scaling
# is applied exactly once further down.
close_history = data['y']

# Next business day (skips weekends; exchange holidays are not accounted for)
next_date = data['Date'].iloc[-1] + pd.offsets.BDay(1)

next_features = {
    'Date': next_date,
    # Placeholder only: the scaler expects a 'y' column, which is dropped
    # before prediction.
    'y': close_history.iloc[-1],
    'DayOfWeek': next_date.dayofweek,
    'WeekOfYear': next_date.isocalendar()[1],
    'Month': next_date.month,
    'Quarter': next_date.quarter,
    'Year': next_date.year,
    # Rolling statistics over the most recent 7 / 14 known closes
    'RollingMean_7': close_history.iloc[-7:].mean(),
    'RollingStd_7': close_history.iloc[-7:].std(),
    'RollingMean_14': close_history.iloc[-14:].mean(),
    'RollingStd_14': close_history.iloc[-14:].std(),
}
# Lag_k for the next session is the close k sessions back, i.e. the k-th value
# from the end of the known history (Lag_1 is the latest close).
for lag in range(1, 8):
    next_features[f'Lag_{lag}'] = close_history.iloc[-lag]

# Same column order as the training frame
last_row = pd.DataFrame([next_features])[list(data.columns)]

# Scale features
last_row[features_to_scale] = final_scaler.transform(last_row[features_to_scale])

# Prepare input features
X_next = last_row.drop(columns=['Date', 'y'])

# Predict the next day's closing price
next_day_prediction_scaled = lgbm_final.predict(X_next)

# Inverse transform the prediction using the statistics of the 'y' column
next_day_prediction = (
    next_day_prediction_scaled[0] * final_scaler.scale_[y_position]
    + final_scaler.mean_[y_position]
)

print(f"\nPredicted Next Day Closing Price: {next_day_prediction:.2f}")

# 9. Plotting the Actual vs Predicted Prices for Test Data

# Inverse transform the scaled prices with the training-set scaler, again using
# the 'y' column's mean and scale.
y_test_actual = y_test.to_numpy() * scaler.scale_[y_position] + scaler.mean_[y_position]
y_pred_actual = y_pred_lgbm * scaler.scale_[y_position] + scaler.mean_[y_position]

test_dates = test_data['Date'].reset_index(drop=True)

plt.figure(figsize=(14, 7))
plt.plot(test_dates, y_test_actual, label='Actual', color='blue')
plt.plot(test_dates, y_pred_actual, label='Predicted', color='red', alpha=0.7)
plt.title('Actual vs Predicted Closing Prices (LightGBM)')
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.legend()
plt.show()
