In [None]:
# Install necessary libraries
!pip install yfinance pandas numpy scikit-learn matplotlib lightgbm xgboost catboost statsmodels tensorflow

In [None]:
# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import tensorflow as tf
import warnings

In [None]:
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# 1. Data Acquisition
stock_symbol = 'SOXL'
start_date = '2010-03-11'
end_date = '2024-11-22'

# Download stock data
data = yf.download(stock_symbol, start=start_date, end=end_date)

# Check if data was downloaded successfully
if data.empty:
    raise ValueError(f"No data found for stock symbol {stock_symbol} between {start_date} and {end_date}.")

# Some yfinance releases return MultiIndex columns (field, ticker) even for a
# single ticker. Keep only the field level so that data['y'] is a plain Series
# and the feature columns end up with simple string names.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# 2. Data Preprocessing
# We'll use the 'Close' price for prediction
data = data[['Close']].copy()
data.rename(columns={'Close': 'y'}, inplace=True)

# Reset index to have 'Date' as a column
data.reset_index(inplace=True)

# Ensure data is sorted by date
data.sort_values('Date', inplace=True)

# Feature Engineering: Create additional time-based features
data['Date'] = pd.to_datetime(data['Date'])
data['DayOfWeek'] = data['Date'].dt.dayofweek
# isocalendar().week is a pandas extension integer dtype; cast to a plain NumPy
# integer so that `.values` on the feature frame stays numeric (not object).
data['WeekOfYear'] = data['Date'].dt.isocalendar().week.astype('int64')
data['Month'] = data['Date'].dt.month
data['Quarter'] = data['Date'].dt.quarter
data['Year'] = data['Date'].dt.year

# Lag features (previous day's closing price)
for lag in range(1, 8):  # Using 7 days lag
    data[f'Lag_{lag}'] = data['y'].shift(lag)

# Rolling window features.
# The windows are computed on the closes *before* the current row (shift(1)).
# A window that includes the current close would leak the target: together
# with Lag_1..Lag_6, RollingMean_7 would determine y exactly.
prior_close = data['y'].shift(1)
data['RollingMean_7'] = prior_close.rolling(window=7).mean()
data['RollingStd_7'] = prior_close.rolling(window=7).std()

data['RollingMean_14'] = prior_close.rolling(window=14).mean()
data['RollingStd_14'] = prior_close.rolling(window=14).std()

# Drop rows with NaN values resulting from lag and rolling calculations
data.dropna(inplace=True)

# 3. Train-Test Split
# We'll use the last 20% of the data as the test set
split_ratio = 0.8
split_index = int(len(data) * split_ratio)

# Explicit copies: later cells add columns to these frames, which must not be
# chained assignments on a slice of `data`.
train_data = data[:split_index].copy()
test_data = data[split_index:].copy()

print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

# 4. Feature Scaling
# Features to scale ('y' is deliberately first; it is the scaled target)
features_to_scale = ['y'] + [f'Lag_{lag}' for lag in range(1, 8)] + \
                    ['RollingMean_7', 'RollingStd_7', 'RollingMean_14', 'RollingStd_14']

# The scaler is fitted on the training rows only and merely applied to the test rows.
scaler = StandardScaler()
train_data_scaled = train_data.copy()
test_data_scaled = test_data.copy()

train_data_scaled[features_to_scale] = scaler.fit_transform(train_data[features_to_scale])
test_data_scaled[features_to_scale] = scaler.transform(test_data[features_to_scale])

# 5. Prepare Training and Testing Data
X_train = train_data_scaled.drop(columns=['Date', 'y'])
y_train = train_data_scaled['y']

X_test = test_data_scaled.drop(columns=['Date', 'y'])
y_test = test_data_scaled['y']

# 6. Model Training and Evaluation

# Function to evaluate model performance
def evaluate_model(true, predicted, model_name):
    """Print MSE, RMSE and MAE for a set of predictions and return the RMSE.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions, same length as ``true``.
        model_name: Label used in the printed report header.

    Returns:
        The root mean squared error.
    """
    squared_error = mean_squared_error(true, predicted)
    absolute_error = mean_absolute_error(true, predicted)
    root_squared_error = np.sqrt(squared_error)

    report_lines = (
        f"\n{model_name} Performance:",
        f"Mean Squared Error (MSE): {squared_error:.4f}",
        f"Root Mean Squared Error (RMSE): {root_squared_error:.4f}",
        f"Mean Absolute Error (MAE): {absolute_error:.4f}",
    )
    for report_line in report_lines:
        print(report_line)

    return root_squared_error

# Function to plot predictions
def plot_predictions(true, predicted, model_name):
    """Plot actual against predicted values on a shared sample-index axis.

    Args:
        true: Actual values; a pandas Series or any array-like (the LSTM
            section passes a NumPy array, which has no ``reset_index``).
        predicted: Predicted values, array-like.
        model_name: Model label shown in the figure title.
    """
    # Converting to flat NumPy arrays drops any pandas index, so both lines are
    # drawn against positions 0..n-1 regardless of the input type.
    actual_values = np.asarray(true).ravel()
    predicted_values = np.asarray(predicted).ravel()

    plt.figure(figsize=(14, 7))
    plt.plot(actual_values, label='Actual', color='blue')
    plt.plot(predicted_values, label='Predicted', color='red', alpha=0.7)
    plt.title(f'Actual vs Predicted Closing Prices ({model_name})')
    plt.xlabel('Sample Index')
    plt.ylabel('Closing Price (Scaled)')
    plt.legend()
    plt.show()

### **Model 1: LightGBM Regressor**

# Initialize the model
lgbm = LGBMRegressor(random_state=42)

# Train the model
lgbm.fit(X_train, y_train)

# Predict
y_pred_lgbm = lgbm.predict(X_test)

# Evaluate (targets are standardized, so this RMSE is in standardized units)
rmse_lgbm = evaluate_model(y_test, y_pred_lgbm, 'LightGBM Regressor')

# Plot predictions
plot_predictions(y_test, y_pred_lgbm, 'LightGBM Regressor')

### **Model 2: ARIMA**

# For ARIMA, we'll use the original data without scaling
from statsmodels.tsa.stattools import adfuller

# Ensure stationarity
result = adfuller(train_data['y'])
print('ADF Statistic:', result[0])
print('p-value:', result[1])

# Keep the (possibly differenced) series as a column for inspection.
# assign() returns a new frame, so no chained assignment on a slice of `data`.
# The ARIMA fit below does its own differencing through d=1 in `arima_order`.
if result[1] > 0.05:
    # Differencing to make the series stationary (first row is NaN)
    train_data = train_data.assign(y_diff=train_data['y'].diff())
else:
    train_data = train_data.assign(y_diff=train_data['y'])

# Fit ARIMA model
arima_order = (5, 1, 0)  # This can be adjusted or determined using AIC/BIC criteria
arima_model = ARIMA(train_data['y'], order=arima_order)
arima_model_fit = arima_model.fit()

# Forecast the whole test horizon in one go (multi-step forecast)
forecast_steps = len(test_data)
arima_forecast = arima_model_fit.forecast(steps=forecast_steps)

# Evaluate in price units first ...
rmse_arima_price = evaluate_model(test_data['y'], arima_forecast, 'ARIMA')

# ... then express the RMSE in the same standardized units as the other models.
# Dividing by the training standard deviation of 'y' is exactly what scaling
# both series with `scaler` would do to the RMSE.
y_scale = scaler.scale_[features_to_scale.index('y')]
rmse_arima = rmse_arima_price / y_scale
print(f"ARIMA RMSE in standardized units: {rmse_arima:.4f}")

# Plot predictions
plt.figure(figsize=(14, 7))
plt.plot(test_data['y'].reset_index(drop=True), label='Actual', color='blue')
plt.plot(arima_forecast.values, label='Predicted', color='red', alpha=0.7)
plt.title('Actual vs Predicted Closing Prices (ARIMA)')
plt.xlabel('Sample Index')
plt.ylabel('Closing Price')
plt.legend()
plt.show()

### **Model 3: LSTM Neural Network**

# Prepare data for LSTM
def create_lstm_dataset(X, y, time_steps=1):
    """Build sliding-window samples for a sequence model.

    Sample ``i`` consists of rows ``i .. i + time_steps - 1`` of ``X`` and is
    paired with the target at position ``i + time_steps``.

    Args:
        X: Feature table (DataFrame or 2-D array-like), one row per time step.
        y: Target values (Series or 1-D array-like) aligned with ``X``.
        time_steps: Number of consecutive rows per sample.

    Returns:
        Tuple ``(Xs, ys)`` of float32 arrays; ``Xs`` has shape
        ``(len(X) - time_steps, time_steps, n_features)`` when at least one
        window fits, and both arrays are empty otherwise.
    """
    # Convert once, and force a numeric dtype: a frame mixing extension integer
    # and float columns would otherwise yield object arrays that Keras rejects.
    feature_matrix = np.asarray(X, dtype=np.float32)
    target_vector = np.asarray(y, dtype=np.float32)

    window_count = max(len(feature_matrix) - time_steps, 0)
    Xs = [feature_matrix[start:start + time_steps] for start in range(window_count)]
    ys = [target_vector[start + time_steps] for start in range(window_count)]
    return np.array(Xs, dtype=np.float32), np.array(ys, dtype=np.float32)

time_steps = 10

# Combine train and test data so windows can span the train/test boundary
combined_data = pd.concat([train_data_scaled, test_data_scaled], axis=0)
# Cast to float32 up front: the feature frame can mix integer (incl. pandas
# extension) and float columns, which would otherwise become object arrays.
X_lstm = combined_data.drop(columns=['Date', 'y']).astype('float32')
y_lstm = combined_data['y'].astype('float32')

# Create datasets
X_lstm, y_lstm = create_lstm_dataset(X_lstm, y_lstm, time_steps)

# Split back into train and test.
# Window i predicts row i + time_steps, so the first test target (row
# split_index) belongs to window split_index - time_steps.
X_train_lstm = X_lstm[:split_index - time_steps]
y_train_lstm = y_lstm[:split_index - time_steps]

X_test_lstm = X_lstm[split_index - time_steps:]
y_test_lstm = y_lstm[split_index - time_steps:]

# Build LSTM Model
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), return_sequences=True))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(32, return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))

lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = lstm_model.fit(X_train_lstm, y_train_lstm, epochs=20, batch_size=32, validation_split=0.1, verbose=1)

# Predict
y_pred_lstm = lstm_model.predict(X_test_lstm)

# Evaluate
rmse_lstm = evaluate_model(y_test_lstm, y_pred_lstm.flatten(), 'LSTM')

# Plot predictions (wrapped in a Series because y_test_lstm is a NumPy array)
plot_predictions(pd.Series(y_test_lstm), y_pred_lstm.flatten(), 'LSTM')

# 7. Model Comparison

print("\nModel RMSE Comparison:")
print(f"LightGBM Regressor RMSE: {rmse_lgbm:.4f}")
print(f"ARIMA RMSE: {rmse_arima:.4f}")
print(f"LSTM RMSE: {rmse_lstm:.4f}")

# 8. Final Model Selection and Prediction

# Assuming LightGBM performed the best based on RMSE
# We can retrain LightGBM on the entire dataset and make future predictions if needed

# Position of the target inside the scaler's column order
y_col = features_to_scale.index('y')

# Retrain on entire dataset.
# A dedicated scaler is used so the train-fitted `scaler` stays intact for the
# inverse transform of the test-set results in section 9.
final_scaler = StandardScaler()
full_data = data.copy()
full_data[features_to_scale] = final_scaler.fit_transform(data[features_to_scale])

X_full = full_data.drop(columns=['Date', 'y'])
y_full = full_data['y']

lgbm_final = LGBMRegressor(random_state=42)
lgbm_final.fit(X_full, y_full)

# Future Prediction (Next Day Closing Price)
# Features are built from the *unscaled* closes and scaled exactly once below.
close_history = data['y']

# Next business day after the last observation (exchange holidays are not handled)
next_date = data['Date'].iloc[-1] + pd.offsets.BDay(1)

next_day_features = {
    # Placeholder only: the scaler expects a 'y' column, the model never sees it
    'y': close_history.iloc[-1],
    'DayOfWeek': next_date.dayofweek,
    'WeekOfYear': next_date.isocalendar()[1],
    'Month': next_date.month,
    'Quarter': next_date.quarter,
    'Year': next_date.year,
    # Rolling statistics over the most recent known closes
    'RollingMean_7': close_history.iloc[-7:].mean(),
    'RollingStd_7': close_history.iloc[-7:].std(),
    'RollingMean_14': close_history.iloc[-14:].mean(),
    'RollingStd_14': close_history.iloc[-14:].std(),
}
# Lag_k for the next day is the k-th most recent known close (Lag_1 = last close)
for lag in range(1, 8):
    next_day_features[f'Lag_{lag}'] = close_history.iloc[-lag]

last_row = pd.DataFrame([next_day_features])

# Scale features
last_row[features_to_scale] = final_scaler.transform(last_row[features_to_scale])

# Prepare input features in the exact column order used for training
X_next = last_row[X_full.columns]

# Predict the next day's closing price
next_day_prediction_scaled = lgbm_final.predict(X_next)

# Inverse transform the prediction using the statistics of the 'y' column
next_day_prediction = next_day_prediction_scaled[0] * final_scaler.scale_[y_col] + final_scaler.mean_[y_col]

print(f"\nPredicted Next Day Closing Price: {next_day_prediction:.2f}")

# 9. Plotting the Actual vs Predicted Prices for Test Data

# Inverse transform the scaled prices with the train-fitted scaler's 'y' statistics
y_mean = scaler.mean_[y_col]
y_std = scaler.scale_[y_col]
y_test_actual = y_test.values * y_std + y_mean
y_pred_actual = y_pred_lgbm * y_std + y_mean

plt.figure(figsize=(14, 7))
plt.plot(test_data['Date'].reset_index(drop=True), y_test_actual, label='Actual', color='blue')
plt.plot(test_data['Date'].reset_index(drop=True), y_pred_actual, label='Predicted', color='red', alpha=0.7)
plt.title('Actual vs Predicted Closing Prices (LightGBM)')
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.legend()
plt.show()


In [None]:
# Install necessary libraries
# Note: Uncomment the following line if running in a fresh environment
# !pip install yfinance pandas numpy scikit-learn matplotlib lightgbm xgboost catboost statsmodels tensorflow

# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import tensorflow as tf
import warnings
import re

warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# 1. Data Acquisition
stock_symbol = 'SOXL'
start_date = '2010-03-11'
end_date = '2024-11-22'

# Download stock data
data = yf.download(stock_symbol, start=start_date, end=end_date)

# Check if data was downloaded successfully
if data.empty:
    raise ValueError(f"No data found for stock symbol {stock_symbol} between {start_date} and {end_date}.")

# Some yfinance releases return MultiIndex columns (field, ticker) even for a
# single ticker. Keep only the field level so that data['y'] is a plain Series
# and the feature columns end up with simple string names.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# 2. Data Preprocessing
# We'll use the 'Close' price for prediction
data = data[['Close']].copy()
data.rename(columns={'Close': 'y'}, inplace=True)

# Reset index to have 'Date' as a column
data.reset_index(inplace=True)

# Ensure data is sorted by date
data.sort_values('Date', inplace=True)

# Feature Engineering: Create additional time-based features
data['Date'] = pd.to_datetime(data['Date'])
data['DayOfWeek'] = data['Date'].dt.dayofweek
# isocalendar().week is a pandas extension integer dtype; cast to a plain NumPy
# integer so that `.values` on the feature frame stays numeric (not object).
data['WeekOfYear'] = data['Date'].dt.isocalendar().week.astype('int64')
data['Month'] = data['Date'].dt.month
data['Quarter'] = data['Date'].dt.quarter
data['Year'] = data['Date'].dt.year

# Lag features (previous day's closing price)
for lag in range(1, 8):  # Using 7 days lag
    data[f'Lag_{lag}'] = data['y'].shift(lag)

# Rolling window features.
# The windows are computed on the closes *before* the current row (shift(1)).
# A window that includes the current close would leak the target: together
# with Lag_1..Lag_6, RollingMean_7 would determine y exactly.
prior_close = data['y'].shift(1)
data['RollingMean_7'] = prior_close.rolling(window=7).mean()
data['RollingStd_7'] = prior_close.rolling(window=7).std()

data['RollingMean_14'] = prior_close.rolling(window=14).mean()
data['RollingStd_14'] = prior_close.rolling(window=14).std()

# Drop rows with NaN values resulting from lag and rolling calculations
data.dropna(inplace=True)

# 3. Train-Test Split
# We'll use the last 20% of the data as the test set
split_ratio = 0.8
split_index = int(len(data) * split_ratio)

# Explicit copies: later cells add columns to these frames, which must not be
# chained assignments on a slice of `data`.
train_data = data[:split_index].copy()
test_data = data[split_index:].copy()

print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

# 4. Feature Scaling
# Features to scale ('y' is deliberately first; it is the scaled target)
features_to_scale = ['y'] + [f'Lag_{lag}' for lag in range(1, 8)] + \
                    ['RollingMean_7', 'RollingStd_7', 'RollingMean_14', 'RollingStd_14']

# The scaler is fitted on the training rows only and merely applied to the test rows.
scaler = StandardScaler()
train_data_scaled = train_data.copy()
test_data_scaled = test_data.copy()

train_data_scaled[features_to_scale] = scaler.fit_transform(train_data[features_to_scale])
test_data_scaled[features_to_scale] = scaler.transform(test_data[features_to_scale])

# 5. Prepare Training and Testing Data
X_train = train_data_scaled.drop(columns=['Date', 'y'])
y_train = train_data_scaled['y']

X_test = test_data_scaled.drop(columns=['Date', 'y'])
y_test = test_data_scaled['y']

# 6. Sanitize Feature Names to Remove Special Characters
def sanitize_column_names(columns):
    """Return the column labels as strings safe for use as feature names.

    Each label is converted with ``str()`` and every character other than an
    ASCII letter, digit or underscore is replaced by an underscore.
    """
    disallowed = re.compile(r'[^A-Za-z0-9_]')
    cleaned = []
    for label in columns:
        cleaned.append(disallowed.sub('_', str(label)))
    return cleaned

# Give both feature matrices sanitized, string-only column names
for _feature_frame in (X_train, X_test):
    _feature_frame.columns = sanitize_column_names(_feature_frame.columns)

# 7. Model Training and Evaluation

# Function to evaluate model performance
def evaluate_model(true, predicted, model_name):
    """Print MSE, RMSE and MAE for a set of predictions and return the RMSE.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions, same length as ``true``.
        model_name: Label used in the printed report header.

    Returns:
        The root mean squared error.
    """
    squared_error = mean_squared_error(true, predicted)
    absolute_error = mean_absolute_error(true, predicted)
    root_squared_error = np.sqrt(squared_error)

    report_lines = (
        f"\n{model_name} Performance:",
        f"Mean Squared Error (MSE): {squared_error:.4f}",
        f"Root Mean Squared Error (RMSE): {root_squared_error:.4f}",
        f"Mean Absolute Error (MAE): {absolute_error:.4f}",
    )
    for report_line in report_lines:
        print(report_line)

    return root_squared_error

# Function to plot predictions
def plot_predictions(true, predicted, model_name):
    """Plot actual against predicted values on a shared sample-index axis.

    Args:
        true: Actual values; a pandas Series or any array-like (the LSTM
            section passes a NumPy array, which has no ``.values``).
        predicted: Predicted values, array-like.
        model_name: Model label shown in the figure title.
    """
    # Flat NumPy arrays work for Series and ndarray inputs alike.
    actual_values = np.asarray(true).ravel()
    predicted_values = np.asarray(predicted).ravel()

    plt.figure(figsize=(14, 7))
    plt.plot(range(len(actual_values)), actual_values, label='Actual', color='blue')
    plt.plot(range(len(predicted_values)), predicted_values, label='Predicted', color='red', alpha=0.7)
    plt.title(f'Actual vs Predicted Closing Prices ({model_name})')
    plt.xlabel('Sample Index')
    plt.ylabel('Closing Price (Scaled)')
    plt.legend()
    plt.show()

### **Model 1: LightGBM Regressor**

# Initialize the model
lgbm = LGBMRegressor(random_state=42)

# Success is tracked with explicit flags instead of probing locals(): in a
# notebook, a y_pred_lgbm left over from an earlier run would otherwise be
# evaluated as if it came from this model.
lgbm_trained = False
lgbm_predicted = False

# Train the model
try:
    lgbm.fit(X_train, y_train)
    lgbm_trained = True
    print("LightGBM training completed successfully.")
except Exception as e:
    print(f"Error during LightGBM training: {e}")

# Predict (only attempted when training succeeded)
if lgbm_trained:
    try:
        y_pred_lgbm = lgbm.predict(X_test)
        lgbm_predicted = True
        print("LightGBM prediction completed successfully.")
    except Exception as e:
        print(f"Error during LightGBM prediction: {e}")

# Evaluate (targets are standardized, so this RMSE is in standardized units)
if lgbm_predicted:
    rmse_lgbm = evaluate_model(y_test, y_pred_lgbm, 'LightGBM Regressor')

    # Plot predictions
    plot_predictions(y_test, y_pred_lgbm, 'LightGBM Regressor')
else:
    rmse_lgbm = None
    print("Skipping evaluation and plotting for LightGBM due to previous errors.")

### **Model 2: ARIMA**

# For ARIMA, we'll use the original data without scaling
# Ensure stationarity
result = adfuller(train_data['y'])
print('\nAugmented Dickey-Fuller Test:')
print('ADF Statistic:', result[0])
print('p-value:', result[1])

# Keep the (possibly differenced) series as a column for inspection.
# assign() returns a new frame, so no chained assignment on a slice of `data`.
# The ARIMA fit below does its own differencing through d=1 in `arima_order`.
if result[1] > 0.05:
    # Differencing to make the series stationary (first row is NaN)
    train_data = train_data.assign(y_diff=train_data['y'].diff())
    print("Applied differencing to make the series stationary.")
else:
    train_data = train_data.assign(y_diff=train_data['y'])
    print("Series is stationary. No differencing applied.")

# Fit ARIMA model
arima_order = (5, 1, 0)  # This can be adjusted or determined using AIC/BIC criteria
arima_model = ARIMA(train_data['y'], order=arima_order)
arima_fitted = False
try:
    arima_model_fit = arima_model.fit()
    arima_fitted = True
    print("ARIMA model fitted successfully.")
except Exception as e:
    print(f"Error during ARIMA model fitting: {e}")

# Forecast the whole test horizon in one go (only when the fit succeeded, so a
# fitted model left over from an earlier run is never used by accident)
forecast_steps = len(test_data)
arima_forecasted = False
if arima_fitted:
    try:
        arima_forecast = arima_model_fit.forecast(steps=forecast_steps)
        arima_forecasted = True
        print("ARIMA forecasting completed successfully.")
    except Exception as e:
        print(f"Error during ARIMA forecasting: {e}")

# Evaluate
rmse_arima = None
if arima_forecasted:
    try:
        rmse_arima_price = evaluate_model(test_data['y'], arima_forecast, 'ARIMA')
        # Express the RMSE in the same standardized units as the other models:
        # dividing by the training standard deviation of 'y' is exactly what
        # scaling both series with `scaler` would do to the RMSE.
        rmse_arima = rmse_arima_price / scaler.scale_[features_to_scale.index('y')]
        print(f"ARIMA RMSE in standardized units: {rmse_arima:.4f}")
    except Exception as e:
        print(f"Error during ARIMA evaluation: {e}")
        rmse_arima = None

# Plot predictions
if rmse_arima is not None:
    plt.figure(figsize=(14, 7))
    plt.plot(test_data['Date'].reset_index(drop=True), test_data['y'].reset_index(drop=True), label='Actual', color='blue')
    plt.plot(test_data['Date'].reset_index(drop=True), arima_forecast.values, label='Predicted', color='red', alpha=0.7)
    plt.title('Actual vs Predicted Closing Prices (ARIMA)')
    plt.xlabel('Date')
    plt.ylabel('Closing Price')
    plt.legend()
    plt.show()
else:
    print("Skipping ARIMA plotting due to previous errors.")

### **Model 3: LSTM Neural Network**

# Prepare data for LSTM
def create_lstm_dataset(X, y, time_steps=1):
    """Build sliding-window samples for a sequence model.

    Sample ``i`` consists of rows ``i .. i + time_steps - 1`` of ``X`` and is
    paired with the target at position ``i + time_steps``.

    Args:
        X: Feature table (DataFrame or 2-D array-like), one row per time step.
        y: Target values (Series or 1-D array-like) aligned with ``X``.
        time_steps: Number of consecutive rows per sample.

    Returns:
        Tuple ``(Xs, ys)`` of float32 arrays; ``Xs`` has shape
        ``(len(X) - time_steps, time_steps, n_features)`` when at least one
        window fits, and both arrays are empty otherwise.
    """
    # Convert once, and force a numeric dtype: a frame mixing extension integer
    # and float columns would otherwise yield object arrays that Keras rejects.
    feature_matrix = np.asarray(X, dtype=np.float32)
    target_vector = np.asarray(y, dtype=np.float32)

    window_count = max(len(feature_matrix) - time_steps, 0)
    Xs = [feature_matrix[start:start + time_steps] for start in range(window_count)]
    ys = [target_vector[start + time_steps] for start in range(window_count)]
    return np.array(Xs, dtype=np.float32), np.array(ys, dtype=np.float32)

time_steps = 10

# Combine train and test data so windows can span the train/test boundary
combined_data = pd.concat([train_data_scaled, test_data_scaled], axis=0)
# Cast to float32 up front: the feature frame can mix integer (incl. pandas
# extension) and float columns, which would otherwise become object arrays.
X_lstm = combined_data.drop(columns=['Date', 'y']).astype('float32')
y_lstm = combined_data['y'].astype('float32')

# Create datasets
X_lstm, y_lstm = create_lstm_dataset(X_lstm, y_lstm, time_steps)

# Split back into train and test.
# Window i predicts row i + time_steps, so the first test target (row
# split_index) belongs to window split_index - time_steps.
X_train_lstm = X_lstm[:split_index - time_steps]
y_train_lstm = y_lstm[:split_index - time_steps]

X_test_lstm = X_lstm[split_index - time_steps:]
y_test_lstm = y_lstm[split_index - time_steps:]

# Build LSTM Model
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), return_sequences=True))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(32, return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))

lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = lstm_model.fit(X_train_lstm, y_train_lstm, epochs=20, batch_size=32, validation_split=0.1, verbose=1)

# Predict
y_pred_lstm = lstm_model.predict(X_test_lstm)

# Evaluate
rmse_lstm = evaluate_model(y_test_lstm, y_pred_lstm.flatten(), 'LSTM')

# Plot predictions (wrapped in a Series because y_test_lstm is a NumPy array)
plot_predictions(pd.Series(y_test_lstm), y_pred_lstm.flatten(), 'LSTM')

# 7. Model Comparison

print("\nModel RMSE Comparison:")
if rmse_lgbm is not None:
    print(f"LightGBM Regressor RMSE: {rmse_lgbm:.4f}")
if rmse_arima is not None:
    print(f"ARIMA RMSE: {rmse_arima:.4f}")
if rmse_lstm is not None:
    print(f"LSTM RMSE: {rmse_lstm:.4f}")

# 8. Final Model Selection and Prediction

# Assuming LightGBM performed the best based on RMSE
# We can retrain LightGBM on the entire dataset and make future predictions if needed

# Position of the target inside the scaler's column order
y_col = features_to_scale.index('y')

if rmse_lgbm is not None and (rmse_arima is None or rmse_lgbm <= rmse_arima) and (rmse_lstm is None or rmse_lgbm <= rmse_lstm):
    print("\nSelecting LightGBM Regressor as the final model.")
    # Retrain on entire dataset.
    # A dedicated scaler is used so the train-fitted `scaler` stays intact for
    # the inverse transform of the test-set results in section 9.
    final_scaler = StandardScaler()
    full_data = data.copy()
    full_data[features_to_scale] = final_scaler.fit_transform(data[features_to_scale])

    X_full = full_data.drop(columns=['Date', 'y'])
    y_full = full_data['y']

    # Remember the raw labels, then sanitize column names in the full dataset
    raw_feature_columns = list(X_full.columns)
    X_full.columns = sanitize_column_names(X_full.columns)

    lgbm_final = LGBMRegressor(random_state=42)
    final_model_ready = False
    try:
        lgbm_final.fit(X_full, y_full)
        final_model_ready = True
        print("Final LightGBM training completed successfully.")
    except Exception as e:
        print(f"Error during final LightGBM training: {e}")

    if final_model_ready:
        # Future Prediction (Next Day Closing Price)
        # Features are built from the *unscaled* closes and scaled exactly once.
        close_history = data['y']

        # Next business day after the last observation (exchange holidays are not handled)
        next_date = data['Date'].iloc[-1] + pd.offsets.BDay(1)

        next_day_features = {
            # Placeholder only: the scaler expects a 'y' column, the model never sees it
            'y': close_history.iloc[-1],
            'DayOfWeek': next_date.dayofweek,
            'WeekOfYear': next_date.isocalendar()[1],
            'Month': next_date.month,
            'Quarter': next_date.quarter,
            'Year': next_date.year,
            # Rolling statistics over the most recent known closes
            'RollingMean_7': close_history.iloc[-7:].mean(),
            'RollingStd_7': close_history.iloc[-7:].std(),
            'RollingMean_14': close_history.iloc[-14:].mean(),
            'RollingStd_14': close_history.iloc[-14:].std(),
        }
        # Lag_k for the next day is the k-th most recent known close (Lag_1 = last close)
        for lag in range(1, 8):
            next_day_features[f'Lag_{lag}'] = close_history.iloc[-lag]

        last_row = pd.DataFrame([next_day_features])

        # Scale features
        last_row[features_to_scale] = final_scaler.transform(last_row[features_to_scale])

        # Prepare input features in training order, with the same sanitized names
        X_next = last_row[raw_feature_columns]
        X_next.columns = X_full.columns

        # Predict the next day's closing price
        next_day_prediction_scaled = None
        try:
            next_day_prediction_scaled = lgbm_final.predict(X_next)
            print("Next day prediction completed successfully.")
        except Exception as e:
            print(f"Error during LightGBM prediction for next day: {e}")

        if next_day_prediction_scaled is not None:
            # Inverse transform the prediction using the statistics of the 'y' column
            next_day_prediction = (
                next_day_prediction_scaled[0] * final_scaler.scale_[y_col] + final_scaler.mean_[y_col]
            )

            print(f"\nPredicted Next Day Closing Price: {next_day_prediction:.2f}")
else:
    print("\nLightGBM Regressor did not perform the best. Skipping final model selection and prediction.")

# 9. Plotting the Actual vs Predicted Prices for Test Data

if rmse_lgbm is not None:
    # Inverse transform the scaled prices with the train-fitted scaler's 'y' statistics
    y_mean = scaler.mean_[y_col]
    y_std = scaler.scale_[y_col]
    y_test_actual = y_test.values * y_std + y_mean
    y_pred_actual = y_pred_lgbm * y_std + y_mean

    plt.figure(figsize=(14, 7))
    plt.plot(test_data['Date'].reset_index(drop=True), y_test_actual, label='Actual', color='blue')
    plt.plot(test_data['Date'].reset_index(drop=True), y_pred_actual, label='Predicted', color='red', alpha=0.7)
    plt.title('Actual vs Predicted Closing Prices (LightGBM)')
    plt.xlabel('Date')
    plt.ylabel('Closing Price')
    plt.legend()
    plt.show()

# 10. Feature Importance Visualization

if rmse_lgbm is not None:
    importances = lgbm.feature_importances_
    feature_names = X_train.columns

    # Create a DataFrame for visualization
    feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

    # Plot (y-axis inverted so the most important feature is on top)
    plt.figure(figsize=(12, 8))
    plt.barh(feature_importances['Feature'], feature_importances['Importance'])
    plt.xlabel('Importance')
    plt.title('Feature Importances from LightGBM Regressor')
    plt.gca().invert_yaxis()
    plt.show()


In [None]:
# Install necessary libraries
# Note: Uncomment the following line if running in a fresh environment
# !pip install yfinance pandas numpy scikit-learn matplotlib lightgbm xgboost catboost statsmodels tensorflow

# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import tensorflow as tf
import warnings
import re

warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# 1. Data Acquisition
stock_symbol = 'SOXL'
start_date = '2010-03-11'
end_date = '2024-11-22'

# Download stock data
data = yf.download(stock_symbol, start=start_date, end=end_date)

# Check if data was downloaded successfully
if data.empty:
    raise ValueError(f"No data found for stock symbol {stock_symbol} between {start_date} and {end_date}.")

# Some yfinance releases return MultiIndex columns (field, ticker) even for a
# single ticker. Keep only the field level so that data['y'] is a plain Series
# and the feature columns end up with simple string names.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# 2. Data Preprocessing
# We'll use the 'Close' price for prediction
data = data[['Close']].copy()
data.rename(columns={'Close': 'y'}, inplace=True)

# Reset index to have 'Date' as a column
data.reset_index(inplace=True)

# Ensure data is sorted by date
data.sort_values('Date', inplace=True)

# Feature Engineering: Create additional time-based features
data['Date'] = pd.to_datetime(data['Date'])
data['DayOfWeek'] = data['Date'].dt.dayofweek
# isocalendar().week is a pandas extension integer dtype; cast to a plain NumPy
# integer so that `.values` on the feature frame stays numeric (not object).
data['WeekOfYear'] = data['Date'].dt.isocalendar().week.astype('int64')
data['Month'] = data['Date'].dt.month
data['Quarter'] = data['Date'].dt.quarter
data['Year'] = data['Date'].dt.year

# Lag features (previous day's closing price)
for lag in range(1, 8):  # Using 7 days lag
    data[f'Lag_{lag}'] = data['y'].shift(lag)

# Rolling window features.
# The windows are computed on the closes *before* the current row (shift(1)).
# A window that includes the current close would leak the target: together
# with Lag_1..Lag_6, RollingMean_7 would determine y exactly.
prior_close = data['y'].shift(1)
data['RollingMean_7'] = prior_close.rolling(window=7).mean()
data['RollingStd_7'] = prior_close.rolling(window=7).std()

data['RollingMean_14'] = prior_close.rolling(window=14).mean()
data['RollingStd_14'] = prior_close.rolling(window=14).std()

# Drop rows with NaN values resulting from lag and rolling calculations
data.dropna(inplace=True)

# 3. Train-Test Split
# We'll use the last 20% of the data as the test set
split_ratio = 0.8
split_index = int(len(data) * split_ratio)

# Explicit copies: later cells add columns to these frames, which must not be
# chained assignments on a slice of `data`.
train_data = data[:split_index].copy()
test_data = data[split_index:].copy()

print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

# 4. Feature Scaling
# Features to scale ('y' is deliberately first; it is the scaled target)
features_to_scale = ['y'] + [f'Lag_{lag}' for lag in range(1, 8)] + \
                    ['RollingMean_7', 'RollingStd_7', 'RollingMean_14', 'RollingStd_14']

# The scaler is fitted on the training rows only and merely applied to the test rows.
scaler = StandardScaler()
train_data_scaled = train_data.copy()
test_data_scaled = test_data.copy()

train_data_scaled[features_to_scale] = scaler.fit_transform(train_data[features_to_scale])
test_data_scaled[features_to_scale] = scaler.transform(test_data[features_to_scale])

# 5. Prepare Training and Testing Data
X_train = train_data_scaled.drop(columns=['Date', 'y'])
y_train = train_data_scaled['y']

X_test = test_data_scaled.drop(columns=['Date', 'y'])
y_test = test_data_scaled['y']

# 6. Sanitize Feature Names to Remove Special Characters
def sanitize_column_names(columns):
    """Return the column labels as strings safe for use as feature names.

    Each label is converted with ``str()`` and every character other than an
    ASCII letter, digit or underscore is replaced by an underscore.
    """
    disallowed = re.compile(r'[^A-Za-z0-9_]')
    cleaned = []
    for label in columns:
        cleaned.append(disallowed.sub('_', str(label)))
    return cleaned

# Give both feature matrices sanitized, string-only column names
for _feature_frame in (X_train, X_test):
    _feature_frame.columns = sanitize_column_names(_feature_frame.columns)

# 7. Model Training and Evaluation

# Function to evaluate model performance
def evaluate_model(true, predicted, model_name):
    """Print MSE, RMSE and MAE for a set of predictions and return the RMSE.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions, same length as ``true``.
        model_name: Label used in the printed report header.

    Returns:
        The root mean squared error.
    """
    squared_error = mean_squared_error(true, predicted)
    absolute_error = mean_absolute_error(true, predicted)
    root_squared_error = np.sqrt(squared_error)

    report_lines = (
        f"\n{model_name} Performance:",
        f"Mean Squared Error (MSE): {squared_error:.4f}",
        f"Root Mean Squared Error (RMSE): {root_squared_error:.4f}",
        f"Mean Absolute Error (MAE): {absolute_error:.4f}",
    )
    for report_line in report_lines:
        print(report_line)

    return root_squared_error

# Function to plot predictions
def plot_predictions(true, predicted, model_name):
    """Plot actual against predicted values on a shared sample-index axis.

    Args:
        true: Actual values; a pandas Series or any array-like (the LSTM
            section passes a NumPy array, which has no ``.values``).
        predicted: Predicted values, array-like.
        model_name: Model label shown in the figure title.
    """
    # Flat NumPy arrays work for Series and ndarray inputs alike.
    actual_values = np.asarray(true).ravel()
    predicted_values = np.asarray(predicted).ravel()

    plt.figure(figsize=(14, 7))
    plt.plot(range(len(actual_values)), actual_values, label='Actual', color='blue')
    plt.plot(range(len(predicted_values)), predicted_values, label='Predicted', color='red', alpha=0.7)
    plt.title(f'Actual vs Predicted Closing Prices ({model_name})')
    plt.xlabel('Sample Index')
    plt.ylabel('Closing Price (Scaled)')
    plt.legend()
    plt.show()

### **Model 1: LightGBM Regressor**

# Initialize the model
lgbm = LGBMRegressor(random_state=42)

# Success is tracked with explicit flags instead of probing locals(): in a
# notebook, a y_pred_lgbm left over from an earlier run would otherwise be
# evaluated as if it came from this model.
lgbm_trained = False
lgbm_predicted = False

# Train the model
try:
    lgbm.fit(X_train, y_train)
    lgbm_trained = True
    print("LightGBM training completed successfully.")
except Exception as e:
    print(f"Error during LightGBM training: {e}")

# Predict (only attempted when training succeeded)
if lgbm_trained:
    try:
        y_pred_lgbm = lgbm.predict(X_test)
        lgbm_predicted = True
        print("LightGBM prediction completed successfully.")
    except Exception as e:
        print(f"Error during LightGBM prediction: {e}")

# Evaluate (targets are standardized, so this RMSE is in standardized units)
if lgbm_predicted:
    rmse_lgbm = evaluate_model(y_test, y_pred_lgbm, 'LightGBM Regressor')

    # Plot predictions
    plot_predictions(y_test, y_pred_lgbm, 'LightGBM Regressor')
else:
    rmse_lgbm = None
    print("Skipping evaluation and plotting for LightGBM due to previous errors.")

### **Model 2: ARIMA**

# For ARIMA, we'll use the original data without scaling
# Ensure stationarity
result = adfuller(train_data['y'])
print('\nAugmented Dickey-Fuller Test:')
print('ADF Statistic:', result[0])
print('p-value:', result[1])

# Keep the (possibly differenced) series as a column for inspection.
# assign() returns a new frame, so no chained assignment on a slice of `data`.
# The ARIMA fit below does its own differencing through d=1 in `arima_order`.
if result[1] > 0.05:
    # Differencing to make the series stationary (first row is NaN)
    train_data = train_data.assign(y_diff=train_data['y'].diff())
    print("Applied differencing to make the series stationary.")
else:
    train_data = train_data.assign(y_diff=train_data['y'])
    print("Series is stationary. No differencing applied.")

# Fit ARIMA model
arima_order = (5, 1, 0)  # This can be adjusted or determined using AIC/BIC criteria
arima_model = ARIMA(train_data['y'], order=arima_order)
arima_fitted = False
try:
    arima_model_fit = arima_model.fit()
    arima_fitted = True
    print("ARIMA model fitted successfully.")
except Exception as e:
    print(f"Error during ARIMA model fitting: {e}")

# Forecast the whole test horizon in one go (only when the fit succeeded, so a
# fitted model left over from an earlier run is never used by accident)
forecast_steps = len(test_data)
arima_forecasted = False
if arima_fitted:
    try:
        arima_forecast = arima_model_fit.forecast(steps=forecast_steps)
        arima_forecasted = True
        print("ARIMA forecasting completed successfully.")
    except Exception as e:
        print(f"Error during ARIMA forecasting: {e}")

# Evaluate
rmse_arima = None
if arima_forecasted:
    try:
        rmse_arima_price = evaluate_model(test_data['y'], arima_forecast, 'ARIMA')
        # Express the RMSE in the same standardized units as the other models:
        # dividing by the training standard deviation of 'y' is exactly what
        # scaling both series with `scaler` would do to the RMSE.
        rmse_arima = rmse_arima_price / scaler.scale_[features_to_scale.index('y')]
        print(f"ARIMA RMSE in standardized units: {rmse_arima:.4f}")
    except Exception as e:
        print(f"Error during ARIMA evaluation: {e}")
        rmse_arima = None

# Plot predictions
if rmse_arima is not None:
    plt.figure(figsize=(14, 7))
    plt.plot(test_data['Date'].reset_index(drop=True), test_data['y'].reset_index(drop=True), label='Actual', color='blue')
    plt.plot(test_data['Date'].reset_index(drop=True), arima_forecast.values, label='Predicted', color='red', alpha=0.7)
    plt.title('Actual vs Predicted Closing Prices (ARIMA)')
    plt.xlabel('Date')
    plt.ylabel('Closing Price')
    plt.legend()
    plt.show()
else:
    print("Skipping ARIMA plotting due to previous errors.")

### **Model 3: LSTM Neural Network**

# Prepare data for LSTM
def create_lstm_dataset(X, y, time_steps=1):
    """
    Turn row-aligned features and targets into overlapping LSTM windows.

    Window k holds rows k .. k + time_steps - 1 of X and is paired with the
    value of y at row k + time_steps (the row right after the window).

    Parameters:
        X: pandas DataFrame of features (sliced with .iloc).
        y: pandas Series of targets (indexed with .iloc).
        time_steps: number of consecutive rows per window.

    Returns:
        (windows, targets) as float32 NumPy arrays with
        len(X) - time_steps entries each.
    """
    window_starts = range(len(X) - time_steps)
    windows = [X.iloc[start:start + time_steps].values for start in window_starts]
    targets = [y.iloc[start + time_steps] for start in window_starts]
    return np.array(windows, dtype=np.float32), np.array(targets, dtype=np.float32)

time_steps = 10

# Stitch the train and test frames back together so the first test windows can
# draw their history from the end of the training period
combined_data = pd.concat([train_data_scaled, test_data_scaled], axis=0)
X_lstm, y_lstm = create_lstm_dataset(
    combined_data.drop(columns=['Date', 'y']),
    combined_data['y'],
    time_steps,
)

# Split back into train and test. Sample i is paired with the target at row
# i + time_steps, so the first test target (row split_index) is sample
# split_index - time_steps.
X_train_lstm = X_lstm[:split_index - time_steps]
y_train_lstm = y_lstm[:split_index - time_steps]

X_test_lstm = X_lstm[split_index - time_steps:]
y_test_lstm = y_lstm[split_index - time_steps:]

# Build LSTM Model: two stacked LSTM layers with dropout and a single-unit output
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), return_sequences=True))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(32, return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))

lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Explicit state instead of probing locals(): in a notebook, a prediction array
# left over from an earlier run would otherwise be evaluated as if it were new.
lstm_is_trained = False
y_pred_lstm = None

# Train the model
try:
    history = lstm_model.fit(X_train_lstm, y_train_lstm, epochs=20, batch_size=32, validation_split=0.1, verbose=1)
    lstm_is_trained = True
    print("LSTM training completed successfully.")
except Exception as e:
    print(f"Error during LSTM training: {e}")

# Predict only with a trained network; a model whose training failed would
# still return numbers, and they would be scored as if they were real.
if lstm_is_trained:
    try:
        y_pred_lstm = lstm_model.predict(X_test_lstm)
        print("LSTM prediction completed successfully.")
    except Exception as e:
        print(f"Error during LSTM prediction: {e}")

# Evaluate
if y_pred_lstm is not None:
    rmse_lstm = evaluate_model(y_test_lstm, y_pred_lstm.flatten(), 'LSTM')

    # Plot predictions
    plot_predictions(y_test_lstm, y_pred_lstm.flatten(), 'LSTM')
else:
    rmse_lstm = None
    print("Skipping evaluation and plotting for LSTM due to previous errors.")

# 7. Model Comparison

# Collect one line per model that produced a score, then print the report
comparison_lines = ["\nModel RMSE Comparison:"]
if rmse_lgbm is not None:
    comparison_lines.append(f"LightGBM Regressor RMSE: {rmse_lgbm:.4f}")
if rmse_arima is not None:
    comparison_lines.append(f"ARIMA RMSE: {rmse_arima:.4f}")
if rmse_lstm is not None:
    comparison_lines.append(f"LSTM RMSE: {rmse_lstm:.4f}")
for comparison_line in comparison_lines:
    print(comparison_line)

# 8. Final Model Selection and Prediction

# LightGBM becomes the final model only when its RMSE is the lowest among the
# models that produced a score
lgbm_is_best = (
    rmse_lgbm is not None
    and (rmse_arima is None or rmse_lgbm <= rmse_arima)
    and (rmse_lstm is None or rmse_lgbm <= rmse_lstm)
)

if lgbm_is_best:
    print("\nSelecting LightGBM Regressor as the final model.")

    # Retrain on entire dataset. A dedicated scaler is fitted for this, so the
    # shared `scaler` is left untouched for the test-set inverse transform that
    # follows this section.
    final_scaler = StandardScaler()
    full_data = data.copy()
    full_data[features_to_scale] = final_scaler.fit_transform(data[features_to_scale])

    X_full = full_data.drop(columns=['Date', 'y'])
    y_full = full_data['y']

    # Sanitize column names in the full dataset
    X_full.columns = sanitize_column_names(X_full.columns)

    lgbm_final = LGBMRegressor(random_state=42)
    final_model_ready = False
    try:
        lgbm_final.fit(X_full, y_full)
        final_model_ready = True
        print("Final LightGBM training completed successfully.")
    except Exception as e:
        print(f"Error during final LightGBM training: {e}")

    # Future Prediction (Next Day Closing Price)
    next_day_prediction_scaled = None
    next_day_prediction = None
    if final_model_ready:
        try:
            # Start from the last UNSCALED row; it is scaled exactly once below
            last_row = data.iloc[-1:].copy()

            # Move to the next business day (weekends are skipped; exchange
            # holidays are not taken into account)
            next_date = last_row['Date'] + pd.offsets.BDay(1)
            last_row['Date'] = next_date

            # Update time-based features
            last_row['DayOfWeek'] = last_row['Date'].dt.dayofweek
            last_row['WeekOfYear'] = last_row['Date'].dt.isocalendar().week
            last_row['Month'] = last_row['Date'].dt.month
            last_row['Quarter'] = last_row['Date'].dt.quarter
            last_row['Year'] = last_row['Date'].dt.year

            # Closing prices in original units as a flat, positionally indexed series
            close_history = pd.Series(np.asarray(data['y'], dtype=float).reshape(-1))

            # For the day after the last observation, Lag_k is the k-th most
            # recent close (Lag_1 is the last close itself)
            for lag in range(1, 8):
                last_row[f'Lag_{lag}'] = close_history.iloc[-lag]

            # Rolling statistics over the most recent closes; Series.std uses
            # ddof=1, the same default as rolling().std()
            last_row['RollingMean_7'] = close_history.tail(7).mean()
            last_row['RollingStd_7'] = close_history.tail(7).std()
            last_row['RollingMean_14'] = close_history.tail(14).mean()
            last_row['RollingStd_14'] = close_history.tail(14).std()

            # Scale features with the scaler the final model was trained with
            last_row[features_to_scale] = final_scaler.transform(last_row[features_to_scale])

            # Prepare input features with the same names and order as X_full
            X_next = last_row.drop(columns=['Date', 'y'])
            X_next.columns = sanitize_column_names(X_next.columns)
            X_next = X_next[list(X_full.columns)]

            # Predict the next day's closing price (standardised units)
            next_day_prediction_scaled = lgbm_final.predict(X_next)
            print("Next day prediction completed successfully.")

            # Undo the standardisation with the statistics of the 'y' column,
            # located by its position among the scaled columns
            y_position = list(features_to_scale).index('y')
            next_day_prediction = (
                next_day_prediction_scaled[0] * final_scaler.scale_[y_position]
                + final_scaler.mean_[y_position]
            )
        except Exception as e:
            print(f"Error during LightGBM prediction for next day: {e}")

    if next_day_prediction is not None:
        print(f"\nPredicted Next Day Closing Price: {next_day_prediction:.2f}")
else:
    print("\nLightGBM Regressor did not perform the best. Skipping final model selection and prediction.")

# 9. Plotting the Actual vs Predicted Prices for Test Data

if rmse_lgbm is not None:
    # Convert the standardised values back to prices. A fitted StandardScaler
    # keeps one mean/scale per column, so the entry that belongs to 'y' is
    # looked up by its position among the scaled columns rather than assumed
    # to be the last one.
    # NOTE(review): assumes `scaler` still holds the fit that produced y_test.
    y_position = list(features_to_scale).index('y')
    y_mean = scaler.mean_[y_position]
    y_scale = scaler.scale_[y_position]
    y_test_actual = np.asarray(y_test, dtype=float).reshape(-1) * y_scale + y_mean
    y_pred_actual = np.asarray(y_pred_lgbm, dtype=float).reshape(-1) * y_scale + y_mean

    plt.figure(figsize=(14, 7))
    plt.plot(test_data['Date'].reset_index(drop=True), y_test_actual, label='Actual', color='blue')
    plt.plot(test_data['Date'].reset_index(drop=True), y_pred_actual, label='Predicted', color='red', alpha=0.7)
    plt.title('Actual vs Predicted Closing Prices (LightGBM)')
    plt.xlabel('Date')
    plt.ylabel('Closing Price')
    plt.legend()
    plt.show()
else:
    print("Skipping LightGBM price plot because no LightGBM predictions are available.")

# 10. Feature Importance Visualization

if rmse_lgbm is not None:
    # Pair every training column with the importance LightGBM assigned to it,
    # ordered from most to least important
    importances = lgbm.feature_importances_
    feature_names = X_train.columns
    feature_importances = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)

    # Horizontal bars; the y-axis is flipped so the first row is drawn on top
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(feature_importances['Feature'], feature_importances['Importance'])
    ax.set_xlabel('Importance')
    ax.set_title('Feature Importances from LightGBM Regressor')
    ax.invert_yaxis()
    plt.show()


In [None]:
# Install necessary libraries
# Note: Uncomment the following line if running in a fresh environment
# !pip install yfinance pandas numpy scikit-learn matplotlib lightgbm xgboost catboost statsmodels tensorflow

# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import tensorflow as tf
import warnings
import re

# Mute library warnings for the rest of the notebook
warnings.filterwarnings('ignore')

# Fix the TensorFlow and NumPy seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

# 1. Data Acquisition
# Ticker and date range handed to yfinance below
stock_symbol, start_date, end_date = 'SOXL', '2010-03-11', '2024-11-22'

# Fetch the price history
data = yf.download(stock_symbol, start=start_date, end=end_date)

# Stop immediately if the download came back empty
if data.empty:
    raise ValueError(f"No data found for stock symbol {stock_symbol} between {start_date} and {end_date}.")

# 2. Data Preprocessing
# We'll use the 'Close' price for prediction
data = data[['Close']].copy()

# yfinance may return two-level (field, ticker) column labels even for a single
# ticker. Collapse them to the field level so that data['y'] is a Series and
# the scalar assignments made later in the notebook behave as intended.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

data.rename(columns={'Close': 'y'}, inplace=True)

# Reset index to have 'Date' as a column
data.reset_index(inplace=True)

# Ensure data is sorted by date
data.sort_values('Date', inplace=True)

# Feature Engineering: calendar features derived from the date
data['Date'] = pd.to_datetime(data['Date'])
data['DayOfWeek'] = data['Date'].dt.dayofweek
data['WeekOfYear'] = data['Date'].dt.isocalendar().week
data['Month'] = data['Date'].dt.month
data['Quarter'] = data['Date'].dt.quarter
data['Year'] = data['Date'].dt.year

# Lag features: Lag_k is the closing price k rows earlier
for lag in range(1, 8):  # Using 7 days lag
    data[f'Lag_{lag}'] = data['y'].shift(lag)

# Rolling window features, computed on the series shifted by one row so that
# every window ends at the PREVIOUS close. Rolling over data['y'] directly
# would put the current row's target into its own features: together with
# Lag_1..Lag_6 the 7-row mean would reveal y exactly.
previous_close = data['y'].shift(1)
data['RollingMean_7'] = previous_close.rolling(window=7).mean()
data['RollingStd_7'] = previous_close.rolling(window=7).std()

data['RollingMean_14'] = previous_close.rolling(window=14).mean()
data['RollingStd_14'] = previous_close.rolling(window=14).std()

# Drop rows with NaN values resulting from lag and rolling calculations
data.dropna(inplace=True)

# 3. Train-Test Split
# Chronological split: the first 80% of the rows train, the rest is the test set
split_ratio = 0.8
split_index = int(len(data) * split_ratio)

train_data, test_data = data[:split_index], data[split_index:]

print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

# 4. Feature Scaling
# Columns to standardise: the target, its lags and the rolling statistics
lag_columns = [f'Lag_{lag}' for lag in range(1, 8)]
rolling_columns = ['RollingMean_7', 'RollingStd_7', 'RollingMean_14', 'RollingStd_14']
features_to_scale = ['y', *lag_columns, *rolling_columns]

# Fit on the training rows only, then apply the same transform to both splits
scaler = StandardScaler().fit(train_data[features_to_scale])
train_data_scaled = train_data.copy()
test_data_scaled = test_data.copy()
for scaled_frame, raw_frame in ((train_data_scaled, train_data), (test_data_scaled, test_data)):
    scaled_frame[features_to_scale] = scaler.transform(raw_frame[features_to_scale])

# 5. Prepare Training and Testing Data
# Everything except the date and the target is a model input
X_train, y_train = train_data_scaled.drop(columns=['Date', 'y']), train_data_scaled['y']
X_test, y_test = test_data_scaled.drop(columns=['Date', 'y']), test_data_scaled['y']

# 6. Sanitize Feature Names to Remove Special Characters
def sanitize_column_names(columns):
    """
    Return the column labels as strings that contain only letters, digits and
    underscores.

    Each label is converted with str() first, then every other character is
    replaced by an underscore. The result is a list in the input order.
    """
    cleaned = []
    for col in columns:
        cleaned.append(re.sub(r'[^A-Za-z0-9_]', '_', str(col)))
    return cleaned

# Give both design matrices sanitized column names
for _feature_frame in (X_train, X_test):
    _feature_frame.columns = sanitize_column_names(_feature_frame.columns)

# 7. Model Training and Evaluation

# Function to evaluate model performance
def evaluate_model(true, predicted, model_name):
    """
    Print MSE, RMSE and MAE for a set of predictions and return the RMSE.

    Parameters:
        true: observed values.
        predicted: predicted values, aligned with `true`.
        model_name: label used in the printed report.

    Returns:
        The root mean squared error.
    """
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true, predicted)
    report = (
        f"\n{model_name} Performance:",
        f"Mean Squared Error (MSE): {mse:.4f}",
        f"Root Mean Squared Error (RMSE): {rmse:.4f}",
        f"Mean Absolute Error (MAE): {mae:.4f}",
    )
    for report_line in report:
        print(report_line)
    return rmse

# Function to plot predictions
def plot_predictions(true, predicted, model_name):
    """
    Draw actual and predicted values against their sample index.

    Parameters:
        true: observed values (pandas Series/DataFrame, NumPy array or sequence).
        predicted: predicted values, accepted in the same forms.
        model_name: label inserted into the figure title.
    """
    def _to_array(values):
        # pandas containers expose their underlying array; ndarrays pass
        # through; anything else is converted with np.array
        if isinstance(values, (pd.Series, pd.DataFrame)):
            return values.values
        if isinstance(values, np.ndarray):
            return values
        return np.array(values)

    plt.figure(figsize=(14, 7))
    curves = (
        (_to_array(true), {'label': 'Actual', 'color': 'blue'}),
        (_to_array(predicted), {'label': 'Predicted', 'color': 'red', 'alpha': 0.7}),
    )
    for curve_values, curve_style in curves:
        plt.plot(range(len(curve_values)), curve_values, **curve_style)
    plt.title(f'Actual vs Predicted Closing Prices ({model_name})')
    plt.xlabel('Sample Index')
    plt.ylabel('Closing Price (Scaled)')
    plt.legend()
    plt.show()

### **Model 1: LightGBM Regressor**

# Initialize the model
lgbm = LGBMRegressor(random_state=42)

# Explicit "no predictions yet" marker instead of probing locals(): in a
# notebook, an array left over from an earlier run would otherwise be
# evaluated as if it came from this run.
y_pred_lgbm = None

# Train the model
try:
    lgbm.fit(X_train, y_train)
    print("LightGBM training completed successfully.")
except Exception as e:
    print(f"Error during LightGBM training: {e}")

# Predict
try:
    y_pred_lgbm = lgbm.predict(X_test)
    print("LightGBM prediction completed successfully.")
except Exception as e:
    print(f"Error during LightGBM prediction: {e}")

# Evaluate
if y_pred_lgbm is not None:
    rmse_lgbm = evaluate_model(y_test, y_pred_lgbm, 'LightGBM Regressor')

    # Plot predictions
    plot_predictions(y_test, y_pred_lgbm, 'LightGBM Regressor')
else:
    rmse_lgbm = None
    print("Skipping evaluation and plotting for LightGBM due to previous errors.")

### **Model 2: ARIMA**

# For ARIMA, we'll use the original data without scaling
# Stationarity check: the ADF p-value decides whether ARIMA differences the series
result = adfuller(train_data['y'])
print('\nAugmented Dickey-Fuller Test:')
print('ADF Statistic:', result[0])
print('p-value:', result[1])

needs_differencing = result[1] > 0.05
if needs_differencing:
    # Keep the differenced series alongside the raw one. assign() builds a new
    # frame instead of writing a column into a slice of `data`.
    train_data = train_data.assign(y_diff=train_data['y'].diff())
    print("Applied differencing to make the series stationary.")
else:
    train_data = train_data.assign(y_diff=train_data['y'])
    print("Series is stationary. No differencing applied.")

# Fit ARIMA model
# The differencing order d follows the ADF outcome so the messages printed above
# describe what the model really does; p and q can be tuned using AIC/BIC.
arima_order = (5, 1 if needs_differencing else 0, 0)
arima_model = ARIMA(train_data['y'], order=arima_order)

# Reset every result explicitly: a failed step must not silently reuse objects
# left in the kernel by an earlier run of this cell.
arima_model_fit = None
arima_forecast = None
rmse_arima = None

try:
    arima_model_fit = arima_model.fit()
    print("ARIMA model fitted successfully.")
except Exception as e:
    print(f"Error during ARIMA model fitting: {e}")

# Forecast as many steps as there are test rows
forecast_steps = len(test_data)
if arima_model_fit is not None:
    try:
        arima_forecast = arima_model_fit.forecast(steps=forecast_steps)
        print("ARIMA forecasting completed successfully.")
    except Exception as e:
        print(f"Error during ARIMA forecasting: {e}")

# Evaluate
if arima_forecast is not None:
    try:
        # evaluate_model reports ARIMA's errors in price units
        rmse_arima_price = evaluate_model(test_data['y'], arima_forecast, 'ARIMA')
        # LightGBM and LSTM are scored on the target standardised by the scaler
        # fitted on the training split. Dividing by the population std of the
        # training target (what StandardScaler divides by) expresses ARIMA's
        # RMSE in the same units, so the model comparison is meaningful.
        y_train_std = float(np.asarray(train_data['y'], dtype=float).std()) or 1.0
        rmse_arima = rmse_arima_price / y_train_std
        print(f"ARIMA RMSE on the standardized scale: {rmse_arima:.4f}")
    except Exception as e:
        print(f"Error during ARIMA evaluation: {e}")
        rmse_arima = None

# Plot predictions (in price units)
if rmse_arima is not None:
    plt.figure(figsize=(14, 7))
    plt.plot(test_data['Date'].reset_index(drop=True), test_data['y'].reset_index(drop=True), label='Actual', color='blue')
    plt.plot(test_data['Date'].reset_index(drop=True), arima_forecast.values, label='Predicted', color='red', alpha=0.7)
    plt.title('Actual vs Predicted Closing Prices (ARIMA)')
    plt.xlabel('Date')
    plt.ylabel('Closing Price')
    plt.legend()
    plt.show()
else:
    print("Skipping ARIMA plotting due to previous errors.")

### **Model 3: LSTM Neural Network**

# Prepare data for LSTM
def create_lstm_dataset(X, y, time_steps=1):
    """
    Turn row-aligned features and targets into overlapping LSTM windows.

    Window k holds rows k .. k + time_steps - 1 of X and is paired with the
    value of y at row k + time_steps (the row right after the window).

    Parameters:
        X: pandas DataFrame of features (sliced with .iloc).
        y: pandas Series of targets (indexed with .iloc).
        time_steps: number of consecutive rows per window.

    Returns:
        (windows, targets) as float32 NumPy arrays with
        len(X) - time_steps entries each.
    """
    window_starts = range(len(X) - time_steps)
    windows = [X.iloc[start:start + time_steps].values for start in window_starts]
    targets = [y.iloc[start + time_steps] for start in window_starts]
    return np.array(windows, dtype=np.float32), np.array(targets, dtype=np.float32)

time_steps = 10

# Stitch the already-scaled train and test frames back together so the first
# test windows can draw their history from the end of the training period
combined_data = pd.concat([train_data_scaled, test_data_scaled], axis=0)
X_lstm, y_lstm = create_lstm_dataset(
    combined_data.drop(columns=['Date', 'y']),
    combined_data['y'],
    time_steps,
)

# Split back into train and test. Sample i is paired with the target at row
# i + time_steps, so the first test target (row split_index) is sample
# split_index - time_steps.
X_train_lstm = X_lstm[:split_index - time_steps]
y_train_lstm = y_lstm[:split_index - time_steps]

X_test_lstm = X_lstm[split_index - time_steps:]
y_test_lstm = y_lstm[split_index - time_steps:]

# Build LSTM Model: two stacked LSTM layers with dropout and a single-unit output
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), return_sequences=True))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(32, return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))

lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Explicit state instead of probing locals(): in a notebook, a prediction array
# left over from an earlier run would otherwise be evaluated as if it were new.
lstm_is_trained = False
y_pred_lstm = None

# Train the model
try:
    history = lstm_model.fit(X_train_lstm, y_train_lstm, epochs=20, batch_size=32, validation_split=0.1, verbose=1)
    lstm_is_trained = True
    print("LSTM training completed successfully.")
except Exception as e:
    print(f"Error during LSTM training: {e}")

# Predict only with a trained network; a model whose training failed would
# still return numbers, and they would be scored as if they were real.
if lstm_is_trained:
    try:
        y_pred_lstm = lstm_model.predict(X_test_lstm)
        print("LSTM prediction completed successfully.")
    except Exception as e:
        print(f"Error during LSTM prediction: {e}")

# Evaluate
if y_pred_lstm is not None:
    rmse_lstm = evaluate_model(y_test_lstm, y_pred_lstm.flatten(), 'LSTM')

    # Plot predictions
    plot_predictions(y_test_lstm, y_pred_lstm.flatten(), 'LSTM')
else:
    rmse_lstm = None
    print("Skipping evaluation and plotting for LSTM due to previous errors.")

# 8. Model Comparison

# Collect one line per model that produced a score, then print the report
comparison_lines = ["\nModel RMSE Comparison:"]
if rmse_lgbm is not None:
    comparison_lines.append(f"LightGBM Regressor RMSE: {rmse_lgbm:.4f}")
if rmse_arima is not None:
    comparison_lines.append(f"ARIMA RMSE: {rmse_arima:.4f}")
if rmse_lstm is not None:
    comparison_lines.append(f"LSTM RMSE: {rmse_lstm:.4f}")
for comparison_line in comparison_lines:
    print(comparison_line)

# 9. Final Model Selection and Prediction

# LightGBM becomes the final model only when its RMSE is the lowest among the
# models that produced a score
lgbm_is_best = (
    rmse_lgbm is not None
    and (rmse_arima is None or rmse_lgbm <= rmse_arima)
    and (rmse_lstm is None or rmse_lgbm <= rmse_lstm)
)

if lgbm_is_best:
    print("\nSelecting LightGBM Regressor as the final model.")

    # Retrain on entire dataset. A dedicated scaler is fitted for this, so
    # `scaler` (fitted on the training split) stays valid for the test-set
    # inverse transform that follows this section.
    final_scaler = StandardScaler()
    full_data = data.copy()
    full_data[features_to_scale] = final_scaler.fit_transform(data[features_to_scale])

    X_full = full_data.drop(columns=['Date', 'y'])
    y_full = full_data['y']

    # Sanitize column names in the full dataset
    X_full.columns = sanitize_column_names(X_full.columns)

    lgbm_final = LGBMRegressor(random_state=42)
    final_model_ready = False
    try:
        lgbm_final.fit(X_full, y_full)
        final_model_ready = True
        print("Final LightGBM training completed successfully.")
    except Exception as e:
        print(f"Error during final LightGBM training: {e}")

    # Future Prediction (Next Day Closing Price)
    next_day_prediction_scaled = None
    next_day_prediction = None
    if final_model_ready:
        try:
            # Start from the last UNSCALED row; it is scaled exactly once below
            last_row = data.iloc[-1:].copy()

            # Move to the next business day (weekends are skipped; exchange
            # holidays are not taken into account)
            next_date = last_row['Date'] + pd.offsets.BDay(1)
            last_row['Date'] = next_date

            # Update time-based features
            last_row['DayOfWeek'] = last_row['Date'].dt.dayofweek
            last_row['WeekOfYear'] = last_row['Date'].dt.isocalendar().week
            last_row['Month'] = last_row['Date'].dt.month
            last_row['Quarter'] = last_row['Date'].dt.quarter
            last_row['Year'] = last_row['Date'].dt.year

            # Closing prices in original units as a flat, positionally indexed series
            close_history = pd.Series(np.asarray(data['y'], dtype=float).reshape(-1))

            # For the day after the last observation, Lag_k is the k-th most
            # recent close (Lag_1 is the last close itself)
            for lag in range(1, 8):
                last_row[f'Lag_{lag}'] = close_history.iloc[-lag]

            # Rolling statistics over the most recent closes; Series.std uses
            # ddof=1, the same default as rolling().std()
            last_row['RollingMean_7'] = close_history.tail(7).mean()
            last_row['RollingStd_7'] = close_history.tail(7).std()
            last_row['RollingMean_14'] = close_history.tail(14).mean()
            last_row['RollingStd_14'] = close_history.tail(14).std()

            # Scale features with the scaler the final model was trained with
            last_row[features_to_scale] = final_scaler.transform(last_row[features_to_scale])

            # Prepare input features with the same names and order as X_full
            X_next = last_row.drop(columns=['Date', 'y'])
            X_next.columns = sanitize_column_names(X_next.columns)
            X_next = X_next[list(X_full.columns)]

            # Predict the next day's closing price (standardised units)
            next_day_prediction_scaled = lgbm_final.predict(X_next)
            print("Next day prediction completed successfully.")

            # Undo the standardisation with the statistics of the 'y' column,
            # which is the first entry of features_to_scale, not the last
            y_position = features_to_scale.index('y')
            next_day_prediction = (
                next_day_prediction_scaled[0] * final_scaler.scale_[y_position]
                + final_scaler.mean_[y_position]
            )
        except Exception as e:
            print(f"Error during LightGBM prediction for next day: {e}")

    if next_day_prediction is not None:
        print(f"\nPredicted Next Day Closing Price: {next_day_prediction:.2f}")
else:
    print("\nLightGBM Regressor did not perform the best. Skipping final model selection and prediction.")

# 10. Plotting the Actual vs Predicted Prices for Test Data

if rmse_lgbm is not None:
    # Convert the standardised values back to prices. A fitted StandardScaler
    # keeps one mean/scale per column in fitting order, and 'y' is the FIRST
    # entry of features_to_scale, so its statistics are looked up by position
    # instead of taking the last column (which belongs to RollingStd_14).
    # `scaler` must still hold the training-split fit that produced y_test.
    y_position = features_to_scale.index('y')
    y_mean = scaler.mean_[y_position]
    y_scale = scaler.scale_[y_position]
    y_test_actual = np.asarray(y_test, dtype=float).reshape(-1) * y_scale + y_mean
    y_pred_actual = np.asarray(y_pred_lgbm, dtype=float).reshape(-1) * y_scale + y_mean

    plt.figure(figsize=(14, 7))
    plt.plot(test_data['Date'].reset_index(drop=True), y_test_actual, label='Actual', color='blue')
    plt.plot(test_data['Date'].reset_index(drop=True), y_pred_actual, label='Predicted', color='red', alpha=0.7)
    plt.title('Actual vs Predicted Closing Prices (LightGBM)')
    plt.xlabel('Date')
    plt.ylabel('Closing Price')
    plt.legend()
    plt.show()
else:
    print("Skipping LightGBM price plot because no LightGBM predictions are available.")

# 11. Feature Importance Visualization

if rmse_lgbm is not None:
    # Pair every training column with the importance LightGBM assigned to it,
    # ordered from most to least important
    importances = lgbm.feature_importances_
    feature_names = X_train.columns
    feature_importances = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)

    # Horizontal bars; the y-axis is flipped so the first row is drawn on top
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(feature_importances['Feature'], feature_importances['Importance'])
    ax.set_xlabel('Importance')
    ax.set_title('Feature Importances from LightGBM Regressor')
    ax.invert_yaxis()
    plt.show()


In [None]:
# Install necessary libraries
# Note: Uncomment the following line if running in a fresh environment
# !pip install yfinance pandas numpy scikit-learn matplotlib lightgbm xgboost catboost statsmodels tensorflow

# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import tensorflow as tf
import warnings
import re

# Mute library warnings for the rest of the notebook
warnings.filterwarnings('ignore')

# Fix the TensorFlow and NumPy seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

# 1. Data Acquisition
# Ticker and date range handed to yfinance below
stock_symbol, start_date, end_date = 'SOXL', '2010-03-11', '2024-11-22'

# Fetch the price history
data = yf.download(stock_symbol, start=start_date, end=end_date)

# Stop immediately if the download came back empty
if data.empty:
    raise ValueError(f"No data found for stock symbol {stock_symbol} between {start_date} and {end_date}.")

# 2. Data Preprocessing
# We'll use the 'Close' price for prediction
data = data[['Close']].copy()

# yfinance may return two-level (field, ticker) column labels even for a single
# ticker. Collapse them to the field level so that data['y'] is a Series and
# the scalar assignments made later in the notebook behave as intended.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

data.rename(columns={'Close': 'y'}, inplace=True)

# Reset index to have 'Date' as a column
data.reset_index(inplace=True)

# Ensure data is sorted by date
data.sort_values('Date', inplace=True)

# Feature Engineering: calendar features derived from the date
data['Date'] = pd.to_datetime(data['Date'])
data['DayOfWeek'] = data['Date'].dt.dayofweek
data['WeekOfYear'] = data['Date'].dt.isocalendar().week
data['Month'] = data['Date'].dt.month
data['Quarter'] = data['Date'].dt.quarter
data['Year'] = data['Date'].dt.year

# Lag features: Lag_k is the closing price k rows earlier
for lag in range(1, 8):  # Using 7 days lag
    data[f'Lag_{lag}'] = data['y'].shift(lag)

# Rolling window features, computed on the series shifted by one row so that
# every window ends at the PREVIOUS close. Rolling over data['y'] directly
# would put the current row's target into its own features: together with
# Lag_1..Lag_6 the 7-row mean would reveal y exactly.
previous_close = data['y'].shift(1)
data['RollingMean_7'] = previous_close.rolling(window=7).mean()
data['RollingStd_7'] = previous_close.rolling(window=7).std()

data['RollingMean_14'] = previous_close.rolling(window=14).mean()
data['RollingStd_14'] = previous_close.rolling(window=14).std()

# Drop rows with NaN values resulting from lag and rolling calculations
data.dropna(inplace=True)

# 3. Train-Test Split
# Chronological split: the first 80% of the rows train, the rest is the test set
split_ratio = 0.8
split_index = int(len(data) * split_ratio)

train_data, test_data = data[:split_index], data[split_index:]

print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

# 4. Feature Scaling
# Two column lists: the model inputs alone, and the inputs preceded by the target
features_to_scale_input = [f'Lag_{lag}' for lag in range(1, 8)] + \
                          ['RollingMean_7', 'RollingStd_7', 'RollingMean_14', 'RollingStd_14']
features_to_scale_all = ['y'] + features_to_scale_input

# scaler_all is fitted on the training rows only; scaler_input is created but
# stays unfitted unless the optional lines below are enabled
scaler_all = StandardScaler().fit(train_data[features_to_scale_all])
scaler_input = StandardScaler()

# Apply the training-split transform to copies of both splits
train_data_scaled = train_data.copy()
test_data_scaled = test_data.copy()
for scaled_frame, raw_frame in ((train_data_scaled, train_data), (test_data_scaled, test_data)):
    scaled_frame[features_to_scale_all] = scaler_all.transform(raw_frame[features_to_scale_all])

# Scale input features separately if needed (optional)
# train_data_scaled[features_to_scale_input] = scaler_input.fit_transform(train_data[features_to_scale_input])
# test_data_scaled[features_to_scale_input] = scaler_input.transform(test_data[features_to_scale_input])

# 5. Prepare Training and Testing Data
# Everything except the date and the target is a model input
X_train, y_train = train_data_scaled.drop(columns=['Date', 'y']), train_data_scaled['y']
X_test, y_test = test_data_scaled.drop(columns=['Date', 'y']), test_data_scaled['y']

# 6. Sanitize Feature Names to Remove Special Characters
def sanitize_column_names(columns):
    """
    Return the column labels as strings that contain only letters, digits and
    underscores.

    Each label is converted with str() first, then every other character is
    replaced by an underscore. The result is a list in the input order.
    """
    cleaned = []
    for col in columns:
        cleaned.append(re.sub(r'[^A-Za-z0-9_]', '_', str(col)))
    return cleaned

# Give both design matrices sanitized column names
for _feature_frame in (X_train, X_test):
    _feature_frame.columns = sanitize_column_names(_feature_frame.columns)

# 7. Model Training and Evaluation

# Function to evaluate model performance
def evaluate_model(true, predicted, model_name):
    """
    Print MSE, RMSE and MAE for a set of predictions and return the RMSE.

    Parameters:
        true: observed values.
        predicted: predicted values, aligned with `true`.
        model_name: label used in the printed report.

    Returns:
        The root mean squared error.
    """
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true, predicted)
    report = (
        f"\n{model_name} Performance:",
        f"Mean Squared Error (MSE): {mse:.4f}",
        f"Root Mean Squared Error (RMSE): {rmse:.4f}",
        f"Mean Absolute Error (MAE): {mae:.4f}",
    )
    for report_line in report:
        print(report_line)
    return rmse

# Function to plot predictions
def plot_predictions(true, predicted, model_name):
    """
    Draw actual and predicted values against their sample index.

    Parameters:
        true: observed values (pandas Series/DataFrame, NumPy array or sequence).
        predicted: predicted values, accepted in the same forms.
        model_name: label inserted into the figure title.
    """
    def _to_array(values):
        # pandas containers expose their underlying array; ndarrays pass
        # through; anything else is converted with np.array
        if isinstance(values, (pd.Series, pd.DataFrame)):
            return values.values
        if isinstance(values, np.ndarray):
            return values
        return np.array(values)

    plt.figure(figsize=(14, 7))
    curves = (
        (_to_array(true), {'label': 'Actual', 'color': 'blue'}),
        (_to_array(predicted), {'label': 'Predicted', 'color': 'red', 'alpha': 0.7}),
    )
    for curve_values, curve_style in curves:
        plt.plot(range(len(curve_values)), curve_values, **curve_style)
    plt.title(f'Actual vs Predicted Closing Prices ({model_name})')
    plt.xlabel('Sample Index')
    plt.ylabel('Closing Price (Scaled)')
    plt.legend()
    plt.show()

### **Model 1: LightGBM Regressor**

# Initialize the model
lgbm = LGBMRegressor(random_state=42)

# Explicit "no predictions yet" marker instead of probing locals(): in a
# notebook, an array left over from an earlier run would otherwise be
# evaluated as if it came from this run.
y_pred_lgbm = None

# Train the model
try:
    lgbm.fit(X_train, y_train)
    print("LightGBM training completed successfully.")
except Exception as e:
    print(f"Error during LightGBM training: {e}")

# Predict
try:
    y_pred_lgbm = lgbm.predict(X_test)
    print("LightGBM prediction completed successfully.")
except Exception as e:
    print(f"Error during LightGBM prediction: {e}")

# Evaluate
if y_pred_lgbm is not None:
    rmse_lgbm = evaluate_model(y_test, y_pred_lgbm, 'LightGBM Regressor')

    # Plot predictions
    plot_predictions(y_test, y_pred_lgbm, 'LightGBM Regressor')
else:
    rmse_lgbm = None
    print("Skipping evaluation and plotting for LightGBM due to previous errors.")

### **Model 2: ARIMA**

# For ARIMA, we'll use the original data without scaling
# Stationarity check: the ADF p-value decides whether ARIMA differences the series
result = adfuller(train_data['y'])
print('\nAugmented Dickey-Fuller Test:')
print('ADF Statistic:', result[0])
print('p-value:', result[1])

needs_differencing = result[1] > 0.05
if needs_differencing:
    # Keep the differenced series alongside the raw one. assign() builds a new
    # frame instead of writing a column into a slice of `data`.
    train_data = train_data.assign(y_diff=train_data['y'].diff())
    print("Applied differencing to make the series stationary.")
else:
    train_data = train_data.assign(y_diff=train_data['y'])
    print("Series is stationary. No differencing applied.")

# Fit ARIMA model
# The differencing order d follows the ADF outcome so the messages printed above
# describe what the model really does; p and q can be tuned using AIC/BIC.
arima_order = (5, 1 if needs_differencing else 0, 0)
arima_model = ARIMA(train_data['y'], order=arima_order)

# Reset every result explicitly: a failed step must not silently reuse objects
# left in the kernel by an earlier run of this cell.
arima_model_fit = None
arima_forecast = None
rmse_arima = None

try:
    arima_model_fit = arima_model.fit()
    print("ARIMA model fitted successfully.")
except Exception as e:
    print(f"Error during ARIMA model fitting: {e}")

# Forecast as many steps as there are test rows
forecast_steps = len(test_data)
if arima_model_fit is not None:
    try:
        arima_forecast = arima_model_fit.forecast(steps=forecast_steps)
        print("ARIMA forecasting completed successfully.")
    except Exception as e:
        print(f"Error during ARIMA forecasting: {e}")

# Evaluate
if arima_forecast is not None:
    try:
        # evaluate_model reports ARIMA's errors in price units
        rmse_arima_price = evaluate_model(test_data['y'], arima_forecast, 'ARIMA')
        # LightGBM and LSTM are scored on the target standardised by the scaler
        # fitted on the training split. Dividing by the population std of the
        # training target (what StandardScaler divides by) expresses ARIMA's
        # RMSE in the same units, so the model comparison is meaningful.
        y_train_std = float(np.asarray(train_data['y'], dtype=float).std()) or 1.0
        rmse_arima = rmse_arima_price / y_train_std
        print(f"ARIMA RMSE on the standardized scale: {rmse_arima:.4f}")
    except Exception as e:
        print(f"Error during ARIMA evaluation: {e}")
        rmse_arima = None

# Plot predictions (in price units)
if rmse_arima is not None:
    plt.figure(figsize=(14, 7))
    plt.plot(test_data['Date'].reset_index(drop=True), test_data['y'].reset_index(drop=True), label='Actual', color='blue')
    plt.plot(test_data['Date'].reset_index(drop=True), arima_forecast.values, label='Predicted', color='red', alpha=0.7)
    plt.title('Actual vs Predicted Closing Prices (ARIMA)')
    plt.xlabel('Date')
    plt.ylabel('Closing Price')
    plt.legend()
    plt.show()
else:
    print("Skipping ARIMA plotting due to previous errors.")

### **Model 3: LSTM Neural Network**

# Prepare data for LSTM
def create_lstm_dataset(X, y, time_steps=1):
    """Turn aligned feature/target sequences into sliding windows for an LSTM.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``X`` and is paired
    with the target at position ``i + time_steps`` (the row right after the
    window). Positions are purely positional; any pandas index is ignored.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or array-like
        Feature rows in chronological order.
    y : pandas.Series or array-like
        Targets aligned row-for-row with ``X``.
    time_steps : int, default 1
        Number of consecutive rows per window; must be at least 1.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``float32`` windows of shape ``(n_windows, time_steps, *X.shape[1:])``
        and ``float32`` targets of shape ``(n_windows,)``, where
        ``n_windows = max(len(X) - time_steps, 0)``. When the input is too
        short for a single window the arrays are empty but keep their full
        rank, so callers can still read ``shape[1]`` / ``shape[2]``.

    Raises
    ------
    ValueError
        If ``time_steps`` is smaller than 1.
    IndexError
        If ``y`` has fewer rows than ``X``.
    """
    if time_steps < 1:
        raise ValueError("time_steps must be a positive integer.")

    # Convert once instead of slicing the pandas object for every window.
    features = np.asarray(X, dtype=np.float32)
    targets = np.asarray(y, dtype=np.float32)

    n_windows = len(features) - time_steps
    if n_windows <= 0:
        empty_windows = np.empty((0, time_steps) + features.shape[1:], dtype=np.float32)
        return empty_windows, np.empty((0,), dtype=np.float32)

    Xs = np.stack([features[start:start + time_steps] for start in range(n_windows)])
    # Integer-array indexing raises IndexError when y is shorter than X,
    # instead of silently truncating the way a slice would.
    ys = targets[np.arange(time_steps, time_steps + n_windows)]
    return Xs, ys

time_steps = 10

# Concatenate the scaled train and test frames so the first test windows can
# look back into the end of the training period.
combined_data = pd.concat([train_data_scaled, test_data_scaled], axis=0)
X_lstm = combined_data.drop(columns=['Date', 'y'])
y_lstm = combined_data['y']

# Create datasets: window i covers rows i .. i+time_steps-1 and targets row i+time_steps
X_lstm, y_lstm = create_lstm_dataset(X_lstm, y_lstm, time_steps)

# Split back into train and test.
# A window belongs to the training set when its target row index is below
# split_index, i.e. when the window index is below split_index - time_steps.
# NOTE(review): assumes split_index == len(train_data_scaled) - confirm.
train_window_count = split_index - time_steps

X_train_lstm = X_lstm[:train_window_count]
y_train_lstm = y_lstm[:train_window_count]

X_test_lstm = X_lstm[train_window_count:]
y_test_lstm = y_lstm[train_window_count:]

# Build LSTM Model: two stacked LSTM layers with dropout, one linear output
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), return_sequences=True))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(32, return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))

lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Reset results up front so a failed step cannot be masked by variables left
# over from an earlier execution of this cell.
history = None
y_pred_lstm = None
lstm_trained = False

# Train the model
try:
    history = lstm_model.fit(X_train_lstm, y_train_lstm, epochs=20, batch_size=32, validation_split=0.1, verbose=1)
    lstm_trained = True
    print("LSTM training completed successfully.")
except Exception as e:
    print(f"Error during LSTM training: {e}")

# Predict - only with a successfully trained network; predicting with
# untrained weights would produce a meaningless RMSE in the comparison.
if lstm_trained:
    try:
        y_pred_lstm = lstm_model.predict(X_test_lstm)
        print("LSTM prediction completed successfully.")
    except Exception as e:
        print(f"Error during LSTM prediction: {e}")
        y_pred_lstm = None

# Evaluate
# NOTE(review): y_test_lstm comes from the *_scaled frames, so this RMSE is
# presumably in scaled units, whereas the ARIMA section says it works on
# unscaled data - confirm the units match before comparing the two.
if y_pred_lstm is not None:
    rmse_lstm = evaluate_model(y_test_lstm, y_pred_lstm.flatten(), 'LSTM')

    # Plot predictions
    plot_predictions(y_test_lstm, y_pred_lstm.flatten(), 'LSTM')
else:
    rmse_lstm = None
    print("Skipping evaluation and plotting for LSTM due to previous errors.")

# 8. Model Comparison

# Collect the header plus one line per model that produced an RMSE, then emit
# the whole report in a single print call.
report_lines = ["\nModel RMSE Comparison:"]
if rmse_lgbm is not None:
    report_lines.append(f"LightGBM Regressor RMSE: {rmse_lgbm:.4f}")
if rmse_arima is not None:
    report_lines.append(f"ARIMA RMSE: {rmse_arima:.4f}")
if rmse_lstm is not None:
    report_lines.append(f"LSTM RMSE: {rmse_lstm:.4f}")
print("\n".join(report_lines))

# 9. Final Model Selection and Prediction

# LightGBM is chosen only when it produced an RMSE and no other evaluated
# model achieved a lower one.
if rmse_lgbm is not None and (rmse_arima is None or rmse_lgbm <= rmse_arima) and (rmse_lstm is None or rmse_lgbm <= rmse_lstm):
    from sklearn.base import clone  # local import: only this branch needs it

    print("\nSelecting LightGBM Regressor as the final model.")
    next_day_prediction_scaled = None
    next_day_prediction = None

    # Retrain on entire dataset.
    # Fit an unfitted copy of the scaler rather than refitting scaler_all in
    # place: scaler_all is used again further down to inverse-transform the
    # test-set predictions and must keep the statistics it had before.
    final_scaler = clone(scaler_all)
    full_data = data.copy()
    full_data[features_to_scale_all] = final_scaler.fit_transform(full_data[features_to_scale_all])

    X_full = full_data.drop(columns=['Date', 'y'])
    y_full = full_data['y']

    # Sanitize column names in the full dataset
    X_full.columns = sanitize_column_names(X_full.columns)

    lgbm_final = LGBMRegressor(random_state=42)
    final_model_fitted = False
    try:
        lgbm_final.fit(X_full, y_full)
        final_model_fitted = True
        print("Final LightGBM training completed successfully.")
    except Exception as e:
        print(f"Error during final LightGBM training: {e}")

    # Future Prediction (Next Day Closing Price)
    # Build the feature row from the *unscaled* frame so every feature is in
    # raw units and gets scaled exactly once below. (Starting from the scaled
    # frame mixed scaled and raw values in the same row.)
    last_row = data.iloc[-1:].copy()
    close_history = data['y']

    # Move to the next business day; a plain +1 day can land on a weekend,
    # a DayOfWeek value the model never saw. Market holidays are not handled.
    next_date = last_row['Date'] + pd.offsets.BDay(1)
    last_row['Date'] = next_date

    # Update time-based features
    last_row['DayOfWeek'] = last_row['Date'].dt.dayofweek
    last_row['WeekOfYear'] = last_row['Date'].dt.isocalendar().week
    last_row['Month'] = last_row['Date'].dt.month
    last_row['Quarter'] = last_row['Date'].dt.quarter
    last_row['Year'] = last_row['Date'].dt.year

    # Update lag features. Training rows use data['y'].shift(lag), so for the
    # day after the last observation Lag_k is the k-th most recent close,
    # i.e. close_history.iloc[-k] (shift(lag).iloc[-1] is one day too old).
    for lag in range(1, 8):
        if lag <= len(close_history):
            last_row[f'Lag_{lag}'] = close_history.iloc[-lag]
        else:
            last_row[f'Lag_{lag}'] = 0  # Default when the history is shorter than the lag

    # Update rolling features from the most recent known closes.
    # NOTE(review): the training rows' rolling windows include that row's own
    # close (rolling over 'y' without a shift); that value is unknown for the
    # next day, so the latest available window is used here instead.
    last_row['RollingMean_7'] = close_history.rolling(window=7).mean().iloc[-1]
    last_row['RollingStd_7'] = close_history.rolling(window=7).std().iloc[-1]
    last_row['RollingMean_14'] = close_history.rolling(window=14).mean().iloc[-1]
    last_row['RollingStd_14'] = close_history.rolling(window=14).std().iloc[-1]

    # The row is deliberately not passed through dropna(): dropping the only
    # row would leave nothing to predict on.

    # Scale with the full column set the scaler was fitted on. 'y' is still
    # in the row (holding the last known close) purely as a placeholder so
    # the column count matches; it is dropped before prediction.
    features_scaled = False
    try:
        last_row[features_to_scale_all] = final_scaler.transform(last_row[features_to_scale_all])
        features_scaled = True
        print("Scaling of input features for prediction completed successfully.")
    except Exception as e:
        print(f"Error during scaling of input features: {e}")

    if final_model_fitted and features_scaled:
        # Prepare input features exactly like X_full: drop, sanitize, then
        # align to the training column order (absent columns default to 0).
        X_next = last_row.drop(columns=['Date', 'y'], errors='ignore')
        X_next.columns = sanitize_column_names(X_next.columns)
        X_next = X_next.reindex(columns=X_full.columns, fill_value=0)

        # Predict the next day's closing price (still in scaled units)
        try:
            next_day_prediction_scaled = lgbm_final.predict(X_next)
            print("Next day prediction completed successfully.")
        except Exception as e:
            print(f"Error during LightGBM prediction for next day: {e}")
            next_day_prediction_scaled = None

    if next_day_prediction_scaled is not None:
        # Inverse transform the prediction: place it in the 'y' slot of an
        # otherwise-zero row, undo the scaling, and read that slot back.
        target_position = features_to_scale_all.index('y')
        inverse_transform_input = np.zeros((1, len(features_to_scale_all)))
        inverse_transform_input[0, target_position] = next_day_prediction_scaled[0]
        next_day_prediction = final_scaler.inverse_transform(inverse_transform_input)[0, target_position]

        print(f"\nPredicted Next Day Closing Price: {next_day_prediction:.2f}")
    else:
        print("\nNext day prediction could not be made due to previous errors.")
else:
    print("\nLightGBM Regressor did not perform the best. Skipping final model selection and prediction.")

# 10. Plotting the Actual vs Predicted Prices for Test Data

def _inverse_scale_target(scaled_values):
    """Convert scaled 'y' values back to price units using scaler_all.

    The scaler was fitted on every column in features_to_scale_all, so the
    values are placed in the 'y' column of an otherwise-zero matrix, the
    scaling is inverted, and only the 'y' column is returned. The column is
    located by name rather than assumed to be the last one.
    """
    scaled_values = np.asarray(scaled_values, dtype=float).reshape(-1)
    target_position = features_to_scale_all.index('y')
    padded = np.zeros((len(scaled_values), len(features_to_scale_all)))
    padded[:, target_position] = scaled_values
    return scaler_all.inverse_transform(padded)[:, target_position]

# rmse_lgbm is None when the LightGBM evaluation was skipped; in that case
# there are no predictions to plot and the unguarded code raised NameError.
# NOTE(review): assumes scaler_all still holds the statistics that were used
# to scale y_test / y_pred_lgbm - confirm it is not refitted in between.
if rmse_lgbm is not None:
    # Inverse transform the scaled prices
    y_test_actual = _inverse_scale_target(y_test)
    y_pred_actual = _inverse_scale_target(y_pred_lgbm)

    test_dates = test_data['Date'].reset_index(drop=True)
    plt.figure(figsize=(14, 7))
    plt.plot(test_dates, y_test_actual, label='Actual', color='blue')
    plt.plot(test_dates, y_pred_actual, label='Predicted', color='red', alpha=0.7)
    plt.title('Actual vs Predicted Closing Prices (LightGBM)')
    plt.xlabel('Date')
    plt.ylabel('Closing Price')
    plt.legend()
    plt.show()
else:
    print("Skipping LightGBM price plot because no LightGBM predictions are available.")

# 11. Feature Importance Visualization

if rmse_lgbm is not None:
    # Pair each training column with the importance reported by the fitted
    # LightGBM model, most important first.
    feature_names = X_train.columns
    importances = lgbm.feature_importances_
    feature_importances = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)

    # Horizontal bar chart; the y-axis is inverted so the top-ranked feature
    # is drawn at the top.
    plt.figure(figsize=(12, 8))
    plt.barh(feature_importances['Feature'], feature_importances['Importance'])
    plt.gca().invert_yaxis()
    plt.title('Feature Importances from LightGBM Regressor')
    plt.xlabel('Importance')
    plt.show()


In [2]:
import pandas as pd
import yfinance as yf
from pycaret.time_series import TSForecastingExperiment

# Step 1: Download the full SOXL price history with yfinance
dat = yf.Ticker("SOXL")
df = dat.history(period='max')

# Steps 2-3: Keep only the 'Open' column and move the date index into a
# regular column, giving a two-column frame for PyCaret
open_data = df[['Open']].reset_index()
open_data.columns = ['Date', 'Open']

# Step 4: Set up the PyCaret time-series experiment on the 'Open' series
# (forecast horizon of 7 rows, 3 cross-validation folds, fixed session id)
exp = TSForecastingExperiment()
exp.setup(
    data=open_data['Open'],
    fh=7,
    fold=3,
    session_id=42,
)

# Step 5: Train the candidate models and keep the best one
best_model = exp.compare_models()

# Step 6: Finalize the best model and produce the forecast
final_model = exp.finalize_model(best_model)
forecast = exp.predict_model(final_model)

# Step 7: Plot the forecast
exp.plot_model(final_model, plot='forecast')

# Optional: Print forecasted values
print(forecast)


Unnamed: 0,Description,Value
0,session_id,42
1,Target,Open
2,Approach,Univariate
3,Exogenous Variables,Not Present
4,Original data shape,"(3703, 1)"
5,Transformed data shape,"(3703, 1)"
6,Transformed train set shape,"(3696, 1)"
7,Transformed test set shape,"(7, 1)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2,TT (Sec)
naive,Naive Forecaster,1.5473,0.8392,3.18,3.5879,0.0926,0.0899,-9.5768,1.99
arima,ARIMA,1.5543,0.8735,3.1939,3.7348,0.0941,0.0886,-13.1524,1.0833
polytrend,Polynomial Trend Forecaster,1.5551,0.8317,3.1962,3.5559,0.0912,0.0967,-7.4531,0.6633
snaive,Seasonal Naive Forecaster,1.6714,0.9222,3.4362,3.9428,0.1037,0.0975,-5.9632,0.0433
grand_means,Grand Means Forecaster,10.9461,5.2788,22.5002,22.5685,0.6595,0.9847,-270.4225,0.6633


Processing:   0%|          | 0/121 [00:00<?, ?it/s]

KeyboardInterrupt: 