# Technical Analysis Indicator Price Prediction
The goal of this project is to analyze the predictive power of the top 10 most popular technical analysis (TA) indicators and measure how well they predict price over a 30-day period. I will compute the value of each indicator on day 1 (30 trading days ago), then take the daily closing price 30 days later and measure how well the indicator predicted that price.

First, we'll find the top stocks by market cap from the Nasdaq screener export (the code below keeps the 1,000 largest) and pull them into a DataFrame.


In [None]:
# Importing pandas library for data manipulation and analysis
import pandas as pd

# Load the Nasdaq screener export into a DataFrame
csv_file_path = '/Users/evancallaghan/Downloads/nasdaq_screener_1726538993372.csv'
df = pd.read_csv(csv_file_path)

# Inspect the DataFrame to understand its structure
print(df.head())

# Keep only 'Symbol', 'Name', and 'Market Cap'. The explicit copy makes the
# column assignment below write to an independent frame instead of a slice.
df = df[['Symbol', 'Name', 'Market Cap']].copy()

# Convert 'Market Cap' to float, stripping dollar signs and thousands
# separators first. The pattern is a raw string so that '\$' is a regex
# escape rather than an invalid Python string escape.
df['Market Cap'] = df['Market Cap'].replace({r'\$': '', ',': ''}, regex=True).astype(float)

# Rank by market cap (largest first) and keep the top 1000 rows; copy so the
# later in-place edits of df_sorted do not act on a view of df
df_sorted = df.sort_values(by='Market Cap', ascending=False).head(1000).copy()
df_sorted.head()


In [None]:
# Replace the old index with a fresh one that counts from 1, so the row
# label doubles as the market-cap rank
df_sorted.reset_index(drop=True, inplace=True)
df_sorted.index += 1

# Preview the re-indexed DataFrame
df_sorted.head()

Remove all securities except common stocks.

In [None]:
# Trim leading/trailing whitespace from the security names
df_sorted['Name'] = df_sorted['Name'].str.strip()

# Name fragments that mark a row to be filtered out
terms_to_drop = [
    "Capital Stock",
    "Depository Shares",
    "Global Notes",
    "ADS",
    "Registry Shares",
    "Depositary Shares",
]

# Build one regex alternation: \b anchors each term at word boundaries and
# the pipe '|' makes the pattern match if any single term is present
pattern = '|'.join("\\b" + term + "\\b" for term in terms_to_drop)

# Keep only the rows whose name matches none of the terms (case-insensitive;
# missing names count as "no match" and are kept)
has_dropped_term = df_sorted['Name'].str.contains(pattern, case=False, na=False)
df_filtered = df_sorted[~has_dropped_term]

# Display the filtered DataFrame
df_filtered.head()

In [None]:
# Discard the pre-filter row labels and renumber the remaining rows from 1
df_filtered.reset_index(drop=True, inplace=True)
df_filtered.index += 1

# Preview the re-indexed DataFrame
df_filtered.head()

In [None]:
df_filtered[595:600]

In [None]:
df_filtered.shape

Below are the technical indicators we are going to use for this project (the list currently names nine of the planned ten).
1. Relative Strength Index (RSI)
2. Moving Average Convergence Divergence (MACD)
3. Stochastic Oscillator
4. Simple Moving Average (SMA)
5. Exponential Moving Average (EMA)
6. Volume Weighted Average Price (VWAP)
7. Bollinger Bands
8. Average True Range (ATR)
9. Fibonacci Retracement 

In [None]:
# 10 year historical data top 600 stocks
# Pulls data from yahoo finance into CSV files
import yfinance as yf
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import time
import os

# Function to download stock data for a single stock
def download_stock_data(ticker, retries=3, start="2015-02-10", end="2025-02-17"):
    """Download daily Close/High/Low/Volume history for one ticker.

    Args:
        ticker: Symbol to request from Yahoo Finance.
        retries: Maximum number of attempts when a download raises.
        start: Start date (YYYY-MM-DD) passed to ``yf.download``.
        end: End date (YYYY-MM-DD) passed to ``yf.download``.

    Returns:
        A DataFrame with columns Close, High, Low, Volume, Date and Ticker
        on a fresh integer index, or None when the download came back empty
        or every attempt raised.
    """
    for attempt in range(retries):
        try:
            # Pause one second before each request to avoid rate limiting
            time.sleep(1)

            print(f"Downloading data for {ticker}, attempt {attempt + 1}")
            raw = yf.download(ticker, start=start, end=end, interval="1d")

            if raw.empty:
                print(f"Warning: No data found for {ticker}")
                return None  # Return None if data is empty

            # NOTE(review): newer yfinance releases appear to return
            # (field, ticker) MultiIndex columns even for a single ticker;
            # flatten to the field level so the selection below works in
            # both layouts -- confirm against the installed version.
            if isinstance(raw.columns, pd.MultiIndex):
                raw.columns = raw.columns.get_level_values(0)

            # Copy so the column assignments below do not write to a slice
            data = raw[['Close', 'High', 'Low', 'Volume']].copy()

            # Keep the date index as a regular 'Date' column, then replace
            # the index with a plain integer range
            data['Date'] = data.index
            data.reset_index(drop=True, inplace=True)

            data['Ticker'] = ticker
            print(f"Downloaded data for {ticker}:\n{data.head()}")
            return data
        except Exception as e:
            # Best-effort download: report the failure and try again
            print(f"Error downloading data for {ticker}: {e}")
            time.sleep(1)  # Extra pause before the next attempt
    return None  # Return None after retries if still fails

# Ticker symbols (as strings) for the first 600 rows of df_filtered
tickers = [str(symbol) for symbol in df_filtered['Symbol'].head(600)]

# Number of tickers handled per batch / per output file
batch_size = 100

# Directory that receives the batch CSV files; created if missing
output_dir = "/Users/evancallaghan/flatiron_ds/phase_5/capstone_project"
os.makedirs(output_dir, exist_ok=True)


def download_batch(batch_tickers, batch_index):
    """Download one batch of tickers with two worker threads and save it.

    Tickers whose download returned None are skipped. The remaining frames
    are stacked and written to a single CSV named after ``batch_index``;
    when nothing was downloaded only a message is printed.
    """
    with ThreadPoolExecutor(max_workers=2) as pool:
        frames = list(pool.map(download_stock_data, batch_tickers))

    downloaded = []
    for symbol, frame in zip(batch_tickers, frames):
        if frame is None:
            continue
        frame['Ticker'] = symbol  # make sure every row carries its ticker
        downloaded.append(frame)

    if not downloaded:
        print(f"No data downloaded for batch {batch_index}.")
        return

    combined = pd.concat(downloaded, ignore_index=False)
    print(f"Saving batch {batch_index} data to CSV.")
    combined.to_csv(f"{output_dir}/top600_10yr_stock_price_data_{batch_index}.csv", index=False)


# Walk the ticker list in batch_size chunks; batch numbers start at 1
for batch_index, offset in enumerate(range(0, len(tickers), batch_size), start=1):
    download_batch(tickers[offset:offset + batch_size], batch_index)

print("All batches processed and saved.")


In [None]:
# Not sure if we will use this

# 5 year historical data top 600 stocks
# Pulls data from yahoo finance into CSV files
import yfinance as yf
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import time
import os

# Function to download stock data for a single stock
def download_stock_data(ticker, retries=3, start="2020-02-10", end="2025-02-17"):
    """Download daily Close/High/Low/Volume history for one ticker.

    Args:
        ticker: Symbol to request from Yahoo Finance.
        retries: Maximum number of attempts when a download raises.
        start: Start date (YYYY-MM-DD) passed to ``yf.download``.
        end: End date (YYYY-MM-DD) passed to ``yf.download``.

    Returns:
        A DataFrame with columns Close, High, Low, Volume, Date and Ticker
        on a fresh integer index, or None when the download came back empty
        or every attempt raised.
    """
    for attempt in range(retries):
        try:
            # Pause one second before each request to avoid rate limiting
            time.sleep(1)

            print(f"Downloading data for {ticker}, attempt {attempt + 1}")
            raw = yf.download(ticker, start=start, end=end, interval="1d")

            if raw.empty:
                print(f"Warning: No data found for {ticker}")
                return None  # Return None if data is empty

            # NOTE(review): newer yfinance releases appear to return
            # (field, ticker) MultiIndex columns even for a single ticker;
            # flatten to the field level so the selection below works in
            # both layouts -- confirm against the installed version.
            if isinstance(raw.columns, pd.MultiIndex):
                raw.columns = raw.columns.get_level_values(0)

            # Copy so the column assignments below do not write to a slice
            data = raw[['Close', 'High', 'Low', 'Volume']].copy()

            # Keep the date index as a regular 'Date' column, then replace
            # the index with a plain integer range
            data['Date'] = data.index
            data.reset_index(drop=True, inplace=True)

            data['Ticker'] = ticker
            print(f"Downloaded data for {ticker}:\n{data.head()}")
            return data
        except Exception as e:
            # Best-effort download: report the failure and try again
            print(f"Error downloading data for {ticker}: {e}")
            time.sleep(1)  # Extra pause before the next attempt
    return None  # Return None after retries if still fails

# Ticker symbols (as strings) for the first 600 rows of df_filtered
tickers = [str(symbol) for symbol in df_filtered['Symbol'].head(600)]

# Number of tickers handled per batch / per output file
batch_size = 100

# Directory that receives the batch CSV files; created if missing
output_dir = "/Users/evancallaghan/flatiron_ds/phase_5/capstone_project"
os.makedirs(output_dir, exist_ok=True)


def download_batch(batch_tickers, batch_index):
    """Download one batch of tickers with two worker threads and save it.

    Tickers whose download returned None are skipped. The remaining frames
    are stacked and written to a single CSV named after ``batch_index``;
    when nothing was downloaded only a message is printed.
    """
    with ThreadPoolExecutor(max_workers=2) as pool:
        frames = list(pool.map(download_stock_data, batch_tickers))

    downloaded = []
    for symbol, frame in zip(batch_tickers, frames):
        if frame is None:
            continue
        frame['Ticker'] = symbol  # make sure every row carries its ticker
        downloaded.append(frame)

    if not downloaded:
        print(f"No data downloaded for batch {batch_index}.")
        return

    combined = pd.concat(downloaded, ignore_index=False)
    print(f"Saving batch {batch_index} data to CSV.")
    combined.to_csv(f"{output_dir}/top600_5yr_stock_price_data_{batch_index}.csv", index=False)


# Walk the ticker list in batch_size chunks; batch numbers start at 1
for batch_index, offset in enumerate(range(0, len(tickers), batch_size), start=1):
    download_batch(tickers[offset:offset + batch_size], batch_index)

print("All batches processed and saved.")


In [None]:
import pandas as pd

# Batch files are numbered 1 through 6
file_indices = range(1, 7)

# Read every batch CSV into its own DataFrame
df_list = [
    pd.read_csv(
        f'/Users/evancallaghan/flatiron_ds/phase_5/capstone_project/top600_10yr_stock_price_data_{i}.csv'
    )
    for i in file_indices
]

# Stack the batches row-wise under one fresh integer index
df_all = pd.concat(df_list, ignore_index=True)

# Preview the merged DataFrame
df_all.head()


In [None]:
df_all.shape

In [None]:
# Keep the six working columns in a fixed order, then rename them to the
# names used by the indicator cells
kept_columns = ['Ticker', 'Date', 'Close', 'High', 'Low', 'Volume']
new_names = {'Ticker': 'Symbol', 'High': 'Daily_High', 'Low': 'Daily_Low'}
df_all = df_all[kept_columns].rename(columns=new_names)
df_all.head()

In [None]:
print(df_all['Date'].dtype)

In [None]:
df_all['Date'] = pd.to_datetime(df_all['Date'], errors='coerce')
print(df_all['Date'].dtype)

In [None]:
df_all.shape

In [None]:
# Distinct symbols in order of first appearance; keep only the rows that
# belong to the first 200 of them
top_stocks = df_all['Symbol'].drop_duplicates().tolist()
in_top_200 = df_all['Symbol'].isin(top_stocks[:200])
df_all = df_all[in_top_200]
df_all.head()

In [None]:
len(df_all['Symbol'].unique())

In [None]:
# I have to change this for now, but I can change the dataframe name later
df_all_cleaned = df_all.copy()
df_all_cleaned.head()

In [None]:
df_all_cleaned.shape

In [None]:
# don't think I need this, can delete

# Check for non-numeric values in the 'Close' column
# non_numeric_values = df_all_cleaned[~df_all_cleaned['Close'].apply(pd.to_numeric, errors='coerce').notna()]
# print(non_numeric_values[['Date', 'Symbol', 'Close']].head())

In [None]:
# don't think I need this, can delete

# df_all_cleaned = df_all_cleaned[pd.to_numeric(df_all_cleaned['Close'], errors='coerce').notna()]


In [None]:
stock_data_1_week = df_all_cleaned.copy()
stock_data_1_month = df_all_cleaned.copy()
stock_data_3_month = df_all_cleaned.copy()

In [None]:
# Windows for my TA indicator calculations
# Each list holds the rolling-window lengths (in rows of the daily data)
# applied to the matching frame: stock_data_1_week, stock_data_1_month
# and stock_data_3_month respectively.

one_week_window = [3, 5, 7]
one_month_window = [7, 10, 14, 20, 30]
three_month_window = [14, 20, 30, 50, 60, 90]

In [None]:
# Daily Volume MA
# Per-symbol rolling mean of 'Volume' for every window of each horizon
# (1 week, 1 month, 3 month). min_periods=1 yields a value from the first
# row of each symbol onward.
for frame, windows in (
    (stock_data_1_week, one_week_window),
    (stock_data_1_month, one_month_window),
    (stock_data_3_month, three_month_window),
):
    for window in windows:
        frame[f'Volume_{window}day_avg'] = frame.groupby('Symbol')['Volume'].transform(
            lambda s: s.rolling(window=window, min_periods=1).mean()
        )


In [None]:
# Daily High Price MA
# Per-symbol rolling mean of 'Daily_High' for every window of each horizon
# (1 week, 1 month, 3 month)
for frame, windows in (
    (stock_data_1_week, one_week_window),
    (stock_data_1_month, one_month_window),
    (stock_data_3_month, three_month_window),
):
    for window in windows:
        frame[f'Daily_High_{window}day_avg'] = frame.groupby('Symbol')['Daily_High'].transform(
            lambda s: s.rolling(window=window, min_periods=1).mean()
        )

In [None]:
# Daily Low Price MA
# Per-symbol rolling mean of 'Daily_Low' for every window of each horizon
# (1 week, 1 month, 3 month)
for frame, windows in (
    (stock_data_1_week, one_week_window),
    (stock_data_1_month, one_month_window),
    (stock_data_3_month, three_month_window),
):
    for window in windows:
        frame[f'Daily_Low_{window}day_avg'] = frame.groupby('Symbol')['Daily_Low'].transform(
            lambda s: s.rolling(window=window, min_periods=1).mean()
        )

In [None]:
# Simple Moving Average
# Per-symbol rolling mean of 'Close' for every window of each horizon
# (1 week, 1 month, 3 month)
for frame, windows in (
    (stock_data_1_week, one_week_window),
    (stock_data_1_month, one_month_window),
    (stock_data_3_month, three_month_window),
):
    for window in windows:
        frame[f'SMA_{window}'] = frame.groupby('Symbol')['Close'].transform(
            lambda s: s.rolling(window=window, min_periods=1).mean()
        )


In [None]:
# Exponential Moving Average
# Per-symbol EMA of 'Close' (span=window, adjust=False) for every window of
# each horizon (1 week, 1 month, 3 month)
for frame, windows in (
    (stock_data_1_week, one_week_window),
    (stock_data_1_month, one_month_window),
    (stock_data_3_month, three_month_window),
):
    for window in windows:
        frame[f'EMA_{window}'] = frame.groupby('Symbol')['Close'].transform(
            lambda s: s.ewm(span=window, adjust=False).mean()
        )

In [None]:
# RSI

# RSI helper built on simple rolling means of gains and losses
def calculate_rsi(df, window=14):
    """Return the Relative Strength Index of ``df['Close']``.

    Price changes are taken row to row exactly as the rows are given; the
    function does no grouping, so ``df`` should hold a single symbol.

    Args:
        df: Frame with a 'Close' column.
        window: Number of rows in each rolling average.

    Returns:
        A Series aligned with ``df``; positions without a full window of
        rows are NaN.
    """
    change = df['Close'].diff()

    # Upward moves keep their size and everything else counts as 0;
    # downward moves are stored as positive magnitudes
    ups = change.where(change > 0, 0)
    downs = -change.where(change < 0, 0)

    mean_up = ups.rolling(window=window).mean()
    mean_down = downs.rolling(window=window).mean()

    # RSI = 100 - 100 / (1 + RS), where RS = average gain / average loss
    relative_strength = mean_up / mean_down
    return 100 - (100 / (1 + relative_strength))

# Add RSI columns to every horizon frame. All RSI values are computed per
# symbol: the original default 'RSI' column was computed over the whole
# frame, so price changes and rolling windows ran across the boundary
# between one symbol's rows and the next. Using transform on the 'Close'
# column also avoids groupby.apply on whole frames, whose handling of the
# grouping column is deprecated in recent pandas.
for frame, windows in (
    (stock_data_1_week, one_week_window),
    (stock_data_1_month, one_month_window),
    (stock_data_3_month, three_month_window),
):
    # Default RSI (calculate_rsi's default window)
    frame['RSI'] = frame.groupby('Symbol')['Close'].transform(
        lambda s: calculate_rsi(s.to_frame(name='Close'))
    )

    # One RSI column per window of this horizon
    for window in windows:
        frame[f'RSI_{window}'] = frame.groupby('Symbol')['Close'].transform(
            lambda s: calculate_rsi(s.to_frame(name='Close'), window=window)
        )


In [None]:
# Pair each horizon frame with the window lengths that belong to it, so
# later cells can loop over (frame, windows) together
window_mapping = dict(
    stock_data_1_week=(stock_data_1_week, one_week_window),
    stock_data_1_month=(stock_data_1_month, one_month_window),
    stock_data_3_month=(stock_data_3_month, three_month_window),
)

# The same three frames as a plain list, in the same order
stock_dataframes = [frame for frame, _ in window_mapping.values()]

In [None]:
# MACD
# For each horizon frame: MACD line (12-span EMA minus 26-span EMA of the
# close), its 9-span EMA signal line, the histogram (MACD minus signal),
# then a per-symbol rolling mean of all three for every window of the frame.
for frame, windows in window_mapping.values():
    closes = frame.groupby('Symbol')['Close']
    fast_ema = closes.transform(lambda s: s.ewm(span=12, adjust=False).mean())
    slow_ema = closes.transform(lambda s: s.ewm(span=26, adjust=False).mean())

    frame['MACD'] = fast_ema - slow_ema
    frame['Signal_Line'] = frame.groupby('Symbol')['MACD'].transform(
        lambda s: s.ewm(span=9, adjust=False).mean()
    )
    frame['MACD_Histogram'] = frame['MACD'] - frame['Signal_Line']

    for window in windows:
        for source, target in (
            ('MACD', f'MACD_rolling_{window}'),
            ('Signal_Line', f'Signal_rolling_{window}'),
            ('MACD_Histogram', f'MACD_Histogram_rolling_{window}'),
        ):
            frame[target] = frame.groupby('Symbol')[source].transform(
                lambda s: s.rolling(window=window, min_periods=1).mean()
            )





In [None]:
pd.options.display.max_columns = None  # Show all columns

stock_data_3_month.head()

In [None]:
# stochastic oscillator
# %K places the close inside the per-symbol 14-row low/high range (as a
# percentage). %D_n is the per-symbol n-row rolling mean of %K; every frame
# gets %D_3 plus one %D column per window of its horizon.
for frame, windows in window_mapping.values():
    by_symbol = frame.groupby('Symbol')
    lowest_low = by_symbol['Daily_Low'].transform(
        lambda s: s.rolling(window=14, min_periods=1).min()
    )
    highest_high = by_symbol['Daily_High'].transform(
        lambda s: s.rolling(window=14, min_periods=1).max()
    )
    frame['%K'] = ((frame['Close'] - lowest_low) / (highest_high - lowest_low)) * 100

    # 3 comes first so %D_3 keeps its position right after %K
    for span in [3, *windows]:
        frame[f'%D_{span}'] = frame.groupby('Symbol')['%K'].transform(
            lambda s: s.rolling(window=span, min_periods=1).mean()
        )






In [None]:
# VWAP

# Running (cumulative) volume-weighted average price per symbol
def calculate_vwap(df):
    """Add per-symbol cumulative VWAP columns to ``df`` in place.

    'Close' and 'Volume' are coerced to numeric (unparseable values become
    NaN). Three columns are written: 'Cumulative_Price_Volume' (running sum
    of Close * Volume within each symbol), 'Cumulative_Volume' (running sum
    of Volume within each symbol) and 'VWAP' (their ratio).

    Returns:
        The same DataFrame object that was passed in.
    """
    for column in ('Close', 'Volume'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    symbols = df['Symbol']
    price_times_volume = df['Close'] * df['Volume']

    df['Cumulative_Price_Volume'] = price_times_volume.groupby(symbols).cumsum()
    df['Cumulative_Volume'] = df['Volume'].groupby(symbols).cumsum()
    df['VWAP'] = df['Cumulative_Price_Volume'] / df['Cumulative_Volume']

    return df

# Add VWAP to every frame, then per-symbol rolling means of VWAP for the
# windows that window_mapping pairs with that frame
for frame in stock_dataframes:
    calculate_vwap(frame)

    for mapped_frame, windows in window_mapping.values():
        if mapped_frame is not frame:
            continue
        for window in windows:
            frame[f'VWAP_{window}'] = frame.groupby('Symbol')['VWAP'].transform(
                lambda s: s.rolling(window=window, min_periods=1).mean()
            )

    # The cumulative helper columns are only needed to derive VWAP
    frame.drop(columns=['Cumulative_Price_Volume', 'Cumulative_Volume'], inplace=True)
        

In [None]:
# for name, (df, windows) in window_mapping.items():
#     for window in windows:
#         # Drop the VWAP and VWAP_{window} columns after the rolling averages are calculated
#         df.drop(columns=['VWAP'], inplace=True, errors='ignore')  # Drop the VWAP column
#         df.drop(columns=[f'VWAP_{window}' for window in windows], inplace=True, errors='ignore')  # Drop VWAP_{window} columns

In [None]:
# Calculate Bollinger Bands per symbol

def calculate_bollinger_bands(df, windows):
    """Add per-symbol Bollinger Band columns to ``df`` in place.

    For each window ``w`` four columns are written: 'bb_Middle_Band_w'
    (rolling mean of Close), 'Std_Dev_w' (rolling standard deviation of
    Close), and 'Upper_Band_w' / 'Lower_Band_w' (middle band plus / minus
    two standard deviations). 'Close' is coerced to numeric first.

    Returns:
        The same DataFrame object that was passed in.
    """
    df['Close'] = pd.to_numeric(df['Close'], errors='coerce')

    for span in windows:
        closes = df.groupby('Symbol')['Close']
        middle = closes.transform(lambda s: s.rolling(window=span, min_periods=1).mean())
        spread = closes.transform(lambda s: s.rolling(window=span, min_periods=1).std())

        df[f'bb_Middle_Band_{span}'] = middle
        df[f'Std_Dev_{span}'] = spread
        df[f'Upper_Band_{span}'] = middle + (spread * 2)
        df[f'Lower_Band_{span}'] = middle - (spread * 2)

    return df

# Add Bollinger Bands to all dataframes.
# A single pass over window_mapping covers every frame once; the previous
# outer while-loop repeated this full pass len(stock_dataframes) times and
# recomputed identical columns on each repeat.
for frame, windows in window_mapping.values():
    calculate_bollinger_bands(frame, windows)


In [None]:
# Average True Range (ATR)

# True Range (TR) per row, stored in the 'ATR' column
def calculate_true_range(df):
    """Add an 'ATR' column holding each row's True Range, in place.

    True Range is the largest of: high - low, |high - previous close| and
    |low - previous close|, where the previous close comes from the
    preceding row of the same symbol. 'Close', 'Daily_High' and 'Daily_Low'
    are coerced to numeric first.

    Returns:
        The same DataFrame object that was passed in.
    """
    for column in ('Close', 'Daily_High', 'Daily_Low'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Shift within each symbol so no row sees another symbol's close
    previous_close = df.groupby('Symbol')['Close'].shift(1)

    candidates = pd.concat(
        [
            df['Daily_High'] - df['Daily_Low'],
            (df['Daily_High'] - previous_close).abs(),
            (df['Daily_Low'] - previous_close).abs(),
        ],
        axis=1,
    )

    # Row-wise maximum; missing candidates (no previous close) are skipped
    df['ATR'] = candidates.max(axis=1)

    return df

# Compute True Range on every frame, then average it per symbol over each
# window of that frame's horizon to obtain the ATR_{window} columns
for frame, windows in window_mapping.values():
    frame = calculate_true_range(frame)
    for span in windows:
        frame[f'ATR_{span}'] = frame.groupby('Symbol')['ATR'].transform(
            lambda s: s.rolling(window=span, min_periods=1).mean()
        )

In [None]:
import numpy as np
import pandas as pd


def calculate_fibonacci_retracement(df, windows):
    """Return a copy of ``df`` with per-symbol Fibonacci retracement levels.

    For each window ``w`` the result gains 'Fib_w_High_Max' (rolling max of
    Daily_High), 'Fib_w_Low_Min' (rolling min of Daily_Low) and one column
    per level named '{w}_day_Fib_{int(level * 100)}', computed as
    high_max - level * (high_max - low_min). The integer in the name is
    truncated, e.g. level 0.236 is stored under 'Fib_23'.

    Args:
        df: Frame with 'Symbol', 'Daily_High' and 'Daily_Low' columns. The
            two price columns are coerced to numeric in place.
        windows: Iterable of rolling-window lengths (in rows).

    Returns:
        A new DataFrame with the same rows, row order and original columns,
        plus the Fibonacci columns.
    """
    # Convert relevant columns to numeric
    df['Daily_High'] = pd.to_numeric(df['Daily_High'], errors='coerce')
    df['Daily_Low'] = pd.to_numeric(df['Daily_Low'], errors='coerce')

    # Define Fibonacci levels
    fib_levels = [0.236, 0.382, 0.500, 0.618, 0.786, 1.000, 1.618, 2.618, 4.236]

    result = df.copy()
    by_symbol = df.groupby('Symbol')

    # transform keeps every row and the 'Symbol' column in place. The
    # previous groupby.apply version mutated each group frame and depended
    # on the grouping column being handed to the applied function, which
    # recent pandas deprecates.
    for window in windows:
        high_max = by_symbol['Daily_High'].transform(
            lambda s: s.rolling(window=window, min_periods=1).max()
        )
        low_min = by_symbol['Daily_Low'].transform(
            lambda s: s.rolling(window=window, min_periods=1).min()
        )
        result[f'Fib_{window}_High_Max'] = high_max
        result[f'Fib_{window}_Low_Min'] = low_min

        # Calculate Fibonacci retracement levels for each level
        for level in fib_levels:
            result[f'{window}_day_Fib_{int(level*100)}'] = high_max - (
                level * (high_max - low_min))

    return result

fib_windows = [5, 14, 30]

# TODO: apply calculate_fibonacci_retracement(df, fib_windows) to the dataframes; no call is made in this cell yet



In [None]:
df_all_cleaned.shape

In [None]:
import pandas as pd

pd.set_option('display.max_columns', None)
df_all_cleaned.head()

In [None]:
# OBV
# On-Balance Volume per symbol: add the day's volume when the close rose,
# subtract it when the close fell, leave the total unchanged otherwise.
# The direction comes from the change in 'Close'; the previous version used
# the sign of the change in 'Volume', which is not OBV. The first row of
# each symbol has no prior close, so it contributes 0.
price_direction = np.sign(df_all_cleaned.groupby('Symbol')['Close'].diff()).fillna(0)
signed_volume = price_direction * df_all_cleaned['Volume']
df_all_cleaned['OBV'] = signed_volume.groupby(df_all_cleaned['Symbol']).cumsum()

# Per-symbol rolling mean of OBV for each short-term window (NaN until a
# full window of rows is available)
for window in one_week_window:
    df_all_cleaned[f'OBV_{window}day_avg'] = df_all_cleaned.groupby('Symbol')['OBV'].transform(
        lambda x: x.rolling(window=window).mean()
    )

df_all_cleaned.head()

# Calculate 3-day rolling average of OBV for each stock symbol
# df_all_cleaned['OBV_3day_avg'] = df_all_cleaned.groupby('Symbol')['OBV'].rolling(window=3).mean()
# df_all_cleaned['OBV_5day_avg'] = df_all_cleaned.groupby('Symbol')['OBV'].rolling(window=5).mean()
# df_all_cleaned['OBV_7day_avg'] = df_all_cleaned.groupby('Symbol')['OBV'].rolling(window=7).mean()
# df_all_cleaned['OBV_10day_avg'] = df_all_cleaned.groupby('Symbol')['OBV'].rolling(window=10).mean()
# df_all_cleaned['OBV_12day_avg'] = df_all_cleaned.groupby('Symbol')['OBV'].rolling(window=12).mean()
# df_all_cleaned['OBV_14day_avg'] = df_all_cleaned.groupby('Symbol')['OBV'].rolling(window=14).mean()




In [None]:
df_all_cleaned.shape

In [None]:
#WMA

# Weighted average of one window of values
def weighted_moving_average(series, weights):
    """Return the average of ``series`` weighted by ``weights``.

    Args:
        series: Values of one window.
        weights: Array of weights with the same length; it must provide a
            ``.sum()`` method (e.g. a NumPy array).

    Returns:
        sum(series * weights) / sum(weights).
    """
    weighted_total = np.dot(series, weights)
    return weighted_total / weights.sum()


# Apply the rolling WMA for each window, per symbol.
# Each window arrives in chronological order, so the weights rise from 0.1
# (oldest row) to 1 (most recent row), giving the latest prices the most
# influence as a weighted moving average conventionally does. The previous
# weights ran from 1 down to 0.1 and favoured the oldest price instead.
# raw=True passes plain arrays to the function and avoids building a Series
# for every window.
for window in one_week_window:
    df_all_cleaned[f'WMA_{window}'] = (
        df_all_cleaned.groupby('Symbol')['Close']
        .rolling(window=window, min_periods=1)
        .apply(lambda x: weighted_moving_average(x, np.linspace(0.1, 1, len(x))), raw=True)
        .droplevel(0)
    )


In [None]:
# Momentum Features
# Momentum_{window} = close minus the same symbol's close `window` rows
# earlier; rows without that much history are set to 0
for window in one_week_window:
    close_change = df_all_cleaned.groupby('Symbol')['Close'].diff(window)
    df_all_cleaned[f'Momentum_{window}'] = close_change.fillna(0)

# df_all_cleaned['Momentum_3'] = df_all_cleaned.groupby('Symbol')['Close'].transform(lambda x: x.diff(3))
# df_all_cleaned['Momentum_5'] = df_all_cleaned.groupby('Symbol')['Close'].transform(lambda x: x.diff(5))
# df_all_cleaned['Momentum_10'] = df_all_cleaned.groupby('Symbol')['Close'].transform(lambda x: x.diff(10))
# df_all_cleaned['Momentum_7'] = df_all_cleaned.groupby('Symbol')['Close'].transform(lambda x: x.diff(7))
# df_all_cleaned['Momentum_12'] = df_all_cleaned.groupby('Symbol')['Close'].transform(lambda x: x.diff(12))
# df_all_cleaned['Momentum_14'] = df_all_cleaned.groupby('Symbol')['Close'].transform(lambda x: x.diff(14))

# Verify the results
df_all_cleaned.head()

In [None]:
# Quantile-Based Features
# Per-symbol rolling median, 25th and 75th percentile of the close for each
# short-term window; rows without a full window are filled with 0
for window in one_week_window:
    closes = df_all_cleaned.groupby('Symbol')['Close']

    df_all_cleaned[f'Rolling_Median_{window}'] = closes.transform(
        lambda s: s.rolling(window=window).median()
    ).fillna(0)

    for label, fraction in (('25', 0.25), ('75', 0.75)):
        df_all_cleaned[f'Rolling_Quantile_{label}_{window}'] = closes.transform(
            lambda s: s.rolling(window=window).quantile(fraction)
        ).fillna(0)

# df_all_cleaned['Rolling_Median_3'] = df_all_cleaned.groupby('Symbol')['Close'].transform(lambda x: x.rolling(window=3).median())

# df_all_cleaned['Rolling_Quantile_25_3'] = df_all_cleaned.groupby('Symbol')['Close'].transform(lambda x: x.rolling(window=3).quantile(0.25))
# df_all_cleaned['Rolling_Quantile_75_3'] = df_all_cleaned.groupby('Symbol')['Close'].transform(lambda x: x.rolling(window=3).quantile(0.75))

# Verify the results
df_all_cleaned.head()

In [None]:
pd.set_option('display.max_columns', None)
df_all_cleaned.head()

In [None]:
# List of columns to create lags for (focusing on short-term indicators)
# NOTE(review): every name below must already be a column of
# df_all_cleaned. Several of them (e.g. 'SMA_3', 'RSI_3', 'Std_Dev_3') are
# added to stock_data_1_week in the cells above -- verify which frame this
# cell is meant to run on.
columns_to_lag = ['Close', 'SMA_3', 'SMA_5', 'SMA_7',
                  'EMA_3', 'EMA_5', 'EMA_7', 'Volume',
                  'Daily_High', 'Daily_Low', 'RSI', 'RSI_3', 'RSI_5', 'RSI_7', 'Signal_Line', 'MACD',
                  'VWAP', '%K', '%D_3', 'WMA_3', 'WMA_5', 'WMA_7', 'OBV', 'Momentum_3',
                  'Momentum_5', 'Momentum_7', 'Std_Dev_3', 'Std_Dev_5', 'Std_Dev_7',
                  'Rolling_Median_3', 'Rolling_Median_5', 'Rolling_Median_7',
                  'Rolling_Quantile_25_3', 'Rolling_Quantile_25_5', 'Rolling_Quantile_25_7',
                  'Rolling_Quantile_75_3', 'Rolling_Quantile_75_5', 'Rolling_Quantile_75_7',
                  'Daily_High_3day_avg', 'Daily_High_5day_avg', 'Daily_High_7day_avg',
                  'Daily_Low_3day_avg', 'Daily_Low_5day_avg', 'Daily_Low_7day_avg',
                  'Volume_3day_avg', 'Volume_5day_avg', 'Volume_7day_avg']

# Creating lag features for each column
# [1, 3, 5, 7, 10, 14, 20, 30, 50, 60, 90, 100, 180, 200] are the lags we will use
# but to save space, we will only use necessary lags per the timeline goal of the model
# this first model will be predicting price 1 week ahead (5 trading days)
lags = [1, 3, 5, 7]

# Shift within each symbol so the first rows of one ticker never receive
# the last rows of the previous ticker (an ungrouped shift leaks values
# across the symbol boundary). Assumes rows are in date order within each
# symbol -- TODO confirm.
rows_by_symbol = df_all_cleaned.groupby('Symbol')
lagged_columns = {}
for col in columns_to_lag:
    for lag in lags:
        lagged_columns[f'{col}_lag_{lag}'] = rows_by_symbol[col].shift(lag)

# Attach all lag columns in one concat instead of many single-column
# inserts, which fragment the frame
df_all_cleaned = pd.concat(
    [df_all_cleaned, pd.DataFrame(lagged_columns, index=df_all_cleaned.index)],
    axis=1,
)

# Do not drop NaN values to maintain continuity (XGBoost can handle NaNs)
# You can handle missing values in your model later, if needed
df_all_cleaned.head()



In [None]:
df_all_cleaned.head(10)

RSI
MACD and MACD Signal Line
Stochastic Oscillator
VWAP
Bollinger Bands (Upper and Lower)
Price Range (High - Low)
Momentum and Standard Deviation
Fib Levels
OBV
Quantile Features (Median, Upper/Lower Quantile)
Weighted Moving Average (WMA)

In [None]:
# Check if we accidentally have the same values in any of these columns (rounded to 4 decimal places)
# Work on the first 2500 rows of the dataframe
df_sample = df_all_cleaned.head(2500)

# Exclude 'Symbol' and 'Date' columns
columns = [col for col in df_sample.columns if col not in ['Symbol', 'Date']]

# Round once up front instead of re-rounding both columns for every pair
rounded = df_sample[columns].round(4)

# Lists to store matching column pairs
col_i_list = []
col_j_list = []

# Compare every unordered pair of columns. Two positions count as equal
# when the rounded values match or both are NaN; a plain == treats NaN as
# unequal and would hide duplicated columns that contain missing values.
for i, left_name in enumerate(columns):
    left = rounded[left_name]
    for right_name in columns[i + 1:]:  # Avoid duplicate comparisons
        right = rounded[right_name]
        if ((left == right) | (left.isna() & right.isna())).all():
            col_i_list.append(left_name)
            col_j_list.append(right_name)

# Print the two lists
print("Matching Column Pairs:")
print("Column i:", col_i_list)
print("Column j:", col_j_list)


In [None]:
# Drop helper columns that duplicate other features. errors='ignore' lets
# the cell run when some of these columns are absent, including when the
# cell is re-run after they have already been dropped.
df_all_cleaned.drop(columns=['Fib_3_Low_Min', 'Fib_5_Low_Min', 'Fib_7_Low_Min',
                             'bb_Middle_Band_3', 'bb_Middle_Band_5', 'bb_Middle_Band_7'
                             ], inplace=True, errors='ignore')


In [None]:
# Check if we accidentally have the same values in any of these columns (rounded to 4 decimal places)
# Work on the first 2500 rows of the dataframe
df_sample = df_all_cleaned.head(2500)

# Exclude 'Symbol' and 'Date' columns
columns = [col for col in df_sample.columns if col not in ['Symbol', 'Date']]

# Round once up front instead of re-rounding both columns for every pair
rounded = df_sample[columns].round(4)

# Lists to store matching column pairs
col_i_list = []
col_j_list = []

# Compare every unordered pair of columns. Two positions count as equal
# when the rounded values match or both are NaN; a plain == treats NaN as
# unequal and would hide duplicated columns that contain missing values.
for i, left_name in enumerate(columns):
    left = rounded[left_name]
    for right_name in columns[i + 1:]:  # Avoid duplicate comparisons
        right = rounded[right_name]
        if ((left == right) | (left.isna() & right.isna())).all():
            col_i_list.append(left_name)
            col_j_list.append(right_name)

# Print the two lists
print("Matching Column Pairs:")
print("Column i:", col_i_list)
print("Column j:", col_j_list)


In [None]:
# Save DataFrame as CSV file for easy access
df_all_cleaned.to_csv('/Users/evancallaghan/flatiron_ds/phase_5/capstone_project/stock_10_yr_150_ta_data.csv', index=False)

In [None]:
# Specify the local file path of the saved CSV (note: this file name differs from the one written in the previous cell)
csv_file_path = '/Users/evancallaghan/flatiron_ds/phase_5/capstone_project/stock_ta_data.csv'

# Load the CSV file into a DataFrame
df_stocks_price_ta = pd.read_csv(csv_file_path)

# Inspect the DataFrame
df_stocks_price_ta.head()

In [None]:
df_stock_data_1_week = df_all_cleaned.copy()
df_stock_data_1_week.head()

In [None]:
df_stock_data_1_week = df_stocks_price_ta.copy()
df_stocks_price_ta.head()

In [None]:
df_stock_data_1_week.shape

In [None]:
# Baseline model: an XGBoost regressor trained on every feature column to predict
# each symbol's 'Close' 5 rows (trading days) ahead, scored on a chronological hold-out.
# Author's note: this dataset reportedly contains scaled data, which performed better.

import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_1_week = df_stock_data_1_week.sort_values(by=['Symbol', 'Date'])

# Train on rows dated on or before 2023-01-24.
# .copy() makes each split an independent frame, so the column assignments and
# in-place replace below do not operate on a slice of df_stock_data_1_week
# (which triggers pandas' SettingWithCopyWarning).
df_stock_data_train_1_week_baseline = df_stock_data_1_week[df_stock_data_1_week['Date'] <= '2023-01-24'].copy()

# Test on rows dated after 2023-01-31; rows between the two cut-offs belong to neither set
df_stock_data_test_1_week_baseline = df_stock_data_1_week[df_stock_data_1_week['Date'] > '2023-01-31'].copy()

# Check if the test set is empty
if df_stock_data_test_1_week_baseline.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target = the same symbol's 'Close' 5 rows later (computed separately inside each split)
df_stock_data_train_1_week_baseline['Close_Target'] = df_stock_data_train_1_week_baseline.groupby('Symbol')['Close'].shift(-5)
df_stock_data_test_1_week_baseline['Close_Target'] = df_stock_data_test_1_week_baseline.groupby('Symbol')['Close'].shift(-5)

# Drop rows where 'Close_Target' is NaN (the last 5 rows of each symbol, caused by shifting)
df_stock_data_train_1_week_baseline = df_stock_data_train_1_week_baseline.dropna(subset=['Close_Target'])
df_stock_data_test_1_week_baseline = df_stock_data_test_1_week_baseline.dropna(subset=['Close_Target'])

# Replace infinite values with NaN so they are filled by the median step below
df_stock_data_train_1_week_baseline.replace([np.inf, -np.inf], np.nan, inplace=True)
df_stock_data_test_1_week_baseline.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill NaN values with the median of each numeric column
# NOTE(review): the test frame is filled with its own medians, not the training medians
numeric_cols_train = df_stock_data_train_1_week_baseline.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_week_baseline.select_dtypes(include=[np.number]).columns

df_stock_data_train_1_week_baseline[numeric_cols_train] = df_stock_data_train_1_week_baseline[numeric_cols_train].fillna(df_stock_data_train_1_week_baseline[numeric_cols_train].median())
df_stock_data_test_1_week_baseline[numeric_cols_test] = df_stock_data_test_1_week_baseline[numeric_cols_test].fillna(df_stock_data_test_1_week_baseline[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_week_baseline.shape}")
print(f"Testing data shape: {df_stock_data_test_1_week_baseline.shape}")

# Create X (features) and y (target) for training; identifiers and prices are not features
X_train_1_week_baseline = df_stock_data_train_1_week_baseline.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_week_baseline = df_stock_data_train_1_week_baseline['Close_Target']

# Create X and y for testing
X_test_1_week_baseline = df_stock_data_test_1_week_baseline.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_week_baseline = df_stock_data_test_1_week_baseline['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_week_baseline shape: {X_train_1_week_baseline.shape}, y_train_1_week_baseline shape: {y_train_1_week_baseline.shape}")
print(f"X_test_1_week_baseline shape: {X_test_1_week_baseline.shape}, y_test_1_week_baseline shape: {y_test_1_week_baseline.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_week_baseline.shape[0] == 0 or X_test_1_week_baseline.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize the XGBoost model with the baseline hyper-parameters
model_baseline_1_week = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
    missing=np.nan  # Treat NaN as the missing-value marker
)

# Train the model on the training data
model_baseline_1_week.fit(X_train_1_week_baseline, y_train_1_week_baseline)

# Make predictions on the held-out test rows (dated after 2023-01-31)
y_pred_1_week_baseline = model_baseline_1_week.predict(X_test_1_week_baseline)

# Calculate performance on the test data
# NOTE(review): the date in the printed label does not match the 2023-01-31 cut-off used above
mse_test_1_week_baseline = mean_squared_error(y_test_1_week_baseline, y_pred_1_week_baseline)
print(f'Mean Squared Error on unseen data (post-February 17, 2024): {mse_test_1_week_baseline}')


In [None]:
# Score the baseline model on the held-out test rows and rank its features by importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# Inputs: y_test_1_week_baseline (true targets) and y_pred_1_week_baseline (model predictions)

# Residuals are shared by the Durbin-Watson statistic and MAPE
residuals_1_week_baseline = y_test_1_week_baseline - y_pred_1_week_baseline

# Compute every metric first ...
mse_1_week_baseline = mean_squared_error(y_test_1_week_baseline, y_pred_1_week_baseline)
rmse_1_week_baseline = np.sqrt(mse_1_week_baseline)  # RMSE is the square root of MSE
mae_1_week_baseline = mean_absolute_error(y_test_1_week_baseline, y_pred_1_week_baseline)
r2_1_week_baseline = r2_score(y_test_1_week_baseline, y_pred_1_week_baseline)
medae_1_week_baseline = median_absolute_error(y_test_1_week_baseline, y_pred_1_week_baseline)
dw_stat_1_week_baseline = durbin_watson(residuals_1_week_baseline)
# MAPE: mean of |residual / true value|, expressed as a percentage
mape_1_week_baseline = np.mean(np.abs(residuals_1_week_baseline / y_test_1_week_baseline)) * 100

# ... then report them in a fixed order
print(f'Mean Squared Error on unseen data: {mse_1_week_baseline}')
print(f'Mean Absolute Error on unseen data: {mae_1_week_baseline}')
print(f'Root Mean Squared Error on unseen data: {rmse_1_week_baseline}')
print(f'R-squared on unseen data: {r2_1_week_baseline}')
print(f'Median Absolute Error on unseen data: {medae_1_week_baseline}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_week_baseline}')
print(f'MAPE on unseen data: {mape_1_week_baseline:.2f}%')

# Pair each training column with its importance score
feature_importance_1_week_baseline = dict(zip(X_train_1_week_baseline.columns, model_baseline_1_week.feature_importances_))

# Highest importance first
sorted_features_1_week_baseline = sorted(
    feature_importance_1_week_baseline.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

# Importances are printed as percentages
for col_name, score in sorted_features_1_week_baseline:
    print(f"{col_name}: {score * 100:.2f}%")



In [None]:


# The model above is our baseline.
# Next step: retrain the same model on only the features that contributed
# more than 1% to the baseline, so first collect those feature names.

# Map every training column to its importance score
feature_importance_1_week_baseline = {
    name: score
    for name, score in zip(X_train_1_week_baseline.columns, model_baseline_1_week.feature_importances_)
}

# Keep only the entries whose importance exceeds 0.01 (i.e. 1%)
important_features_1_week_baseline = {}
for name, score in feature_importance_1_week_baseline.items():
    if score > 0.01:
        important_features_1_week_baseline[name] = score

# Order the surviving (name, score) pairs from most to least important
sorted_important_features_1_week_baseline = sorted(
    important_features_1_week_baseline.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Just the feature names, in the same order
important_feature_names_1_week_baseline = [pair[0] for pair in sorted_important_features_1_week_baseline]

# Report each important feature with its importance as a percentage
print("Features with more than 1% contribution:")
for name, score in sorted_important_features_1_week_baseline:
    print(f"{name}: {score * 100:.2f}%")

# The name list can be used to build a reduced-feature dataframe
print("List of important features:")
print(important_feature_names_1_week_baseline)



In [None]:
# Columns kept for the reduced-feature experiment: the identifier/price columns
# plus a hand-written list of indicator columns.
# NOTE(review): presumably copied from the >1% importance list printed earlier — confirm it is current.
key_columns = ['Symbol', 'Date', 'Close']
selected_indicator_columns = [
    'SMA_5', '10_day_Fib_100', 'Fib_5_High_Max',
    'EMA_3', 'Low', 'EMA_26_MACD', '5_day-Fib_23', 'Fib_30_High_Max', 'EMA_100',
    'High', '5_day-Fib_100', 'EMA_20', '30_day_Fib_100',
]
important_features = key_columns + selected_indicator_columns

# Restrict the modelling frame to those columns and preview the result
df_important_feat_1_week = df_stock_data_1_week[important_features]
df_important_feat_1_week.head()

In [None]:
# Reduced-feature experiment: same model and hyper-parameters as the baseline, but
# trained only on the columns of df_important_feat_1_week (features that contributed
# more than 1% in the baseline model).

from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort values by 'Symbol' and 'Date' to maintain time order
df_important_feat_1_week = df_important_feat_1_week.sort_values(by=['Symbol', 'Date'])

# Train on rows dated on or before 2023-01-24.
# The split is taken from the reduced-feature frame and stored under the *_if names
# that the rest of this cell reads. .copy() makes each split an independent frame so
# the assignments below do not act on a slice (avoids pandas' SettingWithCopyWarning).
df_stock_data_train_1_week_if = df_important_feat_1_week[df_important_feat_1_week['Date'] <= '2023-01-24'].copy()

# Test on rows dated after 2023-01-31; rows between the two cut-offs belong to neither set
df_stock_data_test_1_week_if = df_important_feat_1_week[df_important_feat_1_week['Date'] > '2023-01-31'].copy()

# Check if the test set is empty
if df_stock_data_test_1_week_if.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target = the same symbol's 'Close' 5 rows later (computed separately inside each split)
df_stock_data_train_1_week_if['Close_Target'] = df_stock_data_train_1_week_if.groupby('Symbol')['Close'].shift(-5)
df_stock_data_test_1_week_if['Close_Target'] = df_stock_data_test_1_week_if.groupby('Symbol')['Close'].shift(-5)

# Drop rows where 'Close_Target' is NaN (the last 5 rows of each symbol, caused by shifting)
df_stock_data_train_1_week_if = df_stock_data_train_1_week_if.dropna(subset=['Close_Target'])
df_stock_data_test_1_week_if = df_stock_data_test_1_week_if.dropna(subset=['Close_Target'])

# Replace infinite values with NaN so they are filled by the median step below
df_stock_data_train_1_week_if.replace([np.inf, -np.inf], np.nan, inplace=True)
df_stock_data_test_1_week_if.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill NaN values with the median of each numeric column
# NOTE(review): the test frame is filled with its own medians, not the training medians
numeric_cols_train = df_stock_data_train_1_week_if.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_week_if.select_dtypes(include=[np.number]).columns

df_stock_data_train_1_week_if[numeric_cols_train] = df_stock_data_train_1_week_if[numeric_cols_train].fillna(df_stock_data_train_1_week_if[numeric_cols_train].median())
df_stock_data_test_1_week_if[numeric_cols_test] = df_stock_data_test_1_week_if[numeric_cols_test].fillna(df_stock_data_test_1_week_if[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_week_if.shape}")
print(f"Testing data shape: {df_stock_data_test_1_week_if.shape}")

# Create X (features) and y (target) for training; identifiers and prices are not features
X_train_1_week_if = df_stock_data_train_1_week_if.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_week_if = df_stock_data_train_1_week_if['Close_Target']

# Create X and y for testing
X_test_1_week_if = df_stock_data_test_1_week_if.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_week_if = df_stock_data_test_1_week_if['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_week_if shape: {X_train_1_week_if.shape}, y_train_1_week_if shape: {y_train_1_week_if.shape}")
print(f"X_test_1_week_if shape: {X_test_1_week_if.shape}, y_test_1_week_if shape: {y_test_1_week_if.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_week_if.shape[0] == 0 or X_test_1_week_if.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize the XGBoost model with the same hyper-parameters as the baseline
model_update1_1_week = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
    missing=np.nan  # Treat NaN as the missing-value marker
)

# Train the model on the training data
model_update1_1_week.fit(X_train_1_week_if, y_train_1_week_if)

# Make predictions on the held-out test rows (dated after 2023-01-31)
y_pred_1_week_if = model_update1_1_week.predict(X_test_1_week_if)

# Calculate performance on the test data
# NOTE(review): the date in the printed label does not match the 2023-01-31 cut-off used above
mse_test_1_week_if = mean_squared_error(y_test_1_week_if, y_pred_1_week_if)
print(f'Mean Squared Error on unseen data (post-February 17, 2024): {mse_test_1_week_if}')


In [None]:
# Score the reduced-feature model on the held-out test rows and rank its features by importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# Inputs: y_test_1_week_if (true targets) and y_pred_1_week_if (model predictions)

# Residuals are shared by the Durbin-Watson statistic and MAPE
residuals_1_week_if = y_test_1_week_if - y_pred_1_week_if

# Compute every metric first ...
mse_1_week_if = mean_squared_error(y_test_1_week_if, y_pred_1_week_if)
rmse_1_week_if = np.sqrt(mse_1_week_if)  # RMSE is the square root of MSE
mae_1_week_if = mean_absolute_error(y_test_1_week_if, y_pred_1_week_if)
r2_1_week_if = r2_score(y_test_1_week_if, y_pred_1_week_if)
medae_1_week_if = median_absolute_error(y_test_1_week_if, y_pred_1_week_if)
dw_stat_1_week_if = durbin_watson(residuals_1_week_if)
# MAPE: mean of |residual / true value|, expressed as a percentage
mape_1_week_if = np.mean(np.abs(residuals_1_week_if / y_test_1_week_if)) * 100

# ... then report them in a fixed order
print(f'Mean Squared Error on unseen data: {mse_1_week_if}')
print(f'Mean Absolute Error on unseen data: {mae_1_week_if}')
print(f'Root Mean Squared Error on unseen data: {rmse_1_week_if}')
print(f'R-squared on unseen data: {r2_1_week_if}')
print(f'Median Absolute Error on unseen data: {medae_1_week_if}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_week_if}')
print(f'MAPE on unseen data: {mape_1_week_if:.2f}%')

# Pair each training column with its importance score
feature_importance_1_week_if = dict(zip(X_train_1_week_if.columns, model_update1_1_week.feature_importances_))

# Highest importance first
sorted_features_1_week_if = sorted(
    feature_importance_1_week_if.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

# Importances are printed as percentages
for col_name, score in sorted_features_1_week_if:
    print(f"{col_name}: {score * 100:.2f}%")



In [None]:
# Restricting the model to the important features degraded all metrics,
# so this experiment goes back to using all features and starts tuning
# hyper-parameters instead.
#
# learning_rate = 0.01

from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_1_week = df_stock_data_1_week.sort_values(by=['Symbol', 'Date'])

# Train on rows dated on or before 2023-01-24.
# The split is stored under the *_lr_01 names that the rest of this cell reads.
# .copy() makes each split an independent frame so the assignments below do not
# act on a slice of df_stock_data_1_week (avoids pandas' SettingWithCopyWarning).
df_stock_data_train_1_week_lr_01 = df_stock_data_1_week[df_stock_data_1_week['Date'] <= '2023-01-24'].copy()

# Test on rows dated after 2023-01-31; rows between the two cut-offs belong to neither set
df_stock_data_test_1_week_lr_01 = df_stock_data_1_week[df_stock_data_1_week['Date'] > '2023-01-31'].copy()

# Check if the test set is empty
if df_stock_data_test_1_week_lr_01.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target = the same symbol's 'Close' 5 rows later (computed separately inside each split)
df_stock_data_train_1_week_lr_01['Close_Target'] = df_stock_data_train_1_week_lr_01.groupby('Symbol')['Close'].shift(-5)
df_stock_data_test_1_week_lr_01['Close_Target'] = df_stock_data_test_1_week_lr_01.groupby('Symbol')['Close'].shift(-5)

# Drop rows where 'Close_Target' is NaN (the last 5 rows of each symbol, caused by shifting)
df_stock_data_train_1_week_lr_01 = df_stock_data_train_1_week_lr_01.dropna(subset=['Close_Target'])
df_stock_data_test_1_week_lr_01 = df_stock_data_test_1_week_lr_01.dropna(subset=['Close_Target'])

# Replace infinite values with NaN so they are filled by the median step below
df_stock_data_train_1_week_lr_01.replace([np.inf, -np.inf], np.nan, inplace=True)
df_stock_data_test_1_week_lr_01.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill NaN values with the median of each numeric column
# NOTE(review): the test frame is filled with its own medians, not the training medians
numeric_cols_train = df_stock_data_train_1_week_lr_01.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_week_lr_01.select_dtypes(include=[np.number]).columns

df_stock_data_train_1_week_lr_01[numeric_cols_train] = df_stock_data_train_1_week_lr_01[numeric_cols_train].fillna(df_stock_data_train_1_week_lr_01[numeric_cols_train].median())
df_stock_data_test_1_week_lr_01[numeric_cols_test] = df_stock_data_test_1_week_lr_01[numeric_cols_test].fillna(df_stock_data_test_1_week_lr_01[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_week_lr_01.shape}")
print(f"Testing data shape: {df_stock_data_test_1_week_lr_01.shape}")

# Create X (features) and y (target) for training; identifiers and prices are not features
X_train_1_week_lr_01 = df_stock_data_train_1_week_lr_01.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_week_lr_01 = df_stock_data_train_1_week_lr_01['Close_Target']

# Create X and y for testing
X_test_1_week_lr_01 = df_stock_data_test_1_week_lr_01.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_week_lr_01 = df_stock_data_test_1_week_lr_01['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_week_lr_01 shape: {X_train_1_week_lr_01.shape}, y_train_1_week_lr_01 shape: {y_train_1_week_lr_01.shape}")
print(f"X_test_1_week_lr_01 shape: {X_test_1_week_lr_01.shape}, y_test_1_week_lr_01 shape: {y_test_1_week_lr_01.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_week_lr_01.shape[0] == 0 or X_test_1_week_lr_01.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize the XGBoost model; only learning_rate differs from the baseline settings
model_baseline_1_week_LR_01 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
    missing=np.nan  # Treat NaN as the missing-value marker
)

# Train the model on the training data
model_baseline_1_week_LR_01.fit(X_train_1_week_lr_01, y_train_1_week_lr_01)

# Make predictions on the held-out test rows (dated after 2023-01-31)
y_pred_1_week_lr_01 = model_baseline_1_week_LR_01.predict(X_test_1_week_lr_01)

# Calculate performance on the test data
# NOTE(review): the date in the printed label does not match the 2023-01-31 cut-off used above
mse_test_1_week_lr_01 = mean_squared_error(y_test_1_week_lr_01, y_pred_1_week_lr_01)
print(f'Mean Squared Error on unseen data (post-February 17, 2024): {mse_test_1_week_lr_01}')


In [None]:
# Score the learning_rate=0.01 model on the held-out test rows and rank its features by importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np


# Inputs: y_test_1_week_lr_01 (true targets) and y_pred_1_week_lr_01 (model predictions)

# Residuals are shared by the Durbin-Watson statistic and MAPE
residuals_1_week_lr_01 = y_test_1_week_lr_01 - y_pred_1_week_lr_01

# Compute every metric first ...
mse_1_week_lr_01 = mean_squared_error(y_test_1_week_lr_01, y_pred_1_week_lr_01)
rmse_1_week_lr_01 = np.sqrt(mse_1_week_lr_01)  # RMSE is the square root of MSE
mae_1_week_lr_01 = mean_absolute_error(y_test_1_week_lr_01, y_pred_1_week_lr_01)
r2_1_week_lr_01 = r2_score(y_test_1_week_lr_01, y_pred_1_week_lr_01)
medae_week_lr_01 = median_absolute_error(y_test_1_week_lr_01, y_pred_1_week_lr_01)
dw_stat_1_week_lr_01 = durbin_watson(residuals_1_week_lr_01)
# MAPE: mean of |residual / true value|, expressed as a percentage
mape_1_week_lr_01 = np.mean(np.abs(residuals_1_week_lr_01 / y_test_1_week_lr_01)) * 100

# ... then report them in a fixed order
print(f'Mean Squared Error on unseen data: {mse_1_week_lr_01}')
print(f'Mean Absolute Error on unseen data: {mae_1_week_lr_01}')
print(f'Root Mean Squared Error on unseen data: {rmse_1_week_lr_01}')
print(f'R-squared on unseen data: {r2_1_week_lr_01}')
print(f'Median Absolute Error on unseen data: {medae_week_lr_01}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_week_lr_01}')
print(f'MAPE on unseen data: {mape_1_week_lr_01:.2f}%')

# Pair each training column with its importance score
feature_importance_1_week_lr_01 = dict(zip(X_train_1_week_lr_01.columns, model_baseline_1_week_LR_01.feature_importances_))

# Highest importance first
sorted_features_1_week_lr_01 = sorted(
    feature_importance_1_week_lr_01.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

# Importances are printed as percentages
for col_name, score in sorted_features_1_week_lr_01:
    print(f"{col_name}: {score * 100:.2f}%")



In [None]:
# Restricting the model to the important features degraded all metrics,
# so this experiment uses all features and tunes hyper-parameters instead.
#
# learning_rate = 0.1

from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_1_week = df_stock_data_1_week.sort_values(by=['Symbol', 'Date'])

# Train on rows dated on or before 2023-01-24.
# .copy() makes each split an independent frame so the assignments below do not
# act on a slice of df_stock_data_1_week (avoids pandas' SettingWithCopyWarning).
df_stock_data_train_1_week_lr_1 = df_stock_data_1_week[df_stock_data_1_week['Date'] <= '2023-01-24'].copy()

# Test on rows dated after 2023-01-31; rows between the two cut-offs belong to neither set
df_stock_data_test_1_week_lr_1 = df_stock_data_1_week[df_stock_data_1_week['Date'] > '2023-01-31'].copy()

# Check if the test set is empty
if df_stock_data_test_1_week_lr_1.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target = the same symbol's 'Close' 5 rows later (computed separately inside each split)
df_stock_data_train_1_week_lr_1['Close_Target'] = df_stock_data_train_1_week_lr_1.groupby('Symbol')['Close'].shift(-5)
df_stock_data_test_1_week_lr_1['Close_Target'] = df_stock_data_test_1_week_lr_1.groupby('Symbol')['Close'].shift(-5)

# Drop rows where 'Close_Target' is NaN (the last 5 rows of each symbol, caused by shifting)
df_stock_data_train_1_week_lr_1 = df_stock_data_train_1_week_lr_1.dropna(subset=['Close_Target'])
df_stock_data_test_1_week_lr_1 = df_stock_data_test_1_week_lr_1.dropna(subset=['Close_Target'])

# Replace infinite values with NaN so they are filled by the median step below
df_stock_data_train_1_week_lr_1.replace([np.inf, -np.inf], np.nan, inplace=True)
df_stock_data_test_1_week_lr_1.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill NaN values with the median of each numeric column
# NOTE(review): the test frame is filled with its own medians, not the training medians
numeric_cols_train = df_stock_data_train_1_week_lr_1.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_week_lr_1.select_dtypes(include=[np.number]).columns

df_stock_data_train_1_week_lr_1[numeric_cols_train] = df_stock_data_train_1_week_lr_1[numeric_cols_train].fillna(df_stock_data_train_1_week_lr_1[numeric_cols_train].median())
df_stock_data_test_1_week_lr_1[numeric_cols_test] = df_stock_data_test_1_week_lr_1[numeric_cols_test].fillna(df_stock_data_test_1_week_lr_1[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_week_lr_1.shape}")
print(f"Testing data shape: {df_stock_data_test_1_week_lr_1.shape}")

# Create X (features) and y (target) for training; identifiers and prices are not features
X_train_1_week_lr_1 = df_stock_data_train_1_week_lr_1.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_week_lr_1 = df_stock_data_train_1_week_lr_1['Close_Target']

# Create X and y for testing
X_test_1_week_lr_1 = df_stock_data_test_1_week_lr_1.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_week_lr_1 = df_stock_data_test_1_week_lr_1['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_week_lr_1 shape: {X_train_1_week_lr_1.shape}, y_train_1_week_lr_1 shape: {y_train_1_week_lr_1.shape}")
print(f"X_test_1_week_lr_1 shape: {X_test_1_week_lr_1.shape}, y_test_1_week_lr_1 shape: {y_test_1_week_lr_1.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_week_lr_1.shape[0] == 0 or X_test_1_week_lr_1.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize the XGBoost model; only learning_rate differs from the baseline settings
model_baseline_1_week_LR_1 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
    missing=np.nan  # Treat NaN as the missing-value marker
)

# Train the model on the training data
model_baseline_1_week_LR_1.fit(X_train_1_week_lr_1, y_train_1_week_lr_1)

# Make predictions on the held-out test rows (dated after 2023-01-31)
y_pred_1_week_lr_1 = model_baseline_1_week_LR_1.predict(X_test_1_week_lr_1)

# Calculate performance on the test data
# NOTE(review): the date in the printed label does not match the 2023-01-31 cut-off used above
mse_test = mean_squared_error(y_test_1_week_lr_1, y_pred_1_week_lr_1)
print(f'Mean Squared Error on unseen data (post-February 17, 2024): {mse_test}')


In [None]:
# Score the learning_rate=0.1 model on the held-out test rows and rank its features by importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# Inputs: y_test_1_week_lr_1 (true targets) and y_pred_1_week_lr_1 (model predictions)

# Residuals are shared by the Durbin-Watson statistic and MAPE
residuals_1_week_lr_1 = y_test_1_week_lr_1 - y_pred_1_week_lr_1

# Compute every metric first ...
mse_1_week_lr_1 = mean_squared_error(y_test_1_week_lr_1, y_pred_1_week_lr_1)
rmse_1_week_lr_1 = np.sqrt(mse_1_week_lr_1)  # RMSE is the square root of MSE
mae_1_week_lr_1 = mean_absolute_error(y_test_1_week_lr_1, y_pred_1_week_lr_1)
r2_1_week_lr_1 = r2_score(y_test_1_week_lr_1, y_pred_1_week_lr_1)
medae_1_week_lr_1 = median_absolute_error(y_test_1_week_lr_1, y_pred_1_week_lr_1)
dw_stat_1_week_lr_1 = durbin_watson(residuals_1_week_lr_1)
# MAPE: mean of |residual / true value|, expressed as a percentage
mape_1_week_lr_1 = np.mean(np.abs(residuals_1_week_lr_1 / y_test_1_week_lr_1)) * 100

# ... then report them in a fixed order
print(f'Mean Squared Error on unseen data: {mse_1_week_lr_1}')
print(f'Mean Absolute Error on unseen data: {mae_1_week_lr_1}')
print(f'Root Mean Squared Error on unseen data: {rmse_1_week_lr_1}')
print(f'R-squared on unseen data: {r2_1_week_lr_1}')
print(f'Median Absolute Error on unseen data: {medae_1_week_lr_1}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_week_lr_1}')
print(f'MAPE on unseen data: {mape_1_week_lr_1:.2f}%')

# Pair each training column with its importance score
feature_importance_1_week_lr_1 = dict(zip(X_train_1_week_lr_1.columns, model_baseline_1_week_LR_1.feature_importances_))

# Highest importance first
sorted_features_1_week_lr_1 = sorted(
    feature_importance_1_week_lr_1.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

# Importances are printed as percentages
for col_name, score in sorted_features_1_week_lr_1:
    print(f"{col_name}: {score * 100:.2f}%")



In [None]:
# learning_rate outcome: learning_rate=0.01 showed the best improvement
# and had better metrics than the baseline, so we keep it and now tweak max_depth

# max_depth = 3

from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_1_week = df_stock_data_1_week.sort_values(by=['Symbol', 'Date'])

# Train on rows dated on or before 2023-01-24.
# .copy() makes each split an independent frame so the assignments below do not
# act on a slice of df_stock_data_1_week (avoids pandas' SettingWithCopyWarning).
df_stock_data_train_1_week_md_3 = df_stock_data_1_week[df_stock_data_1_week['Date'] <= '2023-01-24'].copy()

# Test on rows dated after 2023-01-31, stored under the *_md_3 name that the rest of
# this cell reads; rows between the two cut-offs belong to neither set
df_stock_data_test_1_week_md_3 = df_stock_data_1_week[df_stock_data_1_week['Date'] > '2023-01-31'].copy()

# Check if the test set is empty
if df_stock_data_test_1_week_md_3.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target = the same symbol's 'Close' 5 rows later (computed separately inside each split)
df_stock_data_train_1_week_md_3['Close_Target'] = df_stock_data_train_1_week_md_3.groupby('Symbol')['Close'].shift(-5)
df_stock_data_test_1_week_md_3['Close_Target'] = df_stock_data_test_1_week_md_3.groupby('Symbol')['Close'].shift(-5)

# Drop rows where 'Close_Target' is NaN (the last 5 rows of each symbol, caused by shifting)
df_stock_data_train_1_week_md_3 = df_stock_data_train_1_week_md_3.dropna(subset=['Close_Target'])
df_stock_data_test_1_week_md_3 = df_stock_data_test_1_week_md_3.dropna(subset=['Close_Target'])

# Replace infinite values with NaN so they are filled by the median step below
df_stock_data_train_1_week_md_3.replace([np.inf, -np.inf], np.nan, inplace=True)
df_stock_data_test_1_week_md_3.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill NaN values with the median of each numeric column
# NOTE(review): the test frame is filled with its own medians, not the training medians
numeric_cols_train = df_stock_data_train_1_week_md_3.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_week_md_3.select_dtypes(include=[np.number]).columns

df_stock_data_train_1_week_md_3[numeric_cols_train] = df_stock_data_train_1_week_md_3[numeric_cols_train].fillna(df_stock_data_train_1_week_md_3[numeric_cols_train].median())
df_stock_data_test_1_week_md_3[numeric_cols_test] = df_stock_data_test_1_week_md_3[numeric_cols_test].fillna(df_stock_data_test_1_week_md_3[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_week_md_3.shape}")
print(f"Testing data shape: {df_stock_data_test_1_week_md_3.shape}")

# Create X (features) and y (target) for training; identifiers and prices are not features
X_train_1_week_md_3 = df_stock_data_train_1_week_md_3.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_week_md_3 = df_stock_data_train_1_week_md_3['Close_Target']

# Create X and y for testing
X_test_1_week_md_3 = df_stock_data_test_1_week_md_3.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_week_md_3 = df_stock_data_test_1_week_md_3['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_week_md_3 shape: {X_train_1_week_md_3.shape}, y_train_1_week_md_3 shape: {y_train_1_week_md_3.shape}")
print(f"X_test_1_week_md_3 shape: {X_test_1_week_md_3.shape}, y_test_1_week_md_3 shape: {y_test_1_week_md_3.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_week_md_3.shape[0] == 0 or X_test_1_week_md_3.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize the XGBoost model: learning_rate=0.01 kept from the previous step, max_depth lowered to 3
model_1_week_MD_3 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=3,
    alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
    missing=np.nan  # Treat NaN as the missing-value marker
)

# Train the model on the training data
model_1_week_MD_3.fit(X_train_1_week_md_3, y_train_1_week_md_3)

# Make predictions on the held-out test rows (dated after 2023-01-31)
y_pred_1_week_md_3 = model_1_week_MD_3.predict(X_test_1_week_md_3)

# Calculate performance on the test data
# NOTE(review): the date in the printed label does not match the 2023-01-31 cut-off used above
mse_test_1_week_md_3 = mean_squared_error(y_test_1_week_md_3, y_pred_1_week_md_3)
print(f'Mean Squared Error on unseen data (post-February 17, 2024): {mse_test_1_week_md_3}')


In [None]:
# Score the max_depth=3 model on the held-out test rows and rank its features by importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# Inputs: y_test_1_week_md_3 (true targets) and y_pred_1_week_md_3 (model predictions)

# Residuals are shared by the Durbin-Watson statistic and MAPE
residuals_1_week_md_3 = y_test_1_week_md_3 - y_pred_1_week_md_3

# Compute every metric first ...
mse_1_week_md_3 = mean_squared_error(y_test_1_week_md_3, y_pred_1_week_md_3)
rmse_1_week_md_3 = np.sqrt(mse_1_week_md_3)  # RMSE is the square root of MSE
mae_1_week_md_3 = mean_absolute_error(y_test_1_week_md_3, y_pred_1_week_md_3)
r2_1_week_md_3 = r2_score(y_test_1_week_md_3, y_pred_1_week_md_3)
medae_1_week_md_3 = median_absolute_error(y_test_1_week_md_3, y_pred_1_week_md_3)
dw_stat_1_week_md_3 = durbin_watson(residuals_1_week_md_3)
# MAPE: mean of |residual / true value|, expressed as a percentage
mape_1_week_md_3 = np.mean(np.abs(residuals_1_week_md_3 / y_test_1_week_md_3)) * 100

# ... then report them in a fixed order
print(f'Mean Squared Error on unseen data: {mse_1_week_md_3}')
print(f'Mean Absolute Error on unseen data: {mae_1_week_md_3}')
print(f'Root Mean Squared Error on unseen data: {rmse_1_week_md_3}')
print(f'R-squared on unseen data: {r2_1_week_md_3}')
print(f'Median Absolute Error on unseen data: {medae_1_week_md_3}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_week_md_3}')
print(f'MAPE on unseen data: {mape_1_week_md_3:.2f}%')

# Pair each training column with its importance score
feature_importance_1_week_md_3 = dict(zip(X_train_1_week_md_3.columns, model_1_week_MD_3.feature_importances_))

# Highest importance first
sorted_features_1_week_md_3 = sorted(
    feature_importance_1_week_md_3.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

# Importances are printed as percentages
for col_name, score in sorted_features_1_week_md_3:
    print(f"{col_name}: {score * 100:.2f}%")



In [None]:
# learning_rate outcome: learning_rate=0.01 showed the best improvement
# and had better metrics than the baseline, so we'll keep it and now tweak max_depth
#
# This cell trains the 1-week (5 trading days ahead) model with max_depth = 7.

from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort by 'Symbol' and 'Date' so the per-symbol shift below moves forward in time
df_stock_data_1_week = df_stock_data_1_week.sort_values(by=['Symbol', 'Date'])

# Train on rows dated on or before 2023-01-24 and test on rows dated after
# 2023-01-31; rows between the two cutoffs belong to neither split.
# .copy() turns each filtered slice into an independent frame, so the column
# assignments below no longer trigger SettingWithCopyWarning.
df_stock_data_train_1_week_md_7 = df_stock_data_1_week[df_stock_data_1_week['Date'] <= '2023-01-24'].copy()
df_stock_data_test_1_week_md_7 = df_stock_data_1_week[df_stock_data_1_week['Date'] > '2023-01-31'].copy()

# Check if the test set is empty
if df_stock_data_test_1_week_md_7.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target: each symbol's 'Close' 5 rows (trading days) ahead, computed inside each split
df_stock_data_train_1_week_md_7['Close_Target'] = df_stock_data_train_1_week_md_7.groupby('Symbol')['Close'].shift(-5)
df_stock_data_test_1_week_md_7['Close_Target'] = df_stock_data_test_1_week_md_7.groupby('Symbol')['Close'].shift(-5)

# Drop rows where 'Close_Target' is NaN (the last 5 rows of every symbol in each split)
df_stock_data_train_1_week_md_7 = df_stock_data_train_1_week_md_7.dropna(subset=['Close_Target'])
df_stock_data_test_1_week_md_7 = df_stock_data_test_1_week_md_7.dropna(subset=['Close_Target'])

# Replace infinite values with NaN so they are imputed together with the other gaps
df_stock_data_train_1_week_md_7 = df_stock_data_train_1_week_md_7.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_1_week_md_7 = df_stock_data_test_1_week_md_7.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with the median of each numeric column.
# NOTE(review): the test split is imputed with its own medians; left unchanged
# so the results stay comparable with the other tuning cells in this notebook.
numeric_cols_train = df_stock_data_train_1_week_md_7.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_week_md_7.select_dtypes(include=[np.number]).columns

df_stock_data_train_1_week_md_7[numeric_cols_train] = df_stock_data_train_1_week_md_7[numeric_cols_train].fillna(df_stock_data_train_1_week_md_7[numeric_cols_train].median())
df_stock_data_test_1_week_md_7[numeric_cols_test] = df_stock_data_test_1_week_md_7[numeric_cols_test].fillna(df_stock_data_test_1_week_md_7[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_week_md_7.shape}")
print(f"Testing data shape: {df_stock_data_test_1_week_md_7.shape}")

# Create X (features) and y (target) for training; the current close, the
# target and the identifier columns are excluded from the features
X_train_1_week_md_7 = df_stock_data_train_1_week_md_7.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_week_md_7 = df_stock_data_train_1_week_md_7['Close_Target']

# Create X and y for testing
X_test_1_week_md_7 = df_stock_data_test_1_week_md_7.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_week_md_7 = df_stock_data_test_1_week_md_7['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_week_md_7 shape: {X_train_1_week_md_7.shape}, y_train_1_week_md_7 shape: {y_train_1_week_md_7.shape}")
print(f"X_test_1_week_md_7 shape: {X_test_1_week_md_7.shape}, y_test_1_week_md_7 shape: {y_test_1_week_md_7.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_week_md_7.shape[0] == 0 or X_test_1_week_md_7.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model (only max_depth differs from the max_depth=3 cell above)
model_1_week_MD_7 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=7,
    alpha=0,
    reg_lambda=1,  # Fixed parameter name
    objective='reg:squarederror',
    missing=np.nan  # Ensure missing values are handled correctly
)

# Train the model on the training data
model_1_week_MD_7.fit(X_train_1_week_md_7, y_train_1_week_md_7)

# Make predictions on the unseen test data (rows dated after January 31, 2023)
y_pred_1_week_md_7 = model_1_week_MD_7.predict(X_test_1_week_md_7)

# Calculate performance on the test data; the message now names the actual test cutoff
mse_test_1_week_md_7 = mean_squared_error(y_test_1_week_md_7, y_pred_1_week_md_7)
print(f'Mean Squared Error on unseen data (post-January 31, 2023): {mse_test_1_week_md_7}')


In [None]:
# Evaluation of the max_depth=7 one-week model on the held-out test split,
# followed by its feature-importance ranking.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# Inputs: `y_test_1_week_md_7` (true targets) and `y_pred_1_week_md_7`
# (model predictions) produced by the training cell above.

# Residuals (actual minus predicted) feed both Durbin-Watson and MAPE
residuals_1_week_md_7 = y_test_1_week_md_7 - y_pred_1_week_md_7

# --- compute every metric first ---
mse_1_week_md_7 = mean_squared_error(y_test_1_week_md_7, y_pred_1_week_md_7)
mae_1_week_md_7 = mean_absolute_error(y_test_1_week_md_7, y_pred_1_week_md_7)
rmse_1_week_md_7 = np.sqrt(mse_1_week_md_7)  # square root of the MSE
r2_1_week_md_7 = r2_score(y_test_1_week_md_7, y_pred_1_week_md_7)
medae_1_week_md_7 = median_absolute_error(y_test_1_week_md_7, y_pred_1_week_md_7)
dw_stat_1_week_md_7 = durbin_watson(residuals_1_week_md_7)
# MAPE: mean of |residual / actual|, expressed as a percentage
mape_1_week_md_7 = np.mean(np.abs(residuals_1_week_md_7 / y_test_1_week_md_7)) * 100

# --- then report them ---
print(f'Mean Squared Error on unseen data: {mse_1_week_md_7}')
print(f'Mean Absolute Error on unseen data: {mae_1_week_md_7}')
print(f'Root Mean Squared Error on unseen data: {rmse_1_week_md_7}')
print(f'R-squared on unseen data: {r2_1_week_md_7}')
print(f'Median Absolute Error on unseen data: {medae_1_week_md_7}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_week_md_7}')
print(f'MAPE on unseen data: {mape_1_week_md_7:.2f}%')

# Map each training column to the importance score the fitted model assigned it
feature_importance_1_week_md_7 = {
    column: score
    for column, score in zip(X_train_1_week_md_7.columns, model_1_week_MD_7.feature_importances_)
}

# Rank features from most to least important
sorted_features_1_week_md_7 = sorted(
    feature_importance_1_week_md_7.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the ranking as percentages
for feature, importance in sorted_features_1_week_md_7:
    print(f"{feature}: {importance * 100:.2f}%")



Best 1-week model: learning_rate = 0.01 and max_depth = 7.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Horizontal bar chart of the five most important features of the best 1-week model.
# The note above this cell names max_depth = 7 as the best model, so the
# ranking plotted here is `sorted_features_1_week_md_7` (the original cell
# plotted the max_depth = 3 ranking under the "Best ... Model" title).
top_features_best_1_week = sorted_features_1_week_md_7[:5]

# Convert to NumPy arrays (ensuring correct types)
features = np.array([feature for feature, importance in top_features_best_1_week])  # Extract feature names
importances = np.array([importance for feature, importance in top_features_best_1_week])  # Extract importances
importances_pct = importances * 100  # plotted and labelled as percentages

# Create a bar plot
plt.figure(figsize=(13, 8))
ax = sns.barplot(x=importances_pct, y=features, palette="viridis")

# Add text labels to the bars (feature importance values)
for i, v in enumerate(importances_pct):
    ax.text(v + 0.01, i, f"{v:.2f}%", va="center", fontsize=16)  # Adjust position & format

# Format x-axis labels to include % sign
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{x:.0f}%"))

# Extend x-axis limits so the text labels fit to the right of the longest bar
plt.xlim(0, importances_pct.max() + 6)

# Add labels and title
plt.xlabel("Feature Importance (%)", fontsize=16, fontweight='bold')  # Bigger x-axis title
plt.ylabel("Important TA Indicators", fontsize=16, fontweight='bold')  # Bigger y-axis title
plt.title("Best 1 Week Prediction Model: Top 5 Most Important Features", fontsize=18, fontweight='bold')  # Bigger title

# Increase font size of the tick labels; the y labels are already the feature
# names set by barplot, so only their size needs changing
ax.tick_params(axis='y', labelsize=14)
plt.xticks(fontsize=14)  # Increase font size for x-axis labels
# Show the plot
plt.show()


In [None]:
# Start of the 1-month workflow: work on an independent deep copy of
# 'df_stocks_price_ta' so the longer lag features added next do not touch the
# original frame.
df_stock_data_1_month = df_stocks_price_ta.copy(deep=True)


In [None]:
# List of columns to create lags for
columns_to_lag = ['Close', 'SMA_5', 'EMA_5', 'Volume', 'EMA_12_MACD', 'SMA_20', 'EMA_20']

# Creating lag features for each column
# [1, 3, 5, 7, 10, 12, 15, 20, 30, 60, 90, 180, 360] are the candidate lags,
# but to save space we only use the lags needed for the horizon of the model.
# This frame feeds the 1-month model (20 trading days ahead), so lags up to 20 are used.
lags = [1, 3, 5, 7, 10, 12, 15, 20]

# Rows of all symbols are stacked in one frame. Sort chronologically within
# each symbol and shift per symbol, otherwise the first `lag` rows of a symbol
# would receive values from the symbol stored just above it.
df_stock_data_1_month = df_stock_data_1_month.sort_values(by=['Symbol', 'Date'])
rows_by_symbol = df_stock_data_1_month.groupby('Symbol')

for col in columns_to_lag:
    for lag in lags:
        # Value of `col` `lag` rows earlier for the same symbol (NaN where no such row exists)
        df_stock_data_1_month[f'{col}_lag_{lag}'] = rows_by_symbol[col].shift(lag)

# Do not drop NaN values to maintain continuity (XGBoost can handle NaNs)
# You can handle missing values in your model later, if needed
df_stock_data_1_month.head()

In [None]:
# now we're going to move onto our next model: 1 month prediction
# we'll start at our baseline model and then do the same as we just did
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort by 'Symbol' and 'Date' so the per-symbol shift below moves forward in time
df_stock_data_1_month = df_stock_data_1_month.sort_values(by=['Symbol', 'Date'])

# Train on rows dated on or before January 10, 2024 and test on rows dated
# after February 10, 2024; rows between the two cutoffs belong to neither split.
# .copy() turns each filtered slice into an independent frame, so the column
# assignments below no longer trigger SettingWithCopyWarning.
df_stock_data_train_1_month_baseline = df_stock_data_1_month[df_stock_data_1_month['Date'] <= '2024-01-10'].copy()
df_stock_data_test_1_month_baseline = df_stock_data_1_month[df_stock_data_1_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_1_month_baseline.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target: each symbol's 'Close' 20 rows (trading days, ~1 month) ahead, computed inside each split
df_stock_data_train_1_month_baseline['Close_Target'] = df_stock_data_train_1_month_baseline.groupby('Symbol')['Close'].shift(-20)
df_stock_data_test_1_month_baseline['Close_Target'] = df_stock_data_test_1_month_baseline.groupby('Symbol')['Close'].shift(-20)

# Drop rows where 'Close_Target' is NaN (the last 20 rows of every symbol in each split)
df_stock_data_train_1_month_baseline = df_stock_data_train_1_month_baseline.dropna(subset=['Close_Target'])
df_stock_data_test_1_month_baseline = df_stock_data_test_1_month_baseline.dropna(subset=['Close_Target'])

# Replace infinite values with NaN so they are imputed together with the other gaps
df_stock_data_train_1_month_baseline = df_stock_data_train_1_month_baseline.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_1_month_baseline = df_stock_data_test_1_month_baseline.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with the median of each numeric column.
# NOTE(review): the test split is imputed with its own medians; left unchanged
# so the results stay comparable with the other 1-month cells in this notebook.
numeric_cols_train = df_stock_data_train_1_month_baseline.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_month_baseline.select_dtypes(include=[np.number]).columns

df_stock_data_train_1_month_baseline[numeric_cols_train] = df_stock_data_train_1_month_baseline[numeric_cols_train].fillna(df_stock_data_train_1_month_baseline[numeric_cols_train].median())
df_stock_data_test_1_month_baseline[numeric_cols_test] = df_stock_data_test_1_month_baseline[numeric_cols_test].fillna(df_stock_data_test_1_month_baseline[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_month_baseline.shape}")
print(f"Testing data shape: {df_stock_data_test_1_month_baseline.shape}")

# Create X (features) and y (target) for training; the current close, the
# target and the identifier columns are excluded from the features
X_train_1_month_baseline = df_stock_data_train_1_month_baseline.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_month_baseline = df_stock_data_train_1_month_baseline['Close_Target']

# Create X and y for testing
X_test_1_month_baseline = df_stock_data_test_1_month_baseline.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_month_baseline = df_stock_data_test_1_month_baseline['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_month_baseline shape: {X_train_1_month_baseline.shape}, y_train_1_month_baseline shape: {y_train_1_month_baseline.shape}")
print(f"X_test_1_month_baseline shape: {X_test_1_month_baseline.shape}, y_test_1_month_baseline shape: {y_test_1_month_baseline.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_month_baseline.shape[0] == 0 or X_test_1_month_baseline.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model (baseline hyperparameters for the 1-month horizon)
model_baseline_1_month = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    alpha=0,
    reg_lambda=1,  # Fixed parameter name
    objective='reg:squarederror',
    missing=np.nan  # Ensure missing values are handled correctly
)

# Train the model on the training data
model_baseline_1_month.fit(X_train_1_month_baseline, y_train_1_month_baseline)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_1_month_baseline = model_baseline_1_month.predict(X_test_1_month_baseline)

# Calculate performance on the test data
mse_test_1_month_baseline = mean_squared_error(y_test_1_month_baseline, y_pred_1_month_baseline)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_1_month_baseline}')


In [None]:
# Evaluation of the 1-month baseline model on the held-out test split,
# followed by its feature-importance ranking.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np


# Inputs: `y_test_1_month_baseline` (true targets) and `y_pred_1_month_baseline`
# (model predictions) produced by the training cell above.

# Residuals (actual minus predicted) feed both Durbin-Watson and MAPE
residuals_1_month_baseline = y_test_1_month_baseline - y_pred_1_month_baseline

# --- compute every metric first ---
mse_1_month_baseline = mean_squared_error(y_test_1_month_baseline, y_pred_1_month_baseline)
mae_1_month_baseline = mean_absolute_error(y_test_1_month_baseline, y_pred_1_month_baseline)
rmse_1_month_baseline = np.sqrt(mse_1_month_baseline)  # square root of the MSE
r2_1_month_baseline = r2_score(y_test_1_month_baseline, y_pred_1_month_baseline)
medae_1_month_baseline = median_absolute_error(y_test_1_month_baseline, y_pred_1_month_baseline)
dw_stat_1_month_baseline = durbin_watson(residuals_1_month_baseline)
# MAPE: mean of |residual / actual|, expressed as a percentage
mape_1_month_baseline = np.mean(np.abs(residuals_1_month_baseline / y_test_1_month_baseline)) * 100

# --- then report them ---
print(f'Mean Squared Error on unseen data: {mse_1_month_baseline}')
print(f'Mean Absolute Error on unseen data: {mae_1_month_baseline}')
print(f'Root Mean Squared Error on unseen data: {rmse_1_month_baseline}')
print(f'R-squared on unseen data: {r2_1_month_baseline}')
print(f'Median Absolute Error on unseen data: {medae_1_month_baseline}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_month_baseline}')
print(f'MAPE on unseen data: {mape_1_month_baseline:.2f}%')

# Map each training column to the importance score the fitted model assigned it
feature_importance_1_month_baseline = {
    column: score
    for column, score in zip(X_train_1_month_baseline.columns, model_baseline_1_month.feature_importances_)
}

# Rank features from most to least important
sorted_features_1_month_baseline = sorted(
    feature_importance_1_month_baseline.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the ranking as percentages
for feature, importance in sorted_features_1_month_baseline:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# Get feature importance from the 1-month baseline model.
# The names must come from the frame that model was fitted on
# (X_train_1_month_baseline); pairing the scores with another model's columns
# would mislabel them or silently truncate the list inside zip().
feature_importance = dict(zip(X_train_1_month_baseline.columns, model_baseline_1_month.feature_importances_))

# Filter features with importance greater than 1%
important_features = {feature: importance for feature, importance in feature_importance.items() if importance > 0.01}

# Sort features by importance in descending order
sorted_important_features = sorted(important_features.items(), key=lambda x: x[1], reverse=True)

# Extract the feature names (keys) into a list
important_feature_names = [feature for feature, importance in sorted_important_features]

# Print the sorted important features (optional)
print("Features with more than 1% contribution:")
for feature in sorted_important_features:
    # each entry is a (name, importance) pair
    print(f"{feature[0]}: {feature[1] * 100:.2f}%")

# The list of important features that you can use to create a new dataframe
print("List of important features:")
print(important_feature_names)




In [None]:
# Columns kept for the reduced 1-month model: the identifier columns
# ('Symbol', 'Date'), 'Close', and the hand-picked indicator columns.
important_features = [
    'Symbol', 'Date', 'Close',
    'Fib_30_High_Max', '30_day_Fib_23',
    'High', 'Low',
    'Fib_30_Low_Min', 'Volume', 'EMA_5',
    '30_day_Fib_50', 'Fib_5_Low_Min',
]

# Select just those columns (all rows) from the 1-month frame
df_important_feat_1_month = df_stock_data_1_month.loc[:, important_features]
df_important_feat_1_month.head()

In [None]:
# baseline 1 month prediction model with only features contributing over 1%
# not as good as baseline
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort by 'Symbol' and 'Date' so the per-symbol shift below moves forward in time
df_important_feat_1_month = df_important_feat_1_month.sort_values(by=['Symbol', 'Date'])

# Train on rows dated on or before January 10, 2024 and test on rows dated
# after February 10, 2024; rows between the two cutoffs belong to neither split.
# .copy() turns each filtered slice into an independent frame, so the column
# assignments below no longer trigger SettingWithCopyWarning.
df_stock_data_train_1_month_if = df_important_feat_1_month[df_important_feat_1_month['Date'] <= '2024-01-10'].copy()
df_stock_data_test_1_month_if = df_important_feat_1_month[df_important_feat_1_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_1_month_if.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target: each symbol's 'Close' 20 rows (trading days, ~1 month) ahead, computed inside each split
df_stock_data_train_1_month_if['Close_Target'] = df_stock_data_train_1_month_if.groupby('Symbol')['Close'].shift(-20)
df_stock_data_test_1_month_if['Close_Target'] = df_stock_data_test_1_month_if.groupby('Symbol')['Close'].shift(-20)

# Drop rows where 'Close_Target' is NaN (the last 20 rows of every symbol in each split)
df_stock_data_train_1_month_if = df_stock_data_train_1_month_if.dropna(subset=['Close_Target'])
df_stock_data_test_1_month_if = df_stock_data_test_1_month_if.dropna(subset=['Close_Target'])

# Replace infinite values with NaN so they are imputed together with the other gaps
df_stock_data_train_1_month_if = df_stock_data_train_1_month_if.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_1_month_if = df_stock_data_test_1_month_if.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with the median of each numeric column.
# NOTE(review): the test split is imputed with its own medians; left unchanged
# so the results stay comparable with the other 1-month cells in this notebook.
numeric_cols_train = df_stock_data_train_1_month_if.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_month_if.select_dtypes(include=[np.number]).columns

df_stock_data_train_1_month_if[numeric_cols_train] = df_stock_data_train_1_month_if[numeric_cols_train].fillna(df_stock_data_train_1_month_if[numeric_cols_train].median())
df_stock_data_test_1_month_if[numeric_cols_test] = df_stock_data_test_1_month_if[numeric_cols_test].fillna(df_stock_data_test_1_month_if[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_month_if.shape}")
print(f"Testing data shape: {df_stock_data_test_1_month_if.shape}")

# Create X (features) and y (target) for training; the current close, the
# target and the identifier columns are excluded from the features
X_train_1_month_if = df_stock_data_train_1_month_if.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_month_if = df_stock_data_train_1_month_if['Close_Target']

# Create X and y for testing
X_test_1_month_if = df_stock_data_test_1_month_if.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_month_if = df_stock_data_test_1_month_if['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_month_if shape: {X_train_1_month_if.shape}, y_train_1_month_if shape: {y_train_1_month_if.shape}")
print(f"X_test_1_month_if shape: {X_test_1_month_if.shape}, y_test_1_month_if shape: {y_test_1_month_if.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_month_if.shape[0] == 0 or X_test_1_month_if.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model (same hyperparameters as the 1-month baseline; only the feature set differs)
model_baseline_if_1_month = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    alpha=0,
    reg_lambda=1,  # Fixed parameter name
    objective='reg:squarederror',
    missing=np.nan  # Ensure missing values are handled correctly
)

# Train the model on the training data
model_baseline_if_1_month.fit(X_train_1_month_if, y_train_1_month_if)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_1_month_if = model_baseline_if_1_month.predict(X_test_1_month_if)

# Calculate performance on the test data
mse_test_1_month_if = mean_squared_error(y_test_1_month_if, y_pred_1_month_if)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_1_month_if}')


In [None]:
# Evaluation of the reduced-feature 1-month model on the held-out test split,
# followed by its feature-importance ranking.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# Inputs: `y_test_1_month_if` (true targets) and `y_pred_1_month_if`
# (model predictions) produced by the training cell above.

# Residuals (actual minus predicted) feed both Durbin-Watson and MAPE
residuals_1_month_if = y_test_1_month_if - y_pred_1_month_if

# --- compute every metric first ---
mse_1_month_if = mean_squared_error(y_test_1_month_if, y_pred_1_month_if)
mae_1_month_if = mean_absolute_error(y_test_1_month_if, y_pred_1_month_if)
rmse_1_month_if = np.sqrt(mse_1_month_if)  # square root of the MSE
r2_1_month_if = r2_score(y_test_1_month_if, y_pred_1_month_if)
medae_1_month_if = median_absolute_error(y_test_1_month_if, y_pred_1_month_if)
dw_stat_1_month_if = durbin_watson(residuals_1_month_if)
# MAPE: mean of |residual / actual|, expressed as a percentage
mape_1_month_if = np.mean(np.abs(residuals_1_month_if / y_test_1_month_if)) * 100

# --- then report them ---
print(f'Mean Squared Error on unseen data: {mse_1_month_if}')
print(f'Mean Absolute Error on unseen data: {mae_1_month_if}')
print(f'Root Mean Squared Error on unseen data: {rmse_1_month_if}')
print(f'R-squared on unseen data: {r2_1_month_if}')
print(f'Median Absolute Error on unseen data: {medae_1_month_if}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_month_if}')
print(f'MAPE on unseen data: {mape_1_month_if:.2f}%')

# Map each training column to the importance score the fitted model assigned it
feature_importance_1_month_if = {
    column: score
    for column, score in zip(X_train_1_month_if.columns, model_baseline_if_1_month.feature_importances_)
}

# Rank features from most to least important
sorted_features_1_month_if = sorted(
    feature_importance_1_month_if.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the ranking as percentages
for feature, importance in sorted_features_1_month_if:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# 1 month baseline model with learning_rate=0.1
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort by 'Symbol' and 'Date' so the per-symbol shift below moves forward in time
df_stock_data_1_month = df_stock_data_1_month.sort_values(by=['Symbol', 'Date'])

# Train on rows dated on or before January 10, 2024 and test on rows dated
# after February 10, 2024; rows between the two cutoffs belong to neither split.
# .copy() turns each filtered slice into an independent frame, so the column
# assignments below no longer trigger SettingWithCopyWarning.
df_stock_data_train_1_month_lr_1 = df_stock_data_1_month[df_stock_data_1_month['Date'] <= '2024-01-10'].copy()
df_stock_data_test_1_month_lr_1 = df_stock_data_1_month[df_stock_data_1_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_1_month_lr_1.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target: each symbol's 'Close' 20 rows (trading days, ~1 month) ahead, computed inside each split
df_stock_data_train_1_month_lr_1['Close_Target'] = df_stock_data_train_1_month_lr_1.groupby('Symbol')['Close'].shift(-20)
df_stock_data_test_1_month_lr_1['Close_Target'] = df_stock_data_test_1_month_lr_1.groupby('Symbol')['Close'].shift(-20)

# Drop rows where 'Close_Target' is NaN (the last 20 rows of every symbol in each split)
df_stock_data_train_1_month_lr_1 = df_stock_data_train_1_month_lr_1.dropna(subset=['Close_Target'])
df_stock_data_test_1_month_lr_1 = df_stock_data_test_1_month_lr_1.dropna(subset=['Close_Target'])

# Replace infinite values with NaN so they are imputed together with the other gaps
df_stock_data_train_1_month_lr_1 = df_stock_data_train_1_month_lr_1.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_1_month_lr_1 = df_stock_data_test_1_month_lr_1.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with the median of each numeric column.
# NOTE(review): the test split is imputed with its own medians; left unchanged
# so the results stay comparable with the other 1-month cells in this notebook.
numeric_cols_train = df_stock_data_train_1_month_lr_1.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_month_lr_1.select_dtypes(include=[np.number]).columns

df_stock_data_train_1_month_lr_1[numeric_cols_train] = df_stock_data_train_1_month_lr_1[numeric_cols_train].fillna(df_stock_data_train_1_month_lr_1[numeric_cols_train].median())
df_stock_data_test_1_month_lr_1[numeric_cols_test] = df_stock_data_test_1_month_lr_1[numeric_cols_test].fillna(df_stock_data_test_1_month_lr_1[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_month_lr_1.shape}")
print(f"Testing data shape: {df_stock_data_test_1_month_lr_1.shape}")

# Create X (features) and y (target) for training; the current close, the
# target and the identifier columns are excluded from the features
X_train_1_month_lr_1 = df_stock_data_train_1_month_lr_1.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_month_lr_1 = df_stock_data_train_1_month_lr_1['Close_Target']

# Create X and y for testing
X_test_1_month_lr_1 = df_stock_data_test_1_month_lr_1.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_month_lr_1 = df_stock_data_test_1_month_lr_1['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_month_lr_1 shape: {X_train_1_month_lr_1.shape}, y_train_1_month_lr_1 shape: {y_train_1_month_lr_1.shape}")
print(f"X_test_1_month_lr_1 shape: {X_test_1_month_lr_1.shape}, y_test_1_month_lr_1 shape: {y_test_1_month_lr_1.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_month_lr_1.shape[0] == 0 or X_test_1_month_lr_1.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model (1-month baseline settings with learning_rate raised to 0.1)
model_1_month_tr_01 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    alpha=0,
    reg_lambda=1,  # Fixed parameter name
    objective='reg:squarederror',
    missing=np.nan  # Ensure missing values are handled correctly
)

# Train the model on the training data
model_1_month_tr_01.fit(X_train_1_month_lr_1, y_train_1_month_lr_1)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_1_month_lr_1 = model_1_month_tr_01.predict(X_test_1_month_lr_1)

# Calculate performance on the test data
mse_test_1_month_lr_1 = mean_squared_error(y_test_1_month_lr_1, y_pred_1_month_lr_1)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_1_month_lr_1}')


In [None]:
# Evaluation of the learning_rate=0.1 one-month model on the held-out test
# split, followed by its feature-importance ranking.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# Inputs: `y_test_1_month_lr_1` (true targets) and `y_pred_1_month_lr_1`
# (model predictions) produced by the training cell above.

# Residuals (actual minus predicted) feed both Durbin-Watson and MAPE
residuals_1_month_lr_1 = y_test_1_month_lr_1 - y_pred_1_month_lr_1

# --- compute every metric first ---
mse_1_month_lr_1 = mean_squared_error(y_test_1_month_lr_1, y_pred_1_month_lr_1)
mae_1_month_lr_1 = mean_absolute_error(y_test_1_month_lr_1, y_pred_1_month_lr_1)
rmse_1_month_lr_1 = np.sqrt(mse_1_month_lr_1)  # square root of the MSE
r2_1_month_lr_1 = r2_score(y_test_1_month_lr_1, y_pred_1_month_lr_1)
medae_1_month_lr_1 = median_absolute_error(y_test_1_month_lr_1, y_pred_1_month_lr_1)
dw_stat_1_month_lr_1 = durbin_watson(residuals_1_month_lr_1)
# MAPE: mean of |residual / actual|, expressed as a percentage
mape_1_month_lr_1 = np.mean(np.abs(residuals_1_month_lr_1 / y_test_1_month_lr_1)) * 100

# --- then report them ---
print(f'Mean Squared Error on unseen data: {mse_1_month_lr_1}')
print(f'Mean Absolute Error on unseen data: {mae_1_month_lr_1}')
print(f'Root Mean Squared Error on unseen data: {rmse_1_month_lr_1}')
print(f'R-squared on unseen data: {r2_1_month_lr_1}')
print(f'Median Absolute Error on unseen data: {medae_1_month_lr_1}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_month_lr_1}')
print(f'MAPE on unseen data: {mape_1_month_lr_1:.2f}%')

# Map each training column to the importance score the fitted model assigned it
feature_importance_1_month_lr_1 = {
    column: score
    for column, score in zip(X_train_1_month_lr_1.columns, model_1_month_tr_01.feature_importances_)
}

# Rank features from most to least important
sorted_features_1_month_lr_1 = sorted(
    feature_importance_1_month_lr_1.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the ranking as percentages
for feature, importance in sorted_features_1_month_lr_1:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# 1 month baseline model with learning_rate=0.01
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_1_month = df_stock_data_1_month.sort_values(by=['Symbol', 'Date'])

# Training rows: Date before or equal to January 10, 2024.
# .copy() makes each split an independent frame, so adding 'Close_Target' and
# cleaning the values below does not write into a slice of df_stock_data_1_month
# (which is what triggers pandas' SettingWithCopyWarning).
df_stock_data_train_1_month_lr_01 = df_stock_data_1_month[df_stock_data_1_month['Date'] <= '2024-01-10'].copy()

# Testing rows: Date after February 10, 2024
df_stock_data_test_1_month_lr_01 = df_stock_data_1_month[df_stock_data_1_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_1_month_lr_01.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Shift 'Close' to predict 20 trading days ahead (1 month ahead).
# The shift is done per symbol so a target never comes from a different stock.
df_stock_data_train_1_month_lr_01['Close_Target'] = df_stock_data_train_1_month_lr_01.groupby('Symbol')['Close'].shift(-20)
df_stock_data_test_1_month_lr_01['Close_Target'] = df_stock_data_test_1_month_lr_01.groupby('Symbol')['Close'].shift(-20)

# Drop rows where 'Close_Target' is NaN (the last 20 rows of each symbol in each split)
df_stock_data_train_1_month_lr_01 = df_stock_data_train_1_month_lr_01.dropna(subset=['Close_Target'])
df_stock_data_test_1_month_lr_01 = df_stock_data_test_1_month_lr_01.dropna(subset=['Close_Target'])

# Replace infinite values with NaN
df_stock_data_train_1_month_lr_01 = df_stock_data_train_1_month_lr_01.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_1_month_lr_01 = df_stock_data_test_1_month_lr_01.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with the median of each numeric column.
# The medians are computed on the training split only and reused for the test split,
# so the test rows are imputed with the same values the model was fitted on and no
# statistic is derived from the held-out data.
numeric_cols_train = df_stock_data_train_1_month_lr_01.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_month_lr_01.select_dtypes(include=[np.number]).columns

train_medians_1_month_lr_01 = df_stock_data_train_1_month_lr_01[numeric_cols_train].median()
df_stock_data_train_1_month_lr_01[numeric_cols_train] = df_stock_data_train_1_month_lr_01[numeric_cols_train].fillna(train_medians_1_month_lr_01)
df_stock_data_test_1_month_lr_01[numeric_cols_test] = df_stock_data_test_1_month_lr_01[numeric_cols_test].fillna(train_medians_1_month_lr_01)

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_month_lr_01.shape}")
print(f"Testing data shape: {df_stock_data_test_1_month_lr_01.shape}")

# Create X (features) and y (target) for training.
# 'Close' and 'Close_Target' are price columns, 'Symbol' and 'Date' are identifiers;
# none of them are used as features.
X_train_1_month_lr_01 = df_stock_data_train_1_month_lr_01.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_month_lr_01 = df_stock_data_train_1_month_lr_01['Close_Target']

# Create X and y for testing
X_test_1_month_lr_01 = df_stock_data_test_1_month_lr_01.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_month_lr_01 = df_stock_data_test_1_month_lr_01['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_month_lr_01 shape: {X_train_1_month_lr_01.shape}, y_train_1_month_lr_01 shape: {y_train_1_month_lr_01.shape}")
print(f"X_test_1_month_lr_01 shape: {X_test_1_month_lr_01.shape}, y_test_1_month_lr_01 shape: {y_test_1_month_lr_01.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_month_lr_01.shape[0] == 0 or X_test_1_month_lr_01.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model.
# NOTE: despite the '_tr_1' suffix, this is the learning_rate=0.01 model.
model_1_month_tr_1 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    alpha=0,
    reg_lambda=1,  # Fixed parameter name
    objective='reg:squarederror',
    missing=np.nan  # Ensure missing values are handled correctly
)

# Train the model on the training data
model_1_month_tr_1.fit(X_train_1_month_lr_01, y_train_1_month_lr_01)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_1_month_lr_01 = model_1_month_tr_1.predict(X_test_1_month_lr_01)

# Calculate performance on the test data
mse_test_1_month_lr_01 = mean_squared_error(y_test_1_month_lr_01, y_pred_1_month_lr_01)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_1_month_lr_01}')


In [None]:
# metrics and feature importance on unseen data (1 month model, learning_rate=0.01)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np


# `y_pred_1_month_lr_01` are the predictions for the test data and
# `y_test_1_month_lr_01` are the true values for the same test rows

# Calculate performance metrics on unseen test data
mse_month_lr_01 = mean_squared_error(y_test_1_month_lr_01, y_pred_1_month_lr_01)
mae_1_month_lr_01 = mean_absolute_error(y_test_1_month_lr_01, y_pred_1_month_lr_01)
rmse_month_lr_01 = np.sqrt(mse_month_lr_01)  # Root Mean Squared Error
r2_month_lr_01 = r2_score(y_test_1_month_lr_01, y_pred_1_month_lr_01)

# Print out the metrics for unseen data
print(f'Mean Squared Error on unseen data: {mse_month_lr_01}')
print(f'Mean Absolute Error on unseen data: {mae_1_month_lr_01}')
print(f'Root Mean Squared Error on unseen data: {rmse_month_lr_01}')
print(f'R-squared on unseen data: {r2_month_lr_01}')

# Additional metrics
medae_1_month_lr_01 = median_absolute_error(y_test_1_month_lr_01, y_pred_1_month_lr_01)
print(f'Median Absolute Error on unseen data: {medae_1_month_lr_01}')

# Durbin-Watson is computed on the residuals (true value minus prediction)
dw_stat_1_month_lr_01 = durbin_watson(y_test_1_month_lr_01 - y_pred_1_month_lr_01)
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_month_lr_01}')

# Calculate MAPE (Mean Absolute Percentage Error) on unseen data
mape_1_month_lr_01 = np.mean(np.abs((y_test_1_month_lr_01 - y_pred_1_month_lr_01) / y_test_1_month_lr_01)) * 100
print(f'MAPE on unseen data: {mape_1_month_lr_01:.2f}%')

# Get feature importance as a dictionary.
# The importances must come from `model_1_month_tr_1`, the model that was fitted on
# X_train_1_month_lr_01 and produced y_pred_1_month_lr_01; the previous reference to
# `model_1_month_tr_01` pointed at a different model than the one evaluated above.
feature_importance_month_lr_01 = dict(zip(X_train_1_month_lr_01.columns, model_1_month_tr_1.feature_importances_))

# Sort features by importance in descending order
sorted_features_1_month_lr_01 = sorted(feature_importance_month_lr_01.items(), key=lambda x: x[1], reverse=True)

# Print the sorted feature importance values
for feature, importance in sorted_features_1_month_lr_01:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# model with learning_rate = 0.01 is best again, so we keep that parameter
# now we'll do max depth
# max depth = 3
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_1_month = df_stock_data_1_month.sort_values(by=['Symbol', 'Date'])

# Training rows: Date before or equal to January 10, 2024.
# .copy() makes each split an independent frame, so adding 'Close_Target' and
# cleaning the values below does not write into a slice of df_stock_data_1_month
# (which is what triggers pandas' SettingWithCopyWarning).
df_stock_data_train_1_month_md_3 = df_stock_data_1_month[df_stock_data_1_month['Date'] <= '2024-01-10'].copy()

# Testing rows: Date after February 10, 2024
df_stock_data_test_1_month_md_3 = df_stock_data_1_month[df_stock_data_1_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_1_month_md_3.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Shift 'Close' to predict 20 trading days ahead (1 month ahead).
# The shift is done per symbol so a target never comes from a different stock.
df_stock_data_train_1_month_md_3['Close_Target'] = df_stock_data_train_1_month_md_3.groupby('Symbol')['Close'].shift(-20)
df_stock_data_test_1_month_md_3['Close_Target'] = df_stock_data_test_1_month_md_3.groupby('Symbol')['Close'].shift(-20)

# Drop rows where 'Close_Target' is NaN (the last 20 rows of each symbol in each split)
df_stock_data_train_1_month_md_3 = df_stock_data_train_1_month_md_3.dropna(subset=['Close_Target'])
df_stock_data_test_1_month_md_3 = df_stock_data_test_1_month_md_3.dropna(subset=['Close_Target'])

# Replace infinite values with NaN
df_stock_data_train_1_month_md_3 = df_stock_data_train_1_month_md_3.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_1_month_md_3 = df_stock_data_test_1_month_md_3.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with the median of each numeric column.
# The medians are computed on the training split only and reused for the test split,
# so the test rows are imputed with the same values the model was fitted on and no
# statistic is derived from the held-out data.
numeric_cols_train = df_stock_data_train_1_month_md_3.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_month_md_3.select_dtypes(include=[np.number]).columns

train_medians_1_month_md_3 = df_stock_data_train_1_month_md_3[numeric_cols_train].median()
df_stock_data_train_1_month_md_3[numeric_cols_train] = df_stock_data_train_1_month_md_3[numeric_cols_train].fillna(train_medians_1_month_md_3)
df_stock_data_test_1_month_md_3[numeric_cols_test] = df_stock_data_test_1_month_md_3[numeric_cols_test].fillna(train_medians_1_month_md_3)

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_month_md_3.shape}")
print(f"Testing data shape: {df_stock_data_test_1_month_md_3.shape}")

# Create X (features) and y (target) for training.
# 'Close' and 'Close_Target' are price columns, 'Symbol' and 'Date' are identifiers;
# none of them are used as features.
X_train_1_month_md_3 = df_stock_data_train_1_month_md_3.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_month_md_3 = df_stock_data_train_1_month_md_3['Close_Target']

# Create X and y for testing
X_test_1_month_md_3 = df_stock_data_test_1_month_md_3.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_month_md_3 = df_stock_data_test_1_month_md_3['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_month_md_3 shape: {X_train_1_month_md_3.shape}, y_train_1_month_md_3 shape: {y_train_1_month_md_3.shape}")
print(f"X_test_1_month_md_3 shape: {X_test_1_month_md_3.shape}, y_test_1_month_md_3 shape: {y_test_1_month_md_3.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_month_md_3.shape[0] == 0 or X_test_1_month_md_3.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model (max_depth=3, learning_rate=0.01)
model_1_month_md_3 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=3,
    alpha=0,
    reg_lambda=1,  # Fixed parameter name
    objective='reg:squarederror',
    missing=np.nan  # Ensure missing values are handled correctly
)

# Train the model on the training data
model_1_month_md_3.fit(X_train_1_month_md_3, y_train_1_month_md_3)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_1_month_md_3 = model_1_month_md_3.predict(X_test_1_month_md_3)

# Calculate performance on the test data
mse_test_1_month_md_3 = mean_squared_error(y_test_1_month_md_3, y_pred_1_month_md_3)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_1_month_md_3}')


In [None]:
# Evaluate the max_depth=3 model on the unseen test data and rank its feature importances
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    median_absolute_error,
)
from statsmodels.stats.stattools import durbin_watson
import numpy as np


# Residuals: true test targets minus the model's predictions for the same rows.
# They are reused by the Durbin-Watson statistic and by MAPE.
residuals_1_month_md_3 = y_test_1_month_md_3 - y_pred_1_month_md_3

# Compute every metric first ...
mse_1_month_md_3 = mean_squared_error(y_test_1_month_md_3, y_pred_1_month_md_3)
rmse_1_month_md_3 = np.sqrt(mse_1_month_md_3)  # square root of the MSE
mae_1_month_md_3 = mean_absolute_error(y_test_1_month_md_3, y_pred_1_month_md_3)
r2_1_month_md_3 = r2_score(y_test_1_month_md_3, y_pred_1_month_md_3)
medae_1_month_md_3 = median_absolute_error(y_test_1_month_md_3, y_pred_1_month_md_3)
dw_stat_1_month_md_3 = durbin_watson(residuals_1_month_md_3)
# MAPE: mean of |residual / true value|, expressed as a percentage
mape_1_month_md_3 = np.mean(np.abs(residuals_1_month_md_3 / y_test_1_month_md_3)) * 100

# ... then report them
print(f'Mean Squared Error on unseen data: {mse_1_month_md_3}')
print(f'Mean Absolute Error on unseen data: {mae_1_month_md_3}')
print(f'Root Mean Squared Error on unseen data: {rmse_1_month_md_3}')
print(f'R-squared on unseen data: {r2_1_month_md_3}')
print(f'Median Absolute Error on unseen data: {medae_1_month_md_3}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_month_md_3}')
print(f'MAPE on unseen data: {mape_1_month_md_3:.2f}%')

# Map each training column to the importance the fitted model assigned to it
feature_importance_1_month_md_3 = {
    column: score
    for column, score in zip(X_train_1_month_md_3.columns, model_1_month_md_3.feature_importances_)
}

# Most important feature first
sorted_features_1_month_md_3 = sorted(
    feature_importance_1_month_md_3.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print each feature with its importance as a percentage
for feature, importance in sorted_features_1_month_md_3:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# model with learning_rate = 0.01 is best again, so we keep that parameter
# now we'll do max depth
# max depth = 7
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_1_month = df_stock_data_1_month.sort_values(by=['Symbol', 'Date'])

# Training rows: Date before or equal to January 10, 2024.
# .copy() makes each split an independent frame, so adding 'Close_Target' and
# cleaning the values below does not write into a slice of df_stock_data_1_month
# (which is what triggers pandas' SettingWithCopyWarning).
df_stock_data_train_1_month_md_7 = df_stock_data_1_month[df_stock_data_1_month['Date'] <= '2024-01-10'].copy()

# Testing rows: Date after February 10, 2024
df_stock_data_test_1_month_md_7 = df_stock_data_1_month[df_stock_data_1_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_1_month_md_7.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Shift 'Close' to predict 20 trading days ahead (1 month ahead).
# The shift is done per symbol so a target never comes from a different stock.
df_stock_data_train_1_month_md_7['Close_Target'] = df_stock_data_train_1_month_md_7.groupby('Symbol')['Close'].shift(-20)
df_stock_data_test_1_month_md_7['Close_Target'] = df_stock_data_test_1_month_md_7.groupby('Symbol')['Close'].shift(-20)

# Drop rows where 'Close_Target' is NaN (the last 20 rows of each symbol in each split)
df_stock_data_train_1_month_md_7 = df_stock_data_train_1_month_md_7.dropna(subset=['Close_Target'])
df_stock_data_test_1_month_md_7 = df_stock_data_test_1_month_md_7.dropna(subset=['Close_Target'])

# Replace infinite values with NaN
df_stock_data_train_1_month_md_7 = df_stock_data_train_1_month_md_7.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_1_month_md_7 = df_stock_data_test_1_month_md_7.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with the median of each numeric column.
# The medians are computed on the training split only and reused for the test split,
# so the test rows are imputed with the same values the model was fitted on and no
# statistic is derived from the held-out data.
numeric_cols_train = df_stock_data_train_1_month_md_7.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_1_month_md_7.select_dtypes(include=[np.number]).columns

train_medians_1_month_md_7 = df_stock_data_train_1_month_md_7[numeric_cols_train].median()
df_stock_data_train_1_month_md_7[numeric_cols_train] = df_stock_data_train_1_month_md_7[numeric_cols_train].fillna(train_medians_1_month_md_7)
df_stock_data_test_1_month_md_7[numeric_cols_test] = df_stock_data_test_1_month_md_7[numeric_cols_test].fillna(train_medians_1_month_md_7)

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_1_month_md_7.shape}")
print(f"Testing data shape: {df_stock_data_test_1_month_md_7.shape}")

# Create X (features) and y (target) for training.
# 'Close' and 'Close_Target' are price columns, 'Symbol' and 'Date' are identifiers;
# none of them are used as features.
X_train_1_month_md_7 = df_stock_data_train_1_month_md_7.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_1_month_md_7 = df_stock_data_train_1_month_md_7['Close_Target']

# Create X and y for testing
X_test_1_month_md_7 = df_stock_data_test_1_month_md_7.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_1_month_md_7 = df_stock_data_test_1_month_md_7['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_1_month_md_7 shape: {X_train_1_month_md_7.shape}, y_train_1_month_md_7 shape: {y_train_1_month_md_7.shape}")
print(f"X_test_1_month_md_7 shape: {X_test_1_month_md_7.shape}, y_test_1_month_md_7 shape: {y_test_1_month_md_7.shape}")

# Ensure there are samples in both training and testing sets
if X_train_1_month_md_7.shape[0] == 0 or X_test_1_month_md_7.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model (max_depth=7, learning_rate=0.01)
model_1_month_md_7 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=7,
    alpha=0,
    reg_lambda=1,  # Fixed parameter name
    objective='reg:squarederror',
    missing=np.nan  # Ensure missing values are handled correctly
)

# Train the model on the training data
model_1_month_md_7.fit(X_train_1_month_md_7, y_train_1_month_md_7)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_1_month_md_7 = model_1_month_md_7.predict(X_test_1_month_md_7)

# Calculate performance on the test data
mse_test_1_month_md_7 = mean_squared_error(y_test_1_month_md_7, y_pred_1_month_md_7)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_1_month_md_7}')


In [None]:
# Evaluate the max_depth=7 model on the unseen test data and rank its feature importances
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    median_absolute_error,
)
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# Residuals: true test targets minus the model's predictions for the same rows.
# They are reused by the Durbin-Watson statistic and by MAPE.
residuals_1_month_md_7 = y_test_1_month_md_7 - y_pred_1_month_md_7

# Compute every metric first ...
mse_1_month_md_7 = mean_squared_error(y_test_1_month_md_7, y_pred_1_month_md_7)
rmse_1_month_md_7 = np.sqrt(mse_1_month_md_7)  # square root of the MSE
mae_1_month_md_7 = mean_absolute_error(y_test_1_month_md_7, y_pred_1_month_md_7)
r2_1_month_md_7 = r2_score(y_test_1_month_md_7, y_pred_1_month_md_7)
medae_1_month_md_7 = median_absolute_error(y_test_1_month_md_7, y_pred_1_month_md_7)
dw_stat_1_month_md_7 = durbin_watson(residuals_1_month_md_7)
# MAPE: mean of |residual / true value|, expressed as a percentage
mape_1_month_md_7 = np.mean(np.abs(residuals_1_month_md_7 / y_test_1_month_md_7)) * 100

# ... then report them
print(f'Mean Squared Error on unseen data: {mse_1_month_md_7}')
print(f'Mean Absolute Error on unseen data: {mae_1_month_md_7}')
print(f'Root Mean Squared Error on unseen data: {rmse_1_month_md_7}')
print(f'R-squared on unseen data: {r2_1_month_md_7}')
print(f'Median Absolute Error on unseen data: {medae_1_month_md_7}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_1_month_md_7}')
print(f'MAPE on unseen data: {mape_1_month_md_7:.2f}%')

# Map each training column to the importance the fitted model assigned to it
feature_importance_1_month_md_7 = {
    column: score
    for column, score in zip(X_train_1_month_md_7.columns, model_1_month_md_7.feature_importances_)
}

# Most important feature first
sorted_features_1_month_md_7 = sorted(
    feature_importance_1_month_md_7.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print each feature with its importance as a percentage
for feature, importance in sorted_features_1_month_md_7:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Horizontal bar chart of the five highest-ranked features of the max_depth=7 model.
# sorted_features_1_month_md_7 is a list of (feature, importance) pairs, most important first.
top_five = sorted_features_1_month_md_7[:5]
features = np.array([pair[0] for pair in top_five])      # feature names
importances = np.array([pair[1] for pair in top_five])   # importance values
importance_pct = importances * 100                       # same values as percentages

plt.figure(figsize=(13, 8))
ax = sns.barplot(x=importance_pct, y=features, palette="viridis")

# Write the percentage next to the end of each bar
for row, pct in enumerate(importance_pct):
    ax.text(pct + 0.01, row, f"{pct:.2f}%", va="center", fontsize=16)

# Show x-axis ticks as whole percentages and leave room on the right for the labels
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f"{value:.0f}%"))
plt.xlim(0, max(importance_pct) + 6)

# Title and axis titles
plt.title("Best 1 Month Prediction Model: Top 5 Most Important Features", fontsize=18, fontweight='bold')
plt.xlabel("Feature Importance (%)", fontsize=16, fontweight='bold')
plt.ylabel("Important TA Indicators", fontsize=16, fontweight='bold')

# Larger tick labels on both axes
ax.set_yticklabels(features, fontsize=14)
plt.xticks(fontsize=14)

plt.show()


In [None]:
# The 3-month feature set starts from an independent (deep) copy of 'df_stocks_price_ta',
# so the larger lag columns added to it afterwards leave the original frame untouched.
df_stock_data_3_month = df_stocks_price_ta.copy(deep=True)


In [None]:
# List of columns to create lags for (focusing on mid-term indicators).
# 'EMA_5' used to appear twice; the duplicate only recomputed identical columns.
columns_to_lag = ['Close', 'SMA_5', 'EMA_5', 'Volume', 'SMA_20',
       'SMA_50', 'EMA_20', 'EMA_50', 'EMA_12_MACD',
       'EMA_26_MACD']

# Lags, in rows, used for the 3 month model (targets are 60 rows ahead), so the
# lags reach further back (up to 90 rows) than a short-horizon model would need.
lags = [1, 3, 5, 7, 10, 15, 20, 25, 30, 40, 50, 60, 75, 90]

# Put each symbol's rows in date order so that shifting moves back in time
df_stock_data_3_month = df_stock_data_3_month.sort_values(by=['Symbol', 'Date'])

# Shift within each symbol: an ungrouped shift would fill the first `lag` rows of
# every symbol with values taken from the end of the previous symbol's history.
grouped_by_symbol = df_stock_data_3_month.groupby('Symbol')
lag_columns = pd.DataFrame(
    {
        f'{col}_lag_{lag}': grouped_by_symbol[col].shift(lag)
        for col in columns_to_lag
        for lag in lags
    },
    index=df_stock_data_3_month.index,
)

# Attach all lag columns in one concat instead of inserting them one by one, which
# fragments the frame (pandas PerformanceWarning).  Any existing column with the
# same name is dropped first so the new values replace it, as assignment would.
df_stock_data_3_month = pd.concat(
    [df_stock_data_3_month.drop(columns=lag_columns.columns, errors='ignore'), lag_columns],
    axis=1,
)

# NaN values created by the shifts are kept here to maintain continuity;
# missing values are handled later, when the model's data is prepared.
df_stock_data_3_month.head()



In [None]:
# now we're going to move onto our next model: 3 month prediction
# we'll start at our baseline model and then do the same as we just did
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_3_month = df_stock_data_3_month.sort_values(by=['Symbol', 'Date'])

# Training rows: Date before or equal to November 10, 2023.
# .copy() makes each split an independent frame, so adding 'Close_Target' and
# cleaning the values below does not write into a slice of df_stock_data_3_month
# (which is what triggers pandas' SettingWithCopyWarning).
df_stock_data_train_3_month_baseline = df_stock_data_3_month[df_stock_data_3_month['Date'] <= '2023-11-10'].copy()

# Testing rows: Date after February 10, 2024
df_stock_data_test_3_month_baseline = df_stock_data_3_month[df_stock_data_3_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_3_month_baseline.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Shift 'Close' to predict 60 trading days ahead (3 months ahead).
# The shift is done per symbol so a target never comes from a different stock.
df_stock_data_train_3_month_baseline['Close_Target'] = df_stock_data_train_3_month_baseline.groupby('Symbol')['Close'].shift(-60)
df_stock_data_test_3_month_baseline['Close_Target'] = df_stock_data_test_3_month_baseline.groupby('Symbol')['Close'].shift(-60)

# Drop rows where 'Close_Target' is NaN (the last 60 rows of each symbol in each split)
df_stock_data_train_3_month_baseline = df_stock_data_train_3_month_baseline.dropna(subset=['Close_Target'])
df_stock_data_test_3_month_baseline = df_stock_data_test_3_month_baseline.dropna(subset=['Close_Target'])

# Replace infinite values with NaN
df_stock_data_train_3_month_baseline = df_stock_data_train_3_month_baseline.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_3_month_baseline = df_stock_data_test_3_month_baseline.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with the median of each numeric column.
# The medians are computed on the training split only and reused for the test split,
# so the test rows are imputed with the same values the model was fitted on and no
# statistic is derived from the held-out data.
numeric_cols_train = df_stock_data_train_3_month_baseline.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_3_month_baseline.select_dtypes(include=[np.number]).columns

train_medians_3_month_baseline = df_stock_data_train_3_month_baseline[numeric_cols_train].median()
df_stock_data_train_3_month_baseline[numeric_cols_train] = df_stock_data_train_3_month_baseline[numeric_cols_train].fillna(train_medians_3_month_baseline)
df_stock_data_test_3_month_baseline[numeric_cols_test] = df_stock_data_test_3_month_baseline[numeric_cols_test].fillna(train_medians_3_month_baseline)

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_3_month_baseline.shape}")
print(f"Testing data shape: {df_stock_data_test_3_month_baseline.shape}")

# Create X (features) and y (target) for training.
# 'Close' and 'Close_Target' are price columns, 'Symbol' and 'Date' are identifiers;
# none of them are used as features.
X_train_3_month_baseline = df_stock_data_train_3_month_baseline.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_3_month_baseline = df_stock_data_train_3_month_baseline['Close_Target']

# Create X and y for testing
X_test_3_month_baseline = df_stock_data_test_3_month_baseline.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_3_month_baseline = df_stock_data_test_3_month_baseline['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_3_month_baseline shape: {X_train_3_month_baseline.shape}, y_train_3_month_baseline shape: {y_train_3_month_baseline.shape}")
print(f"X_test_3_month_baseline shape: {X_test_3_month_baseline.shape}, y_test_3_month_baseline shape: {y_test_3_month_baseline.shape}")

# Ensure there are samples in both training and testing sets
if X_train_3_month_baseline.shape[0] == 0 or X_test_3_month_baseline.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model (baseline hyper-parameters: learning_rate=0.05, max_depth=5)
model_baseline_3_month = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    alpha=0,
    reg_lambda=1,  # Fixed parameter name
    objective='reg:squarederror',
    missing=np.nan  # Ensure missing values are handled correctly
)

# Train the model on the training data
model_baseline_3_month.fit(X_train_3_month_baseline, y_train_3_month_baseline)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_3_month_baseline = model_baseline_3_month.predict(X_test_3_month_baseline)

# Calculate performance on the test data
mse_test_3_month_baseline = mean_squared_error(y_test_3_month_baseline, y_pred_3_month_baseline)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_3_month_baseline}')


In [None]:
# metrics and feature importance on unseen data (3 month baseline model)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np


# `y_pred_3_month_baseline` are the predictions for the test data and
# `y_test_3_month_baseline` are the true values for the same test rows

# Calculate performance metrics on unseen test data
mse_3_month_baseline = mean_squared_error(y_test_3_month_baseline, y_pred_3_month_baseline)
mae_3_month_baseline = mean_absolute_error(y_test_3_month_baseline, y_pred_3_month_baseline)
# RMSE must be the square root of THIS model's MSE; the previous `np.sqrt(mse)` read an
# unrelated (or undefined) `mse` variable instead of `mse_3_month_baseline`.
rmse_3_month_baseline = np.sqrt(mse_3_month_baseline)  # Root Mean Squared Error
r2_3_month_baseline = r2_score(y_test_3_month_baseline, y_pred_3_month_baseline)

# Print out the metrics for unseen data
print(f'Mean Squared Error on unseen data: {mse_3_month_baseline}')
print(f'Mean Absolute Error on unseen data: {mae_3_month_baseline}')
print(f'Root Mean Squared Error on unseen data: {rmse_3_month_baseline}')
print(f'R-squared on unseen data: {r2_3_month_baseline}')

# Additional metrics
medae_3_month_baseline = median_absolute_error(y_test_3_month_baseline, y_pred_3_month_baseline)
print(f'Median Absolute Error on unseen data: {medae_3_month_baseline}')

# Durbin-Watson is computed on the residuals (true value minus prediction)
dw_stat_3_month_baseline = durbin_watson(y_test_3_month_baseline - y_pred_3_month_baseline)
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_3_month_baseline}')

# Calculate MAPE (Mean Absolute Percentage Error) on unseen data
mape_3_month_baseline = np.mean(np.abs((y_test_3_month_baseline - y_pred_3_month_baseline) / y_test_3_month_baseline)) * 100
print(f'MAPE on unseen data: {mape_3_month_baseline:.2f}%')

# Get feature importance as a dictionary (training column -> importance)
feature_importance_3_month_baseline = dict(zip(X_train_3_month_baseline.columns, model_baseline_3_month.feature_importances_))

# Sort features by importance in descending order
sorted_features_3_month_baseline = sorted(feature_importance_3_month_baseline.items(), key=lambda x: x[1], reverse=True)

# Print the sorted feature importance values
for feature, importance in sorted_features_3_month_baseline:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# Get feature importance from the 3 month baseline model.
# The importances are paired with the columns of X_train_3_month_baseline, the frame this
# model was fitted on.  Pairing them with another model's `X_train` attaches importances
# to the wrong feature names, and zip() silently truncates if the column counts differ.
feature_importance = dict(zip(X_train_3_month_baseline.columns, model_baseline_3_month.feature_importances_))

# Filter features with importance greater than 1%
important_features = {feature: importance for feature, importance in feature_importance.items() if importance > 0.01}

# Sort features by importance in descending order
sorted_important_features = sorted(important_features.items(), key=lambda x: x[1], reverse=True)

# Extract the feature names (keys) into a list
important_feature_names = [feature for feature, importance in sorted_important_features]

# Print the sorted important features (optional)
print("Features with more than 1% contribution:")
for feature in sorted_important_features:
    print(f"{feature[0]}: {feature[1] * 100:.2f}%")

# The list of important features that you can use to create a new dataframe
print("List of important features:")
print(important_feature_names)


In [None]:
# Columns kept for the reduced 3 month model: the identifier and price columns
# ('Symbol', 'Date', 'Close') followed by the selected indicator columns.
important_features = [
    'Symbol', 'Date', 'Close',
    'Fib_30_Low_Min', '30_day_Fib_38', '5_day-Fib_61', '5_day-Fib_23',
    'EMA_5', 'Volume', 'EMA_12_MACD', 'High', 'Low',
    '30_day_Fib_61', 'ATR_Prev_Close', 'Fib_5_Low_Min',
]

# Keep only those columns, in the order listed above
df_important_feat_3_month = df_stock_data_3_month.loc[:, important_features]
df_important_feat_3_month.head()

In [None]:
# 3 month prediction with only important features
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort values by 'Symbol' and 'Date' to maintain time order
df_important_feat_3_month = df_important_feat_3_month.sort_values(by=['Symbol', 'Date'])

# Training rows: Date before or equal to November 10, 2023.
# .copy() makes each split an independent frame, so adding 'Close_Target' and
# cleaning the values below does not write into a slice of df_important_feat_3_month
# (which is what triggers pandas' SettingWithCopyWarning).
df_stock_data_train_3_month_if = df_important_feat_3_month[df_important_feat_3_month['Date'] <= '2023-11-10'].copy()

# Testing rows: Date after February 10, 2024
df_stock_data_test_3_month_if = df_important_feat_3_month[df_important_feat_3_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_3_month_if.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Shift 'Close' to predict 60 trading days ahead (3 months ahead).
# The shift is done per symbol so a target never comes from a different stock.
df_stock_data_train_3_month_if['Close_Target'] = df_stock_data_train_3_month_if.groupby('Symbol')['Close'].shift(-60)
df_stock_data_test_3_month_if['Close_Target'] = df_stock_data_test_3_month_if.groupby('Symbol')['Close'].shift(-60)

# Drop rows where 'Close_Target' is NaN (the last 60 rows of each symbol in each split)
df_stock_data_train_3_month_if = df_stock_data_train_3_month_if.dropna(subset=['Close_Target'])
df_stock_data_test_3_month_if = df_stock_data_test_3_month_if.dropna(subset=['Close_Target'])

# Replace infinite values with NaN
df_stock_data_train_3_month_if = df_stock_data_train_3_month_if.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_3_month_if = df_stock_data_test_3_month_if.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with the median of each numeric column.
# The medians are computed on the training split only and reused for the test split,
# so the test rows are imputed with the same values the model was fitted on and no
# statistic is derived from the held-out data.
numeric_cols_train = df_stock_data_train_3_month_if.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_3_month_if.select_dtypes(include=[np.number]).columns

train_medians_3_month_if = df_stock_data_train_3_month_if[numeric_cols_train].median()
df_stock_data_train_3_month_if[numeric_cols_train] = df_stock_data_train_3_month_if[numeric_cols_train].fillna(train_medians_3_month_if)
df_stock_data_test_3_month_if[numeric_cols_test] = df_stock_data_test_3_month_if[numeric_cols_test].fillna(train_medians_3_month_if)

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_3_month_if.shape}")
print(f"Testing data shape: {df_stock_data_test_3_month_if.shape}")

# Create X (features) and y (target) for training.
# 'Close' and 'Close_Target' are price columns, 'Symbol' and 'Date' are identifiers;
# none of them are used as features.
X_train_3_month_if = df_stock_data_train_3_month_if.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_3_month_if = df_stock_data_train_3_month_if['Close_Target']

# Create X and y for testing
X_test_3_month_if = df_stock_data_test_3_month_if.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_3_month_if = df_stock_data_test_3_month_if['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_3_month_if shape: {X_train_3_month_if.shape}, y_train_3_month_if shape: {y_train_3_month_if.shape}")
print(f"X_test_3_month_if shape: {X_test_3_month_if.shape}, y_test_3_month_if shape: {y_test_3_month_if.shape}")

# Ensure there are samples in both training and testing sets
if X_train_3_month_if.shape[0] == 0 or X_test_3_month_if.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model (baseline hyper-parameters: learning_rate=0.05, max_depth=5)
model_baseline_if_3_month = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    alpha=0,
    reg_lambda=1,  # Fixed parameter name
    objective='reg:squarederror',
    missing=np.nan  # Ensure missing values are handled correctly
)

# Train the model on the training data
model_baseline_if_3_month.fit(X_train_3_month_if, y_train_3_month_if)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_3_month_if = model_baseline_if_3_month.predict(X_test_3_month_if)

# Calculate performance on the test data
mse_test_3_month_if = mean_squared_error(y_test_3_month_if, y_pred_3_month_if)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_3_month_if}')


In [None]:
# metrics and feature importance on unseen data
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np


# Evaluation of the held-out predictions: `y_test_3_month_if` holds the true
# targets and `y_pred_3_month_if` the model's predictions for the same rows.

# Squared-error metrics: MSE first, RMSE is its square root
mse_3_month_if = mean_squared_error(y_test_3_month_if, y_pred_3_month_if)
rmse_3_month_if = np.sqrt(mse_3_month_if)

# Mean absolute error and R-squared
mae_3_month_if = mean_absolute_error(y_test_3_month_if, y_pred_3_month_if)
r2_3_month_if = r2_score(y_test_3_month_if, y_pred_3_month_if)

# Report the core metrics
print(f'Mean Squared Error on unseen data: {mse_3_month_if}')
print(f'Mean Absolute Error on unseen data: {mae_3_month_if}')
print(f'Root Mean Squared Error on unseen data: {rmse_3_month_if}')
print(f'R-squared on unseen data: {r2_3_month_if}')

# Median absolute error
medae_3_month_if = median_absolute_error(y_test_3_month_if, y_pred_3_month_if)
print(f'Median Absolute Error on unseen data: {medae_3_month_if}')

# Residuals (actual minus predicted) feed both the Durbin-Watson statistic and the MAPE
residuals_3_month_if = y_test_3_month_if - y_pred_3_month_if

dw_stat_3_month_if = durbin_watson(residuals_3_month_if)
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_3_month_if}')

# MAPE: mean of |residual / actual|, expressed as a percentage
mape_3_month_if = np.mean(np.abs(residuals_3_month_if / y_test_3_month_if)) * 100
print(f'MAPE on unseen data: {mape_3_month_if:.2f}%')

# Map each training column to the importance score of the fitted model
feature_importance_3_month_if = {
    column: score
    for column, score in zip(X_train_3_month_if.columns, model_baseline_if_3_month.feature_importances_)
}

# Rank the (feature, importance) pairs from most to least important
sorted_features_3_month_if = list(feature_importance_3_month_if.items())
sorted_features_3_month_if.sort(key=lambda pair: pair[1], reverse=True)

# Print each importance as a percentage
for feature, importance in sorted_features_3_month_if:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# learning rate = 0.1
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort by 'Symbol' and 'Date' so each symbol's rows are in chronological order
df_stock_data_3_month = df_stock_data_3_month.sort_values(by=['Symbol', 'Date'])

# Training window: rows dated on or before November 10, 2023.
# .copy() makes the split an independent frame, so adding 'Close_Target' below
# does not raise pandas' SettingWithCopyWarning.
df_stock_data_train_3_month_lr_1 = df_stock_data_3_month[df_stock_data_3_month['Date'] <= '2023-11-10'].copy()

# Test window: rows dated after February 10, 2024
df_stock_data_test_3_month_lr_1 = df_stock_data_3_month[df_stock_data_3_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_3_month_lr_1.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target: the 'Close' value 60 rows ahead within the same symbol
# (roughly 3 months, assuming one row per trading day). The shift is applied
# after the split, so a target never comes from outside its own window.
df_stock_data_train_3_month_lr_1['Close_Target'] = df_stock_data_train_3_month_lr_1.groupby('Symbol')['Close'].shift(-60)
df_stock_data_test_3_month_lr_1['Close_Target'] = df_stock_data_test_3_month_lr_1.groupby('Symbol')['Close'].shift(-60)

# Drop rows without a target (the final 60 rows of each symbol after the shift)
df_stock_data_train_3_month_lr_1 = df_stock_data_train_3_month_lr_1.dropna(subset=['Close_Target'])
df_stock_data_test_3_month_lr_1 = df_stock_data_test_3_month_lr_1.dropna(subset=['Close_Target'])

# Replace infinite values with NaN (reassigned rather than modified in place)
df_stock_data_train_3_month_lr_1 = df_stock_data_train_3_month_lr_1.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_3_month_lr_1 = df_stock_data_test_3_month_lr_1.replace([np.inf, -np.inf], np.nan)

# Fill remaining NaNs in every numeric column with that column's median.
# NOTE(review): the test frame is imputed with its own medians; consider
# reusing the training medians so no test-set statistic is involved.
numeric_cols_train = df_stock_data_train_3_month_lr_1.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_3_month_lr_1.select_dtypes(include=[np.number]).columns

df_stock_data_train_3_month_lr_1[numeric_cols_train] = df_stock_data_train_3_month_lr_1[numeric_cols_train].fillna(df_stock_data_train_3_month_lr_1[numeric_cols_train].median())
df_stock_data_test_3_month_lr_1[numeric_cols_test] = df_stock_data_test_3_month_lr_1[numeric_cols_test].fillna(df_stock_data_test_3_month_lr_1[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_3_month_lr_1.shape}")
print(f"Testing data shape: {df_stock_data_test_3_month_lr_1.shape}")

# Features are every column except the current price, the target and the identifiers
X_train_3_month_lr_1 = df_stock_data_train_3_month_lr_1.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_3_month_lr_1 = df_stock_data_train_3_month_lr_1['Close_Target']

# Same feature/target split for the test window
X_test_3_month_lr_1 = df_stock_data_test_3_month_lr_1.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_3_month_lr_1 = df_stock_data_test_3_month_lr_1['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_3_month_lr_1 shape: {X_train_3_month_lr_1.shape}, y_train_3_month_lr_1 shape: {y_train_3_month_lr_1.shape}")
print(f"X_test_3_month_lr_1 shape: {X_test_3_month_lr_1.shape}, y_test_3_month_lr_1 shape: {y_test_3_month_lr_1.shape}")

# Ensure there are samples in both training and testing sets
if X_train_3_month_lr_1.shape[0] == 0 or X_test_3_month_lr_1.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# XGBoost regressor for this experiment: learning_rate=0.1, max_depth=5
model_3_month_tf_1 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
    missing=np.nan  # Treat NaN as the missing-value marker
)

# Train the model on the training data
model_3_month_tf_1.fit(X_train_3_month_lr_1, y_train_3_month_lr_1)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_3_month_lr_1 = model_3_month_tf_1.predict(X_test_3_month_lr_1)

# Calculate performance on the test data
mse_test_3_month_lr_1 = mean_squared_error(y_test_3_month_lr_1, y_pred_3_month_lr_1)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_3_month_lr_1}')


In [None]:
# metrics and feature importance on unseen data
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np


# Evaluation of the held-out predictions: `y_test_3_month_lr_1` holds the true
# targets and `y_pred_3_month_lr_1` the model's predictions for the same rows.

# Squared-error metrics: MSE first, RMSE is its square root
mse_3_month_lr_1 = mean_squared_error(y_test_3_month_lr_1, y_pred_3_month_lr_1)
rmse_3_month_lr_1 = np.sqrt(mse_3_month_lr_1)

# Mean absolute error and R-squared
mae_3_month_lr_1 = mean_absolute_error(y_test_3_month_lr_1, y_pred_3_month_lr_1)
r2_3_month_lr_1 = r2_score(y_test_3_month_lr_1, y_pred_3_month_lr_1)

# Report the core metrics
print(f'Mean Squared Error on unseen data: {mse_3_month_lr_1}')
print(f'Mean Absolute Error on unseen data: {mae_3_month_lr_1}')
print(f'Root Mean Squared Error on unseen data: {rmse_3_month_lr_1}')
print(f'R-squared on unseen data: {r2_3_month_lr_1}')

# Median absolute error
medae_3_month_lr_1 = median_absolute_error(y_test_3_month_lr_1, y_pred_3_month_lr_1)
print(f'Median Absolute Error on unseen data: {medae_3_month_lr_1}')

# Residuals (actual minus predicted) feed both the Durbin-Watson statistic and the MAPE
residuals_3_month_lr_1 = y_test_3_month_lr_1 - y_pred_3_month_lr_1

dw_stat_3_month_lr_1 = durbin_watson(residuals_3_month_lr_1)
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_3_month_lr_1}')

# MAPE: mean of |residual / actual|, expressed as a percentage
mape_3_month_lr_1 = np.mean(np.abs(residuals_3_month_lr_1 / y_test_3_month_lr_1)) * 100
print(f'MAPE on unseen data: {mape_3_month_lr_1:.2f}%')

# Map each training column to the importance score of the fitted model
feature_importance_3_month_lr_1 = {
    column: score
    for column, score in zip(X_train_3_month_lr_1.columns, model_3_month_tf_1.feature_importances_)
}

# Rank the (feature, importance) pairs from most to least important
sorted_features_3_month_lr_1 = list(feature_importance_3_month_lr_1.items())
sorted_features_3_month_lr_1.sort(key=lambda pair: pair[1], reverse=True)

# Print each importance as a percentage
for feature, importance in sorted_features_3_month_lr_1:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# learning rate = 0.01
# this is the best one again
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Sort by 'Symbol' and 'Date' so each symbol's rows are in chronological order
df_stock_data_3_month = df_stock_data_3_month.sort_values(by=['Symbol', 'Date'])

# Training window: rows dated on or before November 10, 2023.
# .copy() makes the split an independent frame, so adding 'Close_Target' below
# does not raise pandas' SettingWithCopyWarning.
df_stock_data_train_3_month_lr_01 = df_stock_data_3_month[df_stock_data_3_month['Date'] <= '2023-11-10'].copy()

# Test window: rows dated after February 10, 2024
df_stock_data_test_3_month_lr_01 = df_stock_data_3_month[df_stock_data_3_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_3_month_lr_01.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target: the 'Close' value 60 rows ahead within the same symbol
# (roughly 3 months, assuming one row per trading day). The shift is applied
# after the split, so a target never comes from outside its own window.
df_stock_data_train_3_month_lr_01['Close_Target'] = df_stock_data_train_3_month_lr_01.groupby('Symbol')['Close'].shift(-60)
df_stock_data_test_3_month_lr_01['Close_Target'] = df_stock_data_test_3_month_lr_01.groupby('Symbol')['Close'].shift(-60)

# Drop rows without a target (the final 60 rows of each symbol after the shift)
df_stock_data_train_3_month_lr_01 = df_stock_data_train_3_month_lr_01.dropna(subset=['Close_Target'])
df_stock_data_test_3_month_lr_01 = df_stock_data_test_3_month_lr_01.dropna(subset=['Close_Target'])

# Replace infinite values with NaN (reassigned rather than modified in place)
df_stock_data_train_3_month_lr_01 = df_stock_data_train_3_month_lr_01.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_3_month_lr_01 = df_stock_data_test_3_month_lr_01.replace([np.inf, -np.inf], np.nan)

# Fill remaining NaNs in every numeric column with that column's median.
# NOTE(review): the test frame is imputed with its own medians; consider
# reusing the training medians so no test-set statistic is involved.
numeric_cols_train = df_stock_data_train_3_month_lr_01.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_3_month_lr_01.select_dtypes(include=[np.number]).columns

df_stock_data_train_3_month_lr_01[numeric_cols_train] = df_stock_data_train_3_month_lr_01[numeric_cols_train].fillna(df_stock_data_train_3_month_lr_01[numeric_cols_train].median())
df_stock_data_test_3_month_lr_01[numeric_cols_test] = df_stock_data_test_3_month_lr_01[numeric_cols_test].fillna(df_stock_data_test_3_month_lr_01[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_3_month_lr_01.shape}")
print(f"Testing data shape: {df_stock_data_test_3_month_lr_01.shape}")

# Features are every column except the current price, the target and the identifiers
X_train_3_month_lr_01 = df_stock_data_train_3_month_lr_01.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_3_month_lr_01 = df_stock_data_train_3_month_lr_01['Close_Target']

# Same feature/target split for the test window
X_test_3_month_lr_01 = df_stock_data_test_3_month_lr_01.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_3_month_lr_01 = df_stock_data_test_3_month_lr_01['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_3_month_lr_01 shape: {X_train_3_month_lr_01.shape}, y_train_3_month_lr_01 shape: {y_train_3_month_lr_01.shape}")
print(f"X_test_3_month_lr_01 shape: {X_test_3_month_lr_01.shape}, y_test_3_month_lr_01 shape: {y_test_3_month_lr_01.shape}")

# Ensure there are samples in both training and testing sets
if X_train_3_month_lr_01.shape[0] == 0 or X_test_3_month_lr_01.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# XGBoost regressor for this experiment: learning_rate=0.01, max_depth=5
model_3_month_tf_01 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
    missing=np.nan  # Treat NaN as the missing-value marker
)

# Train the model on the training data
model_3_month_tf_01.fit(X_train_3_month_lr_01, y_train_3_month_lr_01)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_3_month_lr_01 = model_3_month_tf_01.predict(X_test_3_month_lr_01)

# Calculate performance on the test data
mse_test_3_month_lr_01 = mean_squared_error(y_test_3_month_lr_01, y_pred_3_month_lr_01)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_3_month_lr_01}')


In [None]:
# metrics and feature importance on unseen data
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np


# Evaluation of the held-out predictions: `y_test_3_month_lr_01` holds the true
# targets and `y_pred_3_month_lr_01` the model's predictions for the same rows.

# Squared-error metrics: MSE first, RMSE is its square root
mse_3_month_lr_01 = mean_squared_error(y_test_3_month_lr_01, y_pred_3_month_lr_01)
rmse_3_month_lr_01 = np.sqrt(mse_3_month_lr_01)

# Mean absolute error and R-squared
mae_3_month_lr_01 = mean_absolute_error(y_test_3_month_lr_01, y_pred_3_month_lr_01)
r2_3_month_lr_01 = r2_score(y_test_3_month_lr_01, y_pred_3_month_lr_01)

# Report the core metrics
print(f'Mean Squared Error on unseen data: {mse_3_month_lr_01}')
print(f'Mean Absolute Error on unseen data: {mae_3_month_lr_01}')
print(f'Root Mean Squared Error on unseen data: {rmse_3_month_lr_01}')
print(f'R-squared on unseen data: {r2_3_month_lr_01}')

# Median absolute error
medae_3_month_lr_01 = median_absolute_error(y_test_3_month_lr_01, y_pred_3_month_lr_01)
print(f'Median Absolute Error on unseen data: {medae_3_month_lr_01}')

# Residuals (actual minus predicted) feed both the Durbin-Watson statistic and the MAPE
residuals_3_month_lr_01 = y_test_3_month_lr_01 - y_pred_3_month_lr_01

dw_stat_3_month_lr_01 = durbin_watson(residuals_3_month_lr_01)
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_3_month_lr_01}')

# MAPE: mean of |residual / actual|, expressed as a percentage
mape_3_month_lr_01 = np.mean(np.abs(residuals_3_month_lr_01 / y_test_3_month_lr_01)) * 100
print(f'MAPE on unseen data: {mape_3_month_lr_01:.2f}%')

# Map each training column to the importance score of the fitted model
feature_importance_3_month_lr_01 = {
    column: score
    for column, score in zip(X_train_3_month_lr_01.columns, model_3_month_tf_01.feature_importances_)
}

# Rank the (feature, importance) pairs from most to least important
sorted_features_3_month_lr_01 = list(feature_importance_3_month_lr_01.items())
sorted_features_3_month_lr_01.sort(key=lambda pair: pair[1], reverse=True)

# Print each importance as a percentage
for feature, importance in sorted_features_3_month_lr_01:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# learning rate = 0.01
# max depth 3
# this is actually the best one for 3 months now
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Sort by 'Symbol' and 'Date' so each symbol's rows are in chronological order
df_stock_data_3_month = df_stock_data_3_month.sort_values(by=['Symbol', 'Date'])

# Training window: rows dated on or before November 10, 2023.
# .copy() makes the split an independent frame, so adding 'Close_Target' below
# does not raise pandas' SettingWithCopyWarning.
df_stock_data_train_3_month_md_3 = df_stock_data_3_month[df_stock_data_3_month['Date'] <= '2023-11-10'].copy()

# Test window: rows dated after February 10, 2024
df_stock_data_test_3_month_md_3 = df_stock_data_3_month[df_stock_data_3_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_3_month_md_3.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target: the 'Close' value 60 rows ahead within the same symbol
# (roughly 3 months, assuming one row per trading day). The shift is applied
# after the split, so a target never comes from outside its own window.
df_stock_data_train_3_month_md_3['Close_Target'] = df_stock_data_train_3_month_md_3.groupby('Symbol')['Close'].shift(-60)
df_stock_data_test_3_month_md_3['Close_Target'] = df_stock_data_test_3_month_md_3.groupby('Symbol')['Close'].shift(-60)

# Drop rows without a target (the final 60 rows of each symbol after the shift)
df_stock_data_train_3_month_md_3 = df_stock_data_train_3_month_md_3.dropna(subset=['Close_Target'])
df_stock_data_test_3_month_md_3 = df_stock_data_test_3_month_md_3.dropna(subset=['Close_Target'])

# Replace infinite values with NaN (reassigned rather than modified in place)
df_stock_data_train_3_month_md_3 = df_stock_data_train_3_month_md_3.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_3_month_md_3 = df_stock_data_test_3_month_md_3.replace([np.inf, -np.inf], np.nan)

# Fill remaining NaNs in every numeric column with that column's median.
# NOTE(review): the test frame is imputed with its own medians; consider
# reusing the training medians so no test-set statistic is involved.
numeric_cols_train = df_stock_data_train_3_month_md_3.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_3_month_md_3.select_dtypes(include=[np.number]).columns

df_stock_data_train_3_month_md_3[numeric_cols_train] = df_stock_data_train_3_month_md_3[numeric_cols_train].fillna(df_stock_data_train_3_month_md_3[numeric_cols_train].median())
df_stock_data_test_3_month_md_3[numeric_cols_test] = df_stock_data_test_3_month_md_3[numeric_cols_test].fillna(df_stock_data_test_3_month_md_3[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_3_month_md_3.shape}")
print(f"Testing data shape: {df_stock_data_test_3_month_md_3.shape}")

# Features are every column except the current price, the target and the identifiers
X_train_3_month_md_3 = df_stock_data_train_3_month_md_3.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_3_month_md_3 = df_stock_data_train_3_month_md_3['Close_Target']

# Same feature/target split for the test window
X_test_3_month_md_3 = df_stock_data_test_3_month_md_3.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_3_month_md_3 = df_stock_data_test_3_month_md_3['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_3_month_md_3 shape: {X_train_3_month_md_3.shape}, y_train_3_month_md_3 shape: {y_train_3_month_md_3.shape}")
print(f"X_test_3_month_md_3 shape: {X_test_3_month_md_3.shape}, y_test_3_month_md_3 shape: {y_test_3_month_md_3.shape}")

# Ensure there are samples in both training and testing sets
if X_train_3_month_md_3.shape[0] == 0 or X_test_3_month_md_3.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# XGBoost regressor for this experiment: learning_rate=0.01, max_depth=3
model_3_month_md_3 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=3,
    alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
    missing=np.nan  # Treat NaN as the missing-value marker
)

# Train the model on the training data
model_3_month_md_3.fit(X_train_3_month_md_3, y_train_3_month_md_3)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_3_month_md_3 = model_3_month_md_3.predict(X_test_3_month_md_3)

# Calculate performance on the test data
mse_test_3_month_md_3 = mean_squared_error(y_test_3_month_md_3, y_pred_3_month_md_3)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_3_month_md_3}')


In [None]:
# metrics and feature importance on unseen data
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# Evaluation of the held-out predictions: `y_test_3_month_md_3` holds the true
# targets and `y_pred_3_month_md_3` the model's predictions for the same rows.

# Squared-error metrics: MSE first, RMSE is its square root
mse_3_month_md_3 = mean_squared_error(y_test_3_month_md_3, y_pred_3_month_md_3)
rmse_3_month_md_3 = np.sqrt(mse_3_month_md_3)

# Mean absolute error and R-squared
mae_3_month_md_3 = mean_absolute_error(y_test_3_month_md_3, y_pred_3_month_md_3)
r2_3_month_md_3 = r2_score(y_test_3_month_md_3, y_pred_3_month_md_3)

# Report the core metrics
print(f'Mean Squared Error on unseen data: {mse_3_month_md_3}')
print(f'Mean Absolute Error on unseen data: {mae_3_month_md_3}')
print(f'Root Mean Squared Error on unseen data: {rmse_3_month_md_3}')
print(f'R-squared on unseen data: {r2_3_month_md_3}')

# Median absolute error
medae_3_month_md_3 = median_absolute_error(y_test_3_month_md_3, y_pred_3_month_md_3)
print(f'Median Absolute Error on unseen data: {medae_3_month_md_3}')

# Residuals (actual minus predicted) feed both the Durbin-Watson statistic and the MAPE
residuals_3_month_md_3 = y_test_3_month_md_3 - y_pred_3_month_md_3

dw_stat_3_month_md_3 = durbin_watson(residuals_3_month_md_3)
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_3_month_md_3}')

# MAPE: mean of |residual / actual|, expressed as a percentage
mape_3_month_md_3 = np.mean(np.abs(residuals_3_month_md_3 / y_test_3_month_md_3)) * 100
print(f'MAPE on unseen data: {mape_3_month_md_3:.2f}%')

# Map each training column to the importance score of the fitted model
feature_importance_3_month_md_3 = {
    column: score
    for column, score in zip(X_train_3_month_md_3.columns, model_3_month_md_3.feature_importances_)
}

# Rank the (feature, importance) pairs from most to least important
sorted_features_3_month_md_3 = list(feature_importance_3_month_md_3.items())
sorted_features_3_month_md_3.sort(key=lambda pair: pair[1], reverse=True)

# Print each importance as a percentage
for feature, importance in sorted_features_3_month_md_3:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# learning rate = 0.01
# max depth 7
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Sort by 'Symbol' and 'Date' so each symbol's rows are in chronological order
df_stock_data_3_month = df_stock_data_3_month.sort_values(by=['Symbol', 'Date'])

# Training window: rows dated on or before November 10, 2023.
# .copy() makes the split an independent frame, so adding 'Close_Target' below
# does not raise pandas' SettingWithCopyWarning.
df_stock_data_train_3_month_md_7 = df_stock_data_3_month[df_stock_data_3_month['Date'] <= '2023-11-10'].copy()

# Test window: rows dated after February 10, 2024
df_stock_data_test_3_month_md_7 = df_stock_data_3_month[df_stock_data_3_month['Date'] > '2024-02-10'].copy()

# Check if the test set is empty
if df_stock_data_test_3_month_md_7.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target: the 'Close' value 60 rows ahead within the same symbol
# (roughly 3 months, assuming one row per trading day). The shift is applied
# after the split, so a target never comes from outside its own window.
df_stock_data_train_3_month_md_7['Close_Target'] = df_stock_data_train_3_month_md_7.groupby('Symbol')['Close'].shift(-60)
df_stock_data_test_3_month_md_7['Close_Target'] = df_stock_data_test_3_month_md_7.groupby('Symbol')['Close'].shift(-60)

# Drop rows without a target (the final 60 rows of each symbol after the shift)
df_stock_data_train_3_month_md_7 = df_stock_data_train_3_month_md_7.dropna(subset=['Close_Target'])
df_stock_data_test_3_month_md_7 = df_stock_data_test_3_month_md_7.dropna(subset=['Close_Target'])

# Replace infinite values with NaN (reassigned rather than modified in place)
df_stock_data_train_3_month_md_7 = df_stock_data_train_3_month_md_7.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_3_month_md_7 = df_stock_data_test_3_month_md_7.replace([np.inf, -np.inf], np.nan)

# Fill remaining NaNs in every numeric column with that column's median.
# NOTE(review): the test frame is imputed with its own medians; consider
# reusing the training medians so no test-set statistic is involved.
numeric_cols_train = df_stock_data_train_3_month_md_7.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_3_month_md_7.select_dtypes(include=[np.number]).columns

df_stock_data_train_3_month_md_7[numeric_cols_train] = df_stock_data_train_3_month_md_7[numeric_cols_train].fillna(df_stock_data_train_3_month_md_7[numeric_cols_train].median())
df_stock_data_test_3_month_md_7[numeric_cols_test] = df_stock_data_test_3_month_md_7[numeric_cols_test].fillna(df_stock_data_test_3_month_md_7[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_3_month_md_7.shape}")
print(f"Testing data shape: {df_stock_data_test_3_month_md_7.shape}")

# Features are every column except the current price, the target and the identifiers
X_train_3_month_md_7 = df_stock_data_train_3_month_md_7.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_3_month_md_7 = df_stock_data_train_3_month_md_7['Close_Target']

# Same feature/target split for the test window
X_test_3_month_md_7 = df_stock_data_test_3_month_md_7.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_3_month_md_7 = df_stock_data_test_3_month_md_7['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_3_month_md_7 shape: {X_train_3_month_md_7.shape}, y_train_3_month_md_7 shape: {y_train_3_month_md_7.shape}")
print(f"X_test_3_month_md_7 shape: {X_test_3_month_md_7.shape}, y_test_3_month_md_7 shape: {y_test_3_month_md_7.shape}")

# Ensure there are samples in both training and testing sets
if X_train_3_month_md_7.shape[0] == 0 or X_test_3_month_md_7.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# XGBoost regressor for this experiment: learning_rate=0.01, max_depth=7
model_3_month_md_7 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=7,
    alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
    missing=np.nan  # Treat NaN as the missing-value marker
)

# Train the model on the training data
model_3_month_md_7.fit(X_train_3_month_md_7, y_train_3_month_md_7)

# Make predictions on the unseen test data (post-February 10, 2024)
y_pred_3_month_md_7 = model_3_month_md_7.predict(X_test_3_month_md_7)

# Calculate performance on the test data
mse_test_3_month_md_7 = mean_squared_error(y_test_3_month_md_7, y_pred_3_month_md_7)
print(f'Mean Squared Error on unseen data (post-February 10, 2024): {mse_test_3_month_md_7}')


In [None]:
# metrics and feature importance on unseen data
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# Evaluation of the held-out predictions: `y_test_3_month_md_7` holds the true
# targets and `y_pred_3_month_md_7` the model's predictions for the same rows.

# Squared-error metrics: MSE first, RMSE is its square root
mse_3_month_md_7 = mean_squared_error(y_test_3_month_md_7, y_pred_3_month_md_7)
rmse_3_month_md_7 = np.sqrt(mse_3_month_md_7)

# Mean absolute error and R-squared
mae_3_month_md_7 = mean_absolute_error(y_test_3_month_md_7, y_pred_3_month_md_7)
r2_3_month_md_7 = r2_score(y_test_3_month_md_7, y_pred_3_month_md_7)

# Report the core metrics
print(f'Mean Squared Error on unseen data: {mse_3_month_md_7}')
print(f'Mean Absolute Error on unseen data: {mae_3_month_md_7}')
print(f'Root Mean Squared Error on unseen data: {rmse_3_month_md_7}')
print(f'R-squared on unseen data: {r2_3_month_md_7}')

# Median absolute error
medae_3_month_md_7 = median_absolute_error(y_test_3_month_md_7, y_pred_3_month_md_7)
print(f'Median Absolute Error on unseen data: {medae_3_month_md_7}')

# Residuals (actual minus predicted) feed both the Durbin-Watson statistic and the MAPE
residuals_3_month_md_7 = y_test_3_month_md_7 - y_pred_3_month_md_7

dw_stat_3_month_md_7 = durbin_watson(residuals_3_month_md_7)
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_3_month_md_7}')

# MAPE: mean of |residual / actual|, expressed as a percentage
mape_3_month_md_7 = np.mean(np.abs(residuals_3_month_md_7 / y_test_3_month_md_7)) * 100
print(f'MAPE on unseen data: {mape_3_month_md_7:.2f}%')

# Map each training column to the importance score of the fitted model
feature_importance_3_month_md_7 = {
    column: score
    for column, score in zip(X_train_3_month_md_7.columns, model_3_month_md_7.feature_importances_)
}

# Rank the (feature, importance) pairs from most to least important
sorted_features_3_month_md_7 = list(feature_importance_3_month_md_7.items())
sorted_features_3_month_md_7.sort(key=lambda pair: pair[1], reverse=True)

# Print each importance as a percentage
for feature, importance in sorted_features_3_month_md_7:
    print(f"{feature}: {importance * 100:.2f}%")



Best 3-month model: learning_rate = 0.01 and max_depth = 3

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Take the five highest-ranked (feature, importance) pairs of the
# learning_rate=0.01 / max_depth=3 model and split them into parallel arrays
top_five_pairs = sorted_features_3_month_md_3[:5]
features = np.array([pair[0] for pair in top_five_pairs])  # feature names
importances = np.array([pair[1] for pair in top_five_pairs])  # importance scores
importance_pct = importances * 100  # same scores expressed as percentages

# Horizontal bar chart: one bar per feature
plt.figure(figsize=(13, 8))
ax = sns.barplot(x=importance_pct, y=features, palette="viridis")

# Write each bar's value just past its right end
for bar_row, pct in enumerate(importance_pct):
    ax.text(pct + 0.01, bar_row, f"{pct:.2f}%", va="center", fontsize=16)

# Show x-axis ticks as whole percentages
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda tick, _pos: f"{tick:.0f}%"))

# Leave room on the right so the value labels are not clipped
plt.xlim(0, max(importance_pct) + 3)

# Axis titles and chart title
ax.set_xlabel("Feature Importance (%)", fontsize=18, fontweight='bold')
ax.set_ylabel("Important TA Indicators", fontsize=18, fontweight='bold')
ax.set_title("Best 3 Month Prediction Model: Top 5 Most Important Features", fontsize=18, fontweight='bold')

# Larger tick labels on both axes (feature names on y, percentages on x)
ax.set_yticklabels(features, fontsize=14)
plt.xticks(fontsize=14)

# Render the figure
plt.show()


In [None]:
# Start the 6 month feature set from an independent (deep) copy of
# 'df_stocks_price_ta', so the larger lag features added next do not
# touch the original frame.
df_stock_data_6_month = df_stocks_price_ta.copy(deep=True)


In [None]:
# Columns to create lags for (price, volume and moving-average indicators)
columns_to_lag = ['Close', 'SMA_5', 'EMA_5', 'Volume', 'SMA_20',
       'SMA_50', 'EMA_20', 'EMA_50', 'EMA_12_MACD',
       'EMA_26_MACD']

# Lag sizes in rows (one row per trading day per symbol is assumed), reaching
# far enough back to serve the 6 month prediction model
lags = [1, 3, 5, 7, 10, 15, 20, 25, 30, 40, 50, 60, 75, 90, 180]

# A lag is only meaningful inside one symbol's own history, so order every
# symbol's rows chronologically and shift within each symbol. A frame-wide
# shift would fill a symbol's first rows with values from whichever symbol
# happens to sit above it.
df_stock_data_6_month = df_stock_data_6_month.sort_values(by=['Symbol', 'Date'])
rows_by_symbol = df_stock_data_6_month.groupby('Symbol')

# Build every lag column first and attach them with a single concat; inserting
# this many columns one at a time fragments the frame and triggers pandas'
# PerformanceWarning.
lag_features = pd.DataFrame({
    f'{col}_lag_{lag}': rows_by_symbol[col].shift(lag)
    for col in columns_to_lag
    for lag in lags
})

# Remove lag columns left over from an earlier run of this cell so that
# re-running replaces them instead of duplicating them
df_stock_data_6_month = pd.concat(
    [df_stock_data_6_month.drop(columns=lag_features.columns, errors='ignore'), lag_features],
    axis=1,
)

# NaNs produced by the shift are kept to maintain continuity (XGBoost can
# handle NaNs); missing values can be dealt with at modelling time if needed
df_stock_data_6_month.head()

In [None]:
# 6 month prediction model: baseline (learning_rate = 0.05, max_depth = 5)
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Split boundaries and forecast horizon for the 6 month models
train_end_6_month = '2023-07-10'   # training rows have Date <= this value
test_start_6_month = '2024-01-10'  # testing rows have Date > this value
horizon_6_month = 120              # rows to look ahead, per symbol, when building the target

# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_6_month = df_stock_data_6_month.sort_values(by=['Symbol', 'Date'])

# Training window: every row dated on or before train_end_6_month.
# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
df_stock_data_train_6_month_baseline = df_stock_data_6_month[df_stock_data_6_month['Date'] <= train_end_6_month].copy()

# Testing window: every row dated strictly after test_start_6_month.
# Rows dated between the two boundaries are used by neither set.
df_stock_data_test_6_month_baseline = df_stock_data_6_month[df_stock_data_6_month['Date'] > test_start_6_month].copy()

# Check if the test set is empty
if df_stock_data_test_6_month_baseline.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target = the same symbol's 'Close' horizon_6_month rows later.
# The shift is done inside each split, so a target never comes from the other split.
df_stock_data_train_6_month_baseline['Close_Target'] = df_stock_data_train_6_month_baseline.groupby('Symbol')['Close'].shift(-horizon_6_month)
df_stock_data_test_6_month_baseline['Close_Target'] = df_stock_data_test_6_month_baseline.groupby('Symbol')['Close'].shift(-horizon_6_month)

# Drop rows without a target (the last horizon_6_month rows of each symbol in each split)
df_stock_data_train_6_month_baseline = df_stock_data_train_6_month_baseline.dropna(subset=['Close_Target'])
df_stock_data_test_6_month_baseline = df_stock_data_test_6_month_baseline.dropna(subset=['Close_Target'])

# Replace infinite values with NaN
df_stock_data_train_6_month_baseline = df_stock_data_train_6_month_baseline.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_6_month_baseline = df_stock_data_test_6_month_baseline.replace([np.inf, -np.inf], np.nan)

# Fill remaining NaN values in numeric columns with that column's median within the same split
# NOTE(review): the test split is imputed with its own medians rather than the
# training medians - confirm this is intended.
numeric_cols_train = df_stock_data_train_6_month_baseline.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_6_month_baseline.select_dtypes(include=[np.number]).columns

df_stock_data_train_6_month_baseline[numeric_cols_train] = df_stock_data_train_6_month_baseline[numeric_cols_train].fillna(df_stock_data_train_6_month_baseline[numeric_cols_train].median())
df_stock_data_test_6_month_baseline[numeric_cols_test] = df_stock_data_test_6_month_baseline[numeric_cols_test].fillna(df_stock_data_test_6_month_baseline[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_6_month_baseline.shape}")
print(f"Testing data shape: {df_stock_data_test_6_month_baseline.shape}")

# Features are every column except the current close, the target and the identifiers
X_train_6_month_baseline = df_stock_data_train_6_month_baseline.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_6_month_baseline = df_stock_data_train_6_month_baseline['Close_Target']

# Create X and y for testing
X_test_6_month_baseline = df_stock_data_test_6_month_baseline.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_6_month_baseline = df_stock_data_test_6_month_baseline['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_6_month_baseline shape: {X_train_6_month_baseline.shape}, y_train_6_month_baseline shape: {y_train_6_month_baseline.shape}")
print(f"X_test_6_month_baseline shape: {X_test_6_month_baseline.shape}, y_test_6_month_baseline shape: {y_test_6_month_baseline.shape}")

# Ensure there are samples in both training and testing sets
# (dropping rows without a target can empty a split that was non-empty above)
if X_train_6_month_baseline.shape[0] == 0 or X_test_6_month_baseline.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model
model_baseline_6_month = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    reg_alpha=0,   # L1 regularisation, spelled with the sklearn-wrapper parameter name
    reg_lambda=1,  # L2 regularisation
    objective='reg:squarederror',
    missing=np.nan  # treat NaN as the missing-value marker
)

# Train the model on the training data
model_baseline_6_month.fit(X_train_6_month_baseline, y_train_6_month_baseline)

# Make predictions on the unseen test data (rows after test_start_6_month)
y_pred_6_month_baseline = model_baseline_6_month.predict(X_test_6_month_baseline)

# Calculate performance on the test data
mse_test_6_month_baseline = mean_squared_error(y_test_6_month_baseline, y_pred_6_month_baseline)
print(f'Mean Squared Error on unseen data (after {test_start_6_month}): {mse_test_6_month_baseline}')



In [None]:
# Evaluate the baseline 6 month model on the held-out test data and rank its features
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# `y_test_6_month_baseline` holds the true targets and `y_pred_6_month_baseline`
# the model's predictions for the same test rows.

# Error metrics (RMSE is derived from the MSE)
mse_6_month_baseline = mean_squared_error(y_test_6_month_baseline, y_pred_6_month_baseline)
rmse_6_month_baseline = np.sqrt(mse_6_month_baseline)
mae_6_month_baseline = mean_absolute_error(y_test_6_month_baseline, y_pred_6_month_baseline)
medae_6_month_baseline = median_absolute_error(y_test_6_month_baseline, y_pred_6_month_baseline)
r2_6_month_baseline = r2_score(y_test_6_month_baseline, y_pred_6_month_baseline)

# Residual-based metrics: Durbin-Watson statistic and mean absolute percentage error
residuals_6_month_baseline = y_test_6_month_baseline - y_pred_6_month_baseline
dw_stat_6_month_baseline = durbin_watson(residuals_6_month_baseline)
mape_6_month_baseline = np.mean(np.abs(residuals_6_month_baseline / y_test_6_month_baseline)) * 100

# Report all metrics for the unseen data
print(f'Mean Squared Error on unseen data: {mse_6_month_baseline}')
print(f'Mean Absolute Error on unseen data: {mae_6_month_baseline}')
print(f'Root Mean Squared Error on unseen data: {rmse_6_month_baseline}')
print(f'R-squared on unseen data: {r2_6_month_baseline}')
print(f'Median Absolute Error on unseen data: {medae_6_month_baseline}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_6_month_baseline}')
print(f'MAPE on unseen data: {mape_6_month_baseline:.2f}%')

# Pair every training column with the importance the fitted model assigned to it
feature_importance_6_month_baseline = {
    column: weight
    for column, weight in zip(X_train_6_month_baseline.columns, model_baseline_6_month.feature_importances_)
}

# Rank features from most to least important
sorted_features_6_month_baseline = sorted(
    feature_importance_6_month_baseline.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print each feature's importance as a percentage
for feature, importance in sorted_features_6_month_baseline:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# 6 month prediction model
# learning rate = 0.1 (max_depth = 5)
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Split boundaries and forecast horizon for the 6 month models
train_end_6_month = '2023-07-10'   # training rows have Date <= this value
test_start_6_month = '2024-01-10'  # testing rows have Date > this value
horizon_6_month = 120              # rows to look ahead, per symbol, when building the target

# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_6_month = df_stock_data_6_month.sort_values(by=['Symbol', 'Date'])

# Training window: every row dated on or before train_end_6_month.
# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
df_stock_data_train_6_month_lr_1 = df_stock_data_6_month[df_stock_data_6_month['Date'] <= train_end_6_month].copy()

# Testing window: every row dated strictly after test_start_6_month.
# Rows dated between the two boundaries are used by neither set.
df_stock_data_test_6_month_lr_1 = df_stock_data_6_month[df_stock_data_6_month['Date'] > test_start_6_month].copy()

# Check if the test set is empty
if df_stock_data_test_6_month_lr_1.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target = the same symbol's 'Close' horizon_6_month rows later.
# The shift is done inside each split, so a target never comes from the other split.
df_stock_data_train_6_month_lr_1['Close_Target'] = df_stock_data_train_6_month_lr_1.groupby('Symbol')['Close'].shift(-horizon_6_month)
df_stock_data_test_6_month_lr_1['Close_Target'] = df_stock_data_test_6_month_lr_1.groupby('Symbol')['Close'].shift(-horizon_6_month)

# Drop rows without a target (the last horizon_6_month rows of each symbol in each split)
df_stock_data_train_6_month_lr_1 = df_stock_data_train_6_month_lr_1.dropna(subset=['Close_Target'])
df_stock_data_test_6_month_lr_1 = df_stock_data_test_6_month_lr_1.dropna(subset=['Close_Target'])

# Replace infinite values with NaN
df_stock_data_train_6_month_lr_1 = df_stock_data_train_6_month_lr_1.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_6_month_lr_1 = df_stock_data_test_6_month_lr_1.replace([np.inf, -np.inf], np.nan)

# Fill remaining NaN values in numeric columns with that column's median within the same split
# NOTE(review): the test split is imputed with its own medians rather than the
# training medians - confirm this is intended.
numeric_cols_train = df_stock_data_train_6_month_lr_1.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_6_month_lr_1.select_dtypes(include=[np.number]).columns

df_stock_data_train_6_month_lr_1[numeric_cols_train] = df_stock_data_train_6_month_lr_1[numeric_cols_train].fillna(df_stock_data_train_6_month_lr_1[numeric_cols_train].median())
df_stock_data_test_6_month_lr_1[numeric_cols_test] = df_stock_data_test_6_month_lr_1[numeric_cols_test].fillna(df_stock_data_test_6_month_lr_1[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_6_month_lr_1.shape}")
print(f"Testing data shape: {df_stock_data_test_6_month_lr_1.shape}")

# Features are every column except the current close, the target and the identifiers
X_train_6_month_lr_1 = df_stock_data_train_6_month_lr_1.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_6_month_lr_1 = df_stock_data_train_6_month_lr_1['Close_Target']

# Create X and y for testing
X_test_6_month_lr_1 = df_stock_data_test_6_month_lr_1.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_6_month_lr_1 = df_stock_data_test_6_month_lr_1['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_6_month_lr_1 shape: {X_train_6_month_lr_1.shape}, y_train_6_month_lr_1 shape: {y_train_6_month_lr_1.shape}")
print(f"X_test_6_month_lr_1 shape: {X_test_6_month_lr_1.shape}, y_test_6_month_lr_1 shape: {y_test_6_month_lr_1.shape}")

# Ensure there are samples in both training and testing sets
# (dropping rows without a target can empty a split that was non-empty above)
if X_train_6_month_lr_1.shape[0] == 0 or X_test_6_month_lr_1.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model
model_baseline_6_month_lr_1 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    reg_alpha=0,   # L1 regularisation, spelled with the sklearn-wrapper parameter name
    reg_lambda=1,  # L2 regularisation
    objective='reg:squarederror',
    missing=np.nan  # treat NaN as the missing-value marker
)

# Train the model on the training data
model_baseline_6_month_lr_1.fit(X_train_6_month_lr_1, y_train_6_month_lr_1)

# Make predictions on the unseen test data (rows after test_start_6_month)
y_pred_6_month_lr_1 = model_baseline_6_month_lr_1.predict(X_test_6_month_lr_1)

# Calculate performance on the test data
mse_test = mean_squared_error(y_test_6_month_lr_1, y_pred_6_month_lr_1)
print(f'Mean Squared Error on unseen data (after {test_start_6_month}): {mse_test}')



In [None]:
# Evaluate the learning_rate = 0.1 model on the held-out test data and rank its features
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# `y_test_6_month_lr_1` holds the true targets and `y_pred_6_month_lr_1`
# the model's predictions for the same test rows.

# Error metrics (RMSE is derived from the MSE)
mse_6_month_lr_1 = mean_squared_error(y_test_6_month_lr_1, y_pred_6_month_lr_1)
rmse_6_month_lr_1 = np.sqrt(mse_6_month_lr_1)
mae_6_month_lr_1 = mean_absolute_error(y_test_6_month_lr_1, y_pred_6_month_lr_1)
medae_6_month_lr_1 = median_absolute_error(y_test_6_month_lr_1, y_pred_6_month_lr_1)
r2_6_month_lr_1 = r2_score(y_test_6_month_lr_1, y_pred_6_month_lr_1)

# Residual-based metrics: Durbin-Watson statistic and mean absolute percentage error
residuals_6_month_lr_1 = y_test_6_month_lr_1 - y_pred_6_month_lr_1
dw_stat_6_month_lr_1 = durbin_watson(residuals_6_month_lr_1)
mape_6_month_lr_1 = np.mean(np.abs(residuals_6_month_lr_1 / y_test_6_month_lr_1)) * 100

# Report all metrics for the unseen data
print(f'Mean Squared Error on unseen data: {mse_6_month_lr_1}')
print(f'Mean Absolute Error on unseen data: {mae_6_month_lr_1}')
print(f'Root Mean Squared Error on unseen data: {rmse_6_month_lr_1}')
print(f'R-squared on unseen data: {r2_6_month_lr_1}')
print(f'Median Absolute Error on unseen data: {medae_6_month_lr_1}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_6_month_lr_1}')
print(f'MAPE on unseen data: {mape_6_month_lr_1:.2f}%')

# Pair every training column with the importance the fitted model assigned to it
feature_importance_6_month_lr_1 = {
    column: weight
    for column, weight in zip(X_train_6_month_lr_1.columns, model_baseline_6_month_lr_1.feature_importances_)
}

# Rank features from most to least important
sorted_features_6_month_lr_1 = sorted(
    feature_importance_6_month_lr_1.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print each feature's importance as a percentage
for feature, importance in sorted_features_6_month_lr_1:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# 6 month prediction model
# learning rate = 0.01 (max_depth = 5)
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Split boundaries and forecast horizon for the 6 month models
train_end_6_month = '2023-07-10'   # training rows have Date <= this value
test_start_6_month = '2024-01-10'  # testing rows have Date > this value
horizon_6_month = 120              # rows to look ahead, per symbol, when building the target

# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_6_month = df_stock_data_6_month.sort_values(by=['Symbol', 'Date'])

# Training window: every row dated on or before train_end_6_month.
# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
df_stock_data_train_6_month_lr_01 = df_stock_data_6_month[df_stock_data_6_month['Date'] <= train_end_6_month].copy()

# Testing window: every row dated strictly after test_start_6_month.
# Rows dated between the two boundaries are used by neither set.
df_stock_data_test_6_month_lr_01 = df_stock_data_6_month[df_stock_data_6_month['Date'] > test_start_6_month].copy()

# Check if the test set is empty
if df_stock_data_test_6_month_lr_01.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target = the same symbol's 'Close' horizon_6_month rows later.
# The shift is done inside each split, so a target never comes from the other split.
df_stock_data_train_6_month_lr_01['Close_Target'] = df_stock_data_train_6_month_lr_01.groupby('Symbol')['Close'].shift(-horizon_6_month)
df_stock_data_test_6_month_lr_01['Close_Target'] = df_stock_data_test_6_month_lr_01.groupby('Symbol')['Close'].shift(-horizon_6_month)

# Drop rows without a target (the last horizon_6_month rows of each symbol in each split)
df_stock_data_train_6_month_lr_01 = df_stock_data_train_6_month_lr_01.dropna(subset=['Close_Target'])
df_stock_data_test_6_month_lr_01 = df_stock_data_test_6_month_lr_01.dropna(subset=['Close_Target'])

# Replace infinite values with NaN
df_stock_data_train_6_month_lr_01 = df_stock_data_train_6_month_lr_01.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_6_month_lr_01 = df_stock_data_test_6_month_lr_01.replace([np.inf, -np.inf], np.nan)

# Fill remaining NaN values in numeric columns with that column's median within the same split
# NOTE(review): the test split is imputed with its own medians rather than the
# training medians - confirm this is intended.
numeric_cols_train = df_stock_data_train_6_month_lr_01.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_6_month_lr_01.select_dtypes(include=[np.number]).columns

df_stock_data_train_6_month_lr_01[numeric_cols_train] = df_stock_data_train_6_month_lr_01[numeric_cols_train].fillna(df_stock_data_train_6_month_lr_01[numeric_cols_train].median())
df_stock_data_test_6_month_lr_01[numeric_cols_test] = df_stock_data_test_6_month_lr_01[numeric_cols_test].fillna(df_stock_data_test_6_month_lr_01[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_6_month_lr_01.shape}")
print(f"Testing data shape: {df_stock_data_test_6_month_lr_01.shape}")

# Features are every column except the current close, the target and the identifiers
X_train_6_month_lr_01 = df_stock_data_train_6_month_lr_01.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_6_month_lr_01 = df_stock_data_train_6_month_lr_01['Close_Target']

# Create X and y for testing
X_test_6_month_lr_01 = df_stock_data_test_6_month_lr_01.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_6_month_lr_01 = df_stock_data_test_6_month_lr_01['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_6_month_lr_01 shape: {X_train_6_month_lr_01.shape}, y_train_6_month_lr_01 shape: {y_train_6_month_lr_01.shape}")
print(f"X_test_6_month_lr_01 shape: {X_test_6_month_lr_01.shape}, y_test_6_month_lr_01 shape: {y_test_6_month_lr_01.shape}")

# Ensure there are samples in both training and testing sets
# (dropping rows without a target can empty a split that was non-empty above)
if X_train_6_month_lr_01.shape[0] == 0 or X_test_6_month_lr_01.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model
model_baseline_6_month_lr_01 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    reg_alpha=0,   # L1 regularisation, spelled with the sklearn-wrapper parameter name
    reg_lambda=1,  # L2 regularisation
    objective='reg:squarederror',
    missing=np.nan  # treat NaN as the missing-value marker
)

# Train the model on the training data
model_baseline_6_month_lr_01.fit(X_train_6_month_lr_01, y_train_6_month_lr_01)

# Make predictions on the unseen test data (rows after test_start_6_month)
y_pred_6_month_lr_01 = model_baseline_6_month_lr_01.predict(X_test_6_month_lr_01)

# Calculate performance on the test data
mse_test_6_month_lr_01 = mean_squared_error(y_test_6_month_lr_01, y_pred_6_month_lr_01)
print(f'Mean Squared Error on unseen data (after {test_start_6_month}): {mse_test_6_month_lr_01}')



In [None]:
# Evaluate the learning_rate = 0.01 model on the held-out test data and rank its features
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# `y_test_6_month_lr_01` holds the true targets and `y_pred_6_month_lr_01`
# the model's predictions for the same test rows.

# Error metrics (RMSE is derived from the MSE)
mse_6_month_lr_01 = mean_squared_error(y_test_6_month_lr_01, y_pred_6_month_lr_01)
rmse_6_month_lr_01 = np.sqrt(mse_6_month_lr_01)
mae_6_month_lr_01 = mean_absolute_error(y_test_6_month_lr_01, y_pred_6_month_lr_01)
medae_6_month_lr_01 = median_absolute_error(y_test_6_month_lr_01, y_pred_6_month_lr_01)
r2_6_month_lr_01 = r2_score(y_test_6_month_lr_01, y_pred_6_month_lr_01)

# Residual-based metrics: Durbin-Watson statistic and mean absolute percentage error
residuals_6_month_lr_01 = y_test_6_month_lr_01 - y_pred_6_month_lr_01
dw_stat_6_month_lr_01 = durbin_watson(residuals_6_month_lr_01)
mape_6_month_lr_01 = np.mean(np.abs(residuals_6_month_lr_01 / y_test_6_month_lr_01)) * 100

# Report all metrics for the unseen data
print(f'Mean Squared Error on unseen data: {mse_6_month_lr_01}')
print(f'Mean Absolute Error on unseen data: {mae_6_month_lr_01}')
print(f'Root Mean Squared Error on unseen data: {rmse_6_month_lr_01}')
print(f'R-squared on unseen data: {r2_6_month_lr_01}')
print(f'Median Absolute Error on unseen data: {medae_6_month_lr_01}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_6_month_lr_01}')
print(f'MAPE on unseen data: {mape_6_month_lr_01:.2f}%')

# Pair every training column with the importance the fitted model assigned to it
feature_importance_6_month_lr_01 = {
    column: weight
    for column, weight in zip(X_train_6_month_lr_01.columns, model_baseline_6_month_lr_01.feature_importances_)
}

# Rank features from most to least important
sorted_features_6_month_lr_01 = sorted(
    feature_importance_6_month_lr_01.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print each feature's importance as a percentage
for feature, importance in sorted_features_6_month_lr_01:
    print(f"{feature}: {importance * 100:.2f}%")



The model with learning_rate = 0.01 performs the best, so the next models keep that learning rate and vary max_depth.

In [None]:
# 6 month prediction model
# learning rate = 0.01
# max_depth = 3
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Split boundaries and forecast horizon for the 6 month models
train_end_6_month = '2023-07-10'   # training rows have Date <= this value
test_start_6_month = '2024-01-10'  # testing rows have Date > this value
horizon_6_month = 120              # rows to look ahead, per symbol, when building the target

# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_6_month = df_stock_data_6_month.sort_values(by=['Symbol', 'Date'])

# Training window: every row dated on or before train_end_6_month.
# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
df_stock_data_train_6_month_md_3 = df_stock_data_6_month[df_stock_data_6_month['Date'] <= train_end_6_month].copy()

# Testing window: every row dated strictly after test_start_6_month.
# Rows dated between the two boundaries are used by neither set.
df_stock_data_test_6_month_md_3 = df_stock_data_6_month[df_stock_data_6_month['Date'] > test_start_6_month].copy()

# Check if the test set is empty
if df_stock_data_test_6_month_md_3.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target = the same symbol's 'Close' horizon_6_month rows later.
# The shift is done inside each split, so a target never comes from the other split.
df_stock_data_train_6_month_md_3['Close_Target'] = df_stock_data_train_6_month_md_3.groupby('Symbol')['Close'].shift(-horizon_6_month)
df_stock_data_test_6_month_md_3['Close_Target'] = df_stock_data_test_6_month_md_3.groupby('Symbol')['Close'].shift(-horizon_6_month)

# Drop rows without a target (the last horizon_6_month rows of each symbol in each split)
df_stock_data_train_6_month_md_3 = df_stock_data_train_6_month_md_3.dropna(subset=['Close_Target'])
df_stock_data_test_6_month_md_3 = df_stock_data_test_6_month_md_3.dropna(subset=['Close_Target'])

# Replace infinite values with NaN
df_stock_data_train_6_month_md_3 = df_stock_data_train_6_month_md_3.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_6_month_md_3 = df_stock_data_test_6_month_md_3.replace([np.inf, -np.inf], np.nan)

# Fill remaining NaN values in numeric columns with that column's median within the same split
# NOTE(review): the test split is imputed with its own medians rather than the
# training medians - confirm this is intended.
numeric_cols_train = df_stock_data_train_6_month_md_3.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_6_month_md_3.select_dtypes(include=[np.number]).columns

df_stock_data_train_6_month_md_3[numeric_cols_train] = df_stock_data_train_6_month_md_3[numeric_cols_train].fillna(df_stock_data_train_6_month_md_3[numeric_cols_train].median())
df_stock_data_test_6_month_md_3[numeric_cols_test] = df_stock_data_test_6_month_md_3[numeric_cols_test].fillna(df_stock_data_test_6_month_md_3[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_6_month_md_3.shape}")
print(f"Testing data shape: {df_stock_data_test_6_month_md_3.shape}")

# Features are every column except the current close, the target and the identifiers
X_train_6_month_md_3 = df_stock_data_train_6_month_md_3.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_6_month_md_3 = df_stock_data_train_6_month_md_3['Close_Target']

# Create X and y for testing
X_test_6_month_md_3 = df_stock_data_test_6_month_md_3.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_6_month_md_3 = df_stock_data_test_6_month_md_3['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_6_month_md_3 shape: {X_train_6_month_md_3.shape}, y_train_6_month_md_3 shape: {y_train_6_month_md_3.shape}")
print(f"X_test_6_month_md_3 shape: {X_test_6_month_md_3.shape}, y_test_6_month_md_3 shape: {y_test_6_month_md_3.shape}")

# Ensure there are samples in both training and testing sets
# (dropping rows without a target can empty a split that was non-empty above)
if X_train_6_month_md_3.shape[0] == 0 or X_test_6_month_md_3.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model
model_baseline_6_month_md_3 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=3,
    reg_alpha=0,   # L1 regularisation, spelled with the sklearn-wrapper parameter name
    reg_lambda=1,  # L2 regularisation
    objective='reg:squarederror',
    missing=np.nan  # treat NaN as the missing-value marker
)

# Train the model on the training data
model_baseline_6_month_md_3.fit(X_train_6_month_md_3, y_train_6_month_md_3)

# Make predictions on the unseen test data (rows after test_start_6_month)
y_pred_6_month_md_3 = model_baseline_6_month_md_3.predict(X_test_6_month_md_3)

# Calculate performance on the test data
mse_test_6_month_md_3 = mean_squared_error(y_test_6_month_md_3, y_pred_6_month_md_3)
print(f'Mean Squared Error on unseen data (after {test_start_6_month}): {mse_test_6_month_md_3}')



In [None]:
# Evaluate the max_depth = 3 model on the held-out test data and rank its features
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# `y_test_6_month_md_3` holds the true targets and `y_pred_6_month_md_3`
# the model's predictions for the same test rows.

# Error metrics (RMSE is derived from the MSE)
mse_6_month_md_3 = mean_squared_error(y_test_6_month_md_3, y_pred_6_month_md_3)
rmse_6_month_md_3 = np.sqrt(mse_6_month_md_3)
mae_6_month_md_3 = mean_absolute_error(y_test_6_month_md_3, y_pred_6_month_md_3)
medae_6_month_md_3 = median_absolute_error(y_test_6_month_md_3, y_pred_6_month_md_3)
r2_6_month_md_3 = r2_score(y_test_6_month_md_3, y_pred_6_month_md_3)

# Residual-based metrics: Durbin-Watson statistic and mean absolute percentage error
residuals_6_month_md_3 = y_test_6_month_md_3 - y_pred_6_month_md_3
dw_stat_6_month_md_3 = durbin_watson(residuals_6_month_md_3)
mape_6_month_md_3 = np.mean(np.abs(residuals_6_month_md_3 / y_test_6_month_md_3)) * 100

# Report all metrics for the unseen data
print(f'Mean Squared Error on unseen data: {mse_6_month_md_3}')
print(f'Mean Absolute Error on unseen data: {mae_6_month_md_3}')
print(f'Root Mean Squared Error on unseen data: {rmse_6_month_md_3}')
print(f'R-squared on unseen data: {r2_6_month_md_3}')
print(f'Median Absolute Error on unseen data: {medae_6_month_md_3}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_6_month_md_3}')
print(f'MAPE on unseen data: {mape_6_month_md_3:.2f}%')

# Pair every training column with the importance the fitted model assigned to it
feature_importance_6_month_md_3 = {
    column: weight
    for column, weight in zip(X_train_6_month_md_3.columns, model_baseline_6_month_md_3.feature_importances_)
}

# Rank features from most to least important
sorted_features_6_month_md_3 = sorted(
    feature_importance_6_month_md_3.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print each feature's importance as a percentage
for feature, importance in sorted_features_6_month_md_3:
    print(f"{feature}: {importance * 100:.2f}%")



In [None]:
# 6 month prediction model
# learning rate = 0.01
# max_depth = 7
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Split boundaries and forecast horizon for the 6 month models
train_end_6_month = '2023-07-10'   # training rows have Date <= this value
test_start_6_month = '2024-01-10'  # testing rows have Date > this value
horizon_6_month = 120              # rows to look ahead, per symbol, when building the target

# Sort values by 'Symbol' and 'Date' to maintain time order
df_stock_data_6_month = df_stock_data_6_month.sort_values(by=['Symbol', 'Date'])

# Training window: every row dated on or before train_end_6_month.
# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
df_stock_data_train_6_month_md_7 = df_stock_data_6_month[df_stock_data_6_month['Date'] <= train_end_6_month].copy()

# Testing window: every row dated strictly after test_start_6_month.
# Rows dated between the two boundaries are used by neither set.
df_stock_data_test_6_month_md_7 = df_stock_data_6_month[df_stock_data_6_month['Date'] > test_start_6_month].copy()

# Check if the test set is empty
if df_stock_data_test_6_month_md_7.empty:
    raise ValueError("No data available in the testing set for the given date range.")

# Target = the same symbol's 'Close' horizon_6_month rows later.
# The shift is done inside each split, so a target never comes from the other split.
df_stock_data_train_6_month_md_7['Close_Target'] = df_stock_data_train_6_month_md_7.groupby('Symbol')['Close'].shift(-horizon_6_month)
df_stock_data_test_6_month_md_7['Close_Target'] = df_stock_data_test_6_month_md_7.groupby('Symbol')['Close'].shift(-horizon_6_month)

# Drop rows without a target (the last horizon_6_month rows of each symbol in each split)
df_stock_data_train_6_month_md_7 = df_stock_data_train_6_month_md_7.dropna(subset=['Close_Target'])
df_stock_data_test_6_month_md_7 = df_stock_data_test_6_month_md_7.dropna(subset=['Close_Target'])

# Replace infinite values with NaN
df_stock_data_train_6_month_md_7 = df_stock_data_train_6_month_md_7.replace([np.inf, -np.inf], np.nan)
df_stock_data_test_6_month_md_7 = df_stock_data_test_6_month_md_7.replace([np.inf, -np.inf], np.nan)

# Fill remaining NaN values in numeric columns with that column's median within the same split
# NOTE(review): the test split is imputed with its own medians rather than the
# training medians - confirm this is intended.
numeric_cols_train = df_stock_data_train_6_month_md_7.select_dtypes(include=[np.number]).columns
numeric_cols_test = df_stock_data_test_6_month_md_7.select_dtypes(include=[np.number]).columns

df_stock_data_train_6_month_md_7[numeric_cols_train] = df_stock_data_train_6_month_md_7[numeric_cols_train].fillna(df_stock_data_train_6_month_md_7[numeric_cols_train].median())
df_stock_data_test_6_month_md_7[numeric_cols_test] = df_stock_data_test_6_month_md_7[numeric_cols_test].fillna(df_stock_data_test_6_month_md_7[numeric_cols_test].median())

# Check for the shapes of the data
print(f"Training data shape: {df_stock_data_train_6_month_md_7.shape}")
print(f"Testing data shape: {df_stock_data_test_6_month_md_7.shape}")

# Features are every column except the current close, the target and the identifiers
X_train_6_month_md_7 = df_stock_data_train_6_month_md_7.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_train_6_month_md_7 = df_stock_data_train_6_month_md_7['Close_Target']

# Create X and y for testing
X_test_6_month_md_7 = df_stock_data_test_6_month_md_7.drop(columns=['Close', 'Close_Target', 'Symbol', 'Date'])
y_test_6_month_md_7 = df_stock_data_test_6_month_md_7['Close_Target']

# Check the shapes of the training and testing data
print(f"X_train_6_month_md_7 shape: {X_train_6_month_md_7.shape}, y_train_6_month_md_7 shape: {y_train_6_month_md_7.shape}")
print(f"X_test_6_month_md_7 shape: {X_test_6_month_md_7.shape}, y_test_6_month_md_7 shape: {y_test_6_month_md_7.shape}")

# Ensure there are samples in both training and testing sets
# (dropping rows without a target can empty a split that was non-empty above)
if X_train_6_month_md_7.shape[0] == 0 or X_test_6_month_md_7.shape[0] == 0:
    raise ValueError("Insufficient data in either training or testing set.")

# Initialize XGBoost model
model_baseline_6_month_md_7 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=7,
    reg_alpha=0,   # L1 regularisation, spelled with the sklearn-wrapper parameter name
    reg_lambda=1,  # L2 regularisation
    objective='reg:squarederror',
    missing=np.nan  # treat NaN as the missing-value marker
)

# Train the model on the training data
model_baseline_6_month_md_7.fit(X_train_6_month_md_7, y_train_6_month_md_7)

# Make predictions on the unseen test data (rows after test_start_6_month)
y_pred_6_month_md_7 = model_baseline_6_month_md_7.predict(X_test_6_month_md_7)

# Calculate performance on the test data
mse_test_6_month_md_7 = mean_squared_error(y_test_6_month_md_7, y_pred_6_month_md_7)
print(f'Mean Squared Error on unseen data (after {test_start_6_month}): {mse_test_6_month_md_7}')



In [None]:
# Evaluate the max_depth = 7 model on the held-out test data and rank its features
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from statsmodels.stats.stattools import durbin_watson
import numpy as np

# `y_test_6_month_md_7` holds the true targets and `y_pred_6_month_md_7`
# the model's predictions for the same test rows.

# Error metrics (RMSE is derived from the MSE)
mse_6_month_md_7 = mean_squared_error(y_test_6_month_md_7, y_pred_6_month_md_7)
rmse_6_month_md_7 = np.sqrt(mse_6_month_md_7)
mae_6_month_md_7 = mean_absolute_error(y_test_6_month_md_7, y_pred_6_month_md_7)
medae_6_month_md_7 = median_absolute_error(y_test_6_month_md_7, y_pred_6_month_md_7)
r2_6_month_md_7 = r2_score(y_test_6_month_md_7, y_pred_6_month_md_7)

# Residual-based metrics: Durbin-Watson statistic and mean absolute percentage error
residuals_6_month_md_7 = y_test_6_month_md_7 - y_pred_6_month_md_7
dw_stat_6_month_md_7 = durbin_watson(residuals_6_month_md_7)
mape_6_month_md_7 = np.mean(np.abs(residuals_6_month_md_7 / y_test_6_month_md_7)) * 100

# Report all metrics for the unseen data
print(f'Mean Squared Error on unseen data: {mse_6_month_md_7}')
print(f'Mean Absolute Error on unseen data: {mae_6_month_md_7}')
print(f'Root Mean Squared Error on unseen data: {rmse_6_month_md_7}')
print(f'R-squared on unseen data: {r2_6_month_md_7}')
print(f'Median Absolute Error on unseen data: {medae_6_month_md_7}')
print(f'Durbin-Watson Statistic on unseen data: {dw_stat_6_month_md_7}')
print(f'MAPE on unseen data: {mape_6_month_md_7:.2f}%')

# Pair every training column with the importance the fitted model assigned to it
feature_importance_6_month_md_7 = {
    column: weight
    for column, weight in zip(X_train_6_month_md_7.columns, model_baseline_6_month_md_7.feature_importances_)
}

# Rank features from most to least important
sorted_features_6_month_md_7 = sorted(
    feature_importance_6_month_md_7.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print each feature's importance as a percentage
for feature, importance in sorted_features_6_month_md_7:
    print(f"{feature}: {importance * 100:.2f}%")



Best 6 month model: learning_rate = 0.01 and max_depth = 3.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Horizontal bar chart of the five largest feature importances of the
# `6_month_md_3` model. `sorted_features_6_month_md_3` is a list of
# (feature_name, importance) pairs already sorted in descending order.
top_features_6_month_md_3 = sorted_features_6_month_md_3[:5]

# Split the pairs into parallel NumPy arrays (names / importances)
features = np.array([name for name, _ in top_features_6_month_md_3])
importances = np.array([weight for _, weight in top_features_6_month_md_3])

# The chart works in percent throughout, so convert once
importances_pct = importances * 100

# Create a bar plot
plt.figure(figsize=(13, 8))
ax = sns.barplot(x=importances_pct, y=features, palette="viridis")

# Write the value next to the end of each bar
for i, v in enumerate(importances_pct):
    ax.text(v + 0.01, i, f"{v:.2f}%", va="center", fontsize=16)

# Format x-axis tick labels with a % sign
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{x:.0f}%"))

# Leave room on the right so the value labels are not clipped
plt.xlim(0, importances_pct.max() + 3)

# Axis labels and title
plt.xlabel("Feature Importance (%)", fontsize=18, fontweight='bold')
plt.ylabel("Important TA Indicators", fontsize=18, fontweight='bold')
plt.title("Best 6 Month Prediction Model: Top 5 Most Important Features", fontsize=18, fontweight='bold')

# Enlarge tick labels on both axes. tick_params only changes the font size;
# it does not re-assign label text, so labels cannot drift away from their
# tick positions the way set_yticklabels() without set_yticks() can.
ax.tick_params(axis="both", labelsize=14)

# Show the plot
plt.show()


Best model metrics

most important metrics to visualize:

1. Root Mean Squared Error
2. R-squared
3. MAPE
4. Median Absolute Error

1 week:

Mean Squared Error on unseen data: 7604.147083529955
Mean Absolute Error on unseen data: 10.044193260395863
Root Mean Squared Error on unseen data: 87.20176078227982
R-squared on unseen data: 0.9656152295864199
Median Absolute Error on unseen data: 1.588897705078125
Durbin-Watson Statistic on unseen data: 0.1674710246723756
MAPE on unseen data: 3.96%




Fib_10_Low_Min: 63.23%
Fib_30_High_Max: 14.79%
Low: 4.90%
5_day-Fib_38: 4.25%
High: 3.05%
5_day-Fib_61: 1.76%
10_day_Fib_23: 1.33%
Fib_30_Low_Min: 1.28%
Volume: 1.07%
Fib_5_Low_Min: 0.89%
5_day-Fib_50: 0.66%
EMA_5: 0.49%
SMA_5: 0.44%
30_day_Fib_38: 0.42%
5_day-Fib_23: 0.24%
Std_Dev: 0.20%
Cumulative_Price_Volume: 0.20%
EMA_12_MACD: 0.08%
SMA_20: 0.08%
30_day_Fib_50: 0.06%
VWAP: 0.05%
30_day_Fib_61: 0.05%
SMA_5_lag_7: 0.05%
ATR_True_Range: 0.05%
SMA_50: 0.05%
ATR_Prev_Close: 0.05%
Cumulative_Volume: 0.05%
Fib_5_High_Max: 0.04%
EMA_20: 0.04%
ATR: 0.03%
Upper_Band: 0.03%
Fib_10_High_Max: 0.01%
Lower_Band: 0.01%
Volume_lag_1: 0.01%
EMA_26_MACD: 0.01%
EMA_50: 0.01%
10_day_Fib_61: 0.01%


1 month:

Mean Squared Error on unseen data: 10844.724459170979
Mean Absolute Error on unseen data: 15.639166248218757
Root Mean Squared Error on unseen data: 104.13800679469037
R-squared on unseen data: 0.9513849924746814
Median Absolute Error on unseen data: 3.326946258544922
Durbin-Watson Statistic on unseen data: 0.0750817215794377
MAPE on unseen data: 7.82%

30_day_Fib_23: 52.41%
Fib_30_High_Max: 32.44%
Fib_5_Low_Min: 2.25%
Low: 2.08%
10_day_Fib_23: 1.96%
High: 1.77%
Volume: 1.43%
5_day-Fib_50: 1.11%
30_day_Fib_50: 0.68%
10_day_Fib_38: 0.62%
5_day-Fib_23: 0.48%
Fib_10_High_Max: 0.28%
Fib_10_Low_Min: 0.25%
Std_Dev: 0.21%
VWAP: 0.21%
ATR_Prev_Close: 0.20%
EMA_5: 0.19%
Fib_5_High_Max: 0.19%
Cumulative_Price_Volume: 0.17%
Lower_Band: 0.16%
EMA_26_MACD: 0.14%
30_day_Fib_61: 0.11%
SMA_5: 0.11%
SMA_20_lag_1: 0.06%
Upper_Band: 0.05%
5_day-Fib_61: 0.04%
ATR_True_Range: 0.04%
Cumulative_Volume: 0.04%
30_day_Fib_38: 0.03%
10_day_Fib_61: 0.03%
EMA_50: 0.02%
Fib_30_Low_Min: 0.02%
ATR: 0.02%
Volume_lag_1: 0.02%
Close_lag_1: 0.02%
EMA_20_lag_20: 0.02%
SMA_50: 0.01%
EMA_12_MACD_lag_15: 0.01%
EMA_12_MACD_lag_12: 0.01%


3 months:

Mean Squared Error on unseen data: 25855.13009374476
Mean Absolute Error on unseen data: 25.29591435606357
Root Mean Squared Error on unseen data: 160.79530494931984
R-squared on unseen data: 0.8883720519645201
Median Absolute Error on unseen data: 6.283714294433594
Durbin-Watson Statistic on unseen data: 0.017109218495924613
MAPE on unseen data: 15.52%

Fib_30_Low_Min: 27.17%
30_day_Fib_38: 14.80%
5_day-Fib_61: 11.17%
Upper_Band: 8.25%
Fib_10_Low_Min: 3.10%
10_day_Fib_50: 2.76%
Close_lag_1: 2.34%
Volume: 1.87%
EMA_5: 1.82%
Fib_5_Low_Min: 1.69%
Fib_5_High_Max: 1.51%
30_day_Fib_23: 1.50%
High: 1.48%
EMA_50_lag_25: 1.37%
30_day_Fib_50: 1.29%
ATR_High_Low: 1.20%
EMA_26_MACD: 1.11%
EMA_50: 1.08%
SMA_50: 1.02%
ATR: 1.00%
10_day_Fib_61: 0.87%
Low: 0.86%
EMA_12_MACD: 0.82%
Close_lag_90: 0.76%
5_day-Fib_23: 0.76%
EMA_20: 0.73%
SMA_20_lag_90: 0.60%
EMA_12_MACD_lag_90: 0.58%
10_day_Fib_38: 0.57%
VWAP: 0.55%
30_day_Fib_61: 0.54%
Volume_lag_50: 0.53%
ATR_True_Range: 0.39%
ATR_Prev_Close: 0.32%
SMA_5_lag_90: 0.31%
EMA_50_lag_30: 0.30%
Lower_Band: 0.28%
Fib_30_High_Max: 0.24%
Std_Dev: 0.23%
Volume_lag_3: 0.16%
Volume_lag_25: 0.15%
SMA_50_lag_60: 0.13%
EMA_26_MACD_lag_40: 0.13%
Cumulative_Price_Volume: 0.13%
EMA_50_lag_20: 0.12%
Volume_lag_75: 0.11%
EMA_12_MACD_lag_20: 0.10%
SMA_50_lag_10: 0.07%
Volume_lag_60: 0.07%
EMA_20_lag_15: 0.06%
EMA_50_lag_50: 0.05%
SMA_50_lag_3: 0.05%
EMA_5_lag_40: 0.04%
SMA_50_lag_5: 0.04%
EMA_50_lag_75: 0.04%
Cumulative_Volume: 0.04%
SMA_5_lag_60: 0.04%
Fib_10_High_Max: 0.04%
EMA_50_lag_60: 0.03%
SMA_50_lag_30: 0.03%
SMA_5_lag_25: 0.03%
Volume_lag_90: 0.03%
SMA_20_lag_25: 0.03%
SMA_50_lag_7: 0.03%
ATR_High_Close: 0.02%
EMA_50_lag_90: 0.02%
SMA_20_lag_40: 0.02%
Signal_Line: 0.02%
Volume_lag_1: 0.02%
SMA_50_lag_20: 0.02%
SMA_20_lag_30: 0.02%
EMA_26_MACD_lag_50: 0.02%
SMA_5_lag_50: 0.02%
SMA_50_lag_15: 0.02%
Volume_lag_40: 0.02%
SMA_50_lag_25: 0.01%
EMA_50_lag_15: 0.01%
EMA_12_MACD_lag_40: 0.01%
MACD: 0.01%
SMA_20_lag_20: 0.01%
EMA_12_MACD_lag_15: 0.01%
Volume_lag_5: 0.01%
SMA_5: 0.01%
SMA_5_lag_15: 0.01%
EMA_26_MACD_lag_30: 0.01%
Close_lag_20: 0.01%
EMA_20_lag_50: 0.01%
Close_lag_25: 0.01%
Volume_lag_20: 0.01%
Close_lag_40: 0.01%
EMA_5_lag_50: 0.01%
SMA_5_lag_40: 0.01%
SMA_50_lag_90: 0.01%
5_day-Fib_50: 0.01%
SMA_50_lag_40: 0.01%
EMA_5_lag_75: 0.01%
SMA_20_lag_50: 0.01%
Close_lag_10: 0.01%
EMA_12_MACD_lag_5: 0.01%


6 months:

Mean Squared Error on unseen data: 31411.174267588787
Mean Absolute Error on unseen data: 34.84146322021573
Root Mean Squared Error on unseen data: 177.23197868214638
R-squared on unseen data: 0.871485858716382
Median Absolute Error on unseen data: 8.994926452636719
Durbin-Watson Statistic on unseen data: 0.020845241129731375
MAPE on unseen data: 21.51%

Fib_10_High_Max: 17.93%
Fib_30_High_Max: 13.03%
Fib_5_Low_Min: 12.74%
ATR_High_Low: 5.86%
10_day_Fib_50: 4.75%
EMA_50_lag_25: 4.42%
EMA_5: 2.79%
Middle_Band: 2.77%
Volume: 2.74%
EMA_12_MACD: 2.23%
EMA_50: 2.11%
Fib_5_High_Max: 2.10%
5_day-Fib_61: 1.93%
Low: 1.78%
High: 1.60%
30_day_Fib_61: 1.47%
VWAP: 1.45%
Fib_30_Low_Min: 1.18%
EMA_26_MACD_lag_40: 1.09%
SMA_50: 1.05%
Fib_10_Low_Min: 1.02%
SMA_50_lag_90: 1.01%
SMA_5: 0.98%
EMA_50_lag_30: 0.97%
ATR: 0.92%
SMA_50_lag_20: 0.76%
EMA_20: 0.69%
Close_lag_90: 0.65%
5_day-Fib_23: 0.58%
SMA_50_lag_25: 0.55%
ATR_Prev_Close: 0.54%
Lower_Band: 0.53%
Std_Dev: 0.43%
ATR_True_Range: 0.38%
Volume_lag_75: 0.35%
SMA_5_lag_60: 0.31%
30_day_Fib_23: 0.29%
EMA_12_MACD_lag_3: 0.21%
10_day_Fib_38: 0.21%
Volume_lag_60: 0.20%
SMA_50_lag_75: 0.20%
Volume_lag_1: 0.18%
EMA_5_lag_75: 0.17%
SMA_50_lag_3: 0.17%
Close_lag_180: 0.17%
Cumulative_Price_Volume: 0.16%
Volume_lag_50: 0.15%
5_day-Fib_50: 0.15%
EMA_50_lag_1: 0.14%
EMA_50_lag_90: 0.13%
Volume_lag_30: 0.13%
10_day_Fib_61: 0.12%
EMA_50_lag_40: 0.09%
EMA_20_lag_180: 0.08%
SMA_50_lag_40: 0.08%
Volume_lag_25: 0.08%
EMA_50_lag_75: 0.06%
Volume_lag_3: 0.05%
Cumulative_Volume: 0.04%
Volume_lag_5: 0.04%
EMA_20_lag_90: 0.04%
Signal_Line: 0.04%
EMA_50_lag_180: 0.04%
Volume_lag_40: 0.04%
EMA_26_MACD: 0.03%
EMA_26_MACD_lag_180: 0.03%
Upper_Band: 0.03%
EMA_12_MACD_lag_180: 0.03%
EMA_20_lag_3: 0.03%
EMA_5_lag_40: 0.03%
EMA_12_MACD_lag_40: 0.03%
SMA_50_lag_180: 0.02%
SMA_20_lag_15: 0.02%
Volume_lag_20: 0.02%
SMA_20_lag_30: 0.02%
EMA_12_MACD_lag_25: 0.02%
EMA_5_lag_60: 0.02%
SMA_5_lag_7: 0.02%
Volume_lag_180: 0.02%
EMA_5_lag_50: 0.02%
EMA_50_lag_15: 0.02%
SMA_50_lag_50: 0.02%
SMA_20_lag_60: 0.02%
SMA_20_lag_40: 0.02%
EMA_26_MACD_lag_10: 0.01%
Close_lag_20: 0.01%
5_day-Fib_38: 0.01%
Close_lag_50: 0.01%
MACD: 0.01%
EMA_20_lag_40: 0.01%
Close_lag_10: 0.01%
EMA_20_lag_50: 0.01%
SMA_5_lag_30: 0.01%
SMA_50_lag_10: 0.01%
Volume_lag_10: 0.01%
MACD_Histogram: 0.01%
EMA_50_lag_20: 0.01%
RSI: 0.01%
10_day_Fib_23: 0.01%
%D: 0.01%
Close_lag_5: 0.01%
Volume_lag_90: 0.01%
Volume_lag_15: 0.01%
SMA_20_lag_3: 0.01%
EMA_20_lag_15: 0.01%
SMA_5_lag_90: 0.01%
EMA_12_MACD_lag_50: 0.01%
SMA_20_lag_75: 0.01%
EMA_20_lag_75: 0.01%
EMA_50_lag_60: 0.01%
SMA_50_lag_1: 0.01%
SMA_50_lag_60: 0.01%
EMA_26_MACD_lag_60: 0.01%
Close_lag_1: 0.01%
Volume_lag_7: 0.01%
SMA_20_lag_90: 0.01%
Close_lag_75: 0.01%




In [None]:
# Root mean squared error graph
import matplotlib.pyplot as plt

# X-axis categories (shortest to longest horizon) and the RMSE stored for the
# model variant picked for each horizon.
time_horizons = ["1 Week", "1 Month", "3 Months", "6 Months"]
rmse_values = [
    rmse_1_week_md_7,
    rmse_1_month_md_7,
    rmse_3_month_md_3,
    rmse_6_month_md_3,
]

# Single 8x5 figure: solid blue line with round markers
plt.figure(figsize=(8, 5))
plt.plot(time_horizons, rmse_values, "o-", color="b", lw=2, ms=8)

# Dashed, semi-transparent grid plus axis text
plt.grid(True, linestyle="--", alpha=0.6)
plt.gca().set(
    xlabel="Prediction Horizon",
    ylabel="Root Mean Squared Error (RMSE)",
    title="RMSE Across Different Prediction Timelines",
)

# Render the figure
plt.show()


In [None]:
# r squared graph
import matplotlib.pyplot as plt

# X-axis categories (shortest to longest horizon) and the R-squared stored for
# the model variant picked for each horizon.
time_horizons = ["1 Week", "1 Month", "3 Months", "6 Months"]
r_squared_values = [
    r2_1_week_md_7,
    r2_1_month_md_7,
    r2_3_month_md_3,
    r2_6_month_md_3,
]

# Single 8x5 figure: solid green line with round markers
plt.figure(figsize=(8, 5))
plt.plot(time_horizons, r_squared_values, "o-", color="g", lw=2, ms=8)

# Fixed y-range (0.85 to 1.0) so differences between horizons are visible
plt.ylim(0.85, 1.0)

# Dashed, semi-transparent grid plus axis text
plt.grid(True, linestyle="--", alpha=0.6)
plt.gca().set(
    xlabel="Prediction Horizon",
    ylabel="R-squared",
    title="R-squared Across Different Prediction Timelines",
)

# Render the figure
plt.show()


In [None]:
# MAPE graph
import matplotlib.pyplot as plt

# X-axis categories (shortest to longest horizon) and the MAPE stored for the
# model variant picked for each horizon.
time_horizons = ["1 Week", "1 Month", "3 Months", "6 Months"]
mape_values = [
    mape_1_week_md_7,
    mape_1_month_md_7,
    mape_3_month_md_3,
    mape_6_month_md_3,
]

# Single 8x5 figure: solid red line with round markers
plt.figure(figsize=(8, 5))
plt.plot(time_horizons, mape_values, "o-", color="r", lw=2, ms=8)

# Fixed y-range of 0 to 30 (percent)
plt.ylim(0, 30)

# Dashed, semi-transparent grid plus axis text
plt.grid(True, linestyle="--", alpha=0.6)
plt.gca().set(
    xlabel="Prediction Horizon",
    ylabel="MAPE (%)",
    title="Mean Absolute Percentage Error (MAPE) Over Different Prediction Horizons",
)

# Render the figure
plt.show()


In [None]:
# Median Absolute Error
import matplotlib.pyplot as plt

# X-axis categories (shortest to longest horizon) and the median absolute error
# stored for the model variant picked for each horizon.
time_horizons = ["1 Week", "1 Month", "3 Months", "6 Months"]
medae_values = [
    medae_1_week_md_7,
    medae_1_month_md_7,
    medae_3_month_md_3,
    medae_6_month_md_3,
]

# Single 8x5 figure: solid blue line with round markers
plt.figure(figsize=(8, 5))
plt.plot(time_horizons, medae_values, "o-", color="b", lw=2, ms=8)

# Fixed y-range of 0 to 15
plt.ylim(0, 15)

# Dashed, semi-transparent grid plus axis text
plt.grid(True, linestyle="--", alpha=0.6)
plt.gca().set(
    xlabel="Prediction Horizon",
    ylabel="Median Absolute Error",
    title="Median Absolute Error Over Different Prediction Horizons",
)

# Render the figure
plt.show()


In [None]:
# 1 Week Price Prediction: Available Testing Dates
# Prints the first available date and the last available date 
# where 'Close' and 'Close_Target' values exist for a given stock ticker.

def print_stock_date_range(df_test, symbol):
    """Print the first and last dates on which `symbol` has usable rows.

    A row is usable when both its 'Close' and 'Close_Target' values are
    present (not NaN).

    Args:
        df_test: DataFrame containing at least the columns 'Symbol', 'Date',
            'Close' and 'Close_Target'. 'Date' may hold strings or datetimes.
            The frame is not modified.
        symbol: Ticker to look up in the 'Symbol' column.

    Returns:
        None. The range is printed; when no usable row exists a
        "No available data" message is printed instead.

    Raises:
        ValueError: If a required column is missing from `df_test`.
    """
    required_columns = {'Symbol', 'Date', 'Close', 'Close_Target'}

    # Ensure required columns exist in df_test
    if not required_columns.issubset(df_test.columns):
        raise ValueError(f"The test dataframe must contain the following columns: {required_columns}")

    # Rows for this symbol that have both prices present
    usable_rows = (
        (df_test['Symbol'] == symbol)
        & df_test['Close'].notna()
        & df_test['Close_Target'].notna()
    )

    # Parse dates on the selected slice only: string dates then work with
    # .strftime below, and the caller's frame is left untouched.
    dates = pd.to_datetime(df_test.loc[usable_rows, 'Date'])

    if dates.empty:
        print(f"No available data for symbol: {symbol}")
        return

    # Print the results
    print(f"Symbol: {symbol}")
    print(f"First Available Date: {dates.min().strftime('%Y-%m-%d')}")
    print(f"Last Available Date: {dates.max().strftime('%Y-%m-%d')}")

# Example usage: print the first/last usable dates for NVDA in the 1-week md_7 test frame
print_stock_date_range(df_stock_data_test_1_week_md_7, 'NVDA')


In [None]:
# 1 Week Price Prediction
# Prints the stock symbol, specified date, actual close price ('Close'), 
# and predicted price ('Close_Target'). If the date is not available, 
# it finds the next available future date. If the entered date is the last date, 
# it looks for the closest previous available date.
import pandas as pd

def print_stock_prediction_by_date(df_test, symbol, date, *, horizon_days=7):
    """Print the close price and the 1-week predicted price for one stock and date.

    The row is chosen as follows: the exact `date` if present; otherwise the
    next later date in the data; otherwise (no later date exists) the latest
    earlier date.

    Args:
        df_test: DataFrame containing at least the columns 'Symbol', 'Date',
            'Close' and 'Close_Target'. It is not modified.
        symbol: Ticker to look up in the 'Symbol' column.
        date: Requested date; anything `pd.to_datetime` accepts.
        horizon_days: Calendar days between the selected date and the date the
            prediction refers to. Defaults to 7 (one week).

    Returns:
        None. The result is printed.

    Raises:
        ValueError: If a required column is missing from `df_test`.
    """
    required_columns = {'Symbol', 'Date', 'Close', 'Close_Target'}

    # Ensure required columns exist in df_test
    if not required_columns.issubset(df_test.columns):
        raise ValueError(f"The test dataframe must contain the following columns: {required_columns}")

    # Convert date input to datetime for accurate comparisons
    date = pd.to_datetime(date)

    # Work on a copy restricted to this symbol so df_test stays untouched
    df_filtered = df_test[df_test['Symbol'] == symbol].copy()
    if df_filtered.empty:
        print(f"No data available for symbol: {symbol}")
        return

    # Parse and sort dates so "next" / "previous" are positional lookups
    df_filtered['Date'] = pd.to_datetime(df_filtered['Date'])
    df_filtered = df_filtered.sort_values(by='Date').reset_index(drop=True)
    dates = df_filtered['Date']

    if (dates == date).any():
        closest_date = date
    else:
        later_dates = dates[dates > date]
        earlier_dates = dates[dates < date]
        if not later_dates.empty:
            closest_date = later_dates.iloc[0]  # next available future date
        elif not earlier_dates.empty:
            closest_date = earlier_dates.iloc[-1]  # latest available past date
        else:
            print(f"No available dates found for symbol: {symbol}")
            return

    # Get row for the selected date
    row = df_filtered[dates == closest_date].iloc[0]
    actual_close = row['Close']
    # NOTE(review): the notebook reports 'Close_Target' as the predicted price;
    # confirm this column holds model output rather than the true future close.
    predicted_price = row['Close_Target']

    # Step `horizon_days` calendar days ahead, then fall back to the last
    # weekday in that window so the reported date is never a weekend.
    future_date = closest_date + pd.DateOffset(days=horizon_days)
    trading_days = pd.date_range(closest_date, future_date, freq='B')  # 'B' = business days
    future_trading_date = (trading_days[-1] if len(trading_days) > 0 else future_date).strftime('%Y-%m-%d')

    # Relative gap between the close on the selected date and the predicted price
    percent_error = ((abs(actual_close - predicted_price)) / abs(actual_close)) * 100
    formatted_percent_error = f'{percent_error:.2f}%'

    # Print the result
    print(f"Symbol: {symbol}")
    print(f"Date: {closest_date.strftime('%Y-%m-%d')}")
    print(f"Close Price: {actual_close:.2f}")
    print(f"Predicted Price Date: {future_trading_date}")
    print(f"Predicted Price: {predicted_price:.2f}")
    print(f"Percent Error: {formatted_percent_error}")

# Example usage: print close vs. predicted price for NVDA on (or nearest to) 2024-03-21
print_stock_prediction_by_date(df_stock_data_test_1_week_md_7, 'NVDA', '2024-03-21')


In [None]:
# Spot check: slice positions 1990-1994 of the 1-week md_3 test frame (the notebook displays the result)
df_stock_data_test_1_week_md_3[1990:1995]

In [None]:
# 1 Month Price Prediction: Available Testing Dates
# Prints the first available date and the last available date 
# where 'Close' and 'Close_Target' values exist for a given stock ticker.

def print_stock_date_range(df_test, symbol):
    """Print the first and last dates on which `symbol` has usable rows.

    A row is usable when both its 'Close' and 'Close_Target' values are
    present (not NaN).

    Args:
        df_test: DataFrame containing at least the columns 'Symbol', 'Date',
            'Close' and 'Close_Target'. 'Date' may hold strings or datetimes.
            The frame is not modified.
        symbol: Ticker to look up in the 'Symbol' column.

    Returns:
        None. The range is printed; when no usable row exists a
        "No available data" message is printed instead.

    Raises:
        ValueError: If a required column is missing from `df_test`.
    """
    required_columns = {'Symbol', 'Date', 'Close', 'Close_Target'}

    # Ensure required columns exist in df_test
    if not required_columns.issubset(df_test.columns):
        raise ValueError(f"The test dataframe must contain the following columns: {required_columns}")

    # Rows for this symbol that have both prices present
    usable_rows = (
        (df_test['Symbol'] == symbol)
        & df_test['Close'].notna()
        & df_test['Close_Target'].notna()
    )

    # Parse dates on the selected slice only, instead of overwriting the
    # caller's 'Date' column as a side effect of a print helper.
    dates = pd.to_datetime(df_test.loc[usable_rows, 'Date'])

    if dates.empty:
        print(f"No available data for symbol: {symbol}")
        return

    # Print the results
    print(f"Symbol: {symbol}")
    print(f"First Available Date: {dates.min().strftime('%Y-%m-%d')}")
    print(f"Last Available Date: {dates.max().strftime('%Y-%m-%d')}")

# Example usage: print the first/last usable dates for AAPL in the 1-month md_7 test frame
print_stock_date_range(df_stock_data_test_1_month_md_7, 'AAPL')


In [None]:
# 1 Month Price Prediction
# Prints the stock symbol, specified date, actual close price ('Close'), 
# and predicted price ('Close_Target'). If the date is not available, 
# it finds the next available future date. If the entered date is the last date, 
# it looks for the closest previous available date.
import pandas as pd

def print_stock_prediction_by_date(df_test, symbol, date, *, horizon_days=30):
    """Print the close price and the 1-month predicted price for one stock and date.

    The row is chosen as follows: the exact `date` if present; otherwise the
    next later date in the data; otherwise (no later date exists) the latest
    earlier date.

    Args:
        df_test: DataFrame containing at least the columns 'Symbol', 'Date',
            'Close' and 'Close_Target'. It is not modified.
        symbol: Ticker to look up in the 'Symbol' column.
        date: Requested date; anything `pd.to_datetime` accepts.
        horizon_days: Calendar days between the selected date and the date the
            prediction refers to. Defaults to 30 (one month).
            NOTE(review): confirm this matches the shift used to build
            'Close_Target' for the 1-month model.

    Returns:
        None. The result is printed.

    Raises:
        ValueError: If a required column is missing from `df_test`.
    """
    required_columns = {'Symbol', 'Date', 'Close', 'Close_Target'}

    # Ensure required columns exist in df_test
    if not required_columns.issubset(df_test.columns):
        raise ValueError(f"The test dataframe must contain the following columns: {required_columns}")

    # Convert date input to datetime for accurate comparisons
    date = pd.to_datetime(date)

    # Work on a copy restricted to this symbol so df_test stays untouched
    df_filtered = df_test[df_test['Symbol'] == symbol].copy()
    if df_filtered.empty:
        print(f"No data available for symbol: {symbol}")
        return

    # Parse and sort dates so "next" / "previous" are positional lookups
    df_filtered['Date'] = pd.to_datetime(df_filtered['Date'])
    df_filtered = df_filtered.sort_values(by='Date').reset_index(drop=True)
    dates = df_filtered['Date']

    if (dates == date).any():
        closest_date = date
    else:
        later_dates = dates[dates > date]
        earlier_dates = dates[dates < date]
        if not later_dates.empty:
            closest_date = later_dates.iloc[0]  # next available future date
        elif not earlier_dates.empty:
            closest_date = earlier_dates.iloc[-1]  # latest available past date
        else:
            print(f"No available dates found for symbol: {symbol}")
            return

    # Get row for the selected date
    row = df_filtered[dates == closest_date].iloc[0]
    actual_close = row['Close']
    # NOTE(review): the notebook reports 'Close_Target' as the predicted price;
    # confirm this column holds model output rather than the true future close.
    predicted_price = row['Close_Target']

    # Step `horizon_days` calendar days ahead (previously a 120-day offset
    # copied from the 6-month cell), then fall back to the last weekday in
    # that window so the reported date is never a weekend.
    future_date = closest_date + pd.DateOffset(days=horizon_days)
    trading_days = pd.date_range(closest_date, future_date, freq='B')  # 'B' = business days
    future_trading_date = (trading_days[-1] if len(trading_days) > 0 else future_date).strftime('%Y-%m-%d')

    # Relative gap between the close on the selected date and the predicted price
    percent_error = ((abs(actual_close - predicted_price)) / abs(actual_close)) * 100
    formatted_percent_error = f'{percent_error:.2f}%'

    # Print the result
    print(f"Symbol: {symbol}")
    print(f"Date: {closest_date.strftime('%Y-%m-%d')}")
    print(f"Close Price: {actual_close:.2f}")
    print(f"Predicted Price Date: {future_trading_date}")
    print(f"Predicted Price: {predicted_price:.2f}")
    print(f"Percent Error: {formatted_percent_error}")

# Example usage: print close vs. predicted price for MSFT on (or nearest to) 2024-02-09
# NOTE(review): this reads the md_3 frame and MSFT, while the 1-month date-range
# check above uses df_stock_data_test_1_month_md_7 and AAPL - confirm which is intended.
print_stock_prediction_by_date(df_stock_data_test_1_month_md_3, 'MSFT', '2024-02-09')


In [None]:
# 3 Month Price Prediction: Available Testing Dates
# Prints the first available date and the last available date 
# where 'Close' and 'Close_Target' values exist for a given stock ticker.

def print_stock_date_range(df_test, symbol):
    """Print the first and last dates on which `symbol` has usable rows.

    A row is usable when both its 'Close' and 'Close_Target' values are
    present (not NaN).

    Args:
        df_test: DataFrame containing at least the columns 'Symbol', 'Date',
            'Close' and 'Close_Target'. 'Date' may hold strings or datetimes.
            The frame is not modified.
        symbol: Ticker to look up in the 'Symbol' column.

    Returns:
        None. The range is printed; when no usable row exists a
        "No available data" message is printed instead.

    Raises:
        ValueError: If a required column is missing from `df_test`.
    """
    required_columns = {'Symbol', 'Date', 'Close', 'Close_Target'}

    # Ensure required columns exist in df_test
    if not required_columns.issubset(df_test.columns):
        raise ValueError(f"The test dataframe must contain the following columns: {required_columns}")

    # Rows for this symbol that have both prices present
    usable_rows = (
        (df_test['Symbol'] == symbol)
        & df_test['Close'].notna()
        & df_test['Close_Target'].notna()
    )

    # Parse dates on the selected slice only: string dates then work with
    # .strftime below, and the caller's frame is left untouched.
    dates = pd.to_datetime(df_test.loc[usable_rows, 'Date'])

    if dates.empty:
        print(f"No available data for symbol: {symbol}")
        return

    # Print the results
    print(f"Symbol: {symbol}")
    print(f"First Available Date: {dates.min().strftime('%Y-%m-%d')}")
    print(f"Last Available Date: {dates.max().strftime('%Y-%m-%d')}")

# Example usage: print the first/last usable dates for AAPL in the 3-month md_3 test frame
print_stock_date_range(df_stock_data_test_3_month_md_3, 'AAPL')


In [None]:
# 3 Month Price Prediction
# Prints the stock symbol, specified date, actual close price ('Close'), 
# and predicted price ('Close_Target'). If the date is not available, 
# it finds the next available future date. If the entered date is the last date, 
# it looks for the closest previous available date.
import pandas as pd

def print_stock_prediction_by_date(df_test, symbol, date, *, horizon_days=90):
    """Print the close price and the 3-month predicted price for one stock and date.

    The row is chosen as follows: the exact `date` if present; otherwise the
    next later date in the data; otherwise (no later date exists) the latest
    earlier date.

    Args:
        df_test: DataFrame containing at least the columns 'Symbol', 'Date',
            'Close' and 'Close_Target'. It is not modified.
        symbol: Ticker to look up in the 'Symbol' column.
        date: Requested date; anything `pd.to_datetime` accepts.
        horizon_days: Calendar days between the selected date and the date the
            prediction refers to. Defaults to 90 (three months).
            NOTE(review): confirm this matches the shift used to build
            'Close_Target' for the 3-month model.

    Returns:
        None. The result is printed.

    Raises:
        ValueError: If a required column is missing from `df_test`.
    """
    required_columns = {'Symbol', 'Date', 'Close', 'Close_Target'}

    # Ensure required columns exist in df_test
    if not required_columns.issubset(df_test.columns):
        raise ValueError(f"The test dataframe must contain the following columns: {required_columns}")

    # Convert date input to datetime for accurate comparisons
    date = pd.to_datetime(date)

    # Work on a copy restricted to this symbol so df_test stays untouched
    df_filtered = df_test[df_test['Symbol'] == symbol].copy()
    if df_filtered.empty:
        print(f"No data available for symbol: {symbol}")
        return

    # Parse and sort dates so "next" / "previous" are positional lookups
    df_filtered['Date'] = pd.to_datetime(df_filtered['Date'])
    df_filtered = df_filtered.sort_values(by='Date').reset_index(drop=True)
    dates = df_filtered['Date']

    if (dates == date).any():
        closest_date = date
    else:
        later_dates = dates[dates > date]
        earlier_dates = dates[dates < date]
        if not later_dates.empty:
            closest_date = later_dates.iloc[0]  # next available future date
        elif not earlier_dates.empty:
            closest_date = earlier_dates.iloc[-1]  # latest available past date
        else:
            print(f"No available dates found for symbol: {symbol}")
            return

    # Get row for the selected date
    row = df_filtered[dates == closest_date].iloc[0]
    actual_close = row['Close']
    # NOTE(review): the notebook reports 'Close_Target' as the predicted price;
    # confirm this column holds model output rather than the true future close.
    predicted_price = row['Close_Target']

    # Step `horizon_days` calendar days ahead (previously a 120-day offset
    # copied from the 6-month cell), then fall back to the last weekday in
    # that window so the reported date is never a weekend.
    future_date = closest_date + pd.DateOffset(days=horizon_days)
    trading_days = pd.date_range(closest_date, future_date, freq='B')  # 'B' = business days
    future_trading_date = (trading_days[-1] if len(trading_days) > 0 else future_date).strftime('%Y-%m-%d')

    # Relative gap between the close on the selected date and the predicted price
    percent_error = ((abs(actual_close - predicted_price)) / abs(actual_close)) * 100
    formatted_percent_error = f'{percent_error:.2f}%'

    # Print the result
    print(f"Symbol: {symbol}")
    print(f"Date: {closest_date.strftime('%Y-%m-%d')}")
    print(f"Close Price: {actual_close:.2f}")
    print(f"Predicted Price Date: {future_trading_date}")
    print(f"Predicted Price: {predicted_price:.2f}")
    print(f"Percent Error: {formatted_percent_error}")

# Example usage: print close vs. predicted price for AAPL on (or nearest to) 2024-02-09
print_stock_prediction_by_date(df_stock_data_test_3_month_md_3, 'AAPL', '2024-02-09')


In [None]:
# 6 Month Price Prediction: Available Testing Dates
# Prints the first available date and the last available date 
# where 'Close' and 'Close_Target' values exist for a given stock ticker.

def print_stock_date_range(df_test, symbol):
    """Print the first and last dates on which `symbol` has usable rows.

    A row is usable when both its 'Close' and 'Close_Target' values are
    present (not NaN).

    Args:
        df_test: DataFrame containing at least the columns 'Symbol', 'Date',
            'Close' and 'Close_Target'. 'Date' may hold strings or datetimes.
            The frame is not modified.
        symbol: Ticker to look up in the 'Symbol' column.

    Returns:
        None. The range is printed; when no usable row exists a
        "No available data" message is printed instead.

    Raises:
        ValueError: If a required column is missing from `df_test`.
    """
    required_columns = {'Symbol', 'Date', 'Close', 'Close_Target'}

    # Ensure required columns exist in df_test
    if not required_columns.issubset(df_test.columns):
        raise ValueError(f"The test dataframe must contain the following columns: {required_columns}")

    # Rows for this symbol that have both prices present
    usable_rows = (
        (df_test['Symbol'] == symbol)
        & df_test['Close'].notna()
        & df_test['Close_Target'].notna()
    )

    # Parse dates on the selected slice only, instead of overwriting the
    # caller's 'Date' column as a side effect of a print helper.
    dates = pd.to_datetime(df_test.loc[usable_rows, 'Date'])

    if dates.empty:
        print(f"No available data for symbol: {symbol}")
        return

    # Print the results
    print(f"Symbol: {symbol}")
    print(f"First Available Date: {dates.min().strftime('%Y-%m-%d')}")
    print(f"Last Available Date: {dates.max().strftime('%Y-%m-%d')}")

# Example usage: print the first/last usable dates for MSFT in the 6-month md_3 test frame
print_stock_date_range(df_stock_data_test_6_month_md_3, 'MSFT')


In [None]:
# 6 Month Price Prediction
# Prints the stock symbol, specified date, actual close price ('Close'), 
# and predicted price ('Close_Target'). If the date is not available, 
# it finds the next available future date. If the entered date is the last date, 
# it looks for the closest previous available date.
import pandas as pd

def print_stock_prediction_by_date(df_test, symbol, date, *, horizon_days=180):
    """Print the close price and the 6-month predicted price for one stock and date.

    The row is chosen as follows: the exact `date` if present; otherwise the
    next later date in the data; otherwise (no later date exists) the latest
    earlier date.

    Args:
        df_test: DataFrame containing at least the columns 'Symbol', 'Date',
            'Close' and 'Close_Target'. It is not modified.
        symbol: Ticker to look up in the 'Symbol' column.
        date: Requested date; anything `pd.to_datetime` accepts.
        horizon_days: Calendar days between the selected date and the date the
            prediction refers to. Defaults to 180 (about six months). Pass
            `horizon_days=120` to reproduce the previous 120-calendar-day
            offset.
            NOTE(review): confirm this matches the shift used to build
            'Close_Target' for the 6-month model.

    Returns:
        None. The result is printed.

    Raises:
        ValueError: If a required column is missing from `df_test`.
    """
    required_columns = {'Symbol', 'Date', 'Close', 'Close_Target'}

    # Ensure required columns exist in df_test
    if not required_columns.issubset(df_test.columns):
        raise ValueError(f"The test dataframe must contain the following columns: {required_columns}")

    # Convert date input to datetime for accurate comparisons
    date = pd.to_datetime(date)

    # Work on a copy restricted to this symbol so df_test stays untouched
    df_filtered = df_test[df_test['Symbol'] == symbol].copy()
    if df_filtered.empty:
        print(f"No data available for symbol: {symbol}")
        return

    # Parse and sort dates so "next" / "previous" are positional lookups
    df_filtered['Date'] = pd.to_datetime(df_filtered['Date'])
    df_filtered = df_filtered.sort_values(by='Date').reset_index(drop=True)
    dates = df_filtered['Date']

    if (dates == date).any():
        closest_date = date
    else:
        later_dates = dates[dates > date]
        earlier_dates = dates[dates < date]
        if not later_dates.empty:
            closest_date = later_dates.iloc[0]  # next available future date
        elif not earlier_dates.empty:
            closest_date = earlier_dates.iloc[-1]  # latest available past date
        else:
            print(f"No available dates found for symbol: {symbol}")
            return

    # Get row for the selected date
    row = df_filtered[dates == closest_date].iloc[0]
    actual_close = row['Close']
    # NOTE(review): the notebook reports 'Close_Target' as the predicted price;
    # confirm this column holds model output rather than the true future close.
    predicted_price = row['Close_Target']

    # Step `horizon_days` calendar days ahead. The previous 120 *calendar*
    # days is only about four months, so it matched neither "6 months" nor
    # "120 trading days". Then fall back to the last weekday in the window
    # so the reported date is never a weekend.
    future_date = closest_date + pd.DateOffset(days=horizon_days)
    trading_days = pd.date_range(closest_date, future_date, freq='B')  # 'B' = business days
    future_trading_date = (trading_days[-1] if len(trading_days) > 0 else future_date).strftime('%Y-%m-%d')

    # Relative gap between the close on the selected date and the predicted price
    percent_error = ((abs(actual_close - predicted_price)) / abs(actual_close)) * 100
    formatted_percent_error = f'{percent_error:.2f}%'

    # Print the result
    print(f"Symbol: {symbol}")
    print(f"Date: {closest_date.strftime('%Y-%m-%d')}")
    print(f"Close Price: {actual_close:.2f}")
    print(f"Predicted Price Date: {future_trading_date}")
    print(f"Predicted Price: {predicted_price:.2f}")
    print(f"Percent Error: {formatted_percent_error}")

# Example usage: print close vs. predicted price for AAPL on (or nearest to) 2024-02-09
print_stock_prediction_by_date(df_stock_data_test_6_month_md_3, 'AAPL', '2024-02-09')


In [None]:
# Spot check: slice positions 1500-1504 of df_all_cleaned (the notebook displays the result)
df_all_cleaned[1500:1505]

In [None]:
# Show the leading rows of df_all_cleaned
df_all_cleaned.head()