In [61]:
# Import needed libraries 

import pandas as pd 
import os
import datetime
from datetime import timedelta
import numpy as np 
from scipy.signal import argrelextrema
import alpaca_trade_api as tradeapi 
import matplotlib.pyplot as plt 
import matplotlib.dates as mpdates
from mplfinance.original_flavor import candlestick_ohlc
from dotenv import load_dotenv
from itertools import islice
import hvplot.pandas
import time

import warnings
warnings.filterwarnings("ignore")

In [62]:
# Load .env file

load_dotenv()

True

In [63]:
# Set Alpaca API key and secret passwords

alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")

In [64]:
# Initiate REST API

api = tradeapi.REST(
    alpaca_api_key,
    alpaca_secret_key,
    api_version = "v2"
)

In [65]:
# Parameters for Stock Data from Alpacas
# Establish time frame (5 minute)

time_frame = "5min"

# Identify what stock symbol is trading

stock_symbol = "QQQ"

# Identify what start date to begin data analysis

start_date = pd.Timestamp("2020-01-01", tz="America/New_York").isoformat()

# Identify what end date to finalize data analysis

end_date = pd.Timestamp("2022-01-01", tz="America/New_York").isoformat()

In [66]:
# Function to call daily stock data

def get_stock_data(api, stock_symbol, time_frame, current_date_iso, next_day_date_iso):
    
    # Assuming api.get_bars returns a DataFrame with a 'df' attribute
    
    stock_data = api.get_bars(
        stock_symbol, 
        time_frame, 
        start=current_date_iso, 
        end=next_day_date_iso
        ).df
    
    return stock_data

# Displays the information pulled for working through code

# stock_data = get_stock_data(api, stock_symbol, time_frame, start_date, end_date)
# stock_data.info()
# display(stock_data.head())
# display(stock_data.tail())


In [67]:
# Function to prepare daily stock data to identify double top/bottom patterns and prepare for targets
# Includes establishing a polynomial fit and assigning new columns for localized min/max

# Polynomial Degree

polynomial_degree = 25

def polynomial_min_max_fit(stock_data, polynomial_degree):
    
    min_length = len(stock_data.index)
    x_data = np.arange(min_length)

    # Polynomial fitting
    polynomial_coefficients_open = np.polyfit(x_data, stock_data['open'][:min_length], polynomial_degree)
    polynomial_coefficients_high = np.polyfit(x_data, stock_data['high'][:min_length], polynomial_degree)
    polynomial_coefficients_low = np.polyfit(x_data, stock_data['low'][:min_length], polynomial_degree)
    polynomial_coefficients_close = np.polyfit(x_data, stock_data['close'][:min_length], polynomial_degree)

    # Evaluate the polynomial fit for plotting
    y_polynomial_open = np.polyval(polynomial_coefficients_open, x_data)
    y_polynomial_high = np.polyval(polynomial_coefficients_high, x_data)
    y_polynomial_low = np.polyval(polynomial_coefficients_low, x_data)
    y_polynomial_close = np.polyval(polynomial_coefficients_close, x_data)

    # Identify local extrema for polynomial fit data (minima and maxima)
    local_poly_minima = argrelextrema(y_polynomial_close, np.less, order=5)[0]
    local_poly_maxima = argrelextrema(y_polynomial_close, np.greater, order=5)[0]

    # Convert the close price polynomial fit data into a dataframe
    # This is done for OHLC poly fit data

    poly_df = pd.DataFrame(y_polynomial_open)
    columns = ['poly_fit_open']
    poly_df.columns = columns
    poly_df = poly_df.assign(poly_fit_high = y_polynomial_high)
    poly_df = poly_df.assign(poly_fit_low = y_polynomial_low)
    poly_df = poly_df.assign(poly_fit_close = y_polynomial_close)

    # Reset the index of the original updated ticker dataframe to concat with the polynomial dataframe that does not include a timeseries
    # This will ensure that the indexed intergers of the ploynomial fit align with the time each data point corresponds to

    updated_stock_data = stock_data.reset_index()
    updated_stock_data = pd.concat([updated_stock_data, poly_df], axis='columns', join='inner')
    updated_stock_data.head()

    # Add minima and maxima column to the DataFrame

    updated_stock_data["minima"] = 0
    updated_stock_data["maxima"] = 0
    updated_stock_data.head()

    # Mark rows with local minima as 1 in the 'minima' column

    for index in local_poly_minima:
        updated_stock_data.at[index, 'minima'] = -1

    for index in local_poly_maxima:
        updated_stock_data.at[index, "maxima"] = 1

    # Create Target Columns - Double Top Target & Double Bottom Target
        
    updated_stock_data["dbl_top_target"] = 0
    updated_stock_data["dbl_bot_target"] = 0

    # Create Machine Learning Model Target Column (Identify Long (1) or Short (-1))

    updated_stock_data["long_short_target"] = 0

    # # Plot data: COMMENTED OUT TO NOT REPEAT PLOTS AT END OF PROGRAM
    # # Plot the stock data and identified minima

    # plt.figure(figsize=(15, 5))
    # plt.plot(stock_data.index, stock_data["close"], label='Close Prices', alpha=0.7)

    # # Plot polynomial fit

    # plt.plot(stock_data.index[:min_length], y_polynomial_close, '-', markersize=1.0, color='black', alpha=0.9, label='Polynomial Fit')

    # # Plot red dots at local minima and blue dots at local maxima

    # plt.scatter(stock_data.index[local_poly_minima], y_polynomial_close[local_poly_minima], color='red', label='Local Minima')
    # plt.scatter(stock_data.index[local_poly_maxima],y_polynomial_close[local_poly_maxima], color="blue", label = "Local Maxima")
    
    return updated_stock_data

# For data checking/confirmation

# updated_stock_data = polynomial_min_max_fit(stock_data,polynomial_degree)

In [68]:
# # Plot data
# # Plot the stock data and identified minima

# plt.figure(figsize=(15, 5))
# plt.plot(stock_data.index, stock_data["close"], label='Close Prices', alpha=0.7)

# # # Plot polynomial fit

# plt.plot(stock_data.index[:min_length], y_polynomial_close, '-', markersize=1.0, color='black', alpha=0.9, label='Polynomial Fit')

# # # Plot red dots at local minima and blue dots at local maxima

# plt.scatter(stock_data.index[local_poly_minima], y_polynomial_close[local_poly_minima], color='red', label='Local Minima')
# plt.scatter(stock_data.index[local_poly_maxima],y_polynomial_close[local_poly_maxima], color="blue", label = "Local Maxima")

In [69]:
# Define the time independent DF for double top/bottom identification
# Timestamp remains a column, but can identify pattern from peak to peak immediately

def time_independent_data(updated_stock_data):

    # Define and establish time independent DF

    time_independent_df = []
    time_independent_df = pd.DataFrame(time_independent_df, columns = ["timestamp","close","high","low","trade_count","open","volume","vwap","poly_fit_open","poly_fit_high","poly_fit_low","poly_fit_close","minima","maxima","dbl_top_target","dbl_bot_target"])

    # Loop to iterate through all rows of stock data and write min/max to new DF

    for index, row in islice(updated_stock_data.iterrows(), 0, None):

        # Assign to rows only those that contain local min or max

        if (updated_stock_data.at[index,"minima"] == -1):
            time_independent_df.loc[index] = row
            a = updated_stock_data.iloc[index]["poly_fit_close"]
        elif (updated_stock_data.at[index,"maxima"] == 1):
            time_independent_df.loc[index] = row
            a = updated_stock_data.iloc[index]["poly_fit_close"]

    # Reset time dependent index
    # Set new time independent index

    time_independent_df.reset_index(inplace = True)
    time_independent_df.rename(columns={"index":"time_dependent_index"}, inplace = True)

    return time_independent_df

# Data confirmation/check

# time_independent_df = time_independent_data(updated_stock_data)
# time_independent_df.head(10)

In [70]:
# Function to identify daily double top/bottom patterns
# Inherently there should only be 1 identification of the pattern as it is considered a trend reversal pattern
# The loop will break once the pattern is identified

def identify_double_patterns(time_independent_df, updated_stock_data):

    # Initialize variables to identify double top/bottom patterns
    # Time dependent variable x_0 will always begin at a local min/max which also coincides
    # With the start of a trend into a potential double top/bottom
    
    x_0 = 0

    # Initiation of double top/bottom variable
    
    a = 0

    # First peak/valley of double top/bottom pattern
    
    b = 0

    # Trough/peak of double top/bottom pattern

    c = 0

    # Second peak/vallye of double top/bottom pattern
    
    d = 0

    # Trigger of double top/bottom pattern
    
    e = 0

    # Final time increment to finalize and trigger double top signal
    
    x_f = 0

    # Read through code to identify double top/bottom and assign to target columns.

    for index, row in islice(time_independent_df.iterrows(), 0, len(time_independent_df) - 4):

        # Check for double top
        # If found then assigns x_f final time for writing to targets in time dependent dataframe

        if (time_independent_df.at[index,"minima"] == -1):
            a = time_independent_df.iloc[index]["poly_fit_close"]
            b = time_independent_df.iloc[index + 1]["poly_fit_close"]
            c = time_independent_df.iloc[index + 2]["poly_fit_close"]
            d = time_independent_df.iloc[index + 3]["poly_fit_close"]
            e = time_independent_df.iloc[index + 4]["poly_fit_close"]
            x_0 = time_independent_df.iloc[index]["time_dependent_index"]

            if (time_independent_df.iloc[index + 1]["poly_fit_low"])*.95 < d < (time_independent_df.iloc[index + 1]["poly_fit_high"])*1.05 and (e <= c):
                x_f = time_independent_df.iloc[index + 4]["time_dependent_index"]
                updated_stock_data.at[x_f, "long_short_target"] = -1
                while (x_0 < x_f + 1):
                    updated_stock_data.at[x_0, "dbl_top_target"] = 1
                    x_0 = x_0 + 1
                break
            
            

        # Check for double bottom
        # If found then assigns x_f final time for writing to targets in time dependent dataframe
            
        elif (time_independent_df.at[index,"maxima"] == 1):
            a = time_independent_df.iloc[index]["poly_fit_close"]
            b = time_independent_df.iloc[index + 1]["poly_fit_close"]
            c = time_independent_df.iloc[index + 2]["poly_fit_close"]
            d = time_independent_df.iloc[index + 3]["poly_fit_close"]
            e = time_independent_df.iloc[index + 4]["poly_fit_close"]
            x_0 = time_independent_df.iloc[index]["time_dependent_index"] 

            if (time_independent_df.iloc[index+1]["poly_fit_low"])*.95 < d < (time_independent_df.iloc[index+1]["poly_fit_high"])*1.05 and (e >= c):
                x_f_min = time_independent_df.iloc[index + 4]["time_dependent_index"]
                updated_stock_data.at[x_f, "long_short_target"] = 1
                while (x_0 < x_f + 1):
                    updated_stock_data.at[x_0, "dbl_bot_target"] = 1
                    x_0 = x_0 + 1
                break
            
    return updated_stock_data, x_f

# updated_stock_data, x_f = identify_double_patterns(time_independent_df,updated_stock_data)

# print(x_f)



In [71]:
# # Data Checking

# display(updated_stock_data.head(20))
# print(updated_stock_data.loc[x_f])

In [72]:
# Loop to collect and assess daily data one day at a time

master_df = {
    'timestamp': [],  # List of timestamps
    'close': [],      # List of close prices
    'high': [],       # List of high prices
    'low': [],        # List of low prices
    'trade_count': [], # List of trade counts
    'open': [],       # List of open prices
    'volume': [],     # List of volumes
    'vwap': [],       # List of volume-weighted average prices
    'poly_fit_open': [],    # List of polynomial fit open prices
    'poly_fit_high': [],    # List of polynomial fit high prices
    'poly_fit_low': [],     # List of polynomial fit low prices
    'poly_fit_close': [],   # List of polynomial fit close prices
    'minima': [],           # List of minima
    'maxima': [],           # List of maxima
    'dbl_top_target': [],   # List of double top targets
    'dbl_bot_target': [],    # List of double bottom targets
    'buy_sell_tgt': []       # List of ML Model target
}

daily_dataframes = []

master_df = pd.DataFrame(master_df)

current_date = pd.to_datetime(start_date)
next_day_date = current_date + pd.offsets.BDay(1)

# print(current_date)
# print(next_day_date)

while current_date <= (pd.to_datetime(end_date)):

    # next_day_date = next_day_date + pd.offsets.BDay(1)
    # current_date = current_date + pd.offsets.BDay(1)
    # Convert current_date and next_day_date to ISO format for API call

    current_date_iso = current_date.isoformat()
    # next_day_date = current_date + pd.offsets.BDay(1)
    next_day_date_iso = next_day_date.isoformat()

    #current_date = datetime.datetime.fromisoformat(current_date_iso) + pd.offsets.BDay(1)
    
    # Fetch stock data for the current day

    stock_data = get_stock_data(api, stock_symbol, time_frame, current_date_iso, next_day_date_iso)

    if len(stock_data.index) == 0:
        next_day_date = next_day_date + pd.offsets.BDay(1)
        current_date = current_date + pd.offsets.BDay(1)
        continue
    
    # Run polynomial fit function for updated stock data

    updated_stock_data = polynomial_min_max_fit(stock_data, polynomial_degree)
    
    # Isolate Mins/maxes

    time_independent_df = time_independent_data(updated_stock_data)

    # Updated stock data with double tops/bots identified by function
    # x_f is the double top/bot trigger row(index) for current day's double top/bot

    updated_stock_data, x_f = identify_double_patterns(time_independent_df, updated_stock_data)
    
    # Perform your analysis or call your functions here
    # e.g., identify_double_patterns(time_independent_df, updated_stock_data)

    daily_dataframes.append(updated_stock_data)

    master_df = pd.concat(daily_dataframes, ignore_index=True)
    # display(master_df.tail())
    # Increment to the next day

    next_day_date = next_day_date + pd.offsets.BDay(1)
    current_date = current_date + pd.offsets.BDay(1)

    time.sleep(.5)
    # next_day_date = pd.Timestamp(next_day_date_iso) + pd.offsets.BDay(1)
    
    # print(current_date)
    # print(next_day_date)
    


In [73]:
# master_df.info()
# print(len(master_df))
# print(type(master_df))
# print(sum(master_df["dbl_top_target"]))
# print(sum(master_df["dbl_bot_target"]))
# print(sum(master_df["long_short_target"]))
# display(master_df.head(50))
# display(master_df.tail(50))

In [74]:
# Shift master dataframe by one datapoint to set target columns to train  ML model 

master_df['close_lagged'] = master_df['close'].shift(1)
master_df['high_lagged'] = master_df['high'].shift(1)
master_df['low_lagged'] = master_df['low'].shift(1)
master_df['open_lagged'] = master_df['open'].shift(1)
master_df['trade_count_lagged'] = master_df['trade_count'].shift(1)
master_df['volume_lagged'] = master_df['volume'].shift(1)
master_df['vwap_lagged'] = master_df['vwap'].shift(1)
master_df['poly_open_lagged'] = master_df['poly_fit_open'].shift(1)
master_df['poly_high_lagged'] = master_df['poly_fit_high'].shift(1)
master_df['poly_low_lagged'] = master_df['poly_fit_low'].shift(1)
master_df['poly_close_lagged'] = master_df['poly_fit_close'].shift(1)
master_df['minima_lagged'] = master_df['minima'].shift(1)
master_df['maxima_lagged'] = master_df['maxima'].shift(1)
master_df['dbl_top_target_lagged'] = master_df['dbl_top_target'].shift(1)
master_df['dbl_bot_target_lagged'] = master_df['dbl_bot_target'].shift(1)
master_df['long_short_target_lagged'] = master_df['long_short_target'].shift(1)

master_df.head()

Unnamed: 0,timestamp,close,high,low,trade_count,open,volume,vwap,poly_fit_open,poly_fit_high,...,vwap_lagged,poly_open_lagged,poly_high_lagged,poly_low_lagged,poly_close_lagged,minima_lagged,maxima_lagged,dbl_top_target_lagged,dbl_bot_target_lagged,long_short_target_lagged
0,2020-01-02 09:00:00+00:00,214.06,214.06,213.95,35,213.98,16478,214.001984,213.938984,214.035333,...,,,,,,,,,,
1,2020-01-02 09:05:00+00:00,214.13,214.18,214.03,13,214.03,2563,214.148354,214.090518,214.186205,...,214.001984,213.938984,214.035333,213.916744,214.023885,0.0,0.0,0.0,0.0,1.0
2,2020-01-02 09:10:00+00:00,214.18,214.18,214.13,15,214.13,1666,214.145222,214.152016,214.225702,...,214.148354,214.090518,214.186205,214.081773,214.1723,0.0,0.0,0.0,0.0,0.0
3,2020-01-02 09:15:00+00:00,214.2,214.2,214.14,26,214.18,18592,214.173761,214.165412,214.213441,...,214.145222,214.152016,214.225702,214.140261,214.21148,0.0,1.0,0.0,0.0,0.0
4,2020-01-02 09:20:00+00:00,214.18,214.18,214.15,5,214.17,1600,214.17375,214.158523,214.185956,...,214.173761,214.165412,214.213441,214.146116,214.199144,0.0,0.0,0.0,0.0,0.0


In [75]:
# Convert the NaN values within both tagret columns into 0's to feed into ML Models

# master_df['long_short_target'] = master_df['long_short_target'].fillna(0)
# master_df['long_short_target_lagged'] = master_df['long_short_target_lagged'].fillna(0)
master_df = master_df.fillna(0)
master_df = master_df.drop("long_short_target", axis=1)

master_df.info()
print(len(master_df))
print(type(master_df))
print(sum(master_df["dbl_top_target"]))
print(sum(master_df["dbl_bot_target"]))
display(master_df.head(100))
display(master_df.tail(100))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93409 entries, 0 to 93408
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   timestamp                 93409 non-null  datetime64[ns, UTC]
 1   close                     93409 non-null  float64            
 2   high                      93409 non-null  float64            
 3   low                       93409 non-null  float64            
 4   trade_count               93409 non-null  int64              
 5   open                      93409 non-null  float64            
 6   volume                    93409 non-null  int64              
 7   vwap                      93409 non-null  float64            
 8   poly_fit_open             93409 non-null  float64            
 9   poly_fit_high             93409 non-null  float64            
 10  poly_fit_low              93409 non-null  float64            
 11  poly_fit_close 

Unnamed: 0,timestamp,close,high,low,trade_count,open,volume,vwap,poly_fit_open,poly_fit_high,...,vwap_lagged,poly_open_lagged,poly_high_lagged,poly_low_lagged,poly_close_lagged,minima_lagged,maxima_lagged,dbl_top_target_lagged,dbl_bot_target_lagged,long_short_target_lagged
0,2020-01-02 09:00:00+00:00,214.0600,214.0600,213.95,35,213.98,16478,214.001984,213.938984,214.035333,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
1,2020-01-02 09:05:00+00:00,214.1300,214.1800,214.03,13,214.03,2563,214.148354,214.090518,214.186205,...,214.001984,213.938984,214.035333,213.916744,214.023885,0.0,0.0,0.0,0.0,1.0
2,2020-01-02 09:10:00+00:00,214.1800,214.1800,214.13,15,214.13,1666,214.145222,214.152016,214.225702,...,214.148354,214.090518,214.186205,214.081773,214.172300,0.0,0.0,0.0,0.0,0.0
3,2020-01-02 09:15:00+00:00,214.2000,214.2000,214.14,26,214.18,18592,214.173761,214.165412,214.213441,...,214.145222,214.152016,214.225702,214.140261,214.211480,0.0,1.0,0.0,0.0,0.0
4,2020-01-02 09:20:00+00:00,214.1800,214.1800,214.15,5,214.17,1600,214.173750,214.158523,214.185956,...,214.173761,214.165412,214.213441,214.146116,214.199144,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2020-01-02 17:20:00+00:00,214.7800,214.8450,214.75,1091,214.80,348142,213.575008,214.782053,214.844336,...,214.738782,214.756034,214.819648,214.717194,214.773184,0.0,0.0,0.0,0.0,0.0
96,2020-01-02 17:25:00+00:00,214.7641,214.8500,214.76,849,214.80,123451,214.801123,214.805577,214.866411,...,213.575008,214.782053,214.844336,214.741513,214.796712,0.0,0.0,0.0,0.0,0.0
97,2020-01-02 17:30:00+00:00,214.8200,214.8775,214.76,707,214.77,124405,214.822504,214.826345,214.885645,...,214.801123,214.805577,214.866411,214.762668,214.817778,0.0,0.0,0.0,0.0,0.0
98,2020-01-02 17:35:00+00:00,214.9700,214.9800,214.82,704,214.83,129843,214.908305,214.844226,214.901942,...,214.822504,214.826345,214.885645,214.780620,214.836243,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,timestamp,close,high,low,trade_count,open,volume,vwap,poly_fit_open,poly_fit_high,...,vwap_lagged,poly_open_lagged,poly_high_lagged,poly_low_lagged,poly_close_lagged,minima_lagged,maxima_lagged,dbl_top_target_lagged,dbl_bot_target_lagged,long_short_target_lagged
93309,2021-12-31 14:10:00+00:00,400.17,400.3000,400.140,102,400.22,12967,400.242704,400.272130,400.500926,...,400.264618,400.274065,400.475197,400.091103,400.263832,0.0,0.0,1.0,0.0,0.0
93310,2021-12-31 14:15:00+00:00,399.93,400.1800,399.830,195,400.18,13722,400.030276,400.256222,400.513851,...,400.242704,400.272130,400.500926,400.049601,400.245522,0.0,0.0,1.0,0.0,0.0
93311,2021-12-31 14:20:00+00:00,399.97,400.3500,399.890,118,399.92,109102,400.327196,400.225860,400.512447,...,400.030276,400.256222,400.513851,399.992851,400.213763,0.0,0.0,1.0,0.0,0.0
93312,2021-12-31 14:25:00+00:00,399.63,399.9500,399.630,313,399.95,17977,399.821493,400.181133,400.495739,...,400.327196,400.225860,400.512447,399.921728,400.168571,0.0,0.0,1.0,0.0,0.0
93313,2021-12-31 14:30:00+00:00,400.85,400.9319,399.655,10470,399.69,1124534,400.280904,400.122719,400.463367,...,399.821493,400.181133,400.495739,399.837694,400.110515,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93404,2021-12-31 22:05:00+00:00,397.82,397.8200,397.700,15,397.70,1488,397.799281,397.760599,397.856043,...,397.849919,397.741045,397.827766,397.681727,397.800278,0.0,0.0,0.0,0.0,0.0
93405,2021-12-31 22:10:00+00:00,397.88,397.8800,397.820,28,397.82,2019,397.840766,397.805459,397.884661,...,397.799281,397.760599,397.856043,397.721695,397.840415,0.0,0.0,0.0,0.0,0.0
93406,2021-12-31 22:15:00+00:00,397.88,397.8800,397.870,12,397.88,589,397.874923,397.859546,397.896353,...,397.840766,397.805459,397.884661,397.791398,397.878856,0.0,0.0,0.0,0.0,0.0
93407,2021-12-31 22:25:00+00:00,397.85,397.9010,397.850,33,397.85,4154,397.870848,397.885910,397.877875,...,397.874923,397.859546,397.896353,397.861546,397.887312,0.0,1.0,0.0,0.0,0.0


In [76]:
# Get the length of each column and store it in a dictionary
column_lengths = {col: len(master_df[col]) for col in master_df.columns}

# Find the maximum and minimum lengths
max_length = max(column_lengths.values())
min_length = min(column_lengths.values())

# Check if there are columns with lengths different from the maximum
columns_with_different_length = [col for col, length in column_lengths.items() if length != max_length]

# Print the results
print("Maximum Length:", max_length)
print("Minimum Length:", min_length)
print("Columns with Different Lengths:", columns_with_different_length)


Maximum Length: 93409
Minimum Length: 93409
Columns with Different Lengths: []


In [77]:
# Import libraries for LSTM model
from keras.datasets import imdb
from keras.models import Sequential
from keras.preprocessing import sequence
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [93]:
# Group by day and keep 5 minute increments
grouped_df = master_df.groupby(master_df["timestamp"].dt.date).apply(lambda x:x).reset_index(drop=True)
grouped_df.set_index("timestamp", inplace=True)

# set target to 0 or 1. 1 will signal an entry in either direction
grouped_df["long_short_target_lagged"] = np.where((grouped_df["long_short_target_lagged"] == -1) | (grouped_df["long_short_target_lagged"] == 1), 1, 0)

# prepare input data
X = grouped_df.drop("long_short_target_lagged", axis=1)
y = grouped_df["long_short_target_lagged"]

# split into test and train

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, shuffle=False, random_state=42)

display(grouped_df.head())
display(X.head())
display(y.head())



Unnamed: 0_level_0,close,high,low,trade_count,open,volume,vwap,poly_fit_open,poly_fit_high,poly_fit_low,...,vwap_lagged,poly_open_lagged,poly_high_lagged,poly_low_lagged,poly_close_lagged,minima_lagged,maxima_lagged,dbl_top_target_lagged,dbl_bot_target_lagged,long_short_target_lagged
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02 09:00:00+00:00,214.06,214.06,213.95,35,213.98,16478,214.001984,213.938984,214.035333,213.916744,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2020-01-02 09:05:00+00:00,214.13,214.18,214.03,13,214.03,2563,214.148354,214.090518,214.186205,214.081773,...,214.001984,213.938984,214.035333,213.916744,214.023885,0.0,0.0,0.0,0.0,1
2020-01-02 09:10:00+00:00,214.18,214.18,214.13,15,214.13,1666,214.145222,214.152016,214.225702,214.140261,...,214.148354,214.090518,214.186205,214.081773,214.1723,0.0,0.0,0.0,0.0,0
2020-01-02 09:15:00+00:00,214.2,214.2,214.14,26,214.18,18592,214.173761,214.165412,214.213441,214.146116,...,214.145222,214.152016,214.225702,214.140261,214.21148,0.0,1.0,0.0,0.0,0
2020-01-02 09:20:00+00:00,214.18,214.18,214.15,5,214.17,1600,214.17375,214.158523,214.185956,214.132988,...,214.173761,214.165412,214.213441,214.146116,214.199144,0.0,0.0,0.0,0.0,0


Unnamed: 0_level_0,close,high,low,trade_count,open,volume,vwap,poly_fit_open,poly_fit_high,poly_fit_low,...,volume_lagged,vwap_lagged,poly_open_lagged,poly_high_lagged,poly_low_lagged,poly_close_lagged,minima_lagged,maxima_lagged,dbl_top_target_lagged,dbl_bot_target_lagged
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02 09:00:00+00:00,214.06,214.06,213.95,35,213.98,16478,214.001984,213.938984,214.035333,213.916744,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-02 09:05:00+00:00,214.13,214.18,214.03,13,214.03,2563,214.148354,214.090518,214.186205,214.081773,...,16478.0,214.001984,213.938984,214.035333,213.916744,214.023885,0.0,0.0,0.0,0.0
2020-01-02 09:10:00+00:00,214.18,214.18,214.13,15,214.13,1666,214.145222,214.152016,214.225702,214.140261,...,2563.0,214.148354,214.090518,214.186205,214.081773,214.1723,0.0,0.0,0.0,0.0
2020-01-02 09:15:00+00:00,214.2,214.2,214.14,26,214.18,18592,214.173761,214.165412,214.213441,214.146116,...,1666.0,214.145222,214.152016,214.225702,214.140261,214.21148,0.0,1.0,0.0,0.0
2020-01-02 09:20:00+00:00,214.18,214.18,214.15,5,214.17,1600,214.17375,214.158523,214.185956,214.132988,...,18592.0,214.173761,214.165412,214.213441,214.146116,214.199144,0.0,0.0,0.0,0.0


timestamp
2020-01-02 09:00:00+00:00    0
2020-01-02 09:05:00+00:00    1
2020-01-02 09:10:00+00:00    0
2020-01-02 09:15:00+00:00    0
2020-01-02 09:20:00+00:00    0
Name: long_short_target_lagged, dtype: int64

In [94]:
# Standardize X_train/test

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [95]:
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("Total elements in X_train_scaled:", np.prod(X_train_scaled.shape))
print("Total elements in X_test_scaled:", np.prod(X_test_scaled.shape))


X_train_scaled shape: (65386, 30)
X_test_scaled shape: (28023, 30)
Total elements in X_train_scaled: 1961580
Total elements in X_test_scaled: 840690


In [96]:
# Define the degree of the polynomial
degree = 2

# Create polynomial features
poly_features = PolynomialFeatures(degree=degree)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

# Parameters for GridSearchCV
parameters = {
    "linear_reg__fit_intercept": [True, False],
    "linear_reg__copy_X": [True, False],
    "linear_reg__positive": [True, False]
}

# Create a linear regression model inside a pipeline
linear_reg_model = Pipeline([
    ('linear_reg', LinearRegression())
])

# Use GridSearchCV to find the best parameters
grid_search_model = GridSearchCV(linear_reg_model, parameters, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_model.fit(X_train_poly, y_train)

# Get the best model
best_model = grid_search_model.best_estimator_

# Make predictions on the test data using the best model
predictions_test = best_model.predict(X_test_poly)

# Evaluate the model on test data
mse_test = mean_squared_error(y_test, predictions_test)
r2_test = r2_score(y_test, predictions_test)

In [97]:
print(f'Mean Squared Error on Test Data: {mse_test}')
print(f'R-squared on Test Data: {r2_test}')

Mean Squared Error on Test Data: 0.0033847369775366807
R-squared on Test Data: 0.3600167382796684


In [98]:
unique = np.unique(y_test)
print(unique)

[0 1]


In [99]:
# Convert predicted probabilities to binary class labels (0 or 1) based on a threshold
threshold = 0.5  # You can adjust this threshold
binary_predictions_test = (predictions_test > threshold).astype(int)

# Convert y_test to binary labels (0 or 1)
binary_y_test = ((y_test == 1) | (y_test == -1)).astype(int)

# Calculate binary classification metrics
accuracy = accuracy_score(binary_y_test, binary_predictions_test)
precision = precision_score(binary_y_test, binary_predictions_test)
recall = recall_score(binary_y_test, binary_predictions_test)
f1 = f1_score(binary_y_test, binary_predictions_test)
conf_matrix = confusion_matrix(binary_y_test, binary_predictions_test)

# Print the metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

print('\nConfusion Matrix:')
print(conf_matrix)


Accuracy: 0.9968597223709096
Precision: 0.8765432098765432
Recall: 0.47651006711409394
F1 Score: 0.6173913043478261

Confusion Matrix:
[[27864    10]
 [   78    71]]
