In [17]:
import boto3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import math
import random
from decimal import Decimal, getcontext
import pandas_market_calendars as mcal
import ast
from sklearn.preprocessing import StandardScaler
import pytz

In [18]:
s3 = boto3.client('s3')

training_bucket = "icarus-research-data"
training_prefix = 'training_datasets/expanded_1d_datasets/'

# Create a calendar
nyse = mcal.get_calendar('NYSE')
holidays = nyse.regular_holidays
market_holidays = holidays.holidays()

weekly_expiries = ['SPY', 'IVV', 'QQQ', 'GLD', 'IWM', 'EFA', 'XLK', 'XLV', 'TLT', 'LQD', 'XLE', 'TQQQ', 'SQQQ', 'SPXS', 'SPXL', 'SOXL', 'SOXS', 'MMM', 'ABT', 'ABBV', 'ACN', 'ATVI', 'ADM', 'ADBE', 'ADP', 
                   'AAP', 'AFL', 'ALB', 'ALGN', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMD', 'AAL', 'AXP', 'AIG', 'ABC', 'AMGN', 'ADI', 'APA', 'AAPL', 'AMAT', 'ANET', 'T', 'ADSK', 'BAC', 'BBWI', 'BAX', 'BBY', 'BIIB', 
                   'BLK', 'BA', 'BKNG', 'BMY', 'AVGO', 'CZR', 'CPB', 'COF', 'CAH', 'KMX', 'CCL', 'CAT', 'CBOE', 'CNC', 'CF', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CI', 'CSCO', 'C', 'CLX', 'CME', 'KO', 'CMCSA', 'CMA', 'CAG', 
                   'COP', 'STZ', 'GLW', 'COST', 'CTRA', 'CSX', 'CVS', 'DHI', 'DHR', 'DE', 'DAL', 'DVN', 'DLR', 'DFS', 'DISH', 'DIS', 'DG', 'DLTR', 'DPZ', 'DOW', 'DD', 'EBAY', 'EA', 'ELV', 'LLY', 'EMR', 'ENPH', 'EOG', 'EQT', 
                   'ETSY', 'EXPE', 'XOM', 'FDX', 'FITB', 'FSLR', 'FI', 'F', 'FTNT', 'FOXA', 'FCX', 'GEHC', 'GNRC', 'GD', 'GE', 'GM', 'GILD', 'GS', 'HAL', 'HSY', 'HES', 'HD', 'HON', 'HRL', 'HPQ', 'HUM', 'HBAN', 'IBM', 'ILMN', 
                   'INTC', 'IP', 'INTU', 'ISRG', 'JNJ', 'JPM', 'JNPR', 'KEY', 'KMB', 'KMI', 'KLAC', 'KHC', 'KR', 'LRCX', 'LVS', 'LEN', 'LMT', 'LOW', 'MRO', 'MPC', 'MAR', 'MA', 'MTCH', 'MCD', 'MCK', 'MDT', 'MRK', 'META', 'MET', 
                   'MGM', 'MU', 'MSFT', 'MRNA', 'MDLZ', 'MS', 'MOS', 'NTAP', 'NFLX', 'NEM', 'NKE', 'NSC', 'NOC', 'NCLH', 'NUE', 'NVDA', 'NXPI', 'OXY', 'ON', 'ORCL', 'PARA', 'PYPL', 'PEP', 'PFE', 'PCG', 'PM', 'PSX', 'PXD', 'PNC', 
                   'PPG', 'PG', 'PHM', 'QCOM', 'RTX', 'REGN', 'ROST', 'RCL', 'SPGI', 'CRM', 'SLB', 'STX', 'NOW', 'SWKS', 'SEDG', 'SO', 'LUV', 'SBUX', 'TMUS', 'TROW', 'TTWO', 'TPR', 'TGT', 'TSLA', 'TXN', 'TMO', 'TJX', 'TSCO', 'TFC', 
                   'TSN', 'USB', 'ULTA', 'UNP', 'UAL', 'UPS', 'URI', 'UNH', 'VLO', 'VZ', 'VRTX', 'VFC', 'V', 'WBA', 'WMT', 'WBD', 'WM', 'WFC', 'WDC', 'WHR', 'WMB', 'WYNN', 'ZION']

leveraged_etfs = ["TQQQ","SQQQ","SPXS","SPXL","SOXL","SOXS"]
high_vol = ['COIN','BILI','UPST','CVNA',"TQQQ","SQQQ","SPXS","SPXL","SOXL","SOXS","NIO","BABA","ROKU","RBLX","SE","SNAP","LCID",'RIVN',"BIDU","FUTU","TSLA","JD","HOOD","CHWY"]
expensive = ["CMG","NFLX","AVGO","BKNG","ABNB"]
bf = ['QQQ','IWM','AAPL','NVDA','AMD','AMZN','SPY','MSFT','GOOG','GOOGL','C','BAC',
      'JPM','XOM','CVX','CSCO','INTC','DIS','IBM','BA','V','AXP','GS','HD','KO','PFE','PG',
      'INTU','ADBE','CMG','SBUX','NKE','FDX','GM','F']
BF3 = [
    'QQQ','SPY','IWM','ADBE', 'GOOGL', 'GOOG', 'AMZN', 'AMD', 
    'AXP', 'AAPL', 'BAC', 'BA', 'CVX', 'CSCO', 'C', 'DIS', 'XOM', 
    'GM', 'INTC', 'JPM', 'MSFT', 'NFLX', 'NVDA', 'PYPL', 
    'TSLA', 'V', 'SQ', 'TSM', 'QCOM', 'UBER', 'SNOW', 'PLTR', 'ABNB','AVGO',
    "FB",'META','NKE','SBUX','TGT','ORCL'
    ]
high_vol = ['COIN','BILI','UPST','CVNA',"NIO","BABA","ROKU","RBLX","SE","SNAP","LCID","ZM","TDOC","UBER","RCL",
            'RIVN',"BIDU","FUTU","TSLA","JD","HOOD","CHWY","MARA","SNAP",'TWLO', 'DDOG', 'ZS', 'NET', 'OKTA',
            "DOCU",'SQ', 'SHOP',"PLTR","CRWD",'MRNA', 'SNOW', 'SOFI','LYFT','TSM','PINS','PANW','ORCL','SBUX','NKE',"UPS","FDX",
            'WDAY','SPOT']


TRADING_SYMBOLS =  [
'ZM', 'UBER', 'CMG', 'AXP', 'TDOC', 'UAL', 'DAL', 'MMM', 'PEP', 'GE', 'RCL', 'MRK',
 'HD', 'LOW', 'VZ', 'PG', 'TSM', 'GOOG', 'GOOGL', 'AMZN', 'BAC', 'AAPL', 'ABNB',
 'CRM', 'MSFT', 'F', 'V', 'MA', 'JNJ', 'DIS', 'JPM', 'ADBE', 'BA', 'CVX', 'PFE',
 'META', 'C', 'CAT', 'KO', 'MS', 'GS', 'IBM', 'CSCO', 'WMT','TSLA','LCID','NIO','WFC',
 'TGT', 'COST', 'RIVN', 'COIN', 'SQ', 'SHOP', 'DOCU', 'ROKU', 'TWLO', 'DDOG', 'ZS', 'NET',
 'OKTA', 'UPST', 'ETSY', 'PINS', 'FUTU', 'SE', 'BIDU', 'JD', 'BABA', 'RBLX', 'AMD',
 'NVDA', 'PYPL', 'PLTR', 'NFLX', 'CRWD', 'INTC', 'MRNA', 'SNOW', 'SOFI', 'PANW',
 'ORCL','SBUX','NKE','TSLA','XOM',"RTX","UPS","FDX","CAT","PG","COST","LMT",'WDAY','SPOT','LCRX'"GS",
 "MS","AXP","GIS","KHC","W","CHWY","PTON","DOCU","TTD","NOW","TEAM","MDB","HOOD","MARA","AI",
 "LYFT","BYND", 'AVGO', 'QCOM', 'AAL', 'CZR', 'ARM', 'DKNG', 'NCLH','WDAY','SPOT','LCRX','INTU','ADSK',
 'MU', 'WBD', 'CCL', 'AMAT', 'TXN', 'SNAP', 'MGM', 'CVNA','MCD','AVGO','GM','DG','DE','BKNG',"QQQ","SPY","IWM"]
print(len(TRADING_SYMBOLS))
bf3 = ['QQQ','IWM','AAPL','NVDA','AMD','AMZN','SPY','MSFT','GOOG','GOOGL','C','BAC',
      'JPM','XOM','CVX','CSCO','INTC','DIS','IBM','BA', 'V','AXP','WMT','ADBE','F','GM',
      'SNOW','PYPL','NFLX','ABNB','SQ','SHOP','DOCU','UBER','PLTR',
      'TSLA','COIN','TSM','META'
      ]
hv = ['TSLA','COIN','META','PYPL','UBER','DDOG','SNOW','NIO','LYFT','SNOW','ROKU','RBLX','NFLX','ZM','DOCU','AMD','NVDA'
      'AFRM','SNAP','OKTA','TEAM','PLTR','SHOP','SQ','CRWD','TSM','PANW','WDAY','SPOT','PTON','TDOC','RIVN','NET']

['ADBE', 'GOOGL', 'GOOG', 'AMZN', 'AMD', 'AXP', 'AAPL', 'BAC', 'BA', 'CVX', 'CSCO', 'C', 'DIS', 'XOM', 'F', 'GM', 
 'INTC', 'JPM', 'MSFT', 'NFLX', 'NVDA', 'PYPL', 'TSLA', 'V', 'SQ', 'DOCU', 'UBER', 'SNOW', 'PLTR', 'ABNB']


print(len(BF3))

BF3 = ['QQQ', 'SPY', 'IWM', 'GOOGL', 'GOOG', 'AMZN', 'AMD', 'AAPL', 'BAC',
       'BA', 'C', 'DIS', 'XOM', 'INTC', 'JPM', 'MSFT', 'NFLX', 'NVDA',
       'TSLA', 'V', 'TSM', 'QCOM', 'SNOW', 'TGT', 'NKE', 'SBUX', 'ORCL',
       'PYPL', 'SQ', 'UBER', 'PLTR', 'ABNB', 'META']
PE = ['SPY', 'TSLA', 'QQQ', 'AMD', 'GOOGL','AAPL', 'MSFT', 'IWM', 'AMZN','TSM','BAC','C','XOM','DIS','SBUX','PYPL','NKE']
PE2 = ['SPY', 'TSLA', 'QQQ', 'AMD', 'GOOGL','AAPL', 'MSFT', 'IWM', 'AMZN','TSM','XOM','DIS','PYPL','BA','NFLX','NVDA','GOOG','META']

GE = [
    'QQQ','SPY','IWM', 'GOOGL', 'GOOG', 'AMZN', 'AMD','SHOP'
    'AXP', 'AAPL', 'BAC', 'BA', 'CVX', 'CSCO', 'C', 'DIS', 'XOM', 
    'GM', 'INTC', 'JPM', 'MSFT', 'NFLX', 'NVDA', 'PYPL','PANW','ADBE'
    'TSLA', 'V', 'SQ', 'TSM', 'QCOM', 'UBER', 'SNOW', 'PLTR', 'ABNB',
    "FB",'META',"TGT","NKE","SBUX","ORCL","DDOG","JNJ","AVGO","COST"
    ]

142
40


In [19]:
def create_training_data_local(key_list, prefix, bucket_name, alert_type, start_date, end_date):
    df_list = []
    hours = [10,11,12,13,14,15]
    start = start_date.split(' ')[0]
    end = end_date.split(' ')[0]
    for key in key_list:
        for hour in hours:
            try:
                data = s3.get_object(Bucket=bucket_name, Key=f'{prefix}/{key}/{hour}.csv')
                df = pd.read_csv(data.get("Body")) 
                df = df.loc[df['symbol'].isin(GE)]
                df_list.append(df)
            except Exception as e:
                print(f"{e} for {key}")
                continue

    data = pd.concat(df_list)
    data.reset_index(drop=True, inplace=True)
    data.replace([np.inf, -np.inf], 0, inplace=True)
    data.to_csv(f'/Users/charlesmiller/Documents/model_tester_data/BF/{start}_{end}GE.csv', index=False)
    return data

def build_query_keys(dates):
    key_list = []
    for date in dates:
        date_str = date.strftime('%Y-%m-%d')
        if date_str in market_holidays:
            continue
        else:
            year, month, day = date_str.split('-')
            temp = f'{year}/{month}/{day}'
            key_list.append(temp)

    return key_list

def build_date_list(start_date, end_date):
    print(start_date, end_date)
    date_diff = end_date - start_date
    numdays = date_diff.days 
    dateList = []
    for x in range (0, numdays):
        temp_date = start_date + timedelta(days = x)
        if temp_date.weekday() > 4:
            continue
        else:
            dateList.append(temp_date)
    return dateList

In [20]:
def model_runner_data(start_date,end_date):
    dates = build_date_list(start_date, end_date)
    key_list = build_query_keys(dates)
    print(key_list[-1])
    train_data = create_training_data_local(key_list, 'bf_alerts/new_features_expanded', 'inv-alerts', '', start_date.strftime('%Y-%m-%d %H:%M:%S'),end_date.strftime('%Y-%m-%d %H:%M:%S'))
    return "train_data"

In [21]:
response = model_runner_data(start_date=datetime(2015,1,1),end_date=datetime(2024,6,8))

2015-01-01 00:00:00 2024-06-08 00:00:00
2024/06/07
An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist. for 2015/11/19
An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist. for 2015/11/19
An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist. for 2015/11/19
An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist. for 2015/11/19
An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist. for 2015/11/19
An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist. for 2015/11/19
An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist. for 2018/12/05
An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist. for 2018/12/05
An error occu

In [28]:
def convert_timestamp_est(timestamp):
    # Create a naive datetime object from the UNIX timestamp
    dt_naive = datetime.utcfromtimestamp(timestamp)
    # Convert the naive datetime object to a timezone-aware one (UTC)
    dt_utc = pytz.utc.localize(dt_naive)
    # Convert the UTC datetime to EST
    dt_est = dt_utc.astimezone(pytz.timezone('US/Eastern'))
    
    return dt_est


data = pd.read_csv(f'/Users/charlesmiller/Documents/model_tester_data/BF/2015-01-01_2024-06-08GE.csv')
data['date'] = data['date'].astype(str)
data['date'] = data['date'].apply(lambda x: x.split(" ")[0])
data = data.drop(data[data['date'] == 'nan'].index)
# data.drop('date',axis=1,inplace=True)
# data['date'] = data['t'].apply(lambda x: convert_timestamp_est(x))
# data.drop(['one_pct','three_pct','Unnamed: 0.2', "Unnamed: 0","Unnamed: 0.1", "date_x","date_y",'vw_x', 'n_x', 'return_vol_240M_x', 'volume_vol_240M_x', 'return_vol_450M_x', 'volume_vol_450M_x', 'min_vol_diff_x', 'min_vol_diff_pct_x', 'min_volume_vol_diff_x', 'min_volume_vol_diff_pct_x', 'return_vol_8H_x', 'return_vol_16H_x', 'volume_vol_8H_x', 'volume_vol_16H_x', 'hour_vol_diff_x', 'hour_vol_diff_pct_x', 'hour_volume_vol_diff_x', 'hour_volume_vol_diff_pct_x', 'return_vol_5D_x', 'return_vol_10D_x', 'return_vol_30D_x', 'volume_vol_5D_x', 'volume_vol_10D_x', 'volume_vol_30D_x', 'daily_vol_diff_x', 'daily_vol_diff_pct_x', 'daily_vol_diff30_x', 'daily_vol_diff_pct30_x', 'daily_volume_vol_diff_x', 'daily_volume_vol_diff_pct_x', 'daily_volume_vol_diff30_x', 'daily_volume_vol_diff_pct30_x', 'vw_y', 'n_y', 'return_vol_240M_y', 'volume_vol_240M_y', 'return_vol_450M_y', 'volume_vol_450M_y', 'min_vol_diff_y', 'min_vol_diff_pct_y', 'min_volume_vol_diff_y', 'min_volume_vol_diff_pct_y', 'return_vol_8H_y', 'return_vol_16H_y', 'volume_vol_8H_y', 'volume_vol_16H_y', 'hour_vol_diff_y', 'hour_vol_diff_pct_y', 'hour_volume_vol_diff_y', 'hour_volume_vol_diff_pct_y', 'return_vol_5D_y', 'return_vol_10D_y', 'return_vol_30D_y', 'volume_vol_5D_y', 'volume_vol_10D_y', 'volume_vol_30D_y', 'daily_vol_diff_y', 'daily_vol_diff_pct_y', 'daily_vol_diff30_y', 'daily_vol_diff_pct30_y', 'daily_volume_vol_diff_y', 'daily_volume_vol_diff_pct_y', 'daily_volume_vol_diff30_y', 'daily_volume_vol_diff_pct30_y'], axis=1, inplace=True)
# data['roc_diff'] = data['roc'] - data['roc5'] 
# data['range_vol_diff5'] = (data['range_vol'] - data['range_vol5MA'])
# data['close_diff_deviation3'] = abs(data['close_diff3'])/(data['threeD_stddev50']*100)
# data['close_diff_deviation'] = abs(data['close_diff'])/(data['oneD_stddev50']*100)
data.dropna(subset=['date'], inplace=True)
data['date'] = data['date'].apply(lambda x: datetime.strptime(x,"%Y-%m-%d"))
data['day_of_week'] = data['date'].apply(lambda x: x.dayofweek).astype(int)
data['day_of_month'] = data['date'].apply(lambda x: x.day).astype(int)
data['month'] = data['date'].apply(lambda x: x.month).astype(int)
data['year'] = data['date'].apply(lambda x: x.year).astype(int)
data.replace([np.inf, -np.inf], 0, inplace=True)
data.to_csv(f'/Users/charlesmiller/Documents/model_tester_data/BF/2015-01-01_2024-06-08GE.csv', index=False)

In [23]:
data = pd.read_csv(f'/Users/charlesmiller/Documents/model_tester_data/BF/2015-01-01_2024-04-19BF3PE.csv')
data['symbol'].unique()

array(['QQQ', 'SPY', 'IWM', 'GOOGL', 'GOOG', 'AMZN', 'AMD', 'AAPL', 'BAC',
       'BA', 'C', 'DIS', 'XOM', 'INTC', 'JPM', 'MSFT', 'NFLX', 'NVDA',
       'TSLA', 'V', 'TSM', 'QCOM', 'SNOW', 'TGT', 'NKE', 'SBUX', 'ORCL',
       'PYPL', 'SQ', 'UBER', 'PLTR', 'ABNB', 'META'], dtype=object)

In [24]:
data.tail(15)[['one_max','three_max']]

Unnamed: 0,one_max,three_max
417621,0.00916,0.00916
417622,0.024471,0.00491
417623,0.006782,0.006782
417624,0.013888,0.005778
417625,0.003739,0.003739
417626,0.007212,0.007212
417627,0.005123,0.005123
417628,0.00874,0.00874
417629,0.007816,0.007816
417630,0.009755,0.009755


In [25]:
new_df = data.rename(columns={'one_max':'threeM','three_max':'oneM'})

In [26]:
new_df.tail(15)[['oneM','threeM']]

Unnamed: 0,oneM,threeM
417621,0.00916,0.00916
417622,0.00491,0.024471
417623,0.006782,0.006782
417624,0.005778,0.013888
417625,0.003739,0.003739
417626,0.007212,0.007212
417627,0.005123,0.005123
417628,0.00874,0.00874
417629,0.007816,0.007816
417630,0.009755,0.009755


In [27]:
new_df.to_csv(f'/Users/charlesmiller/Documents/model_tester_data/BF/reassign_test.csv', index=False)