In [1]:
import boto3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import math
import random
from decimal import Decimal, getcontext
import pandas_market_calendars as mcal
import ast
from sklearn.preprocessing import StandardScaler

In [2]:
s3 = boto3.client('s3')

training_bucket = "icarus-research-data"

# Create a calendar
nyse = mcal.get_calendar('NYSE')
holidays = nyse.regular_holidays
market_holidays = holidays.holidays()

weekly_expiries = ['SPY', 'IVV', 'QQQ', 'GLD', 'IWM', 'EFA', 'XLK', 'XLV', 'TLT', 'LQD', 'XLE', 'TQQQ', 'SQQQ', 'SPXS', 'SPXL', 'SOXL', 'SOXS', 'MMM', 'ABT', 'ABBV', 'ACN', 'ATVI', 'ADM', 'ADBE', 'ADP', 
                   'AAP', 'AFL', 'ALB', 'ALGN', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMD', 'AAL', 'AXP', 'AIG', 'ABC', 'AMGN', 'ADI', 'APA', 'AAPL', 'AMAT', 'ANET', 'T', 'ADSK', 'BAC', 'BBWI', 'BAX', 'BBY', 'BIIB', 
                   'BLK', 'BA', 'BKNG', 'BMY', 'AVGO', 'CZR', 'CPB', 'COF', 'CAH', 'KMX', 'CCL', 'CAT', 'CBOE', 'CNC', 'CF', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CI', 'CSCO', 'C', 'CLX', 'CME', 'KO', 'CMCSA', 'CMA', 'CAG', 
                   'COP', 'STZ', 'GLW', 'COST', 'CTRA', 'CSX', 'CVS', 'DHI', 'DHR', 'DE', 'DAL', 'DVN', 'DLR', 'DFS', 'DISH', 'DIS', 'DG', 'DLTR', 'DPZ', 'DOW', 'DD', 'EBAY', 'EA', 'ELV', 'LLY', 'EMR', 'ENPH', 'EOG', 'EQT', 
                   'ETSY', 'EXPE', 'XOM', 'FDX', 'FITB', 'FSLR', 'FI', 'F', 'FTNT', 'FOXA', 'FCX', 'GEHC', 'GNRC', 'GD', 'GE', 'GM', 'GILD', 'GS', 'HAL', 'HSY', 'HES', 'HD', 'HON', 'HRL', 'HPQ', 'HUM', 'HBAN', 'IBM', 'ILMN', 
                   'INTC', 'IP', 'INTU', 'ISRG', 'JNJ', 'JPM', 'JNPR', 'KEY', 'KMB', 'KMI', 'KLAC', 'KHC', 'KR', 'LRCX', 'LVS', 'LEN', 'LMT', 'LOW', 'MRO', 'MPC', 'MAR', 'MA', 'MTCH', 'MCD', 'MCK', 'MDT', 'MRK', 'META', 'MET', 
                   'MGM', 'MU', 'MSFT', 'MRNA', 'MDLZ', 'MS', 'MOS', 'NTAP', 'NFLX', 'NEM', 'NKE', 'NSC', 'NOC', 'NCLH', 'NUE', 'NVDA', 'NXPI', 'OXY', 'ON', 'ORCL', 'PARA', 'PYPL', 'PEP', 'PFE', 'PCG', 'PM', 'PSX', 'PXD', 'PNC', 
                   'PPG', 'PG', 'PHM', 'QCOM', 'RTX', 'REGN', 'ROST', 'RCL', 'SPGI', 'CRM', 'SLB', 'STX', 'NOW', 'SWKS', 'SEDG', 'SO', 'LUV', 'SBUX', 'TMUS', 'TROW', 'TTWO', 'TPR', 'TGT', 'TSLA', 'TXN', 'TMO', 'TJX', 'TSCO', 'TFC', 
                   'TSN', 'USB', 'ULTA', 'UNP', 'UAL', 'UPS', 'URI', 'UNH', 'VLO', 'VZ', 'VRTX', 'VFC', 'V', 'WBA', 'WMT', 'WBD', 'WM', 'WFC', 'WDC', 'WHR', 'WMB', 'WYNN', 'ZION']

In [3]:
def create_validation_data(key_list, prefix, bucket_name):
    df_list = []
    hours = [10,11,12,13,14,15]
    for key in key_list:
        for hour in hours:
            try:
                data = s3.get_object(Bucket=bucket_name, Key=f'{prefix}{key}/{hour}.csv')
                df = pd.read_csv(data.get("Body")) 
                df_list.append(df)
            except:
                continue

    data = pd.concat(df_list)
    data.reset_index(drop=True, inplace=True)
    # data = full_df.drop(full_df[full_df.third_friday == 'True'].index)
    # data.drop(data[data.third_friday == 'NO RESULTS'].index, inplace=True)
    data['volume_10DDiff'] = data.apply(lambda x: x.v - x.volume_10MA, axis=1)
    data['volume_25DDiff'] = data.apply(lambda x: x.v - x.volume_25MA, axis=1)
    # data['rvi_delta'] = data.apply(lambda x: x.rvi5 - x.rvi10, axis=1)

    data['dt'] = pd.to_datetime(data['date'])
    data['day_of_week'] = data['dt'].apply(lambda x: x.dayofweek)
    data['day_of_month'] = data['dt'].apply(lambda x: x.day)
    data['month'] = data['dt'].apply(lambda x: x.month)
    data['year'] = data['dt'].apply(lambda x: x.year)
    data['hour'] = data['dt'].apply(lambda x: x.hour)

    data = data.loc[data['symbol'].isin(weekly_expiries)]

    return data

def create_training_data_v2(key_list, prefix, bucket_name):
    df_list = []
    hours = [10,11,12,13,14,15]
    for key in key_list:
        for hour in hours:
            try:
                data = s3.get_object(Bucket=bucket_name, Key=f'{prefix}{key}/{hour}.csv')
                df = pd.read_csv(data.get("Body")) 
                df_list.append(df)
            except:
                continue

    data = pd.concat(df_list)
    data.reset_index(drop=True, inplace=True)
    # data = full_df.drop(full_df[full_df.third_friday == 'True'].index)
    # data.drop(data[data.third_friday == 'NO RESULTS'].index, inplace=True)
    data['volume_10DDiff'] = data.apply(lambda x: x.v - x.volume_10MA, axis=1)
    data['volume_25DDiff'] = data.apply(lambda x: x.v - x.volume_25MA, axis=1)

    data['dt'] = pd.to_datetime(data['date'])
    data['day_of_week'] = data['dt'].apply(lambda x: x.dayofweek)
    data['day_of_month'] = data['dt'].apply(lambda x: x.day)
    data['month'] = data['dt'].apply(lambda x: x.month)
    data['year'] = data['dt'].apply(lambda x: x.year)
    data['hour'] = data['dt'].apply(lambda x: x.hour)
    # data['rvi_delta'] = data.apply(lambda x: x.rvi5 - x.rvi10, axis=1)

    # data = data.loc[data['symbol'].isin(weekly_expiries)]


    return data

In [4]:
def build_date_list(start_date, end_date):
    date_diff = end_date - start_date
    numdays = date_diff.days 
    dateList = []
    for x in range (0, numdays):
        temp_date = start_date + timedelta(days = x)
        if temp_date.weekday() > 4:
            continue
        else:
            dateList.append(temp_date)
    return dateList

def build_query_keys_hist():
    start_date = datetime(2021,1,5)
    date_diff = datetime(2022,7,29) - start_date
    numdays = date_diff.days 
    key_list = []
    for x in range (0, numdays):
        temp_date = start_date + timedelta(days = x)
        if temp_date.weekday() > 4:
            continue
        else:
            date_str = temp_date.strftime('%Y-%m-%d')
            if date_str in market_holidays:
                continue
            else:
                date_str = date_str.replace("-","/")
                key_list.append(date_str)
        
    return key_list
    
def build_query_keys(dates):
    key_list = []
    for date in dates:
        date_str = date.strftime('%Y-%m-%d')
        if date_str in market_holidays:
            continue
        else:
            year, month, day = date_str.split('-')
            temp = f'{year}/{month}/{day}'
            key_list.append(temp)

    return key_list

def build_query_keys_validation(end_date):
    validation_end_date = end_date + timedelta(days=7)
    dates = build_date_list(end_date, validation_end_date)
    key_list = []
    for date in dates:
        date_str = date.strftime('%Y-%m-%d')
        if date_str in market_holidays:
            continue
        else:
            year, month, day = date_str.split('-')
            temp = f'{year}/{month}/{day}'
            key_list.append(temp)

    return key_list

In [5]:
def model_results_analyzer(predictions, y_validate, target_value):
    result_list = []
    counter = 0
    predictions_series = pd.Series(predictions,name='prediction_values')
    for x in predictions:
        if x == 1:
            if y_validate.iloc[counter] == 1:
                classification_result = 0
            else:
                classification_result = 1
        elif x == 0:
            if y_validate.iloc[counter] == 0:
                classification_result = 2
            else: 
                classification_result = 3
        result_list.append(classification_result)
        counter += 1
    three_max = pd.Series(y_validate,name='three_max')
    df = pd.concat([pd.Series(result_list,name='classifier_performance'),predictions_series,three_max],axis=1)
    df.reset_index(drop=True, inplace=True)
    # df = pd.DataFrame([result_list, y_validate], columns=['classifier_performance', 'prediction_score'])

    tp = df.loc[df['classifier_performance'] == 0]
    fp = df.loc[df['classifier_performance'] == 1]
    tn = df.loc[df['classifier_performance'] == 2]
    fn = df.loc[df['classifier_performance'] == 3]

    # tp_scr = tp["prediction_score"].mean()
    # fp_scr = fp["prediction_score"].mean()
    # tn_scr = tn["prediction_score"].mean()
    # fn_scr = fn["prediction_score"].mean()

    return len(tp), "0", len(fp), "0", len(tn), "0", len(fn), "0"

In [6]:
def create_dynamo_record(tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr, model_name, deployment_date, dataset_name, hyperparam_str, feature_str, target_str, fi_list):    
    ddb = boto3.resource('dynamodb','us-east-1')
    table = ddb.Table('icarus-models-results-table')
    eval_start = deployment_date
    eval_end = deployment_date + timedelta(days=4)
    try:
        precision = (tp/fp)
    except:
        precision = 0

    ## FILL IN
    item={
        'model_name': model_name,
        'deployment_date': deployment_date.strftime("%Y-%m-%d"),
        'algorithm_type': 'xgboost',
        'dataset': dataset_name,
        'TP': tp,
        'TPpct': Decimal("0"),
        'FP': fp,
        'FPpct': Decimal("0"),
        'TN': tn,
        'TNpct': Decimal("0"),
        'FN': fn,
        'FNpct': Decimal("0"),
        'precision_ratio': Decimal(str(precision)),
        'evaluation_timeperiod': f'{eval_start.strftime("%Y-%m-%d")}_{eval_end.strftime("%Y-%m-%d")}',
        'live': False,
        'hyperparameters': {hyperparam_str},
        'features' : {feature_str},
        'target' : target_str,
        'feature_importances': fi_list

    }

    print(item)
    response = table.put_item(
            Item=item
        )

    return response

In [7]:
def train_model(features, dataset, validation_dataset, target_label, target_value, hyperparams):
    # dataset[["rsi", "cmf", "adx", "roc", "roc3", "roc5","three_max"]] = dataset[["rsi", "cmf", "adx", "roc", "roc3", "roc5","three_max"]].apply(pd.to_numeric)
    # validation_dataset[["rsi", "cmf", "adx", "roc", "roc3", "roc5","three_max"]] = validation_dataset[["rsi", "cmf", "adx", "roc", "roc3", "roc5","three_max"]].apply(pd.to_numeric)
    dataset.loc[:, 'label'] = (dataset['three_min'] < -.04).astype(int)
    validation_dataset.loc[:, 'label'] = (validation_dataset['three_min'] < -.04).astype(int)

    scaler = StandardScaler()
    
    X = dataset[features]
    y = dataset['label']
    # X = scaler.fit_transform(X)

    # train_len = int(.7*len(dataset))
    # X_train = X.iloc[0:train_len]
    # y_train = y[0:train_len]
    # X_test = X.iloc[train_len:]
    # y_test = y[train_len:]

    print(y.value_counts())
    X_validate = validation_dataset[features]
    y_validate = validation_dataset['label']
    # X_validate = scaler.fit_transform(X_validate)

    # pos_bal_weight = (len(y.loc[y == 0])) / (len(y.loc[y == 1]))
    xgb_model = xgb.XGBClassifier(subsample=hyperparams['subsample'],num_round=hyperparams['num_round'],min_child_weight=hyperparams['min_child_weight'],max_depth=hyperparams['max_depth'],learning_rate=hyperparams['learning_rate'],gamma=hyperparams['gamma'],colsample_bytree=hyperparams['colsample_bytree'],silent=0,objective='binary:logistic')
    xgb_model.fit(X,y)

    predictions = xgb_model.predict(X_validate)
    tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr = model_results_analyzer(predictions, y_validate, target_value)

    fi = xgb_model.feature_importances_
    fi_list = []
    counter = 0
    for x in features:
        fi_list.append({x:fi[counter]})
        counter += 1
    print(tp,fp,tn,fn)
    return tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr, str(fi_list), predictions

In [8]:
def model_runner_v2(model_name, dataset_name, title, features, target_label, target_value,start_date, end_date,deployment_date, feature_str, target_str, hyperparams_str, hyperparams):
    dates = build_date_list(start_date, end_date)
    key_list = build_query_keys(dates)
    validation_keys = build_query_keys_validation(deployment_date)
    # dataset = create_training_data_v2(key_list, 'inv_alerts_with_price/losers/', 'icarus-research-data')
    # recent_dataset = create_training_data_v2(key_list[-100:], 'inv_alerts_with_price/losers/', 'icarus-research-data')
    validation_dataset = create_validation_data(validation_keys, 'vdiff_loss/', 'inv-alerts')
    dataset = create_training_data_v2(key_list, 'vdiff_loss/', 'inv-alerts')
    # full_data = build_multi_time_dfs(key_list, dataset)
    # mod_dataset = pd.concat([dataset, recent_dataset])
    # hyperparams = optimize_model_params(dataset, features)
    tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr, fi_list, predictions = train_model(features, dataset, validation_dataset, target_label, target_value, hyperparams)
    validation_dataset['predictions'] = predictions
    validation_csv = validation_dataset.to_csv()
    put_response = s3.put_object(Bucket="icarus-research-data", Key=f"backtesting_data/inv_alerts/standard_dataset/vdiff_lossP/{deployment_date.strftime('%Y-%m-%d')}.csv", Body=validation_csv)
    response = create_dynamo_record(tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr, model_name, deployment_date, dataset_name,hyperparams_str,feature_str, target_str, fi_list)
    return "response"

In [9]:
def build_evaluation_period(eval_start, eval_end):
    dates_list = []
    while eval_start <= eval_end:
        date_object = {
            "deployment_date": eval_start,
            "dataset_end": eval_start - timedelta(days=10)
        }
        dates_list.append(date_object)
        eval_start += timedelta(days=7)
    return dates_list


In [10]:

## Need to re-run 'inv_alerts_vdiffP_weekly:fulltrain_last60_unscaled_tuned'
model_name = 'inv_alerts_vdiff_lossP_weekly:fulltrain_fixedvDiff_tuned'
dataset_name = 'inv_alerts_standard'
title = 'most_actives'
hyperparameter_string = "{'subsample': 0.6, 'num_round': 500, 'min_child_weight': 8, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 8, 'colsample_bytree': 0.6}"
hyperparams =  {'subsample': 0.6, 'num_round': 500, 'min_child_weight': 8, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 8, 'colsample_bytree': 0.6}
feature_str = "'rsi','cmf','adx','roc','roc3','roc5','PCR','v','volume_10MA','volume_25MA','volume_10DDiff','volume_25DDiff','close_diff', 'v_diff_pct','day_of_week','hour'"
features = ['rsi','cmf','adx','roc','roc3','roc5','PCR','v','volume_10MA','volume_25MA','volume_10DDiff','volume_25DDiff','close_diff', 'v_diff_pct','day_of_week','hour']
target_str = 'three_max > 3'
target_label = 'three_max'
target_value = 3.0
dataset_start_date = datetime(2021,1,1) 

dates_list = build_evaluation_period(datetime(2022,10,24), datetime(2023,7,7))
for date in dates_list:
    response = model_runner_v2(model_name, dataset_name, title, features, target_label, target_value,dataset_start_date,date['dataset_end'],date['deployment_date'], feature_str, target_str,hyperparameter_string, hyperparams)
    print(response)
    

label
0    100752
1     33948
Name: count, dtype: int64
Parameters: { "num_round", "silent" } are not used.

23 18 486 55
{'model_name': 'inv_alerts_vdiff_lossP_weekly:fulltrain_fixedvDiff_tuned', 'deployment_date': '2022-10-24', 'algorithm_type': 'xgboost', 'dataset': 'inv_alerts_standard', 'TP': 23, 'TPpct': Decimal('0'), 'FP': 18, 'FPpct': Decimal('0'), 'TN': 486, 'TNpct': Decimal('0'), 'FN': 55, 'FNpct': Decimal('0'), 'precision_ratio': Decimal('1.2777777777777777'), 'evaluation_timeperiod': '2022-10-24_2022-10-28', 'live': False, 'hyperparameters': {"{'subsample': 0.6, 'num_round': 500, 'min_child_weight': 8, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 8, 'colsample_bytree': 0.6}"}, 'features': {"'rsi','cmf','adx','roc','roc3','roc5','PCR','v','volume_10MA','volume_25MA','volume_10DDiff','volume_25DDiff','close_diff', 'v_diff_pct','day_of_week','hour'"}, 'target': 'three_max > 3', 'feature_importances': "[{'rsi': 0.041849725}, {'cmf': 0.036460016}, {'adx': 0.03564944}, {'roc'