In [13]:
import boto3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import math
import random
from decimal import Decimal, getcontext
import pandas_market_calendars as mcal
import ast
from sklearn.preprocessing import StandardScaler
import pytz
import warnings
warnings.filterwarnings('ignore')

In [14]:
s3 = boto3.client('s3')

# Create a calendar
nyse = mcal.get_calendar('NYSE')
holidays = nyse.regular_holidays
market_holidays = holidays.holidays()

index = ["QQQ","SPY","IWM","TLT"]
test_lag = ["DAL","UAL","VZ","T","AAL","AMC"]
train_lag = ["AMC"]
bf_plus = ["AMD","NVDA","PYPL","GOOG","GOOGL","AMZN","PLTR","BAC","AAPL","NFLX","ABNB","CRWD","SHOP","FB","CRM",
            "MSFT","F","V","MA","JNJ","DIS","JPM","INTC","ADBE","BA","CVX","MRNA","PFE","SNOW","NKE",'META',
            'C','TGT','MMM','SQ','PANW','DAL','CSCO','UBER','SBUX']


high_vol = ['ZM', 'UBER', 'TDOC', 'UAL', 'RCL', 'AMZN', 'ABNB', 'META', 'TSLA',
       'LCID', 'NIO', 'RIVN', 'SQ', 'SHOP', 'DOCU', 'ROKU',
       'TWLO', 'DDOG', 'ZS', 'NET', 'OKTA', 'ETSY', 'PINS',
       'FUTU', 'SE', 'RBLX', 'AMD', 'NVDA', 'PYPL', 'PLTR', 'NFLX',
       'CRWD', 'MRNA', 'SNOW', 'SOFI', 'WBD', 'ARM', 'SNAP', 'BILI',
       'AAL', 'CCL', 'NCLH', 'LYFT', 'BIDU', 'JD', 'BABA', 'MU', 'AMAT',
       'DKNG', 'CZR', 'VXX']

high_vol = ['COIN','BILI','UPST','CVNA',"NIO","BABA","ROKU","RBLX","SE","SNAP","LCID","ZM","TDOC","UBER","RCL",
            'RIVN',"BIDU","FUTU","TSLA","JD","HOOD","CHWY","MARA","SNAP",'TWLO', 'DDOG', 'ZS', 'NET', 'OKTA',
            "DOCU",'SQ', 'SHOP',"PLTR","CRWD",'MRNA', 'SNOW', 'SOFI','LYFT']


mid_vol =['CMG', 'AXP', 'DAL', 'GE', 'TSM', 'GOOG', 'GOOGL', 'BAC', 'AAPL',
       'CRM', 'MSFT', 'F', 'DIS', 'ADBE', 'BA', 'CVX', 'C', 'CAT', 'MS',
       'WFC', 'TGT', 'INTC', 'PANW', 'ORCL', 'LOW', 'SBUX', 'NKE', 'QCOM',
       'AVGO', 'TXN', 'MGM', 'XOM']

low_vol = ['MMM', 'MRK', 'HD', 'VZ', 'V', 'MA', 'JPM',
       'PFE', 'GS', 'IBM', 'CSCO', 'WMT', 'COST', 'QQQ', 'SPY',
       'TLT', 'IWM']

bf2 = ['QQQ','IWM','AAPL','NVDA','AMD','AMZN','SPY','MSFT','GOOG','GOOGL','C','BAC',
      'JPM','XOM','CVX','CSCO','INTC','DIS','IBM','BA', 'V','AXP','INTU','ADBE','F','GM']

hv2 = ['TSLA','META','PYPL','UBER','DDOG','SNOW','LYFT','SNOW','ROKU','NFLX','ZM','DOCU','AMD','NVDA'
      'AFRM','SNAP','OKTA','TEAM','PLTR','SHOP','SQ','CRWD','TSM','PANW','WDAY','SPOT','PTON','TDOC','NET']


In [15]:
# def create_training_data_local(key_list, prefix, bucket_name, start_date, end_date):
#     df_list = []
#     hours = [10,11,12,13,14,15]
#     start = start_date.split(' ')[0]
#     end = end_date.split(' ')[0]
#     # print(file_key)
#     for key in key_list:
#         for hour in hours:
#             try:
#                 data = s3.get_object(Bucket=bucket_name, Key=f'{prefix}{key}/{hour}.csv')
#                 df = pd.read_csv(data.get("Body")) 
#                 df_list.append(df)
#                 # df['hour'] = hour
#             except:
#                 continue

#     data = pd.concat(df_list)
#     data = data.loc[~data['symbol'].isin(index)]
#     # data = data.loc[~data['symbol'].isin(laggards)]
#     data.reset_index(drop=True, inplace=True)
#     # data['date'] = pd.to_datetime(data['date'])
#     # data['day_of_week'] = data['dt'].apply(lambda x: x.dayofweek)
#     # data = data.round(3)
#     data.replace([np.inf, -np.inf], 0, inplace=True)

#     data.to_csv(f'/Users/charlesmiller/Documents/ALL_SYM/{start}_{end}.csv', index=False)

#     return data

def pull_training_data_local(end_date,start_date):
    data = pd.read_csv(f'/Users/charlesmiller/Documents/model_tester_data/HV/2018-01-01_2023-12-23.csv')
    data['dt'] = pd.to_datetime(data['date'])
    data = data.loc[data['dt'] <= end_date]
    data['one_max_vol'] = (data['one_max']/data['return_vol_10D']).round(3)
    data['three_max_vol'] = (data['three_max']/data['return_vol_10D']).round(3)
    data['one_min_vol'] = (data['one_min']/data['return_vol_10D']).round(3)
    data['three_min_vol'] = (data['three_min']/data['return_vol_10D']).round(3)    
    data['one_max_vol30'] = (data['one_max']/data['return_vol_30D']).round(3)
    data['three_max_vol30'] = (data['three_max']/data['return_vol_30D']).round(3)
    data['one_min_vol30'] = (data['one_min']/data['return_vol_30D']).round(3)
    data['three_min_vol30'] = (data['three_min']/data['return_vol_30D']).round(3)    
    data['cd_vol'] = (data['close_diff']/data['return_vol_10D']).round(3)
    data['cd_vol3'] = (data['close_diff3']/data['return_vol_10D']).round(3)
    data['cd_vol30'] = (data['close_diff']/data['return_vol_30D']).round(3)
    data['cd_vol330'] = (data['close_diff3']/data['return_vol_30D']).round(3)
    data = data.loc[data['symbol'].isin(hv2)]
    data.replace([np.inf, -np.inf], 0, inplace=True)

    return data


def pull_validation_data_local(end_date,start_date):
    data = pd.read_csv(f'/Users/charlesmiller/Documents/model_tester_data/HV/2018-01-01_2023-12-23.csv')
    data['dt'] = pd.to_datetime(data['date'])
    data['one_max_vol'] = (data['one_max']/data['return_vol_10D']).round(3)
    data['three_max_vol'] = (data['three_max']/data['return_vol_10D']).round(3)
    data['one_min_vol'] = (data['one_min']/data['return_vol_10D']).round(3)
    data['three_min_vol'] = (data['three_min']/data['return_vol_10D']).round(3)    
    data['one_max_vol30'] = (data['one_max']/data['return_vol_30D']).round(3)
    data['three_max_vol30'] = (data['three_max']/data['return_vol_30D']).round(3)
    data['one_min_vol30'] = (data['one_min']/data['return_vol_30D']).round(3)
    data['three_min_vol30'] = (data['three_min']/data['return_vol_30D']).round(3)    
    data['cd_vol'] = (data['close_diff']/data['return_vol_10D']).round(3)
    data['cd_vol3'] = (data['close_diff3']/data['return_vol_10D']).round(3)
    data['cd_vol30'] = (data['close_diff']/data['return_vol_30D']).round(3)
    data['cd_vol330'] = (data['close_diff3']/data['return_vol_30D']).round(3)
    data = data.loc[data['symbol'].isin(hv2)]
    data = data.loc[data['dt'] <= end_date]
    data = data.loc[data['dt'] >= start_date]
    data.replace([np.inf, -np.inf], 0, inplace=True)
    return data

In [16]:
def build_date_list(start_date, end_date):
    print(start_date, end_date)
    date_diff = end_date - start_date
    numdays = date_diff.days 
    dateList = []
    for x in range (0, numdays):
        temp_date = start_date + timedelta(days = x)
        if temp_date.weekday() > 4:
            continue
        else:
            dateList.append(temp_date)
    return dateList

def build_query_keys_hist():
    start_date = datetime(2021,1,5)
    date_diff = datetime(2022,7,29) - start_date
    numdays = date_diff.days 
    key_list = []
    for x in range (0, numdays):
        temp_date = start_date + timedelta(days = x)
        if temp_date.weekday() > 4:
            continue
        else:
            date_str = temp_date.strftime('%Y-%m-%d')
            if date_str in market_holidays:
                continue
            else:
                date_str = date_str.replace("-","/")
                key_list.append(date_str)
        
    return key_list
    
def build_query_keys(dates):
    key_list = []
    for date in dates:
        date_str = date.strftime('%Y-%m-%d')
        if date_str in market_holidays:
            continue
        else:
            year, month, day = date_str.split('-')
            temp = f'{year}/{month}/{day}'
            key_list.append(temp)

    return key_list

def build_query_keys_validation(end_date):
    validation_end_date = end_date + timedelta(days=7)
    dates = build_date_list(end_date, validation_end_date)
    key_list = []
    for date in dates:
        date_str = date.strftime('%Y-%m-%d')
        if date_str in market_holidays:
            continue
        else:
            year, month, day = date_str.split('-')
            temp = f'{year}/{month}/{day}'
            key_list.append(temp)

    return key_list

def build_validation_dates_local(deployment_date):
    end_date = deployment_date + timedelta(days=5)
    return end_date

In [17]:
def model_results_analyzer(predictions, y_validate, target_value):
    result_list = []
    counter = 0
    predictions_series = pd.Series(predictions,name='prediction_values')
    for x in predictions:
        if x == 1:
            if y_validate.iloc[counter] == 1:
                classification_result = 0
            else:
                classification_result = 1
        elif x == 0:
            if y_validate.iloc[counter] == 0:
                classification_result = 2
            else: 
                classification_result = 3
        result_list.append(classification_result)
        counter += 1
    three_max = pd.Series(y_validate,name='three_max')
    df = pd.concat([pd.Series(result_list,name='classifier_performance'),predictions_series,three_max],axis=1)
    df.reset_index(drop=True, inplace=True)
    # df = pd.DataFrame([result_list, y_validate], columns=['classifier_performance', 'prediction_score'])

    tp = df.loc[df['classifier_performance'] == 0]
    fp = df.loc[df['classifier_performance'] == 1]
    tn = df.loc[df['classifier_performance'] == 2]
    fn = df.loc[df['classifier_performance'] == 3]

    # tp_scr = tp["prediction_score"].mean()
    # fp_scr = fp["prediction_score"].mean()
    # tn_scr = tn["prediction_score"].mean()
    # fn_scr = fn["prediction_score"].mean()

    return len(tp), "0", len(fp), "0", len(tn), "0", len(fn), "0"

In [18]:
def create_dynamo_record(tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr, model_name, deployment_date, dataset_name, hyperparam_str, feature_str, target_str, fi_list):    
    ddb = boto3.resource('dynamodb','us-east-1')
    table = ddb.Table('icarus-models-results-table')
    eval_start = deployment_date
    eval_end = deployment_date + timedelta(days=4)
    try:
        precision = (tp/fp)
    except:
        precision = 0

    ## FILL IN
    item={
        'model_name': model_name,
        'deployment_date': deployment_date.strftime("%Y-%m-%d"),
        'algorithm_type': 'xgboost',
        'dataset': dataset_name,
        'TP': tp,
        'TPpct': Decimal("0"),
        'FP': fp,
        'FPpct': Decimal("0"),
        'TN': tn,
        'TNpct': Decimal("0"),
        'FN': fn,
        'FNpct': Decimal("0"),
        'precision_ratio': Decimal(str(precision)),
        'evaluation_timeperiod': f'{eval_start.strftime("%Y-%m-%d")}_{eval_end.strftime("%Y-%m-%d")}',
        'live': False,
        'hyperparameters': {hyperparam_str},
        'features' : {feature_str},
        'target' : target_str,
        'feature_importances': fi_list

    }

    print(item)
    response = table.put_item(
            Item=item
        )

    return response

In [19]:
def train_model(features, dataset, validation_dataset, target_label, target_value, hyperparams):
    dataset.loc[:, 'label'] = (dataset[target_label] > target_value).astype(int)
    validation_dataset.loc[:, 'label'] = (validation_dataset[target_label] > target_value).astype(int)

    # dataset = dataset.round(3)
    # validation_dataset = validation_dataset.round(3)

    X = dataset[features].astype(float)
    y = dataset['label']

    X_validate = validation_dataset[features].astype(float)
    y_validate = validation_dataset['label']


    xgb_model = xgb.XGBClassifier(subsample=hyperparams['subsample'],num_round=hyperparams['num_round'],min_child_weight=hyperparams['min_child_weight'],max_depth=hyperparams['max_depth'],learning_rate=hyperparams['learning_rate'],gamma=hyperparams['gamma'],colsample_bytree=hyperparams['colsample_bytree'],verbosity=0,objective='binary:logistic',random_state=42)
    xgb_model.fit(X,y)

    print(y.value_counts())
    predictions = xgb_model.predict(X_validate)
    probabilities = xgb_model.predict_proba(X_validate)
    tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr = model_results_analyzer(predictions, y_validate, target_value)

    fi = xgb_model.feature_importances_
    fi_list = []
    counter = 0
    for x in features:
        fi_list.append({x:fi[counter]})
        counter += 1
    print(tp,fp,tn,fn)
    return tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr,str(fi_list), predictions, probabilities


def train_model_TSSim(features, dataset, validation_dataset, target_label, target_value, hyperparams):
    dataset.loc[:, 'label'] = (dataset[target_label] > target_value).astype(int)
    validation_dataset.loc[:, 'label'] = (validation_dataset[target_label] > target_value).astype(int)

    dataset = dataset.round(5)
    validation_dataset = validation_dataset.round(3)


    X = dataset[features].astype(float)
    y = dataset['label']

    X_validate = validation_dataset[features].astype(float)
    y_validate = validation_dataset['label']
    

    xgb_model = xgb.XGBClassifier(subsample=hyperparams['subsample'],num_round=hyperparams['num_round'],min_child_weight=hyperparams['min_child_weight'],max_depth=hyperparams['max_depth'],learning_rate=hyperparams['learning_rate'],gamma=hyperparams['gamma'],colsample_bytree=hyperparams['colsample_bytree'],verbosity=0,objective='binary:logistic',random_state=42)
    xgb_model.fit(X,y)

    predictions = xgb_model.predict(X_validate)
    tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr = model_results_analyzer(predictions, y_validate, target_value)

    return tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr,"str(fi_list)", predictions, "probabilities"

In [20]:
def model_runner_v2RM(model_name, dataset_name, title, features, target_label, target_percentile, start_date, end_date,deployment_date, feature_str, hyperparams_str, hyperparams,local_data, dataset_start_date):
    validation_end_date = build_validation_dates_local(deployment_date)
    validation_dataset = pull_validation_data_local(validation_end_date,deployment_date)
    dataset = pull_training_data_local(end_date, dataset_start_date)
    dates_list = pull_gmm_labels(deployment_date)
    rm_dataset = dataset.loc[dataset['dt'].isin(dates_list)]
    target_value = rm_dataset[target_label].quantile(target_percentile).round(3)
    print(target_value)
    print(len(rm_dataset)/len(dataset))
    tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr, fi_list, predictions, probabilities = train_model(features, rm_dataset, validation_dataset, target_label, target_value, hyperparams)
    response = create_dynamo_record(tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr, model_name, deployment_date, dataset_name,hyperparams_str,feature_str, f"{target_value}+{target_label}", fi_list)
    validation_dataset['probabilities'] = probabilities[:,1]
    validation_dataset['predictions'] = predictions
    validation_dataset['target_value'] = target_value
    validation_dataset['target_pct'] = validation_dataset['target_value'] * validation_dataset['return_vol_10D']
    validation_csv = validation_dataset.to_csv()
    put_response = s3.put_object(Bucket="icarus-research-data", Key=f"backtesting_data/inv_alerts/{dataset_name}/{title}/{deployment_date.strftime('%Y-%m-%d')}.csv", Body=validation_csv)
    return response

def model_runner_v2(model_name, dataset_name, title, features, target_label, target_percentile, start_date, end_date,deployment_date, feature_str, hyperparams_str, hyperparams,local_data, dataset_start_date):
    validation_end_date = build_validation_dates_local(deployment_date)
    validation_dataset = pull_validation_data_local(validation_end_date,deployment_date)
    dataset = pull_training_data_local(end_date, dataset_start_date)
    target_value = dataset[target_label].quantile(target_percentile).round(3)
    tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr, fi_list, predictions, probabilities = train_model(features, dataset, validation_dataset, target_label, target_value, hyperparams)
    response = create_dynamo_record(tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr, model_name, deployment_date, dataset_name,hyperparams_str,feature_str, f"{target_value}+{target_label}", fi_list)
    validation_dataset['probabilities'] = probabilities[:,1]
    validation_dataset['predictions'] = predictions
    validation_dataset['target_value'] = target_value
    validation_dataset['target_pct'] = validation_dataset['target_value'] * validation_dataset['return_vol_10D']
    validation_csv = validation_dataset.to_csv()
    put_response = s3.put_object(Bucket="icarus-research-data", Key=f"backtesting_data/inv_alerts/{dataset_name}/{title}/{deployment_date.strftime('%Y-%m-%d')}.csv", Body=validation_csv)
    return response

def model_runner_temporal_simulation(features, target_label, target_percentile,dataset_start_date,end_date,deployment_date,hyperparams):
    validation_end_date = build_validation_dates_local(deployment_date)
    dataset = pull_training_data_local(end_date, dataset_start_date)
    validation_dataset = pull_validation_data_local(validation_end_date,deployment_date)
    dataset.dropna(subset=["close_diff_deviation3"],inplace=True)
    validation_dataset.dropna(subset=["close_diff_deviation3"],inplace=True)
    dataset.dropna(subset=["close_diff_deviation"],inplace=True)
    validation_dataset.dropna(subset=["close_diff_deviation"],inplace=True)
    target_value = dataset[target_label].quantile(target_percentile).round(3)
    tp, tp_scr, fp, fp_scr, tn, tn_scr, fn, fn_scr, fi_list, predictions, probabilities = train_model_TSSim(features, dataset, validation_dataset, target_label, target_value, hyperparams)
    return tp, fp, fn, tn 

def model_runner_data(start_date,end_date):
    dates = build_date_list(start_date, end_date)
    key_list = build_query_keys(dates)
    print(key_list[-1])
    train_data = create_training_data_local(key_list, 'full_alerts/weekly_exp_alerts/', 'inv-alerts', 'cdvol_gainers', start_date.strftime('%Y-%m-%d %H:%M:%S'),end_date.strftime('%Y-%m-%d %H:%M:%S'))
    val_data = create_validation_data_local(key_list, 'full_alerts/trading_symbols_alerts/', 'inv-alerts', 'cdvol_gainers', start_date.strftime('%Y-%m-%d %H:%M:%S'),end_date.strftime('%Y-%m-%d %H:%M:%S'))
    return "train_data", "val_data"

In [21]:
def build_evaluation_period(eval_start, eval_end):
    dates_list = []
    while eval_start <= eval_end:
        date_object = {
            "deployment_date": eval_start,
            "dataset_end": eval_start - timedelta(days=10),
            "dataset_start": datetime(2018,1,1)
        }
        dates_list.append(date_object)
        eval_start += timedelta(days=7)
    return dates_list

def pull_gmm_labels(date):
    date_str = date.strftime('%Y-%m-%d')
    data_20d = s3.get_object(Bucket="icarus-research-data", Key=f'regime_modeling_data/gmm20d_labels_RM2/{date_str}.csv')
    data_10d = s3.get_object(Bucket="icarus-research-data", Key=f'regime_modeling_data/gmm10d_labels_RM2/{date_str}.csv')
    data_3d = s3.get_object(Bucket="icarus-research-data", Key=f'regime_modeling_data/gmm3d_labels_RM2/{date_str}.csv')

    data_20d = pd.read_csv(data_20d.get("Body"))
    data_10d = pd.read_csv(data_10d.get("Body"))
    data_3d = pd.read_csv(data_3d.get("Body"))

    data_20d['dt'] = pd.to_datetime(data_20d['date'])
    data_10d['dt'] = pd.to_datetime(data_10d['date'])
    data_3d['dt'] = pd.to_datetime(data_3d['date'])

    label_20d = data_20d['labels'].iloc[-1]
    label_10d = data_10d['labels'].iloc[-1]
    label_3d = data_3d['labels'].iloc[-1]

    dates_20d = data_20d['dt'].loc[data_20d['labels'] == label_20d].tolist()
    dates_10d = data_10d['dt'].loc[data_10d['labels'] == label_10d].tolist()
    dates_3d = data_3d['dt'].loc[data_3d['labels'] == label_3d].tolist()

    combined_list = list(set(dates_20d  + dates_3d))
    return dates_20d

In [22]:
target_percentile = 0.5
title = 'CDHVC'
model_name = f'{title}:TSSIM1_HV2_custHypTP{target_percentile}'
dataset_name = f'TSSIM1_HV2_custHypTP{target_percentile}'
hyperparams = {'subsample': 0.6, 'num_round': 1000, 'min_child_weight': 10, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 2, 'colsample_bytree': 1}
features = ['return_vol_5D', 'volume_10DDiff', 'oneD_stddev50', 'rsi', 'volume_vol_16H', 'return_vol_8H', 'min_volume_vol_diff', 'roc3', 'daily_vol_diff30', 
            'return_vol_3D', 'daily_vol_diff_pct30', 'hour_volume_vol_diff_pct', 'SPY_5D', 'hour_vol_diff_pct', 'volume_vol_10D', 'daily_volume_vol_diff_pct30', 
            'range_vol5MA', 'volume_vol_60M', 'SPY_diff3', 'month', 'vol7', 'range_vol', 'hour_volume_vol_diff', 'roc_diff', 'range_vol_diff5', 'return_vol_120M', 
            'close_diff', 'daily_volume_vol_diff', 'SPY_3D', 'volume_vol_30M', 'vol14', 'daily_vol_diff_pct']

target_label = 'three_max_vol'


dataset_start_date = datetime(2018,1,1,tzinfo=pytz.timezone('US/Eastern')) 
dates_list = build_evaluation_period(datetime(2022,10,3), datetime(2023,12,23))

# date = dates_list[-1]
# print(date)
# response = model_runner_data(start_date=datetime(2018,1,1),end_date=datetime(2023,12,23))

# dataset = pull_training_data_local(start_date=datetime(2018,1,1),end_date=datetime(2023,12,23))
# print(dataset.columns.tolist())

for date in dates_list:
    print(date)
    response = model_runner_v2(model_name, dataset_name, title, features, target_label, target_percentile,"dataset_start_date",date['dataset_end'],date['deployment_date'], str(features),str(hyperparams), hyperparams, local_data=True, dataset_start_date=date['dataset_start'])
    


{'deployment_date': datetime.datetime(2022, 10, 3, 0, 0), 'dataset_end': datetime.datetime(2022, 9, 23, 0, 0), 'dataset_start': datetime.datetime(2018, 1, 1, 0, 0)}
label
0    60556
1    59964
Name: count, dtype: int64
110 202 218 70
{'model_name': 'CDHVC:TSSIM1_HV2_custHypTP0.5', 'deployment_date': '2022-10-03', 'algorithm_type': 'xgboost', 'dataset': 'TSSIM1_HV2_custHypTP0.5', 'TP': 110, 'TPpct': Decimal('0'), 'FP': 202, 'FPpct': Decimal('0'), 'TN': 218, 'TNpct': Decimal('0'), 'FN': 70, 'FNpct': Decimal('0'), 'precision_ratio': Decimal('0.5445544554455446'), 'evaluation_timeperiod': '2022-10-03_2022-10-07', 'live': False, 'hyperparameters': {"{'subsample': 0.6, 'num_round': 1000, 'min_child_weight': 10, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 2, 'colsample_bytree': 1}"}, 'features': {"['return_vol_5D', 'volume_10DDiff', 'oneD_stddev50', 'rsi', 'volume_vol_16H', 'return_vol_8H', 'min_volume_vol_diff', 'roc3', 'daily_vol_diff30', 'return_vol_3D', 'daily_vol_diff_pct30', 'hour_v

In [23]:
# def run_temporal_simulation(features, target_label, target_value, dataset_start_date, evaluation_start_date, evaluation_end_date):
#     tp_avg_list = []
#     gross_accuracy_list = []
#     # fn_list = []
#     # tn_list = []
    
#     hyperparams = {'subsample': 0.6, 'num_round': 1000, 'min_child_weight': 10, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 2, 'colsample_bytree': 1}
#     dates_list = build_evaluation_period(evaluation_start_date, evaluation_end_date)
#     for date in dates_list:
#         tp, fp, fn, tn = model_runner_temporal_simulation(features, target_label, target_value,dataset_start_date,date['dataset_end'],date['deployment_date'], hyperparams)
#         tp_avg_list.append(tp)
#         gross_accuracy_list.append((tp-fp))

#     return tp_avg_list, gross_accuracy_list


In [24]:
# number_of_simulations = 300
# now = datetime.now()
# title = 'CDBFC'
# # total_feature_list = ['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'v', 'o', 'c', 'h', 'l',
# #  'date', 'hour', 'symbol', 't', 'price7', 'price14', 'adjusted_volume', 'vol7', 'vol14', 'rsi', 
# # 'rsi3', 'rsi5', 'roc', 'roc3', 'roc5', 'threeD_returns_close', 'oneD_returns_close', 'range_vol', 
# # 'range_vol5MA', 'range_vol10MA', 'range_vol25MA', 'oneD_stddev50', 'threeD_stddev50', 'cmf', 'close_diff', 
# # 'close_diff3', 'close_diff5', 'v_diff_pct', 'adx', 'volume_10MA', 'volume_25MA', 'price_10MA', 'price_25MA',
# # 'volume_10DDiff', 'volume_25DDiff', 'price_10DDiff', 'price_25DDiff', 'one_max', 'one_min', 'one_pct', 
# # 'three_max', 'three_min', 'three_pct', 'SPY_diff', 'SPY_diff3', 'SPY_diff5', 'SPY_1D', 'SPY_3D', 'SPY_5D', 
# # 'vw', 'n', 'return_vol_15M', 'volume_vol_15M', 'return_vol_30M', 'volume_vol_30M', 'return_vol_60M', 
# # 'volume_vol_60M', 'return_vol_120M', 'volume_vol_120M', 'return_vol_240M', 'volume_vol_240M', 
# # 'return_vol_450M', 'volume_vol_450M', '15min_vol_diff', '15min_vol_diff_pct', 'min_vol_diff', 
# # 'min_vol_diff_pct', 'min_volume_vol_diff', 'min_volume_vol_diff_pct', 'return_vol_4H', 'return_vol_8H', 
# # 'return_vol_16H', 'volume_vol_4H', 'volume_vol_8H', 'volume_vol_16H', 'hour_vol_diff', 'hour_vol_diff_pct', 
# # 'hour_volume_vol_diff', 'hour_volume_vol_diff_pct', 'return_vol_3D', 'return_vol_5D', 'return_vol_10D', 
# # 'return_vol_30D', 'volume_vol_3D', 'volume_vol_5D', 'volume_vol_10D', 'volume_vol_30D', 'daily_vol_diff', 
# # 'daily_vol_diff_pct', 'daily_vol_diff30', 'daily_vol_diff_pct30', 'daily_volume_vol_diff', 'daily_volume_vol_diff_pct', 
# # 'daily_volume_vol_diff30', 'daily_volume_vol_diff_pct30', 'cd_vol', 'cd_vol3', 'roc_diff', 'range_vol_diff5', 
# # 'close_diff_deviation3', 'close_diff_deviation', 'day_of_week', 'day_of_month', 'month', 'year', 'dt', 'one_max_vol', 
# # 'three_max_vol', 'one_min_vol', 'three_min_vol']

# testing_features = ['hour', 'price7', 'price14','vol7', 'vol14', 'rsi', 
# 'rsi3', 'rsi5', 'roc', 'roc3', 'roc5', 'threeD_returns_close', 'oneD_returns_close', 'range_vol', 
# 'range_vol5MA', 'range_vol10MA', 'range_vol25MA', 'oneD_stddev50', 'threeD_stddev50', 'cmf', 'close_diff', 
# 'close_diff3', 'close_diff5','adx',
# 'volume_10DDiff', 'volume_25DDiff', 'price_10DDiff', 'price_25DDiff', 
# 'SPY_diff', 'SPY_diff3', 'SPY_diff5', 'SPY_1D', 'SPY_3D', 'SPY_5D', 
# 'return_vol_15M', 'volume_vol_15M', 'return_vol_30M', 'volume_vol_30M', 'return_vol_60M', 
# 'volume_vol_60M', 'return_vol_120M', 'volume_vol_120M', 'return_vol_240M', 'volume_vol_240M', 
# 'return_vol_450M', 'volume_vol_450M', '15min_vol_diff', '15min_vol_diff_pct', 'min_vol_diff', 
# 'min_vol_diff_pct', 'min_volume_vol_diff', 'min_volume_vol_diff_pct', 'return_vol_4H', 'return_vol_8H', 
# 'return_vol_16H', 'volume_vol_4H', 'volume_vol_8H', 'volume_vol_16H', 'hour_vol_diff', 'hour_vol_diff_pct', 
# 'hour_volume_vol_diff', 'hour_volume_vol_diff_pct', 'return_vol_3D', 'return_vol_5D', 'return_vol_10D', 
# 'return_vol_30D', 'volume_vol_3D', 'volume_vol_5D', 'volume_vol_10D', 'volume_vol_30D', 'daily_vol_diff', 
# 'daily_vol_diff_pct', 'daily_vol_diff30', 'daily_vol_diff_pct30', 'daily_volume_vol_diff', 'daily_volume_vol_diff_pct', 
# 'daily_volume_vol_diff30', 'daily_volume_vol_diff_pct30', 'roc_diff', 'range_vol_diff5', 
# 'close_diff_deviation3', 'close_diff_deviation', 'day_of_week', 'day_of_month', 'month']

# target_label = 'three_max_vol'
# target_percentile = 0.5
# results_array = []

# i = 0
# while i < number_of_simulations:
#        model_name = f"{title}_temporal_simulation_{i}"
#        print(model_name)
#        random_int = random.randint(15, 40)
#        features = random.sample(testing_features, random_int)
#        print(features)
#        tp_avg_list, gross_accuracy_list = run_temporal_simulation(features, target_label, target_percentile, dataset_start_date=datetime(2018,1,1), evaluation_start_date=datetime(2022,10,3), evaluation_end_date=datetime(2023,12,23))        
#        tp_avg = sum(tp_avg_list)/len(tp_avg_list)
#        gross_accuracy = sum(gross_accuracy_list)/len(gross_accuracy_list)
#        print(tp_avg, gross_accuracy)
#        print()
#        results_array.append({"model_name": model_name, "features": features, "tp_avg": tp_avg, "gross_accuracy": gross_accuracy,"num_features": random_int})
#        i += 1

# results_df = pd.DataFrame(results_array)
# results_df.to_csv(f'/Users/charlesmiller/Documents/temporal_simulation_results/{title}/{now.year}_{now.month}_{now.day}.csv', index=False)