In [1]:
from baseline.main_model import *
import pandas as pd
import numpy as np
import os
import copy  
import json
import pickle
from tqdm import tqdm

pd.options.display.float_format = '{:.4f}'.format

In [2]:
# Trading date evaluated by this notebook, written as a 'YYYY-MM-DD' string
# (it is parsed with the '%Y-%m-%d' format before use).
d = '2021-12-02'

# Testing 

In [3]:
def agg_intraday_daily(intraday_data, daily_data):
    '''Aggregate intraday and daily data for a specific symbol.

    Parameters
    ----------
    intraday_data : DataFrame
        Intraday bars. The code reads the columns 'date', 'symbol',
        'datetime', 'size' and 'sym_suffix'.
    daily_data : DataFrame
        Daily rows. The code reads the columns 'DATE', 'symbol',
        'total_vol_m' and 'overnight_gap'.

    Returns
    -------
    DataFrame
        Intraday rows between 09:31:00 and 15:59:00, indexed by timestamp,
        with the daily columns joined on, a 'time' column, and '%_vol'
        (bar size divided by the day's 'total_vol_m'). 'sym_suffix' is
        dropped.

    Notes
    -----
    Both inputs first go through ``eliminate_half_days`` (defined outside this
    notebook; presumably it removes shortened sessions -- TODO confirm).
    The 'daily_vol_pct' column is written onto the frame that helper returns.
    '''
    intraday_data, daily_data = eliminate_half_days(intraday_data, daily_data)
    # Percentile rank of each day's total volume within the supplied history.
    daily_data['daily_vol_pct'] = daily_data['total_vol_m'].rank(pct=True)

    daily_cols = ['DATE', 'symbol', 'total_vol_m', 'overnight_gap', 'daily_vol_pct']
    df = intraday_data.merge(daily_data[daily_cols], how='left',
                             left_on=['date', 'symbol'],
                             right_on=['DATE', 'symbol'])
    df.index = pd.DatetimeIndex(df['datetime'])
    df['time'] = df.index.time
    df = df.drop(columns=['DATE', 'datetime'])
    # Copy the time-window slice so the column assignment below writes to an
    # independent frame instead of a view (avoids SettingWithCopyWarning).
    df = df.between_time('09:31:00', '15:59:00').copy()
    df['%_vol'] = df['size'] / df['total_vol_m']
    # Keyword form: the positional axis argument (`drop(label, 1)`) is
    # rejected by pandas >= 2.0.
    df = df.drop(columns=['sym_suffix'])
    return df

# Whole Russell

In [4]:
# Directory holding the Russell input files.
path = '/Users/Mason/Desktop/Volatility/russell'

# Full path of every entry found in that directory, in os.listdir order.
data_paths = []
for entry_name in os.listdir(path):
    data_paths.append(os.path.join(path, entry_name))

In [5]:
r_1000 = '/Users/Mason/Desktop/Volatility/russell/Russell_1000.txt'
r_2000 = '/Users/Mason/Desktop/Volatility/russell/Russell_2000.txt'
r_sample = '/Users/Mason/Desktop/Volatility/russell/Russell_Sample.txt'

# These .txt files sit in the same directory as the data files, so exclude
# them from the list of files to process. Filtering (instead of list.remove)
# keeps this cell idempotent: it can be re-run, or run when one of the files
# is absent, without raising ValueError.
index_list_files = {r_1000, r_2000, r_sample}
data_paths = [p for p in data_paths if p not in index_list_files]

In [6]:
def prep_data(csv_file, date):
    """Load one CSV and return its rows for `date` between 09:45 and 15:55.

    Parameters
    ----------
    csv_file : str
        Path to a CSV that has at least 'date' and 'time' columns.
    date : datetime.date or str
        Day to keep; ``str(date)`` must be parseable as a timestamp.

    Returns
    -------
    DataFrame
        The file's original columns for the requested day, indexed by a
        DatetimeIndex named 'datetime' (date + time of day), with 'time'
        rewritten as an 'HH:MM:SS' string and rows limited to
        09:45:00-15:55:00 inclusive. The 'date' column keeps its raw value.
    """
    raw = pd.read_csv(csv_file)

    # Select the day on the *parsed* dates, so the filter does not depend on
    # the file's textual date format matching str(date) exactly.
    parsed_date = pd.to_datetime(raw['date'])
    on_date = parsed_date == pd.Timestamp(str(date))

    # Explicit copy: the assignments below must not write into a slice of `raw`.
    sample = raw.loc[on_date].copy()

    time_of_day = pd.to_timedelta(sample['time'])
    sample.index = pd.DatetimeIndex(parsed_date[on_date] + time_of_day,
                                    name='datetime')
    # Normalise the time column to 'HH:MM:SS' text from the combined timestamp.
    sample['time'] = sample.index.strftime('%H:%M:%S')
    return sample.between_time('9:45:00', '15:55:00')

In [7]:
# Per-ticker results: estimated intraday values and the realised 'time'/'size' rows.
russell_est = {}
russell_real = {}
# File path -> repr of the exception that made that file unusable. Failures
# are recorded instead of silently discarded so skipped inputs can be inspected.
russell_failed = {}

# Loop-invariant inputs, computed once. A malformed `d` now fails immediately
# instead of being swallowed once per file.
date = datetime.strptime(d, '%Y-%m-%d').date()
daily_file = '/Users/Mason/Desktop/Volatility/russell_daily.csv'

for f in tqdm(data_paths[:]):
    try:
        df_sample = prep_data(f, date)
        intraday_file = f
        # The first symbol root found in the file is used as its ticker.
        ticker = df_sample.sym_root.unique()[0]

        daily_data, intraday_data, overnight_gap = clean_data(intraday_file, daily_file, ticker, date)
        historical = agg_intraday_daily(intraday_data, daily_data)
        estimated_daily = DailyModel(daily_data)
        coef = regress_volume(historical)
        estimated_intraday = predict_intraday(coef, historical, estimated_daily, overnight_gap)
        estimated_intraday = estimated_intraday.sort_index()
        # Drop the first 14 and last 4 estimates; presumably this aligns the
        # series with the 09:45-15:55 window kept by prep_data -- TODO confirm.
        trimmed = estimated_intraday[14:-4]
        estimation = pd.DataFrame({'time': trimmed.index, 'est': trimmed.values})

        russell_est[ticker] = estimation
        russell_real[ticker] = df_sample[['time', 'size']]

    except Exception as exc:
        # Best-effort batch: keep going, but remember why this file was skipped.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the run.
        russell_failed[f] = repr(exc)



100%|██████████| 112/112 [03:45<00:00,  2.02s/it]


In [8]:
# Display the dict of per-ticker estimate frames ('time' / 'est' columns).
russell_est

{'SR':          time       est
 0    09:45:00  604.0000
 1    09:46:00  577.0000
 2    09:47:00  591.0000
 3    09:48:00  589.0000
 4    09:49:00  568.0000
 ..        ...       ...
 366  15:51:00 3149.0000
 367  15:52:00 3471.0000
 368  15:53:00 3800.0000
 369  15:54:00 4322.0000
 370  15:55:00 4827.0000
 
 [371 rows x 2 columns],
 'ELS':          time       est
 0    09:45:00 1158.0000
 1    09:46:00 1183.0000
 2    09:47:00 1244.0000
 3    09:48:00 1230.0000
 4    09:49:00 1219.0000
 ..        ...       ...
 366  15:51:00 5070.0000
 367  15:52:00 5486.0000
 368  15:53:00 5847.0000
 369  15:54:00 6561.0000
 370  15:55:00 7351.0000
 
 [371 rows x 2 columns],
 'MOV':          time       est
 0    09:45:00  452.0000
 1    09:46:00  510.0000
 2    09:47:00  525.0000
 3    09:48:00  521.0000
 4    09:49:00  564.0000
 ..        ...       ...
 366  15:51:00 1766.0000
 367  15:52:00 1901.0000
 368  15:53:00 1924.0000
 369  15:54:00 2085.0000
 370  15:55:00 2397.0000
 
 [371 rows x 2 columns],

## Save in Pickle

In [9]:

# Write the estimates dict to a binary pickle file. The context manager
# closes the handle even if pickling raises.
with open("russell_baseline.pkl", "wb") as f:
    pickle.dump(russell_est, f)

In [10]:
# Write the realised-size dict to a binary pickle file. The context manager
# closes the handle even if pickling raises.
with open("russell_size.pkl", "wb") as f:
    pickle.dump(russell_real, f)

In [11]:
# Read the saved estimates back. The context manager closes the handle,
# which the bare open() call never did.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files from a trusted source.
with open("russell_baseline.pkl", "rb") as file_to_read:
    loaded_dictionary = pickle.load(file_to_read)

## Metrics Evaluation

In [12]:
def find_metrics(data):
    """Return the mean of |residual| / size over the rows of `data`.

    `data` must provide 'residual' and 'size' columns; the result is the sum
    of the per-row ratios divided by the number of rows.
    """
    residual_magnitude = np.abs(data['residual'].values)
    relative_error = residual_magnitude / data['size'].values
    row_count = len(relative_error)
    return np.sum(relative_error) / row_count

In [13]:
# Ticker -> frame joining estimates with realised sizes, plus the residual.
error_dict = {}
for symbol in tqdm(russell_est.keys()):
    # Nothing to compare for tickers whose estimate frame is empty.
    if len(russell_est[symbol]) == 0:
        continue

    # Cast both 'time' columns to str (stored back on the dict's frames);
    # presumably so the join keys share one type.
    est_frame = russell_est[symbol]
    est_frame['time'] = est_frame['time'].astype(str)
    real_frame = russell_real[symbol]
    real_frame['time'] = real_frame['time'].astype(str)

    # Keep only the times present on both sides.
    merged = est_frame.merge(real_frame, how='inner', on='time')
    merged['residual'] = merged['size'] - merged['est']
    error_dict[symbol] = merged

100%|██████████| 107/107 [00:00<00:00, 413.75it/s]


In [14]:
# Write the residual frames to a binary pickle file. The context manager
# closes the handle even if pickling raises.
with open("russell_residual.pkl", "wb") as f:
    pickle.dump(error_dict, f)

In [15]:
# Display the per-ticker frames of estimate, realised size and residual.
error_dict

{'SR':          time       est  size   residual
 0    09:45:00  604.0000   662    58.0000
 1    09:47:00  591.0000   172  -419.0000
 2    09:50:00  570.0000     2  -568.0000
 3    09:51:00  599.0000  1549   950.0000
 4    09:52:00  583.0000   205  -378.0000
 ..        ...       ...   ...        ...
 344  15:51:00 3149.0000   967 -2182.0000
 345  15:52:00 3471.0000  1326 -2145.0000
 346  15:53:00 3800.0000  1119 -2681.0000
 347  15:54:00 4322.0000  1349 -2973.0000
 348  15:55:00 4827.0000  4906    79.0000
 
 [349 rows x 4 columns],
 'ELS':          time       est   size   residual
 0    09:45:00 1158.0000   3483  2325.0000
 1    09:46:00 1183.0000   1844   661.0000
 2    09:47:00 1244.0000    672  -572.0000
 3    09:48:00 1230.0000    843  -387.0000
 4    09:49:00 1219.0000    683  -536.0000
 ..        ...       ...    ...        ...
 363  15:51:00 5070.0000   7340  2270.0000
 364  15:52:00 5486.0000   4201 -1285.0000
 365  15:53:00 5847.0000   4405 -1442.0000
 366  15:54:00 6561.0000  

In [16]:
# Ticker -> value returned by find_metrics for that ticker's residual frame.
metrics_dict = {
    ticker_key: find_metrics(residual_frame)
    for ticker_key, residual_frame in error_dict.items()
}

In [17]:
# Write the per-ticker metrics to a binary pickle file. The context manager
# closes the handle even if pickling raises.
with open("russell_base_metrics.pkl", "wb") as f:
    pickle.dump(metrics_dict, f)

In [18]:
# Display the per-ticker metric (mean of |residual| / size).
metrics_dict

{'SR': 48.43563019167653,
 'ELS': 12.424035096943573,
 'MOV': 43.70130952443656,
 'MCRB': 3.705964327980516,
 'LEN': 0.7283131762666618,
 'TTI': 115.31502919109761,
 'TRMB': 4.7916755890592055,
 'OLN': 1.326183100632041,
 'CPRX': 25.781625323546788,
 'FDBC': 1.5727045123421939,
 'PCG': 2.676201068698076,
 'EVBG': 9.95125647988092,
 'LEA': 36.89102291416913,
 'FLXS': 11.445510550028937,
 'BUSE': 43.21545081247352,
 'G': 19.31332895699935,
 'BWFG': 3.061953702885228,
 'NBHC': 26.78470807421852,
 'XBIT': 21.932653356278088,
 'GCO': 23.884849284743492,
 'STNG': 8.95559940517685,
 'SFL': 89.26806703392158,
 'MCD': 0.7360970825723462,
 'GME': 1.4732837047843752,
 'VICI': 1.4827018217781038,
 'NATH': 2.0208805210036003,
 'HEES': 36.90832009626383,
 'APPF': 44.42090360385745,
 'CARA': 45.422004424805735,
 'SILK': 34.15450647532133,
 'RNR': 36.21491800945645,
 'PEG': 4.345201918885878,
 'PEP': 0.8139305308558885,
 'ESE': 21.5170922126468,
 'WWD': 10.360322441170196,
 'GPK': 13.177507256954408,
