In [5]:
from baseline.main_model import *

In [8]:
def baseline(daily_data):
    '''
    Fit the two-parameter (phi, theta) baseline model and forecast the test split.

    Make sure the date you want is within the testing set.

    Input:
        daily_data (DataFrame): daily volume data accepted by prep_data

    Output:
        test (DataFrame): the test split with two added columns,
            'y_hat' (fitted log-volume deviation) and
            'total_vol_hat' (rounded volume forecast)
    '''
    # A later cell in this notebook redefines my_objective and estimate_y with
    # a three-parameter signature (phi, phi_2, theta). Take the two-parameter
    # versions from the module explicitly so this function does not depend on
    # which cells have been executed.
    # NOTE(review): assumes baseline.main_model exposes the two-parameter
    # my_objective / estimate_y that the star import provided -- confirm.
    from baseline import main_model as baseline_model

    modified_data = prep_data(daily_data)
    train, test = split_dataset(modified_data, 0.7)
    # Work on an independent copy: split_dataset may hand back a slice, and
    # adding columns to a slice triggers pandas' chained-assignment warning.
    test = test.copy()

    res = minimize(
        baseline_model.my_objective,
        np.array([0.5, 0.5], dtype=float),
        args=(train['y_true'].values,),
    )
    phi, theta = res.x
    print(phi, theta)

    test['y_hat'] = baseline_model.estimate_y(test['y_true'].values, phi, theta, False)
    # NOTE(review): prep_data defines y_true against the *previous* day's
    # moving average, while this adds back the same-day 'log_20days_AM'.
    # Left unchanged so reported numbers stay comparable -- confirm intent.
    test['total_vol_hat'] = np.exp(test['y_hat'] + test['log_20days_AM']).round()

    return test

In [10]:
def Model1(daily_data):
    '''
    Fit the three-parameter (phi, phi_2, theta) indicator model and forecast
    the test split.

    Make sure the date you want is within the testing set.

    Input:
        daily_data (DataFrame): daily volume data accepted by prep_data

    Output:
        test (DataFrame): the test split with two added columns,
            'y_hat' (fitted log-volume deviation) and
            'total_vol_hat' (rounded volume forecast)
    '''
    modified_data = prep_data(daily_data)
    train, test = split_dataset(modified_data, 0.7)
    # Work on an independent copy: split_dataset may hand back a slice, and
    # adding columns to a slice triggers pandas' chained-assignment warning.
    test = test.copy()

    # Parameters are fitted on the training split only; the starting point is
    # (phi, phi_2, theta) = (0.5, 0, -0.5).
    res = minimize(
        my_objective,
        np.array([0.5, 0, -0.5], dtype=float),
        args=(train['y_true'].values,),
    )
    phi, phi_2, theta = res.x
    print(phi, phi_2, theta)

    test['y_hat'] = estimate_y(test['y_true'].values, phi, phi_2, theta, False)
    # NOTE(review): prep_data defines y_true against the *previous* day's
    # moving average, while this adds back the same-day 'log_20days_AM'.
    # Left unchanged so reported numbers stay comparable -- confirm intent.
    test['total_vol_hat'] = np.exp(test['y_hat'] + test['log_20days_AM']).round()

    return test

def prep_data(daily_data):
    '''
    Dataset preparation

    Builds the modelling columns from the daily volume and drops the warm-up
    rows. The input frame is left untouched; a new frame is returned.

    Input:
        daily_data (DataFrame): must contain the columns 'DATE' and
            'total_vol_m'

    Output:
        DataFrame indexed by 'DATE', without the first 20 rows, with the
        added columns
            'log_20days_AM'   : 20-row rolling mean of log(total_vol_m)
            'y_true'          : log(total_vol_m) minus the previous row's
                                'log_20days_AM'
            'log_total_vol_m' : log(total_vol_m)
    '''
    log_vol = np.log(daily_data['total_vol_m'])
    log_ma = log_vol.rolling(20).mean()

    # assign() returns a new frame, so the caller's daily_data is not mutated
    # and no column is ever written into a slice.
    data = daily_data.assign(
        log_20days_AM=log_ma,
        y_true=log_vol - log_ma.shift(),
        log_total_vol_m=log_vol,
    )

    # The rolling mean is first defined on row 19, so its one-row lag (and
    # therefore y_true) is first defined on row 20.
    return data.iloc[20:].set_index('DATE')
def estimate_y(y_true, phi, phi_2, theta, predict):
    '''
    Calculate y_hat based on known y_true

    One-step recursion, for position t:
        y_hat[t] = phi * y_true[t-1] * (1 - I)
                   + theta * eps[t-1]
                   + phi_2 * I * y_true[t-2]
        eps[t]   = y_true[t] - y_hat[t]
    where I = determine_indicator(y_true, t) is 0 or 1. y_hat[0] and eps[0]
    are left at 0, and a lag that falls before the start of the sample is
    likewise treated as 0.

    Input:
        y_true (array): log(Vt) - 20-day moving average
        phi (float): weight of the lag-1 term, used when the indicator is 0
            (original author's note: universal standard is about 0.7)
        phi_2 (float): weight of the lag-2 term, used when the indicator is 1
        theta (float): weight of the previous residual
            (original author's note: universal standard is about -0.3)
        predict (bool): if True return only the forecast for the position
            right after the last observation; otherwise return the in-sample
            fitted values

    Outputs:
        y_hat (array): estimated y_true, same length as y_true
        (or a single float when predict is True)
    '''
    n = len(y_true)
    y_hat = np.zeros(n)
    eps = np.zeros(n)

    def one_step(t):
        # Forecast for position t using observations and residuals before t.
        I = determine_indicator(y_true, t)
        # Guard the lag-2 read: at t == 1 the index t - 2 is -1, which would
        # silently wrap around to the LAST observation of the sample.
        lag_2 = y_true[t - 2] if t >= 2 else 0.0
        return phi * y_true[t - 1] * (1 - I) + theta * eps[t - 1] + phi_2 * I * lag_2

    for t in range(1, n):
        y_hat[t] = one_step(t)
        eps[t] = y_true[t] - y_hat[t]

    if predict:
        # Use the same recursion as in-sample (including the indicator and
        # phi_2) so the out-of-sample forecast agrees with the fitted model.
        return one_step(n)
    return y_hat

def determine_indicator(y_true, t, lower_pct=2.5, upper_pct=97.5):
    '''
    Flag whether the observation just before position t is extreme.

    The percentile rank of y_true[t-1] is computed within the whole of
    y_true; the flag is set when that rank lies outside the
    [lower_pct, upper_pct] band.

    Input:
        y_true (array): series the percentile rank is computed against
        t (int): current position; the value inspected is y_true[t-1]
        lower_pct (float): ranks strictly below this are flagged (default 2.5)
        upper_pct (float): ranks strictly above this are flagged (default 97.5)

    Outputs:
        1 if y_true[t-1] is in either tail, otherwise 0
    '''
    vol_pct = stats.percentileofscore(y_true, y_true[t - 1])
    return 1 if (vol_pct > upper_pct or vol_pct < lower_pct) else 0
    
def my_objective(param, y_true):
    '''
    Loss handed to the optimizer.

    Runs estimate_y with the candidate parameters and returns the sum of
    absolute relative errors, sum(|(y_hat - y_true) / y_true|).

    Input:
        param (float array): candidate values, read as
            param[0] = phi, param[1] = phi_2, param[2] = theta
        y_true (array): log(Vt) - 20-day moving average

    Outputs:
        float, the summed absolute relative error
    '''
    fitted = estimate_y(y_true, param[0], param[1], param[2], False)
    # NOTE(review): any y_true entry equal to 0 makes this ratio inf/nan.
    relative_error = (fitted - y_true) / y_true
    return np.sum(np.abs(relative_error))

In [6]:
import os

# Directory holding the input CSVs. Set the VOLUME_DATA_DIR environment
# variable to run this notebook on another machine; the default is the
# original author's local folder.
DATA_DIR = os.environ.get(
    'VOLUME_DATA_DIR',
    '/Users/fandimeng/OneDrive - The University of Chicago/Academics/MPCS/2021-2022/Data Science Clinic/data',
)
intraday_file = f'{DATA_DIR}/fandi_intraday_data.csv'
daily_file = f'{DATA_DIR}/fandi_daily.csv'
ticker = 'GOOGL'  # symbol passed to clean_data and shown in the plot title
date = '2021-12-01'  # 'YYYY-MM-DD' string; parsed into a date in the next cell

In [7]:
# Parse the configured date only while it is still a string: after the first
# run `date` is already a date object, and strptime would raise on a re-run.
if isinstance(date, str):
    date = datetime.strptime(date, '%Y-%m-%d').date()
daily_data, intraday_data, overnight_gap = clean_data(intraday_file, daily_file, ticker, date)

In [9]:
# Fit the two-parameter baseline (prints the fitted phi and theta) and keep its
# test-set forecasts for the comparison with Model1.
baseline_test = baseline(daily_data)

0.6548617299791252 -0.2976735657585193


In [11]:
# Fit the indicator model (prints the fitted phi, phi_2 and theta) and display
# the resulting test-set frame.
test = Model1(daily_data)
test

-0.22239943976123533 -0.2765956250176297 0.227381999958004


Unnamed: 0_level_0,SYM_ROOT,total_vol,symbol,CSize,CPrc,OSize,OPrc,total_vol_m,total_vol_b,total_vol_a,20d_sd,price_diff,overnight_gap,log_20days_AM,y_true,log_total_vol_m,y_hat,total_vol_hat
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-05-05,GOOG,1157080.0,GOOGL,74435.0,2314.77,20615.0,2328.63,1231411.0,16323.0,84178.0,41.280377,21.80,0.528096,14.173293,-0.145005,14.023671,0.000000,1430155.0
2021-05-06,GOOG,1060605.0,GOOGL,155018.0,2337.35,17846.0,2306.33,1215405.0,7125.0,37094.0,40.912026,8.44,0.206296,14.167595,-0.162705,14.010588,0.032249,1468634.0
2021-05-07,GOOG,1264917.0,GOOGL,88066.0,2351.93,20364.0,2363.89,1353050.0,25087.0,66826.0,41.933822,26.54,0.632902,14.172973,-0.049723,14.117872,-0.008144,1418101.0
2021-05-10,GOOG,1309041.0,GOOGL,148461.0,2291.75,21456.0,2328.14,1456709.0,13838.0,75334.0,39.720561,23.79,0.598934,14.183722,0.018717,14.191690,0.001604,1447468.0
2021-05-11,GOOG,1553209.0,GOOGL,81306.0,2270.06,38388.0,2243.07,1633892.0,35091.0,40514.0,38.799931,48.68,1.254641,14.198854,0.122753,14.306475,-0.000271,1466783.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-23,GOOG,1087920.0,GOOGL,116431.0,2915.64,22012.0,2923.06,1203842.0,30443.0,127370.0,33.108858,2.98,0.090006,14.119974,-0.146399,14.001029,0.000164,1356119.0
2021-11-24,GOOG,772383.0,GOOGL,99112.0,2922.40,18555.0,2909.47,871286.0,6986.0,53531.0,33.179605,6.17,0.185958,14.043243,-0.442248,13.677726,-0.000767,1254787.0
2021-11-26,GOOG,1491864.0,GOOGL,117326.0,2843.66,27409.0,2886.97,1427042.0,36422.0,65999.0,39.937322,35.43,0.887140,14.034050,0.127871,14.171114,-0.002029,1241736.0
2021-11-29,GOOG,1336664.0,GOOGL,198455.0,2910.61,30180.0,2880.00,1535900.0,26169.0,74907.0,40.299327,36.34,0.901752,14.023415,0.210577,14.244627,0.001099,1232449.0


In [39]:
def visualize(test, baseline_test, ticker):
    '''
    Plot both models' daily volume forecasts against the actual volume.

    Input:
        test (DataFrame): indicator-model results; needs the columns
            'total_vol_hat' and 'total_vol_m'
        baseline_test (DataFrame): baseline results; needs 'total_vol_hat'
        ticker (str): symbol shown in the figure title

    Output:
        None; the figure is displayed with fig.show()
    '''
    # Imported here because the notebook's plotly import cell sits below the
    # cell that calls this function, so a top-to-bottom run would otherwise
    # hit a NameError on `go`.
    import plotly.graph_objects as go

    # (x values, y values, legend label) for each line, in drawing order.
    lines = [
        (test.index, test['total_vol_hat'], 'ARMA with Indicator'),
        (baseline_test.index, baseline_test['total_vol_hat'], 'Baseline'),
        (test.index, test['total_vol_m'], 'Actual'),
    ]

    fig = go.Figure()
    for x_values, y_values, label in lines:
        fig.add_trace(go.Scatter(x=x_values, y=y_values, mode='lines', name=label))

    fig.update_layout(
        title=f"ARMA Prediction for {ticker} Daily Volume",
        title_x=0.5,
        xaxis_title='Date',
        yaxis_title='Total Daily Volume',
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()

In [40]:
# Draw the indicator-model and baseline forecasts against the actual volume.
visualize(test, baseline_test, ticker)

In [14]:
import plotly.express as px
import plotly.graph_objects as go

In [18]:
# Inspect the dates covered by the test split.
test.index

DatetimeIndex(['2021-05-05', '2021-05-06', '2021-05-07', '2021-05-10',
               '2021-05-11', '2021-05-12', '2021-05-13', '2021-05-14',
               '2021-05-17', '2021-05-18',
               ...
               '2021-11-16', '2021-11-17', '2021-11-18', '2021-11-19',
               '2021-11-22', '2021-11-23', '2021-11-24', '2021-11-26',
               '2021-11-29', '2021-11-30'],
              dtype='datetime64[ns]', name='DATE', length=146, freq=None)

In [42]:
# Absolute error of each forecast as a fraction of the actual volume.
test['error'] = (test['total_vol_m'] - test['total_vol_hat']).abs() / test['total_vol_m']
baseline_test['error'] = (
    (baseline_test['total_vol_m'] - baseline_test['total_vol_hat']).abs()
    / baseline_test['total_vol_m']
)

In [43]:
# Mean relative error: indicator model first, baseline second.
print(*(frame['error'].mean() for frame in (test, baseline_test)))

0.23279757880771038 0.20990122317702614
