In [69]:
%matplotlib inline
import numpy as np
import pandas as pd
import csv
import math
from matplotlib import pyplot as plt
from scipy import stats
from scipy.optimize import minimize 
from sklearn.metrics import mean_squared_error

In [104]:
df = pd.read_csv('sushan_daily_updated.csv')
mask = df['symbol'].str.contains('BA')
data = df[mask]
data

Unnamed: 0.1,Unnamed: 0,DATE,SYM_ROOT,total_vol,symbol,CSize,CPrc,OPrc,total_vol_m,total_vol_b,total_vol_a,Ovn_Gap
0,0,2018/12/31,BA,3000834,BA,331313,322.50,320.50,3333107,17615.0,222859.0,
11,11,2019/01/02,BA,2615881,BA,344277,323.81,315.51,2960345,29263.0,303300.0,-6.99
12,12,2019/01/03,BA,4930361,BA,326533,310.90,318.53,5255368,31021.0,431490.0,-5.28
13,13,2019/01/04,BA,3674863,BA,463794,327.08,316.69,4139757,42039.0,288650.0,5.79
14,14,2019/01/07,BA,3064577,BA,359758,328.11,330.30,3422972,24279.0,583696.0,3.22
...,...,...,...,...,...,...,...,...,...,...,...,...
5804,5804,2021/12/27,BA,7271163,BA,253331,203.17,201.96,7523713,95126.0,208367.0,-2.26
5805,5805,2021/12/28,BA,8333741,BA,240289,206.13,202.23,8574671,62362.0,256928.0,-0.94
5806,5806,2021/12/29,BA,5645408,BA,240570,203.66,205.05,5886519,47561.0,188310.0,-1.08
5807,5807,2021/12/30,BA,6693702,BA,159322,202.71,203.00,6854595,49387.0,110237.0,-0.66


In [186]:
class ARMA_ALE:
    def __init__(self,dataset):
        '''
        Initializes the file and values indeed from the file

        Args:
            file: (csv str) stock dataset
        '''
        self.df = dataset
        self.log_20days_AM = None
        self.train = None
        self.test = None
        self.train_y_true = None
        self.test_y_true = None 
        self.test_y_hat = None 
        self.train_log_20days = None
        self.test_log_20days = None
        self.phi = None
        self.theta = None
        self.MSE = None 
        
    def cal_mean_and_true(self):
        self.df['log_20days_AM'] = \
        (np.log(self.df['total_vol'])).shift().rolling(20).mean().values
        self.df['y_true'] = \
        np.log(self.df['total_vol']).values - self.df['log_20days_AM']

    def split_data (self, split_amount):
        n = int(len(self.df) * split_amount)
        self.train = self.df[:n].iloc[20:]
        self.test = self.df[n:].iloc[20:]

    def estimate_y(self, y_true, phi, theta):
        n = len(y_true)
        y_hat = np.zeros(n)
        eps = np.zeros(n)
        for t in range(1,n):
            y_hat[t] = phi*y_true[t-1] + theta * eps[t-1]
            eps[t] = y_true[t] - y_hat[t]
        return y_hat

    def my_objective(self, param, y_true):
        phi = param[0]
        theta = param[1]
        y_hat = self.estimate_y(y_true, phi, theta)
        y_diff = y_hat - y_true
        ALE = np.sum((1.5 + 0.5*np.sign(y_diff)) * np.abs(y_diff))
        return ALE 

    def find_phi_theta(self):
        res = minimize(self.my_objective, np.array([0.5,0.5]), args=self.train_y_true)
        phi, theta = res.x
        self.phi = phi
        self.theta = theta
        self.test_y_hat = self.estimate_y(self.test_y_true, self.phi, self.theta)

    def mse_check(self):
        mse = mean_squared_error(self.test_y_true, self.test_y_hat)
        self.MSE = mse
        return mse

    def line_graph(self):
        self.train.index = pd.to_datetime(self.train['DATE'])
        self.train.drop('DATE',axis=1,inplace=True)
        self.test.index = pd.to_datetime(self.test['DATE'])
        self.test.drop('DATE',axis=1,inplace=True)
        
        log_total_vol_hat = self.y_hat + self.log_20days_AM
        train_vol = np.log(self.train['total_vol']).values
        test_vol = np.log(self.test['total_vol']).values

        plt.figure(figsize=(30,10))
        ax = train_vol.plot(grid = True, figsize = (30,10))
        plt.plot(test_vol, ax=ax, grid = True)
        plt.plot(test_vol.index, log_total_vol_hat, color='green', marker=',', linestyle='dashed')
        plt.legend(['train_log_vol','test_log_vol', 'predicted log vol'])
        plt.show()
    

In [187]:
hope = ARMA_ALE(dataset)

In [188]:
hope.split_data(0.7)

In [183]:
hope.cal_mean_and_true()

In [189]:
hope.df

Unnamed: 0.1,Unnamed: 0,DATE,SYM_ROOT,total_vol,symbol,CSize,CPrc,OPrc,total_vol_m,total_vol_b,total_vol_a,Ovn_Gap
30,30,2019/01/30,BA,11755060,BA,195467,387.72,387.40,11950792,548589.0,208628.0,22.49
31,31,2019/01/31,BA,5526672,BA,1093448,385.62,386.99,6618882,24129.0,235733.0,-0.73
32,32,2019/02/01,BA,4053990,BA,872249,387.43,387.00,4926113,42482.0,657392.0,1.38
33,33,2019/02/04,BA,3640190,BA,326206,397.00,388.50,3967066,7895.0,283361.0,1.07
34,34,2019/02/05,BA,6068814,BA,926707,410.18,400.75,6995875,96682.0,138570.0,3.75
...,...,...,...,...,...,...,...,...,...,...,...,...
5804,5804,2021/12/27,BA,7271163,BA,253331,203.17,201.96,7523713,95126.0,208367.0,-2.26
5805,5805,2021/12/28,BA,8333741,BA,240289,206.13,202.23,8574671,62362.0,256928.0,-0.94
5806,5806,2021/12/29,BA,5645408,BA,240570,203.66,205.05,5886519,47561.0,188310.0,-1.08
5807,5807,2021/12/30,BA,6693702,BA,159322,202.71,203.00,6854595,49387.0,110237.0,-0.66


In [174]:
hope.find_phi_theta()

0.5