# LASSO application in High-Frequency Trading

## Part I. Original Version

Here, we use 30 min estimation window to do CV to select the optimal lambda, and trial LASSO, AR(3) and combined strategy.

In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
import random
import time

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
def rand_ticker(n_ticker , ticker , no_stock , seed):
    random.seed( seed )
    ind = random.sample(range(n_ticker), no_stock)
    ind.sort()
        
    smp = ticker[ind]
    return smp


In [7]:
def find_return(dat , SYM):
    part = pd.Series()
    for i in range(dat.shape[0]):
        part = part.append(pd.Series(dat.iloc[i , 3 : dat.shape[1]]))
    d = {"SYM_ROOT":SYM, "RETURN":part}
    data = pd.DataFrame(data = d)
    return data

In [8]:
def assign_variable(dat1,dat2,name_list):
    k = 0
    for i in range(dat1.shape[0]):
        for j in range(3):
            part = pd.Series()
            part = part.append(pd.Series(dat1.iloc[i , j:(dat1.shape[1] - 3 + j)]),ignore_index = True)
            dat2[name_list[k]] = part
            k += 1
    return dat2


In [9]:
def preprocess(data , no_p = 389):

    ticker = data.SYMBOL.unique()
    n_ticker = len(ticker)
    
    data_tmp = data['TIME']
    data_tmp = data_tmp.str.slice(start = 0,stop = -3)
    data["TIME"] = data_tmp
    
    data_c = data.groupby(["SYMBOL","TIME"]).last()        # get the last price of each minute as price of each minute
    data_BB = data_c.BB
    data_BO = data_c.BO
    data_BB = data_BB.unstack()
    data_BO = data_BO.unstack()
    data_price = (data_BB + data_BO) / 2    # mean of NBBO as stock price at t
    
    data_price = data_price.reindex_axis(sorted(data_price.columns, key = lambda x: (len(x),x)), axis = 1)

    dat = np.log(data_price) - np.log(data_price.shift(1 , axis = 1)) 

    dat = dat.iloc[:,1:]              # remove the column of 9:30 
    
    
    #####-------can be deleted when running full data-------#####
    dat = dat.iloc[ : , : no_p]

    dat = dat.fillna(0)   # impute NULL with 0
    
    SYM = list(ticker) * (dat.shape[1] - 3)
    SYM.sort()


    returns = find_return(dat , SYM)        # form dataframe of returns with 1st column ticker name, 2nd column minute return, all stocks in the same column
    
    name_list1 = ["RETURN 3 MINS BEFORE", "RETURN 2 MINS BEFORE", "RETURN 1 MINS BEFORE"]
    name_list2 = sorted(list(set(SYM)))

    name_list = []
    for i in name_list2:
        for j in name_list1:
            name_list.append(i + ' ' + j)
    

    data_variable = pd.DataFrame(columns = name_list)
    
    data_0 = assign_variable(dat,data_variable,name_list)


    return data_0 , dat, returns , n_ticker , ticker 


In [10]:
# use first no_p data_0, can be deleted when running full data
# no_stock: number of stocks to be predicted
# data: original nbbo_20141201 data
# fold: cv folds
# python3.7 use version = 1, python 3.6 , use version = 0 
def lasso_stock(data, fold, no_stock ,seed, lamb_range = None, no_p = 389 ,version = 0):
    
    data_0 , dat, returns , n_ticker , ticker = preprocess(data)

    
    nr = data_0.shape[0]      # 386 if the data is whole day 9:34 - 15:59
    n_pred = nr - 30     # no. of predictions per stock

    
    if np.isfinite(data_0).sum().sum() != data_0.shape[0] * data_0.shape[1]:
        return 'There is infinite number.'
    else:
        
        smp = rand_ticker(n_ticker , ticker , no_stock , seed)
        
        # for LASSO
        pred = np.empty([n_pred, no_stock])
        lamb = np.empty([n_pred, no_stock])
        num_var = np.empty([n_pred, no_stock])
        
        r2_adj = np.empty(no_stock) 
        
        
        # for AR(3)
        pred_ar = np.empty([n_pred, no_stock])
        r2_adj_ar = np.empty(no_stock) 
        
        
        # for LASSO & AR(3)
        r2_adj_comb = np.empty(no_stock) 
                   
        
        # for portfolio weight calculation
        sd_lasso = np.empty([n_pred, no_stock])
        
        
        for i in range(no_stock):
            
            y = returns[returns.SYM_ROOT == smp[i]].iloc[ : , version]  # when run in school computer, use 0; use macbook, use 1 here
            
            # for fit AR(3)
            dat_use = dat.loc[smp[i]]
            dat_ar = pd.DataFrame(columns = ['3min','2min','1min','return'])
            dat_ar['3min'] = np.array(dat_use.iloc[ : (no_p - 3)])
            dat_ar['2min'] = np.array(dat_use.iloc[1 : (no_p - 2)])
            dat_ar['1min'] = np.array(dat_use.iloc[2 : (no_p - 1)])
            dat_ar['return'] = np.array(dat_use.iloc[3 : no_p]) 
            


            for j in range(n_pred):

                train_X = data_0.iloc[j : (j + 30) , : ]
                test_X = data_0.iloc[j + 30 , : ].values.reshape(1 , -1)  # reshape(1 , -1) when test data has only one observation

                train_y = y[j : (j + 30)]

                # fit lassoCV
                fit = LassoCV(random_state = 0 , cv = fold , alphas = lamb_range).fit(train_X , train_y)
                
                
                pred[j , i] = fit.predict(X = test_X)
                lamb[j , i] = fit.alpha_
                num_var[j , i] = sum(fit.coef_ != 0)
                
                # in sample standard deviation of lasso
                sd_lasso[j , i] = np.std(train_y)
                
                # fit AR(3)
                train_ar = dat_ar.iloc[j : (j + 30) , :3 ]
                
                ##---version difference: add values. for mac----###
                test_ar = dat_ar.iloc[j + 30 , : 3].values.reshape(1 , -1)   # reshape since only one observation for test
                train_ar_y = dat_ar.iloc[j : (j + 30) , 3 ]
                
                fit_ar = LinearRegression().fit(train_ar , train_ar_y)
                pred_ar[j , i] = fit_ar.predict(X = test_ar)
                
                
            # calculate lasso R^2
            true_return = y[30 : ]
            lr_lasso = LinearRegression().fit(pred[ : , i].reshape(-1 , 1) , true_return)
            r2 = lr_lasso.score(pred[ : , i].reshape(-1 , 1) , true_return)
            r2_adj[i] = 1 - (1 - r2) * (n_pred - 1) / (n_pred - 2)
            
            
            # calculate AR3 R^2
            lr_ar = LinearRegression().fit(pred_ar[ : , i].reshape(-1 , 1) , dat_ar['return'][30:])
            r2_ar = lr_ar.score(pred_ar[ : , i].reshape(-1 , 1) , dat_ar['return'][30:])
            r2_adj_ar[i] = 1 - (1 - r2_ar) * (n_pred - 1) / (n_pred - 2)
            
            
            # calculate multiple R^2
            ar_lasso = pd.DataFrame(columns = ['lasso_pred' ,'ar_pred'])
            ar_lasso['lasso_pred'] = np.array(pred[:,i])
            ar_lasso['ar_pred'] = np.array(pred_ar[:,i])
            lr_lasso_ar = LinearRegression().fit(ar_lasso , true_return)
            r2_lasso_ar = lr_lasso_ar.score(ar_lasso , true_return)
            r2_adj_comb[i] = 1 - (1 - r2_lasso_ar) * (n_pred - 1) / (n_pred - 2)
            
        return lamb , num_var , r2_adj , r2_adj_ar , r2_adj_comb , sd_lasso , smp


In [11]:
date = ['01' , '02' ,'03' ,'04' ,'05' ,'08' ,'09' ,'10' ,'11' ,'12' ,'15' ,'16' ,'17' ,'18' ,'19' ,'22' ,'23' ,'24' ,'29' , '30' , '31']



In [12]:
def loop_cv(date , no_stock , seed , lamb_range = None, fold = 3):
    n = len(date)
    lamb_0 = np.empty([no_stock , n])
    num_var_0 = np.empty([no_stock , n])
    r2_adj_0 = np.empty([no_stock , n])
    r2_adj_ar0 = np.empty([no_stock , n])
    r2_adj_comb0 = np.empty([no_stock , n])
    
    for i in range(len(date)):
        dat_name = 'nbbo_201412' + date[i] + '.csv'
        data = pd.read_csv(dat_name)
        lamb_tmp , num_var_tmp , r2_adj_tmp , r2_adj_ar_tmp , r2_adj_comb_tmp , sd_lasso0 , smp = lasso_stock(data, fold , no_stock , seed, lamb_range)
        
        lamb_0[ : , i] = np.mean(lamb_tmp , axis = 0)
        num_var_0[ : , i] = np.mean(num_var_tmp , axis = 0)
        r2_adj_0[ : , i] = r2_adj_tmp
        r2_adj_ar0[ : , i] =  r2_adj_ar_tmp
        r2_adj_comb0[ : , i] = r2_adj_comb_tmp
        
        sd_lasso0 = pd.DataFrame(sd_lasso0)
        save_sd_name = 'sd_ls_201412' + date[i] + '.csv'
        sd_lasso0.to_csv(save_sd_name)
        
    return lamb_0 , num_var_0 ,  r2_adj_0 , r2_adj_ar0 , r2_adj_comb0
        
        
        

Below are the same codes but just changing the date and variable names. We can get the same result by simply using date0 = date[:7] and just run the first section, but since data for each single day takes 3-4 hours to run, I run it on the computer in school's library, I cannot ensure no interruption or no leaving. So I seperate it and run it one by one.

In [34]:
date0 = date[:1]
lamb_1 , num_var_1 , r2_adj_1 , r2_adj_ar1 , r2_adj_comb1 = loop_cv(date0 , 50 , 5)

In [38]:
lamb_1 = pd.DataFrame(lamb_1)
lamb_1.to_csv('lambda1.csv')
num_var_1 = pd.DataFrame(num_var_1)
num_var_1.to_csv('num_var1.csv')
r2_adj_1 = pd.DataFrame(r2_adj_1)
r2_adj_1.to_csv('r2_adj1.csv')
r2_adj_ar1 = pd.DataFrame(r2_adj_ar1)
r2_adj_ar1.to_csv('r2_adj_ar1.csv')
r2_adj_comb1 = pd.DataFrame(r2_adj_comb1)
r2_adj_comb1.to_csv('r2_adj_comb1.csv')

In [40]:
print("np.mean(lamb_1)", np.mean(lamb_1))
print("np.mean(num_var_1)", np.mean(num_var_1))
print("np.mean(r2_adj_1)", np.mean(r2_adj_1))
print("np.mean(r2_adj_ar1)", np.mean(r2_adj_ar1))
print("np.mean(r2_adj_comb1)", np.mean(r2_adj_comb1))


np.mean(lamb_1) 0    2.679628e-07
dtype: float64
np.mean(num_var_1) 0    3.898989
dtype: float64
np.mean(r2_adj_1) 0    0.001837
dtype: float64
np.mean(r2_adj_ar1) 0    0.002655
dtype: float64
np.mean(r2_adj_comb1) 0    0.006788
dtype: float64


In [None]:
np.mean(lamb_1)

In [11]:
start = time.time()
date1 = date[1:2]
lamb_2 , num_var_2 , r2_adj_2 , r2_adj_ar2 , r2_adj_comb2 = loop_cv(date1 , 50 , 5)
end = time.time()

In [12]:
lamb_2 = pd.DataFrame(lamb_2)
lamb_2.to_csv('lambda2.csv')
num_var_2 = pd.DataFrame(num_var_2)
num_var_2.to_csv('num_var2.csv')
r2_adj_2 = pd.DataFrame(r2_adj_2)
r2_adj_2.to_csv('r2_adj2.csv')
r2_adj_ar2 = pd.DataFrame(r2_adj_ar2)
r2_adj_ar2.to_csv('r2_adj_ar2.csv')
r2_adj_comb2 = pd.DataFrame(r2_adj_comb2)
r2_adj_comb2.to_csv('r2_adj_comb2.csv')

In [15]:
print("np.mean(lamb_2)", np.mean(lamb_2))
print("np.mean(num_var_2)", np.mean(num_var_2))
print("np.mean(r2_adj_2)", np.mean(r2_adj_2))
print("np.mean(r2_adj_ar2)", np.mean(r2_adj_ar2))
print("np.mean(r2_adj_comb2)", np.mean(r2_adj_comb2))
print('Time for 50 stocks per day: ', (end - start)/3600)

np.mean(lamb_2) 0    2.427635e-07
dtype: float64
np.mean(num_var_2) 0    3.385899
dtype: float64
np.mean(r2_adj_2) 0    0.000878
dtype: float64
np.mean(r2_adj_ar2) 0    0.000941
dtype: float64
np.mean(r2_adj_comb2) 0    0.00495
dtype: float64
Time for 50 stocks per day:  4.035016413066122


In [16]:
start1 = time.time()
date2 = date[2:3]
lamb_3 , num_var_3 , r2_adj_3 , r2_adj_ar3 , r2_adj_comb3 = loop_cv(date2 , 50 , 5)
end1 = time.time()

In [17]:
lamb_3 = pd.DataFrame(lamb_3)
lamb_3.to_csv('lambda3.csv')
num_var_3 = pd.DataFrame(num_var_3)
num_var_3.to_csv('num_var3.csv')
r2_adj_3 = pd.DataFrame(r2_adj_3)
r2_adj_3.to_csv('r2_adj3.csv')
r2_adj_ar3 = pd.DataFrame(r2_adj_ar3)
r2_adj_ar3.to_csv('r2_adj_ar3.csv')
r2_adj_comb3 = pd.DataFrame(r2_adj_comb3)
r2_adj_comb3.to_csv('r2_adj_comb3.csv')

In [18]:
print("np.mean(lamb_3)", np.mean(lamb_3))
print("np.mean(num_var_3)", np.mean(num_var_3))
print("np.mean(r2_adj_3)", np.mean(r2_adj_3))
print("np.mean(r2_adj_ar2)", np.mean(r2_adj_ar3))
print("np.mean(r2_adj_comb3)", np.mean(r2_adj_comb3))
print('Time for 50 stocks per day: ', (end1 - start1)/3600)

np.mean(lamb_3) 0    2.630656e-07
dtype: float64
np.mean(num_var_3) 0    3.467921
dtype: float64
np.mean(r2_adj_3) 0    0.0006
dtype: float64
np.mean(r2_adj_ar2) 0    0.002724
dtype: float64
np.mean(r2_adj_comb3) 0    0.005992
dtype: float64
Time for 50 stocks per day:  4.289340320958032


In [10]:
start2 = time.time()
date3 = date[3:4]
lamb_4 , num_var_4 , r2_adj_4 , r2_adj_ar4 , r2_adj_comb4 = loop_cv(date3 , 50 , 5)
end2 = time.time()

In [11]:
lamb_4 = pd.DataFrame(lamb_4)
lamb_4.to_csv('lambda4.csv')
num_var_4 = pd.DataFrame(num_var_4)
num_var_4.to_csv('num_var4.csv')
r2_adj_4 = pd.DataFrame(r2_adj_4)
r2_adj_4.to_csv('r2_adj4.csv')
r2_adj_ar4 = pd.DataFrame(r2_adj_ar4)
r2_adj_ar4.to_csv('r2_adj_ar4.csv')
r2_adj_comb4 = pd.DataFrame(r2_adj_comb4)
r2_adj_comb4.to_csv('r2_adj_comb4.csv')

In [13]:
print("np.mean(lamb_4)", np.mean(lamb_4))
print("np.mean(num_var_4)", np.mean(num_var_4))
print("np.mean(r2_adj_4)", np.mean(r2_adj_4))
print("np.mean(r2_adj_ar4)", np.mean(r2_adj_ar4))
print("np.mean(r2_adj_comb4)", np.mean(r2_adj_comb4))
print('Time for 50 stocks per day: ', (end2 - start2)/3600)

np.mean(lamb_4) 0    2.152720e-07
dtype: float64
np.mean(num_var_4) 0    3.446854
dtype: float64
np.mean(r2_adj_4) 0    0.000246
dtype: float64
np.mean(r2_adj_ar4) 0    0.002992
dtype: float64
np.mean(r2_adj_comb4) 0    0.005948
dtype: float64
Time for 50 stocks per day:  2.9872699840863546


In [15]:
start3 = time.time()
date4 = date[4:5]
lamb_5 , num_var_5 , r2_adj_5 , r2_adj_ar5 , r2_adj_comb5 = loop_cv(date4 , 50 , 5)
end3 = time.time()

In [16]:
lamb_5 = pd.DataFrame(lamb_5)
lamb_5.to_csv('lambda5.csv')
num_var_5 = pd.DataFrame(num_var_5)
num_var_5.to_csv('num_var5.csv')
r2_adj_5 = pd.DataFrame(r2_adj_5)
r2_adj_5.to_csv('r2_adj5.csv')
r2_adj_ar5 = pd.DataFrame(r2_adj_ar5)
r2_adj_ar5.to_csv('r2_adj_ar5.csv')
r2_adj_comb5 = pd.DataFrame(r2_adj_comb5)
r2_adj_comb5.to_csv('r2_adj_comb5.csv')

In [17]:
print("np.mean(lamb_5)", np.mean(lamb_5))
print("np.mean(num_var_5)", np.mean(num_var_5))
print("np.mean(r2_adj_5)", np.mean(r2_adj_5))
print("np.mean(r2_adj_ar5)", np.mean(r2_adj_ar5))
print("np.mean(r2_adj_comb5)", np.mean(r2_adj_comb5))
print('Time for 50 stocks per day: ', (end3 - start3)/3600)

np.mean(lamb_5) 0    2.084580e-07
dtype: float64
np.mean(num_var_5) 0    3.596404
dtype: float64
np.mean(r2_adj_5) 0    0.001408
dtype: float64
np.mean(r2_adj_ar5) 0    0.002765
dtype: float64
np.mean(r2_adj_comb5) 0    0.007355
dtype: float64
Time for 50 stocks per day:  3.6316642542680104


In [10]:
start4 = time.time()
date5 = date[5:6]
lamb_6 , num_var_6 , r2_adj_6 , r2_adj_ar6 , r2_adj_comb6 = loop_cv(date5 , 50 , 5)
end4 = time.time()

In [12]:
lamb_6 = pd.DataFrame(lamb_6)
lamb_6.to_csv('lambda6.csv')
num_var_6 = pd.DataFrame(num_var_6)
num_var_6.to_csv('num_var6.csv')
r2_adj_6 = pd.DataFrame(r2_adj_6)
r2_adj_6.to_csv('r2_adj6.csv')
r2_adj_ar6 = pd.DataFrame(r2_adj_ar6)
r2_adj_ar6.to_csv('r2_adj_ar6.csv')
r2_adj_comb6 = pd.DataFrame(r2_adj_comb6)
r2_adj_comb6.to_csv('r2_adj_comb6.csv')

In [13]:
print("np.mean(lamb_6)", np.mean(lamb_6))
print("np.mean(num_var_6)", np.mean(num_var_6))
print("np.mean(r2_adj_6)", np.mean(r2_adj_6))
print("np.mean(r2_adj_ar6)", np.mean(r2_adj_ar6))
print("np.mean(r2_adj_comb6)", np.mean(r2_adj_comb6))
print('Time for 50 stocks per day: ', (end4 - start4)/3600)

np.mean(lamb_6) 0    2.536778e-07
dtype: float64
np.mean(num_var_6) 0    3.595787
dtype: float64
np.mean(r2_adj_6) 0    0.001447
dtype: float64
np.mean(r2_adj_ar6) 0    0.001582
dtype: float64
np.mean(r2_adj_comb6) 0    0.005757
dtype: float64
Time for 50 stocks per day:  2.98414255950186


### Final results for 6 trading days are here.

In [13]:

print('Mean of lambda is:', np.mean([2.679628e-07, 2.427635e-07, 2.630656e-07 , 2.152720e-07 , 2.084580e-07 , 2.536778e-07 ]))
print('Average number of variables is:', np.mean([3.898989 , 3.385899 , 3.467921 , 3.446854 , 3.596404 , 3.595787]))

R2_lasso = np.mean([0.001837 , 0.000878 , 0.0006 , 0.000246 , 0.001408 , 0.001447 ])
R2_ar3 =  np.mean([0.002655 , 0.000941 , 0.002724, 0.002992 , 0.002765 , 0.001582 ])
R2_comb =  np.mean([0.006788 , 0.00495 , 0.005992 , 0.005948 , 0.007355 , 0.005757 ])
R2_partial = R2_comb - R2_ar3
                   
print('Mean of R2 for LASSO is: ', R2_lasso )
print('Mean of R2 for AR(3) is: ',R2_ar3)
print('Mean of R2 for LASSO + AR(3) is: ', R2_comb)
print('Additional variation explained by LASSO Compared to AR(3) is: ', R2_partial)





Mean of lambda is: 2.4186661666666664e-07
Average number of variables is: 3.5653090000000005
Mean of R2 for LASSO is:  0.0010693333333333334
Mean of R2 for AR(3) is:  0.0022765
Mean of R2 for LASSO + AR(3) is:  0.006131666666666666
Additional variation explained by LASSO Compared to AR(3) is:  0.0038551666666666665


## Part II. Fix lambda for 1.5 hours

Instead of modifying lambda every minute, we try to do CV on 1.5 hour estimation window and apply the optimal lambda to the next 1.5 hour

In [None]:
def preprocess(data , no_p = 389):

    ticker = data.SYMBOL.unique()
    n_ticker = len(ticker)
    
    data_tmp = data['TIME']
    data_tmp = data_tmp.str.slice(start = 0,stop = -3)
    data["TIME"] = data_tmp
    
    data_c = data.groupby(["SYMBOL","TIME"]).last()        # get the last price of each minute as price of each minute
    data_BB = data_c.BB
    data_BO = data_c.BO
    data_BB = data_BB.unstack()
    data_BO = data_BO.unstack()
    data_price = (data_BB + data_BO) / 2    # mean of NBBO as stock price at t
    
    data_price = data_price.reindex_axis(sorted(data_price.columns, key = lambda x: (len(x),x)), axis = 1)

    dat = np.log(data_price) - np.log(data_price.shift(1 , axis = 1)) 

    dat = dat.iloc[:,1:]              # remove the column of 9:30 
    
    
    #####-------can be deleted when running full data-------#####
    dat = dat.iloc[ : , : no_p]

    dat = dat.fillna(0)   # impute NULL with 0
    
    SYM = list(ticker) * (dat.shape[1] - 3)
    SYM.sort()


    returns = find_return(dat , SYM)        # form dataframe of returns with 1st column ticker name, 2nd column minute return, all stocks in the same column
    
    name_list1 = ["RETURN 3 MINS BEFORE", "RETURN 2 MINS BEFORE", "RETURN 1 MINS BEFORE"]
    name_list2 = sorted(list(set(SYM)))

    name_list = []
    for i in name_list2:
        for j in name_list1:
            name_list.append(i + ' ' + j)
    

    data_variable = pd.DataFrame(columns = name_list)
    
    data_0 = assign_variable(dat,data_variable,name_list)


    return data_0 , returns , n_ticker , ticker 



In [None]:
def long_time_cv(data, no_stock , seed , lamb_range = None, fold = 3):
    
    data_0 , returns , n_ticker , ticker = preprocess(data)

    if np.isfinite(data_0).sum().sum() != data_0.shape[0] * data_0.shape[1]:
        return 'There is infinite number.'
    else:

        smp = rand_ticker(n_ticker , ticker , no_stock , seed)
        

        nr = data_0.shape[0]  # 386
        n_pred = nr - 90     # n_pred = 296


        pred = np.empty([n_pred, no_stock])
        lamb = np.empty([4, no_stock])
        num_var = np.empty([n_pred, no_stock])
        
        r2_adj = np.empty(no_stock) 
        

        for i in range(no_stock):

            y = returns[returns.SYM_ROOT == smp[i]].iloc[ : , 0] 


            for j in range(4):
                X_train = data_0.iloc[90 * j : (90 * (j + 1)) , :]
                y_train = y[90 * j : (j + 1) * 90 ]
                fit = LassoCV(random_state = 0 , cv = fold , alphas = lamb_range).fit(X_train , y_train)
                lamb[j , i] = fit.alpha_
                      
                if j in range(0 , 3):
                      
                    for l in range(90 * j + 60 , 90 * j + 150):
                        train = data_0.iloc[l : (l + 30) , : ]
                        train_y = y[l : (l + 30) ]
                        test = data_0.iloc[l + 30 , :].values.reshape(1 , -1)

                        fit1 = Lasso(alpha = lamb[j, i]).fit(train , train_y)
                        num_var[l - 90 * j - 60 , i] = sum(fit1.coef_ != 0)
                        pred[l - 90 * j - 60 , i] = fit1.predict(test)


                if j == 3:
                    for l in range(90 * j + 60 , nr - 30):
                        train = data_0.iloc[l : (l + 30) , : ]
                        train_y = y[l : (l + 30) ]
                        test = data_0.iloc[l + 30 , :].values.reshape(1 , -1)

                        fit1 = Lasso(alpha = lamb[j, i]).fit(train , train_y)
                        pred[l - 90 * j - 60 , i] = fit1.predict(test)
                        num_var[l - 90 * j - 60 , i] = sum(fit1.coef_ != 0)

            # calculate lasso R^2
            true_return = y[90 : ]
            lr_lasso = LinearRegression().fit(pred[ : , i].reshape(-1 , 1) , true_return)
            r2 = lr_lasso.score(pred[ : , i].reshape(-1 , 1) , true_return)
            r2_adj[i] = 1 - (1 - r2) * (n_pred - 1) / (n_pred - 2)
        
        return pred , lamb , num_var , r2_adj 

  

In [None]:
def loop_over_time(date , no_stock , seed , lamb_range = None , fold = 3):
    n = len(date)
    lamb_0 = np.empty([no_stock , n])
    num_var_0 = np.empty([no_stock , n])
    r2_adj_0 = np.empty([no_stock , n])
    
    for i in range(len(date)):
        dat_name = 'nbbo_201412' + date[i] + '.csv'
        data = pd.read_csv(dat_name)
        pred_tmp , lamb_tmp , num_var_tmp , r2_adj_tmp  = long_time_cv(data , no_stock , seed , lamb_range , fold)
        lamb_0[ : , i] = np.mean(lamb_tmp , axis = 0)
        num_var_0[ : , i] = np.mean(num_var_tmp , axis = 0)
        r2_adj_0[ : , i] = r2_adj_tmp
        
    return lamb_0 , num_var_0 , r2_adj_0

In [None]:
date0 = date[:5]
lamb_1 , num_var_1 , r2_adj_1 = loop_over_time(date0 , 150 , 5)

In [None]:
print('By fixing lambda for 1.5 hours, mean of R2 is ', np.mean(np.mean(r2_adj_1)))

# Part III. Long-lived predictors

Use iShare market ETF as market return, Russell 1000 as size return and Russell 2000 as value return 

In [24]:
def preprocess_returns(data , no_p = 389):

    ticker = data.SYMBOL.unique()
    n_ticker = len(ticker)
    
    data_tmp = data['TIME']
    data_tmp = data_tmp.str.slice(start = 0,stop = -3)
    data["TIME"] = data_tmp
    
    data_c = data.groupby(["SYMBOL","TIME"]).last()        # get the last price of each minute as price of each minute
    data_BB = data_c.BB
    data_BO = data_c.BO
    data_BB = data_BB.unstack()
    data_BO = data_BO.unstack()
    data_price = (data_BB + data_BO) / 2    # mean of NBBO as stock price at t
    
    data_price = data_price.reindex_axis(sorted(data_price.columns, key = lambda x: (len(x),x)), axis = 1)

    dat = np.log(data_price) - np.log(data_price.shift(1 , axis = 1)) 

    dat = dat.iloc[:,1:]              # remove the column of 9:30 
    
    
    #####-------can be deleted when running full data-------#####
    dat = dat.iloc[ : , : no_p]

    dat = dat.fillna(0)   # impute NULL with 0
    
    SYM = list(ticker) * (dat.shape[1] - 3)
    SYM.sort()


    returns = find_return(dat , SYM)        # form dataframe of returns with 1st column ticker name, 2nd column minute return, all stocks in the same column

    return returns, n_ticker , ticker


In [25]:
def preprocess_X(data , no_p = 389):

    ticker = data.SYMBOL.unique()
    n_ticker = len(ticker)
    
    data_tmp = data['TIME']
    data_tmp = data_tmp.str.slice(start = 0,stop = -3)
    data["TIME"] = data_tmp
    
    data_c = data.groupby(["SYMBOL","TIME"]).last()        # get the last price of each minute as price of each minute
    data_BB = data_c.BB
    data_BO = data_c.BO
    data_BB = data_BB.unstack()
    data_BO = data_BO.unstack()
    data_price = (data_BB + data_BO) / 2    # mean of NBBO as stock price at t
    
    data_price = data_price.reindex_axis(sorted(data_price.columns, key = lambda x: (len(x),x)), axis = 1)

    dat = np.log(data_price) - np.log(data_price.shift(1 , axis = 1)) 

    dat = dat.iloc[:,1:]              # remove the column of 9:30 
    
    
    #####-------can be deleted when running full data-------#####
    dat = dat.iloc[ : , : no_p]

    dat = dat.fillna(0)   # impute NULL with 0
    
    SYM = list(ticker) * (dat.shape[1] - 3)
    SYM.sort()

    name_list1 = ["RETURN 3 MINS BEFORE", "RETURN 2 MINS BEFORE", "RETURN 1 MINS BEFORE"]
    name_list2 = sorted(list(set(SYM)))

    name_list = []
    for i in name_list2:
        for j in name_list1:
            name_list.append(i + ' ' + j)
    

    data_variable = pd.DataFrame(columns = name_list)
    
    data_0 = assign_variable(dat,data_variable,name_list)


    return data_0 , dat


In [26]:
# data1 for NYSE listed stocks
# data2 for long-lived 3 predictors

def long_lived(data1, data2, no_stock ,seed, version = 0):
    
    returns , n_ticker, ticker = preprocess_returns(data1)
    data_0 , dat0 = preprocess_X(data2)
    
    nr = data_0.shape[0]      # 386 if the data is whole day 9:34 - 15:59
    n_pred = nr - 30     # no. of predictions per stock

    
    if np.isfinite(data_0).sum().sum() != data_0.shape[0] * data_0.shape[1]:
        return 'There is infinite number.'
    else:
        
        smp = rand_ticker(n_ticker , ticker , no_stock , seed)
        
        pred = np.empty([n_pred, no_stock])

        r2_adj = np.empty(no_stock) 
        
        for i in range(no_stock):
            
            # when run in school computer (python3.6), use 0; use macbook (python 3.7), use 1 here
            y = returns[returns.SYM_ROOT == smp[i]].iloc[ : , version]  

            for j in range(n_pred):

                train_X = data_0.iloc[j : (j + 30) , : ]
                test_X = data_0.iloc[j + 30 , : ].values.reshape(1 , -1)  # reshape(1 , -1) when test data has only one observation

                train_y = y[j : (j + 30)]

                # fit simple linear regression
                fit =  LinearRegression().fit(train_X , train_y)
                              
                pred[j , i] = fit.predict(X = test_X)      
                
            # calculate R^2
            true_return = y[30 : ]
            lr = LinearRegression().fit(pred[ : , i].reshape(-1 , 1) , true_return)
            r2 = lr.score(pred[ : , i].reshape(-1 , 1) , true_return)
            r2_adj[i] = 1 - (1 - r2) * (n_pred - 1) / (n_pred - 2)            
                        
        return  r2_adj 

In [29]:
def loop_market(date , no_stock , seed , lamb_range = None, fold = 3):
    n = len(date)
    r2_adj_0 = np.empty([no_stock , n])
    for i in range(len(date)):
        dat_name1 = 'nbbo_201412' + date[i] + '.csv'
        data1 = pd.read_csv(dat_name1)
        dat_name2 = 'market_201412' + date[i] + '.csv'
        data2 = pd.read_csv(dat_name2)
        
        r2_adj_tmp = long_lived(data1, data2, no_stock , seed , version = 1)
        r2_adj_0[ : , i] = r2_adj_tmp
        
    return r2_adj_0 
   

In [31]:
start_0 = time.time()
date_0 = date[:3]
r2_adj_market = loop_market(date_0 , 50 , 5)
end_0 = time.time()

In [32]:
print("Mean of R2 by using long lived predictors are ", np.mean(r2_adj_market))


Mean of R2 by using long lived predictors are  0.0014358302920113915
