In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
import random
import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
def rand_ticker(n_ticker , ticker , no_stock , seed):
    """Pick a reproducible random subset of tickers.

    Parameters
    ----------
    n_ticker : int
        Number of tickers to sample from; positions are drawn from
        ``range(n_ticker)``.
    ticker : array-like
        Container of ticker names. It is indexed with a list of integer
        positions, so it must support that (a NumPy array does).
    no_stock : int
        How many tickers to draw, without replacement.
    seed : hashable
        Seed for the random draw; the same seed gives the same subset.

    Returns
    -------
    The selected entries of ``ticker``, ordered by their position in
    ``ticker``.

    Raises
    ------
    ValueError
        Raised by ``sample`` when ``no_stock`` exceeds ``n_ticker``.
    """
    # A private generator yields the same draw as seeding the module-level
    # one, but leaves the process-wide random state untouched.
    rng = random.Random(seed)
    ind = sorted(rng.sample(range(n_ticker), no_stock))
    return ticker[ind]

def find_return(dat , SYM):
    """Stack the per-row returns of ``dat`` into one long two-column frame.

    Parameters
    ----------
    dat : pandas.DataFrame
        One row per symbol, one column per period. The first three columns
        of every row are skipped.
    SYM : list
        Symbol label for every stacked value; its length must equal
        ``dat.shape[0] * (dat.shape[1] - 3)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``SYM_ROOT`` (from ``SYM``) and ``RETURN`` (row 0's values
        from column 3 onward, then row 1's, and so on). The index repeats
        the column labels of ``dat``.
    """
    # One concat instead of growing a Series row by row: `Series.append`
    # no longer exists in current pandas and copied the data on every call.
    rows = [dat.iloc[i, 3:] for i in range(dat.shape[0])]
    part = pd.concat(rows) if rows else pd.Series(dtype=float)
    d = {"SYM_ROOT": SYM, "RETURN": part}
    return pd.DataFrame(data=d)


def assign_variable(dat1,dat2,name_list):
    """Fill ``dat2`` with three shifted copies of every row of ``dat1``.

    For row ``i`` and shift ``j`` in 0, 1, 2 the values in columns
    ``j`` up to (not including) ``dat1.shape[1] - 3 + j`` are written, with
    a fresh 0-based index, to the column ``name_list[3 * i + j]`` of
    ``dat2``.

    Parameters
    ----------
    dat1 : pandas.DataFrame
        Source frame; each row yields three output columns.
    dat2 : pandas.DataFrame
        Frame that receives the columns. It is modified in place.
    name_list : list of str
        Output column names, three consecutive names per row of ``dat1``.

    Returns
    -------
    pandas.DataFrame
        The same ``dat2`` object, after the columns have been assigned.

    Raises
    ------
    IndexError
        If ``name_list`` has fewer than ``3 * dat1.shape[0]`` entries.
    """
    width = dat1.shape[1] - 3
    for i in range(dat1.shape[0]):
        for j in range(3):
            # Wrapping the raw values gives the 0..width-1 index directly;
            # the former `Series.append(..., ignore_index=True)` is gone
            # from current pandas.
            part = pd.Series(dat1.iloc[i, j:(width + j)].values)
            dat2[name_list[3 * i + j]] = part
    return dat2

In [3]:
def preprocess(data , no_p = 389):
    """Build lagged-return predictors and return targets from quote rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``SYMBOL``, ``TIME`` (strings), ``BB`` and
        ``BO``. The frame is not modified.
    no_p : int, default 389
        Number of return columns kept after the first one is dropped. The
        original note says this truncation can go when running full data.

    Returns
    -------
    data_0 : pandas.DataFrame
        Predictor matrix with three columns per symbol, named
        ``"<symbol> RETURN k MINS BEFORE"`` for k = 3, 2, 1.
    returns : pandas.DataFrame
        Long frame produced by ``find_return`` (``SYM_ROOT`` / ``RETURN``).
    n_ticker : int
        Number of distinct symbols in ``data``.
    ticker : array
        The distinct symbols, as returned by ``data.SYMBOL.unique()``.
    """
    ticker = data.SYMBOL.unique()
    n_ticker = len(ticker)

    # Work on a copy of the needed columns: writing the shortened TIME back
    # into `data` would make a second call on the same frame cut it again.
    quotes = data[["SYMBOL", "TIME", "BB", "BO"]].copy()
    # Drop the last three characters of TIME so rows group by minute
    # (presumably a ":SS" suffix -- confirm against the input files).
    quotes["TIME"] = quotes["TIME"].str.slice(start = 0, stop = -3)

    # get the last price of each minute as price of each minute
    last_quote = quotes.groupby(["SYMBOL", "TIME"]).last()
    # mean of NBBO as stock price at t
    data_price = (last_quote["BB"].unstack() + last_quote["BO"].unstack()) / 2

    # Order the time labels by length first so that e.g. "9:31" precedes
    # "10:00". `reindex(columns=...)` replaces the removed `reindex_axis`.
    ordered = sorted(data_price.columns, key = lambda x: (len(x), x))
    data_price = data_price.reindex(columns = ordered)

    # Log return against the previous column.
    dat = np.log(data_price) - np.log(data_price.shift(1 , axis = 1))

    # The first column has no predecessor (the original note identifies it
    # as the 9:30 column), so it is dropped.
    dat = dat.iloc[:, 1:]

    #####-------can be deleted when running full data-------#####
    dat = dat.iloc[ : , : no_p]

    dat = dat.fillna(0)   # impute NULL with 0

    # One label per stacked return value, grouped by symbol.
    SYM = sorted(list(ticker) * (dat.shape[1] - 3))

    # form dataframe of returns with 1st column ticker name, 2nd column
    # minute return, all stocks in the same column
    returns = find_return(dat , SYM)

    name_list1 = ["RETURN 3 MINS BEFORE", "RETURN 2 MINS BEFORE", "RETURN 1 MINS BEFORE"]
    name_list2 = sorted(set(SYM))
    name_list = [sym + ' ' + lag for sym in name_list2 for lag in name_list1]

    data_variable = pd.DataFrame(columns = name_list)
    data_0 = assign_variable(dat , data_variable , name_list)

    return data_0 , returns , n_ticker , ticker



### First run CV on a 1.5-hour window to get the optimal λ, then apply that λ to the following 1.5 hours.

In [5]:
def long_time_cv(data, no_stock , seed , lamb_range = None, fold = 3):
    """Rolling Lasso forecasts with a penalty tuned on each 90-row window.

    For each sampled stock, four consecutive 90-row windows are processed.
    ``LassoCV`` selects a penalty on window ``j``; that penalty is then used
    for Lasso fits on a sliding 30-row window, each predicting the row that
    immediately follows it. The forecasts tied to window ``j`` cover the 90
    rows after that window (the last window runs to the end of the data).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input handed to ``preprocess``.
    no_stock : int
        Number of stocks to sample and model.
    seed : hashable
        Seed forwarded to ``rand_ticker``.
    lamb_range : array-like or None, default None
        Candidate penalties for ``LassoCV``. When None the estimator's own
        default grid is used.
    fold : int, default 3
        Passed to ``LassoCV`` as ``cv``.

    Returns
    -------
    tuple or str
        ``(pred, lamb, num_var, r2_adj)`` where ``pred`` and ``num_var``
        have shape ``(n_rows - 90, no_stock)``, ``lamb`` has shape
        ``(4, no_stock)`` and ``r2_adj`` has shape ``(no_stock,)``.
        If the predictor matrix contains a non-finite entry (NaN or
        +/-inf), the string ``'There is infinite number.'`` is returned
        instead; callers must check for it before unpacking.
    """
    window = 90       # rows per cross-validation window
    fit_len = 30      # rows in each rolling Lasso fit
    n_window = 4      # number of cross-validation windows
    offset = window - fit_len

    data_0 , returns , n_ticker , ticker = preprocess(data)

    # Plain float matrix: sliced many times below, and estimator inputs stay
    # the same type for fitting and predicting.
    X = np.asarray(data_0, dtype = float)
    if not np.isfinite(X).all():
        return 'There is infinite number.'

    smp = rand_ticker(n_ticker , ticker , no_stock , seed)

    nr = X.shape[0]
    n_pred = nr - window

    pred = np.empty([n_pred, no_stock])
    lamb = np.empty([n_window, no_stock])
    num_var = np.empty([n_pred, no_stock])
    r2_adj = np.empty(no_stock)

    # Leave `alphas` out entirely when no grid was supplied so the
    # estimator falls back to its default.
    cv_kwargs = {} if lamb_range is None else {"alphas": lamb_range}

    for i in range(no_stock):
        # Select the target by column name; the position of "RETURN" in the
        # frame is not guaranteed.
        mask = returns["SYM_ROOT"] == smp[i]
        y = np.asarray(returns.loc[mask, "RETURN"], dtype = float)

        for j in range(n_window):
            lo = window * j
            hi = lo + window
            fit = LassoCV(random_state = 0 , cv = fold , **cv_kwargs).fit(X[lo:hi] , y[lo:hi])
            lamb[j , i] = fit.alpha_

            # Rolling fits: rows l .. l+29 train, row l+30 is predicted and
            # stored at position l - 60. The last window runs until the
            # data is exhausted.
            start = lo + offset
            stop = nr - fit_len if j == n_window - 1 else hi + offset
            for l in range(start , stop):
                fit1 = Lasso(alpha = lamb[j , i]).fit(X[l : l + fit_len] , y[l : l + fit_len])
                test = X[l + fit_len].reshape(1 , -1)
                num_var[l - offset , i] = np.count_nonzero(fit1.coef_)
                # predict() returns a length-1 array; store its element.
                pred[l - offset , i] = fit1.predict(test)[0]

        # calculate lasso R^2: regress realised returns on the forecasts,
        # then adjust for the single regressor.
        true_return = y[window : ]
        forecast = pred[ : , i].reshape(-1 , 1)
        r2 = LinearRegression().fit(forecast , true_return).score(forecast , true_return)
        r2_adj[i] = 1 - (1 - r2) * (n_pred - 1) / (n_pred - 2)

    return pred , lamb , num_var , r2_adj

  

In [7]:
# Two-character day strings; a slice of this list is later passed to
# loop_over_time, which appends each entry to the 'nbbo_201412' file prefix.
date = '01 02 03 04 05 08 09 10 11 12 15 16 17 18 19 22 23 24 29 30 31'.split()



In [6]:
def loop_over_time(date , no_stock , seed , lamb_range = None , fold = 3):
    """Run ``long_time_cv`` on one CSV file per entry of ``date``.

    Parameters
    ----------
    date : sequence of str
        Suffixes inserted into the file name
        ``'nbbo_201412' + <entry> + '.csv'``, read from the working
        directory.
    no_stock : int
        Number of stocks sampled per file.
    seed : hashable
        Sampling seed, identical for every file.
    lamb_range : array-like or None, default None
        Forwarded to ``long_time_cv``.
    fold : int, default 3
        Forwarded to ``long_time_cv``.

    Returns
    -------
    lamb_0, num_var_0, r2_adj_0 : numpy.ndarray
        Each of shape ``(no_stock, len(date))``. Column ``i`` holds, per
        stock, the mean penalty, the mean number of non-zero coefficients
        and the adjusted R^2 obtained for ``date[i]``.

    Raises
    ------
    ValueError
        If ``long_time_cv`` reports a problem for a file by returning a
        message string instead of its result tuple.
    """
    n = len(date)
    lamb_0 = np.empty([no_stock , n])
    num_var_0 = np.empty([no_stock , n])
    r2_adj_0 = np.empty([no_stock , n])

    for i, day in enumerate(date):
        dat_name = 'nbbo_201412' + day + '.csv'
        data = pd.read_csv(dat_name)
        result = long_time_cv(data , no_stock , seed , lamb_range , fold)
        # long_time_cv signals bad input with a string; unpacking that
        # string would fail with an unrelated "too many values" message.
        if isinstance(result, str):
            raise ValueError(dat_name + ': ' + result)
        _ , lamb_tmp , num_var_tmp , r2_adj_tmp = result
        lamb_0[ : , i] = np.mean(lamb_tmp , axis = 0)
        num_var_0[ : , i] = np.mean(num_var_tmp , axis = 0)
        r2_adj_0[ : , i] = r2_adj_tmp

    return lamb_0 , num_var_0 , r2_adj_0

In [8]:
# Run the pipeline on the first seven entries of `date`,
# sampling 50 stocks with seed 5.
date0 = date[0:7]
lamb_1, num_var_1, r2_adj_1 = loop_over_time(date0, no_stock=50, seed=5)

In [9]:
# Grand mean of the adjusted R^2 over every stock and every file.
np.mean(r2_adj_1)

0.00041425128015926284