## Imports and Functions

We begin by loading the necessary packages/modules and define functions to calculate the log price ratio for our regression and to view dataframes.

In [1]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from IPython.display import display, HTML

def calculate_log_price_ratio(last_trade_prices):
    # calc log(vi+1/vi)
    return np.log(last_trade_prices / last_trade_prices.shift(1))

def display_scrollable(df):
    """
    Displays a pandas DataFrame as a scrollable HTML table in a Jupyter Notebook.

    Args:
    df (pandas.DataFrame): The DataFrame to display.
    """
    df_html = df.to_html()
    scrollable_table = f'''
    <div style="overflow-x: auto; white-space: nowrap; max-height:400px; overflow-y:scroll;">
        {df_html}
    </div>
    '''
    display(HTML(scrollable_table))

## Initialization

Initialize a list to hold regression results. We then import and sort [`tick_data.csv`](<..\data\processed\tick_data.csv>) by time after converting 'Timestamp' to a datatime object and adjusting 'Volume' based on tick direction in 'Tick'.

In [2]:
results = []

df = pd.read_csv(os.path.join('..', 'data', 'processed', 'tick_data.csv'))

# convert 'Timestamp' to a datetime object and sort
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df.sort_values(by=['RIC', 'Timestamp'], inplace=True)

# adjust 'Volume' based on 'Tick' direction
df['Adjusted Volume'] = df.apply(lambda row: row['Volume'] if row['Tick'] == 'UP' or row['Tick'] == 'n' else -row['Volume'], axis=1)

# display first 20 rows of sorted dataframe
display_scrollable(df.head(20))

Unnamed: 0,RIC,Timestamp,Tick,Last Trade,Volume,Adjusted Volume
11349,ABBN.S,2023-12-21 09:00:00.007,n,37.13,77694.0,77694.0
11348,ABBN.S,2023-12-21 09:00:00.017,DOWN,37.12,78523.0,-78523.0
11347,ABBN.S,2023-12-21 09:00:00.035,UP,37.15,78523.0,78523.0
11342,ABBN.S,2023-12-21 09:00:00.079,UP,37.16,78993.0,78993.0
11340,ABBN.S,2023-12-21 09:00:00.084,DOWN,37.15,78523.0,-78523.0
11337,ABBN.S,2023-12-21 09:00:00.173,UP,37.16,78859.0,78859.0
11324,ABBN.S,2023-12-21 09:00:00.485,UP,37.16,76079.0,76079.0
11319,ABBN.S,2023-12-21 09:00:00.613,UP,37.16,76083.0,76083.0
11310,ABBN.S,2023-12-21 09:00:00.975,DOWN,37.15,74600.0,-74600.0
11306,ABBN.S,2023-12-21 09:00:01.144,DOWN,37.12,74591.0,-74591.0


## Run the regression

We perform a nested loop through each equity and trading day to obtain daily liquidity estimates. We then append the results into a dataframe and save it to the [`\liq_param`](<..\models\liq_param>) file directory.

In [4]:
for ric, ric_group in df.groupby('RIC'):
    for date, group in ric_group.groupby(ric_group['Timestamp'].dt.date):
        # check if the group is empty
        if group.empty:
            continue

        # sort again just in case
        group.sort_values('Timestamp', inplace=True)

        # calc yi using 'Last Trade' column
        group['log_price_ratio'] = calculate_log_price_ratio(group['Last Trade'])

        # calculate predictors
        group['ti_diff'] = group['Timestamp'].diff().dt.total_seconds()
        group['wi'] = (group['Adjusted Volume'].diff() / group['ti_diff'].pow(0.5))
        group['zi'] = group['ti_diff'].pow(0.5)

        # remove nan rows due to diff
        group = group.dropna(subset=['log_price_ratio', 'wi', 'zi'])

        # check if group has enough points to fit
        if group.shape[0] > 1:
            # prep vars for regression
            X = group[['wi', 'zi']]
            y = group['log_price_ratio']

            # fit ols
            model = sm.OLS(y, X).fit()
            
            # extract names
            predictors = model.params.index
            
            # check if desired predictor is present
            if 'wi' in predictors:
                # calc confidence intervals
                confidence_intervals = model.conf_int()

                # append to results list
                results.append({
                    'RIC': ric,
                    'Date': date,
                    'Gamma Estimate': model.params['wi'],
                    'CI Lower Bound': confidence_intervals.loc['wi', 0],
                    'CI Upper Bound': confidence_intervals.loc['wi', 1]
                })
            else:
                print(f"Predictor 'wi' not found in the model's parameters for RIC {ric} on {date}")

# convert list to dataframe and save
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join('..', 'models', 'liq_param', 'liq_results.csv'))

# display dataframe
display_scrollable(results_df)

  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid


Unnamed: 0,RIC,Date,Gamma Estimate,CI Lower Bound,CI Upper Bound
0,ABBN.S,2023-12-21,1.02219e-10,8.310948e-11,1.213285e-10
1,ABBN.S,2023-12-22,7.514952e-11,-3.089147e-11,1.811905e-10
2,ABBN.S,2023-12-27,7.78551e-11,6.219103e-11,9.351917e-11
3,ABBN.S,2023-12-28,6.974448e-11,-5.305702e-11,1.92546e-10
4,ABBN.S,2023-12-29,1.468415e-10,1.112931e-10,1.8239e-10
5,ABBN.S,2024-01-03,5.163514e-11,3.313427e-11,7.013602e-11
6,ABBN.S,2024-01-04,1.493666e-10,1.222573e-10,1.764759e-10
7,ABBN.S,2024-01-05,6.266399e-11,3.992471e-11,8.540327e-11
8,ABBN.S,2024-01-08,1.497933e-10,1.115479e-10,1.880387e-10
9,ABBN.S,2024-01-09,1.799871e-10,1.465359e-10,2.134382e-10
