In [1]:
'''
Analyze volume for all cattle commodities split by before and after an arbitrary number of days to contract expiration (DTE).
For example if we set the DTE to 45 then in each chart would get one volume trend line showing volumes on trading contracts that are
greater then 45 DTE and another for those with less than 45 DTE
For each contract we plot the following:
- Average Daily Nominal Trading Volume By Minute
- Average Daily Normalized Trading Volume By Minute
'''
import pandas as pd
import os
import ipywidgets as widgets
from IPython.display import display
from tqdm.notebook import trange, tqdm
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

In [59]:
CONTRACTS_PREFIX_MATCHER = 'LE' # Only contracts with filenames matching this prefix will be analyzed
DAYS_TO_EXPIRATION_THRESHOLD = 45.0 # A positive float to use for splitting the volume data under analysis

In [3]:
# Set plotly as the plotting engine for pandas for convenience
pd.options.plotting.backend = "plotly"

In [4]:
# Get a list of all the csv files to process
csv_files = []
for file in os.listdir("../data/raw/firstratedata_futures"):
    if file.startswith(CONTRACTS_PREFIX_MATCHER):
        csv_files.append(file)
csv_files.sort()
print(f"Analyzing the following files: {csv_files}")

Analyzing the following files: ['LEG09.csv', 'LEG10.csv', 'LEG11.csv', 'LEG12.csv', 'LEG13.csv', 'LEG14.csv', 'LEG15.csv', 'LEG16.csv', 'LEG17.csv', 'LEG18.csv', 'LEG19.csv', 'LEG20.csv', 'LEJ08.csv', 'LEJ09.csv', 'LEJ10.csv', 'LEJ11.csv', 'LEJ12.csv', 'LEJ13.csv', 'LEJ14.csv', 'LEJ15.csv', 'LEJ16.csv', 'LEJ17.csv', 'LEJ18.csv', 'LEJ19.csv', 'LEJ20.csv', 'LEM08.csv', 'LEM09.csv', 'LEM10.csv', 'LEM11.csv', 'LEM12.csv', 'LEM13.csv', 'LEM14.csv', 'LEM15.csv', 'LEM16.csv', 'LEM17.csv', 'LEM18.csv', 'LEM19.csv', 'LEM20.csv', 'LEQ08.csv', 'LEQ09.csv', 'LEQ10.csv', 'LEQ11.csv', 'LEQ12.csv', 'LEQ13.csv', 'LEQ14.csv', 'LEQ15.csv', 'LEQ16.csv', 'LEQ17.csv', 'LEQ18.csv', 'LEQ19.csv', 'LEQ20.csv', 'LEV08.csv', 'LEV09.csv', 'LEV10.csv', 'LEV11.csv', 'LEV12.csv', 'LEV13.csv', 'LEV14.csv', 'LEV15.csv', 'LEV16.csv', 'LEV17.csv', 'LEV18.csv', 'LEV19.csv', 'LEV20.csv', 'LEZ08.csv', 'LEZ09.csv', 'LEZ10.csv', 'LEZ11.csv', 'LEZ12.csv', 'LEZ13.csv', 'LEZ14.csv', 'LEZ15.csv', 'LEZ16.csv', 'LEZ17.csv', 'LEZ18

In [5]:
def convert_csv_to_df(filename):
    df_volume = pd.read_csv(
        f"../data/raw/firstratedata_futures/{filename}",
        parse_dates=['DateTime'], usecols=['DateTime', 'Volume'], index_col=['DateTime']
    )
    return df_volume

In [6]:
def get_unique_trading_days(df):
    '''Calculate the number of unique trading days in the dataset'''
    unique_trading_days = df.index.map(lambda t: t.date()).unique()
    return unique_trading_days

In [7]:
def initialize_df_grouped_by_minute():
    '''Initialize an empty dataframe with no data and an index with a row for each minute of the day'''
    date_range = pd.date_range(start='1/1/2021', end='1/02/2021', freq='T')[:-1]
    new_df = pd.DataFrame(data={'DateTime':date_range}).set_index('DateTime')
    new_df = new_df.groupby(lambda x: x.time()).sum()
    return new_df

In [8]:
def combine_data_frames(df1,df2):
    '''Return a dataframe that concats the two provided dataframes together'''
    combined_df = pd.concat([df1, df2])
    return combined_df

In [78]:
def resample_volume_by_minute(df):
    '''Resample the data-set by minute filling in the gaps and summing the trading volume within each minute'''
    df_temp = df[['Volume']].resample('1T').sum()[["Volume"]]
    df_volume_by_minute = df_temp.groupby(lambda x: x.time()).sum()
    return df_volume_by_minute

In [49]:
def split_dataframe_by_dte(a_df: pd.DataFrame, dte_threshold: float) -> pd.DataFrame:
    '''
    Split a dataframe into two dataframes. One contains all rows where the dte is <= the dte_threshold 
    and the other contains all rows where the dte is > the dte_threshold
    '''
    less_than_or_equal_dte_threshold_df = a_df[a_df["Days To Contract Expiration"] <= dte_threshold].copy()
    greater_than_dte_threshold_df = a_df[a_df["Days To Contract Expiration"] > dte_threshold].copy()
    return (less_than_or_equal_dte_threshold_df, greater_than_dte_threshold_df)

In [85]:
def get_master_avg_daily_nominal_df(
    lte_dte_by_minute_df: pd.DataFrame, gt_dte_by_minute_df: pd.DataFrame,
    num_lte_dte_unique_trading_days: int, num_gt_dte_unique_trading_days: int
) -> pd.DataFrame:
    '''Create the dataframe with the average intraday nominal trading volume by minute'''
    master_avg_daily_nominal_df = initialize_df_grouped_by_minute()
    lte_dte_by_minute_df = lte_dte_by_minute_df.rename(columns={'Volume':f"Total Volume <= {DAYS_TO_EXPIRATION_THRESHOLD} DTE"})
    lte_dte_by_minute_df[f"Average Volume <= {DAYS_TO_EXPIRATION_THRESHOLD} DTE"] = lte_dte_by_minute_df.apply(lambda row: row / num_lte_dte_unique_trading_days )
    gt_dte_by_minute_df = gt_dte_by_minute_df.rename(columns={'Volume':f"Total Volume > {DAYS_TO_EXPIRATION_THRESHOLD} DTE"})
    gt_dte_by_minute_df[f"Average Volume > {DAYS_TO_EXPIRATION_THRESHOLD} DTE"] = gt_dte_by_minute_df.apply(lambda row: row / num_gt_dte_unique_trading_days )
    master_avg_daily_nominal_df = pd.concat([master_avg_daily_nominal_df, lte_dte_by_minute_df, gt_dte_by_minute_df], axis=1)
    return master_avg_daily_nominal_df

In [12]:
def clean_master_avg_daily_nominal_df(master_avg_daily_nominal_df: pd.DataFrame, split_date_cutoff: int) -> pd.DataFrame:
    '''Drop columns from the df we don't need'''
    return master_avg_daily_nominal_df.drop([f"Total Volume Before {split_date_cutoff}",f"Total Volume After {split_date_cutoff}"], axis=1)

In [13]:
def clean_master_avg_daily_normalized_df(df_to_clean: pd.DataFrame, split_date_cutoff: int) -> pd.DataFrame:
    '''Drop columns from the df we don't need'''
    return df_to_clean.drop([f"Total Volume Before {split_date_cutoff}",f"Total Volume After {split_date_cutoff}"], axis=1)

In [14]:
# Returns an array of normalized values given an ndarray of nominal values
def normalize_nd_array(to_normalize):
    '''train the normalization'''
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler = scaler.fit(to_normalize)
    # print('Min: %f, Max: %f' % (scaler.data_min_, scaler.data_max_))
    # normalize the dataset and print the first 5 rows
    normalized = scaler.transform(to_normalize)
    normalized = list(map(lambda x: x[0], normalized.tolist()))
    # for i in range(5):
    # 	print(normalized[i])
    return normalized

In [15]:
def create_dataframes_for_volume_grouped_by_minute(df_volume_by_minute, unique_trading_days):
    '''
    Create and populate an array of dataframes. Each dataframe contains one trading days worth of volume values grouped by minute
    and normalized against just that days worth of activity
    '''
    frames = []
    for i in trange(len(unique_trading_days), desc="Splitting into dataframes grouped by minute"):
        day=unique_trading_days[i]
        string_date = day.strftime("%Y-%m-%d")
        days_df = df_volume_by_minute.loc[string_date]
        volume_values = days_df['Volume'].values
        volume_values = volume_values.reshape((len(volume_values), 1))
        normalized_day_volume = normalize_nd_array(volume_values)
        days_df['Volume Normalized Intraday'] = normalized_day_volume
        frames.append(days_df.copy())
    return frames

In [16]:
def concat_to_single_df(frames):
    '''Concatenate the entire array of dataframes back into one big dataframe that contains the Volume Normalized Intraday values for every minute of every day'''
    df_intraday_normalized = pd.concat(frames)
    return df_intraday_normalized

In [17]:
# Returns the volume Normalized intraday Avg
def calculate_normalized_vol_by_minute(intraday_summed, num_unique_trading_days):
    return intraday_summed / num_unique_trading_days

In [18]:
def group_by_minute_sum_normalized_volumes(df_intraday_normalized, num_unique_trading_days):
    '''Group by minute across all days summing the intraday normalized volumes'''
    df_normalized_grouped_by_minute = df_intraday_normalized.groupby(lambda x: x.time()).sum().rename(columns={'Volume Normalized Intraday':'Volume Normalized Summed'})
    # Add a column that shows the average intraday normalized volume for each minute
    df_normalized_grouped_by_minute['Daily Avg Volume Normalized'] = df_normalized_grouped_by_minute.apply(
        lambda row: calculate_normalized_vol_by_minute(row['Volume Normalized Summed'],
                                                       num_unique_trading_days),
        axis=1
    )
    return df_normalized_grouped_by_minute

In [19]:
def calculate_dte_for_row(row, last_unique_trading_minute_in_contract):
    this_row_date = row.name.to_pydatetime()
    # print(f"this_row_date {this_row_date}")
    time_difference = last_unique_trading_minute_in_contract - this_row_date
    # print(f"time_difference {time_difference}")
    return time_difference.days

In [20]:
def add_dte_column_to_df(a_contract_df):
    unique_trading_days = list(get_unique_trading_days(a_contract_df))
    unique_trading_days.sort()
    last_unique_trading_day_in_contract = unique_trading_days[-1].strftime("%Y-%m-%d")
    last_unique_trading_minute_in_contract = a_contract_df.loc[last_unique_trading_day_in_contract].iloc[-1].name.to_pydatetime()
    a_contract_df["Days To Contract Expiration"] = a_contract_df.apply(lambda r: calculate_dte_for_row(r, last_unique_trading_minute_in_contract), axis=1)
    return a_contract_df

In [21]:
def get_master_ungrouped_data_frame(files_to_process):
    '''Build up a single dataframe containing volume DateTime and DTE for all contracts'''
    grouped_df = pd.DataFrame(columns = ["DateTime", "Volume"]).set_index('DateTime')
    for i in trange(len(files_to_process), desc=f"Overall Analysis"):
        file = files_to_process[i]
        contract_symbol = file[:len(file) - 4]
        a_contract_df = convert_csv_to_df(file)
        with_dte_df = add_dte_column_to_df(a_contract_df)
        grouped_df = combine_data_frames(grouped_df, with_dte_df)
        # display(a_contract_df)
    return grouped_df

In [22]:
# Gather all the data from every contract into one big dataframe
master_ungrouped_df = get_master_ungrouped_data_frame(csv_files)

Overall Analysis:   0%|          | 0/77 [00:00<?, ?it/s]

In [33]:
# Sort the grouped dataframe
master_ungrouped_df = master_ungrouped_df.sort_values(by=['DateTime'])

In [60]:
# Split the dataframe in two based on the DTE threshold
less_than_or_equal_dte_threshold_df, greater_than_dte_threshold_df = split_dataframe_by_dte(master_ungrouped_df, DAYS_TO_EXPIRATION_THRESHOLD)

In [62]:
# Determine the unique trading days before and after the DTE threshold
less_than_or_equal_dte_unique_trading_days = get_unique_trading_days(less_than_or_equal_dte_threshold_df)
greater_than_dte_unique_trading_days = get_unique_trading_days(greater_than_dte_threshold_df)

In [65]:
# Calculate the number of unique trading days before and after the cutoff date
num_lte_dte_unique_trading_days = len(less_than_or_equal_dte_unique_trading_days)
num_gt_dte_unique_trading_days = len(greater_than_dte_unique_trading_days)

In [79]:
# Resample the before and after dataframes to show the total volume by minute of the day
lte_dte_by_minute_df = resample_volume_by_minute(less_than_or_equal_dte_threshold_df)
gt_dte_by_minute_df = resample_volume_by_minute(greater_than_dte_threshold_df)

In [87]:
# Calculate the average intraday volume for each minute of the day before and after the dte threshold
# Put all that information into one dataframe for charting
master_avg_daily_nominal_df = get_master_avg_daily_nominal_df(
    lte_dte_by_minute_df, gt_dte_by_minute_df,
    num_lte_dte_unique_trading_days, num_gt_dte_unique_trading_days
)

In [90]:
# master_avg_daily_nominal_df.iloc[560:580]

Unnamed: 0_level_0,Total Volume <= 45.0 DTE,Average Volume <= 45.0 DTE,Total Volume > 45.0 DTE,Average Volume > 45.0 DTE
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
09:20:00,6581,2.576742,32434,9.547836
09:21:00,6321,2.474941,31953,9.406241
09:22:00,6518,2.552075,33112,9.747424
09:23:00,6283,2.460063,33145,9.757139
09:24:00,6046,2.367267,32507,9.569326
09:25:00,6494,2.542678,30553,8.994112
09:26:00,6595,2.582224,33038,9.72564
09:27:00,6181,2.420125,31162,9.173388
09:28:00,6009,2.35278,32018,9.425375
09:29:00,6573,2.57361,33043,9.727112


In [None]:
# Remove columns from the dataframe that we have no intention of charting
master_avg_daily_nominal_df = clean_master_avg_daily_nominal_df(
    master_avg_daily_nominal_df, DAYS_TO_EXPIRATION_THRESHOLD
)

In [None]:
# Create two arrays of dataframes each array element is a dataframe for a single days worth of intraday normalized volume
before_cutoff_date_frames = create_dataframes_for_volume_grouped_by_minute(master_ungrouped_df, before_date_unique_trading_days)
after_cutoff_date_frames = create_dataframes_for_volume_grouped_by_minute(master_ungrouped_df, after_date_unique_trading_days)

In [None]:
# Concatenate the array of dataframes into one big dataframe each for before and after the cutoff date
df_intraday_normalized_before = concat_to_single_df(before_cutoff_date_frames)
df_intraday_normalized_after = concat_to_single_df(after_cutoff_date_frames)

In [None]:
df_intraday_normalized_after

In [None]:
# Group by minute normalized for both before and after the cutoff date
df_intraday_normalized_before_grouped_by_minute = group_by_minute_sum_normalized_volumes(df_intraday_normalized_before, num_before_date_unique_trading_days)
df_intraday_normalized_after_grouped_by_minute = group_by_minute_sum_normalized_volumes(df_intraday_normalized_after, num_after_date_unique_trading_days)

In [None]:
# Rename and drop some columns we no longer need
df_intraday_normalized_before_grouped_by_minute = df_intraday_normalized_before_grouped_by_minute.rename(columns={'Daily Avg Volume Normalized':f"Average Volume Before {DAYS_TO_EXPIRATION_THRESHOLD}"})
df_intraday_normalized_before_grouped_by_minute = df_intraday_normalized_before_grouped_by_minute.drop(["Volume Normalized Summed"], axis=1)
df_intraday_normalized_after_grouped_by_minute = df_intraday_normalized_after_grouped_by_minute.rename(columns={'Daily Avg Volume Normalized':f"Average Volume After {DAYS_TO_EXPIRATION_THRESHOLD}"})
df_intraday_normalized_after_grouped_by_minute = df_intraday_normalized_after_grouped_by_minute.drop(["Volume Normalized Summed"], axis=1)

In [None]:
# Combine all relev
master_avg_daily_normalized_df = pd.concat([df_intraday_normalized_before_grouped_by_minute, df_intraday_normalized_after_grouped_by_minute], axis=1)

In [None]:
# Create and display our figure for nominal intraday volume
fig1 = master_avg_daily_nominal_df.plot(kind="line", title=f"All Contracts Starting With {CONTRACTS_PREFIX_MATCHER} - Average Intraday Nominal Trading Volume By Minute")
fig2 = master_avg_daily_normalized_df.plot(kind="line", title=f"All Contracts Starting With {CONTRACTS_PREFIX_MATCHER} - Average Intraday Normalized Trading Volume By Minute")
fig1.show()
fig2.show()