In [1]:
'''
Analyze volume for all cattle commodities split by before and after an arbitrary cutoff date
For each contract we plot the following:
- Average Daily Nominal Trading Volume By Minute
- Average Daily Normalized Trading Volume By Minute
'''
import pandas as pd
import os
import ipywidgets as widgets
from IPython.display import display
from tqdm.notebook import trange, tqdm
from datetime import datetime

In [2]:
CONTRACTS_PREFIX_MATCHER = 'LEM' # Only files whose names start with this prefix will be analyzed
SPLIT_DATE_CUTOFF = '2015-06-07' # Cutoff date in YYYY-MM-DD form: rows dated before it are "before", rows on or after it are "after"

In [3]:
# Route DataFrame.plot() calls through plotly rather than the default backend
pd.set_option("plotting.backend", "plotly")

In [4]:
# Collect, in sorted order, the name of every file in the raw data directory
# whose name starts with the configured contract prefix
csv_files = sorted(
    name
    for name in os.listdir("../data/raw/firstratedata_futures")
    if name.startswith(CONTRACTS_PREFIX_MATCHER)
)
csv_files

['LEM08.csv',
 'LEM09.csv',
 'LEM10.csv',
 'LEM11.csv',
 'LEM12.csv',
 'LEM13.csv',
 'LEM14.csv',
 'LEM15.csv',
 'LEM16.csv',
 'LEM17.csv',
 'LEM18.csv',
 'LEM19.csv',
 'LEM20.csv']

In [5]:
def convert_csv_to_df(filename, data_dir="../data/raw/firstratedata_futures"):
    '''Load one contract CSV into a dataframe of volume indexed by timestamp

    Only the 'DateTime' and 'Volume' columns are read. 'DateTime' is parsed as
    dates and used as the index.

    Parameters:
        filename: name of the CSV file inside data_dir (e.g. 'LEM08.csv')
        data_dir: directory that holds the contract CSV files; defaults to the
            raw futures data directory used by the rest of this notebook

    Returns:
        A dataframe with a 'DateTime' index and a single 'Volume' column
    '''
    csv_path = os.path.join(data_dir, filename)
    return pd.read_csv(
        csv_path,
        usecols=['DateTime', 'Volume'],
        parse_dates=['DateTime'],
        index_col=['DateTime'],
    )

In [6]:
def get_unique_trading_days(df):
    '''Return the distinct calendar dates found in the dataframe's index

    Callers take len() of the result to get the number of trading days.
    '''
    calendar_dates = df.index.map(lambda stamp: stamp.date())
    return calendar_dates.unique()

In [7]:
def initialize_df_grouped_by_minute():
    '''Initialize an empty dataframe with no data and an index with a row for each minute of the day

    Returns:
        A dataframe with no columns whose index holds one datetime.time value
        per minute, from 00:00 through 23:59
    '''
    # 'min' is the minute frequency alias; the older 'T' spelling is deprecated
    # in recent pandas releases. The slice drops the final stamp (midnight of
    # the following day) so exactly one day of minutes remains.
    minute_stamps = pd.date_range(start='1/1/2021', end='1/02/2021', freq='min')[:-1]
    new_df = pd.DataFrame(data={'DateTime': minute_stamps}).set_index('DateTime')
    # Grouping the column-less frame by time of day swaps the timestamp index
    # for a time-of-day index
    return new_df.groupby(lambda stamp: stamp.time()).sum()

In [8]:
def combine_data_frames(df1,df2):
    '''Stack the rows of df2 beneath the rows of df1 and return the result as a new dataframe'''
    frames_to_stack = [df1, df2]
    return pd.concat(frames_to_stack)

In [9]:
def get_master_ungrouped_data_frame(files_to_process):
    '''Build up a single dataframe containing volume and DateTime for all contracts

    Every file is loaded with convert_csv_to_df and the per-contract frames are
    concatenated once at the end, which avoids re-copying the accumulated rows
    on every iteration.

    Parameters:
        files_to_process: list of contract CSV filenames to load

    Returns:
        One dataframe holding the rows of every contract, in the order the
        files were given. When no files are given, an empty dataframe with a
        'DateTime' index and a 'Volume' column is returned.
    '''
    contract_frames = [
        convert_csv_to_df(file)
        for file in tqdm(files_to_process, desc="Overall Analysis")
    ]
    if not contract_frames:
        # pd.concat raises on an empty list, so hand back the same empty frame
        # the accumulating version started from
        return pd.DataFrame(columns=["DateTime", "Volume"]).set_index('DateTime')
    return pd.concat(contract_frames)

In [10]:
def resample_volume_by_minute(df):
    '''Resample the data-set by minute filling in the gaps and summing the trading volume within each minute

    The per-minute totals are then grouped by time of day, so each row of the
    result is the total volume traded in that minute across every date in df.

    Parameters:
        df: dataframe with a datetime index and a 'Volume' column

    Returns:
        A dataframe indexed by datetime.time with a single 'Volume' column
    '''
    # '1min' is the minute frequency alias; the older '1T' spelling is
    # deprecated in recent pandas releases
    volume_per_minute = df[["Volume"]].resample('1min').sum()
    # index.time yields the time-of-day keys in one vectorised step instead of
    # calling a Python lambda for every row
    return volume_per_minute.groupby(volume_per_minute.index.time).sum()

In [11]:
def split_dataframe_by_date(df, split_date):
    '''Split a dataframe into two dataframes. One contains all rows before the split_date and the other contains all rows on or after it

    Parameters:
        df: dataframe with a datetime index
        split_date: cutoff date as a 'YYYY-MM-DD' string

    Returns:
        A (before_date_df, after_date_df) tuple of independent copies
    '''
    cutoff = datetime.strptime(split_date, '%Y-%m-%d').date()
    # Filter the dataframe that was passed in, not a notebook-level global, so
    # the function works for any caller
    index_dates = df.index.date
    before_date_df = df[index_dates < cutoff].copy()
    after_date_df = df[index_dates >= cutoff].copy()
    return (before_date_df, after_date_df)

In [12]:
def _label_total_and_average(volume_by_minute_df, period_label, unique_trading_days):
    '''Rename 'Volume' to a labelled total column and add the matching average-per-day column'''
    total_column = f"Total Volume {period_label}"
    labelled_df = volume_by_minute_df.rename(columns={'Volume': total_column})
    # Divide the total column directly; this stays valid even if the frame
    # carries other columns, unlike a frame-wide apply
    labelled_df[f"Average Volume {period_label}"] = labelled_df[total_column] / unique_trading_days
    return labelled_df


def get_master_avg_daily_nominal_df(
    before_date_by_minute_df: pd.DataFrame, after_date_by_minute_df: pd.DataFrame,
    before_date_unique_trading_days: int, after_date_unique_trading_days: int
) -> pd.DataFrame:
    '''Combine the before and after per-minute volumes into one dataframe for charting

    For each period the 'Volume' column is renamed to a total column and an
    average column (total divided by that period's trading-day count) is added.
    Column names are suffixed with the notebook's SPLIT_DATE_CUTOFF.

    Parameters:
        before_date_by_minute_df: total volume per minute of day before the cutoff
        after_date_by_minute_df: total volume per minute of day from the cutoff onwards
        before_date_unique_trading_days: number of trading days before the cutoff
        after_date_unique_trading_days: number of trading days from the cutoff onwards

    Returns:
        A dataframe with the total and average columns of both periods, aligned
        on the minute-of-day index from initialize_df_grouped_by_minute
    '''
    before_labelled_df = _label_total_and_average(
        before_date_by_minute_df, f"Before {SPLIT_DATE_CUTOFF}", before_date_unique_trading_days
    )
    after_labelled_df = _label_total_and_average(
        after_date_by_minute_df, f"After {SPLIT_DATE_CUTOFF}", after_date_unique_trading_days
    )
    # Start from the minute-of-day frame so it takes part in the column-wise alignment
    minute_of_day_df = initialize_df_grouped_by_minute()
    return pd.concat([minute_of_day_df, before_labelled_df, after_labelled_df], axis=1)

In [13]:
def clean_master_avg_daily_nominal_df(master_avg_daily_nominal_df: pd.DataFrame, split_date_cutoff: str) -> pd.DataFrame:
    '''Drop the total-volume columns, which are not needed for charting

    Parameters:
        master_avg_daily_nominal_df: dataframe holding the total and average volume columns
        split_date_cutoff: the cutoff date string that appears in the column names

    Returns:
        A new dataframe without the two "Total Volume ..." columns
    '''
    total_volume_columns = [
        f"Total Volume Before {split_date_cutoff}",
        f"Total Volume After {split_date_cutoff}",
    ]
    return master_avg_daily_nominal_df.drop(columns=total_volume_columns)

In [14]:
# Gather all the data from every contract into one big dataframe.
# Each file named in csv_files is loaded and the rows are stacked together.
master_ungrouped_df = get_master_ungrouped_data_frame(csv_files)

Overall Analysis:   0%|          | 0/13 [00:00<?, ?it/s]

In [15]:
# Sort the big ungrouped dataframe by 'DateTime', then split it into a before and
# after dataframe using the cutoff date
master_ungrouped_df = master_ungrouped_df.sort_values(by=['DateTime'])
before_date_df, after_date_df = split_dataframe_by_date(master_ungrouped_df, SPLIT_DATE_CUTOFF)

In [16]:
# Calculate the number of unique trading days before and after the cutoff date.
# These counts are the divisors used later to turn total volume into average daily volume.
before_date_unique_trading_days = len(get_unique_trading_days(before_date_df))
after_date_unique_trading_days = len(get_unique_trading_days(after_date_df))

In [None]:
# Resample the before and after dataframes to show the total volume by minute of the day,
# summed across every date in each period
before_date_by_minute_df = resample_volume_by_minute(before_date_df)
after_date_by_minute_df = resample_volume_by_minute(after_date_df)

In [None]:
# Calculate the average intraday volume for each minute of the day before and after the cutoff date.
# Put all that information into one dataframe for charting
master_avg_daily_nominal_df = get_master_avg_daily_nominal_df(
    before_date_by_minute_df, after_date_by_minute_df,
    before_date_unique_trading_days, after_date_unique_trading_days
)
# master_avg_daily_nominal_df  (uncomment to inspect the combined dataframe)

In [None]:
# Remove columns from the dataframe that we have no intention of charting
# (the two total-volume columns), leaving only the average-volume columns
master_avg_daily_nominal_df = clean_master_avg_daily_nominal_df(
    master_avg_daily_nominal_df, SPLIT_DATE_CUTOFF
)

In [None]:
# Create and display our figure for nominal intraday volume.
# The title is built from CONTRACTS_PREFIX_MATCHER so it stays correct when a
# different contract prefix is configured.
fig1 = master_avg_daily_nominal_df.plot(
    kind="line",
    title=f"All {CONTRACTS_PREFIX_MATCHER} Contracts - Average Intraday Nominal Trading Volume By Minute",
)
fig1.show()