In [None]:
import pandas as pd
import os
import datetime
from tqdm.notebook import trange, tqdm

A quick and dirty way to test whether there is any meaningful correlation between the intraday price change at minute 60 after the open and the price difference between the market open and a prior day's settlement price.
We do this two ways:
- First, we generate a histogram for each combination of open type and interval lookback period, showing the frequency of occurrence of the various values of 'Price Difference b/w Open And Prior Day Settlement'. This allows us to see the shape of the distribution.
- Second, we draw a scatter plot to show the relationship between these two variables on an x-y coordinate plane.

In [None]:
# These parameters allow us to filter out trading activity on days where the
# contract DTE tends to have missing open bars (both bounds are applied inclusively).
DTE_FILTER_LOWER_BOUNDARY = 25
DTE_FILTER_UPPER_BOUNDARY = 140

# Every data location is resolved relative to the directory the notebook runs from.
CURRENT_DIR = os.getcwd()

# Directory holding the settlement-change CSV files.
SETLLEMENT_CHANGE_DATA_PATH = os.path.join(
    CURRENT_DIR,
    '../data/processed/futures_contracts/settlement_analytics',
)

# Intraday bar files, one per open type (sliding open / true open).
CONTRACT_INTRADAY_SLIDING_OPEN_FILE_PATH = os.path.join(
    CURRENT_DIR,
    '../data/processed/futures_contracts/contract_open_enriched_sliding_open.csv',
)
CONTRACT_INTRADAY_TRUE_OPEN_FILE_PATH = os.path.join(
    CURRENT_DIR,
    '../data/processed/futures_contracts/contract_open_enriched_true_open.csv',
)

In [None]:
def get_change_from_settlement_information(filename):
    '''Load one settlement-change CSV and keep only rows with a populated price difference.

    The file is read from SETLLEMENT_CHANGE_DATA_PATH; only the 'Date' (parsed as
    datetimes), 'Symbol' and 'Price Difference b/w Open And Prior Day Settlement'
    columns are loaded.

    Returns a `(dataframe, filename)` tuple so the frame stays paired with the
    name of the file it was read from.
    '''
    price_diff_column = 'Price Difference b/w Open And Prior Day Settlement'
    csv_path = '/'.join([SETLLEMENT_CHANGE_DATA_PATH, filename])
    settlement_df = pd.read_csv(
        csv_path,
        parse_dates=['Date'],
        usecols=['Date', price_diff_column, 'Symbol'],
    )
    # Rows without a price difference carry no information for this analysis.
    price_diff_is_missing = settlement_df[price_diff_column].isnull()
    return settlement_df[~price_diff_is_missing], filename

In [None]:
def intraday_open_csv_to_df(filename) -> pd.DataFrame:
    '''Read an intraday open-enriched CSV into a DataFrame.

    Only the columns listed below are loaded; 'DateTime' is parsed as datetimes,
    every other column keeps the dtype pandas infers for it.
    '''
    wanted_columns = [
        'Symbol',
        'DateTime',
        'Open Minutes Offset',
        'Open',
        'High',
        'Low',
        'Close',
        'Volume',
        'Price Change From Intraday Open',
        'Expiration Date',
        'DTE',
    ]
    return pd.read_csv(filename, parse_dates=['DateTime'], usecols=wanted_columns)

In [None]:
def filter_bars_for_dte_with_frequently_missing_open(
    intraday_open_df: pd.DataFrame,
    dte_filter_lower_boundary: int,
    dte_filter_upper_boundary: int
) -> pd.DataFrame:
    '''Keep only the bars whose DTE lies inside the given window.

    Both boundaries are inclusive. Days outside the window are associated with
    a DTE that is often missing a true open bar, so they are dropped.
    '''
    dte_in_window = intraday_open_df['DTE'].between(
        dte_filter_lower_boundary, dte_filter_upper_boundary
    )
    return intraday_open_df[dte_in_window]

In [None]:
def get_settlement_price_change_for_date(a_date: datetime.date, a_settlement_df: pd.DataFrame):
    '''Look up the open-vs-prior-settlement price difference recorded for `a_date`.

    Matches on the calendar date of the 'Date' column. If several rows share the
    date, the first one is used. Returns None when no row matches.
    '''
    is_requested_date = a_settlement_df['Date'].dt.date == a_date
    matching_rows = a_settlement_df[is_requested_date]
    if matching_rows.empty:
        return None
    first_match = matching_rows.iloc[0]
    return first_match['Price Difference b/w Open And Prior Day Settlement']

In [None]:
def get_intraday_price_change_at_minute_sixty(a_date: datetime.date, intraday_df: pd.DataFrame):
    '''Return 'Price Change From Intraday Open' for the bar at open offset 59 on `a_date`.

    Returns None when the date has no bars at all, or when it has no bar with
    'Open Minutes Offset' equal to 59. If several such bars exist, the first is used.
    '''
    # NOTE(review): offset 59 is presumably the 60th minute of a zero-based offset — confirm.
    minute_sixty_offset = 59

    bars_on_date = intraday_df[intraday_df['DateTime'].dt.date == a_date]
    if bars_on_date.empty:
        return None

    minute_sixty_bars = bars_on_date[bars_on_date['Open Minutes Offset'] == minute_sixty_offset]
    if minute_sixty_bars.empty:
        return None

    first_bar = minute_sixty_bars.iloc[0]
    return first_bar['Price Change From Intraday Open']

In [None]:
# Collect the names of the settlement-change files to analyse.
files = os.listdir(SETLLEMENT_CHANGE_DATA_PATH)
# Discard the fourth directory entry.
# NOTE(review): os.listdir returns entries in arbitrary order, so removing by
# position may drop a different file on another machine — consider excluding
# the unwanted file by name instead.
files.pop(3)
files

In [None]:
# One (dataframe, filename) pair per settlement-change file.
settlement_change_datasets = list(map(get_change_from_settlement_information, files))

Generate a histogram for each combination of open type and interval lookback period, showing the frequency of occurrence of the various values of 'Price Difference b/w Open And Prior Day Settlement'. This allows us to see the shape of the distribution.

In [None]:
# One histogram per dataset, titled with the name of the file it came from.
for dataset_df, dataset_name in settlement_change_datasets:
    price_diff_only_df = dataset_df[['Price Difference b/w Open And Prior Day Settlement']]
    price_diff_only_df.plot.hist(bins=30, figsize=(30, 8), title=dataset_name)

In [None]:
# Load both intraday datasets first, then restrict each to the configured
# (inclusive) DTE window.
dte_window = (DTE_FILTER_LOWER_BOUNDARY, DTE_FILTER_UPPER_BOUNDARY)

sliding_open_df = intraday_open_csv_to_df(CONTRACT_INTRADAY_SLIDING_OPEN_FILE_PATH)
true_open_df = intraday_open_csv_to_df(CONTRACT_INTRADAY_TRUE_OPEN_FILE_PATH)

sliding_open_df = filter_bars_for_dte_with_frequently_missing_open(sliding_open_df, *dte_window)
true_open_df = filter_bars_for_dte_with_frequently_missing_open(true_open_df, *dte_window)

In [None]:
# Distinct contract symbols, in order of first appearance in the sliding-open data.
unique_symbols = sliding_open_df['Symbol'].unique().tolist()

In [None]:
# Build, for every settlement-change dataset, the list of per-(symbol, date)
# observations that pair the open-vs-prior-settlement price difference with the
# intraday change from open at minute 60. Keys are the dataset file names.
corellation_matrix = {}
for dataset_df, dataset_name in tqdm(settlement_change_datasets, desc="settlement change dataset"):
    records = []
    corellation_matrix[dataset_name] = records

    # dataset_name is a file name from the settlement directory, so comparing it
    # for equality with "true" never matched and the true-open bars were never
    # used. Look for the open type inside the name instead.
    # NOTE(review): assumes true-open file names contain "true" and sliding-open
    # names do not — confirm against the directory listing.
    if "true" in dataset_name.lower():
        intraday_df = true_open_df
    else:
        intraday_df = sliding_open_df

    # NOTE(review): unique_symbols is derived from the sliding-open data only;
    # symbols present solely in the true-open data are not visited.
    for a_symbol in tqdm(unique_symbols, desc="Contract symbols for dataset"):
        settlement_for_symbol = dataset_df[dataset_df['Symbol'] == a_symbol]
        intraday_for_symbol = intraday_df[intraday_df['Symbol'] == a_symbol]

        for a_date in settlement_for_symbol['Date'].dt.date.drop_duplicates():
            settlement_price_diff = get_settlement_price_change_for_date(
                a_date=a_date, a_settlement_df=settlement_for_symbol
            )
            intraday_cfo = get_intraday_price_change_at_minute_sixty(
                a_date=a_date, intraday_df=intraday_for_symbol
            )
            # Only keep dates for which both sides of the pair exist.
            if settlement_price_diff is None or intraday_cfo is None:
                continue
            records.append({
                'Symbol': a_symbol,
                'Date': a_date,
                'Price Difference b/w Open And Prior Day Settlement': settlement_price_diff,
                'CFO At Open t+60': intraday_cfo
            })
  
  

Generate the scatter plots.

In [None]:
# One scatter plot per settlement-change dataset: x is the open-vs-prior-settlement
# price difference, y is the change from open at minute 60.
for title, records in corellation_matrix.items():
    # An empty record list would yield a DataFrame without columns and make the
    # plot call fail, so skip datasets that produced no matched observations.
    if not records:
        print(f"No matched observations for {title}; skipping scatter plot")
        continue
    scatter_df = pd.DataFrame(records)
    scatter_df.plot.scatter(
        x='Price Difference b/w Open And Prior Day Settlement',
        y='CFO At Open t+60',
        title=title,
        figsize=(10, 10),
    )