In [77]:
import os
import pandas as pd
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from IPython.display import display
from tqdm.notebook import trange, tqdm

This notebook is used to analyze our enriched contract data and provide insight on how many days are missing a true market open bar (ie they open flat) vs have a true market open bar populated.
The data that that is gathered is grouped and visualized both by contract and by days to expiration (DTE)
It is dependent upong the contract_open_enriched_with_flat_open dataset already being populated via the apppriate predecessor etl script.

In [78]:
CONTRACT_OPEN_ENRICHED_FILE_PATH = '../data/processed/futures_contracts/contract_open_enriched_with_flat_open.csv'

In [79]:
def calculate_num_unique_dates_for_contract(contract_df: pd.DataFrame, symbol: str):
    missing_open_for_contract_df = contract_df[contract_df['Symbol'] == symbol]
    missing_open_dates_series_unique = missing_open_for_contract_df['Date'].unique()
    num_missing_open_dates_series_unique = len(missing_open_dates_series_unique)
    return num_missing_open_dates_series_unique

In [80]:
def filter_rows_where_day_is_missing_open(contract_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Remove all rows where the day they are associated with is missing an actual open bar at
    the actual market open time
    '''
    unique_dates = contract_df['DateTime'].dt.date.unique()
    for a_date in unique_dates:
        only_this_days_rows_df = contract_df[contract_df['DateTime'].dt.date == a_date]
        has_actual_open_bar = 0 in set(
            only_this_days_rows_df['Open Minutes Offset'])
        if not has_actual_open_bar:  # This date is missing a real open bar
            # Remove all bars for this date
            contract_df = contract_df[contract_df['DateTime'].dt.date != a_date]
        # print(
        #     f"Date {a_date} is missing a real open so we are removing all its bars from the df")
    return contract_df

In [81]:
def filter_rows_where_day_contains_open(contract_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Remove all rows where the day they are associated with contains an actual open bar at
    the actual market open time
    '''
    unique_dates = contract_df['DateTime'].dt.date.unique()
    for a_date in unique_dates:
        only_this_days_rows_df = contract_df[contract_df['DateTime'].dt.date == a_date]
        has_actual_open_bar = 0 in set(
            only_this_days_rows_df['Open Minutes Offset'])
        if has_actual_open_bar:  # This date has a real open bar
            # Remove all bars for this date
            contract_df = contract_df[contract_df['DateTime'].dt.date != a_date]
        # print(
        #     f"Date {a_date} is missing a real open so we are removing all its bars from the df")
    return contract_df

In [82]:
contract_open_enriched_df = pd.read_csv(CONTRACT_OPEN_ENRICHED_FILE_PATH, parse_dates=['DateTime'])

In [83]:
contract_open_enriched_df['Date'] = contract_open_enriched_df['DateTime'].apply(
        lambda x: x.strftime("%Y-%m-%d")
    )

In [84]:
unique_symbols = contract_open_enriched_df.Symbol.unique()

Iterate over our contract symbols. For each one we get the rows associated with the days that are missing a real open and the days that have a real open. This is used to populate the rows_with_missing_open_df and rows_with_populated_open_df dataframes respectively.

In [85]:
rows_with_missing_open_df = pd.DataFrame()
rows_with_populated_open_df = pd.DataFrame()
for i in trange(len(unique_symbols), desc="Populating dataframes with rows missing and containing open respectively"):
  symbol = unique_symbols[i]
  rows_for_one_contract_df = contract_open_enriched_df[contract_open_enriched_df['Symbol'] == symbol]
  rows_for_one_contract_missing_open_df = filter_rows_where_day_contains_open(rows_for_one_contract_df)
  rows_for_one_contract_containing_open_df = filter_rows_where_day_is_missing_open(rows_for_one_contract_df)
  rows_with_missing_open_df = pd.concat([rows_with_missing_open_df, rows_for_one_contract_missing_open_df], ignore_index=True)
  rows_with_populated_open_df = pd.concat([rows_with_populated_open_df, rows_for_one_contract_containing_open_df], ignore_index=True)


Populating dataframes with rows missing and containing open respectively:   0%|          | 0/77 [00:00<?, ?it/…

In [86]:
open_bar_by_contract_count_df = pd.DataFrame(columns=['Symbol', 'Days Without Open Bar', 'Days With Open Bar'])

Iterate over our contract symbols. For each one calculate the number of unique dates that have an open bar and that do not. Add this information to the open_bar_by_contract_count_df dataframe

In [87]:
for symbol in unique_symbols:
    num_unique_dates_with_missing_open = calculate_num_unique_dates_for_contract(
        contract_df=rows_with_missing_open_df, symbol=symbol
    )
    num_unique_dates_with_open = calculate_num_unique_dates_for_contract(
        contract_df=rows_with_populated_open_df, symbol=symbol
    )
    open_bar_by_contract_count_df = open_bar_by_contract_count_df.append({
        'Symbol': symbol,
        'Days Without Open Bar': num_unique_dates_with_missing_open,
        'Days With Open Bar': num_unique_dates_with_open
    }, ignore_index=True)
    # print(num_unique_dates_with_missing_open)
    
    

Initialize a dataframe that will be used to count the number of times we encounter trading days with missing and populated open bars respectively

In [88]:
days_to_expiration_open_bar_count_df = pd.DataFrame(data={
  'DTE': pd.Series(range(0,601)),
  'Days Without Open Bar': 0,
  'Days With Open Bar': 0
})

In [89]:
for i in trange(len(unique_symbols), desc="Calculating missing open by DTE for each contract"):
  symbol = unique_symbols[i]
  missing_open_for_contract_df = rows_with_missing_open_df[rows_with_missing_open_df['Symbol'] == symbol]
  populated_open_for_contract_df = rows_with_populated_open_df[rows_with_populated_open_df['Symbol'] == symbol]
  dte_values_with_missing_open_bar = missing_open_for_contract_df['DTE'].unique()
  dte_values_with_populated_open_bar = populated_open_for_contract_df['DTE'].unique()
  # Iterate over each unique dte value thats associated with a missing open bar and increment the count in the dataframe we will be using to plot
  for missing_dte_value in dte_values_with_missing_open_bar:
    index_of_row_to_increment = days_to_expiration_open_bar_count_df.index[days_to_expiration_open_bar_count_df['DTE'] == missing_dte_value]
    current_value = days_to_expiration_open_bar_count_df.iloc[index_of_row_to_increment]['Days Without Open Bar']
    days_to_expiration_open_bar_count_df.at[index_of_row_to_increment, 'Days Without Open Bar'] = current_value + 1
  # Iterate over each unique dte value thats associated with a populated open bar and increment the count in the dataframe we will be using to plot
  for populated_dte_value in dte_values_with_populated_open_bar:
    index_of_row_to_increment = days_to_expiration_open_bar_count_df.index[days_to_expiration_open_bar_count_df['DTE'] == populated_dte_value]
    current_value = days_to_expiration_open_bar_count_df.iloc[index_of_row_to_increment]['Days With Open Bar']
    days_to_expiration_open_bar_count_df.at[index_of_row_to_increment, 'Days With Open Bar'] = current_value + 1

Calculating missing open by DTE for each contract:   0%|          | 0/77 [00:00<?, ?it/s]

In [90]:
fig = go.Figure(data=[
    go.Bar(name='Days Without Open Bar', x=unique_symbols, y=open_bar_by_contract_count_df['Days Without Open Bar']),
    go.Bar(name='Days With Open Bar', x=unique_symbols, y=open_bar_by_contract_count_df['Days With Open Bar'])
])
# Change the bar mode
fig.update_layout(barmode='group', title_text='Number of days With and Without An Open Bar By Contract - LE Only')
fig.show()

In [91]:
unique_dte = list(days_to_expiration_open_bar_count_df['DTE'])
open_bar_by_contract_count_df['Days Without Open Bar']

0     107
1      92
2      94
3     105
4     106
     ... 
72    113
73     27
74     63
75     57
76     47
Name: Days Without Open Bar, Length: 77, dtype: object

In [92]:
days_to_expiration_open_bar_count_df['Total Days'] = days_to_expiration_open_bar_count_df['Days With Open Bar'] + days_to_expiration_open_bar_count_df['Days Without Open Bar']
days_to_expiration_open_bar_count_df['Percentage Missing Open Bar'] = (days_to_expiration_open_bar_count_df['Days Without Open Bar'] / days_to_expiration_open_bar_count_df['Total Days']) * 100

In [93]:
dte_without_open_fig = go.Figure(data=[
    go.Bar(name='Days Without Open Bar', x=unique_dte, y=days_to_expiration_open_bar_count_df['Days Without Open Bar'])
])
dte_without_open_fig.update_layout(title_text='Number of trading days without An Open Bar By DTE - LE Only')
dte_percentage_without_open_fig = go.Figure(data=[
    go.Bar(name='Percentage Missing Open Bar', x=unique_dte, y=days_to_expiration_open_bar_count_df['Percentage Missing Open Bar'])
])
dte_percentage_without_open_fig.update_layout(title_text='Percentage of trading days without An Open Bar By DTE - LE Only')
dte_percentage_without_open_fig.show()
dte_without_open_fig.show()