In [13]:
import pandas as pd
import sqlite3
import glob
import datetime

In [130]:
# Load every pickled options chunk matching '0-2*.pkl' from the working directory,
# tag each row with the file it came from, and stack the chunks into one frame.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
# The paths are sorted because glob returns them in arbitrary (filesystem) order,
# which would otherwise make the row order of options_df differ between runs.
pkl_files = sorted(glob.glob('0-2*.pkl'))
if not pkl_files:
    # Fail here with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError("No pickle files matching '0-2*.pkl' found in the working directory")
dfs = []
for pkl_path in pkl_files:
    chunk_df = pd.read_pickle(pkl_path)
    chunk_df['file_source'] = pkl_path  # keep provenance so rows can be traced back to a file
    dfs.append(chunk_df)
options_df = pd.concat(dfs, ignore_index=True)
# Parse 'expiry' to datetimes so later cells can do date arithmetic on it.
options_df['expiry'] = pd.to_datetime(options_df['expiry'])


In [133]:
# Print one line per column with the number of missing values in that column.
na_totals = ((name, options_df[name].isna().sum()) for name in options_df.columns)
for name, missing in na_totals:
    print(f'NA for {name} total is {missing}')

NA for volume_options total is 0
NA for volume_weighted_options total is 0
NA for open_options total is 0
NA for close_options total is 0
NA for high_options total is 0
NA for low_options total is 0
NA for timestamp_options total is 0
NA for number of trades_options total is 0
NA for ticker total is 0
NA for full_name total is 0
NA for type total is 0
NA for strike total is 0
NA for expiry total is 0
NA for equity_start_price total is 0
NA for time_converted total is 0
NA for open_equity total is 1898
NA for close_equity total is 1898
NA for high_equity total is 1898
NA for low_equity total is 1898
NA for timestamp_equity total is 1898
NA for number of trades_equity total is 1898
NA for _merge total is 0
NA for equity_pct_change total is 1898
NA for options_pct_change total is 0
NA for options_earliest_open total is 0
NA for file_source total is 0
NA for equity_pct_change_normalized total is 1898
NA for day_classification total is 0
NA for DTE total is 0
NA for day_name total is 0
NA for DTE_adjusted total is 0

In [134]:
# NOTE(review): `sqldf` is not imported in any visible cell (it looks like pandasql's
# helper -- confirm). Calling `mysqldf` raises NameError until that import is added.
mysqldf = lambda q: sqldf(q, globals())

# Ensure the option move is numeric, and turn the equity ratio into a signed
# percentage: (ratio - 1) * 100, so 1.0 -> 0.0.
options_df['options_pct_change'] = options_df['options_pct_change'].astype(float)
options_df['equity_pct_change_normalized'] = (options_df['equity_pct_change']-1)*100

# day classification for ranges. below range boundaries are in absolute pct. Ex: 1 - 1%
very_high_range = 2
high_range = 1
average_high_range = 0.8
average = 0.5

# Largest absolute equity move seen on each calendar day, broadcast back onto every
# row of that day. One grouped pass replaces rebuilding a full-length mask per day.
_trade_day = options_df['time_converted'].dt.normalize()
_max_pct_daily = (options_df['equity_pct_change_normalized']
                  .abs()
                  .groupby(_trade_day)
                  .transform('max'))

# Start every row in the lowest bucket, then promote in ascending threshold order so
# the highest threshold a day exceeds wins. Comparisons against NaN are False, so days
# with no equity data at all stay 'very_low' (same outcome as the previous if/elif chain).
_day_class = pd.Series('very_low', index=options_df.index, dtype=object)
for _label, _lower_bound in (('average', average),
                             ('average_high', average_high_range),
                             ('high', high_range),
                             ('very_high', very_high_range)):
    _day_class.loc[_max_pct_daily > _lower_bound] = _label
options_df['day_classification'] = _day_class

# DTE: whole calendar days from the bar timestamp to 16:00 on the expiry date
# (presumably the market close -- confirm). `.dt.days` truncates the timedelta to days.
options_df['DTE'] = (pd.to_datetime(options_df['expiry'])
                     + pd.to_timedelta('16:00:00')
                     - pd.to_datetime(options_df['time_converted'])).dt.days

# also add day of week for potential future analysis
options_df['day_name'] = options_df['time_converted'].dt.day_name()


# Because DTE counts calendar days, expiries that span a weekend show up as 3 or 4 DTE.
# For now, create a new column DTE_adjusted where DTE 3 becomes DTE 1 (Fri => Mon)
# and DTE 4 becomes DTE 2 (Thu => Mon / Fri => Tue).
# Small UDF for it:
def _dte_adjusted_func(DTE):
    if DTE == 3: return 1
    elif DTE == 4: return 2
    else: return DTE
# Element-wise remap of DTE through the helper above (3 -> 1, 4 -> 2, otherwise unchanged).
options_df['DTE_adjusted'] = options_df['DTE'].map(_dte_adjusted_func)

In [136]:
# Inspect the rows of the June-Aug 2023 file whose low_equity is missing:
# which times of day and which strikes they cover.
_missing_equity = options_df.query('file_source == "0-2DTE_spy_options_01June23-30Aug23.pkl"  & low_equity.isna()')
print(_missing_equity['time_converted'].dt.time.unique())
print(_missing_equity['strike'].unique())

options_df.shape

[datetime.time(14, 0) datetime.time(14, 15) datetime.time(14, 30)
 datetime.time(15, 15) datetime.time(15, 45) datetime.time(12, 30)
 datetime.time(12, 45) datetime.time(13, 0) datetime.time(13, 15)
 datetime.time(13, 45) datetime.time(15, 30) datetime.time(14, 45)
 datetime.time(13, 30) datetime.time(15, 0)]
[436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453
 454 455 456 457 458 459 460 461 462 463]


(718715, 31)

In [126]:
#misc checks
_dt = datetime.datetime(2023,8,30,9,45)
options_df[options_df['time_converted'] == _dt].head(3)

Unnamed: 0,volume_options,volume_weighted_options,open_options,close_options,high_options,low_options,timestamp_options,number of trades_options,ticker,full_name,...,_merge,equity_pct_change,options_pct_change,options_earliest_open,equity_pct_change_normalized,day_classification,file_source,DTE,day_name,DTE_adjusted
450212,104,0.0176,0.01,0.01,0.02,0.01,1693403100000,13,SPY,O:SPY230830P00436000,...,both,1.0,1.0,0.01,0.0,very_low,0-2DTE_spy_options_01June23-30Aug23.pkl,0,Wednesday,0
450230,3,14.01,14.0,14.03,14.03,14.0,1693403100000,2,SPY,O:SPY230830C00437000,...,both,1.0,1.0,14.0,0.0,very_low,0-2DTE_spy_options_01June23-30Aug23.pkl,0,Wednesday,0
450239,3,0.0133,0.01,0.02,0.02,0.01,1693403100000,3,SPY,O:SPY230830P00437000,...,both,1.0,1.0,0.01,0.0,very_low,0-2DTE_spy_options_01June23-30Aug23.pkl,0,Wednesday,0


In [135]:
# Write the enriched frame to table 'data' in the local SQLite file, replacing any
# existing table of that name. The connection is left open, as before.
conn = sqlite3.connect(database='optionsQuotes1.db')
c = conn.cursor()  # not used by the write below
options_df.to_sql('data', conn, if_exists='replace')
conn.commit()

In [137]:
# Also save the enriched frame as a parquet file in the working directory.
options_df.to_parquet(path='optionsDB.parquet')

In [141]:
# Show the calendar-date part of each bar timestamp.
options_df['time_converted'].dt.date

datetime.date(2023, 1, 3)