In [None]:
#imports
import datetime
import io
import os
import pickle
import random
import sys
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import requests
import yfinance as yf
from pandas.plotting import register_matplotlib_converters
# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()
# Remove pandas' display truncation: None disables the row, column and
# column-width limits, so frames are shown in full.
pd.set_option("display.max_rows", None, "display.max_columns", None)
pd.set_option('display.max_colwidth', None)

## Run the following cell to collect data again (warning: this will overwrite the pickle file)
### Make sure to set the start date and pickle file name

In [None]:
# Settings for scraping the daily market statistics pages from the CBOE website.
# Used for dates after 10-4-2019 (mm-dd-yyyy).
# base_url ends in "dt="; the scraping loop appends the date formatted as yyyy-mm-dd.
base_url = "https://markets.cboe.com/us/options/market_statistics/daily/?mkt=cone&dt="
# Pickle file the scraped tables are written to, and later read back from.
pickle_file = "scraped_data.pkl"

def get(url, timeout=30):
    """Fetch `url` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on a stalled
            connection and hang the whole scraping run.

    Returns:
        The response text when the request completes with an OK status.
        None when the request raises a RequestException (including a
        timeout) or the response status is not OK.
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        return None
    return resp.text if resp.ok else None
    
"""This part is to scrape from CBOE website and save the file"""
# Collect the parsed tables of each scraped page, keyed by the page's date.
scrapeDict = {}
weekdays = []
start_date = datetime.date(2019, 10, 5)
end_date = datetime.date.today()
delta = datetime.timedelta(days=1)

# Candidate days: every Mon-Fri from start_date up to, but not including, today.
run_date = start_date
while run_date < end_date:
    if run_date.weekday() < 5:  # weekday() is 0-4 for Mon-Fri, 5/6 for Sat/Sun
        weekdays.append(run_date)
    run_date += delta
print(len(weekdays))

print('Running:')
# NOTE (from the original author): this loop was observed to "loop around"; the cause is unknown.
for get_date in weekdays:
    html_date = get_date.strftime('%Y-%m-%d')  # the URL takes the date as yyyy-mm-dd
    print(get_date, end='|')
    data = get(base_url + html_date)
    if data is None:
        print("none data")
        continue
    try:
        # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
        scrapeDict[get_date] = pd.read_html(io.StringIO(data))
    except ValueError:
        # read_html raises ValueError when the page contains no tables;
        # skip that day instead of aborting the whole run.
        print("no tables")
        continue

# Public holidays fall on weekdays, so the loop above still requests them, but the
# data scraped for those dates is incorrect. They are listed by hand and removed here.
# The list has to be extended manually for holidays after the last entry.
opt_out_days = ['2019-11-28', '2019-12-25', '2020-01-01', '2020-01-20', '2020-02-17', '2020-04-10',
                '2020-05-25', '2020-07-03', '2020-09-07', "2020-11-26", "2020-12-25", '2021-01-01', '2021-01-18', '2021-02-15', '2021-04-02',
                '2021-05-31', '2021-07-05', '2021-09-06', "2021-11-25", "2021-12-24", "2022-01-17"]
print(len(scrapeDict.keys()))
for i in opt_out_days:
    # pop() with a default ignores holidays that were never scraped.
    scrapeDict.pop(datetime.datetime.strptime(i, '%Y-%m-%d').date(), None)
print('Adjusted for opt-out-days:', len(scrapeDict.keys()))

# Save to a pickle; the with-block closes the file even if dump() raises.
with open(pickle_file, "wb") as outfile:
    pickle.dump(scrapeDict, outfile)
print('Done!')

## Run the following cell to load data from the saved pickle file
### It then formats the data to match the layout of totalpc.csv

In [None]:
# Load the scraped tables back from the pickle file.
# CAUTION: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open(pickle_file, "rb") as infile:
    df = pd.DataFrame(pickle.load(infile))
print(df.shape)
print('Done!')
# now remove un-needed data from the dataframe
# format: DATE(mm/dd/yyyy) | CALLS | PUTS | TOTAL | P/C Ratio
# After the transpose there is one row per scraped date and one column per table on the page.
df = df.transpose()
print(df.shape)
# Only the first two tables of each page are used below.
df = df.drop(columns=range(2, 9))
print(df.shape)
# The row labels are the scrape dates; convert them into a datetime index named 'date'.
df.index = pd.to_datetime(df.index).rename('date')
# Pull the scalar values out of the nested tables by position (.iloc[row, col]),
# so the lookup does not depend on how the tables' columns happen to be labelled.
df['calls'] = df.apply(lambda row: row.iloc[1].iloc[1, 1], axis=1)
df['puts'] = df.apply(lambda row: row.iloc[1].iloc[1, 2], axis=1)
df['total'] = df.apply(lambda row: row.iloc[1].iloc[1, 3], axis=1)
df['p_c_ratio'] = df.apply(lambda row: row.iloc[0].iloc[0, 1], axis=1)
# The nested tables are no longer needed once the values are extracted.
df = df.drop(columns=[0, 1])

In [None]:
# Preview the first rows; a bare last expression gets the notebook's rich table display.
df.head()

## The following cell loads totalpc_archive.csv
### It then appends the scraped data; the cell after it saves the result as totalpc.csv

In [None]:
# Load the archived put/call history and give it the same column names as df.
total_df = pd.read_csv("totalpc_archive.csv")
total_df.columns = ['date', 'calls', 'puts', 'total', 'p_c_ratio']
# Drop the rows labelled 0 and 1 (presumably non-data preamble rows in the archive file - TODO confirm).
total_df = (
    total_df
    .drop(index=[0, 1])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)
# sanity checks
print(df.shape)
print(total_df.shape)
# Stack the archive on top of the scraped data. verify_integrity=True makes
# concat raise a ValueError if any date appears in both frames.
final_df = pd.concat([total_df, df], axis=0, verify_integrity=True, sort=True)
print(final_df.shape)
# cleanup: drop every row that has a missing value
final_df = final_df.dropna(axis=0)
print(final_df.shape)
print(final_df.head())

In [None]:
# Write final_df to totalpc.csv; to_csv includes the index by default.
final_df.to_csv("totalpc.csv")