# Import dependencies, get CAT data

In [None]:
# Run this code as is

# Imports
import datetime as dt
import requests, pdfplumber, io, warnings
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import numpy as np

# Download all CAT pdfs, compile in DataFrame
#
# Every calendar date from 2020 onwards is tried as a possible report date;
# each report that exists is downloaded, two of its pages are parsed into
# tables, and everything is combined into `raw_CAT_all`, a DataFrame indexed
# by a continuous daily date range.
current_year = dt.datetime.now().year
years = list(range(2020, current_year + 1))
months = [f"{i:02}" for i in range(1, 13)]
days = [f"{i:02}" for i in range(1, 32)]

pre_link = "https://catnmsplan.com//sites//default//files//"
post_link = "-Monthly-CAT-Update.pdf"

raw_CAT = []

bad_date_urls = ['https://catnmsplan.com//sites//default//files//2022-02//02.16.22-Monthly-CAT-Update.pdf',
                 'https://catnmsplan.com//sites//default//files//2022-03//03.16.22-Monthly-CAT-Update.pdf',
                 'https://catnmsplan.com//sites//default//files//2022-04//04.20.22-Monthly-CAT-Update.pdf',
                 'https://catnmsplan.com//sites//default//files//2022-09//09.27.22-Monthly-CAT-Update.pdf',
                 'https://catnmsplan.com//sites//default//files//2023-08//08.17.23-Monthly-CAT-Update.pdf']

header = ["Trade Date", "Processed", "Accepted", "Late", "Overall Errors Count"]

# Seconds to wait for the server before giving up on a single request.
request_timeout = 30


def _is_candidate_date(year, month, day):
    """Return True if year/month/day is a real calendar date that is not in the future."""
    try:
        candidate = dt.date(int(year), int(month), int(day))
    except ValueError:
        # e.g. February 30th - no report can carry this date.
        return False
    return candidate <= dt.date.today()


def _candidate_urls():
    """Yield the report URL for every plausible date, skipping the known-bad ones."""
    for year in years:
        for month in months:
            for day in days:
                if not _is_candidate_date(year, month, day):
                    continue
                url = pre_link + str(year) + "-" + str(month) + "//" + str(month) + "." + str(day) + "." + str(year)[-2:] + post_link
                # exclude bad dates - formatting is wrong for some reason
                if url not in bad_date_urls:
                    yield url


def _to_numeric_if_possible(column):
    """Convert a column to numbers, leaving it untouched when that is not possible."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def _parse_report_page(text):
    """Turn the extracted text of one report page into a DataFrame with `header` columns."""
    lines = text.strip().split('\n')
    # Skip the first three lines and the last line of the page
    # (presumably titles / column headings / footer - verify against a PDF).
    data = [line.split() for line in lines[2:]]
    data = data[1:-1]
    df = pd.DataFrame(data, columns=header)
    # Strip thousands separators so the counts can be parsed as numbers.
    df = df.replace({',': ''}, regex=True)
    return df.apply(_to_numeric_if_possible)


def _fetch_report_tables(session, url):
    """Download one report and return its parsed tables ([] if it is unavailable)."""
    try:
        response = session.get(url, allow_redirects=False, timeout=request_timeout)
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        return []
    if response.status_code != 200:
        # No report was published under this date.
        return []
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        # The slice selects the fourth- and third-from-last pages (two pages).
        texts = [page.extract_text() or "" for page in pdf.pages[-4:-2]]
    return [_parse_report_page(text) for text in texts if text.strip()]


# Silence FutureWarnings only while this cell runs; the previous warning
# filters are restored automatically when the `with` block exits.
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)

    # One session reuses the connection across the many requests.
    with requests.Session() as session:
        for url in _candidate_urls():
            raw_CAT.extend(_fetch_report_tables(session, url))

    if not raw_CAT:
        raise ValueError("No CAT reports could be downloaded; check the network connection and the URL pattern.")

    raw_CAT_all = pd.concat(raw_CAT, ignore_index=True)
    raw_CAT_all['Date'] = pd.to_datetime(raw_CAT_all['Trade Date'])
    raw_CAT_all.set_index('Date', inplace=True)
    raw_CAT_all.drop('Trade Date', axis=1, inplace=True)
    # If the same trade date appears in more than one report, keep the row
    # from the later report; reindex() cannot handle duplicate labels.
    raw_CAT_all = raw_CAT_all[~raw_CAT_all.index.duplicated(keep='last')]
    # Expand to every calendar day in the covered span (days without data become NaN).
    full_range = pd.date_range(start=raw_CAT_all.index.min(), end=raw_CAT_all.index.max())
    raw_CAT_all = raw_CAT_all.reindex(full_range)

# Choose stock ticker for comparison

In [None]:
# Choose stock ticker, then run code
#
# Downloads the ticker's price history over the span covered by raw_CAT_all
# and keeps the daily high as `high_prices`.

###
symbol = "GME"
###

start_date = raw_CAT_all.index[0].strftime('%Y-%m-%d')
end_date = raw_CAT_all.index[-1].strftime('%Y-%m-%d')
dates = raw_CAT_all.index

# yfinance treats `end` as exclusive, so ask for one extra day to make sure
# the last CAT date itself is included in the download.
download_end = (raw_CAT_all.index[-1] + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
stock_data = yf.download(symbol, start=start_date, end=download_end, progress=False)

# Keep only the rows whose date falls inside the CAT date range.
filtered_data = stock_data.loc[stock_data.index.isin(dates)]
high_prices = filtered_data['High']
# yfinance may return (price, ticker) MultiIndex columns, in which case
# ['High'] is a one-column DataFrame; reduce it to a Series for the later cells.
if isinstance(high_prices, pd.DataFrame):
    high_prices = high_prices.iloc[:, 0]


# Plot comparison

In [None]:
# Run this code as is

# Overlay the two series on one figure: CAT error counts on the left axis
# (magenta) and the ticker's daily high on the right axis (cyan).
fig, ax1 = plt.subplots()
ax1.set_ylabel("Overall Errors Count (in billions)")
(line1,) = ax1.plot(raw_CAT_all['Overall Errors Count'], "m-", label="Errors")

# Second y-axis sharing the same date axis.
ax2 = ax1.twinx()
ax2.set_ylabel(f"{symbol} Daily High ($)")
(line2,) = ax2.plot(high_prices, "c-", label=f"{symbol} Daily High")

# Tilt the date labels.
plt.setp(ax1.get_xticklabels(), rotation=45)

# Single legend covering the lines from both axes.
lines = [line1, line2]
labels = [handle.get_label() for handle in lines]
ax1.legend(lines, labels, loc='upper left')

plt.title(f"CAT Equity Errors and {symbol} Daily High")
plt.show()

# Plot cross-correlation (lags of +/- 365 trading days)

In [None]:
# Run this code as is (or change max lag)

# Cross-correlation and plot
#
# For every lag in [-max_lag, max_lag] the price series is shifted by that
# many rows and correlated with the CAT error counts on the dates both have.
# shift() moves values by rows of high_prices, so one lag step is one row of
# the price series, not one calendar day. A positive lag pairs each error
# count with an earlier price; a negative lag pairs it with a later price.
common_dates = raw_CAT_all.index.intersection(high_prices.index)
filtered_raw_CAT_all = raw_CAT_all.loc[common_dates]
error_counts = filtered_raw_CAT_all['Overall Errors Count']

max_lag = 365
lags = np.arange(-max_lag, max_lag + 1)


def _corr_at_lag(shift_rows):
    """Correlation of the error counts with the prices shifted by `shift_rows` rows (NaN if no overlap)."""
    valid_pairs = pd.concat([error_counts, high_prices.shift(shift_rows)], axis=1).dropna()
    if valid_pairs.empty:
        return np.nan
    return valid_pairs.iloc[:, 0].corr(valid_pairs.iloc[:, 1])


cross_corr = pd.Series([_corr_at_lag(shift_rows) for shift_rows in lags], index=lags)

# Lags to annotate, strongest absolute correlation first. Despite the name
# this currently holds a single lag; raise the nlargest() argument for more.
top_3_indices = cross_corr.abs().nlargest(1).index

# Plot
plt.plot(lags, cross_corr)
plt.xlabel('Lag (days)')
plt.ylabel('Cross-Correlation')
# Use the selected ticker rather than a fixed name so the title stays correct.
plt.title(f'Cross-Correlation between CAT Errors and {symbol} Daily High')

ylim = plt.ylim()

for index in top_3_indices:
    x_pos = index
    y_pos = cross_corr[index]

    # Place the label slightly beyond the peak (away from zero), but pull it
    # back inside the axes if that would leave the plot area.
    y_text = y_pos + 0.05 * np.sign(y_pos)
    if y_text > ylim[1]:
        y_text = ylim[1] - 0.1
    elif y_text < ylim[0]:
        y_text = ylim[0] + 0.1

    plt.annotate(f'Lag: {index}',
                 xy=(x_pos, y_pos),
                 xytext=(x_pos, y_text),
                 arrowprops=dict(facecolor='black', arrowstyle='->'),
                 ha='center',
                 fontsize=8,
                 bbox=dict(boxstyle="round,pad=0.3", edgecolor='black', facecolor='white'))

plt.show()

# Plot comparison with lag

In [None]:
# Choose lag (see cross-correlation), then run code
#
# Same overlay as the comparison plot, but with the price series shifted by
# `lag` rows so the two curves can be compared at the chosen offset.

###
lag = -140
###

fig, ax1 = plt.subplots()
line1, = ax1.plot(raw_CAT_all['Overall Errors Count'], "m-", label="Errors")
ax1.set_ylabel("Overall Errors Count (in billions)")

# Blank the date tick labels (presumably because the shifted price curve no
# longer lines up with the dates on the axis). With the labels blanked there
# is nothing left to rotate, so no rotation is applied here.
ax1.set_xticklabels([])

ax2 = ax1.twinx()
line2, = ax2.plot(high_prices.shift(lag), "c-", label=str(symbol)+" Daily High (Shifted)")
ax2.set_ylabel(str(symbol)+" Daily High ($)")

# Single legend covering the lines from both axes.
lines = [line1, line2]
labels = [line.get_label() for line in lines]
ax1.legend(lines, labels, loc='upper left')

# Report the lag actually used instead of a fixed number.
plt.title(f"CAT Equity Errors and {symbol} Daily High (Shifted {lag} Days)")
plt.show()

# View max CAT error dates, find future trading dates

In [None]:
# Max errors
# Display the five largest values of the 'Overall Errors Count' column,
# each labelled with its date from the raw_CAT_all index.
raw_CAT_all["Overall Errors Count"].nlargest(5)

In [None]:
# Get future dates
#
# Prints the date that lies `days_ahead` business days after `start_date`.
# Business days here are Monday-Friday; market holidays are not excluded.
start_date = pd.Timestamp('2022-07-13')
days_ahead = 140

# Build exactly the business days needed, starting at start_date. A start
# date on a weekend rolls forward to the next business day instead of
# raising, and no fixed calendar window limits which dates can be used.
trading_days = pd.bdate_range(start=start_date, periods=days_ahead + 1)
target_date = trading_days[-1]
print(target_date)