In [9]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Working directories, all located below the directory the notebook runs in:
#   raw/            input data
#   raw/cnsfails/   destination of the extracted SEC archives
#   generated/      files written by this notebook
input_folder = os.path.join(os.getcwd(), 'raw')
cns_folder = os.path.join(input_folder, 'cnsfails')
output_folder = os.path.join(os.getcwd(), 'generated')

# makedirs(exist_ok=True) creates missing parents (so input_folder comes for
# free with cns_folder) and does not fail when the directory already exists,
# which removes the racy "check, then create" sequence.
os.makedirs(cns_folder, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)

In [10]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

# Month range to download; both the start and the end month are included.
start_year, start_month = 2004, 1
end_year, end_month = 2021, 10

def download_and_unzip(url, target_folder=None):
    """Download the zip archive at ``url`` and extract all of its members.

    Args:
        url: Location of the zip archive; anything ``urlopen`` accepts.
        target_folder: Directory to extract into. When omitted, the
            module-level ``cns_folder`` is used.

    Raises:
        urllib.error.URLError: If the archive cannot be fetched.
        zipfile.BadZipFile: If the downloaded payload is not a zip archive.
    """
    if target_folder is None:
        target_folder = cns_folder

    # Context managers make sure the connection and the archive handle are
    # released even when reading or extracting fails.
    with urlopen(url) as resp:
        payload = resp.read()
    with ZipFile(BytesIO(payload)) as archive:
        archive.extractall(path=target_folder)

def download_if_necessary(url, downloaded_files):
    """Fetch and extract ``url`` unless it is already listed in ``downloaded_files``.

    After a download the URL is appended to ``downloaded_files`` in place, so
    the caller's list doubles as the record of what has been fetched.
    Nothing is returned.
    """
    already_fetched = url in downloaded_files
    if already_fetched:
        return

    print("Download " + url)
    download_and_unzip(url)
    downloaded_files.append(url)

# The cache file lists one already-downloaded URL per line; a missing file
# simply means nothing has been downloaded yet.
downloaded_files_path = f"{cns_folder}/downloaded.txt"
downloaded_files = []
if os.path.exists(downloaded_files_path):
    with open(downloaded_files_path, 'r') as f:
        # Iterating the handle yields the same lines as readlines();
        # strip() drops the trailing newline and surrounding whitespace.
        downloaded_files = [entry.strip() for entry in f]

# Download all files in the time range that have not yet been downloaded.
# The URL scheme changed several times over the years:
# Only first entry  https://www.sec.gov/files/data/fails-deliver-data/cnsp_sec_fails_2004q1.zip
# Until 2009-06     https://www.sec.gov/files/data/frequently-requested-foia-document-fails-deliver-data/cnsp_sec_fails_2004q2.zip
# Until 2017-06a    https://www.sec.gov/files/data/frequently-requested-foia-document-fails-deliver-data/cnsfails201706a.zip
# Starting 2017-06b https://www.sec.gov/files/data/fails-deliver-data/cnsfails201706b.zip
# Between 2020-02 and 2020-04     https://www.sec.gov/files/node/add/data_distribution/cnsfails202002a.zip
base_url_long = "https://www.sec.gov/files/data/frequently-requested-foia-document-fails-deliver-data/"
base_url_short = "https://www.sec.gov/files/data/fails-deliver-data/"
base_url_data = "https://www.sec.gov/files/node/add/data_distribution/"
try:
    for year in range(start_year, end_year+1):
        start = start_month if (year == start_year) else 1
        end = end_month if (year == end_year) else 12
        for month in range(start, end+1):
            # Up to Q2 2009 the files were quarter chunks. Every month is
            # mapped to the archive of its quarter so that a range ending in
            # the middle of a quarter is still covered; download_if_necessary
            # skips the archive for the remaining months of that quarter.
            if(year < 2009 or (year == 2009 and month <= 6)):
                quarter = (month - 1) // 3 + 1
                # The very first archive (2004 Q1) is the only quarterly one
                # stored below the short base URL.
                quarter_base = base_url_short if (year == 2004 and quarter == 1) else base_url_long
                download_if_necessary(f"{quarter_base}cnsp_sec_fails_{year}q{quarter}.zip", downloaded_files)
                continue

            # Special mixed case for 2017-06: the two halves use different base URLs
            if(year == 2017 and month == 6):
                download_if_necessary(f"{base_url_long}cnsfails{year}{month:02d}a.zip", downloaded_files)
                download_if_necessary(f"{base_url_short}cnsfails{year}{month:02d}b.zip", downloaded_files)
                continue

            # Special case 2019-10: the first half carries an "_0" suffix
            if(year == 2019 and month == 10):
                download_if_necessary(f"{base_url_short}cnsfails{year}{month:02d}a_0.zip", downloaded_files)
                download_if_necessary(f"{base_url_short}cnsfails{year}{month:02d}b.zip", downloaded_files)
                continue

            # Regular case: two archives per month, "a" and "b"
            base_url = base_url_long if(year < 2017 or (year == 2017 and month < 6)) else base_url_short
            # Special case 2020-02 - 2020-04
            if(year == 2020 and 2 <= month <= 4):
                base_url = base_url_data

            download_if_necessary(f"{base_url}cnsfails{year}{month:02d}a.zip", downloaded_files)
            download_if_necessary(f"{base_url}cnsfails{year}{month:02d}b.zip", downloaded_files)
finally:
    # Persist the cache even when a download fails part-way, so archives that
    # were already fetched in this run are not downloaded again next time.
    with open(downloaded_files_path, 'w') as f:
        f.write("\n".join(map(str, downloaded_files)))

print(f"{len(downloaded_files)} files downloaded or already found in the cache")

318 files downloaded or already found in the cache


In [None]:
def read_csv(folder, name):
    """Load every pipe-separated ``*{name}*.txt`` file below ``folder`` into one DataFrame.

    Args:
        folder: Directory that is searched recursively.
        name: Substring the file name has to contain.

    Returns:
        A DataFrame holding the rows of all matching files, stacked in sorted
        path order under a fresh 0..n-1 index.

    Raises:
        ValueError: If no matching file is found.
    """
    # rglob yields paths in an arbitrary, file-system dependent order;
    # sorting makes the row order of the result reproducible.
    paths = sorted(Path(folder).rglob(f'*{name}*.txt'))
    if not paths:
        # pd.concat([]) would raise a ValueError as well, but without saying
        # which files were expected where.
        raise ValueError(f"No files matching '*{name}*.txt' found below {folder}")

    frames = []
    for path in paths:
        print(f"Read csv {path}")
        frames.append(pd.read_csv(path, index_col=None, header=0, sep='|', encoding="ISO-8859-1"))

    return pd.concat(frames, axis=0, ignore_index=True)

# Load every extracted 'cnsfails' text file below cns_folder into one DataFrame.
df = read_csv(cns_folder, 'cnsfails')
# NOTE(review): a notebook cell only displays its last expression, so this
# preview is computed but never shown; move it to its own cell or wrap it in
# display(...) if it is meant to be seen.
df.head()
# Number of non-null values per column.
df.count()

Read csv c:\Data\Documents\Python\gme\raw\cnsfails202107a.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202107b.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202108a.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202108b.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202109a.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202109b.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202110a.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202110b.txt


SETTLEMENT DATE     478860
CUSIP               478844
SYMBOL              478844
QUANTITY (FAILS)    478844
DESCRIPTION         478844
PRICE               478844
dtype: int64

In [None]:
def store_csv(df, filename, output_folder):
    """Write ``df`` as a comma-separated file named ``filename`` into ``output_folder``.

    The index is not written, and ``.`` is used as the decimal mark.
    """
    df.to_csv(
        os.path.join(output_folder, filename),
        sep=',',
        decimal='.',
        index=False,
    )

# The export is named after the month range it covers.
export_name = f"cnsfails-{start_year}{start_month:02d}-{end_year}{end_month:02d}.csv"
store_csv(df, export_name, output_folder)