In [8]:
# This notebook pulls raw data from https://www.sec.gov/data/foiadocsfailsdatahtm
# and sanitizes it. Afterwards the data is exported per year and as a single CSV into the sanitized subfolder.

import os
import shutil
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

def get_or_create_folder(baseDir, folderName):
    """Return the path ``baseDir/folderName``, creating the folder when needed.

    Missing parent folders are created as well, and an already existing
    folder is accepted silently, so the call is safe to repeat.

    Args:
        baseDir: Folder that should contain the new folder.
        folderName: Name of the folder to look up or create.

    Returns:
        The joined path as a string.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    path = os.path.join(baseDir, folderName)
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # "exists() + mkdir()" and also works when baseDir does not exist yet.
    os.makedirs(path, exist_ok=True)
    return path

# All data lives below the directory the notebook is started from
current_dir = os.getcwd()

# Raw downloads: <cwd>/raw/cnsfails
raw_folder = get_or_create_folder(current_dir, 'raw')
cns_folder_raw = get_or_create_folder(raw_folder, 'cnsfails')

# Sanitized output: <cwd>/sanitized/cnsfails-txt and <cwd>/sanitized/cnsfails-yearly
sanitized_folder = get_or_create_folder(current_dir, 'sanitized')
cns_folder_sanitized, csv_yearly = (
    get_or_create_folder(sanitized_folder, subfolder)
    for subfolder in ('cnsfails-txt', 'cnsfails-yearly')
)

In [9]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

# Inclusive (year, month) bounds of the publications to download
start_year, start_month = 2004, 3
end_year, end_month = 2021, 10

def download_and_unzip(url, target_folder=None):
    """Download the zip archive at ``url`` and extract all its members.

    The archive is read completely into memory before extraction.

    Args:
        url: Address of the zip archive, opened with ``urlopen``.
        target_folder: Folder to extract into. Defaults to the module-level
            ``cns_folder_raw`` when omitted.
    """
    if target_folder is None:
        # Resolved at call time so the default follows the notebook's raw folder
        target_folder = cns_folder_raw

    # Context managers make sure the connection and the archive are closed
    # even when reading or extracting fails.
    with urlopen(url) as resp:
        payload = resp.read()
    with ZipFile(BytesIO(payload)) as archive:
        archive.extractall(path=target_folder)

def download_if_necessary(url, downloaded_files):
    """Fetch and unpack ``url`` unless it is already listed in ``downloaded_files``.

    After a successful download the url is appended to ``downloaded_files``
    (the list is modified in place).
    """
    already_fetched = url in downloaded_files
    if already_fetched:
        return

    print("Download " + url)
    download_and_unzip(url)
    downloaded_files.append(url)

# Load the URLs that were already downloaded by a previous run (one URL per line)
downloaded_files_path = f"{cns_folder_raw}/downloaded.txt"
downloaded_files = []
if(os.path.exists(downloaded_files_path)):
    with open(downloaded_files_path, 'r') as f:
        # Blank lines (e.g. a trailing newline after manual editing) are skipped,
        # otherwise they would end up as '' entries in the cache and its count
        downloaded_files = [line.strip() for line in f if line.strip()]

# Download all files in the time range that have not yet been downloaded.
# Sadly the naming schemas are quite scrambled:
# Only first entry  https://www.sec.gov/files/data/fails-deliver-data/cnsp_sec_fails_2004q1.zip
# Until 2009-06     https://www.sec.gov/files/data/frequently-requested-foia-document-fails-deliver-data/cnsp_sec_fails_2004q2.zip
# Until 2017-06a    https://www.sec.gov/files/data/frequently-requested-foia-document-fails-deliver-data/cnsfails201706a.zip
# Starting 2017-06b https://www.sec.gov/files/data/fails-deliver-data/cnsfails201706b.zip
# Between 2020-02 and 2020-04     https://www.sec.gov/files/node/add/data_distribution/cnsfails202002a.zip
base_url_long = "https://www.sec.gov/files/data/frequently-requested-foia-document-fails-deliver-data/"
base_url_short = "https://www.sec.gov/files/data/fails-deliver-data/"
base_url_data = "https://www.sec.gov/files/node/add/data_distribution/"
try:
    for year in range(start_year, end_year+1):
        # Only the first and the last year are partial
        start = start_month if (year == start_year) else 1
        end = end_month if (year == end_year) else 12
        for month in range(start, end+1):
            # First entry has a different naming
            if(year == 2004 and month <= 3):
                if(month == 3):
                    download_if_necessary(f"{base_url_short}cnsp_sec_fails_2004q1.zip", downloaded_files)
                continue

            # up to Q2 2009, the files were quarter chunks
            if(year < 2009 or (year == 2009 and month <= 6) ):
                # A quarterly file is requested once, in the last month of its quarter
                if(month % 3 == 0):
                    quarter = month // 3
                    download_if_necessary(f"{base_url_long}cnsp_sec_fails_{year}q{quarter}.zip", downloaded_files)
                continue

            # Special mixed case for 2017-06
            if(year == 2017 and month == 6):
                download_if_necessary(f"{base_url_long}cnsfails{year}{month:02d}a.zip", downloaded_files)
                download_if_necessary(f"{base_url_short}cnsfails{year}{month:02d}b.zip", downloaded_files)
                continue

            # Special case 2019-10
            if(year == 2019 and month == 10):
                download_if_necessary(f"{base_url_short}cnsfails{year}{month:02d}a_0.zip", downloaded_files)
                download_if_necessary(f"{base_url_short}cnsfails{year}{month:02d}b.zip", downloaded_files)
                continue

            base_url = base_url_long if(year < 2017 or (year == 2017 and month <6)) else base_url_short
            #Special case 2020-02 - 2020-04
            if(year == 2020 and (month >= 2 and month <= 4)):
                base_url = base_url_data

            # Regular case: two half-month files ("a" and "b") per month
            download_if_necessary(f"{base_url}cnsfails{year}{month:02d}a.zip", downloaded_files)
            download_if_necessary(f"{base_url}cnsfails{year}{month:02d}b.zip", downloaded_files)
finally:
    # Persist the cache even when a download raised, so that the files fetched
    # so far are not downloaded again on the next run
    with open(downloaded_files_path, 'w') as f:
        f.write("\n".join(map(str, downloaded_files)))

print(f"{len(downloaded_files)} files downloaded or already found in the cache")

318 files downloaded or already found in the cache


In [10]:

def duplicate_and_truncate_cnsfails(raw_folder, sanitized_folder, name, truncate_target):
    """Copy matching raw text files and cut off their trailer section.

    Every ``*{name}*.txt`` file found recursively below ``raw_folder`` is
    copied (flat, by base name) into ``sanitized_folder``. If
    ``truncate_target`` occurs within the last 100 bytes of the copy, the copy
    is truncated right before it. The raw files are never modified.

    Args:
        raw_folder: Folder that is searched recursively for source files.
        sanitized_folder: Existing folder that receives the copies.
        name: Substring that a file name must contain to be processed.
        truncate_target: Text marking the start of the part to remove.
    """
    # Size of the window at the end of the file that is searched for truncate_target
    reading_offset = 100
    # The files are handled as bytes so that the found index is a real file
    # offset; in text mode newline translation would shift it.
    target_bytes = truncate_target.encode('ISO-8859-1')

    for path in Path(raw_folder).rglob(f'*{name}*.txt'):
        filename = os.path.basename(path)
        new_path = os.path.join(sanitized_folder, filename)
        shutil.copy2(path, new_path)
        with open(new_path, "r+b") as file:
            file_size = file.seek(0, os.SEEK_END)
            # Clamp to the file start so files shorter than the window work too
            pos = max(0, file_size - reading_offset)
            file.seek(pos, os.SEEK_SET)

            # Check whether the end of the file contains the truncation target
            end_bytes = file.read(reading_offset)
            index = end_bytes.find(target_bytes)
            if index == -1:
                continue

            file.truncate(pos + index)

# Copy every raw *cns*.txt file into the sanitized folder and cut the copy off
# at 'Trailer record count' when that text is found near the end of the file
duplicate_and_truncate_cnsfails(cns_folder_raw, cns_folder_sanitized, 'cns', 'Trailer record count')

In [11]:
def sanitize_cnsfails_ocr(path):
    """Replace known broken descriptions containing a stray pipe, in place.

    The file is read and written as ISO-8859-1 without newline translation,
    so apart from the listed replacements its bytes stay unchanged.

    Args:
        path: Path of the text file to rewrite.
    """
    filename = os.path.basename(path)
    print(f"sanitize cnsfails {filename}")

    # (broken text, replacement) pairs, applied in this order.
    # NOTE(review): the YUM entry drops the character instead of restoring a
    # '!' like its neighbours do - confirm that this is intended.
    replacements = (
        ('BAM| ENTMT INC', 'BAM! ENTMT INC'),
        ('BRAVO| FOODS INTERNATIONAL CP', 'BRAVO! FOODS INTERNATIONAL CP'),
        ('YUM| BRANDS, INC', 'YUM BRANDS, INC'),
        ('EZENIA| INC', 'EZENIA! INC'),
        ('POW| ENTERTAINMENT INC', 'POW! ENTERTAINMENT INC'),
        ('MAKEMUSIC| INC NEW', 'MAKEMUSIC! INC NEW'),
        ('DMY TECHNOLOGY GROUP INC IV |', 'DMY TECHNOLOGY GROUP INC IV WT'),
    )

    with open(path, 'r', encoding='ISO-8859-1', newline='') as f:
        text = f.read()

    for broken, fixed in replacements:
        text = text.replace(broken, fixed)

    # Write with the same encoding that was used for reading; the platform
    # default (often UTF-8) would re-encode non-ASCII characters and corrupt
    # them for later readers that expect ISO-8859-1.
    with open(path, 'w', encoding='ISO-8859-1', newline='') as f:
        f.write(text)

# All files we found that contain entries with an additional pipe, which breaks the csv parsing.
# The errors in the data probably come from OCR problems that turned a ! into a |
broken_files = [
    'cnsp_sec_fails_200408.txt',
    'cnsp_sec_fails_200409.txt',
    'cnsp_sec_fails_200410.txt',
    'cnsp_sec_fails_200411.txt',
    'cnsp_sec_fails_200412.txt',
    'cnsp_sec_fails_200501.txt',
    'cnsfails202104b.txt',
]
for filename in broken_files:
    # Only the copies in the sanitized folder are rewritten
    path = os.path.join(cns_folder_sanitized, filename)
    sanitize_cnsfails_ocr(path)

sanitize cnsfails cnsp_sec_fails_200408.txt
sanitize cnsfails cnsp_sec_fails_200409.txt
sanitize cnsfails cnsp_sec_fails_200410.txt
sanitize cnsfails cnsp_sec_fails_200411.txt
sanitize cnsfails cnsp_sec_fails_200412.txt
sanitize cnsfails cnsp_sec_fails_200501.txt
sanitize cnsfails cnsfails202104b.txt


In [5]:
def convert_float(x):
    """Convert a raw value to float, mapping missing or unparsable values to 0.0.

    Args:
        x: Raw value, typically the text of a csv field.

    Returns:
        ``float(x)``; 0.0 when ``x`` is empty/falsy, equals ``'.'`` or cannot
        be converted (in the last case a message is printed).
    """
    if not x or x == '.':
        return 0.0
    try:
        return float(x)
    # Only conversion problems are handled; a bare except would also swallow
    # KeyboardInterrupt and SystemExit and make the run impossible to cancel.
    except (TypeError, ValueError, OverflowError):
        print(f"Can't convert {x} to float")
        return 0.0

def read_csv(folder, name):
    """Read all ``*{name}*.txt`` files below ``folder`` into one dataframe.

    The files are parsed as pipe-separated, ISO-8859-1 encoded text with a
    header row. 'SETTLEMENT DATE' is parsed as a date and 'PRICE' is converted
    with ``convert_float``. Progress is printed after every 50 files.

    Returns:
        The row-wise concatenation of all files with a fresh index.
    """
    print(f"Reading all files in memory for pandas, this will take some time")
    frames = []
    for path in Path(folder).rglob(f'*{name}*.txt'):
        frames.append(
            pd.read_csv(
                path,
                index_col=None,
                header=0,
                sep='|',
                encoding='ISO-8859-1',
                parse_dates=['SETTLEMENT DATE'],
                converters={'PRICE': convert_float},
                dtype={'CUSIP': 'str', 'SYMBOL': 'str', 'QUANTITY (FAILS)': 'int', 'DESCRIPTION': 'str'},
            )
        )
        files_read = len(frames)
        if files_read % 50 == 0:
            print(f"{files_read} files read")

    print(f"All {len(frames)} files read, generating dataframe")
    combined = pd.concat(frames, axis=0, ignore_index=True)
    print(f"dataframe generated")
    return combined

# Load every sanitized *cns*.txt file into one dataframe
df = read_csv(cns_folder_sanitized, 'cns')
# Show the number of non-null values per column
df.count()

Reading all files in memory for pandas, this will take some time
50 files read
100 files read
150 files read
200 files read
250 files read
300 files read
350 files read
All 360 files read, generating dataframe
dataframe generated


SETTLEMENT DATE     21348881
CUSIP               21348881
SYMBOL              21348871
QUANTITY (FAILS)    21348881
DESCRIPTION         21348881
PRICE               21348881
dtype: int64

In [6]:
# Shorten the column names, keep only the columns that get exported
# and order the rows by date and symbol
df = (
    df
    .rename(columns={"QUANTITY (FAILS)": "FAILS", "SETTLEMENT DATE": "DATE"})
    [['DATE', 'SYMBOL', 'FAILS', 'PRICE']]
    .sort_values(by=['DATE', 'SYMBOL'])
)
df.head()

Unnamed: 0,DATE,SYMBOL,FAILS,PRICE
16834021,2004-03-22,866,13000,0.0
16831725,2004-03-22,AAC,613985,0.0
16831839,2004-03-22,AACS,92968,0.0
16831717,2004-03-22,AAII,522948,0.0
16831835,2004-03-22,AAMI,73220,0.0


In [7]:
def store_csv(df, filename, folder):
    """Write ``df`` without its index as comma-separated file ``folder/filename``."""
    target = os.path.join(folder, filename)
    print(f"Storing {filename} to output folder")
    df.to_csv(path_or_buf=target, sep=',', decimal='.', index=False)

def store_csv_by_year(df, folder, start_year, end_year):
    """Write one ``cnsfails-<year>.csv`` per year from start_year to end_year (inclusive).

    Each file contains the rows of ``df`` whose 'DATE' falls into that year.
    """
    year = start_year
    while year <= end_year:
        in_year = df['DATE'].dt.year == year
        store_csv(df[in_year], f"cnsfails-{year}.csv", folder)
        year += 1

# Export one csv per year into the yearly folder ...
store_csv_by_year(df, csv_yearly, start_year, end_year)
# ... and the complete range as a single csv into the sanitized folder
store_csv(df, f"cnsfails-{start_year}{start_month:02d}-{end_year}{end_month:02d}.csv", sanitized_folder)

Storing cnsfails-2004.csv to output folder
Storing cnsfails-2005.csv to output folder
Storing cnsfails-2006.csv to output folder
Storing cnsfails-2007.csv to output folder
Storing cnsfails-2008.csv to output folder
Storing cnsfails-2009.csv to output folder
Storing cnsfails-2010.csv to output folder
Storing cnsfails-2011.csv to output folder
Storing cnsfails-2012.csv to output folder
Storing cnsfails-2013.csv to output folder
Storing cnsfails-2014.csv to output folder
Storing cnsfails-2015.csv to output folder
Storing cnsfails-2016.csv to output folder
Storing cnsfails-2017.csv to output folder
Storing cnsfails-2018.csv to output folder
Storing cnsfails-2019.csv to output folder
Storing cnsfails-2020.csv to output folder
Storing cnsfails-2021.csv to output folder
Storing cnsfails-200403-202110.csv to output folder
