In [12]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

input_folder = os.path.join(os.getcwd(),'raw')
if(not os.path.exists(input_folder)):
    os.mkdir(input_folder)

output_folder = os.path.join(os.getcwd(),'generated')
if(not os.path.exists(output_folder)):
    os.mkdir(output_folder)

In [16]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

# example url https://www.sec.gov/files/data/fails-deliver-data/cnsfails202110b.zip
base_url = "https://www.sec.gov/files/data/fails-deliver-data/"
file_prefix = "cnsfails"
start_year = 2021
start_month = 7
end_year = 2021
end_month = 10

def download_and_unzip(url):
    resp = urlopen(url)
    zipfile = ZipFile(BytesIO(resp.read()))
    zipfile.extractall(path=input_folder)

def download_if_necessary(url, downloaded_files):
    if(url not in downloaded_files):
        print("Download " + url)
        download_and_unzip(url)
        downloaded_files.append(url)
    else:
        print("Skip " + url)

# Store already downloaded files in a list
downloaded_files_path = f"{input_folder}/downloaded.txt"
downloaded_files = []
if(os.path.exists(downloaded_files_path)):
    with open(downloaded_files_path, 'r') as f:
        downloaded_files = f.readlines()
    downloaded_files = list(map(lambda s: s.strip(), downloaded_files))

# download all files in the time range that have not yet been downloaded
for year in range(start_year, end_year+1):
    start = start_month if (year == start_year) else 1
    end = end_month if (year == end_year) else 12
    for month in range(start, end+1):
        download_if_necessary(f"{base_url}{file_prefix}{year}{month:02d}a.zip", downloaded_files)
        download_if_necessary(f"{base_url}{file_prefix}{year}{month:02d}b.zip", downloaded_files)

with open(downloaded_files_path, 'w') as f:
    f.write("\n".join(map(str, downloaded_files)))

Skip https://www.sec.gov/files/data/fails-deliver-data/cnsfails202107a.zip
Skip https://www.sec.gov/files/data/fails-deliver-data/cnsfails202107b.zip
Skip https://www.sec.gov/files/data/fails-deliver-data/cnsfails202108a.zip
Skip https://www.sec.gov/files/data/fails-deliver-data/cnsfails202108b.zip
Skip https://www.sec.gov/files/data/fails-deliver-data/cnsfails202109a.zip
Skip https://www.sec.gov/files/data/fails-deliver-data/cnsfails202109b.zip
Skip https://www.sec.gov/files/data/fails-deliver-data/cnsfails202110a.zip
Skip https://www.sec.gov/files/data/fails-deliver-data/cnsfails202110b.zip


In [14]:
def read_csv(input_folder, name):
    li = []

    for path in Path(input_folder).rglob(f'*{name}*.txt'):
        print(f"Read csv {path}")
        csv_df = pd.read_csv(path, index_col=None, header=0, sep ='|', encoding = "ISO-8859-1")
        li.append(csv_df)

    df = pd.concat(li, axis=0, ignore_index=True)
    return df

df = read_csv(input_folder, 'cnsfails')
df.head()
df.count()

Read csv c:\Data\Documents\Python\gme\raw\cnsfails202107a.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202107b.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202108a.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202108b.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202109a.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202109b.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202110a.txt
Read csv c:\Data\Documents\Python\gme\raw\cnsfails202110b.txt


SETTLEMENT DATE     478860
CUSIP               478844
SYMBOL              478844
QUANTITY (FAILS)    478844
DESCRIPTION         478844
PRICE               478844
dtype: int64

In [15]:
def store_csv(df, filename, output_folder):
    fullpath = os.path.join(output_folder, filename)
    df.to_csv(fullpath, index=False, sep =',', decimal='.')

store_csv(df, f"{start_year}{start_month:02d}-{end_year}{end_month:02d}combined.csv", output_folder)