In [None]:
# SOURCE https://medium.com/@thomasfunk10/pulling-company-filings-from-edgar-using-pandas-eaa662cd3c22

In [None]:
!pip3 install pandas
!pip3 install numpy
!pip3 install requests
!pip3 install datetime
!pip3 install python-dateutil

In [None]:
import pandas as pd
import io
import gzip
import requests
import datetime
from dateutil.relativedelta import relativedelta
import os

In [None]:

# Example index coordinates: a filing year and a quarter label.
sec_YYYY, sec_QTR = 2022, 'QTR1'

# Quarterly master-index URL, kept for reference (template, then a concrete example):
#url_format = 'https://www.sec.gov/Archives/edgar/full-index/[YYYY]/QTR[Q]/master.gz'  # .format(sec_YYYY,sec_QTR)
#url = 'https://www.sec.gov/Archives/edgar/full-index/2022/QTR1/master.gz'

# See also: https://www.youtube.com/watch?v=VEFH2A_LLQs

In [None]:
def quarter_tuple(date):
    """Return ``(quarter, year)`` for ``date``, where quarter is 1 through 4.

    Only the ``month`` and ``year`` attributes of ``date`` are read.
    Example: 2020-04-15 -> (2, 2020).
    """
    # Months 1-3 -> 0, 4-6 -> 1, 7-9 -> 2, 10-12 -> 3; shift to 1-based below.
    zero_based_quarter, _ = divmod(date.month - 1, 3)
    return (zero_based_quarter + 1, date.year)


def get_quarters(start_date, end_date):
    """List the ``(quarter, year)`` tuples touched by a date range.

    Starting at ``start_date``, the function steps forward three months at a
    time while the stepped date is strictly before ``end_date``, recording the
    quarter of each stop. The quarter containing ``end_date`` is appended last
    if the walk did not already record it.
    """
    step = relativedelta(months=3)
    seen = []
    cursor = start_date
    while cursor < end_date:
        seen.append(quarter_tuple(cursor))
        cursor = cursor + step

    # The walk can stop short of the final quarter; make sure it is included.
    closing_quarter = quarter_tuple(end_date)
    return seen if closing_quarter in seen else seen + [closing_quarter]


def get_quarters_urls(start_date, end_date):
    """Build one ``master.gz`` index URL per quarter in the date range.

    The quarters come from ``get_quarters(start_date, end_date)`` and the
    returned list keeps their order.
    """
    urls = []
    for quarter, year in get_quarters(start_date, end_date):
        urls.append(f'https://www.sec.gov/Archives/edgar/full-index/{year}/QTR{quarter}/master.gz')
    return urls

In [None]:
def strip_header(data):
    """Consume the free-text preamble of an index stream.

    Lines are read from ``data`` until one made up solely of ``-`` characters
    is found. The stream is left positioned just after that separator line.

    Args:
        data: A text stream exposing ``readline()`` (e.g. ``io.StringIO``).

    Returns:
        A ``(data, header)`` tuple: the same stream object, and the stripped
        text of the line immediately preceding the dashed separator.

    Raises:
        ValueError: If the stream ends before a dashed separator is seen.
    """
    header = ''
    line = ''
    while set(line) != {'-'}:
        header = line
        raw_line = data.readline()
        if not raw_line:
            # readline() returns an empty string only at end of stream; without
            # this check the loop would spin forever on input lacking a separator.
            raise ValueError('no dashed separator line found before end of stream')
        line = raw_line.strip()
    return data, header


def read_url(url, delimiter='|', headers=None):
    """Download a gzipped master index and return it as a DataFrame.

    Args:
        url: Address of the ``master.gz`` index file.
        delimiter: Field separator used by both the header line and the rows.
        headers: Optional dict of HTTP request headers. When omitted, a
            ``User-Agent`` header is built from the ``SEC_USER_AGENT``
            environment variable if that variable is set; otherwise no custom
            headers are sent.

    Returns:
        A DataFrame whose columns are named after the index's header line and
        whose ``'Date Filed'`` column is parsed to datetimes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if headers is None:
        user_agent = os.environ.get('SEC_USER_AGENT')
        headers = {'User-Agent': user_agent} if user_agent else None

    # get the master index gzip; fail loudly on an HTTP error instead of
    # handing an error page to gzip
    r = requests.get(url, headers=headers, timeout=60)
    r.raise_for_status()

    # unzip
    raw_bytes = gzip.decompress(r.content)

    # decode bytes; fall back to latin-1 (which accepts any byte) when the
    # payload is not valid UTF-8
    try:
        text = raw_bytes.decode('utf-8')
    except UnicodeDecodeError:
        text = raw_bytes.decode('latin-1')

    # remove the unstructured header
    data, columns = strip_header(io.StringIO(text))

    # create dataframe; the header line has already been consumed, so tell
    # pandas there is none left -- otherwise the first data row would be
    # swallowed as column names
    df = pd.read_csv(data, sep=delimiter, header=None, names=columns.split(delimiter))
    df['Date Filed'] = pd.to_datetime(df['Date Filed'], format='%Y-%m-%d')
    return df

In [None]:
# Lookup from CIK number to the ticker label used when naming downloads.
cik_mapping = {
    66740: 'MMM',
    1800: 'ABT',
}

In [None]:
def filter_df(df, start_date, end_date, cik_mapping=None, form_type=None):
    """Select index rows by filing date, form type and (optionally) company.

    Args:
        df: Index DataFrame with ``'Date Filed'``, ``'Form Type'`` and ``'CIK'``
            columns. It is not modified.
        start_date: Inclusive lower bound on ``'Date Filed'``.
        end_date: Inclusive upper bound on ``'Date Filed'``.
        cik_mapping: Optional ``{CIK: ticker}`` dict. When non-empty, only rows
            whose CIK is a key are kept and a ``'Ticker'`` column is added.
        form_type: Value that ``'Form Type'`` must equal. When omitted, a
            module-level ``form_type`` variable is used if one exists (this
            keeps older four-argument calls working); if there is none, no
            form-type filtering is applied.

    Returns:
        A new DataFrame of the matching rows. ``reset_index()`` is applied, so
        the original row labels are preserved in an ``'index'`` column.
    """
    if form_type is None:
        # Backward compatibility: this function used to read the form type
        # from a module global rather than from an argument.
        form_type = globals().get('form_type')

    keep = (df['Date Filed'] >= start_date) & (df['Date Filed'] <= end_date)
    if form_type is not None:
        keep = keep & (df['Form Type'] == form_type)
    if cik_mapping:
        keep = keep & df['CIK'].isin(list(cik_mapping))

    # Work on an explicit copy so adding 'Ticker' never writes into a view of
    # the caller's frame (avoids SettingWithCopyWarning).
    selected = df.loc[keep].copy()
    if cik_mapping:
        selected['Ticker'] = selected['CIK'].map(cik_mapping)

    return selected.reset_index()

In [None]:
def get_filings(start_date, end_date, form_type, cik_mapping, dir_path):
    """Download every matching filing in a date range into ``dir_path``.

    For each quarterly index covering the range, the index is read with
    ``read_url``, narrowed with ``filter_df``, and each remaining row's file is
    fetched from ``https://www.sec.gov/Archives/`` and saved as
    ``<ticker>_<form>_<date>.html``.

    Args:
        start_date: Inclusive start of the filing-date window.
        end_date: Inclusive end of the filing-date window.
        form_type: Form type to download; rows of any other type are skipped.
        cik_mapping: ``{CIK: ticker}`` dict passed to ``filter_df``. If the
            filtered frame has no ``'Ticker'`` column, the CIK is used in the
            file name instead.
        dir_path: Output directory; created if missing. An empty string means
            the current working directory.

    If the ``SEC_USER_AGENT`` environment variable is set, its value is sent
    as the ``User-Agent`` header on filing downloads.

    Note:
        Files are named only by ticker, form and date, so two filings sharing
        all three overwrite one another.

    Raises:
        requests.HTTPError: If a filing download returns an error status.
    """
    user_agent = os.environ.get('SEC_USER_AGENT')
    headers = {'User-Agent': user_agent} if user_agent else None

    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    for url in get_quarters_urls(start_date, end_date):
        # loop through the quarter urls
        df = read_url(url)
        df = filter_df(df, start_date, end_date, cik_mapping)
        # filter_df is called without a form-type argument and does its own
        # form selection; re-apply the explicit argument here so that only the
        # requested form type is ever downloaded.
        df = df[df['Form Type'] == form_type]
        has_ticker = 'Ticker' in df.columns

        for _, row in df.iterrows():
            # loop through the filings
            label = row['Ticker'] if has_ticker else row['CIK']
            # Spaces and slashes (e.g. '10-K/A') are not safe in a file name.
            form = row['Form Type'].replace(' ', '-').replace('/', '-')
            date = row['Date Filed'].date()
            filename = row['Filename']
            # more human-readable filename
            outname = f"{label}_{form}_{date}.html"

            full_path = os.path.join(dir_path, outname)
            file_url = f"https://www.sec.gov/Archives/{filename}"

            # download; refuse to save an error page as if it were a filing
            r = requests.get(file_url, headers=headers, timeout=60)
            r.raise_for_status()

            # Write the raw bytes: no decode/encode round trip that could fail
            # on non-UTF-8 content or a non-UTF-8 platform default encoding.
            with open(full_path, 'wb') as f:
                f.write(r.content)

In [None]:
# Download filings from the trailing 365 days for the companies in cik_mapping.
end_date = datetime.datetime.today()
start_date = end_date - datetime.timedelta(days=365)
# Must be a value that appears in the index's 'Form Type' column. The previous
# value, 'Master.gz', is the index file's name rather than a form type, so the
# filter matched no rows and nothing was downloaded.
form_type = '10-K'
# An empty path makes os.path.join() yield bare file names, i.e. files are
# written to the current working directory.
dir_path = ''
get_filings(start_date, end_date, form_type, cik_mapping, dir_path)
