In [1]:
############### import packages
import os, requests, sys, re, pandas as pd, time, urllib.request, csv, gc, psutil
from bs4 import BeautifulSoup
from tqdm import tqdm

##########################################################
##################### parameter ##########################
##########################################################
obj_type = '8-K'
period_start = 1993 # included
period_end = 2005 # included
raw_filing_dir = 'G:\\8-K\\' ########## directory where you want to save the downloaded EDGAR filings
master_index_dir = 'F:\\github\\python-edgar-master\\edgar-idx' ######### directory where the edgar master index are saved
output_csv_dir = r'..\filings\id_data_' + obj_type + '_' + str(period_start) + '-' + str(period_end) +'.csv' ######### directory of the output id_data.csv
time_waiting = 0 ########## sleeping time between the scraping of each filing in order to avoid being blocked by EDGAR
begin_from = 486788 ## = excel count -1 = last begin from + processed
memory_limit = 95

############### Set working directory to parent directory
if os.getcwd() != r'F:\github\narrative_conservatism\code':
    os.chdir(r'F:\github\narrative_conservatism\code')

In [2]:
#################### Access all fillings through SEC master index #################################
####### indexes downloaded using python-edgar: https://github.com/edouardswiac/python-edgar #######
#### open terminal, and run the following lines:
#### cd F:\github\python-edgar-master (switch dir to where the run.py script is located)
#### python run.py -y 1993 -d edgar_idx (downloading all quarterly master index from 1993 into folder edgar_idx)

#### cd F:\github\python-edgar-master\edgar-idx (switch dir to where the downloaded indexes are located)
#### cat *.tsv > master.tsv (stitch all quarterly indexes into one master index)
#### du -h master.tsv (inspect how large the master index file is)

index_edgar = list()
doc_url = list()

# create an index of downloaded local quarterly master indexes
for subdir, dirs, files in os.walk(master_index_dir):
    for file in files:
        file_year = int(file.split('-')[0])
        if file_year >= period_start and file_year <= period_end:
            index_edgar.append(os.path.join(subdir, file))

# read each index file, select rows with matched file type, and store matched doc_links
for filenameTSV in index_edgar:
    tsv_read = pd.read_csv(filenameTSV, sep='|', header=None, encoding = "utf-8")
    tsv_read.columns = ['1', '2', '3', '4', '5', '6']
    
    # select the rows with filetype equal to predefined type
    tsv_type = tsv_read.loc[tsv_read['3'] == obj_type]
    doc_link = tsv_type['6'].values.tolist()
    doc_link = ['https://www.sec.gov/Archives/' + w for w in doc_link]
    for doc in doc_link:
        doc_url.append(doc)

del index_edgar
len(doc_url)

490837

In [3]:
############### Extract file identification info from doc_url
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}

if os.path.exists(output_csv_dir) == False:
    with open(output_csv_dir, mode='w') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(['accnum','cik','name','fd', 'rp','item8k','sic','fye','state','bazip','irs','film','pdc','accepted','nexhibit','ngraph','web_url'])

## define lists
accnum = []
cik = []
name = []
fd = []
rp = []
item8k = []
sic = []
fye = []
state = []
bazip = []
irs = []
film = []
pdc = []
accepted = []
nexhibit = []
ngraph = []
web_url = []

for doc in tqdm(doc_url[begin_from:]):
    if psutil.virtual_memory().percent > memory_limit:
        break
        
    time.sleep(time_waiting) # SEC does not allow to exceed 10 requests/sec
    doc_resp = requests.get(doc, headers=headers)
    if doc_resp.status_code == 429:
        time.sleep(10*60+5) # if exceeds cool off for 10 mins
        doc_resp = requests.get(doc, headers=headers)
    else:
        pass

    soup = BeautifulSoup(doc_resp.text, 'html.parser')

    # Save the SEC accession number (accnum)
    try:
        accnum_i = soup.find('div', id='formHeader').find('div', id='secNum').get_text().split()[3]
        accnum.append(accnum_i)
    except:
        accnum.append(float('NaN'))
        pass

    # Save the Filing Date (0), Accepted Date (Date as of Change) (1), Public Document Count (2) and Reporting Period (3)
    try:
        dates = soup.find('div', class_='formContent').find_all('div', class_='info')
        # Filing Date
        fd.append(dates[0].get_text())
    except:
        fd.append(float('NaN'))
        pass

        # Accepted Date (Date as of Change)
    try:
        accepted.append(dates[1].get_text())
    except:
        accepted.append(float('NaN'))
        pass

        # Public Document Count
    try:
        pdc.append(dates[2].get_text())
    except:
        pdc.append(float('NaN'))
        pass

        # Reporting Period
    try:
        rp.append(dates[3].get_text())
    except:
        rp.append(float('NaN'))
        pass

    # For 8K files, Save item info
    try:
        if obj_type == '8-K':
            clist = re.findall(r'\d.\d\d', dates[4].get_text())
            if clist != []:
                item8k.append(', '.join(clist))
            else:
                clist = re.findall(r'\d+', c = dates[4].get_text())
                item8k.append(', '.join(clist))
        else :
            item8k.append(float('NaN'))
    except:
        item8k.append(float('NaN'))
        pass

    # Save the Company name and CIK
    try:
        comname = soup.find('div', class_='companyInfo').find('span', class_='companyName')
        # Company Name
        name.append(comname.get_text().split("\n")[0].replace(' (Filer)', ''))
    except:
        name.append(float('NaN'))
        pass

        # CIK
    try:
        cik.append(comname.get_text().split("\n")[1].replace('CIK: ', '').replace(' (see all company filings)', ''))
    except:
        cik.append(float('NaN'))
        pass

    # Save Business Address ZIP 
    try:
        div_tag = soup.find_all('div', class_='mailer')[1].find_all('span', class_='mailerAddress')[1]
        ba = div_tag.get_text()
        alist = re.findall(r'\d\d\d\d\d', ba)
        if alist == []:
            div_tag = soup.find_all('div', class_='mailer')[1].find_all('span', class_='mailerAddress')[2]
            ba = div_tag.get_text()
            alist = re.findall(r'\d\d\d\d\d', ba)
        bazip.append(', '.join(alist))
    except:
        bazip.append(float('NaN'))
        pass

        # Save SIC, Fiscal Year End, State of Incorporation, IRS number and film number
    try:
        filinginfo = soup.find('div', class_='companyInfo').find('p', class_='identInfo')
        # SIC
        sic.append(filinginfo.get_text().split("SIC:")[1].split()[0])
    except:
        sic.append(float('NaN'))
        pass

        # Save Fiscal Year End
    try:
        fye.append(filinginfo.get_text().split("Fiscal Year End:")[1].split()[0].replace('Type:', ''))
    except:
        fye.append(float('NaN'))
        pass

        # State
    try:
        state.append(filinginfo.get_text().split("State of Incorp.:")[1].split()[0].replace('Type:', ''))
    except:
        state.append(float('NaN'))
        pass

        # IRS number
    try:
        irs.append(filinginfo.get_text().split("IRS No.:")[1].split()[0].replace('Type:', ''))
    except:
        irs.append(float('NaN'))
        pass

        # film number
    try:
        film.append(filinginfo.get_text().split("Film No.: ")[1].split()[0].replace('SIC:', ''))
    except:
        film.append(float('NaN'))
        pass

    # Save the HTML/TXT website urls from doc_url to raw data folder
    try:
        rows = soup.find('table', class_='tableFile', summary='Document Format Files').find_all('tr')
        cell_html = rows[1].find_all('td')
        html = cell_html[2].a['href'].replace('ix?doc=/', '')
        cell_txt = rows[-1].find_all('td')
        txt = cell_txt[2].a['href']

        if html.endswith("htm") or html.endswith("txt"):
            url = 'https://www.sec.gov' + html
        else:
            url = 'https://www.sec.gov' + txt
        web_url.append(url)

    except:
        web_url.append(float('NaN'))
        pass

    # downloading the report
    try:
        if os.path.exists(raw_filing_dir + accnum_i + '.txt') == False:
            urllib.request.urlretrieve(url, raw_filing_dir + accnum_i + '.txt')
    except:
        pass

    # Count number of exhibits and graphics in this filing
    try:
        ex = 0
        graph = 0
        for row in rows[2:-1]:
            if row.find_all('td')[3].get_text().startswith('EX'):
                ex = ex + 1
            elif row.find_all('td')[3].get_text().startswith('GRAPHIC'):
                graph = graph + 1
            else:
                pass
        ngraph.append(graph)
        nexhibit.append(ex)

    except:
        ngraph.append(float('NaN'))
        nexhibit.append(float('NaN'))
        pass

100%|██████████████████████████████████████████████████████████████████████████████| 4049/4049 [32:44<00:00,  2.06it/s]


In [4]:
#### save scraped data locally 
id_data = pd.DataFrame(data={'accnum': accnum, 'cik': cik, 'name': name, 'fd': fd, 'rp': rp, 'item8k': item8k, 'sic': sic,'fye': fye, 'state': state, 'bazip': bazip, \
 'irs': irs, 'film': film, 'pdc': pdc, 'accepted': accepted, 'nexhibit': nexhibit, 'ngraph': ngraph, 'web_url': web_url})
id_data_saved = pd.read_csv(output_csv_dir,  dtype = {'cik':str, 'bazip':str, 'sic':str, 'fye':str, 'film':str, 'irs':str, 'web_url':str})
id_data = pd.concat([id_data_saved, id_data])
id_data.to_csv(output_csv_dir, index=False)

In [5]:
id_data = pd.read_csv(output_csv_dir,  dtype = {'cik':str, 'bazip':str, 'sic':str, 'fye':str, 'film':str, 'irs':str, 'web_url':str})
id_data.isnull().sum()

accnum         87
cik             0
name           87
fd             87
rp              0
item8k        436
sic          5436
fye         20121
state       29735
bazip       12642
irs         38754
film           76
pdc             0
accepted        0
nexhibit        0
ngraph          0
web_url        87
dtype: int64

In [6]:
id_data

Unnamed: 0,accnum,cik,name,fd,rp,item8k,sic,fye,state,bazip,irs,film,pdc,accepted,nexhibit,ngraph,web_url
0,0000060512-94-000008,0000060512,LOUISIANA LAND & EXPLORATION CO,1993-10-29,1993-10-29,5,1311,1231,MD,70112,720244700,94500406,1.0,1993-10-29 00:00:00,0.0,0.0,https://www.sec.gov/Archives/edgar/data/60512/...
1,0000950144-94-000103,0000100240,TURNER BROADCASTING SYSTEM INC,1994-01-24,1994-01-24,7,4833,1231,GA,30303,580950695,94502410,7.0,1994-01-24 00:00:00,6.0,0.0,https://www.sec.gov/Archives/edgar/data/100240...
2,0000950144-94-000177,0000100240,TURNER BROADCASTING SYSTEM INC,1994-02-02,1994-01-27,7,4833,1231,GA,30303,580950695,94504285,4.0,1994-02-02 00:00:00,3.0,0.0,https://www.sec.gov/Archives/edgar/data/100240...
3,0000950144-94-000277,0000100240,TURNER BROADCASTING SYSTEM INC,1994-02-07,1994-01-28,"2, 7",4833,1231,GA,30303,580950695,94504789,1.0,1994-02-07 00:00:00,0.0,0.0,https://www.sec.gov/Archives/edgar/data/100240...
4,0000716039-94-000008,0000100880,UNION OIL CO OF CALIFORNIA,1994-03-02,1994-03-02,1,2911,1231,CA,90017,951315450,94514338,1.0,1994-03-02 00:00:00,0.0,0.0,https://www.sec.gov/Archives/edgar/data/100880...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490832,0001104659-05-061526,0000099771,TRINITY CAPITAL CORP,2005-12-19,2005-12-15,"1.01, 8.01, 9.01",6159,1231,,87544,000000000,051272454,3.0,2005-12-19 15:34:48,1.0,1.0,https://www.sec.gov/Archives/edgar/data/99771/...
490833,0001299933-05-005719,0000099780,TRINITY INDUSTRIES INC,2005-11-03,2005-11-02,"2.02, 7.01",3743,1231,DE,75207,750225040,051176629,6.0,2005-11-03 15:07:25,5.0,0.0,https://www.sec.gov/Archives/edgar/data/99780/...
490834,0001299933-05-006471,0000099780,TRINITY INDUSTRIES INC,2005-12-12,2005-12-06,1.01,3743,1231,DE,75207,750225040,051257876,1.0,2005-12-12 14:51:42,0.0,0.0,https://www.sec.gov/Archives/edgar/data/99780/...
490835,0001157523-05-009073,0000009984,BARNES GROUP INC,2005-10-24,2005-10-24,"2.02, 9.01",3490,1231,DE,06010,060247840,051150836,4.0,2005-10-24 07:00:37,1.0,2.0,https://www.sec.gov/Archives/edgar/data/9984/0...
