In [5]:
############### import packages
import os, requests, sys, re, pandas as pd, time
from bs4 import BeautifulSoup
from tqdm import tqdm
from time import process_time

##########################################################
##################### Parameters #########################
##########################################################
# SEC form type to collect; later compared against the form-type column of the
# master index and used in the names of the output files.
obj_type = '10-Q'
# First and last index year to process (both bounds are inclusive).
period_start, period_end = 2017, 2019

############### Show the current working directory
# Output paths in later cells are relative, so they resolve against this directory.
# os.chdir('F:\\github\\narrative_conservatism\\code')
os.getcwd()

'F:\\github\\narrative_conservatism\\code'

In [6]:
#################### Access all fillings through SEC master index #################################
####### indexes downloaded using python-edgar: https://github.com/edouardswiac/python-edgar #######
#### open terminal, and run the following lines:
#### cd F:\github\python-edgar-master (switch dir to where the run.py script is located)
#### python run.py -y 1993 -d edgar_idx (downloading all quarterly master index from 1993 into folder edgar_idx)

#### cd F:\github\python-edgar-master\edgar-idx (switch dir to where the downloaded indexes are located)
#### cat *.tsv > master.tsv (stitch all quarterly indexes into one master index)
#### du -h master.tsv (inspect how large the master index file is)

index_edgar = list()
doc_url = list()

# create an index of downloaded local quarterly master indexes
for subdir, dirs, files in os.walk("F:\\github\\python-edgar-master\\edgar-idx"):
    for file in files:
        file_year = int(file.split('-')[0])
        if file_year >= period_start and file_year <= period_end:
            index_edgar.append(os.path.join(subdir, file))

# read each index file, select rows with matched file type, and store matched doc_links
for filenameTSV in index_edgar:
    tsv_read = pd.read_csv(filenameTSV, sep='|', header=None, encoding = "utf-8")
    tsv_read.columns = ['1', '2', '3', '4', '5', '6']
    
    # select the rows with filetype equal to predefined type
    tsv_type = tsv_read.loc[tsv_read['3'] == obj_type]
    doc_link = tsv_type['6'].values.tolist()
    doc_link = ['https://www.sec.gov/Archives/' + w for w in doc_link]
    for doc in doc_link:
        doc_url.append(doc)
        
len(doc_url)

57511

In [7]:
# #################### Access all filings through SEC search engine #####################################
# ################## NOT RECOMMENDED AT ALL #############################################################
# cik = '0000051143'
# obj_type = '8-K'
# # number of documents listed per page
# count = '100'
# # index of first document listed in the current page
# start = '0'
# # find filings prior to the date 2016y01m01d
# dateb = ''

# # Obtain url for initial search result page
# base_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}&start={}&count={}"
# init_url = base_url.format(cik, obj_type, dateb, start, count)

# # define a function that takes the input url and returns next search page url
# def get_next_url(input_url):
#     edgar_resp = requests.get(input_url)
#     edgar_str = edgar_resp.text
#     soup = BeautifulSoup(edgar_str, 'html.parser')

#     div_tag = soup.find('div', style='margin-top: 5px; margin-bottom: 5px;')
#     button = div_tag.find('td', style='text-align: right;')
#     fbutton = button.find_all('input')[0]['value']
#     if re.findall(r'Next', fbutton) == ['Next']:
#         next_url = button.find_all('input')[0]['onclick'][:-1]
#     elif len(button.find_all('input')) == 2:
#         next_url = button.find_all('input')[1]['onclick'][:-1]
#     else:
#         next_url = 'NA'
        
#     next_url = next_url.replace('parent.location=\'', 'https://www.sec.gov')
#     return next_url

# # create a search result page url list
# search_url = [init_url]

# while get_next_url(init_url) != 'NA':
#     search_url.append(get_next_url(init_url))
#     init_url = get_next_url(init_url)
    
# ############### Create a document link list of a given CIK and file type
# doc_link = list()

# for url in search_url:
#     edgar_resp = requests.get(url)
#     edgar_str = edgar_resp.text
#     soup = BeautifulSoup(edgar_str, 'html.parser')
#     table_tag = soup.find('table', class_='tableFile2')
#     rows = table_tag.find_all('tr')

#     for row in rows[1:]:
#         cells = row.find_all('td')
#         doc_link.append('https://www.sec.gov' + cells[1].a['href'])
        
# len(doc_link)

In [4]:
############### Extract file identification info from doc_url
# For every filing index page in doc_url, download the page and pull the
# identification fields out of it. Exactly one value is appended to each result
# list per URL ('NA' when a field cannot be found), so every list stays aligned
# with doc_url even when a page is malformed or the download fails.
# NOTE(review): SEC's fair-access guidance asks scripted clients to send a
# User-Agent that identifies the requester -- confirm this browser-style string
# is acceptable before running at scale.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}

# Seconds to wait on a single page before treating the download as failed.
request_timeout = 30

accnum = list()
fd = list()
rp = list()
name = list()
cik = list()
sic = list()
file_type = list()
fye = list()
state = list()
bazip = list()
item8k = list()
web_url = list()
# URLs whose download failed; their rows hold 'NA' in every scraped field.
failed_url = list()


def _na_on_error(extract):
    """Return extract(), or 'NA' when the page lacks the expected element.

    Exception (not a bare except) is caught so that a KeyboardInterrupt still
    stops the run while missing tags / short lists remain best-effort.
    """
    try:
        return extract()
    except Exception:
        return 'NA'


t1_start = time.time()

# One Session reuses the connection across the many requests to the same host.
with requests.Session() as session:
    for doc in tqdm(doc_url):
        try:
            doc_resp = session.get(doc, headers=headers, timeout=request_timeout)
            doc_resp.raise_for_status()
            doc_str = doc_resp.text
        except requests.RequestException:
            # Keep going: an empty document makes every field below fall back to 'NA'.
            failed_url.append(doc)
            doc_str = ''
        soup = BeautifulSoup(doc_str, 'html.parser')

        # Every intermediate below is recomputed from this page's soup, so a
        # failed lookup can never reuse the previous filing's data.

        # Save the SEC accession number (accnum): 4th whitespace-separated token of secNum
        accnum.append(_na_on_error(
            lambda: soup.find('div', id='formHeader').find('div', id='secNum').get_text().split()[3]))

        # Save the Filing Date and Reporting Period
        form_content = soup.find('div', class_='formContent')
        dates = form_content.find_all('div', class_='info') if form_content is not None else []
        # Filing Date
        fd.append(_na_on_error(lambda: dates[0].get_text()))
        # Reporting Period
        rp.append(_na_on_error(lambda: dates[3].get_text()))

        # For 8K files, Save item info
        if obj_type == '8-K':
            try:
                c = dates[4].get_text()
            except IndexError:
                item8k.append('NA')
            else:
                # Prefer 'd.dd' style item numbers (dot escaped so it only matches
                # a literal '.'); otherwise fall back to the single digits found.
                clist = re.findall(r'\d\.\d\d', c) or re.findall(r'\d', c)
                item8k.append(', '.join(clist))
        else:
            item8k.append('NA')

        # Save the Company name and CIK (first and second line of the companyName span)
        company_info = soup.find('div', class_='companyInfo')
        comname = company_info.find('span', class_='companyName') if company_info is not None else None
        name_lines = comname.get_text().split("\n") if comname is not None else []
        # Company Name
        name.append(_na_on_error(lambda: name_lines[0].replace(' (Filer)', '')))
        # CIK
        cik.append(_na_on_error(
            lambda: name_lines[1].replace('CIK: ', '').replace(' (see all company filings)', '')))

        # Save Business Address ZIP: 5-digit runs in the 2nd address span of the
        # 2nd mailer block, falling back to the 3rd span when the 2nd has none
        try:
            spans = soup.find_all('div', class_='mailer')[1].find_all('span', class_='mailerAddress')
            alist = re.findall(r'\d\d\d\d\d', spans[1].get_text())
            if alist == []:
                alist = re.findall(r'\d\d\d\d\d', spans[2].get_text())
            bazip.append(', '.join(alist))
        except Exception:
            bazip.append('NA')

        # Save SIC, File Type, Fiscal Year End and State of Incorporation
        # (all but File Type are sliced out of the '|'-separated identInfo text)
        filinginfo = company_info.find('p', class_='identInfo') if company_info is not None else None
        ident_fields = filinginfo.get_text().split("|") if filinginfo is not None else []
        # SIC
        sic.append(_na_on_error(lambda: ident_fields[5].split("SIC")[1].split()[1]))
        # File Type
        file_type.append(obj_type)
        # Fiscal Year End
        fye.append(_na_on_error(lambda: ident_fields[2].split("Type")[0].split(":")[1]))
        # State
        state.append(_na_on_error(lambda: ident_fields[1].split(":")[1]))

        # Save the HTML/TXT website urls from doc_url to raw data folder:
        # use the first listed document when it is an htm/txt file, otherwise
        # the document in the last row of the table
        try:
            table_tag = soup.find('table', class_='tableFile', summary='Document Format Files')
            rows = table_tag.find_all('tr')
            html = rows[1].find_all('td')[2].a['href'].replace('ix?doc=/', '')
            txt = rows[-1].find_all('td')[2].a['href']
            if html.endswith("htm") or html.endswith("txt"):
                web_url.append('https://www.sec.gov' + html)
            else:
                web_url.append('https://www.sec.gov' + txt)
        except Exception:
            # Keep web_url aligned with the other lists instead of aborting the run.
            web_url.append('NA')

t1_end = time.time()
print("Elapsed time during the whole program in seconds:", t1_end - t1_start)

 51%|████████████████████████████████████▍                                   | 29069/57511 [2:15:11<1:57:50,  4.02it/s]

KeyboardInterrupt: 

In [145]:
############### Save web_url to local index
# One URL per line, in the same order as the scraped lists.
path_web_url_index = '..\\filings\\web_url_index_'+ obj_type + '_' + str(period_start) + '-' + str(period_end) + '.txt'
with open(path_web_url_index, "w") as f:
    f.writelines(s + "\n" for s in web_url)

############### Scraping adjustments for some exceptional data
# The scraped state / fye strings are sometimes another field's text. The
# fixes below are applied by position (enumerate) rather than via
# list.index(value): index() returns the first equal element, costs O(n) per
# lookup, and raises ValueError once the element has already been overwritten.
for i, w in enumerate(state):
    # non-digit text followed by 'Type': keep only the part before 'Type'
    if re.search(r'\DType', w):
        state[i] = w.split('Type')[0]
    # digits followed by 'Type': the value belongs in fye, not state
    if re.search(r'\dType', w):
        fye[i] = w.split('Type')[0]
        state[i] = 'NA'
    # presumably the 'Act' number captured in the wrong slot -- TODO confirm
    if w == ' 34 ':
        state[i] = 'NA'

for i, date in enumerate(fye):
    # 'NA' is the missing-value placeholder; it contains capitals, so without
    # this guard it would be copied into state as if it were a state code.
    if date == 'NA':
        continue
    # capital letters in fye: the value belongs in state, not fye
    if re.search(r'[A-Z]', date):
        state[i] = date
        fye[i] = 'NA'
    if '-' in date:
        fye[i] = 'NA'
    if date == ' 34 ':
        fye[i] = 'NA'

# Treat an all-zero or empty ZIP as missing (updated in place).
bazip[:] = ['NA' if zipcode == '00000' or zipcode == '' else zipcode for zipcode in bazip]

############### Create Data Frame
d = {'accnum': accnum, 'file_type': file_type, 'cik': cik, 'name': name, 'sic': sic, 'fd': fd, 'rp': rp, 'fye': fye, 'item8k': item8k, \
     'bazip': bazip, 'state': state}
id_data = pd.DataFrame(data=d)
id_data.to_csv('..\\filings\\id_data_' + obj_type + '_' + str(period_start) + '-' + str(period_end) +'.csv', index=False)

id_data

Unnamed: 0,accnum,file_type,cik,name,sic,fd,rp,fye,item8k,bazip,state
0,0001193125-17-035551,10-Q,0001000045,NICHOLAS FINANCIAL INC,6153,2017-02-09,2016-12-31,,,33759,
1,0001437749-17-004005,10-Q,0001000230,OPTICAL CABLE CORP,3357,2017-03-08,2017-01-31,,,24019,1031
2,0001001039-17-000046,10-Q,0001001039,WALT DISNEY CO/,4841,2017-02-07,2016-12-31,,,91521,0930
3,0001564590-17-001258,10-Q,0001001115,GEOSPACE TECHNOLOGIES CORP,3829,2017-02-09,2016-12-31,,,77040,0930
4,0001104659-17-006054,10-Q,0001001250,ESTEE LAUDER COMPANIES INC,2844,2017-02-02,2016-12-31,,,10153,0630
...,...,...,...,...,...,...,...,...,...,...,...
57506,0001513162-19-000239,10-Q,0000099106,TRANS LUX Corp,3990,2019-11-08,2019-09-30,,,10022,1231
57507,0000099250-19-000013,10-Q,0000099250,"TRANSCONTINENTAL GAS PIPE LINE COMPANY, LLC",4922,2019-10-31,2019-09-30,,,77251,1231
57508,0001206774-19-003657,10-Q,0000099302,TRANSCAT INC,3825,2019-11-05,2019-09-28,,,14624,0328
57509,0000099780-19-000125,10-Q,0000099780,TRINITY INDUSTRIES INC,3743,2019-10-24,2019-09-30,,,75207,1231


In [11]:
# ############### Download HTML into TXT files (NOT RECOMMENDED DUE TO LARGE FILE SIZE)
# for link in web_url:
#     if os.path.exists('..\\filings\\raw\\'+str(accnum[web_url.index(link)])+'.txt') == False:
#         urllib.request.urlretrieve(link, '..\\filings\\raw\\'+str(accnum[web_url.index(link)])+'.txt')