# AcF701 - SEC EDGAR Data Extraction 2
by Dr Liang Jin

- Step 1: access crawler.idx files from SEC EDGAR
- Step 2: re-write crawler data to csv files
- Step 3: retrieve 10K filing information including URLs
- Step 4: extract text from html

## Setup
- import packages
- Global variables (names in all capitals are recommended)

In [None]:
import os
import csv
import time
import random
import re
import requests
from bs4 import BeautifulSoup

In [None]:
# Sample period: since 2005 the SEC has required firms to disclose
# "Item 1A. Risk Factors"
BEG_YEAR, END_YEAR = 2005, 2017

# Working directories, all located under the current working directory:
#   INDEX_DIR - re-formatted index data in csv
#   PAGE_DIR  - parsed page info in csv
#   FORM_DIR  - collected form 10-K in html
CWD = os.getcwd()
INDEX_DIR, PAGE_DIR, FORM_DIR = (
    os.path.join(CWD, folder) for folder in ("index/", "page/", "form/")
)

In [None]:
# create the working directories if they do not exist yet
dirs = [INDEX_DIR, PAGE_DIR, FORM_DIR]

for d in dirs:
    # exist_ok=True makes the cell safe to re-run and avoids the gap between
    # checking for the folder and creating it; missing parent folders are
    # created as well
    os.makedirs(d, exist_ok=True)
    print("{} exists".format(d))

In [None]:
# Some examples
# two quarterly crawler.idx files used for the demonstrations
demo_index_urls = ['https://www.sec.gov/Archives/edgar/full-index/2016/QTR1/crawler.idx',
                   'https://www.sec.gov/Archives/edgar/full-index/2016/QTR2/crawler.idx']
# os.path.join keeps these paths valid even if the directory constants are
# ever defined without a trailing slash
demo_index_csv = os.path.join(INDEX_DIR, "demo.csv")
demo_page_csv = os.path.join(PAGE_DIR, "demo.csv")

## Previously...
- function to get index URLs
- function to re-write index data to csv

In [None]:
# build the list of urls pointing to the individual crawler.idx files
def getIndexURLs(start=2005, end=2017):
    '''
    Input:
        start: first year to include
        end: last year to include (inclusive)

    Output:
        list of crawler.idx URLs, four per year (QTR1 to QTR4),
        ordered by year and then by quarter
    '''
    template = 'https://www.sec.gov/Archives/edgar/full-index/{}/{}/crawler.idx'
    quarters = ('QTR1', 'QTR2', 'QTR3', 'QTR4')
    return [template.format(yr, q)
            for yr in range(start, end + 1)
            for q in quarters]

In [None]:
# Now that we have a list of addresses, let's retrieve the information from
# the crawler.idx files and rewrite the data to csv locally
def writeIndexCSV(url, header_loc=7, firstrow_loc=9):
    '''
    Download one crawler.idx file and save its 10-K records as a CSV file.

    Input:
        url: address of a crawler.idx file; the year and the quarter are
             taken from its last two folders (.../<year>/QTR<n>/crawler.idx)
        header_loc: index of the line holding the column headers
        firstrow_loc: index of the first line of data

    Output:
        <year>Q<n>.csv saved in the current working directory, one row per
        10-K record: company name, form type, CIK, date filed, page URL

    Raises:
        requests.HTTPError: the server answered with an error status
        ValueError: the header line does not contain the expected columns
    '''
    r = requests.get(url, timeout=30)
    # fail loudly on an HTTP error instead of parsing an error page as if it
    # were index data
    r.raise_for_status()
    lines = r.text.splitlines()

    # retrieve the location of columns; the data is laid out in fixed-width
    # columns aligned with the header titles
    header = lines[header_loc] if len(lines) > header_loc else ''
    type_loc = header.find('Form Type')
    cik_loc = header.find('CIK')
    date_loc = header.find('Date Filed')
    url_loc = header.find('URL')

    # str.find returns -1 for a missing title, which would silently slice
    # every row at the wrong place
    if min(type_loc, cik_loc, date_loc, url_loc) < 0:
        raise ValueError("unexpected header in {}".format(url))

    # create file name based on the original idx file
    parts = url.split('/')
    file_name = parts[-3] + "Q" + parts[-2][-1] + ".csv"

    # create and write to csv file; newline='' stops the csv module's own
    # line endings from being translated again (blank rows on Windows)
    with open(file_name, 'w', newline='') as wf:
        writer = csv.writer(wf, delimiter=',')

        # go through lines
        for line in lines[firstrow_loc:]:
            form_type = line[type_loc:cik_loc].strip()

            # let's focus on 10-K files only
            if form_type != '10-K':
                continue

            company_name = line[:type_loc].strip()
            cik = line[cik_loc:date_loc].strip()
            date_filed = line[date_loc:url_loc].strip()
            page_url = line[url_loc:].strip()

            # one row of data per filing, the page URL comes last
            writer.writerow([company_name, form_type, cik, date_filed, page_url])

    print("{} saved".format(file_name))

In [None]:
# work inside the index folder: writeIndexCSV saves into the current directory
os.chdir(INDEX_DIR)

# download the demo index files one by one
# (use getIndexURLs(2016, 2017) instead of demo_index_urls for a longer list)
for url in demo_index_urls:
    writeIndexCSV(url)
    # pause between 3 and 6 seconds before the next request
    time.sleep(random.uniform(3, 6))

## Step 3: Retrieve 10K Filing Information

1. Retrieve and save a single 10-K filing using its filing URL
2. Parse and save a single 10-K filing's URL and associated meta data
3. Read and retrieve 10-K filing page's URL from CSV
4. Loop through every CSV and every record within each CSV

### Step 3.1 Retrieve 10-K filing

### Step 3.2 Parse 10-K filing's URL and associated meta data

In [None]:
# Parse a 10-K form page: the 10-K form URL and other meta data
def parseFormPage(url):
    '''
    Input:
        url: address of a filing's form page

    Output (tuple, in this order):
        filer_cik: 10-digit CIK found in the filer information
        filing_date: text of the field following "Filing Date"
        report_date: text of the field following "Period of Report"
        form_url: full address of the 10-K document

    Raises:
        requests.HTTPError: the server answered with an error status
        ValueError: no 10-digit CIK found in the filer information
        AttributeError / TypeError: an expected page element is missing
    '''

    # get page and create soup
    res = requests.get(url, timeout=30)
    # fail loudly on an HTTP error instead of parsing an error page
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    # parse filer info on 10K page; the link text is expected to end with
    # "<10 digits> (<some text>)"
    filer_div = soup.find('div', {'id': 'filerDiv'})
    filer_text = filer_div.find('span', {'class': 'companyName'}).find('a').get_text()
    cik_match = re.search(r"(\d{10})\s(\(.+\))$", filer_text)
    if cik_match is None:
        raise ValueError("no CIK found on {}".format(url))
    filer_cik = cik_match[1]

    # parse 10K page meta data: each value sits in the div right after its
    # label (string= / find_next are the current names of text= / findNext)
    form_content = soup.find('div', {'class': 'formContent'})

    filing_date = form_content.find('div', string='Filing Date').find_next('div').get_text()
    report_date = form_content.find('div', string='Period of Report').find_next('div').get_text()

    # parse 10-K URL: the link in the row whose cell reads exactly "10-K"
    table = soup.find('table', {'class': 'tableFile', 'summary': 'Document Format Files'})
    href = table.find('td', string='10-K').find_parent('tr').find('a')['href']
    form_url = "https://www.sec.gov" + href

    return filer_cik, filing_date, report_date, form_url

In [None]:
# Testing!!!
url = 'https://www.sec.gov/Archives/edgar/data/1606163/0001144204-16-089184-index.htm'

# parseFormPage returns a tuple:
# (filer_cik, filing_date, report_date, form_url)
row = parseFormPage(url)
row

In [None]:
# let's collect the form page data of every record in an index csv
# and save it to a new csv
def writeFormPage(file, path):
    '''
    Input:
        file: Index CSV; the last column of each row is the form page URL
        path: folder in which the Page CSV is saved

    Output:
        Page CSV saved in `path` under the same file name as `file`; each
        row holds what parseFormPage returns for one record

    Raises:
        ValueError: the Page CSV would overwrite the Index CSV
    '''
    file_path = os.path.join(path, os.path.basename(file))

    # opening the output in 'w' mode empties it, so never let it be the input
    if os.path.abspath(file) == os.path.abspath(file_path):
        raise ValueError("{} would be overwritten".format(file))

    # newline='' on both files, as the csv module asks for
    with open(file, 'r', newline='') as rf, open(file_path, 'w', newline='') as wf:
        reader = csv.reader(rf)
        writer = csv.writer(wf, delimiter=',')

        for line in reader:
            # a blank line is read as an empty row and has no URL to follow
            if not line:
                continue

            url = line[-1]
            writer.writerow(parseFormPage(url))

            # pause between 3 and 6 seconds before the next request
            time.sleep(3 + random.random() * 3)

In [None]:
# switch the working directory to the page folder
os.chdir(PAGE_DIR)

# testing!!! parse the form page of every record in the demo index CSV and
# save the results in PAGE_DIR under the same file name
writeFormPage(demo_index_csv, PAGE_DIR)

### Step 3.3 Retrieve 10-K filing page's URL from CSV (optional)

In [None]:
# Read a CSV into memory as a list of rows, ready for processing
def readPageURLs(file):
    '''
    Input:
        file: CSV file's full path

    Output:
        list of rows, each row being a list of strings (the whole record,
        not only the URL)
    '''
    # newline='' hands the line endings to the csv module untouched, so a
    # quoted field containing a line break is read back exactly as written
    with open(file, 'r', newline='') as infile:
        return list(csv.reader(infile))

In [None]:
# index demo: show the first record of the demo index CSV
readPageURLs(demo_index_csv)[0]

### Step 3.4 Save 10-K form files

In [None]:
def writeFormHTML(file, path):
    '''
    Input:
        file: Form Page CSV; the first two columns are used to name the
              saved file and the last column is the 10-K form URL
        path: save to

    Output:
        Form Document in HTML format, one file per record, saved in a
        sub-folder of `path` named after the CSV file (without ".csv")

    Side effect:
        the current working directory is changed to that sub-folder

    Raises:
        requests.HTTPError: the server answered with an error status
    '''
    # open csv with info including company CIK and 10-K URLs
    # (opened before the directory change below, so a relative `file` works)
    with open(file, 'r', newline='') as rf:

        reader = csv.reader(rf)

        # be ready to create separate folders to store raw HTML files
        base_dir = os.path.splitext(os.path.basename(file))[0]
        dir_path = os.path.join(path, base_dir)

        # create folder if not exists yet (safe to re-run)
        os.makedirs(dir_path, exist_ok=True)

        # change to the target directory
        os.chdir(dir_path)

        # start to read lines in csv
        for line in reader:

            # a blank line is read as an empty row: nothing to download
            if not line:
                continue

            # retrieve info to create file name; in a Page CSV written by
            # writeFormPage these are the filer CIK and the filing date
            filer_id = line[0]
            filing_date = line[1]
            file_name = filer_id + "_" + filing_date + ".html"

            # get html from SEC using the parsed url
            url = line[-1]
            res = requests.get(url, timeout=30)
            # fail loudly rather than saving an error page as a 10-K form
            res.raise_for_status()
            html = res.text
            time.sleep(3 + random.random() * 3)

            # write to a local file; an explicit encoding keeps the result
            # independent of the machine's default encoding
            with open(file_name, 'w', encoding='utf-8') as wf:
                wf.write(html)

In [None]:
# make sure the form folder exists (safe to re-run), then work inside it
os.makedirs(FORM_DIR, exist_ok=True)
os.chdir(FORM_DIR)

In [None]:
# Testing!
# use the demo Page CSV defined in the setup cells instead of an absolute
# path that only exists on one machine
file = demo_page_csv
writeFormHTML(file, FORM_DIR)

## Task 1: now you need to play with these blocks in class

## Feel free to ask questions

## Homework: spend the weekend looping through all the CSV files and storing all the HTML files

In [None]:
# Picking up the quarterly CSV files (named like 2016Q1.csv) in a target folder
for f_name in os.listdir(INDEX_DIR):

    # ignore anything that is not named <year>Q<quarter>.csv
    if not re.search(r'^2\d{3}Q\d\.csv$', f_name):
        continue

    # create full path for this file
    f_path = os.path.join(INDEX_DIR, f_name)

    # do something
    print(f_path)