In [1]:
import sqlite3 as lite
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import unicodedata
import re
# import pickle
import time
import sqlalchemy
from sqlalchemy import create_engine

In [3]:
# Parameters - Flow Control
SIC_list = ['6029', '6022', '6035', '6036', '6099', '4512', '4513', '4522', '4581', '6163', '6172', '6199', '6200', '6211', '6221', '6282', '0100', '0200', '0700', '0800', '0900']
form_type = '10-K'
filter_filings_from='20170101'
filter_filings_before=None

# Request settings shared by every EDGAR call below. They are defined once,
# outside the loop, so they exist even when SIC_list is empty.
endpoint = r'https://www.sec.gov/cgi-bin/browse-edgar'
user_agent = {'User-agent' : '<evanmcgarry7@gmail.com>', 'Host': 'www.sec.gov'}

# Store CIK Codes & Accession Numbers in a Dictionary - Insert Scraped Data into DataBase
# Shape after this step: {SIC: {CIK: {}}}
CIK_codes_dict = {}

for SIC in SIC_list:
    CIK_codes_dict[SIC] = {}
    params_dict= {'SIC':SIC,
                'myowner':'exclude',
                'action':'getcompany',
                'output':'atom'}

    # First page of the company listing for this SIC code.
    response = requests.get(url=endpoint, headers=user_agent, params=params_dict)

    # Walk the Atom feed page by page: parse the current page, record its CIKs,
    # then follow the rel="next" link until the feed no longer provides one.
    while True:
        soup = BeautifulSoup(response.content, 'xml')
        entries = soup.find_all('entry')

        for entry in entries:
            CIK = entry.find('cik').text
            CIK_codes_dict[SIC][CIK] = {}

        links = soup.find_all('link',{'rel':'next'})
        if not links:
            break

        next_page_link = links[0]['href']
        # Pause between paginated requests.
        time.sleep(0.5)
        response = requests.get(url=next_page_link, headers=user_agent)

# For every collected CIK, list its filings of type `form_type` and resolve the
# direct link to the filed document.
# Resulting shape: CIK_codes_dict[SIC][CIK][accession_no] = {filing metadata}
for SIC, CIK_dict in CIK_codes_dict.items():
    for CIK in CIK_dict.keys():
        endpoint = r'https://www.sec.gov/cgi-bin/browse-edgar'
        params_dict = {'action': 'getcompany',
            'CIK':str(CIK),
            'type':form_type,
            'datea':filter_filings_from,
            'dateb':filter_filings_before,
            'owner':'include',
            'output':'atom'}

        response = requests.get(url=endpoint, headers=user_agent, params=params_dict)
        soup = BeautifulSoup(response.content, 'xml')
        entries = soup.find_all('entry')
        # Overwriting the value of an existing key does not disturb the iteration over keys().
        CIK_dict[CIK] = {}

        name_tag = soup.find('conformed-name')
        if name_tag is None:
            # The response carried no company block; skip this CIK instead of
            # aborting the whole scrape with an AttributeError.
            print("Skipped (no company info returned): ", CIK)
            continue
        conformed_name = name_tag.text
        print("Completed: ", conformed_name)

        for entry in entries:
            filing_type = entry.find('filing-type').text
            # Exact match against the configured form type (was hard-coded to '10-K').
            if filing_type != form_type:
                continue

            accession_no = entry.find('accession-number').text
            filing_href = entry.find('filing-href').text

            # Fetch the filing index page to locate the filed document itself.
            time.sleep(.5)
            filing_response = requests.get(url=filing_href, headers=user_agent)
            filing_soup = BeautifulSoup(filing_response.content, 'xml')
            archive_links = filing_soup.select("a[href*=Archives]")
            if not archive_links:
                # No document link on the index page: skip this filing rather
                # than raising IndexError.
                print("Skipped filing (no document link found): ", accession_no)
                continue

            link = archive_links[0]['href']
            if '/ix?doc=' in link:
                # Drop the 8-character '/ix?doc=' prefix to get the plain document path.
                link = link[8:]

            CIK_dict[CIK][accession_no] = {
                'company_name': conformed_name,
                'cik': CIK,
                'filing_date': entry.find('filing-date').text,
                'filing_type': filing_type,
                'filing_href': filing_href,
                'direct_link_to_doc_filed': 'https://www.sec.gov' + link,
            }

# Create SQLite Database
# Rebuilds the Reports_Data table from scratch and loads one row per filing
# found in CIK_codes_dict ({SIC: {CIK: {accession_no: {...}}}}).
con = lite.connect('SEC_Filings.db')
try:
    # `with con` commits on success / rolls back on error, but does NOT close
    # the connection; the surrounding try/finally takes care of that.
    with con:
        cur = con.cursor()
        cur.execute("DROP TABLE IF EXISTS Reports_Data")
        cur.execute("CREATE TABLE Reports_Data(SIC TEXT, CIK TEXT, Company TEXT, Filing_Date TEXT, Filing_Link TEXT, Accession_No TEXT PRIMARY KEY NOT NULL UNIQUE)")

        # Flatten the nested dictionary into rows in table-column order.
        # CIKs without filings map to an empty dict and contribute no rows.
        rows = [
            (SIC, CIK, filing['company_name'], filing['filing_date'], filing['direct_link_to_doc_filed'], accession_no)
            for SIC, CIK_dict in CIK_codes_dict.items()
            for CIK, filings in CIK_dict.items()
            for accession_no, filing in filings.items()
        ]
        # OR IGNORE: a repeated accession number (the primary key) is skipped, not an error.
        cur.executemany("INSERT OR IGNORE INTO Reports_Data VALUES(?,?,?,?,?,?)", rows)
finally:
    con.close()

https://www.sec.gov/cgi-bin/browse-edgar?SIC=6029&myowner=exclude&action=getcompany&output=atom
https://www.sec.gov/cgi-bin/browse-edgar?SIC=6022&myowner=exclude&action=getcompany&output=atom
https://www.sec.gov/cgi-bin/browse-edgar?SIC=6035&myowner=exclude&action=getcompany&output=atom
https://www.sec.gov/cgi-bin/browse-edgar?SIC=6036&myowner=exclude&action=getcompany&output=atom
https://www.sec.gov/cgi-bin/browse-edgar?SIC=6099&myowner=exclude&action=getcompany&output=atom
https://www.sec.gov/cgi-bin/browse-edgar?SIC=4512&myowner=exclude&action=getcompany&output=atom
https://www.sec.gov/cgi-bin/browse-edgar?SIC=4513&myowner=exclude&action=getcompany&output=atom
https://www.sec.gov/cgi-bin/browse-edgar?SIC=4522&myowner=exclude&action=getcompany&output=atom
https://www.sec.gov/cgi-bin/browse-edgar?SIC=4581&myowner=exclude&action=getcompany&output=atom
https://www.sec.gov/cgi-bin/browse-edgar?SIC=6163&myowner=exclude&action=getcompany&output=atom
https://www.sec.gov/cgi-bin/browse-edgar

In [4]:
# Read the scraped filing index back out of SQLite into a DataFrame.
db_name = "SEC_Filings.db"
table_name = "Reports_Data"

# SQLAlchemy URL for a SQLite file located relative to the working directory.
sqlite_url = f"sqlite:///{db_name}"
engine = create_engine(
    sqlite_url,
    execution_options={"sqlite_raw_colnames": True},
)
df = pd.read_sql_table(table_name, engine)
df

Unnamed: 0,SIC,CIK,Company,Filing_Date,Filing_Link,Accession_No
0,6029,0001527383,BankGuam Holding Co,2022-03-28,https://www.sec.gov/Archives/edgar/data/152738...,0001564590-22-011871
1,6029,0001527383,BankGuam Holding Co,2021-03-22,https://www.sec.gov/Archives/edgar/data/152738...,0001564590-21-014469
2,6029,0001527383,BankGuam Holding Co,2020-03-19,https://www.sec.gov/Archives/edgar/data/152738...,0001564590-20-011686
3,6029,0001527383,BankGuam Holding Co,2019-03-15,https://www.sec.gov/Archives/edgar/data/152738...,0001564590-19-007926
4,6029,0001527383,BankGuam Holding Co,2018-06-29,https://www.sec.gov/Archives/edgar/data/152738...,0001564590-18-016424
...,...,...,...,...,...,...
3969,0900,0001488419,"Sino Agro Food, Inc.",2021-06-23,https://www.sec.gov/Archives/edgar/data/148841...,0001104659-21-084333
3970,0900,0001488419,"Sino Agro Food, Inc.",2021-02-10,https://www.sec.gov/Archives/edgar/data/148841...,0001104659-21-016840
3971,0900,0001488419,"Sino Agro Food, Inc.",2019-04-15,https://www.sec.gov/Archives/edgar/data/148841...,0001144204-19-019679
3972,0900,0001488419,"Sino Agro Food, Inc.",2018-04-17,https://www.sec.gov/Archives/edgar/data/148841...,0001144204-18-020929


In [14]:
# Request headers sent with every document download in this cell (same value as in the scraping cell above).
user_agent = {'User-agent' : '<evanmcgarry7@gmail.com>', 'Host': 'www.sec.gov'} ## REMOVE LATER

# Retrieve Filing Documents as HTML and Clean Data

# Function to Clean HTML
def restore_windows_1252_characters(restore_string):
    """
    Replace C1 control characters in a Unicode string by the characters
    at the corresponding code points in Windows-1252, where possible.

    Text that was really Windows-1252 but got decoded as Latin-1 ends up
    with code points U+0080-U+009F where curly quotes, dashes, etc. were
    meant. Each such character is re-decoded through Windows-1252.

    Parameters
    ----------
    restore_string : str
        Text that may contain C1 control characters.

    Returns
    -------
    str
        The text with every C1 character replaced by its Windows-1252
        counterpart, or removed when Windows-1252 defines no character
        at that code point.
    """
    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # No character at the corresponding code point: remove it.
            return ''
    # The C1 block is U+0080-U+009F; the previous upper bound of U+0099
    # left U+009A-U+009F (e.g. 0x9C 'œ', 0x9F 'Ÿ') unconverted.
    return re.sub(r'[\u0080-\u009f]', to_windows_1252, restore_string)

# Create New DataFrame Column to Store the Normalised Text
df['Filing_Document_Text'] = ''

# Loop Through Filing Links
# Download each filing, strip the markup, normalise the text and store it
# in the row's Filing_Document_Text cell.
for i in range(0, len(df)):
    response = requests.get(df['Filing_Link'][i], headers=user_agent)
    soup = BeautifulSoup(response.content, 'lxml')
    filing_document = soup.find('body')
    if filing_document is None:
        # Nothing parseable came back; leave the cell as '' and move on
        # instead of failing on None.extract().
        print("Skipped (no body found):"+str(df['Company'][i]))
        continue

    # Detach the <body>, then re-parse its markup with the html5 tree builder
    # before extracting the text.
    filing_doc_text = filing_document.extract()
    filing_doc_string = str(filing_doc_text)
    text_bytes = bytes(filing_doc_string, 'utf-8')
    doc_soup = BeautifulSoup(text_bytes, 'html5')
    doc_text = doc_soup.html.body.get_text(' ', strip = True)
    doc_text_normalise = restore_windows_1252_characters(unicodedata.normalize('NFKD', doc_text))

    # Additional Cleaning Steps
    # NOTE(review): NFKD decomposes a precomposed 'Â' into 'A' + a combining
    # accent, so this first replace may never match at this point - confirm.
    doc_text_normalise = doc_text_normalise.replace('Â', '').replace('  ', ' ').replace('\n', ' ')
    doc_text_normalise = doc_text_normalise.lower()

    # Store Cleaned Data in DataFrame
    # Single-step .at assignment writes into df itself (chained df[col][i] = ...
    # may write to a temporary copy), and the plain string is stored rather
    # than a one-element list so the column holds the text itself.
    df.at[i, 'Filing_Document_Text'] = doc_text_normalise
    print("Completed:"+str(df['Company'][i]))



Completed:BankGuam Holding Co
Completed:BankGuam Holding Co
Completed:BankGuam Holding Co
Completed:BankGuam Holding Co
Completed:BankGuam Holding Co
Completed:BankGuam Holding Co
Completed:Esquire Financial Holdings, Inc.
Completed:Esquire Financial Holdings, Inc.
Completed:Esquire Financial Holdings, Inc.
Completed:Esquire Financial Holdings, Inc.
Completed:Esquire Financial Holdings, Inc.
Completed:Esquire Financial Holdings, Inc.
Completed:PEAPACK GLADSTONE FINANCIAL CORP
Completed:PEAPACK GLADSTONE FINANCIAL CORP
Completed:PEAPACK GLADSTONE FINANCIAL CORP
Completed:PEAPACK GLADSTONE FINANCIAL CORP
Completed:PEAPACK GLADSTONE FINANCIAL CORP
Completed:PEAPACK GLADSTONE FINANCIAL CORP
Completed:PEAPACK GLADSTONE FINANCIAL CORP
Completed:SUN BANCORP INC /NJ/
Completed:1ST SOURCE CORP
Completed:1ST SOURCE CORP
Completed:1ST SOURCE CORP
Completed:1ST SOURCE CORP
Completed:1ST SOURCE CORP


KeyboardInterrupt: 

In [None]:
# Convert 10-K Data to String
df['Filing_Document_Text'] = df['Filing_Document_Text'].astype(str)

# Insert into SQLite Database
# Create SQLite Table
db_name2 = "SEC_Filings.db"
table_name2 = "10K_Data"

# Build the engine from this cell's own db_name2 (previously it used db_name
# from an earlier cell, leaving db_name2 unused and the cell dependent on
# hidden state).
engine2 = sqlalchemy.create_engine('sqlite:///' + db_name2, execution_options={"sqlite_raw_colnames": True})
# if_exists='replace' drops and recreates the table on every run.
df.to_sql(table_name2, engine2, if_exists='replace', index=False)


3917