# Header Scraping from EDGAR database

### Import modules

In [2]:
import re
import requests
import unicodedata
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

### Data Import

In [3]:
df1 = pd.read_csv('data/PublicFirms_NorthAmericaOnly.csv')
df2 = pd.read_excel('data/cik_ticker_file.xlsx')
df3 = pd.read_csv('data/sec__edgar_company_info.csv')
df4 = pd.read_csv('data/PublicFirms_Global.csv')

cik1 = list(df1[df1['cik'].notnull()]['cik'].astype('int'))
cik2 = list(df2['CIK'])
cik3 = list(df3['Company CIK Key'])
cik4 = list(df4[df4['cik'].notnull()]['cik'].astype('int'))
cik_numbers = cik1+ cik2+cik3+cik4
cik_numbers = list((set(cik_numbers)))

In [4]:
def restore_windows_1252_characters(restore_string):
    """
        Replace C1 control characters in the Unicode string s by the
        characters at the corresponding code points in Windows-1252,
        where possible.
    """

    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # No character at the corresponding code point: remove it.
            return ''
        
    return re.sub(r'[\u0080-\u0099]', to_windows_1252, restore_string)

In [7]:

# master cik_list
cik_master_list = {}

# loop through each cik number
for cik_number in cik_numbers:

    # base URL for the SEC EDGAR browser
    endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

    # define our parameters dictionary
    param_dict = {'action':'getcompany',
                  'CIK':cik_number,
                  'type':'10-k',
                  'owner':'exclude',
                  'start':'',
                  'output':'atom',
                  'count':'10'}
    
    # request the url, and then parse the response.
    response = requests.get(url = endpoint, params = param_dict)
    soup = BeautifulSoup(response.content, 'lxml')


    # find all the entry tags
    entries = soup.find_all('entry')

    # initalize our list for storage
    master_list_xml = []
    if len(entries) > 0:
        # loop through each found entry, remember this is only the first two
        entry = entries[0]
        if entry.find('filing-date').text[:4] >= '2008':
            # create a new dictionary
            entry_dict = {}

            # store the file info
            entry_dict['file_number_href'] = entry.find('file-number-href').text
            entry_dict['filing_date'] =  entry.find('filing-date').text

             # request the url for the filing page, and then parse the response.
            response_filing_page = requests.get(entry.find('filing-href').text)
            soup_filing_page = BeautifulSoup(response_filing_page.content, 'lxml')

            # find the element that has the specified text.
            element = soup_filing_page.find(text ='Complete submission text file')

            # text file href
            text_file_href = 'https://www.sec.gov' + element.find_next('td').find('a')['href']

            # add to the dictionary
            entry_dict['text_file_href'] = text_file_href

            ## store in the master list
            #master_list_xml.append(entry_dict)

            # store the master list in the CIK dictionary
            cik_master_list[cik_number] = entry_dict




SSLError: HTTPSConnectionPool(host='www.sec.gov', port=443): Max retries exceeded with url: /cgi-bin/browse-edgar?action=getcompany&CIK=1048723&type=10-k&owner=exclude&start=&output=atom&count=10 (Caused by SSLError(SSLError("bad handshake: SysCallError(-1, 'Unexpected EOF')",),))

In [14]:
import json
with open('cik_master.json', 'w') as fp:
    json.dump(cik_master_list, fp)

In [20]:
cik_master_list

{20: {'file_number_href': 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&filenum=000-09576&owner=exclude&count=10',
  'filing_date': '2010-03-15',
  'text_file_href': 'https://www.sec.gov/Archives/edgar/data/20/000095012310024631/0000950123-10-024631.txt'},
 1048620: {'file_number_href': 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&filenum=000-29643&owner=exclude&count=10',
  'filing_date': '2014-03-19',
  'text_file_href': 'https://www.sec.gov/Archives/edgar/data/1048620/000110465914020938/0001104659-14-020938.txt'},
 1048685: {'file_number_href': 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&filenum=001-32453&owner=exclude&count=10',
  'filing_date': '2015-04-15',
  'text_file_href': 'https://www.sec.gov/Archives/edgar/data/1048685/000119312515131468/0001193125-15-131468.txt'},
 1048695: {'file_number_href': 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&filenum=000-26041&owner=exclude&count=10',
  'filing_date': '2018-11-21',
  

In [15]:
# initialize an empty dictionary
company_dict = {}

for cik in cik_master_list: 
    #define the url to specific html_text file
    new_html_text = r""+cik_master_list[cik]['text_file_href']

    #grab the response
    response = requests.get(new_html_text)

    # pass it through the parser
    soup = BeautifulSoup(response.content,'lxml')
    


    # add a new level to our master_dict, this will also be a dictionary
    master_filings_dict = {}

    master_filings_dict['sec_header_content'] = {}

    # grab the sec-header tag, so we can store it in the master filing dictionary
    sec_header_tag = soup.find('sec-header')

    # store the tag in the dictinoary
    master_filings_dict['sec_header_content']['sec_header_code'] = sec_header_tag
    
    if sec_header_tag != None:
        # store the header as a text 
        header_txt = sec_header_tag.text


        # initialize a smaller dictionary to store the company header
        header_dict = {}


        for idx, line in enumerate(header_txt.splitlines()[12:]):

            temp = re.split(r'\t+', line.strip().rstrip('\t'))
            if len(temp)>1:
                header_dict[temp[0][:-1]] = temp[1]

        company_dict[cik] = header_dict

In [16]:
comp_df = pd.DataFrame(company_dict).T

In [17]:
comp_df

Unnamed: 0,ABS ASSET CLASS:,BUSINESS PHONE,CENTRAL INDEX KEY,CITY,COMPANY CONFORMED NAME,CONFORMED PERIOD OF REPORT,DATE AS OF CHANGE,DATE OF NAME CHANGE,FILED AS OF DATE,FILM NUMBER,...,FORMER CONFORMED NAME,IRS NUMBER,SEC ACT,SEC FILE NUMBER,STANDARD INDUSTRIAL CLASSIFICATION,STATE,STATE OF INCORPORATION,STREET 1,STREET 2,ZIP
20,,8562563318,0000000020,PITMAN,K TRON INTERNATIONAL INC,,,,,10681142,...,,221759452,1934 Act,000-09576,"INDUSTRIAL INSTRUMENTS FOR MEASUREMENT, DISPLA...",NJ,NJ,ROUTE 55 & 553,P O BOX 888,08071-0888
1048620,,952-215-0660,0001048620,MINNEAPOLIS,GRANITE CITY FOOD & BREWERY LTD.,,,19991112,,14704127,...,GRANITE CITY FOOD & BREWERY LTD,411883639,1934 Act,000-29643,RETAIL-EATING PLACES [5812],MN,MN,701 XENIA AVENUE SOUTH,SUITE 120,55416
1048685,,908-497-9610,0001048685,CRANFORD,METALICO INC,,,19971029,,15772539,...,METALICO INC /NJ,000000000,1934 Act,001-32453,SECONDARY SMELTING & REFINING OF NONFERROUS ME...,NJ,,186 NORTH AVENUE EAST,908-497-9610,07016
1048695,,2062725555,0001048695,SEATTLE,F5 NETWORKS INC,,,19990305,,181197855,...,F5 LABS INC,911714307,1934 Act,000-26041,COMPUTER COMMUNICATIONS EQUIPMENT [3576],WA,WA,401 ELLIOT AVE WEST,STE 500,98119
1048701,,604-525-2386,0001048701,COQUITLAM,AVANI INTERNATIONAL GROUP INC //,,,19971029,,11539231,...,AVANI INTERNATIONAL GROUP INC,880367866,1934 Act,001-14415,BOTTLED & CANNED SOFT DRINKS CARBONATED WATERS...,A1,NV,"#328, 17 FAWCETT ROAD",,V3K 6V2
1048789,,9724444900,0001048789,IRVING,FelCor Lodging LP,,,19971030,,19648887,...,FELCOR SUITES LP,752544994,1934 Act,333-39595-01,REAL ESTATE INVESTMENT TRUSTS [6798],TX,DE,545 E. JOHN CARPENTER FREEWAY,SUITE 1300,75062
1048911,,9018187500,0001048911,MEMPHIS,FEDEX CORP,,,19971103,,191030425,...,FDX CORP,621721435,1934 Act,001-15829,AIR COURIER SERVICES [4513],TN,DE,942 SOUTH SHADY GROVE ROAD,,38120-
1049011,,011886225061688,0001049011,"TAIPEI, TAIWAN ROC",KID CASTLE EDUCATIONAL CORP,,,19971104,,09686957,...,OMNI DOORS INC,592549529,1934 Act,333-39629,WHOLESALE-PROFESSIONAL & COMMERCIAL EQUIPMENT ...,F5,FL,7TH FLOOR,127-1 SUNG CHIANG ROAD,00000
1049108,,9186607700,0001049108,TULSA,DOLLAR THRIFTY AUTOMOTIVE GROUP INC,,,,,12648858,...,,731356520,1934 Act,001-13647,SERVICES-AUTO RENTAL & LEASING (NO DRIVERS) [7...,OK,DE,5330 EAST 31ST STREET,,74135
1049210,,858 431-8500,0001049210,SAN DIEGO,VERENIUM CORP,,,19991201,,13996154,...,DIVERSA CORP,223297375,1934 Act,000-29173,INDUSTRIAL ORGANIC CHEMICALS [2860],CA,DE,3550 JOHN HOPKINS COURT,,92121


In [18]:
comp_df.to_csv('company_header_dict.csv')

In [19]:
comp_df

Unnamed: 0,ABS ASSET CLASS:,BUSINESS PHONE,CENTRAL INDEX KEY,CITY,COMPANY CONFORMED NAME,CONFORMED PERIOD OF REPORT,DATE AS OF CHANGE,DATE OF NAME CHANGE,FILED AS OF DATE,FILM NUMBER,...,FORMER CONFORMED NAME,IRS NUMBER,SEC ACT,SEC FILE NUMBER,STANDARD INDUSTRIAL CLASSIFICATION,STATE,STATE OF INCORPORATION,STREET 1,STREET 2,ZIP
20,,8562563318,0000000020,PITMAN,K TRON INTERNATIONAL INC,,,,,10681142,...,,221759452,1934 Act,000-09576,"INDUSTRIAL INSTRUMENTS FOR MEASUREMENT, DISPLA...",NJ,NJ,ROUTE 55 & 553,P O BOX 888,08071-0888
1048620,,952-215-0660,0001048620,MINNEAPOLIS,GRANITE CITY FOOD & BREWERY LTD.,,,19991112,,14704127,...,GRANITE CITY FOOD & BREWERY LTD,411883639,1934 Act,000-29643,RETAIL-EATING PLACES [5812],MN,MN,701 XENIA AVENUE SOUTH,SUITE 120,55416
1048685,,908-497-9610,0001048685,CRANFORD,METALICO INC,,,19971029,,15772539,...,METALICO INC /NJ,000000000,1934 Act,001-32453,SECONDARY SMELTING & REFINING OF NONFERROUS ME...,NJ,,186 NORTH AVENUE EAST,908-497-9610,07016
1048695,,2062725555,0001048695,SEATTLE,F5 NETWORKS INC,,,19990305,,181197855,...,F5 LABS INC,911714307,1934 Act,000-26041,COMPUTER COMMUNICATIONS EQUIPMENT [3576],WA,WA,401 ELLIOT AVE WEST,STE 500,98119
1048701,,604-525-2386,0001048701,COQUITLAM,AVANI INTERNATIONAL GROUP INC //,,,19971029,,11539231,...,AVANI INTERNATIONAL GROUP INC,880367866,1934 Act,001-14415,BOTTLED & CANNED SOFT DRINKS CARBONATED WATERS...,A1,NV,"#328, 17 FAWCETT ROAD",,V3K 6V2
1048789,,9724444900,0001048789,IRVING,FelCor Lodging LP,,,19971030,,19648887,...,FELCOR SUITES LP,752544994,1934 Act,333-39595-01,REAL ESTATE INVESTMENT TRUSTS [6798],TX,DE,545 E. JOHN CARPENTER FREEWAY,SUITE 1300,75062
1048911,,9018187500,0001048911,MEMPHIS,FEDEX CORP,,,19971103,,191030425,...,FDX CORP,621721435,1934 Act,001-15829,AIR COURIER SERVICES [4513],TN,DE,942 SOUTH SHADY GROVE ROAD,,38120-
1049011,,011886225061688,0001049011,"TAIPEI, TAIWAN ROC",KID CASTLE EDUCATIONAL CORP,,,19971104,,09686957,...,OMNI DOORS INC,592549529,1934 Act,333-39629,WHOLESALE-PROFESSIONAL & COMMERCIAL EQUIPMENT ...,F5,FL,7TH FLOOR,127-1 SUNG CHIANG ROAD,00000
1049108,,9186607700,0001049108,TULSA,DOLLAR THRIFTY AUTOMOTIVE GROUP INC,,,,,12648858,...,,731356520,1934 Act,001-13647,SERVICES-AUTO RENTAL & LEASING (NO DRIVERS) [7...,OK,DE,5330 EAST 31ST STREET,,74135
1049210,,858 431-8500,0001049210,SAN DIEGO,VERENIUM CORP,,,19991201,,13996154,...,DIVERSA CORP,223297375,1934 Act,000-29173,INDUSTRIAL ORGANIC CHEMICALS [2860],CA,DE,3550 JOHN HOPKINS COURT,,92121
