In [None]:
# make simple web crawler
import requests
import re
import os
from pathlib import Path
from typing import Union
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from multiprocessing import Pool # Pool import하기# selenium crawler
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Root data directory. The Windows path is the alternative location, kept for
# reference.
# dir = Path(r"C:\Users\wonhyeong\workings\data\10X\cleaned") # on office
dir = Path(r"/Users/jowonhyeong/Desktop/workspace/data") # on office
index_dir = dir / 'index.pkl'
# NOTE(review): unpickling can execute arbitrary code; only load an index file
# that comes from a trusted source.
index: pd.DataFrame = pd.read_pickle(index_dir)

# Distinct CIK values from the index, as strings, and one sec.report URL each.
cik_list = [str(cik) for cik in index['cik'].unique()]
url_cast = 'https://sec.report/CIK/'
url_list = [url_cast + cik for cik in cik_list]

# Rows whose `name` column equals the string "-99"
# (presumably a missing-name sentinel — confirm).
ninety = index.query('name == "-99"')

In [None]:
def get_url(df):
    """Build the EDGAR filing-index URL for every row of ``df``.

    Args:
        df: frame with a ``cik`` column and an ``acc`` column holding the
            dashed accession number as a string.

    Returns:
        A list of ``(acc, url)`` tuples in row order, where ``url`` is
        ``<base>/<cik>/<acc without dashes>/<acc>-index.html``.
    """
    base = 'https://www.sec.gov/Archives/edgar/data'
    return [
        (rec.acc, f"{base}/{rec.cik!s}/{rec.acc.replace('-', '')}/{rec.acc}-index.html")
        for rec in df.itertuples()
    ]

# Rebinds `url_list`: from here on it holds (accession, index-page URL) pairs
# for the `ninety` rows instead of the sec.report URLs built earlier.
url_list = get_url(ninety)

In [None]:
# Headless Chrome session used by `get_data` below.
options = Options()
for chrome_arg in (
    'headless',
    'window-size=1920x1080',
    "disable-gpu",
    "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "lang=ko_KR",  # Korean locale
):
    options.add_argument(chrome_arg)
# The chromedriver binary is expected next to the data files.
driver = webdriver.Chrome(dir / 'chromedriver', options=options)
# Parsed pages keyed by accession number; filled in by `get_data`.
data = {}

def get_data(row):
    """Load one filing index page and store its parsed HTML.

    Args:
        row: an ``(acc, url)`` pair.

    Side effects:
        Navigates the module-level ``driver`` to ``url``, stores the parsed
        page in the module-level ``data`` dict under ``acc``, and prints a
        progress line.
    """
    acc, url = row
    driver.get(url)
    data[acc] = BeautifulSoup(driver.page_source, 'html.parser')
    print(acc, 'done')

# Multiprocessing variant, left disabled; pages are fetched one by one instead.
# NOTE(review): `get_data` writes to the module-level `data` dict and uses the
# single shared `driver`, so presumably it would not work unchanged inside
# Pool workers — confirm before re-enabling.
# with Pool(12) as p:
#     p.apply(get_data, url_list)

for i in url_list:
    get_data(i)

In [None]:
# Shallow copy: a new dict that references the same parsed page objects.
data_backup = data.copy()

In [None]:
# Parse the filing index pages (sec.gov/Archives/edgar/data/.../...-index.html)
# to rebuild the SEC HEADER data.

def _parsing_addr(soup: BeautifulSoup) -> dict:
    dic = {}
    address = soup.find_all('div', 'mailer')
    addr = address[1].find_all('span').__iter__()
    dic['STREET 1'] = next(addr).text
    city, state, *_ = next(addr).text.strip().split(' ')
    zip_num = _[0] if len(_) else None
    dic['CITY'] = city
    dic['STATE'] = state
    dic['ZIP'] = zip_num
    try: 
      phone_No = next(addr).text
    except StopIteration:
      phone_No = None
    dic['BUSINESS PHONE'] = phone_No
    return dic

def _parsing_filer(filer: BeautifulSoup) -> dict:
    ident = filer.find('p', {'class': 'identInfo'})
    name = filer.find('span', {'class': 'companyName'}).text
    coname = name.split('(')[0].strip()
    cik = name.split('(')[1].split(')')[1].split(':')[1].strip()
    addr = _parsing_addr(filer)
    # find div that inner text contains 'Business Address'
    info = map(lambda x: re.sub('[^A-Za-z0-9\-\s\/\.]+', '', x.text) , ident.contents)
    info = [x.strip() for x in info if 0<len(x.strip())<25]
    filer_data = {k: v for k, v in zip(info[::2], info[1::2])}
    company_data = {}
    company_data['COMPANY CONFORMED NAME'] = coname
    company_data['CENTRAL INDEX KEY'] = cik
    company_data['STANDARD INDUSTRIAL CLASSIFICATION'] = filer_data.get('SIC')
    company_data['IRS NUMBER'] = filer_data.get('IRS No.')
    company_data['STATE OF INCORPORATION'] = filer_data.get('State of Incorp.')
    company_data['FISCAL YEAR END'] = filer_data.get('Fiscal Year End')
    filing_values = {}
    filing_values['FORM TYPE'] = filer_data.get('Type')
    filing_values['SEC ACT'] = filer_data.get('Act')
    filing_values['SEC FILE NUMBER'] = filer_data.get('File No.')
    filing_values['FILM NUMBER'] = filer_data.get('Film No.')

    filing = {}
    filing['COMPANY DATA'] = company_data
    filing['FILING VALUES'] = filing_values
    filing['ADDRESS'] = addr
    filing['FORMER COMPANY'] = {'FORMER CONFORMED NAME' : None, 'DATE OF NAME CHANGE': None}
    return filing

def get_form_data(soup):
    """Collect the label/value pairs from the page's ``formDiv`` section.

    Args:
        soup: parsed filing index page.

    Returns:
        A dict mapping the text of each ``div.infoHead`` to the text of the
        ``div.info`` at the same position. Empty when the page has no
        ``div#formDiv``.
    """
    # Use the argument: the previous body read a global named `v`, which only
    # worked when the caller's loop variable happened to have that name.
    form_div = soup.find('div', {'id': 'formDiv'})
    if form_div is None:
        return {}
    heads = form_div.find_all('div', {'class': 'infoHead'})
    bodies = form_div.find_all('div', {'class': 'info'})
    return {head.text: body.text for head, body in zip(heads, bodies)}
    
def get_filer_data(soup: BeautifulSoup) -> list:
    """Parse every filer section of a filing index page.

    Args:
        soup: parsed filing index page.

    Returns:
        One ``_parsing_filer`` result per ``div`` whose id is 'filerDiv', in
        page order. Empty when the page has none.
    """
    # Use the argument: the previous body read a global named `v`, which only
    # worked when the caller's loop variable happened to have that name.
    return [_parsing_filer(section)
            for section in soup.find_all('div', {'id': 'filerDiv'})]

def df_to_text(df):
  """Render each row of the scraped frame as SEC-header style text.

  Args:
    df: frame with at least the columns 'ACCESSION NUMBER',
      'CONFORMED SUBMISSION TYPE', 'Documents', 'Period of Report',
      'Filing Date' and 'FILER'. Each 'FILER' cell is a list of dicts of
      dicts (section name -> field name -> value). The caller's frame is
      not modified.

  Returns:
    A dict mapping each accession number to its header text. When an
    accession number occurs more than once, the last row wins.
  """
  source_cols = ['ACCESSION NUMBER', 'CONFORMED SUBMISSION TYPE', 'Documents', 'Period of Report', 'Filing Date', 'FILER']
  header_cols = ['ACCESSION NUMBER', 'CONFORMED SUBMISSION TYPE', 'PUBLIC DOCUMENT COUNT', 'CONFORMED PERIOD OF REPORT', 'FILED AS OF DATE', 'FILER']
  df = df[source_cols].copy()
  df.columns = header_cols

  dic = {}
  for pos in range(len(df)):
    # Read cells by position: the frame's index labels are not guaranteed to
    # be 0..n-1 (for example after filtering).
    row = {col: df[col].iloc[pos] for col in header_cols}
    parts = [f"{row['ACCESSION NUMBER']}{'.hdr.sgml:'.ljust(15)}{row['FILED AS OF DATE']}\n"]
    for col in header_cols:
      if col != 'FILER':
        parts.append(f"{(col + ':').ljust(35)}{row[col]}\n")
        continue
      for filer in row[col]:
        parts.append('\nFILER:\n\n')
        for section, fields in filer.items():
          parts.append(f'\t{section}:\n')
          for key, value in fields.items():
            parts.append(f"\t\t{(key + ':').ljust(35)}{value}\n")
    dic[row['ACCESSION NUMBER']] = ''.join(parts)
  return dic

if __name__ == '__main__':
  # Build one record per scraped page and create the frame once at the end:
  # `DataFrame.append` was removed in pandas 2.0 and copied the whole frame
  # on every call.
  records = []
  for k, v in data.items():
      v: BeautifulSoup
      dic = {'ACCESSION NUMBER': k}
      dic.update(get_form_data(v))
      dic['FILER'] = get_filer_data(v)
      # The submission type is taken from the first filer; a page without any
      # filer section gets None instead of raising IndexError.
      dic['CONFORMED SUBMISSION TYPE'] = (
          dic['FILER'][0]['FILING VALUES']['FORM TYPE'] if dic['FILER'] else None
      )
      records.append(dic)
  df = pd.DataFrame(records)

  # Header text per accession number.
  text_dict = df_to_text(df)
  

# Address blocks are the divs whose class is 'mailer':
# find the divs whose class is 'mailer',
# then the div whose inner text contains 'Business Address'
# and the div whose inner text contains 'Mailing Address'.


In [None]:
# Inspect the column labels of the frame built from the scraped pages.
df.columns


In [None]:
# Each `FILER` cell holds the list returned by `get_filer_data`; look at the
# 'COMPANY DATA' section of the first filer on the first row.
filer = df['FILER'][0]
filer[0]['COMPANY DATA']

In [None]:
# Preview the generated header text for the first filing. This reuses the
# output of `df_to_text` instead of re-deriving it here: `df` still carries the
# scraped column labels (e.g. 'Filing Date'), while 'FILED AS OF DATE' only
# exists on the renamed copy that `df_to_text` builds internally.
text = text_dict[df['ACCESSION NUMBER'][0]]
print(text)

In [None]:
# NOTE(review): `a` is not defined in any cell shown here, so this cell depends
# on leftover kernel state. Judging by the loop body it is expected to be an
# iterable of dicts — confirm what it should be bound to.
for i in a:
  print('FILER:\n')
  for k, v in i.items():
     print('\t', k.ljust(30), v)

In [None]:
# Length of one header label. NOTE(review): presumably used to choose the
# `ljust` width for the header columns — confirm.
len('CONFORMED SUBMISSION TYPE')

In [None]:
# Display the assembled frame.
df

In [None]:
# make simple web crawler
def get_html(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: address to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body. The HTTP status is not checked, so an
        error page is returned as-is.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    return response.text


def get_soup(html):
    """Parse an HTML string with the 'html.parser' backend and return the tree."""
    return BeautifulSoup(html, 'html.parser')

# Smoke test: fetch one EDGAR filing directory with plain `requests` (no
# browser) and print the parsed HTML.
print(get_soup(get_html('https://www.sec.gov/Archives/edgar/data/933972/000093639296000235/')))

In [None]:
from selenium import webdriver

# Public test page that reports how detectable a headless browser is.
TEST_URL = 'https://intoli.com/blog/making-chrome-headless-undetectable/chrome-headless-test.html'

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
options.add_argument("lang=ko_KR") # Korean locale
# Pass the options through `options=`, as the other webdriver.Chrome(...) calls
# in this notebook do.
driver = webdriver.Chrome('chromedriver', options=options)

try:
    driver.get(TEST_URL)
    # Make `navigator.plugins` report a non-empty list.
    driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5]}})")
    # Override the `navigator.languages` property as well.
    driver.execute_script("Object.defineProperty(navigator, 'languages', {get: function() {return ['ko-KR', 'ko']}})")

    user_agent = driver.find_element_by_css_selector('#user-agent').text
    plugins_length = driver.find_element_by_css_selector('#plugins-length').text
    languages = driver.find_element_by_css_selector('#languages').text

    print('User-Agent: ', user_agent)
    print('Plugin length: ', plugins_length)
    print('languages: ', languages)
finally:
    # Always shut the browser down, even when a step above fails, so no
    # Chrome process is left behind.
    driver.quit()

In [None]:
# Fresh headless Chrome session for the sec.report crawl. This rebinds the
# module-level `driver` and `data` and starts the progress counter at zero.
options = Options()
for chrome_arg in (
    'headless',
    'window-size=1920x1080',
    "disable-gpu",
    "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "lang=ko_KR",  # Korean locale
):
    options.add_argument(chrome_arg)
driver = webdriver.Chrome('chromedriver', options=options)
data = {}
count: int = 0

def get_url_list():
  """Return one sec.report company URL per entry of the global ``cik_list``."""
  url_cast = 'https://sec.report/CIK/'
  return [''.join((url_cast, cik)) for cik in cik_list]

def find_some_tables(panels: list, text: str) -> Union[None, list]:
  """Return the table rows of the first panel whose text contains ``text``.

  Args:
    panels: elements exposing ``.text`` and ``find_elements_by_tag_name``.
    text: substring that identifies the wanted panel.

  Returns:
    A list of ``(first cell text, second cell text)`` tuples, one per ``tr``
    of the matching panel, or None when no panel contains ``text``.
  """
  matching = next((panel for panel in panels if text in panel.text), None)
  if matching is None:
    return None
  rows = []
  for tr in matching.find_elements_by_tag_name('tr'):
    first = tr.find_element_by_xpath('td[1]').text
    second = tr.find_element_by_xpath('td[2]').text
    rows.append((first, second))
  return rows

def find_and_get(url: str):
  """Open ``url`` and hand its company tables to ``callback_func``.

  Prints the URL and the current global ``count``, collects every element with
  class 'panel' through the module-level ``driver``, and passes a dict with
  the 'Company Details' rows under 'details' and the 'Related SEC Filings'
  rows under 'relations' to ``callback_func``.
  """
  print(url)
  driver.get(url)
  print(count)
  panels = driver.find_elements_by_class_name('panel')
  result = {
    'details': find_some_tables(panels, 'Company Details'),
    'relations': find_some_tables(panels, 'Related SEC Filings'),
  }
  callback_func(result)

def callback_func(result):
    """Store one crawl result and advance the global progress counter.

    Args:
        result: value to store under the CIK at position ``count`` of the
            global ``cik_list``.

    Side effects:
        Writes into the global ``data`` dict, increments the global
        ``count`` and prints its new value.
    """
    # `count` is rebound below, so it has to be declared global. Without the
    # declaration Python treats it as a local and the first read raises
    # UnboundLocalError.
    global count
    data[str(cik_list[count])] = result
    count = count + 1
    print(count)

if __name__=='__main__':
    # pool = Pool(processes=12)
    # pool.map(find_and_get, url_list[:10])