In [114]:
import glob
import json
import time
from collections import Counter
from pathlib import Path
from sys import getsizeof
from time import strptime, strftime
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

import lxml
import pandas as pd
import regex as re
import requests
from bs4 import BeautifulSoup
from lxml import etree
from requests.exceptions import Timeout, ConnectionError, HTTPError, RequestException
from tqdm.notebook import tqdm

# RSS Feed

In [7]:
def rss_url(start=0, count=100, form_type='10-'):
    """Build the URL of the EDGAR "current filings" Atom feed.

    Args:
        start: Offset of the first entry to request.
        count: Number of entries to request per page.
        form_type: Value sent as the ``type`` query parameter. Defaults to
            ``'10-'``, the filter this notebook has always used.

    Returns:
        The formatted feed URL as a string.
    """
    # `form_type` replaces a local that used to be called `type` and shadowed the builtin.
    return (
        'https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK='
        f'&type={form_type}&company=&dateb=&owner=include'
        f'&start={start}&count={count}&output=atom'
    )

def check_validity(entry: "BeautifulSoup") -> str:
    """Cross-check the accession number found in an entry's <summary> and <id>.

    Args:
        entry: A parsed feed ``<entry>``; only ``entry.find(name).text`` is used.

    Returns:
        The accession number (10 digits, 2 digits, 6 digits, dash-separated)
        when both fields carry the same value.

    Raises:
        ValueError: If either element is missing, contains no accession
            number, or the two accession numbers differ.
    """
    acc_pattern = re.compile(r'(\d{10}\-\d{2}\-\d{6})')

    def extract(tag_name):
        # Fail with a readable message instead of an AttributeError on None.
        tag = entry.find(tag_name)
        match = acc_pattern.search(tag.text) if tag is not None else None
        if match is None:
            raise ValueError(f'No accession number found in <{tag_name}>')
        return match.group()

    acc_1 = extract('summary')
    acc_2 = extract('id')
    if acc_1 != acc_2:
        raise ValueError(f'Invalid accession number: {acc_1} != {acc_2}')
    return acc_1

def get_filing_date(entry: "BeautifulSoup") -> str:
    """Return the first ``YYYY-M-D`` style date found in an entry's <summary>.

    Args:
        entry: A parsed feed ``<entry>``; only ``entry.find('summary').text`` is used.

    Returns:
        The matched date string exactly as it appears in the summary.

    Raises:
        ValueError: If the entry has no <summary> or the summary holds no date.
    """
    summary_tag = entry.find('summary')
    if summary_tag is None:
        raise ValueError('Feed entry has no <summary> element')
    summary = summary_tag.text
    match = re.search(r'(\d{4}\-\d+\-\d+)', summary)
    if match is None:
        # Previously this surfaced as "'NoneType' object has no attribute 'group'".
        raise ValueError(f'No filing date found in summary: {summary!r}')
    return match.group()

def get_url(entry: "BeautifulSoup") -> str:
    """Return the URL of a filing's ``.txt`` file for a feed entry.

    The entry's ``<link href>`` ends in ``-index.htm``; that suffix is swapped
    for ``.txt``.

    Args:
        entry: A parsed feed ``<entry>``; only ``entry.find('link')['href']`` is used.

    Returns:
        The rewritten URL, or the href unchanged when it has no
        ``-index.htm``/``-index.html`` suffix.
    """
    url = entry.find('link')['href']
    # str is immutable: the original called url.replace(...) and discarded the
    # result, so the index-page URL was returned unchanged.
    return re.sub(r'-index\.html?$', '.txt', url)

def get_header(document: "BeautifulSoup") -> dict:
    """Extract selected header fields and the body markup type from a filing.

    Args:
        document: Parsed submission. Only ``document.text`` and
            ``document.find('TEXT')`` are used.

    Returns:
        A dict with the keys ``'acc'``, ``'cik'``, ``'irs'``, ``'form'``,
        ``'coname'``, ``'date'`` and ``'markup'``. A header field whose label
        is not present is returned as an empty string.

    Raises:
        ValueError: If there is no ``TEXT`` element, or the first tag inside
            it is not one of ``html``, ``xml`` or ``xbrl``.
    """
    # key -> (label preceding the value, label that follows the value)
    field_bounds = {
        'acc': ('ACCESSION NUMBER:', 'CONFORMED SUBMISSION TYPE:'),
        'cik': ('CENTRAL INDEX KEY:', 'STANDARD INDUSTRIAL CLASSIFICATION:'),
        'irs': ('IRS NUMBER:', 'STATE OF INCORPORATION:'),
        'form': ('CONFORMED SUBMISSION TYPE:', 'PUBLIC DOCUMENT COUNT:'),
        'coname': ('COMPANY CONFORMED NAME:', 'CENTRAL INDEX KEY:'),
        'date': ('FILED AS OF DATE:', 'DATE AS OF CHANGE:'),
    }

    def between(txt: str, start_label: str, end_label: str) -> str:
        """Return the stripped text between two labels ('' if the first is absent)."""
        start = txt.find(start_label)
        if start == -1:
            # str.find returns -1 on a miss; slicing with it would return
            # unrelated text, so report the field as empty instead.
            return ''
        start += len(start_label)
        end = txt.find(end_label, start)
        if end == -1:
            # Closing label missing: fall back to the rest of the current line.
            end = txt.find('\n', start)
            if end == -1:
                end = len(txt)
        return txt[start:end].strip()

    def identify_markup(document) -> str:
        """Return the lower-cased name of the first tag inside TEXT."""
        markup_list = ['html', 'xml', 'xbrl']
        text_tag = document.find('TEXT')
        if text_tag is None:
            raise ValueError('Invalid markup: no <TEXT> element found')
        node = text_tag.next_element
        # The node right after <TEXT> may have no tag name (e.g. a string);
        # in that case look exactly one node further, as before.
        if node is not None and not getattr(node, 'name', None):
            node = getattr(node, 'next_element', None)
        markup = getattr(node, 'name', None)
        if not markup or markup.lower() not in markup_list:
            raise ValueError(f'Invalid markup: {markup}')
        return markup.lower()

    txt = document.text
    open_marker, close_marker = '<SEC-HEADER>', '</SEC-HEADER>'
    start = txt.find(open_marker)
    end = txt.find(close_marker)
    if start != -1 and end != -1:
        header = txt[start + len(open_marker):end]
    else:
        # The literal markers are not in the text (e.g. markup already
        # stripped): search the whole text for the labels instead.
        header = txt

    header_dict = {key: between(header, *labels) for key, labels in field_bounds.items()}
    header_dict['markup'] = identify_markup(document)
    return header_dict

def soup_to_df(soup: "BeautifulSoup") -> pd.DataFrame:
    """Turn one parsed filing into a single-row DataFrame.

    Args:
        soup: Parsed submission; passed to ``get_header`` and searched for
            its first ``DOCUMENT`` element.

    Returns:
        A one-row DataFrame holding the ``get_header`` fields plus a ``text``
        column with the text of the first ``DOCUMENT`` element.

    Raises:
        ValueError: If the filing has no ``DOCUMENT`` element (or if
            ``get_header`` rejects the filing).
    """
    # get_header() reads the header text AND looks up the TEXT element, so it
    # needs the whole submission. Handing it only the SEC-HEADER tag hides
    # TEXT from its markup detection.
    header = get_header(soup)
    document = soup.find('DOCUMENT')
    if document is None:
        raise ValueError('Filing has no <DOCUMENT> element')
    return pd.DataFrame([{**header, 'text': document.text}])


if __name__ == '__main__':
    HEADERS = {
        'Content-Type': 'application/json; charset=utf-8', 
        'User-Agent': 'cdo@oneline.tec',
        'Accept-Encoding': 'gzip, deflate',
        'Host': 'www.sec.gov'
    }
    # TODO: get last server access time from s3://oneline-access-log/edgar/edgar.log
    # (the link above still needs to be updated)
    LAST_DATE = strptime('2020-01-01', '%Y-%m-%d')

    frames = []  # one single-row DataFrame per downloaded filing
    num = 0      # zero-based feed page; each page holds 100 entries
    count = 0    # entries older than LAST_DATE; must exist before the loop test
    while count == 0:
        req = requests.get(rss_url(start=num*100, count=100), headers=HEADERS)
        # Stop on an HTTP error instead of parsing an error page as a feed.
        req.raise_for_status()
        soup = BeautifulSoup(req.text, 'xml')

        entries = soup.find_all('entry')
        if not entries:
            # Feed exhausted: without this the loop would page forever.
            break
        for entry in entries:
            url, date = get_url(entry), get_filing_date(entry)
            if strptime(date, '%Y-%m-%d') < LAST_DATE:
                count += 1
                continue
            requ = requests.get(url, headers=HEADERS)
            document = BeautifulSoup(requ.text, 'xml')
            frames.append(soup_to_df(document))
        num += 1

    # Concatenate once: rebuilding `df` on every page discarded earlier pages,
    # and concatenating inside the loop is quadratic.
    df = pd.concat(frames) if frames else pd.DataFrame()

<entry>
<title>10-Q - Legacy Housing Corp (0001436208) (Filer)</title>
<link href="https://www.sec.gov/Archives/edgar/data/1436208/000155837022014338/0001558370-22-014338-index.htm" rel="alternate" type="text/html"/>
<summary type="html">
 &lt;b&gt;Filed:&lt;/b&gt; 2022-09-12 &lt;b&gt;AccNo:&lt;/b&gt; 0001558370-22-014338 &lt;b&gt;Size:&lt;/b&gt; 7 MB
</summary>
<updated>2022-09-12T17:30:09-04:00</updated>
<category label="form type" scheme="https://www.sec.gov/" term="10-Q"/>
<id>urn:tag:sec.gov,2008:accession-number=0001558370-22-014338</id>
</entry>


In [None]:
### Fallback for when the plain `requests` headers get blocked

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
options.add_argument("lang=ko_KR")
# Recent Selenium 4 releases no longer accept the driver path as the first
# positional argument; the keyword-only form works on old and new versions.
driver = webdriver.Chrome(options=options)

try:
    # rss_url is a function: it must be called to get the URL string.
    driver.get(rss_url())
    html = driver.page_source
finally:
    # Always shut the browser down so headless Chrome processes do not pile up.
    driver.quit()
soup = BeautifulSoup(html, 'lxml')

## 전처리 이후 s3 업로드, bigquery 로드 확인

In [None]:
# Write the scraped filings to a local parquet file, then upload it to S3.
path = Path(r'whatasamplepath/path')
path.mkdir(parents=True, exist_ok=True)  # to_parquet does not create missing directories
date = strftime('%Y%m%d', LAST_DATE)
name = f'edgar_scrap_{date}.parquet'
df.to_parquet(path / name, index=False)

import boto3
import botocore
from sys import getsizeof

ARN = 'arn:aws:s3:::oneline-edgar'
S3_BUCKET = 'oneline-edgar'
KEY_LIST = "'0123','6789'"
S3_FILE = '1994_0.parquet'
REGION = 'ap-northeast-2'   # Seoul

# awskey.txt holds the access key id on the first line and the secret key on
# the second. Read it in a `with` block so the handle is closed, and ignore
# blank lines so a trailing newline does not break the unpacking.
with open('awskey.txt') as key_file:
    AWSAccessKeyId, AWSSecretKey = (line.strip() for line in key_file if line.strip())

# boto3.client() returns a client object, not a boto3.Session.
s3 = boto3.client('s3', region_name=REGION, aws_access_key_id=AWSAccessKeyId, aws_secret_access_key=AWSSecretKey)
# Pass the file name as str: older boto3 releases reject pathlib.Path here.
s3.upload_file(str(path / name), S3_BUCKET, name)

# 하단 코드 개수 필요

# Company reports

In [None]:
MAX_ATTEMPTS = 5    # give up on a CIK after this many failed requests
RETRY_DELAY = 0.5   # seconds to wait before every request (also throttles the crawl)

# HEADERS pins Host to www.sec.gov; drop it so requests sends the correct Host
# header for data.sec.gov.
submission_headers = {key: value for key, value in HEADERS.items() if key.lower() != 'host'}

filings_frames = []  # per-company frames, concatenated once after the loop

for cik in tqdm(cik_df['cik']):
    # The submissions endpoint expects a 10-digit, zero-padded CIK.
    # NOTE(review): assumes cik_df['cik'] holds ints or digit strings - confirm.
    url = f'https://data.sec.gov/submissions/CIK{str(cik).zfill(10)}.json'

    for attempt in range(1, MAX_ATTEMPTS + 1):
        time.sleep(RETRY_DELAY)
        try:
            response = requests.get(url, headers=submission_headers, timeout=30)
        except RequestException as error:
            # Timeout, ConnectionError and HTTPError are RequestException subclasses.
            print(f"Error (CIK {cik}, attempt {attempt}): {error}")
            continue

        if response.status_code != 200:
            print(f"Not 200 (CIK {cik}, attempt {attempt}): {response.status_code}")
            continue

        edgar = response.json()
        name = edgar['name']
        ticker = ";".join(edgar['tickers'])
        exchange = ";".join(edgar['exchanges'])

        filings = pd.DataFrame(edgar['filings']['recent'])
        # .copy() so the column assignments below act on an independent frame.
        filings = filings[filings['form'].isin(['10-Q', '10-K'])].copy()

        filings['cik'] = cik
        filings['name'] = name
        filings['ticker'] = ticker
        filings['exchange'] = exchange

        filings_frames.append(filings)
        break
    else:
        # Bounded retries: the old `while True` could spin forever on a bad CIK.
        print(f"Giving up on CIK {cik} after {MAX_ATTEMPTS} attempts")

filings_df = (
    pd.concat(filings_frames).reset_index(drop=True) if filings_frames else pd.DataFrame()
)

# cik_list.extend(cik)
# accessionNumber_list.extend(accessionNumber)
# form_list.extend(form)
# primaryDocument_list.extend(primaryDocument)

In [None]:
# Keep only the columns that are stored downstream, in their final order;
# every other column is dropped.
column_order = [
    'cik', 'name', 'ticker', 'exchange',
    'accessionNumber', 'filingDate', 'reportDate', 'acceptanceDateTime',
    'act', 'form', 'fileNumber', 'filmNumber',
    'size', 'isXBRL', 'isInlineXBRL',
    'primaryDocument', 'primaryDocDescription',
]
filings_df = filings_df[column_order]

In [None]:
# Print the column dtypes and non-null counts; memory_usage='deep' requests
# pandas' deep (object-aware) memory accounting rather than the quick estimate.
filings_df.info(memory_usage='deep')

In [None]:
# Persist the filings index as UTF-8 CSV (without the row index) so the
# "Reports Content" section can reload it.
filings_df.to_csv(
    "filings_df.csv",
    encoding="utf-8",
    index=False,
)

# Reports Content

In [2]:
# Reload the filings index written by the previous section.
filings_df = pd.read_csv("filings_df.csv", encoding="utf-8")

# Some filings have no primaryDocument; drop those rows.
filings_df = filings_df.dropna(subset=['primaryDocument']).reset_index(drop=True)

# Left-pad the CIK to 10 characters.
filings_df['cik'] = filings_df['cik'].apply(lambda raw_cik: str(raw_cik).zfill(10))

# Missing `act` becomes -1 so the column can be cast to int.
filings_df['act'] = filings_df['act'].fillna(-1).apply(int)

# filmNumber: missing -> -1, commas removed, then truncated to int via float
# (values such as 17010285.17010284 occur in the data).
film_numbers = filings_df['filmNumber'].fillna(-1)
film_numbers = film_numbers.apply(lambda raw_film: str(raw_film).replace(",", ""))
filings_df['filmNumber'] = film_numbers.apply(lambda cleaned_film: int(float(cleaned_film)))

In [3]:
# Preview the first five cleaned rows (5 is DataFrame.head's default).
filings_df.head()

Unnamed: 0,cik,name,ticker,exchange,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,320193,Apple Inc.,AAPL,Nasdaq,0000320193-22-000059,2022-04-29,2022-03-26,2022-04-28T18:03:58.000Z,34,10-Q,001-36743,22868650,6140838,1,1,aapl-20220326.htm,10-Q
1,320193,Apple Inc.,AAPL,Nasdaq,0000320193-22-000007,2022-01-28,2021-12-25,2022-01-27T18:00:58.000Z,34,10-Q,001-36743,22564628,5669748,1,1,aapl-20211225.htm,10-Q
2,320193,Apple Inc.,AAPL,Nasdaq,0000320193-21-000105,2021-10-29,2021-09-25,2021-10-28T18:04:28.000Z,34,10-K,001-36743,211359752,10502096,1,1,aapl-20210925.htm,10-K
3,320193,Apple Inc.,AAPL,Nasdaq,0000320193-21-000065,2021-07-28,2021-06-26,2021-07-27T18:03:42.000Z,34,10-Q,001-36743,211119137,8446381,1,1,aapl-20210626.htm,10-Q
4,320193,Apple Inc.,AAPL,Nasdaq,0000320193-21-000056,2021-04-29,2021-03-27,2021-04-28T18:02:54.000Z,34,10-Q,001-36743,21866148,8468959,1,1,aapl-20210327.htm,10-Q


### API 우선 100개의 cik에 대해서 진행

In [4]:
SAMPLE_SIZE = 100   # number of distinct CIKs to keep for the trial run
SAMPLE_SEED = 7     # fixed seed so the same CIKs are picked on every run

unique_ciks = filings_df[['cik']].drop_duplicates().reset_index(drop=True)
# DataFrame.sample raises ValueError when n exceeds the number of rows, so cap
# the request at the number of CIKs that actually exist.
sampled_ciks = unique_ciks.sample(
    min(SAMPLE_SIZE, len(unique_ciks)), random_state=SAMPLE_SEED
)['cik'].tolist()

filings_df = filings_df[filings_df['cik'].isin(sampled_ciks)].reset_index(drop=True)

In [5]:
# Distinct CIKs in first-appearance order; drives the scraping loop below.
cik_list = filings_df['cik'].drop_duplicates().tolist()

In [7]:
# Header sent with the document requests in the scraping loop.
headers = {"User-agent":'cmo@onelinetec.com'}

# NOTE(review): `service_account` and `bigquery` are not imported in the part
# of the notebook visible here - presumably
# `from google.oauth2 import service_account` and
# `from google.cloud import bigquery`; confirm and add them to the import cell.

# Pick the service-account key from the working directory. Sorting makes the
# choice deterministic, and an explicit error replaces the bare IndexError
# raised when no *.json file is present.
key_files = sorted(glob.glob('*.json'))
if not key_files:
    raise FileNotFoundError('No service-account key (*.json) found in the working directory')
key_path = key_files[0]
credentials = service_account.Credentials.from_service_account_file(key_path)
project_id = 'fine-scene-356009'
table_id = 'cik_filings.cik_contents'
client = bigquery.Client(credentials = credentials, project = credentials.project_id)

In [8]:
%%time

# Empty the target table so the scraping loop below appends into a clean table.
sql = f"""
DELETE FROM `{project_id}.{table_id}` WHERE true;
"""

# client.query() only submits the job; wait for it to finish so the appends
# further down cannot start while the DELETE is still running.
delete_job = client.query(sql)
delete_job.result()
delete_job

CPU times: user 90.6 ms, sys: 18.5 ms, total: 109 ms
Wall time: 886 ms


QueryJob<project=fine-scene-356009, location=asia-northeast3, id=6c91ecd3-039b-4bd8-b74c-4fb2dc1bd1a7>

In [None]:
# XPath expressions for the elements stripped from each filing before its text is extracted.
del_elements = ['.//table', '//*[@id="DSPFPageNumber"]', "//div[@style='text-align:center']", './/head', "//div[@style='display:none']"]
# Tokens (checkbox glyphs) dropped from the extracted text.
stop_words = ['☐','☒']


def remove_stopwords(tokens):
    """Strip every token and drop the empty ones and the stop words.

    Args:
        tokens: Iterable of strings (here, the text nodes of a document).

    Returns:
        A list of stripped tokens, in the original order, that are neither
        empty nor listed in ``stop_words``.
    """
    cleaned_tokens = []
    for token in tokens:
        stripped = token.strip()
        # Compare AFTER stripping: a glyph surrounded by whitespace (' ☒\n')
        # used to slip past the stop-word check.
        if stripped and stripped not in stop_words:
            cleaned_tokens.append(stripped)
    return cleaned_tokens


REQUEST_PAUSE = 0.1     # seconds between document downloads, to throttle the crawl
REQUEST_TIMEOUT = 60    # seconds before a stalled download is abandoned


def _remove_keep_tail(node):
    """Detach `node` from its parent without losing the text that follows it."""
    parent = node.getparent()
    if parent is None:
        return
    if node.tail:
        # lxml stores the text after an element in its .tail, and remove()
        # drops that too. Re-attach it to the preceding node first.
        previous = node.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + node.tail
        else:
            parent.text = (parent.text or '') + node.tail
    parent.remove(node)


for cik in tqdm(cik_list):
    temp_df = filings_df[filings_df['cik']==cik].reset_index(drop = True)
    name = temp_df['name'][0]
    content_list = []  # one cleaned text per filing, aligned with temp_df rows

    for accession, primaryDocument in zip(temp_df['accessionNumber'], temp_df['primaryDocument']):
        accessionNumber = accession.replace("-","")

        url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{accessionNumber}/{primaryDocument}'
        time.sleep(REQUEST_PAUSE)
        response = requests.get(url, headers = headers, timeout = REQUEST_TIMEOUT)
        if response.status_code != 200:
            # Do not store an error page as filing content; keep the row
            # aligned with an empty string and report the failure.
            print(f"{url} returned {response.status_code}; content left empty")
            content_list.append('')
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        try:
            tree = etree.HTML(str(soup))
        except ValueError: # Unicode strings with encoding declaration are not supported
            tree = etree.HTML(bytes(str(soup), encoding='utf-8'))
        if tree is None:
            # etree.HTML returns None when nothing could be parsed.
            content_list.append('')
            continue

        for xpath in del_elements:
            for node in tree.xpath(xpath):
                if xpath == './/table' and len(node) == 1:
                    # Tables with exactly one child are kept: they hold text
                    # laid out as a table. All other tables are deleted.
                    continue
                # TODO: can the pagination removal be generalized?
                _remove_keep_tail(node)

        content = " ".join(remove_stopwords(tree.xpath('.//text()')))
        content = re.sub(r"\s+", " ", content)
        content_list.append(content)

    temp_df['content'] = content_list
    temp_df.to_gbq(table_id, project_id, if_exists='append', credentials = credentials)
    print(f"{cik}-{name} completed")