In [1]:
import requests
import pandas as pd
from collections import Counter
import json
from tqdm.notebook import tqdm
from requests.exceptions import Timeout, ConnectionError, HTTPError, RequestException
import lxml
from lxml import etree
from bs4 import BeautifulSoup
import re

import time

#pip install pandas-gbq
from google.oauth2 import service_account
import pandas_gbq

import warnings
warnings.filterwarnings("ignore")

import glob
from google.cloud import bigquery
from google.oauth2 import service_account

# CIK LIST

In [None]:
headers = {"User-agent":'cmo@onelinetec.com'}

In [None]:
%%time

url = 'https://www.sec.gov/files/company_tickers_exchange.json'
edgar = requests.get(url, headers = headers)

if edgar.status_code != 200:
    print("Not 200")
else:
    edgar = edgar.json()

cik_df = pd.DataFrame(edgar['data'], columns=edgar['fields'])

# only Nasdq & NYSE
cik_df = cik_df[(cik_df['exchange']=='Nasdaq') | (cik_df['exchange']=='NYSE')].reset_index(drop = True)

# cik+10
cik_df['cik'] = cik_df['cik'].apply(lambda x: str(x).zfill(10))

cik_df = cik_df.drop_duplicates(['cik']).reset_index(drop = True)

In [None]:
cik_df.head(5)

# Company reports

In [None]:
filings_df = pd.DataFrame()

for i in tqdm(range(0, len(cik_df))):
    cik = cik_df['cik'][i]
    url = f'http://data.sec.gov/submissions/CIK{cik}.json'
    time.sleep(0.5)

    while True:
        try:
            edgar = requests.get(url, headers = headers)

            if edgar.status_code != 200:
                print("Not 200")
                continue
            else:
                edgar = edgar.json()    

            name = edgar['name'] ; ticker = ";".join(edgar['tickers']) ; exchange = ";".join(edgar['exchanges'])
            
            filings = pd.DataFrame(edgar['filings']['recent'])
            filings = filings[(filings['form']=='10-Q') | (filings['form']=='10-K')]
            
            filings['cik'] = cik ; filings['name'] = name
            filings['ticker'] = ticker ; filings['exchange'] = exchange
            
            filings_df = pd.concat([filings_df, filings]).reset_index(drop = True)

            break
            
        except (Timeout, ConnectionError, HTTPError, RequestException):
            print("Error")
            continue

# cik_list.extend(cik)
# accessionNumber_list.extend(accessionNumber)
# form_list.extend(form)
# primaryDocument_list.extend(primaryDocument)

In [None]:
filings_df = filings_df[['cik', 'name', 'ticker', 'exchange', 'accessionNumber', 'filingDate',
                         'reportDate', 'acceptanceDateTime', 'act', 'form', 'fileNumber', 'filmNumber', # delete items
                         'size', 'isXBRL', 'isInlineXBRL', 'primaryDocument', 'primaryDocDescription']] #reorder

In [None]:
filings_df.info(memory_usage='deep')

In [None]:
filings_df.to_csv("filings_df.csv", encoding = "utf-8", index  = False)

# Reports Content

In [2]:
filings_df = pd.read_csv("filings_df.csv", encoding = "utf-8")

# primaryDocument 없는 경우도 있음.
filings_df = filings_df.dropna(subset=['primaryDocument']).reset_index(drop = True)

filings_df['cik'] = filings_df['cik'].apply(lambda x: str(x).zfill(10))
filings_df['act'] = filings_df['act'].fillna(-1)
filings_df['act'] = filings_df['act'].apply(lambda x: int(x))

filings_df['filmNumber'] = filings_df['filmNumber'].fillna(-1)
filings_df['filmNumber'] = filings_df['filmNumber'].apply(lambda x: str(x).replace(",",""))
filings_df['filmNumber'] = filings_df['filmNumber'].apply(lambda x: int(float(x))) # ['filmNumber']==17010285.17010284]
# filings_df['filmNumber'] = filings_df['filmNumber'].apply(lambda x: re.sub(r"[0-9]", "", x))

In [3]:
filings_df.head(5)

Unnamed: 0,cik,name,ticker,exchange,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,320193,Apple Inc.,AAPL,Nasdaq,0000320193-22-000059,2022-04-29,2022-03-26,2022-04-28T18:03:58.000Z,34,10-Q,001-36743,22868650,6140838,1,1,aapl-20220326.htm,10-Q
1,320193,Apple Inc.,AAPL,Nasdaq,0000320193-22-000007,2022-01-28,2021-12-25,2022-01-27T18:00:58.000Z,34,10-Q,001-36743,22564628,5669748,1,1,aapl-20211225.htm,10-Q
2,320193,Apple Inc.,AAPL,Nasdaq,0000320193-21-000105,2021-10-29,2021-09-25,2021-10-28T18:04:28.000Z,34,10-K,001-36743,211359752,10502096,1,1,aapl-20210925.htm,10-K
3,320193,Apple Inc.,AAPL,Nasdaq,0000320193-21-000065,2021-07-28,2021-06-26,2021-07-27T18:03:42.000Z,34,10-Q,001-36743,211119137,8446381,1,1,aapl-20210626.htm,10-Q
4,320193,Apple Inc.,AAPL,Nasdaq,0000320193-21-000056,2021-04-29,2021-03-27,2021-04-28T18:02:54.000Z,34,10-Q,001-36743,21866148,8468959,1,1,aapl-20210327.htm,10-Q


### API 우선 100개의 cik에 대해서 진행

In [4]:
# cik_list = list(filings_df['cik'].drop_duplicates())
temp_df = filings_df[['cik']].drop_duplicates().reset_index(drop = True)
temp_df = list(temp_df.sample(100, random_state= 7)['cik'])

filings_df = filings_df[filings_df['cik'].isin(temp_df)].reset_index(drop = True)

In [5]:
cik_list = list(filings_df['cik'].drop_duplicates())

In [7]:
headers = {"User-agent":'cmo@onelinetec.com'}

del_elements = [f'.//table', f'//*[@id="DSPFPageNumber"]', "//div[@style='text-align:center']", f'.//head', "//div[@style='display:none']"]
stop_words = ['☐','☒']

# This function will be our all-in-one noise removal function
def remove_stopwords(tokens):
    cleaned_tokens = []
    for token in tokens:
        if token not in stop_words:
            cleaned_tokens.append(token.strip())
    cleaned_tokens = list(filter(None, cleaned_tokens))
    return cleaned_tokens

key_path = glob.glob('*.json')[0]
credentials = service_account.Credentials.from_service_account_file(key_path)
project_id = 'fine-scene-356009'
table_id = 'cik_filings.cik_contents'
client = bigquery.Client(credentials = credentials, project = credentials.project_id)                                                             

In [8]:
%%time

sql = f"""
DELETE FROM `{project_id}.{table_id}` WHERE true;
"""

client.query(sql)

CPU times: user 90.6 ms, sys: 18.5 ms, total: 109 ms
Wall time: 886 ms


QueryJob<project=fine-scene-356009, location=asia-northeast3, id=6c91ecd3-039b-4bd8-b74c-4fb2dc1bd1a7>

In [None]:
for index in tqdm(range(0, len(cik_list))):
    cik = cik_list[index]
    temp_df = filings_df[filings_df['cik']==cik].reset_index(drop = True)
    name = temp_df['name'][0]
    content_list = [] # init
    
    for k in range(0, len(temp_df)):
        
        accessionNumber = temp_df['accessionNumber'][k].replace("-","")
        primaryDocument = temp_df['primaryDocument'][k]    

        url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{accessionNumber}/{primaryDocument}'
        response = requests.get(url, headers = headers)
        response = BeautifulSoup(response.content, "html.parser")
        try:
            response = etree.HTML(str(response))
        except ValueError: # Unicode strings with encoding declaration are not supported
            response = etree.HTML(bytes(str(response), encoding='utf-8'))

        for p in del_elements:
            if p == f'.//table':
                for i in response.xpath(p):
                    if len(i) != 1:  # 테이블로 형태의 텍스트를 제외하고는 삭제
                        i.getparent().remove(i)
            else:        
                for i in response.xpath(p): #pagination 일반화가능?
                    i.getparent().remove(i)  

        content = response.xpath(f'.//text()')
        content = remove_stopwords(content)
        content = " ".join(content)
        content = re.sub(r"\s+", " ", content)
        content_list.append(content)
        
    temp_df['content'] = content_list
    temp_df.to_gbq(table_id, project_id, if_exists='append', credentials = credentials)
    print(f"{cik}-{name} completed")

# pandas_gbq은 저장할 때 너무 느림
pandas_gbq.to_gbq(data, table_id, project_id=project_id, if_exists = 'replace', credentials = credentials)

In [6]:
cik = cik_list[0]
temp_df = filings_df[filings_df['cik']==cik].reset_index(drop = True)
name = temp_df['name'][0]

In [7]:
for k in range(0, 1):

    accessionNumber = temp_df['accessionNumber'][k].replace("-","")
    primaryDocument = temp_df['primaryDocument'][k]    
    url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{accessionNumber}/{primaryDocument}'
    
print(url)

https://www.sec.gov/Archives/edgar/data/0001613103/000161310322000023/mdt-20220429.htm


In [8]:
headers = {'User-agent': 'cmo@onelinetec.com'}

In [9]:
response = requests.get(url, headers = headers)
print(response)

<Response [404]>


In [10]:
print(response.content)

b'\n\n<!DOCTYPE html>\n<html lang="en" dir="ltr" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">\n  <head>\n    <meta charset="utf-8" /><script type="text/javascript">(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"32edb8f179",applicationID:"436723953"};window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var i=e[n]={exports:{}};t[n][0].call(i.exports,function(e){var i=t[n][1][e];return r(i||e)},i,i.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<n.length;i++)r(n[i]);return r}({1:[function(t,e,n){function r(){}functi

In [None]:
response.text

In [None]:
response.content

In [None]:

response = BeautifulSoup(response.content, "html.parser")

In [None]:
response = etree.HTML(str(response))

In [None]:
for p in del_elements:
    if p == f'.//table':
        for i in response.xpath(p):
            if len(i) != 1:  # 테이블로 형태의 텍스트를 제외하고는 삭제
                i.getparent().remove(i)
    else:        
        for i in response.xpath(p): #pagination 일반화가능?
            i.getparent().remove(i)  

response.xpath(f'.//text()')

In [None]:

    content_list = [] # init
    
    for k in range(0, len(temp_df)):
        
        accessionNumber = temp_df['accessionNumber'][k].replace("-","")
        primaryDocument = temp_df['primaryDocument'][k]    

        url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{accessionNumber}/{primaryDocument}'
        response = requests.get(url, headers = headers)
        response = BeautifulSoup(response.content, "html.parser")
        try:
            response = etree.HTML(str(response))
        except ValueError: # Unicode strings with encoding declaration are not supported
            response = etree.HTML(bytes(str(response), encoding='utf-8'))

        for p in del_elements:
            if p == f'.//table':
                for i in response.xpath(p):
                    if len(i) != 1:  # 테이블로 형태의 텍스트를 제외하고는 삭제
                        i.getparent().remove(i)
            else:        
                for i in response.xpath(p): #pagination 일반화가능?
                    i.getparent().remove(i)  

        content = response.xpath(f'.//text()')
        content = remove_stopwords(content)
        content = " ".join(content)
        content = re.sub(r"\s+", " ", content)
        content_list.append(content)
        
    temp_df['content'] = content_list
    temp_df.to_gbq(table_id, project_id, if_exists='append', credentials = credentials)
    print(f"{cik}-{name} completed")