In [None]:
import itertools
import os
import re
import sys
import time
import unicodedata
from collections import namedtuple
from functools import wraps
from glob import glob
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Plain-text sources: a tab-separated ticker file from sec.gov and a list of
# S&P 500 constituent symbols.
txt_1 = "https://www.sec.gov/include/ticker.txt"
txt_2 = "https://raw.githubusercontent.com/datasets/s-and-p-500-companies/master/data/constituents_symbols.txt"


tickers_list = requests.get(txt_1).text.split("\n")

# Map the first tab-separated field of every line (the ticker symbol) to the
# second field (used as the CIK further down).
tickers = {}

for x in tickers_list:
    fields = x.split("\t")
    if len(fields) < 2:
        # Lines without a tab (for instance the empty string produced by a
        # trailing newline) used to raise IndexError; skip them instead.
        continue
    # strip() drops a stray carriage return if the file uses CRLF line endings.
    tickers[fields[0]] = fields[1].strip()

sp500 = requests.get(txt_2).text.split("\n")

In [None]:
# Ticker symbols (lower case, matching the keys of `tickers`) of the companies to analyse.
companies = [
    'tsla',
    'fb',
    'aapl',
    'amzn',
    'nflx',
    'goog',
    'msft',
]

# Look up the value stored in `tickers` for every selected symbol, in the same order.
cik = [tickers[symbol] for symbol in companies]

In [None]:
# Show the values looked up in `tickers` for the selected companies.
cik

['1318605', '1326801', '320193', '1018724', '1065280', '1652044', '789019']

In [None]:
def get_form(cik, form, dateb, timeout=30, headers=None):
    """List a company's filings of one form type from the EDGAR search page.

    Requests the ``browse-edgar`` company search page on sec.gov and scrapes
    its results table (``<table class="tableFile2">``).

    Args:
        cik: Value sent as the ``CIK`` query parameter.
        form: Value sent as the ``type`` query parameter (e.g. ``'10-k'``).
        dateb: Value sent as the ``dateb`` query parameter.
        timeout: Seconds to wait for the HTTP response; without it a stalled
            connection would block forever.
        headers: Optional dict of HTTP headers forwarded to ``requests.get``
            (for example a ``User-Agent``).

    Returns:
        tuple: ``(dates, links, links_txt)`` - three parallel lists holding,
        per table row, the text of the fourth cell, the absolute URL taken
        from the link in the second cell, and that URL with its
        ``-index.htm``/``-index.html`` suffix replaced by ``.txt``.

    Raises:
        AttributeError: If the page contains no ``tableFile2`` table. The
            exception type is kept because callers catch it to skip a company.
    """
    # Let requests build and encode the query string instead of formatting it by hand.
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar"
    query = {"action": "getcompany", "CIK": cik, "type": form, "dateb": dateb}
    edgar_resp = requests.get(base_url, params=query, headers=headers, timeout=timeout)

    soup = BeautifulSoup(edgar_resp.text, 'html.parser')
    table_tag = soup.find('table', class_='tableFile2')
    if table_tag is None:
        raise AttributeError(
            "no 'tableFile2' results table found for CIK {!r}".format(cik)
        )

    dates = []
    links = []
    links_txt = []
    for row in table_tag.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= 3:
            # Rows with fewer than four <td> cells carry no filing entry.
            continue

        anchor = cells[1].a
        if anchor is None or not anchor.get('href'):
            # A row without a document link used to raise TypeError, which
            # callers do not catch; skip it so the three lists stay parallel.
            continue

        doc_link = 'https://www.sec.gov' + anchor['href']
        dates.append(cells[3].text.strip())
        links.append(doc_link)
        # The index page ends in "-index.htm" or "-index.html"; the matching
        # text file has the same stem with a ".txt" suffix.
        links_txt.append(re.sub(r'-index\.html?$', '.txt', doc_link))

    return dates, links, links_txt

In [None]:
def parse_html(content):
    """Extract the text of an HTML document with BeautifulSoup.

    Args:
        content (str): HTML markup to parse.

    Returns:
        str: The document's text, with the individual text pieces joined by
        a newline.
    """
    return BeautifulSoup(content, "html.parser").get_text("\n")
 
 
def normalize_text(text):
    """Normalize filing text so that item headers can be located reliably.

    The text is NFKD-normalized, all line-break variants are unified to a
    newline, everything is upper-cased, and a fixed sequence of whitespace and
    header clean-ups is applied. Finally every newline is doubled.

    Args:
        text (str): Raw text.

    Returns:
        str: The normalized, upper-cased text.
    """
    cleaned = unicodedata.normalize("NFKD", text)
    # splitlines() understands every Unicode line boundary; re-join with "\n".
    cleaned = '\n'.join(cleaned.splitlines()).upper()

    # Whitespace left around line breaks by the HTML-to-text step.
    regex_rules = (
        (r'[ ]+\n', '\n'),   # spaces before a line break
        (r'\n[ ]+', '\n'),   # spaces after a line break
        (r'\n+', '\n'),      # runs of line breaks
    )
    for pattern, replacement in regex_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Literal replacements, applied strictly in this order.
    literal_rules = (
        ('\n.\n', '.\n'),          # pull a lone period up onto the previous line
        ('\nI\nTEM', '\nITEM'),    # re-join a split "ITEM" header
        ('\nITEM\n', '\nITEM '),   # put the item number on the header line
        ('\nITEM  ', '\nITEM '),   # single space after "ITEM"
        (':\n', '.\n'),            # a colon at line end becomes a period
        ('$\n', '$'),              # keep the amount next to its dollar sign
        ('\n%', '%'),              # keep the percent sign next to its number
        ('\n', '\n\n'),            # finally, double every line break
    )
    for old, new in literal_rules:
        cleaned = cleaned.replace(old, new)

    return cleaned
 
 
def find_mda_from_text(text, start=0):
    """Locate the "ITEM 7" section in normalized text.

    Args:
        text (str): Normalized (upper-cased) text.
        start (int): Offset at which the search begins; the text before it is
            ignored.

    Returns:
        tuple: ``(mda, end)``. ``mda`` is the stripped text from the ITEM 7
        header up to the end marker, or ``""`` when no complete section is
        found. ``end`` is the position of the end marker measured from
        ``start`` (i.e. within ``text[start:]``), or ``0`` when nothing is found.
    """
    begin_markers = (
        '\nITEM 7.', '\nITEM 7 –', '\nITEM 7:', '\nITEM 7 ', '\nITEM 7\n'
    )
    end_markers = ['\nITEM 7A']
    if start != 0:
        # On a follow-up search another ITEM 7 header also closes the section
        # (covers filings without an ITEM 7A).
        end_markers.append('\nITEM 7')
    fallback_markers = ('\nITEM 8',)

    body = text[start:]

    def first_hit(markers, offset=0):
        # Position of the first marker (in list order) found at or after
        # `offset`, or -1 when none of them occurs.
        for marker in markers:
            position = body.find(marker, offset)
            if position != -1:
                return position
        return -1

    begin = first_hit(begin_markers)
    if begin == -1:
        return "", 0

    end = first_hit(end_markers, begin + 1)
    if end == -1:
        # No ITEM 7A: the section then runs up to ITEM 8.
        end = first_hit(fallback_markers, begin + 1)

    if end > begin:
        return body[begin:end].strip(), end
    return "", 0

In [None]:
def parse(url):
    """Download ``url`` and return its text, HTML-stripped and normalized.

    The response body is passed through ``parse_html`` and then
    ``normalize_text``.
    """
    response = requests.get(url)
    return normalize_text(parse_html(response.text))

def parse_mda(text, min_bytes=1000):
    """Extract the ITEM 7 section from normalized text as one lower-case line.

    Args:
        text (str): Normalized text, as produced by ``normalize_text``.
        min_bytes (int): If the first match is shorter than this many UTF-8
            bytes it is treated as an index entry rather than the section
            itself, and the search is repeated from where that match ended.

    Returns:
        str: The section text in lower case with line breaks replaced by
        single spaces, or ``""`` when no section was found.
    """
    mda, end = find_mda_from_text(text)
    # Parse a second time if the first parse only hit the index.
    if mda and len(mda.encode('utf-8')) < min_bytes:
        mda, _ = find_mda_from_text(text, start=end)

    # Turn each run of line breaks into one space. Deleting the newlines
    # outright glued the last word of a line to the first word of the next
    # ("operationsforward-looking").
    return re.sub(r'\n+', ' ', mda).lower()

In [None]:
# Collect the 10-K filing list of every selected company, keyed by CIK.
mda_dict = {}

for c in cik:
    sleep(1)  # pause between consecutive requests

    try:
        # The (dates, links, links_txt) tuple is wrapped in a one-element list;
        # the download cell reads it back as links[0].
        form = [get_form(c, '10-k', '20210601')]
        mda_dict[c] = form

    except AttributeError as err:
        # get_form raises AttributeError when the page has no results table.
        # Report the skipped company instead of dropping it silently.
        print("Skipping CIK {}: {}".format(c, err))
        continue

In [None]:
# Download every filing found above and extract its MD&A section.
# Result: CIK -> [filing dates, MD&A texts] (two parallel lists).
mda_parsed = {}

# The loop variable is deliberately not called `cik`: that name holds the list
# of CIKs built earlier, and overwriting it here would break a re-run of the
# cell that iterates over that list.
for company_cik, links in mda_dict.items():

    sleep(1)
    dates = []
    mda_list = []

    # links[0] is the (dates, links, links_txt) tuple returned by get_form.
    for filing_date, txt_url in zip(links[0][0], links[0][2]):
        sleep(1)  # pause between consecutive requests

        if not txt_url:
            continue

        mda_list.append(parse_mda(parse(txt_url)))
        dates.append(filing_date)

    mda_parsed[company_cik] = [dates, mda_list]

In [None]:
# Display the parsed results: each CIK maps to [dates, mda_list].
mda_parsed

{'1018724': [['2021-02-03',
   '2020-01-31',
   '2019-02-01',
   '2018-02-02',
   '2017-02-10',
   '2016-01-29',
   '2015-01-30',
   '2014-01-31',
   '2013-01-30',
   '2012-02-01',
   '2011-02-28',
   '2011-01-28',
   '2010-01-29',
   '2009-01-30',
   '2008-02-11',
   '2007-02-16',
   '2006-02-17',
   '2005-03-11',
   '2004-02-25',
   '2003-02-19',
   '2002-01-24',
   '2001-03-23',
   '2000-09-08',
   '2000-03-29',
   '1999-03-05',
   '1998-03-30'],
  ['item 7.management’s discussion and analysis of financial condition and results of operationsforward-looking statementsthis annual report on form 10-k includes forward-looking statements within the meaning of the private securities litigation reform act of 1995. all statements other than statements of historical fact, including statements regarding guidance, industry prospects, or future results of operations or financial position, made in this annual report on form 10-k are forward-looking. we use words such as anticipates, believes, ex

In [None]:
# Check the container type of the parsed results.
type(mda_parsed)

dict

In [None]:
# Flatten the per-company results into one [cik, date, mda] row per filing.
# A comprehension keeps its loop variables local, so the module-level `cik`
# list is not overwritten by this cell.
dlist = [
    [company_cik, filing_date, mda_text]
    for company_cik, (filing_dates, mda_texts) in mda_parsed.items()
    for filing_date, mda_text in zip(filing_dates, mda_texts)
]

In [None]:
# Build the table from the flattened rows: one row per filing with its CIK, date and MD&A text.
df = pd.DataFrame(dlist, columns=['cik', 'date', 'mda'])

In [None]:
# Display the assembled DataFrame.
df

Unnamed: 0,cik,date,mda
0,320193,2020-10-30,item 7. management’s discussion and analysi...
1,320193,2019-10-31,item 7.management’s discussion and analysis of...
2,320193,2018-11-05,item 7.management’s discussion and analysis of...
3,320193,2017-11-03,item 7.management’s discussion and analysis of...
4,320193,2016-10-26,item 7.management’s discussion and analysis of...
...,...,...,...
106,789019,1998-09-25,
107,789019,1997-09-29,item 7. management's discussion and analysis ...
108,789019,1996-09-27,item 7. management's discussion and analysis o...
109,789019,1995-09-25,item 7. management's discussion and analysis o...


In [None]:
import numpy as np

In [None]:
# Mark filings whose MD&A text is the empty string as missing (NaN).
df['mda'] = df['mda'].mask(df['mda'] == '', np.nan)

In [None]:
# Keep only the filings that have an MD&A text.
df = df[df['mda'].notna()]

In [None]:
# Display the DataFrame after the rows with a missing 'mda' value were dropped.
df

Unnamed: 0,cik,date,mda
0,320193,2020-10-30,item 7. management’s discussion and analysi...
1,320193,2019-10-31,item 7.management’s discussion and analysis of...
2,320193,2018-11-05,item 7.management’s discussion and analysis of...
3,320193,2017-11-03,item 7.management’s discussion and analysis of...
4,320193,2016-10-26,item 7.management’s discussion and analysis of...
...,...,...,...
104,789019,2000-09-28,item 7. management's discussion and analysis o...
105,789019,1999-09-28,item 7. management's discussion and analysis o...
107,789019,1997-09-29,item 7. management's discussion and analysis ...
108,789019,1996-09-27,item 7. management's discussion and analysis o...


In [None]:
# Write the cleaned table to "10K.csv" in the current working directory.
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column by default; pass index=False if that column is not wanted - confirm
# with whatever reads this file.
df.to_csv("10K.csv")