In [1]:
import edgar
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup as bsoup
import numpy as np

## Get the annual filings for each company

In [2]:
# Load the company list. `DataFrame.from_csv` is deprecated (and removed in
# pandas 1.0); `read_csv` with `index_col=0, parse_dates=True` reproduces the
# defaults that `from_csv` applied.
comp_2017 = pd.read_csv('companies_2017.csv', index_col=0, parse_dates=True, encoding='latin-1')
# Keep the first 505 rows; copy so the column assignment below writes to an
# independent frame rather than to a slice of the frame that was just read.
comp_2017 = comp_2017.iloc[:505].copy()
# Store each CIK as a 10-character, zero-padded string.
comp_2017['CIK'] = comp_2017['CIK'].astype(int).astype(str).str.zfill(10)

  """Entry point for launching an IPython kernel.


In [3]:
# Preview only the first rows instead of dumping the whole frame into the output.
comp_2017.head(10)

Unnamed: 0,Unnamed: 1,Security,Symbol,SEC filings,GICS Sector,GICS Sub Industry,Headquarters Location,Date first added,CIK,Founded
,0.0,3M Company,MMM,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",,0000066740,1902
,1.0,Abbott Laboratories,ABT,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",3/31/64,0000001800,1888
,2.0,AbbVie Inc.,ABBV,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",12/31/12,0001551152,2013 (1888)
ABMD,3.0,Wyndam Worldwide,WYN,reports,,,,,0001361658,
,4.0,Accenture plc,ACN,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",7/6/11,0001467373,1989
,5.0,Activision Blizzard,ATVI,reports,Communication Services,Interactive Home Entertainment,"Santa Monica, California",8/31/15,0000718877,2008
,6.0,Adobe Systems Inc,ADBE,reports,Information Technology,Application Software,"San Jose, California",5/5/97,0000796343,1982
,7.0,Advanced Micro Devices Inc,AMD,reports,Information Technology,Semiconductors,"Sunnyvale, California",3/20/17,0000002488,1969
,8.0,Advance Auto Parts,AAP,reports,Consumer Discretionary,Automotive Retail,"Roanoke, Virginia",7/9/15,0001158449,1932
,9.0,AES Corp,AES,reports,Utilities,Independent Power Producers & Energy Traders,"Arlington, Virginia",10/2/98,0000874761,1981


## 2017

In [9]:
# Cover-page phrase that names the fiscal year a 10-K reports on; `{year}` is
# replaced with the four-digit year being searched for.
_FISCAL_YEAR_TEMPLATE = (
    r'FOR THE FISCAL YEAR ENDED\s*[0-9]*\s*[A-Z]*\s*{year}'
    r'|FOR THE\s*(FISCAL)? YEAR ENDED\s*(Commission File Number)?\W?\s*[A-Z]*\s*[0-9]*,\s*{year}'
)

# Headings that open the MD&A section (item 7), in the two word orders handled.
_MDA_HEADINGS = (
    r'7\s*\W?\.?\s*(AND 7A.)?(COMBINED)?\s*MANAGEMENT\W?\s*S\s*DISCUSSION\s*AND\s*ANALYSIS\s*OF\s*(CONSOLIDATED)?\s*FINANCIAL\s*CONDITION\S?\s*AND\s*RESULTS\s*OF\s*OPERATION\S?',
    r'7\s*\W?\.?\s*MANAGEMENT\W?\s*S\s*DISCUSSION\s*AND\s*ANALYSIS\s*OF\s*RESULTS\s*OF\s*OPERATIONS\s*AND\s*FINANCIAL\s*CONDITION',
)

# Heading that closes the MD&A section (item 8).
_STATEMENTS_HEADING = r'8\s*\W?\w?\.?\s*(CONSOLIDATED)?(COMCAST CORPORATION)?\s*FINANCIAL\s*STATEMENTS\s*AND\s*SUPPLEMENTA\w*\s*DATA'

# Looser closing headings, tried only when the heading above is never found.
_FALLBACK_STOP_HEADINGS = (
    r'8\s*\W?\w?\.?\s*FINANCIAL\s*STATEMENTS',
    r'9\s*\W?\w?\.?\s*CONTROLS\s*AND\s*PROCEDURES',
)


def _mentions_fiscal_year(text, year):
    '''Return True when `text` contains the "for the (fiscal) year ended ... <year>" phrase.'''
    return re.search(_FISCAL_YEAR_TEMPLATE.format(year=year), text, re.IGNORECASE) is not None


def _find_filing_index(docs, year):
    '''Return the position in `docs` of the filing for `year`, or None if the walk cannot find one.'''
    visited = set()
    d = 1  # the walk starts at the second document, exactly as before
    # Stop when the index leaves the list or comes back to a document that was
    # already inspected: the old loop spun forever in the second case and read
    # docs[-1] when the index went negative.
    while 0 <= d < len(docs) and d not in visited:
        visited.add(d)
        filing = re.sub('\\xa0|\\n', ' ', docs[d])

        if '10-K/A' in filing[1:15]:
            # amended filing: skip to the next document
            d += 1
        elif _mentions_fiscal_year(filing, year):
            return d
        elif _mentions_fiscal_year(filing, year + 1):
            d += 1
        elif _mentions_fiscal_year(filing, year - 1):
            d -= 1
        else:
            return None
    return None


def _split_into_items(raw_text):
    '''Normalise one filing and split it into upper-cased chunks on the literal "ITEM ".'''
    filing = re.sub('\\xa0|\\n', ' ', raw_text)
    parts = (filing.replace('\n', '').replace('\t', '').replace('\r', '')
             .replace('Contents', ' ').upper().split('ITEM '))
    # Same pattern as before, now spelled as a raw string so that `\W` is not
    # an invalid escape sequence in the string literal.
    return [re.sub(r'\xa0*|(?<=[7-8])\W?s?', '', part) for part in parts]


def _locate_mda_bounds(new_doc):
    '''Return (start, stop): chunk indices matching the MD&A opening and closing headings.'''
    start = []
    stop = []
    for i, part in enumerate(new_doc):
        for heading in _MDA_HEADINGS:
            if re.search(heading, part):
                start.append(i)
        if re.search(_STATEMENTS_HEADING, part):
            stop.append(i)

    # Only when a start was found but no stop: take the first chunk that
    # matches one of the looser closing headings.
    if start and not stop:
        for i, part in enumerate(new_doc):
            if any(re.search(heading, part) for heading in _FALLBACK_STOP_HEADINGS):
                stop = [i]
                break
    return start, stop


def annual_filings(name,ID,year=2017):
    '''Find the 10-K for fiscal `year` among a company's latest filings and locate its MD&A section.

    Parameters
    ----------
    name : company name passed to `edgar.Company`.
    ID : CIK string passed to `edgar.Company`.
    year : fiscal year to look for (default 2017, the previously hard-coded value).

    Returns
    -------
    `(name, ID, start, stop, d)`, where `start` / `stop` are lists of indices
    into the filing split on "ITEM " and `d` is the position of the selected
    document in the fetched list. Returns None when fewer than two documents
    come back or when no filing for `year` can be identified.
    '''
    # get filings from package
    company = edgar.Company(name, ID)
    tree = company.getAllFilings(filingType="10-K")
    doc = edgar.getDocuments(tree, noOfDocuments=3)

    # NOTE(review): edgar.getDocuments presumably returns a bare string when the
    # company has a single filing (the old `len(doc) > 3` check relied on the
    # string's length) -- confirm against the installed edgar version. An empty
    # or one-element list is rejected as well, since the search starts at index 1.
    if not isinstance(doc, (list, tuple)) or len(doc) < 2:
        return None

    # search for the right document year
    # NOTE(review): the +1 / -1 stepping assumes documents are ordered newest first -- confirm.
    d = _find_filing_index(doc, year)
    if d is None:
        return None

    new_doc = _split_into_items(doc[d])
    start, stop = _locate_mda_bounds(new_doc)
    return name, ID, start, stop, d

In [10]:
# For the companies at positional rows 54-56, look up the filing and collect
# the indices that bound the section the MDA text is pulled from.
MDAFCRO_index = [
    annual_filings(company_row.Security, str(company_row.CIK))
    for company_row in (comp_2017.iloc[position] for position in range(54, 57))
]

In [None]:
def MDA(name,ID,start_index,stop_index,doc_num):
    '''Return the MD&A text of a 10-K filing, using the indices and document position provided.

    Parameters
    ----------
    name, ID : company name and CIK string passed to `edgar.Company`.
    start_index, stop_index : lists of indices into the filing split on
        "ITEM ", as produced by `annual_filings`.
    doc_num : position of the wanted document in the fetched filings list.

    Returns
    -------
    `(name, ID, MDA_text)`, where `MDA_text` is the list of "ITEM "-split
    chunks from a start index up to (not including) the next stop index. The
    list is empty when no start index has a later stop index.

    Raises
    ------
    IndexError : if `doc_num` is negative or beyond the documents returned.
    '''
    if doc_num < 0:
        # a negative position would request zero or fewer documents and then
        # index the result from the end
        raise IndexError('doc_num must be a non-negative document position')

    # get filings from package
    company = edgar.Company(name,ID)
    tree = company.getAllFilings(filingType = "10-K")
    doc = edgar.getDocuments(tree, noOfDocuments=doc_num+1)
    # NOTE(review): edgar.getDocuments presumably returns a bare string rather
    # than a list when exactly one document is fetched (doc_num == 0) -- confirm
    # against the installed edgar version. Without this, doc[0] would be a
    # single character.
    if isinstance(doc, str):
        doc = [doc]

    # create the same format used by annual_filings
    filing = re.sub('\\xa0|\\n',' ',doc[doc_num])
    filing = filing.replace('\n', '').replace('\t', '').replace('\r','').replace('Contents',' ').upper().split('ITEM ')
    # raw string: `\W` is an invalid escape sequence in a normal string literal
    new_doc = [re.sub(r'\xa0*|(?<=[7-8])\W?s?','',item) for item in filing]

    # find the text using the indices
    # NOTE(review): heuristic -- a heading can match more than once (e.g. in a
    # table of contents), so the start -> next-stop span holding the most text
    # is kept. Confirm this is the intended selection rule.
    MDA_text = []
    for begin in sorted(set(start_index)):
        later_stops = [end for end in stop_index if end > begin]
        if not later_stops:
            continue
        candidate = new_doc[begin:min(later_stops)]
        if sum(len(part) for part in candidate) > sum(len(part) for part in MDA_text):
            MDA_text = candidate

    return name,ID,MDA_text

In [None]:
# Pull the MDA text for every company whose filing lookup succeeded;
# entries that came back as None are skipped.
MDAFCRO = [
    MDA(*located)  # located is (name, ID, start, stop, doc_num)
    for located in MDAFCRO_index
    if located is not None
]
