In [40]:
import errno
import json
import os
import pandas as pd
import re
import requests

from datetime import datetime
from bs4 import BeautifulSoup

In [65]:
class secfilinglib():
    
    def __init__(self):
        self.default_path = "SEC_Filings"
        self.filepath = self.default_path
        self.current_date = datetime.today().strftime('%Y%m%d - %H:%M:%S')

    def set_filepath(self, path):
        self.filepath = path
        
    def reset_filepath(self):
        self.filepath = self.default_path        

    def make_directory(self, company_code, cik, filing_type):
        # Make the directory to save company filings
        path = os.path.join(self.filepath, company_code, cik, filing_type)

        if not os.path.exists(path):
            try:
                os.makedirs(path)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise
    
    def get_filings(self, company_code, cik, filing_type, prior_to=datetime.today().strftime('%Y%m%d'), count=10):
        # Retrieves and saves text filings from SEC
        try:
            self.make_directory(company_code, cik, filing_type)
        except Exception as e:
            print(str(e))
            
        # Format CIK number with leading zeroes
        cik_f = f'{int(cik):010}'

        # Generate the url to crawl
        base_url = f'http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik_f}&type={filing_type}&dateb={prior_to}&owner=exclude&output=xml&count={count}'

        print(f'Retrieving {filing_type} filings for {company_code}')
        response = requests.get(base_url)
        data = response.text
        
        # Parse fetched XML data using BeautifulSoup
        xml_soup = BeautifulSoup(data, 'lxml')
        
        # Initialize lists related to filings
        filing_dates = []
        filing_urls = []
        filing_list = []
        
        #Scrape filing dates from XML data
        for txt in xml_soup.find_all('datefiled'):
            filing_date = txt.string
            filing_dates.append(filing_date)

        #Scrape filing links from XML data
        for txt in xml_soup.find_all('filinghref'):
            filing_url = txt.string
            filing_urls.append(filing_url)
            
        response = requests.get(filing_urls[0])
        data = response.text
        
        #Parse filing index file using BeautifulSoup
        html_soup = BeautifulSoup(data, 'html.parser')
        
        #Find "Document Format Files" table
        table_soup = html_soup.table
        
        for row in table_soup.contents:
            if row.find(string=filing_type) 
        
        print(table_soup)

                        
    def save_filings(self, company_code, cik, filing_type, filing_list):
        # Retrieve Save text filings into their respective folder
        for i in range(len(filing_list)):
            url = filing_list[i][0]
            response = requests.get(url)
            data = response.text
            path = os.path.join(self.filepath, company_code, cik, filing_type, filing_list[i][1])

            if not os.path.exists(path):
                with open(path, "ab") as f:
                    f.write(data.encode('ascii', 'ignore'))


In [66]:
y = secfilinglib()
y.get_filings('MMM','66740','10-K',prior_to='20170101')

Retrieving 10-K filings for MMM
<table class="tableFile" summary="Document Format Files">
<tr>
<th scope="col" style="width: 5%;"><acronym title="Sequence Number">Seq</acronym></th>
<th scope="col" style="width: 40%;">Description</th>
<th scope="col" style="width: 20%;">Document</th>
<th scope="col" style="width: 10%;">Type</th>
<th scope="col">Size</th>
</tr>
<tr>
<td scope="row">1</td>
<td scope="row">10-K</td>
<td scope="row"><a href="/Archives/edgar/data/66740/000155837016003162/mmm-20151231x10k.htm">mmm-20151231x10k.htm</a></td>
<td scope="row">10-K</td>
<td scope="row">8415875</td>
</tr>
<tr class="blueRow">
<td scope="row">2</td>
<td scope="row">EX-10.1</td>
<td scope="row"><a href="/Archives/edgar/data/66740/000155837016003162/mmm-20151231ex1011486b0.htm">mmm-20151231ex1011486b0.htm</a></td>
<td scope="row">EX-10.1</td>
<td scope="row">144352</td>
</tr>
<tr>
<td scope="row">3</td>
<td scope="row">EX-10.8</td>
<td scope="row"><a href="/Archives/edgar/data/66740/00015583701600316

Retrieving 10-K filings for MMM
<tr>
<th scope="col" style="width: 5%;"><acronym title="Sequence Number">Seq</acronym></th>
<th scope="col" style="width: 40%;">Description</th>
<th scope="col" style="width: 20%;">Document</th>
<th scope="col" style="width: 10%;">Type</th>
<th scope="col">Size</th>
</tr>


SEC_Filings
