## 1 - SEC Filings Links Collection
_Gabriel Perez Prieto_

### 1.0 - Import Libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random
import datetime
import unicodedata
import re
import time
import sys
import pandas_datareader.data as web
import datetime as dt

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [3]:
# Raise the interpreter's maximum recursion depth to 100000.
# NOTE(review): presumably needed for deeply nested BeautifulSoup trees - confirm it is still required.
sys.setrecursionlimit(100000)

### 1.1 - Scrape S&P 500 List - Wikipedia
> [S&P 500 - List of Companies in the Index](https://en.wikipedia.org/wiki/List_of_S%26P_500_companies)

This step extracts the list of all companies in the S&P 500 index so that SEC filings can be retrieved for each of them over a given period of time. The table also supplies a few additional features for the modeling sections, such as industry and sub-industry.

In [4]:
# Set Wikipedia url to scrape the table from
url_sp = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# Request the page and fail loudly on an HTTP error instead of parsing an error page
res = requests.get(url_sp)
res.raise_for_status()

# Create soup object from the page
soup = BeautifulSoup(res.content, 'lxml')

# Find the first table carrying the 'wikitable sortable' class
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise ValueError(f'No "wikitable sortable" table found at {url_sp}')

# Output keys, listed in the positional order of the table's <td> cells.
# NOTE(review): this mapping is purely positional - confirm it still matches the live page layout.
sp_columns = ['symbol', 'security', 'sec_filings', 'gics_sector', 'gics_sub_industry',
              'hq_location', 'date_first_added', 'cik', 'date_founded']

# Create list to house table values
sp_table = []

# Loop through table rows, skipping the header row
for tr in table.find_all('tr')[1:]:

    # Parse the row's cells once instead of once per column
    cells = tr.find_all('td')

    # Create dictionary to house values
    d = {}

    for position, key in enumerate(sp_columns):
        if key == 'sec_filings':
            # This cell is stored as the target of its link rather than its text
            d[key] = cells[position].find('a').attrs['href'].strip()
        else:
            d[key] = cells[position].text.strip()

    # Append dictionary to list
    sp_table.append(d)

# Create DataFrame with data
sp_df = pd.DataFrame(sp_table)

In [5]:
# Preview the first scraped row to sanity-check the column mapping
sp_df.head(1)

Unnamed: 0,symbol,security,sec_filings,gics_sector,gics_sub_industry,hq_location,date_first_added,cik,date_founded
0,MMM,3M Company,https://www.sec.gov/cgi-bin/browse-edgar?CIK=M...,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",,66740,1902


In [6]:
# Persist the S&P 500 table; index_label=False leaves the index column without a header field.
# NOTE(review): this writes to '../clean_data/' while the filing links are saved under './data/' - confirm both paths are intended.
sp_df.to_csv('../clean_data/sp500.csv', index_label=False)

### 1.2 - Scrape Filings - Get Links for Filings Page
> [SEC - EDGAR Database](https://www.sec.gov/edgar/searchedgar/companysearch.html)

Collect the links to each company's filings from the SEC EDGAR website.

In [6]:
def get_links_company_filings(ticker_list, filing_type, max_start=1000, page_size=100,
                              headers=None, timeout=None):
    """Collect the filing links listed on EDGAR's company browse pages.

    For every ticker, the browse-edgar result pages are requested in steps of
    ``page_size`` and every data row of the ``tableFile2`` table is recorded.

    Parameters
    ----------
    ticker_list : iterable of str
        Tickers placed in the ``CIK`` query parameter of the request URL.
    filing_type : str
        Value of the ``type`` query parameter (e.g. ``'8-K'``).
    max_start : int, default 1000
        Largest ``start`` offset requested per ticker (inclusive).
    page_size : int, default 100
        Step between offsets; also sent as the ``count`` query parameter.
    headers : dict, optional
        Extra HTTP headers forwarded to ``requests.get``.
        NOTE(review): SEC guidance reportedly asks automated clients to send a
        descriptive User-Agent - confirm and pass it here if required.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. ``None`` waits indefinitely, which is
        what this function did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per filing with columns ``company``, ``filing_doc``,
        ``doc_link`` and ``date``. The columns are present even when nothing
        was scraped.
    """
    columns = ['company', 'filing_doc', 'doc_link', 'date']

    # Create list to house data scraped
    filing_list = []

    # Loop through each ticker
    for ticker in tqdm(ticker_list):

        # Loop through result pages - offsets are multiples of page_size
        for start in range(0, max_start + 1, page_size):

            # Set url on every loop - page offset
            url = (
                'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=' + ticker
                + '&type=' + filing_type
                + '&dateb=&owner=exclude&start=' + str(start)
                + '&count=' + str(page_size)
            )

            # Request from url
            res = requests.get(url, headers=headers, timeout=timeout)

            # A failed request says nothing about later pages: skip only this one
            if not res.ok:
                continue

            # Create soup object from the results page
            soup = BeautifulSoup(res.content, 'lxml')

            # Find the filings table
            table = soup.find('table', {'class': 'tableFile2'})

            # Data rows only - the first row is the header
            rows = table.find_all('tr')[1:] if table is not None else []

            # A successful response without filings means the results are exhausted,
            # so the remaining offsets for this ticker are not requested
            if not rows:
                break

            # Loop through filings table and get links
            for tr in rows:
                cells = tr.find_all('td')
                anchor = tr.find('a')

                # Skip just the malformed row instead of discarding the rest of the page
                if len(cells) < 2 or anchor is None or 'href' not in anchor.attrs:
                    continue

                # Append dictionary to list
                filing_list.append({
                    'company': ticker,
                    'filing_doc': cells[0].text.strip(),
                    'doc_link': 'http://sec.gov' + anchor.attrs['href'],
                    'date': cells[-2].text.strip(),
                })

            # No delay is applied between requests; add throttling here if needed

    # Create DataFrame with filing list
    return pd.DataFrame(filing_list, columns=columns)

#### Run Function and Create DataFrame

In [9]:
# Tickers to scrape: every symbol of the S&P 500 table, as a plain Python list
ticker_list = sp_df['symbol'].tolist()

In [10]:
# Define filing type to be scraped (passed to get_links_company_filings below,
# which sends it as the `type` query parameter of the EDGAR request)
filing_type = '8-K'

In [11]:
# Scrape the filing links for every ticker and save them as a DataFrame
# (one row per filing: company, filing_doc, doc_link, date)
df = get_links_company_filings(ticker_list, filing_type)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [12]:
# Preview the first scraped filing row
df.head(1)

Unnamed: 0,company,filing_doc,doc_link,date
0,AAPL,8-K,http://sec.gov/Archives/edgar/data/320193/0000...,2019-10-30


#### Check Value Counts - Total Number of Filings

In [13]:
# Number of filing rows collected per ticker
df['company'].value_counts()

AMZN     202
AAPL     177
NFLX     149
FB        67
GOOGL     34
Name: company, dtype: int64

### 1.3 - Scrape Filings - Get Links for Complete Filing .txt Files
> With the filing page links collected, the next step is to get the link to the complete .txt submission file for each filing

In [14]:
# Create list to house scraped data - exactly one entry per row of `df`,
# so the new columns always line up with the filing they belong to
filing_list = []

# Loop through all links
for link in tqdm(df['doc_link']):

    # Record for this filing; stays empty when no complete-file row is found
    d = {'file_description': None, 'complete_file_link': None}

    # Request from url
    res = requests.get(link)

    # Create soup object from the filing page
    soup = BeautifulSoup(res.content, 'lxml')

    # Find table to be scraped
    table = soup.find('table', {'class': 'tableFile'})

    # Data rows only - the first row is the header; a missing table yields no rows
    rows = table.find_all('tr')[1:] if table is not None else []

    # Loop through table and get link for complete filing
    for tr in rows:

        # Find row with complete filing
        if not any(td.text == 'Complete submission text file' for td in tr.find_all('td')):
            continue

        anchor = tr.find('a')
        if anchor is None:
            continue

        # Insert description and link into dictionary; the first matching row wins
        d['file_description'] = 'Complete submission text file'
        d['complete_file_link'] = 'https://sec.gov' + anchor.attrs['href']
        break

    # Append dictionary to list - once per link, matched or not
    filing_list.append(d)

    # No delay is applied between requests; add throttling here if needed

# Build the new columns on df's own index so rows align by construction
complete_links = pd.DataFrame(filing_list, index=df.index,
                              columns=['file_description', 'complete_file_link'])

# Drop columns left over from a previous run of this cell before attaching the fresh ones
df = pd.concat([df.drop(columns=complete_links.columns, errors='ignore'), complete_links], axis=1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(IntProgress(value=0, max=629), HTML(value='')))




#### Create `accession_number` column

In [15]:
# Keep the trailing 24 characters of each complete-file link
# (presumably the '<accession number>.txt' file name - confirm against a real link).
# `.str[-24:]` leaves missing links as NaN, whereas slicing them inside a lambda raised.
# NOTE(review): the column name is misspelled; it is kept unchanged because
# anything reading the saved data may rely on it.
df['accecession_number'] = df['complete_file_link'].str[-24:]

#### Include Company Name, GICS Sector and Sub Industry on the DataFrame

In [17]:
# Attach company name, GICS sector and sub-industry to every filing row.
# Left join on the ticker, so filings without a match in sp_df are kept.
company_info_cols = ['symbol', 'security', 'gics_sector', 'gics_sub_industry']
df = pd.merge(
    df,
    sp_df[company_info_cols],
    how='left',
    left_on='company',
    right_on='symbol',
)

In [1]:
# Preview the first row after the merge.
# NOTE(review): the recorded output is a NameError from a fresh kernel - re-run after the cells above.
df.head(1)

NameError: name 'df' is not defined

#### Save DataFrame as a .csv file

In [18]:
# Persist the filing links table; index_label=False leaves the index column without a header field.
# NOTE(review): this writes to './data/' while the S&P 500 table is saved under '../clean_data/' - confirm both paths are intended.
df.to_csv('./data/filing_links.csv', index_label=False)