Tom Halloin <br> Springboard Data Science Career Track <br>

<h1 align="center">Capstone Project 2: Analysis of Berkshire Hathaway Shareholder Letters Using Natural Language Processing (NLP) Techniques</h1>

<h3 align="center">Part 1: Scraping Data</h3> <br>

The data comes from Berkshire Hathaway’s shareholder letters, available at https://www.berkshirehathaway.com/letters/letters.html. The letters come in both HTML and PDF formats, so part of the challenge is scraping data from both formats into a usable data set.

<b>NOTE: Berkshire eventually caught on that I was a bot and has denied me scraping access. I do have the raw letters in a file on my laptop from before the denial that I will upload with the milestone report.</b>

In [1]:
import certifi  # validates trustworthiness of site
import pickle  # Opening and closing intermediate files
import PyPDF2  # library to get data from PDFs
import os  # operating system
import re  # Regular expressions
import requests  # libary for web scraping
import shutil  # file operations
import urllib3  # Get stuff from the internet
import urllib3.contrib.pyopenssl  # provides secure connection to site
from bs4 import BeautifulSoup  # Parsing HTML
from tika import parser  # parse PDFs

In [2]:
# Route urllib3 through pyOpenSSL, then build the connection pool used by the
# scraping cells: certificates are required and checked against certifi's CA
# bundle, and the pool's default timeout is 15 seconds.

urllib3.contrib.pyopenssl.inject_into_urllib3()
https = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED',
    ca_certs=certifi.where(),
    timeout=15.0,
)

In [3]:
# One URL per HTML-format letter, 1977 through 1997 inclusive
# (the first column of letters on the website).

url_years = [
    f'https://www.berkshirehathaway.com/letters/{letter_year}.html'
    for letter_year in range(1977, 1998)
]

In [4]:
# Downloads the raw bytes of each letter into a list, in the same order as
# url_years. A response whose status is not 200 stops the run, so that an
# error page is never stored in place of a letter.

annual_letters = []
for url in url_years:
    response = https.request('GET', url)
    if response.status != 200:
        raise RuntimeError(f'GET {url} returned HTTP status {response.status}')
    annual_letters.append(response.data)

In [5]:
# Writes each downloaded letter, byte for byte, to <base_dir>/<year>_letter.txt.
# annual_letters is indexed in year order starting at 1977.

base_dir = 'C:/Users/Tom/Documents/Berkshire/pdf_files/raw_letters'

# open() does not create missing folders, so make sure the target exists.
os.makedirs(base_dir, exist_ok=True)

for index, year in enumerate(range(1977, 1998)):
    with open(base_dir + '/' + f'{year}_letter.txt', "wb") as f:
        f.write(annual_letters[index])

### The letters from 1998 to 2018 are in PDF format. The following code attempts to scrape text from PDF files.


In [6]:
# 1.) Create one directory per year under the output directory. The split PDF
#     files for a year are saved in that year's directory.

import os

# Note: Change output directory to something else on laptop!
output_dir = 'C:/Users/Tom/Documents/Berkshire/pdf_files'

# Every year that has a PDF annual report (1998 through 2018), as strings.
years = list(map(str, range(1998, 2019)))

for year in years:
    year_directory = os.path.join(output_dir, year)
    # exist_ok=True makes re-running this cell harmless.
    os.makedirs(year_directory, mode=0o777, exist_ok=True)

In [7]:
# 2.) Get PDF file from internet.

def download_pdf(url, filename):
    """Stream the file at ``url`` into ``filename`` over a verified HTTPS connection.

    Args:
        url: Address of the PDF to fetch.
        filename: Path of the local file to write (opened in binary mode).

    Raises:
        RuntimeError: If the response status is not 200. The check runs
            before ``filename`` is opened, so a refused request cannot
            overwrite an earlier download with an error page.
    """
    import certifi
    import shutil
    import urllib3
    import urllib3.contrib.pyopenssl

    urllib3.contrib.pyopenssl.inject_into_urllib3()
    # Same certificate settings and timeout as the notebook's shared pool;
    # without a timeout a stalled server would block the run indefinitely.
    c = urllib3.PoolManager(
        cert_reqs='CERT_REQUIRED', ca_certs=certifi.where(), timeout=15.0)

    # preload_content=False streams the body instead of loading it in memory.
    with c.request('GET', url, preload_content=False) as resp:
        if resp.status != 200:
            raise RuntimeError(f'GET {url} returned HTTP status {resp.status}')
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(resp, out_file)

    resp.release_conn()

In [8]:
# 3.) Split PDF file. Save file in year directory.


def pdf_splitter(pdfFile):
    """Split a PDF into one single-page PDF per page.

    Page ``n`` (counted from 0) is written next to the source file as
    ``<pdfFile without its extension>_NN.pdf``, where ``NN`` is ``n``
    zero-padded to at least two digits.

    Args:
        pdfFile: Path of the PDF to split.
    """
    # Strip only the final extension, so a dot elsewhere in the path
    # (for example in a directory name) does not cut the output name short.
    base_name = os.path.splitext(pdfFile)[0]

    # The source stays open for the whole loop, as in the original code;
    # the reader presumably still reads from it while pages are written.
    with open(pdfFile, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        for pg in range(pdfReader.numPages):
            pdfWriter = PyPDF2.PdfFileWriter()
            pdfWriter.addPage(pdfReader.getPage(pg))
            # Close every page file as soon as it is written; previously
            # each one was left open for the rest of the session.
            with open(f'{base_name}_{pg:02d}.pdf', 'wb') as newFile:
                pdfWriter.write(newFile)

In [9]:
# 4.) Loop through every page to get text. Write text file to new file.


def pdf_reader(pdffile):
    """Return the first ``class="page"`` element of a PDF, UTF-8 encoded.

    The file is parsed with tika (``xmlContent=True``), the ``'content'``
    entry of the result is loaded into BeautifulSoup, and the first element
    whose class is ``page`` is encoded and returned.

    Args:
        pdffile: Path of the PDF to parse.

    Returns:
        The result of ``.encode('utf-8')`` on the matched element.
    """
    parsed = parser.from_file(pdffile, xmlContent=True)
    soup = BeautifulSoup(parsed['content'], features='html')
    first_page = soup.find(class_='page')  # first match only
    return first_page.encode('utf-8')

In [None]:
# Put it all together.


def main():
    """Collect the text of every split PDF page for the 1998-2018 letters.

    For each year, every file found under ``<base_dir>/<year>`` except the
    complete letter ``<year>.pdf`` is passed to ``pdf_reader``. The results
    are stored in a dict keyed by a 1-based counter (as a string), and that
    dict is written with ``str()`` to
    ``<base_dir>/raw_letters/<year>_letter.txt``.

    Nothing is downloaded or split here: the per-page PDFs must already be
    on disk. The source letters live under
    https://www.berkshirehathaway.com/letters/ as ``1998pdf.pdf``,
    ``final1999pdf.pdf``, ``2000pdf.pdf`` to ``2002pdf.pdf``, and
    ``<year>ltr.pdf`` for 2003 to 2018.

    Returns:
        dict mapping each year (as a string) to that year's page dict.
    """
    base_dir = 'C:/Users/Tom/Documents/Berkshire/pdf_files'
    all_pdf_dict = dict()

    for year in range(1998, 2019):
        year_dir = base_dir + '/' + str(year)
        complete_letter = year_dir + '/' + f'{year}.pdf'
        one_pdf_dict = dict()
        count = 0

        for root, dirs, files in os.walk(year_dir):
            # os.walk yields names in arbitrary order. Sorting makes the
            # page counter reproducible, and matches page order for the
            # two-digit suffixes that pdf_splitter writes (pages 00-99).
            dirs.sort()
            for name in sorted(files):
                filename = root + '/' + name
                if filename != complete_letter:
                    count = count + 1
                    one_pdf_dict[str(count)] = pdf_reader(filename)

        all_pdf_dict[str(year)] = one_pdf_dict

        with open(base_dir + '/raw_letters/' + f'{year}_letter.txt', "w") as f:
            f.write(str(one_pdf_dict))

    return all_pdf_dict


main()