# Setup

In [1]:
# accessing APIs and URLs
import requests

# static web scraping
from urllib.request import urlopen
from lxml.html import parse, fromstring

# regular expressions
import re

# downloading files
import urllib.request

# operating system
import os

# looping through folder
import glob

# read pdfs
import fitz 

# data wrangling
import pandas as pd

# Data Collection

In [2]:
# extract all links from website
# Use a context manager so the HTTP response is closed once parsing is done.
with urlopen("https://www.americanrhetoric.com/barackobamaspeeches.htm") as response:
    tree = parse(response)

# Select only anchors that actually carry an href attribute: an <a> element
# without one (e.g. a named bookmark anchor) would make attrib["href"] raise
# KeyError and abort the whole extraction.
linkelements = tree.xpath("//a[@href]")
list_links = [e.attrib["href"] for e in linkelements]

In [3]:
# report how many hrefs were collected from the page
print(f"Number of links: {len(list_links)}")

Number of links: 1417


In [4]:
# keep only the hrefs that end in "pdf"
p = re.compile('.*pdf$')
# a successful match returns a (truthy) match object, a failed one returns None
pdf_links = list(filter(p.match, list_links))

In [5]:
# report how many of the collected hrefs point at pdf files
print(f"Number of pdf links: {len(pdf_links)}")

Number of pdf links: 436


In [6]:
# concatenate base URL and path: prefix every pdf href with the site root
baseurl = "https://www.americanrhetoric.com/"
full_pdf_links = [f"{baseurl}{link}" for link in pdf_links]

In [7]:
# make sure the download folder exists (no error if it is already there)
folder_name = "obama_speeches"
os.makedirs(name=folder_name, exist_ok=True)

In [8]:
# download all pdf files
def download_files(links, folder):
    """Download every URL in ``links`` into ``folder`` as ``file_<n>.pdf``.

    ``<n>`` is the 1-based position of the URL in ``links``, so a given link
    always maps to the same file name. Files that already exist are left
    untouched, which makes the function safe to re-run after an interruption.

    Parameters
    ----------
    links : iterable of str
        URLs to fetch.
    folder : str
        Existing directory the files are written to.

    Returns
    -------
    None
    """
    # enumerate() advances the counter for every link, including skipped ones.
    # Incrementing manually after the download meant that hitting an existing
    # file never moved past that file name, so nothing else was downloaded.
    for i, link in enumerate(links, start=1):
        filename = folder + "/file_" + str(i) + ".pdf"

        # check if file already exists and, if so, continue
        if os.path.isfile(filename):
            continue

        # otherwise download file and save in folder
        urllib.request.urlretrieve(link, filename)

In [9]:
# fetch every pdf that is not on disk yet
download_files(links=full_pdf_links, folder=folder_name)

# Data Import

In [10]:
# import files
def import_pdfs(folder):
    """Return the extracted text of every ``*.pdf`` file in ``folder``.

    Files are processed in natural order of their path (``file_2`` before
    ``file_10``), and the returned list holds one string per file: the text
    of all its pages joined together.
    """
    digit_run = re.compile(r'(\d+)')

    def natural_key(path):
        # Splitting on a capturing group leaves the digit runs at the odd
        # positions; turning those into ints gives a numeric comparison.
        return [int(piece) if pos % 2 else piece
                for pos, piece in enumerate(digit_run.split(path))]

    pdf_paths = glob.glob(folder + "/*.pdf")
    pdf_paths.sort(key=natural_key)

    speech = []
    for pdf_path in pdf_paths:
        with fitz.open(pdf_path) as doc:
            # one string per document: all page texts joined in page order
            speech.append("".join(page.get_text() for page in doc))

    return speech

In [11]:
# read the text of every downloaded pdf, in file-number order
list_speeches = import_pdfs(folder=folder_name)

In [12]:
# get file names: one "file_<n>" label per pdf link, numbered from 1
name = ["file_" + str(i + 1) for i in range(len(pdf_links))]

# create pandas data frame with file names as index
d = {'speech': list_speeches}
df = pd.DataFrame(data = d, index = name)

# extract dates of speeches (first "<day> <month name> <year>" match per text)
date_reg = r'(\d{1,2}\s{1,2}[a-zA-Z]{3,9},?\s?\d{4}?)'
df['date'] = df['speech'].str.extract(date_reg, expand = False)

# manually fix dates that were not picked up by regex
df.at['file_271', 'date'] = '2014-07-18'
df.at['file_329', 'date'] = '2015-07-15'
df.at['file_377', 'date'] = '2016-02-26'

# The column now mixes regex matches ("<day> <month name> <year>") with the
# ISO strings set above. pandas >= 2.0 infers a single format from the first
# value and raises on values that do not follow it, so each value is parsed
# on its own; the outer call only guarantees a datetime64 column.
df['date'] = pd.to_datetime(df['date'].map(pd.to_datetime))

In [13]:
# display the assembled frame: speech text and parsed date, indexed by file label
df

Unnamed: 0,speech,date
file_1,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,2004-07-27
file_2,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,2005-01-06
file_3,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,2005-06-04
file_4,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,2005-10-25
file_5,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,2005-12-15
...,...,...
file_432,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,2017-01-10
file_433,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,2017-01-12
file_434,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,2017-01-17
file_435,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,2017-01-18


In [14]:
# save the data frame as an Excel workbook, creating the output folder if needed
os.makedirs(name='output', exist_ok=True)
df.to_excel(excel_writer="output/obama_speeches.xlsx")