# Downloading and Importing Data

In [2]:
# accessing APIs and URLs
import requests

# static web scraping
from urllib.parse import urljoin
from urllib.request import urlopen
from lxml.html import parse, fromstring

# regular expressions
import re

# downloading files
import urllib.request

# operating system
import os

# looping through folder
import glob

# read pdfs
import fitz 

# data wrangling
import pandas as pd

In [3]:
# extract all links from website
# the with-block closes the HTTP response once the page has been parsed
with urlopen("https://www.americanrhetoric.com/barackobamaspeeches.htm") as response:
    tree = parse(response)
# select only anchors that carry an href attribute; a bare anchor such as
# <a name="top"> would otherwise raise KeyError in the comprehension below
linkelements = tree.xpath("//a[@href]")
links = [e.attrib["href"] for e in linkelements]

In [4]:
# sanity check: number of href values collected from the page
len(links)

1417

In [5]:
# keep only the hrefs whose text ends in "pdf"
p = re.compile('.*pdf$')
pdf_links = list(filter(p.match, links))

In [6]:
# number of links that passed the pdf filter
len(pdf_links)

436

In [7]:
# resolve each pdf path against the base url
# urljoin gives the same result as plain concatenation for relative paths,
# but also copes with hrefs that are already absolute or start with "/"
baseurl="https://www.americanrhetoric.com/"
full_pdf_links = [urljoin(baseurl, link) for link in pdf_links]

In [8]:
# create the download folder
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the folder already exists
os.makedirs("data", exist_ok=True)

In [9]:
# download every pdf into data/, numbering the files in link order
# (file_1.pdf, file_2.pdf, ...)
for i, link in enumerate(full_pdf_links, start=1):
    urllib.request.urlretrieve(link, f"data/file_{i}.pdf")

In [10]:
# import all files

# natural ("human") ordering for the pdf file names
numbers = re.compile(r'(\d+)')
def numericalSort(value):
    """Return a sort key that orders embedded digit runs numerically.

    The string is split on runs of digits; because the pattern has a
    capturing group, the digit runs land on the odd positions of the
    result and are converted to int, while the text in between is kept
    as-is. "file_10" therefore sorts after "file_2".
    """
    return [
        int(piece) if position % 2 else piece
        for position, piece in enumerate(numbers.split(value))
    ]

# all downloaded pdfs, in numeric file order (file_2 before file_10)
filename_list = sorted(glob.glob("data/*.pdf"), key=numericalSort)

# create empty lists
speech = []
name = []

# loop through all files
for filename in filename_list:
    # one string per document: the text of all pages, in page order
    with fitz.open(filename) as doc:
        text = "".join(page.get_text() for page in doc)
    speech.append(text)
    # label the row with the file name minus folder and extension
    # ("data/file_12.pdf" -> "file_12"); unlike a fixed-width digit regex this
    # neither truncates numbers >= 1000 nor fails on differently named pdfs
    name.append(os.path.splitext(os.path.basename(filename))[0])

# create pandas data frame
d = {'speech': speech, 'filename': name}
df = pd.DataFrame(data=d)

In [11]:
# extract date of speech
# pattern: day, month name (3-9 letters, optional trailing comma), 4-digit year
date_reg = r'(\d{1,2}\s{1,2}[a-zA-Z]{3,9},?\s\d{4})'
# first match per speech as text, then parsed into datetimes
raw_dates = df['speech'].str.extract(date_reg, expand=False)
df['date'] = pd.to_datetime(raw_dates)

The date extraction works fairly well. Only files 271, 329 and 377 still cause some issues and need fixing.

In [12]:
# display the assembled data frame (speech text, file label, parsed date)
df

Unnamed: 0,speech,filename,date
0,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_1,2004-07-27
1,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_2,2005-01-06
2,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_3,2005-06-04
3,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_4,2005-10-25
4,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_5,2005-12-15
...,...,...,...
431,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_432,2017-01-10
432,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_433,2017-01-12
433,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_434,2017-01-17
434,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_435,2017-01-18
