# Setup

In [1]:
# This code chunk can be removed in the final notebook since the requirements.txt file lists all used packages!
!pip install requests openpyxl PyMuPDF pandas glob2

You should consider upgrading via the '/Users/emma/.pyenv/versions/3.9.5/envs/mda_assignment/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [246]:
# --- standard library ---
# looping through folder
import glob
# operating system
import os
# regular expressions
import re
# downloading files
import urllib.request
# resolving relative links against a base URL
from urllib.parse import urljoin
# static web scraping (opening URLs)
from urllib.request import urlopen

# --- third-party ---
# read pdfs
import fitz
# data wrangling
import pandas as pd
# accessing APIs and URLs
import requests
# static web scraping (HTML parsing)
from lxml.html import parse, fromstring

# disable warnings
# NOTE(review): no category or module is given, so this silences every warning
# for the rest of the session, including deprecation/future warnings raised by
# the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Data Collection and Import

In [3]:
# extract all links from website
SPEECH_INDEX_URL = "https://www.americanrhetoric.com/barackobamaspeeches.htm"

# the context manager closes the HTTP response as soon as the page is parsed
with urlopen(SPEECH_INDEX_URL) as response:
    tree = parse(response)

# select only anchors that carry an href attribute: anchors without one
# would otherwise raise a KeyError when the attribute is read
linkelements = tree.xpath("//a[@href]")
links = [e.get("href") for e in linkelements]

In [4]:
# number of links collected from the index page
len(links)

1417

In [5]:
# keep only the links that end in "pdf"
p = re.compile('.*pdf$')
pdf_links = list(filter(p.match, links))

In [6]:
# number of links that passed the "ends in pdf" filter
len(pdf_links)

436

In [7]:
# combine base URL and path
baseurl="https://www.americanrhetoric.com/"
# urljoin resolves relative paths against the base URL and leaves links that
# are already absolute unchanged (plain string concatenation would corrupt them)
full_pdf_links = [urljoin(baseurl, link) for link in pdf_links]

In [8]:
# create new folder
# exist_ok=True keeps the cell re-runnable: without it a second run raises
# FileExistsError because the folder is already there
os.makedirs("data", exist_ok=True)

In [9]:
# download every pdf into data/, numbering the files in link order starting at 1
for file_no, pdf_url in enumerate(full_pdf_links, start=1):
    target_path = "data/file_" + str(file_no) + ".pdf"
    urllib.request.urlretrieve(pdf_url, target_path)

In [261]:
# sort pdf files by name
numbers = re.compile(r'(\d+)')


def numericalSort(value):
    """Return a sort key that orders the digit runs inside ``value`` numerically.

    The string is split on runs of digits; the digit runs are converted to
    ``int`` so that e.g. ``file_2`` sorts before ``file_10``.
    """
    parts = numbers.split(value)
    # the capture group in `numbers` places the digit runs at the odd positions
    parts[1::2] = map(int, parts[1::2])
    return parts


def file_label(path):
    """Return the file name of ``path`` without its directory and extension.

    Using the full stem avoids truncating numbers with more than three digits
    and works for any file name, not only ones shaped like ``file_<n>``.
    """
    return os.path.splitext(os.path.basename(path))[0]


def read_pdf_text(path):
    """Return the text of all pages of the PDF at ``path``, joined in page order."""
    with fitz.open(path) as doc:
        # join once instead of growing a string page by page
        return "".join(page.get_text() for page in doc)


filename_list = sorted(glob.glob("data/*.pdf"), key=numericalSort)

# extract the text and the label of every file (same order as filename_list)
speech = [read_pdf_text(filename) for filename in filename_list]
name = [file_label(filename) for filename in filename_list]

# create pandas data frame
d = {'speech': speech, 'filename': name}
df = pd.DataFrame(data=d)

In [12]:
# display the assembled data frame (columns: speech, filename)
df

Unnamed: 0,speech,filename
0,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_1
1,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_2
2,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_3
3,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_4
4,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_5
...,...,...
431,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_432
432,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_433
433,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_434
434,\nAAm\nmeerriiccaannRRhheettoorriicc..ccoom\...,file_435


# Data Pre-Processing

In [336]:
# work on a copy so the extracted text in df stays untouched by the cleaning steps
df_clean = df.copy()

In [337]:
# separate line breaks from words
# regex=True is passed explicitly: pandas >= 2.0 defaults to literal matching
# and raises a ValueError when it receives a compiled pattern without it

# insert a space between a line break and a following letter, backslash or "["
p1 = re.compile(r'(\n)([A-Za-z\\[])')
df_clean['speech'] = df_clean.speech.str.replace(p1, r"\1 \2", regex=True)

# insert a space between a letter and a following line break
p2 = re.compile(r'([A-Za-z])(\n)')
df_clean['speech'] = df_clean.speech.str.replace(p2, r"\1 \2", regex=True)

In [338]:
# remove footer
# - raw string: keeps \. \s \d as regex escapes instead of invalid string escapes
# - regex=True: pandas >= 2.0 would otherwise treat the pattern as literal text
#   and silently replace nothing
# - \d+ consumes the whole page number, so multi-digit page numbers leave no
#   stray digit behind
# - the empty alternative in the original "(.||\n)" group is dropped; it matched
#   nothing extra
df_clean['speech'] = df_clean.speech.str.replace(
    r'(AAm|AmericanRhetoric\.com)\s((.|\n)*?)\sPage\s\d+', '', regex=True
)

In [339]:
# remove everything up until (and including) the statement about transcription
# raw string keeps the regex escapes intact; regex=True is required because
# pandas >= 2.0 treats the pattern as literal text by default
df_clean['speech'] = df_clean.speech.str.replace(
    r'^((.|\n)*)\s(AUTHENTICITY)\s.*\s\n', '', regex=True
)

In [340]:
# strip every newline character from the speech text
df_clean['speech'] = df_clean['speech'].str.replace('\n', '')

In [342]:
# remove punctuation and special characters
# (keeps ASCII letters, digits, whitespace and apostrophes)
# raw string keeps \d and \s as regex escapes; regex=True is required because
# pandas >= 2.0 treats the pattern as literal text by default
df_clean['speech'] = df_clean.speech.str.replace(r"[^a-zA-Z\d\s']", '', regex=True)

In [347]:
# REMOVE FOR FINAL PROJECT

# view cleaned speeches
# option_context lifts the column-width limit only for this print, so the
# global pandas display settings are not modified by running this cell
with pd.option_context('display.max_colwidth', None):
    print(df_clean.loc[[0]])

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [333]:
# REMOVE FOR FINAL PROJECT

# reset settings
# the pattern '^display.' selects the options whose name starts with "display",
# so this restores all of them to their defaults, not only max_colwidth;
# silent=True is passed to suppress warnings during the reset
pd.reset_option('^display.', silent=True)