In [4]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup, SoupStrainer
import json
from pymongo import MongoClient
import pickle
import codecs

%matplotlib inline

## scraping functions

In [307]:
def to_soup(url):
    """
    Takes url as input and returns soup object.
    """
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    return soup

        
        
def get_urls(site, keyword_string):
    """
    Takes a page url and keyword string for href tag. Returns a list of complete
    urls on that page that have specified keywords in href tag. Urls include
    prefix.
    """
    soup = to_soup(site)
    hrefs = soup.find_all('a', href=re.compile(keyword_string))
    links = []
    for anchor in hrefs:
        links.append(anchor['href'])
    links = ['https://obamawhitehouse.archives.gov' + x for x in links]
    return links


def parse_page(url):
    """
    Takes press briefing url as input and calls functions to scrape attributes.
    """
    soup = to_soup(url)
    
    try:
        title = soup.find('h1').text
    except Exception as e:
        print('Title: ' + str(e))
        title = None
        
    try:
        date = soup.find("div", {"class": "press-article-date"}).text
    except Exception as e:
        print('Date: ' + str(e))
        date = None
        
    try:
        location = soup.find("p", {"class": "rtecenter"})
        location = location.text.replace("\n", " ").replace("\t", "").lstrip(' ')
    except Exception as e:
        print('Location: ' + str(e))
        location = None
#     # try to improve this later to catch more
#     except Exception:
#         try:
#             location = soup.findAll("div", {"class": "legacy-center"})[-1].text
#             location = location.text.replace("\n", " ").replace("\t", "").lstrip(' ')
#         except Exception:
#             try:
#                 location = soup.findAll("div", {"class": "legacy-para"})[2].text
#                 location = location.text.replace("\n", " ").replace("\t", "").lstrip(' ')
#             except Exception as e:
#                 print('Location: ' + str(e))
#                 location = None

    try:
        doc_text = soup.find("div", {"class": "field-items"}).text
    except Exception as e:
        print('Doc_text: ' + str(e))
        doc_text = None
        

    attributes = (title, date, location, doc_text)
    
    return attributes

In [35]:
# url = 'https://obamawhitehouse.archives.gov/briefing-room/press-briefings'
# response = requests.get(url)
# soup = to_soup(url)

In [36]:
all_index_pages = ['https://obamawhitehouse.archives.gov/briefing-room/press-briefings?term_node_tid_depth=36']
for i in range(1, 178):
    all_index_pages.append("""https://obamawhitehouse.archives.gov/briefing-room/press-briefings?term_node_
                            tid_depth=36&page=""" + str(i))

In [45]:
all_urls = []
for page in all_index_pages:
    page_urls = get_urls(page, 'the-press-office')
    all_urls.extend(page_urls)

In [317]:
len(all_urls)

1779

## set up and scrape into Mongo

In [5]:
client = MongoClient()
#fletcher is my database
db = client.fletcher
#briefings is my collection of documents (like a table in SQL)
briefs = db.briefings

In [None]:
for i, j in enumerate(all_urls):
    attributes = parse_page(j)
    data = {}
    data['title'] = attributes[0]
    data['date'] = attributes[1]
    data['location'] = attributes[2]
    data['doc_text'] = attributes[3]
    briefs.insert_one(data)
    if None in data.values():
        print('Index = ' + str(i) + '\nError at ' + str(j))
    print(i)
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Location: 'NoneType' object has no attribute 'text'
Index = 18
Error at https://obamawhitehouse.archives.gov/the-press-office/2016/12/09/press-briefing-principal-deputy-press-secretary-eric-schultz-12916
18
19
20
21
Location: 'NoneType' object has no attribute 'text'
Index = 22
Error at https://obamawhitehouse.archives.gov/the-press-office/2016/12/05/press-briefing-press-secretary-josh-earnest-12516
22
23
24
25
26
27
28
29
30
31
32
Location: 'NoneType' object has no attribute 'text'
Index = 33
Error at https://obamawhitehouse.archives.gov/the-press-office/2016/11/11/press-briefing-press-secretary-josh-earnest-110916
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
Location: 'NoneType' object has no attribute 'text'
Index = 102
Error at https://obamawhitehouse.archives.gov/the-press-off

In [None]:
i = 1671

In [334]:
# restart at 1671

for i, j in enumerate(all_urls[1671:]):
    attributes = parse_page(j)
    data = {}
    data['title'] = attributes[0]
    data['date'] = attributes[1]
    data['location'] = attributes[2]
    data['doc_text'] = attributes[3]
    briefs.insert_one(data)
    if None in data.values():
        print('Index = ' + str(i) + '\nError at ' + str(j))
    print(i)
    

Location: 'NoneType' object has no attribute 'text'
Index = 0
Error at https://obamawhitehouse.archives.gov/the-press-office/briefing-white-house-press-secretary-robert-gibbs-5182009
0
Location: 'NoneType' object has no attribute 'text'
Index = 1
Error at https://obamawhitehouse.archives.gov/the-press-office/press-briefing-a-senior-administration-official-vice-presidents-upcoming-trip-sout-0
1
Location: 'NoneType' object has no attribute 'text'
Index = 2
Error at https://obamawhitehouse.archives.gov/the-press-office/briefing-white-house-press-secretary-robert-gibbs-5-15-09
2
Location: 'NoneType' object has no attribute 'text'
Index = 3
Error at https://obamawhitehouse.archives.gov/the-press-office/press-briefing-a-senior-administration-official-vice-presidents-upcoming-trip-south
3
Location: 'NoneType' object has no attribute 'text'
Index = 4
Error at https://obamawhitehouse.archives.gov/the-press-office/press-gaggle-press-secretary-robert-gibbs-aboard-air-force-one
4
Location: 'NoneTy

**debugging errors**

In [335]:
# all_urls[1671]
# for i, j in enumerate(all_urls[1671:]):
#     print(i, j)

In [None]:
# lots of locations missing (looks like some are div class rtecenter and others are 
# p class rtecenter and others near end are legacy center)
# -- could recode above and fix many
# try with all_urls[-20:] once have improved location code above to see how it works

In [314]:
# # test for errors

# for i, j in enumerate(all_urls[17:20]):
#     attributes = parse_page(j)
#     data = {}
#     data['title'] = attributes[0]
#     data['date'] = attributes[1]
#     data['location'] = attributes[2]
#     data['doc_text'] = attributes[3]
#     briefs.insert_one(data)
#     if None in data.values():
#         print('Index = ' + str(i) + '\nError at ' + str(j))
#     print(i)

In [332]:
# for i in db.briefings.find({}, {'date': 1}):
#     print(i)

In [336]:
# for i in db.briefings.find({'date': 'May 18, 2009'}):
#     print(i)

In [313]:
# for i in db.briefings.find({'date': 'December 02, 2009'}):
#     print(i)

## put into dataframe and split on paragraphs

In [6]:
df = pd.DataFrame(list(briefs.find()))

In [7]:
df.head()

Unnamed: 0,_id,date,doc_text,location,title
0,592748481de2880f286161de,"January 17, 2017",\n\n\tJames S. Brady Press Briefing Room\n\n\t...,James S. Brady Press Briefing Room,Press Briefing by Press Secretary Josh Earnest...
1,5927484a1de2880f286161df,"January 13, 2017",\n\n\tJames S. Brady Press Briefing Room\n\n\t...,James S. Brady Press Briefing Room,Press Briefing by Press Secretary Josh Earnest...
2,5927484c1de2880f286161e0,"January 13, 2017",\n\n\tJames S. Brady Press Briefing Room\n\n\t...,James S. Brady Press Briefing Room,Press Briefing by Press Secretary Josh Earnest...
3,5927484f1de2880f286161e1,"January 12, 2017",\n\n\tON-THE-RECORD PRESS CALL\n\tBY DEPUTY NA...,ON-THE-RECORD PRESS CALL BY DEPUTY NATIONAL SE...,On-the-Record Press Call on Cuba Policy Announ...
4,592748511de2880f286161e2,"January 11, 2017",\n\n\tJames S. Brady Press Briefing Room\n\n\t...,James S. Brady Press Briefing Room,Press Briefing by Press Secretary Josh Earnest...


In [8]:
# print(codecs.getdecoder('unicode_escape')(df.doc_text[0])[0])

In [9]:
records = df.to_dict(orient = 'records')

In [10]:
def flat_splitter(df):
    """
    Splits documents (initially one per row) into one paragraph per row.
    """
    for document in df:
        for paragraph in document['doc_text'].split('\n\n'):
            yield {'date': document['date'], 'title': document['title'], 'location': document['location'], \
                   'paragraph': paragraph}

In [11]:
next(flat_splitter(records))

{'date': 'January 17, 2017',
 'location': 'James S. Brady Press Briefing Room',
 'paragraph': '',
 'title': 'Press Briefing by Press Secretary Josh Earnest, 1/17/17'}

In [12]:
for i, j in enumerate(flat_splitter(records)):
    if i > 5:
        break
    print(j)

{'date': 'January 17, 2017', 'title': 'Press Briefing by Press Secretary Josh Earnest, 1/17/17', 'location': 'James S. Brady Press Briefing Room', 'paragraph': ''}
{'date': 'January 17, 2017', 'title': 'Press Briefing by Press Secretary Josh Earnest, 1/17/17', 'location': 'James S. Brady Press Briefing Room', 'paragraph': '\tJames S. Brady Press Briefing Room'}
{'date': 'January 17, 2017', 'title': 'Press Briefing by Press Secretary Josh Earnest, 1/17/17', 'location': 'James S. Brady Press Briefing Room', 'paragraph': '\t12:15 P.M. EST'}
{'date': 'January 17, 2017', 'title': 'Press Briefing by Press Secretary Josh Earnest, 1/17/17', 'location': 'James S. Brady Press Briefing Room', 'paragraph': '\tMR. EARNEST:\xa0 Good afternoon, everybody.\xa0 I don’t actually have any announcements at the top, but --'}
{'date': 'January 17, 2017', 'title': 'Press Briefing by Press Secretary Josh Earnest, 1/17/17', 'location': 'James S. Brady Press Briefing Room', 'paragraph': '\tQ\xa0\xa0\xa0 Thank y

In [13]:
record_dict = flat_splitter(records)

In [14]:
df = pd.DataFrame(record_dict)

In [17]:
df.head()

Unnamed: 0,date,location,paragraph,title
0,"January 17, 2017",James S. Brady Press Briefing Room,,Press Briefing by Press Secretary Josh Earnest...
1,"January 17, 2017",James S. Brady Press Briefing Room,\tJames S. Brady Press Briefing Room,Press Briefing by Press Secretary Josh Earnest...
2,"January 17, 2017",James S. Brady Press Briefing Room,\t12:15 P.M. EST,Press Briefing by Press Secretary Josh Earnest...
3,"January 17, 2017",James S. Brady Press Briefing Room,"\tMR. EARNEST: Good afternoon, everybody. I ...",Press Briefing by Press Secretary Josh Earnest...
4,"January 17, 2017",James S. Brady Press Briefing Room,\tQ Thank you. (Laughter.),Press Briefing by Press Secretary Josh Earnest...


In [16]:
df.shape

(123467, 4)

In [18]:
df['paragraph'] = df.paragraph.str.replace('\t', '')

In [452]:
df.head()

Unnamed: 0,date,location,paragraph,title
0,"January 17, 2017",James S. Brady Press Briefing Room,,Press Briefing by Press Secretary Josh Earnest...
1,"January 17, 2017",James S. Brady Press Briefing Room,James S. Brady Press Briefing Room,Press Briefing by Press Secretary Josh Earnest...
2,"January 17, 2017",James S. Brady Press Briefing Room,12:15 P.M. EST,Press Briefing by Press Secretary Josh Earnest...
3,"January 17, 2017",James S. Brady Press Briefing Room,"MR. EARNEST: Good afternoon, everybody. I do...",Press Briefing by Press Secretary Josh Earnest...
4,"January 17, 2017",James S. Brady Press Briefing Room,Q Thank you. (Laughter.),Press Briefing by Press Secretary Josh Earnest...


In [None]:
# later look at clusters to see if this takes care of James Brady, times, laughter, good morning/afternoon, applause

In [20]:
# drop empty paragraphs
df2 = df[df.paragraph != '']

In [23]:
df2.head()

Unnamed: 0,date,location,paragraph,title
1,"January 17, 2017",James S. Brady Press Briefing Room,James S. Brady Press Briefing Room,Press Briefing by Press Secretary Josh Earnest...
2,"January 17, 2017",James S. Brady Press Briefing Room,12:15 P.M. EST,Press Briefing by Press Secretary Josh Earnest...
3,"January 17, 2017",James S. Brady Press Briefing Room,"MR. EARNEST: Good afternoon, everybody. I do...",Press Briefing by Press Secretary Josh Earnest...
4,"January 17, 2017",James S. Brady Press Briefing Room,Q Thank you. (Laughter.),Press Briefing by Press Secretary Josh Earnest...
5,"January 17, 2017",James S. Brady Press Briefing Room,MR. EARNEST: But because today marks my last ...,Press Briefing by Press Secretary Josh Earnest...


In [21]:
df2.shape

(121025, 4)

In [22]:
df2.to_pickle('paragraph_df.pkl')