In [187]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from http.client import HTTPSConnection
import pickle
from urllib.request import urlopen
import requests
import os

    

In [8]:
def create_speech_df(host, annual_htm_list):
    '''
    Builds a dataframe containing information and links to all of the
    Federal Reserve speeches. This dataframe is later called to scrape the
    actual speeches

    INPUTS:
        host                the host (for the Federal Reserve 'www.federalreserve.gov)
        annual_htm_list     which contains the path to the web sites containing
                             speeches for each year

    OUTPUT:
        df    a dateframe containing the following columns
            ['date]         date of speech
            ['speaker']     speaker
            ['title']       title of speech
            ['link']        link to website with speech text to be scraped
            ['text']        empty column to be populated later with text

    NOTES:
        1. There are two items from 2006 to present that are on the Federal Reserve
            website that are not speeches but reports. These items are removed in this
            function by idenfitying dataframe rows where the speaker is blank

    '''
    all_dates = []
    all_speakers = []
    all_titles = []
    all_links = []
    for item in annual_htm_list:
        date_lst, speaker_lst, title_lst, link_lst =find_speeches_by_year(host,
                                                    item, print_test=False)
        all_dates = all_dates + date_lst
        all_speakers = all_speakers + speaker_lst
        all_titles = all_titles + title_lst
        all_links = all_links + link_lst

    dict1 = {'date': all_dates, 'speaker':all_speakers,
            'title': all_titles, 'link':all_links}
    df = pd.DataFrame.from_dict(dict1)
    #Cleaning up some of the dateframe elemenst to remove brackets
    df['date']=df['date'].str[0]
    #df['date'] = pd.to_datetime(df['date'])
    df['speaker']=df['speaker'].str[0]
    df['title']=df['title'].str[0]
    # creating empty column for documents
    doc = np.zeros_like(df['date'])
    df['text'] = doc

    # removing items that are not speeches. These contain a link that starts with '/pubs/feds'
    delete_these = df[df['link'].str.match('/pubs/feds')].index
    df = df.drop(delete_these)

    # now we need to sort the dataframe so that the most recent period is first
    df.sort_values(by=['date'], ascending = False, inplace = True)
    df.reset_index(drop=True, inplace=True)


In [154]:
# clean this link to only include newsevents/pressreleases/monetarypolicy/
print(len(link_list))

259


In [71]:
def find_speeches_by_year(host, this_url, print_test=False):
    '''
    Takes the host and a url for a given year
    and returns infromation about the speeches and links to the web site
    containing the text of the speeches. This function is used to create
    the list of all web sites that contain the individual speeches that
    need to be scraped.

    INPUTS:
        host        the host (for the Federal Reserve 'www.federalreserve.gov)
        this_url         the path to the speeches for a given year
        print_test  an optional field that will print out summary statistics

    OUTPUT:
        date_lst    list of speech dates
        speaker_lst list of speaker names
        title_lst   list containing titles of speeches
        link_lst    list of htm links to the actual speeches

    NOTES:
        1. There are video links on some of the urls that we need to removed.
            These videos are represented by the 'watchLive' class.

    '''
    conn = HTTPSConnection(host = host)
    conn.request(method='GET', url = this_url)
    resp = conn.getresponse()
    body = resp.read()
    # check that we received the correct response code
    if resp.status != 200:
        print('Error from Web Site! Response code: ', resp.status)
    else:
        soup=BeautifulSoup(body, 'html.parser')
        event_list = soup.find('div', id='article')
        # creating the list of dates, titles, speakers and html articles from web page
        month_lst =[]
        date_lst = []
        link_lst = []

        for row in event_list.find_all('div', class_='row fomc-meeting'):
            tmp_month = [x.text for x in row.find_all('fomc-meeting__month')]
            month_lst.append(tmp_month)
            
            tmp_date= [x.text for x in row.find_all('fomc-meeting__date')]
            date_lst.append(tmp_date)

            #  tmp_speaker = [x.text for x in row.find_all('p', class_='news__speaker')]
            #  speaker_lst.append(tmp_speaker)

            tmp_link = [x.text for x in row.find_all('href')]
            link_lst.append(tmp_link)

            # some of the links include video with the transcript. We are deleteing these here
            #for link in event_list.find_all('a', href=True, class_ = lambda x: x != 'watchLive'):
            #    link_lst.append(link['href'])

        if print_test:
            print('length of months: ', len(month_lst))
            print('length of dates: ', len(date_lst))
            print('length of href: ', len(link_lst))

        return month_lst, date_lst, link_lst


In [151]:
link_list = []
#for row in event_list.find_all('div', class_='row fomc-meeting'):
    #print(row)

for link in event_list.findAll('a', href=True):
    link_list.append(link.get('href'))
    
    
    
    

In [167]:
keep_these = []
for i in range(len(link_list)):
    this_href = link_list[i]
    #print(this_href)
    #print(type(this_href))
    if 'newsevents/pressreleases/' in this_href: 
        keep_these.append(i)

In [171]:
final_links = []
for item in keep_these:
    final_links.append(link_list[item])
print(len(link_list))
print(len(keep_these))
print(len(final_links))

259
76
76


<div class="panel panel-default"><div class="panel-heading"><h4><a id="7537">2014 FOMC Meetings </a></h4></div>
<div class="row fomc-meeting">
<div class="fomc-meeting__month col-xs-5 col-sm-3 col-md-2"><strong>January</strong></div>
<div class="fomc-meeting__date col-xs-4 col-sm-9 col-md-10 col-lg-1">28-29</div>
<div class="col-xs-12 col-md-4 col-lg-2">
<a href="/newsevents/pressreleases/monetary20140129a.htm">Statement</a><br/>
</div>
<div class="col-xs-12 col-md-4 col-lg-3">
</div>
<div class="col-xs-12 col-md-4 col-lg-4 fomc-meeting__minutes">
<strong>Minutes:</strong><br/>
<a href="/monetarypolicy/files/fomcminutes20140129.pdf">PDF</a> | <a href="/monetarypolicy/fomcminutes20140129.htm">HTML</a>
<br/> (Released February 19, 2014)
                    	
                    	
                    	</div>
</div>
<div class="fomc-meeting--shaded row fomc-meeting" style="border-bottom: none;">
<div class="fomc-meeting__month col-xs-5 col-sm-3 col-md-2"><strong>March </strong></div>
<div 

<div class="col-xs-12 col-sm-8 col-md-9" id="article">
<h3>Meeting calendars, statements, and minutes (2014-2019)</h3>
<!-- Article Area -->
<div class="col-xs-12 col-sm-12 col-md-5 pull-right" id="floatRightRail">
<div class="panel panel-related">
<div class="panel-heading"><h5 class="panel-title text-capitalize">FOMC Search</h5></div>
<div class="panel-body" style="padding-bottom:5px;">
<ul class="panel__list list-unstyled">
<li class="panel__listItem">Search all FOMC materials</li>
<li class="panel__listItem" style="padding-bottom: 20px;">
<form action="//www.fedsearch.org/fomc-docs/search" class="form-inline ng-pristine ng-valid" method="GET">
<div class="input-group">
<input class="form-control" id="fomcsearchbox" maxlength="90" name="text" type="text"/>
<span class="input-group-btn">
<button class="btn" type="submit">
<span class="icon icon--centered icon__sm icon-next"></span>
</button>
</span>
</div>
</form>
</li>
<li class="panel__listItem"><a class="noIcon" href="https://www.

In [216]:

def find_all_press_releases(host, this_url, print_test=False):
    '''
    Takes the host and a url to the FOMC web site for monetary policy press releases
    and return links to the web site containing the text of the press releases.
    This function is used to create the list of all web sites that contain the individual speeches that
    need to be scraped.

    INPUTS:
        host        the host (for the Federal Reserve 'www.federalreserve.gov)
        this_url         the path to the speeches for a given year
        print_test  an optional field that will print out summary statistics

    OUTPUT:
        final_links    list of htm links to the actual speeches

    '''
    conn = HTTPSConnection(host = host)
    conn.request(method='GET', url = this_url)
    resp = conn.getresponse()
    body = resp.read()
    # check that we received the correct response code
    if resp.status != 200:
        print('Error from Web Site! Response code: ', resp.status)
    else:
        soup=BeautifulSoup(body, 'html.parser')
        event_list = soup.find('div', id='article')
        # creating the list of dates, titles, speakers and html articles from web page

        for link in event_list.findAll('a', href=True):
            link_list.append(link.get('href'))

        # now we need to clean the link_list to remove pdf versions of the statements
        # The statements are often listed as two links, one to a web site and one to a pdf.
        keep_these = []
        for i in range(len(link_list)):
            this_href = link_list[i]
            if 'newsevents/pressreleases/monetary' in this_href:
                keep_these.append(i)

        htm_links = []
        for item in keep_these:
            htm_links.append(link_list[item])
            
        # the Federeal Resereve web site contains multiple html links on each date. I am interested in the 
        # policy statement only, which is kept as a date reference then 'a.ht,' 
        # Filtering non-statements out
        keep_these = []
        for i in range(len(htm_links)):
            this_href = htm_links[i]
            if 'a.htm' in this_href:
                keep_these.append(i)
        
        final_links = []
        for item in keep_these:
            final_links.append(htm_links[item])
                
        return final_links

In [192]:
#https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm
host = 'www.federalreserve.gov'
prefix = '/monetarypolicy/fomccalendars.htm'
this_url = host + prefix
print(this_url)

www.federalreserve.gov/monetarypolicy/fomccalendars.htm


In [194]:
conn = HTTPSConnection(host = host)
conn.request(method='GET', url = prefix)
resp = conn.getresponse()
body = resp.read()


In [217]:
links_list= find_all_press_releases(host, prefix, print_test=False)
#links_list

In [184]:
prefix = 'monetarypolicy.fomccalendars.htm'

In [185]:
host

'www.federalreserve.gov'

# testing new FOMC statement scraping

In [72]:
l_month, l_date, l_link = find_speeches_by_year(host, prefix, print_test=False)


In [73]:
l_link

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

In [191]:
soup=BeautifulSoup(body, 'html.parser')
event_list = soup.find('div', id='article')
#event_list


In [33]:
resp.status

200

In [214]:
soup=BeautifulSoup(body, 'html.parser')


In [27]:
resp.status

400

In [177]:
this_url

'www.federalreserve.gov/monetarypolicy/fomccalendars.htm'

In [221]:
this_link = links_list[0]
this_link

'/newsevents/pressreleases/monetary20190130a.htm'

In [223]:
temp_url = 'https://' + host + this_link
response = requests.get(temp_url)
sp = BeautifulSoup(response.text)

In [224]:
sp

<html><body><p>ï»¿<!DOCTYPE html>

</p>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0, minimum-scale=1.0 maximum-scale=1.6, user-scalable=1" name="viewport"/>
<meta content=" Information received since the Federal Open Market Committee met in December indicates that the labor market has continued to strengthen and that economic acti" name="description"/>
<meta content="Federal Reserve issues FOMC statement" property="og:title"/>
<meta content="Board of Governors of the Federal Reserve System" property="og:site_name"/>
<meta content="article" property="og:type"/>
<meta content=" Information received since the Federal Open Market Committee met in December indicates that the labor market has continued to strengthen and that economic acti" property="og:description"/>
<meta content="" property="og:image"/>
<meta content="summary" name="twitter:card"/>
<meta content="Federal Reserve issues FOMC statement" n

In [230]:
this_date= sp.find('p', class_='article__time')
this_date = this_date.text
this_date

'January 30, 2019'

In [235]:
def get_one_doc(host, this_url):
    '''
    This function takes a host and url containing the location of one Federal
    Reserve speech and returns a string containing the text from the speech.

    INPUTS:
        host    the host to the Federal Reserve
        url     the path to a particular speech

    OUTPUTS:
        string  containing the text of the speech

    '''
    #conn = HTTPSConnection(host = host)
    #conn.request(method='GET', url = this_url)
    #response = conn.getresponse()

    temp_url = 'https://' + host + this_url
    response = requests.get(temp_url)
    sp = BeautifulSoup(response.text)
    this_date = sp.find('p', class_='article__time')
    this_date = this_date.text

    article = sp.find('div', class_='col-xs-12 col-sm-8 col-md-8')
    doc = []
    for p in article.find_all('p'):
        doc.append(p.text)

    return_doc = ''.join(doc)

    return this_date, return_doc


In [236]:
this_date, this_doc = get_one_doc(host,this_link)

In [237]:
this_date

'January 30, 2019'

In [238]:
this_doc

"Information received since the Federal Open Market Committee met in December indicates that the labor market has continued to strengthen and that economic activity has been rising at a solid rate. Job gains have been strong, on average, in recent months, and the unemployment rate has remained low. Household spending has continued to grow strongly, while growth of business fixed investment has moderated from its rapid pace earlier last year. On a 12-month basis, both overall inflation and inflation for items other than food and energy remain near 2 percent. Although market-based measures of inflation compensation have moved lower in recent months, survey-based measures of longer-term inflation expectations are little changed.Consistent with its statutory mandate, the Committee seeks to foster maximum employment and price stability. In support of these goals, the Committee decided to maintain the target range for the federal funds rate at 2-1/4 to 2-1/2 percent. The Committee continues 

In [240]:
print(type(this_doc))

<class 'str'>


In [251]:

def retrieve_docs(host, link_list):
    '''
    This function takes a dataframe with the columns 'link' and 'text' and the host to
    the paths contained in the link column. The original dataframe is returned with
    the text of the scrapped speeches in the 'text' column as a string

    INPUTS:
        host          the host to the Federal Reserve
        link_list     a list 'link' which contains all of the speech paths to be scrapped.

    OUTPUTS:
        doc_list      the original dataframe is returned with the column 'text' populated
                with the text from the speeches
        doc_date
    '''
    doc_list = []
    date_list = []
    for i in range(len(link_list)):
        this_item = link_list[i]
        print('Scraping text for documents #: ', i)
        this_date, this_doc = get_one_doc(host, this_item)
        doc_list.append(this_doc)
        date_list.append(this_date)
    return doc_list, date_list

In [252]:
doc_list, date_list = retrieve_docs(host, links_list)

Scraping text for documents #:  0
Scraping text for documents #:  1
Scraping text for documents #:  2
Scraping text for documents #:  3
Scraping text for documents #:  4
Scraping text for documents #:  5
Scraping text for documents #:  6
Scraping text for documents #:  7
Scraping text for documents #:  8
Scraping text for documents #:  9
Scraping text for documents #:  10
Scraping text for documents #:  11
Scraping text for documents #:  12
Scraping text for documents #:  13
Scraping text for documents #:  14
Scraping text for documents #:  15
Scraping text for documents #:  16
Scraping text for documents #:  17
Scraping text for documents #:  18
Scraping text for documents #:  19
Scraping text for documents #:  20
Scraping text for documents #:  21
Scraping text for documents #:  22
Scraping text for documents #:  23
Scraping text for documents #:  24
Scraping text for documents #:  25
Scraping text for documents #:  26
Scraping text for documents #:  27
Scraping text for documents #:

Scraping text for documents #:  231
Scraping text for documents #:  232
Scraping text for documents #:  233
Scraping text for documents #:  234
Scraping text for documents #:  235
Scraping text for documents #:  236
Scraping text for documents #:  237
Scraping text for documents #:  238
Scraping text for documents #:  239
Scraping text for documents #:  240
Scraping text for documents #:  241
Scraping text for documents #:  242
Scraping text for documents #:  243
Scraping text for documents #:  244
Scraping text for documents #:  245
Scraping text for documents #:  246
Scraping text for documents #:  247
Scraping text for documents #:  248
Scraping text for documents #:  249
Scraping text for documents #:  250
Scraping text for documents #:  251
Scraping text for documents #:  252
Scraping text for documents #:  253
Scraping text for documents #:  254
Scraping text for documents #:  255
Scraping text for documents #:  256
Scraping text for documents #:  257
Scraping text for documents 

In [254]:
doc_list

["Information received since the Federal Open Market Committee met in December indicates that the labor market has continued to strengthen and that economic activity has been rising at a solid rate. Job gains have been strong, on average, in recent months, and the unemployment rate has remained low. Household spending has continued to grow strongly, while growth of business fixed investment has moderated from its rapid pace earlier last year. On a 12-month basis, both overall inflation and inflation for items other than food and energy remain near 2 percent. Although market-based measures of inflation compensation have moved lower in recent months, survey-based measures of longer-term inflation expectations are little changed.Consistent with its statutory mandate, the Committee seeks to foster maximum employment and price stability. In support of these goals, the Committee decided to maintain the target range for the federal funds rate at 2-1/4 to 2-1/2 percent. The Committee continues

In [255]:
len(doc_list)

378

In [256]:
len(date_list)

378

In [268]:
date_dt = pd.to_datetime(date_list)


In [269]:
df_dict = {'date':date_dt, 'text':doc_list}

In [270]:
df = pd.DataFrame(df_dict)

In [271]:
df.head()

Unnamed: 0,date,text
0,2019-01-30,Information received since the Federal Open Ma...
1,2019-03-20,Information received since the Federal Open Ma...
2,2018-01-31,Information received since the Federal Open Ma...
3,2018-03-21,Information received since the Federal Open Ma...
4,2018-05-02,Information received since the Federal Open Ma...
