In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [17]:
# Load the list of members and throw away the ready-made URL column
# in the same step, just to make things harder for us
# (BUT THE THINGS WE'LL LEARN!!!) - we rebuild each URL from the slug later
df = pd.read_csv("congress.csv").drop(columns='link')
df.head()

Unnamed: 0,name_raw,slug
0,"Senator Abdnor, James",james-abdnor/A000009
1,"Representative Abercrombie, Neil",neil-abercrombie/A000014
2,"Senator Abourezk, James",james-abourezk/A000017
3,"Representative Abraham, Ralph Lee",ralph-abraham/A000374
4,"Senator Abraham, Spencer",spencer-abraham/A000355


## Let's download the first page

In [18]:
# Member page for the first row of the dataframe (slug james-abdnor/A000009)
url = "https://www.congress.gov/member/james-abdnor/A000009"
# Download the raw HTML of that page
response = requests.get(url)
# Parse the HTML so the cells below can search it with find / select_one
doc = BeautifulSoup(response.text)

## Let's scrape everything we need to from the page

In [19]:
# Pull each piece of information out of the parsed page and print it.
# doc.find, not doc.find_all, because each of these shows up only ONCE
name = doc.find(class_='legDetail').next_element

# The birthdate tag is needed twice: for its own text, and as the
# starting point for the span that follows it ("In Congress ...")
birthdate_tag = doc.find(class_='birthdate')
birthdate = birthdate_tag.text
in_congress = birthdate_tag.find_next_sibling('span').text

# Selector copied from the browser; .strip() trims the padding around the text
bill_count_tag = doc.select_one('#searchTune > div.basic-search-tune-number > span')
bill_count = bill_count_tag.text.strip()

print(name, birthdate, in_congress, bill_count)

Senator James Abdnor  (1923 - 2012) In Congress 1973 - 1987 1-100                
                of 1,949


In [20]:
# Find class='birthdate', then find the next span tag


In [21]:
# In the browser inspector: Right click > Copy > Copy selector
# A CSS selector like this one needs doc.select_one (doc.find won't take it)
# The raw .text comes back padded with spaces and line breaks,
# which is why the cells that store this value call .strip() on it
doc.select_one('#searchTune > div.basic-search-tune-number > span').text

'\n\r\n                    1-100                \r\n                of 1,949            '

## Let's turn this into a dictionary

We're going to call it `page` because it's a description of what's on that page.

In [22]:
# *Everything* lives in this one cell:
# downloading the page, parsing it, and scraping it
url = "https://www.congress.gov/member/james-abdnor/A000009"
print("We are scraping", url)

response = requests.get(url)
doc = BeautifulSoup(response.text)

# One dictionary describing what's on the page.
# doc.find, not doc.find_all, because each of these shows up only ONCE
page = {
    'name': doc.find(class_='legDetail').next_element,
    'birthdate': doc.find(class_='birthdate').text,
    'in_congress': doc.find(class_='birthdate').find_next_sibling('span').text,
    'bill_count': doc.select_one('#searchTune > div.basic-search-tune-number > span').text.strip(),
}
print(page)

We are scraping https://www.congress.gov/member/james-abdnor/A000009
{'name': 'Senator James Abdnor', 'birthdate': ' (1923 - 2012)', 'in_congress': 'In Congress 1973 - 1987', 'bill_count': '1-100                \r\n                of 1,949'}


# Let's build a function!!!

In [25]:
# Keep only the first 10 rows - presumably so the scraping below stays quick
# while the function is being built. This overwrites df, so the rest of the
# notebook only ever sees these 10 members.
df = df.head(10)
# (rows, columns) of the trimmed dataframe
# NOTE(review): the saved output under this cell is a table of the 10 rows,
# not a shape tuple, so it looks stale - re-run to refresh
df.shape

Unnamed: 0,name_raw,slug
0,"Senator Abdnor, James",james-abdnor/A000009
1,"Representative Abercrombie, Neil",neil-abercrombie/A000014
2,"Senator Abourezk, James",james-abourezk/A000017
3,"Representative Abraham, Ralph Lee",ralph-abraham/A000374
4,"Senator Abraham, Spencer",spencer-abraham/A000355
5,"Representative Abzug, Bella S.",bella-abzug/A000018
6,"Representative Acevedo-Vila, Anibal",anibal-acevedo-vila/A000359
7,"Representative Ackerman, Gary L.",gary-ackerman/A000022
8,"Representative Adams, Alma S.",alma-adams/A000370
9,"Senator Adams, Brock",brockman-adams/A000031


In [26]:
def scrape_page(row):
    """First draft: build the member URL for one row and print it.

    Nothing is downloaded yet and nothing is returned - this version only
    proves that the function gets called once per row with the right slug.
    """
    # Gluing the slug on with + or dropping it into a template both give
    # the same URL; the template version is used here
    slug = row['slug']
    url = "https://www.congress.gov/member/{}".format(slug)
    # To show just the 'slug' column instead, print slug rather than url
    print("Scraping", url)

In [27]:
# YO!
# Take my dataframe - df (only its first five rows here, via .head()).
# Go through every single row - axis=1
# And feed each row to scrape_page - .apply(scrape_page, ...)
# scrape_page has no return yet, so what comes back is a Series of None
df.head().apply(scrape_page, axis=1)

Scraping https://www.congress.gov/member/james-abdnor/A000009
Scraping https://www.congress.gov/member/neil-abercrombie/A000014
Scraping https://www.congress.gov/member/james-abourezk/A000017
Scraping https://www.congress.gov/member/ralph-abraham/A000374
Scraping https://www.congress.gov/member/spencer-abraham/A000355


0    None
1    None
2    None
3    None
4    None
dtype: object

# Build our real scraping function

In [28]:
def scrape_page(row):
    """Download one congress.gov member page and scrape its details.

    Parameters
    ----------
    row : dataframe row (or anything indexable by 'slug')
        row['slug'] is the part of the member URL that comes after
        https://www.congress.gov/member/

    Returns
    -------
    pd.Series
        Keys 'name', 'birthdate', 'in_congress' and 'bill_count'.
        'in_congress' is None when the page has no <span> after the
        birthdate, so every row ends up with the same set of keys.

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status.
    AttributeError
        When the name, birthdate or bill-count element is missing
        from the page.
    """
    url = f"https://www.congress.gov/member/{row['slug']}"
    print("Scraping", url)

    # A timeout keeps one stalled request from hanging a whole df.apply run,
    # and raise_for_status turns an error page into a clear HTTP error
    # instead of a confusing "NoneType has no attribute ..." further down
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    doc = BeautifulSoup(response.text)

    # doc.find, not doc.find_all, because each of these shows up only ONCE
    page = {}
    page['name'] = doc.find(class_='legDetail').next_element

    # Look the birthdate tag up once; it is also the anchor for the
    # "In Congress ..." span that follows it
    birthdate_tag = doc.find(class_='birthdate')
    page['birthdate'] = birthdate_tag.text

    # The following span is optional: store None when it is absent rather
    # than hiding every possible error behind a bare except
    in_congress_tag = birthdate_tag.find_next_sibling('span')
    page['in_congress'] = in_congress_tag.text if in_congress_tag is not None else None

    page['bill_count'] = doc.select_one('#searchTune > div.basic-search-tune-number > span').text.strip()

    # In your function, always return pd.Series(page)
    # With df.apply(..., axis=1), each returned Series becomes one row
    # of the resulting dataframe
    return pd.Series(page)

In [29]:
# Try the function on a single row (index label 0) before running it on everything
scrape_page(df.loc[0])

Scraping https://www.congress.gov/member/james-abdnor/A000009


name                                        Senator James Abdnor
birthdate                                          (1923 - 2012)
in_congress                              In Congress 1973 - 1987
bill_count     1-100                \r\n                of 1,949
dtype: object

In [30]:
# And on a different row (index label 2) to check it isn't only working for the first member
scrape_page(df.loc[2])

Scraping https://www.congress.gov/member/james-abourezk/A000017


name                                    Senator James Abourezk
birthdate                                            (1931 - )
in_congress                            In Congress 1971 - 1979
bill_count     1-100                \r\n                of 875
dtype: object

In [70]:
# Run scrape_page on every row of df; each Series it returns becomes one row of scraped_df
# NOTE(review): the log below shows the first URL scraped twice - presumably an
# older pandas, where apply could call the function twice on the first row; confirm
scraped_df = df.apply(scrape_page, axis=1)
scraped_df

Scraping https://www.congress.gov/member/james-abdnor/A000009
Scraping https://www.congress.gov/member/james-abdnor/A000009
Scraping https://www.congress.gov/member/neil-abercrombie/A000014
Scraping https://www.congress.gov/member/james-abourezk/A000017
Scraping https://www.congress.gov/member/ralph-abraham/A000374
Scraping https://www.congress.gov/member/spencer-abraham/A000355
Scraping https://www.congress.gov/member/bella-abzug/A000018
Scraping https://www.congress.gov/member/anibal-acevedo-vila/A000359
Scraping https://www.congress.gov/member/gary-ackerman/A000022
Scraping https://www.congress.gov/member/alma-adams/A000370
Scraping https://www.congress.gov/member/brockman-adams/A000031


Unnamed: 0,name,birthdate,in_congress,bill_count
0,Senator James Abdnor,(1923 - 2012),In Congress 1973 - 1987,"1-100 \r\n of 1,949"
1,Representative Neil Abercrombie,(1938 - ),"In Congress 1985 - 1987, 1991 - 2011","1-100 \r\n of 4,472"
2,Senator James Abourezk,(1931 - ),In Congress 1971 - 1979,1-100 \r\n of 875
3,Representative Ralph Lee Abraham,(1954 - ),In Congress 2015 - Present | \n\n $(do...,1-100 \r\n of 637
4,Senator Spencer Abraham,(1952 - ),In Congress 1995 - 2001,"1-100 \r\n of 1,227"
5,Representative Bella S. Abzug,(1920 - 1998),In Congress 1971 - 1977,"1-100 \r\n of 1,437"
6,Representative Anibal Acevedo-Vila,(1962 - ),In Congress 2001 - 2005,1-100 \r\n of 503
7,Representative Gary L. Ackerman,(1942 - ),In Congress 1983 - 2013,"1-100 \r\n of 7,315"
8,Representative Alma S. Adams,(1946 - ),In Congress 2014 - Present | \n\n $(do...,1-100 \r\n of 809
9,Senator Brock Adams,(1927 - 2004),"In Congress 1965 - 1979, 1987 - 1993","1-100 \r\n of 1,716"


In [71]:
# Remind ourselves what the original dataframe looks like before combining
# NOTE(review): the output below has a `name` column where the load cell showed
# `name_raw` - a rename presumably happened in a cell that isn't in this notebook; confirm
df.head()

Unnamed: 0,name,slug
0,"Senator Abdnor, James",james-abdnor/A000009
1,"Representative Abercrombie, Neil",neil-abercrombie/A000014
2,"Senator Abourezk, James",james-abourezk/A000017
3,"Representative Abraham, Ralph Lee",ralph-abraham/A000374
4,"Senator Abraham, Spencer",spencer-abraham/A000355


# Add our new scraped columns to our old dataframe columns

In [73]:
# Line the two dataframes up by their index (left_index / right_index) rather than by a column.
# Both have a `name` column, so the result shows them as name_x (original) and name_y (scraped).
# The combined dataframe is only displayed here, not saved to a variable.
df.merge(scraped_df, left_index=True, right_index=True)

Unnamed: 0,name_x,slug,name_y,birthdate,in_congress,bill_count
0,"Senator Abdnor, James",james-abdnor/A000009,Senator James Abdnor,(1923 - 2012),In Congress 1973 - 1987,"1-100 \r\n of 1,949"
1,"Representative Abercrombie, Neil",neil-abercrombie/A000014,Representative Neil Abercrombie,(1938 - ),"In Congress 1985 - 1987, 1991 - 2011","1-100 \r\n of 4,472"
2,"Senator Abourezk, James",james-abourezk/A000017,Senator James Abourezk,(1931 - ),In Congress 1971 - 1979,1-100 \r\n of 875
3,"Representative Abraham, Ralph Lee",ralph-abraham/A000374,Representative Ralph Lee Abraham,(1954 - ),In Congress 2015 - Present | \n\n $(do...,1-100 \r\n of 637
4,"Senator Abraham, Spencer",spencer-abraham/A000355,Senator Spencer Abraham,(1952 - ),In Congress 1995 - 2001,"1-100 \r\n of 1,227"
5,"Representative Abzug, Bella S.",bella-abzug/A000018,Representative Bella S. Abzug,(1920 - 1998),In Congress 1971 - 1977,"1-100 \r\n of 1,437"
6,"Representative Acevedo-Vila, Anibal",anibal-acevedo-vila/A000359,Representative Anibal Acevedo-Vila,(1962 - ),In Congress 2001 - 2005,1-100 \r\n of 503
7,"Representative Ackerman, Gary L.",gary-ackerman/A000022,Representative Gary L. Ackerman,(1942 - ),In Congress 1983 - 2013,"1-100 \r\n of 7,315"
8,"Representative Adams, Alma S.",alma-adams/A000370,Representative Alma S. Adams,(1946 - ),In Congress 2014 - Present | \n\n $(do...,1-100 \r\n of 809
9,"Senator Adams, Brock",brockman-adams/A000031,Senator Brock Adams,(1927 - 2004),"In Congress 1965 - 1979, 1987 - 1993","1-100 \r\n of 1,716"


In [75]:
# Same index-based combination using .join; rsuffix tags the scraped copy of the
# duplicate column, so it shows up as name_scraped while the original stays `name`.
# Again only displayed, not saved to a variable.
df.join(scraped_df, rsuffix='_scraped')

Unnamed: 0,name,slug,name_scraped,birthdate,in_congress,bill_count
0,"Senator Abdnor, James",james-abdnor/A000009,Senator James Abdnor,(1923 - 2012),In Congress 1973 - 1987,"1-100 \r\n of 1,949"
1,"Representative Abercrombie, Neil",neil-abercrombie/A000014,Representative Neil Abercrombie,(1938 - ),"In Congress 1985 - 1987, 1991 - 2011","1-100 \r\n of 4,472"
2,"Senator Abourezk, James",james-abourezk/A000017,Senator James Abourezk,(1931 - ),In Congress 1971 - 1979,1-100 \r\n of 875
3,"Representative Abraham, Ralph Lee",ralph-abraham/A000374,Representative Ralph Lee Abraham,(1954 - ),In Congress 2015 - Present | \n\n $(do...,1-100 \r\n of 637
4,"Senator Abraham, Spencer",spencer-abraham/A000355,Senator Spencer Abraham,(1952 - ),In Congress 1995 - 2001,"1-100 \r\n of 1,227"
5,"Representative Abzug, Bella S.",bella-abzug/A000018,Representative Bella S. Abzug,(1920 - 1998),In Congress 1971 - 1977,"1-100 \r\n of 1,437"
6,"Representative Acevedo-Vila, Anibal",anibal-acevedo-vila/A000359,Representative Anibal Acevedo-Vila,(1962 - ),In Congress 2001 - 2005,1-100 \r\n of 503
7,"Representative Ackerman, Gary L.",gary-ackerman/A000022,Representative Gary L. Ackerman,(1942 - ),In Congress 1983 - 2013,"1-100 \r\n of 7,315"
8,"Representative Adams, Alma S.",alma-adams/A000370,Representative Alma S. Adams,(1946 - ),In Congress 2014 - Present | \n\n $(do...,1-100 \r\n of 809
9,"Senator Adams, Brock",brockman-adams/A000031,Senator Brock Adams,(1927 - 2004),"In Congress 1965 - 1979, 1987 - 1993","1-100 \r\n of 1,716"
