In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the first page of the member listing. The timeout stops a stalled
# connection from hanging the notebook, and raise_for_status() surfaces an
# HTTP error immediately instead of parsing an error page as if it were data.
response = requests.get('https://www.congress.gov/members', timeout=30)
response.raise_for_status()
# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can vary between machines.
doc = BeautifulSoup(response.text, 'html.parser')

In [3]:
# Each element with class='expanded' is treated as one person's chunk of the
# page; the first link inside it supplies both the displayed name and the URL.
members = doc.find_all(class_='expanded')
links = [member.find('a') for member in members]

# One dictionary per Congressperson: their name and their profile URL
rows = [{'name': link.text, 'url': link['href']} for link in links]

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,name,url
0,"Senator Abdnor, James",https://www.congress.gov/member/james-abdnor/A...
1,"Representative Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...
2,"Senator Abourezk, James",https://www.congress.gov/member/james-abourezk...
3,"Representative Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...
4,"Senator Abraham, Spencer",https://www.congress.gov/member/spencer-abraha...


In [17]:
# Sanity check: (row count, column count) of the single-page scrape
df.shape

(100, 2)

# Let's find out how to make the URLs for each page

In [26]:
# Target URL shape: https://www.congress.gov/members?pageSize=250&page=10
# Start by just generating the page numbers 1 through 10.
page_numbers = range(1, 11)
for page_num in page_numbers:
    print(page_num)

1
2
3
4
5
6
7
8
9
10


In [28]:
# Only the `page` query parameter changes between listing pages, so splice
# each page number into the same address.
# Target URL shape: https://www.congress.gov/members?pageSize=250&page=10
# (Without an f-string this would be:
#  "https://www.congress.gov/members?pageSize=250&page=" + str(page_num))
page_numbers = range(1, 11)
for page_num in page_numbers:
    url = f"https://www.congress.gov/members?pageSize=250&page={page_num}"
    print(url)

https://www.congress.gov/members?pageSize=250&page=1
https://www.congress.gov/members?pageSize=250&page=2
https://www.congress.gov/members?pageSize=250&page=3
https://www.congress.gov/members?pageSize=250&page=4
https://www.congress.gov/members?pageSize=250&page=5
https://www.congress.gov/members?pageSize=250&page=6
https://www.congress.gov/members?pageSize=250&page=7
https://www.congress.gov/members?pageSize=250&page=8
https://www.congress.gov/members?pageSize=250&page=9
https://www.congress.gov/members?pageSize=250&page=10


# Now that we know how to make the URLs, let's scrape!

In [34]:
print("Clearing out our dataset")
rows = []

# Listing URL pattern: https://www.congress.gov/members?pageSize=250&page=10
for page_num in range(1, 11):
    # You can also do this if you hate f-strings:
    # url = "https://www.congress.gov/members?pageSize=250&page=" + str(page_num)
    url = f"https://www.congress.gov/members?pageSize=250&page={page_num}"
    print("Now scraping", url)

    # Download the appropriate page. The timeout keeps a stalled connection
    # from hanging the notebook, and raise_for_status() stops the scrape on an
    # HTTP error instead of quietly parsing an error page into zero rows.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    doc = BeautifulSoup(response.text, 'html.parser')

    # Select all of our chunks of person -
    # each class='expanded' should have all of the data about that person in it
    members = doc.find_all(class_='expanded')
    for member in members:
        # Look the link up once; it carries both the name and the URL.
        link = member.find('a')
        # An 'expanded' element without a usable link is not a member entry;
        # skip it rather than crash partway through the scrape.
        if link is None or not link.get('href'):
            continue
        rows.append({'name': link.text, 'url': link['href']})

print("Building our dataframe")
# Fix the columns up front so the frame still has 'name' and 'url'
# even if nothing was scraped.
df = pd.DataFrame(rows, columns=['name', 'url'])
df.head()

Clearing out our dataset
Now scraping https://www.congress.gov/members?pageSize=250&page=1
Now scraping https://www.congress.gov/members?pageSize=250&page=2
Now scraping https://www.congress.gov/members?pageSize=250&page=3
Now scraping https://www.congress.gov/members?pageSize=250&page=4
Now scraping https://www.congress.gov/members?pageSize=250&page=5
Now scraping https://www.congress.gov/members?pageSize=250&page=6
Now scraping https://www.congress.gov/members?pageSize=250&page=7
Now scraping https://www.congress.gov/members?pageSize=250&page=8
Now scraping https://www.congress.gov/members?pageSize=250&page=9
Now scraping https://www.congress.gov/members?pageSize=250&page=10
Building our dataframe


Unnamed: 0,name,url
0,"Senator Abdnor, James",https://www.congress.gov/member/james-abdnor/A...
1,"Representative Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...
2,"Senator Abourezk, James",https://www.congress.gov/member/james-abourezk...
3,"Representative Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...
4,"Senator Abraham, Spencer",https://www.congress.gov/member/spencer-abraha...


In [35]:
# Preview the first rows of the dataframe built from all the scraped pages
df.head()

Unnamed: 0,name,url
0,"Senator Abdnor, James",https://www.congress.gov/member/james-abdnor/A...
1,"Representative Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...
2,"Senator Abourezk, James",https://www.congress.gov/member/james-abourezk...
3,"Representative Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...
4,"Senator Abraham, Spencer",https://www.congress.gov/member/spencer-abraha...


In [36]:
# (row count, column count) of the combined dataset from every page
df.shape

(2348, 2)

# Editing for fun

In [46]:
# Drop the common site prefix so only the member-specific part of each URL is
# left (e.g. "james-abdnor/A000009").
# regex=False makes this a literal substring replacement: the prefix contains
# dots, which would act as wildcards if the pattern were treated as a regex,
# and the default for `regex` has changed between pandas versions.
df['slug'] = df['url'].str.replace("https://www.congress.gov/member/", "", regex=False)
df.head()

Unnamed: 0,name,url,slug
0,"Senator Abdnor, James",https://www.congress.gov/member/james-abdnor/A...,james-abdnor/A000009
1,"Representative Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,neil-abercrombie/A000014
2,"Senator Abourezk, James",https://www.congress.gov/member/james-abourezk...,james-abourezk/A000017
3,"Representative Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...,ralph-abraham/A000374
4,"Senator Abraham, Spencer",https://www.congress.gov/member/spencer-abraha...,spencer-abraham/A000355


In [49]:
# Save the scraped table to disk; index=False leaves the row index out of the file
df.to_csv("congress.csv", index=False)

In [37]:
# A deliberately messy string (blank lines at both ends, uneven spacing inside)
# to experiment with whitespace clean-up
summary = "     \n\n\n Hello .    my name is SOma \n       how are you   \n\n    \n"
summary

'     \n\n\n Hello .    my name is SOma \n       how are you   \n\n    \n'

In [42]:
# .strip() only trims whitespace at the two ends of the string; the runs of
# spaces and the newline in the middle are left as they are
print(summary.strip())

Hello .    my name is SOma 
       how are you


In [43]:
# import re
# re.sub(r'\s+', ' ', summary).strip()