In [1]:
import requests # for making http (web) requests
import pandas as pd # for working with tabular (spreadsheet) data
import csv # also for working with tabular data, in csv format

# saving the search results from congress.gov 
bills = pd.read_csv('in/117_house_gender.csv')

# creating a "dataframe" in pandas (a python library for working
# with spreadsheet like data
df = pd.DataFrame(bills)

In [2]:
df = df.drop_duplicates(subset=['Legislation Number'], keep='first')

In [3]:
df

Unnamed: 0,Legislation Number,URL,Congress,Bill Text Type,Bill Text Type Code,Latest Title,HTML Url,PDF Url
0,H.R. 1927,https://www.congress.gov/bill/117th-congress/h...,117,Introduced in House,IH,End Taxpayer Funding of Gender Experimentation...,https://www.congress.gov/117/bills/hr1927/gene...,https://www.congress.gov/117/bills/hr1927/BILL...
1,H.R. 8731,https://www.congress.gov/bill/117th-congress/h...,117,Introduced in House,IH,Protect Children’s Innocence Act,https://www.congress.gov/117/bills/hr8731/gene...,https://www.congress.gov/117/bills/hr8731/BILL...
2,H.R. 5,https://www.congress.gov/bill/117th-congress/h...,117,Engrossed in House,EH,Equality Act,https://www.congress.gov/117/bills/hr5/generat...,https://www.congress.gov/117/bills/hr5/BILLS-1...
5,H.R. 7684,https://www.congress.gov/bill/117th-congress/h...,117,Introduced in House,IH,No Tax Breaks for Radical Corporate Activism Act,https://www.congress.gov/117/bills/hr7684/gene...,https://www.congress.gov/117/bills/hr7684/BILL...
6,H.Res. 1475,https://www.congress.gov/bill/117th-congress/h...,117,Reported in House,RH,Of inquiry requesting the President and direct...,https://www.congress.gov/117/bills/hres1475/ge...,https://www.congress.gov/117/bills/hres1475/BI...
...,...,...,...,...,...,...,...,...
839,H.R. 4037,https://www.congress.gov/bill/117th-congress/h...,117,Introduced in House,IH,Trade Preferences and American Manufacturing C...,https://www.congress.gov/117/bills/hr4037/gene...,https://www.congress.gov/117/bills/hr4037/BILL...
841,H.R. 8588,https://www.congress.gov/bill/117th-congress/h...,117,Introduced in House,IH,Fair Care Act of 2022,https://www.congress.gov/117/bills/hr8588/gene...,https://www.congress.gov/117/bills/hr8588/BILL...
843,H.R. 4346,https://www.congress.gov/bill/117th-congress/h...,117,Public Law,PL,Chips and Science Act,https://www.congress.gov/117/plaws/publ167/gen...,https://www.congress.gov/117/plaws/publ167/PLA...
846,H.R. 1512,https://www.congress.gov/bill/117th-congress/h...,117,Introduced in House,IH,CLEAN Future Act,https://www.congress.gov/117/bills/hr1512/gene...,https://www.congress.gov/117/bills/hr1512/BILL...


In [4]:
## go through each row in numbers column of our spreadsheet
## extract the number and put into a separate list
numbers = []
for row in df['Legislation Number']:
    splitted = row.split(' ')
    for item in splitted:
        if item.isnumeric():
            numbers.append(item)

In [5]:
numbers[:20]

['1927',
 '8731',
 '5',
 '7684',
 '1475',
 '8170',
 '2695',
 '9497',
 '4176',
 '5744',
 '86',
 '1209',
 '1926',
 '29',
 '1440',
 '4097',
 '4286',
 '3930',
 '196',
 '3794']

In [6]:
len(numbers)

688

In [9]:
# function that contains a loop to insert bill numbers
# into the URL, then to grab the content and add to a new list
def scrape_bill_text(numbers):
    bills_text = []
    for item in numbers:
        # make sure to put the right congress session
        url = (f'https://www.congress.gov/117/bills/hr{item}/BILLS-117hr{item}ih.htm')
        # url = (f'https://www.congress.gov/117/bills/s{item}/BILLS-117s{item}is.htm')
        response = requests.get(url)
        content = response.content
        bills_text.append(content)
    return bills_text

In [10]:
# if HTML tags and data persists in the results, run this function instead to scrape
# the bill text. It uses BS4 to get just the text. 

# from bs4 import BeautifulSoup

# def scrape_just_text(numbers):
#     bills_text = []
#     for item in numbers:
#         page = requests.get(f'https://www.congress.gov/118/bills/hr{item}/BILLS-118hr{item}ih.htm')
#         source = page.content
#         soup = BeautifulSoup(source, 'lxml')
#         stuff = soup.get_text(strip=True)
#         bills_text.append(stuff)
#     return bills_text

In [11]:
# call the relevant function, passing the list of numbers as parameter
# the first time, just scrape 10 to see what the results look like

# full_text = scrape_just_text(numbers)
full_text = scrape_bill_text(numbers[:10])

In [14]:
full_text[0]

b"<html><body><pre>\n[Congressional Bills 117th Congress]\n[From the U.S. Government Publishing Office]\n[H.R. 1927 Introduced in House (IH)]\n\n&lt;DOC&gt;\n\n\n\n\n\n\n117th CONGRESS\n  1st Session\n                                H. R. 1927\n\nTo prohibit taxpayer-funded gender reassignment medical interventions, \n                        and for other purposes.\n\n\n_______________________________________________________________________\n\n\n                    IN THE HOUSE OF REPRESENTATIVES\n\n                             March 16, 2021\n\n  Mr. LaMalfa (for himself, Mr. Norman, Mr. Allen, Mr. Grothman, Mr. \nDuncan, Mr. Lamborn, Mr. Hice of Georgia, Mrs. Miller of Illinois, Mr. \n Steube, Mr. Kelly of Mississippi, Mr. Jordan, Mr. Banks, Mr. Weber of \n Texas, Mr. Aderholt, Mr. Babin, and Mr. Good of Virginia) introduced \n the following bill; which was referred to the Committee on Energy and \nCommerce, and in addition to the Committees on the Judiciary, and Ways \n and Means, f

In [15]:
# double check we have the desired amount of texts

len(full_text)

10

In [16]:
# Next, turn the results, which is a bytes object, into a string, so that it
# can be manipulated. 

# then save a copy of the results. 

strings = []
for item in full_text:
    i = str(item)
    strings.append(i)

with open('in/bill_text.txt', 'w') as f:
    for item in full_text:
        f.write(str(item))

In [17]:
# The next step will be to clean the text of the html characters and other unwanted characters 
# and whitespace. 

# First, I load it back up. Then, I run a function to take out all of the = characters that 
# we don't want like `<html>`, `\n`, `__`, as well as any whitespaces. Finally, I save to a 
# new text file, called `clean`.

# loading up the texts that we just saved
load = open('in/bill_text.txt')
data = load.read()
load.close()

# remove all the characters in the "take out" list by writing a
# loop that replaces those characters with an empty character, ''
def clean_up(text):
    take_out = ['\n', '/n', '\\n', '_', '[', ']', '<html><body><pre>', '<html><body><pre>', '  ']
    for item in take_out:
        if item in text:
            text = text.replace(item, '')
    return text


cleaned = clean_up(data)

In [18]:
# check a slice of the text to see the results of cleaning

cleaned[1000:3000]

"cted by the Senate and House of Representatives of the United States of America in Congress assembled,SECTION 1. SHORT TITLE; TABLE OF CONTENTS.(a) Short Title.--This Act may be cited as the ``End Taxpayer Funding of Gender Experimentation Act of 2021''.(b) Table of Contents.--The table of contents of this Act is as follows:Sec. 1. Short title; table of contents. TITLE I--PROHIBITING FEDERALLY FUNDED GENDER REASSIGNMENT MEDICALINTERVENTIONSSec. 101. Prohibiting taxpayer-funded gender reassignment medical interventions.Sec. 102. Amendment to table of chapters.TITLE II--APPLICATION UNDER THE AFFORDABLE CARE ACTSec. 201. Clarifying application of prohibition to premium credits and cost-sharing reductions under ACA. TITLE I--PROHIBITING FEDERALLY FUNDED GENDER REASSIGNMENT MEDICALINTERVENTIONSSEC. 101. PROHIBITING TAXPAYER-FUNDED GENDER REASSIGNMENT MEDICAL INTERVENTIONS.Title 1, United States Code, is amended by adding at the end the following new chapter: ``CHAPTER 4--PROHIBITING TAXPAY

In [19]:
# save plain to a separate text file
# MAKE SURE TO CHANGE THE NAME OF THE FILE!

with open('test/gathered.txt', 'w') as f:
    f.write(cleaned)