In [1]:
import requests
import json
from pathlib import Path
from bs4 import BeautifulSoup
from unicodedata import normalize
from urllib.parse import urlparse

## Bill title and subtitle
- Co-Sponsorship Memo (done)
- Prime Sponsor (done)
- Co-Sponsors (done)
- Bill Status & History
- Printers Numbers
- Amendments
- Votes
- Committee Meetings
- Statute References



In [2]:
url = urlparse('https://www.palegis.us/legislation/bills/2021/hb1486')
f'{url.scheme}://{url.netloc}{url.path}'

'https://www.palegis.us/legislation/bills/2021/hb1486'

In [3]:
html = requests.get(f'{url.scheme}://{url.netloc}{url.path}')
print(len(html.text))
#print(html.text)
page = open('test_page.html', 'w')
page.write(html.text)
page.close()

soup = BeautifulSoup(html.text, 'lxml')

164223


In [4]:
main = soup.find(id="bodyContainer")

In [5]:
# Find the bill number
main.find('div', class_="h1 header-title").contents[0].strip()

'House Bill 1486'

In [6]:
# Find the bill session
main.find('div', class_="h5 header-pretitle").contents[0]

'2021-2022 Regular Session'

In [7]:
# Find the bill description
main.find('div', class_='d-print-none mt-5').find_all('div', class_='col-md-9')[0].div.contents[0]

'An Act amending Titles 74 (Transportation) and 75 (Vehicles) of the Pennsylvania Consolidated Statutes, in turnpike, further providing for definitions, for electronic toll collection and for annual hearing; in registration of vehicles, providing for contributions for the prevention of child abuse and neglect, further providing for display of registration plate, providing for pollinator conservation registration plate, for Afghanistan and Iraq veterans plate and for Blue Star Family plate and further providing for suspension of registration upon unpaid tolls; in fees, further providing for payments to special funds and establishing the Pollinator Habitat Program Fund; and, in powers of department and local authorities, further providing for provisions relating to fare evasion.'

In [8]:
# Find the bill memo page URL (title, date, URL); NOTE: may not exist
memo_url = main.find('div', class_='d-print-none mt-5').find_all('div', class_='col-md-9')[0].a['href']
#print(f'{memo_title}: {memo_url}')

In [9]:
# Load memo page
memo_page = requests.get(f'{url.scheme}://{url.netloc}{memo_url}')
memo = BeautifulSoup(memo_page.text, 'lxml')

In [10]:
# Load memo title
memo.find('div', class_="h1 header-title").text.strip()

'Blue Star Family License Plate'

In [11]:
# Load memo date
memo.find('div', class_="h5 header-pretitle").text.strip()

'May 6, 2021 02:01 PM to All House Members'

In [12]:
# Load memo text
memo.find_all(class_="mt-3")[2].text

'Military families give their lives for our country and this Commonwealth.\n\nWhen they join the military community, some families choose to display a Blue Stars service banner of flag in a window of their home to signify a loved one is an active duty service member. This special flag, displayed only by military families, is where the phrase “Blue Star Family” comes from and refers to the immediate family members of an active duty service member.\n\nIt is my intention to introduce legislation to honor those who actively serve their country and their families by making the Blue Star Family plate available to families of active duty service members. The plate will display a Blue Star which will signify a family loved one is an active duty service member. Applicants must provide proper documentation and a fee of $20 to the Department of Transportation (PennDOT).\n\nPlease join me in honoring these individuals for their service and sacrifices by co-sponsoring this important legislation.'

In [13]:
# Get bill prime sponsor
# main.find_all('div', class_="h3 mt-3") # This gets all key sections!
prime_sponsor = main.find_all('div', class_="h3 mt-3")[1].find_next('div')

In [14]:
# Get bill prime sponsor name
prime_sponsor.a.text

"Representative Timothy J. O'Neal"

In [15]:
# Get bill prime sponsor URL
prime_sponsor_url = urlparse(f'{url.scheme}://{url.netloc}{prime_sponsor.a['href']}')
prime_sponsor_url

ParseResult(scheme='https', netloc='www.palegis.us', path='/house/members/bio/1797/representative-timothy-j-o', params='', query='', fragment='')

In [16]:
# Get bill prime sponsor ID
prime_sponsor_url.path.split('/')[4]

'1797'

In [17]:
# Get bill co-sponsors
co_sponsors = main.find_all('div', class_="col-6 col-lg-3 col-md-4 col-sm-6 mb-5")
for member in co_sponsors:
    member_data = {}
    member_data['url'] = member.a['href']
    member_data['chamber'] = member.a['href'].split('/')[1]
    member_data['id'] = member.a['href'].split('/')[4]
    member_data['name'] = member.a.text
    member_data['photo_url'] = member.img['src']
    print(member_data)

{'url': '/house/members/bio/1750/rep-francis-ryan', 'chamber': 'house', 'id': '1750', 'name': 'Rep. Francis Ryan', 'photo_url': 'https://www.palegis.us/resources/images/members/300/1750.jpg?20240411'}
{'url': '/house/members/bio/41/rep-tim-hennessey', 'chamber': 'house', 'id': '41', 'name': 'Rep. Tim Hennessey', 'photo_url': 'https://www.palegis.us/resources/images/members/300/41.jpg?20240411'}
{'url': '/house/members/bio/97/rep-tina-pickett', 'chamber': 'house', 'id': '97', 'name': 'Rep. Tina Pickett', 'photo_url': 'https://www.palegis.us/resources/images/members/300/97.jpg?20240411'}
{'url': '/house/members/bio/1022/rep-rob-kauffman', 'chamber': 'house', 'id': '1022', 'name': 'Rep. Rob Kauffman', 'photo_url': 'https://www.palegis.us/resources/images/members/300/1022.jpg?20240411'}
{'url': '/house/members/bio/1846/rep-f-todd-polinchock', 'chamber': 'house', 'id': '1846', 'name': 'Rep. F. Todd Polinchock', 'photo_url': 'https://www.palegis.us/resources/images/members/300/1846.jpg?20240

In [18]:
# Get bill additional co-sponsors
add_co_sponsors = main.find_all('div', class_="col-6 col-md-3 col-sm-6 mb-5")
for member in add_co_sponsors:
    member_data = {}
    member_data['url'] = member.a['href']
    member_data['chamber'] = member.a['href'].split('/')[1]
    member_data['id'] = member.a['href'].split('/')[4]
    member_data['name'] = member.a.text
    member_data['photo_url'] = member.img['src']
    print(member_data)

{'url': '/house/members/bio/1843/rep-andrew-lewis', 'chamber': 'house', 'id': '1843', 'name': 'Rep. Andrew Lewis', 'photo_url': 'https://www.palegis.us/resources/images/members/300/1843.jpg?20240411'}
{'url': '/house/members/bio/1859/rep-wendi-thomas', 'chamber': 'house', 'id': '1859', 'name': 'Rep. Wendi Thomas', 'photo_url': 'https://www.palegis.us/resources/images/members/300/1859.jpg?20240411'}
{'url': '/house/members/bio/136/rep-robert-freeman', 'chamber': 'house', 'id': '136', 'name': 'Rep. Robert Freeman', 'photo_url': 'https://www.palegis.us/resources/images/members/300/136.jpg?20240411'}
{'url': '/house/members/bio/90/rep-chris-sainato', 'chamber': 'house', 'id': '90', 'name': 'Rep. Chris Sainato', 'photo_url': 'https://www.palegis.us/resources/images/members/300/90.jpg?20240411'}
{'url': '/house/members/bio/1707/rep-craig-staats', 'chamber': 'house', 'id': '1707', 'name': 'Rep. Craig Staats', 'photo_url': 'https://www.palegis.us/resources/images/members/300/1707.jpg?20240411'

In [19]:
# Get bill history; NOTE: Unicode decoding needed, as well as URL prefix for committee
# referrals and amendments
# TODO: Rearrange order and convert date string to object, extracted, 

actions = main.find('table', class_='table table-striped w-100 w-md-75 w-lg-50 ms-3').find_all('tr')
bill_history = []
for action in actions:
    items = action.find_all('td')
    revision = items[0].contents
    #if revision:
    #    link_url = revision.find('a')['href']
    
    description = items[1].contents
    #print(f'{revision}: {description}')
    # bill_history.append(action)
    
    #if event[0]:
    #    event[0] = f'{url.scheme}://{url.netloc}/legislation/bills/text/PDF/2021/0/HB1486/PN1602'
    #bill_history.append([url, event])
actions[0]

<tr>
<td><a aria-label="2021-2022 Regular Session HB 1486 PN 1602 Bill Text (PDF)" href="/legislation/bills/text/PDF/2021/0/HB1486/PN1602" target="_blank">1602</a></td>
<td style="text-align:left">Referred to <a class="committee" href="/house/committees/38/transportation">Transportation</a>,  May 25, 2021</td>
</tr>

In [60]:
for action in actions:
    items = action.find_all('td')
    
    if len(items[1]) == 3:
        action_type = items[1].contents[0]
        committee_id = items[1].a['href'].split('/')[3]
        committee_url = items[1].a['href']
        committee_name = items[1].contents[1].text
    else:
        action_type = items[1].contents[0]
        committee_id = ''
        committee_url = ''
        committee_name = ''
        
    if items[0].contents:
        pn_url = f'{url.scheme}://{url.netloc}{items[0].a['href']}'
        pn_id = items[0].a.text
        pn_description = items[0].a['aria-label']
        print(f'* {pn_id}, {pn_url}, {pn_description}, {action_type}, {committee_id}, {committee_url}, {committee_name}')
    else:
        print(f'* {action_type}, {committee_id}, {committee_url}, {committee_name}')

* 1602, https://www.palegis.us/legislation/bills/text/PDF/2021/0/HB1486/PN1602, 2021-2022 Regular Session HB 1486 PN 1602 Bill Text (PDF), Referred to , 38, /house/committees/38/transportation, Transportation
* 1713, https://www.palegis.us/legislation/bills/text/PDF/2021/0/HB1486/PN1713, 2021-2022 Regular Session HB 1486 PN 1713 Bill Text (PDF), Reported as amended, , , , 
* First consideration, June 8, 2021 , , , 
* Re-committed to , 44, /house/committees/44/rules, Rules
* Re-reported as committed, , , , 
* Laid on the table, Sept. 15, 2021 , , , 
* Removed from table, Nov. 10, 2021, , , 
* Laid on the table, Nov. 10, 2021 , , , 
* Removed from table, April 27, 2022, , , 
* 3130, https://www.palegis.us/legislation/bills/text/PDF/2021/0/HB1486/PN3130, 2021-2022 Regular Session HB 1486 PN 3130 Bill Text (PDF), Second consideration, with amendments, , amendment-list?billnum=1486&sessind=0&searchby=amendment&amendingbody=h&billtype=b&billbody=h&billpn=1713&sessyr=2021, /legislation/amendm

In [21]:
# Get bill committee(s)



In [22]:
# Get bill status



In [23]:
# Get bill version



In [24]:
# Get bill text



In [25]:
# Get bill files

