Notes from upgrading to create_bill_imp2

No longer unzipping zip files to disk and processing

Doing it all in memory

In [1]:
import io, os, shutil
import zipfile
from bs4 import BeautifulSoup
import lxml
import pandas as pd
from datetime import date, datetime, timedelta

In [49]:
z_name = '/Users/carl/python/bradleywoods/bills/billzips-2018-05-21/BILLSTATUS-115-2018-05-21-s.zip'

In [50]:
z = zipfile.ZipFile(z_name)

In [51]:
contents = z.namelist()
contents[:10]

['BILLSTATUS-115s105.xml',
 'BILLSTATUS-115s102.xml',
 'BILLSTATUS-115s97.xml',
 'BILLSTATUS-115s96.xml',
 'BILLSTATUS-115s100.xml',
 'BILLSTATUS-115s145.xml',
 'BILLSTATUS-115s106.xml',
 'BILLSTATUS-115s98.xml',
 'BILLSTATUS-115s99.xml',
 'BILLSTATUS-115s142.xml']

In [56]:
test = z.read(contents[0])
soup = BeautifulSoup(page, 'xml')
billid = '-'.join([soup.congress.text, soup.billType.text, soup.billNumber.text])
billid

'115-S-2825'

In [52]:
df_bills = pd.DataFrame()
# takes about 3 min to run for 3000 entries
records = []  # list to collect dicts of records
for filename in contents:
    page = z.read(filename).decode("utf-8")
    soup = BeautifulSoup(page, 'xml')

    # create billid
    billid = '-'.join([soup.congress.text, soup.billType.text, soup.billNumber.text])

    #find title
    title = soup.bill.title.text

    # find introduced date
    intro = soup.introducedDate.text

    # find sponsor
    try:
        sponsor = soup.sponsors.item.fullName.text
    except:
        sponsor = "None found"

    # find last action date
    try:
        action_date = soup.latestAction.actionDate.text

    except:
        action_date = soup.introducedDate.text

    # find last action
    try:
        action = soup.latestAction.find('text').text
    except:
        action = "None found"

    # find policyArea if one exists to create policy
    try:
        policy = soup.policyArea.contents[1].text
    except:
        policy = 'No listed policy'

    # create list of legislative subjects if they exist
    try:
        ls = soup.legislativeSubjects.find_all('item')
        subj_list = []
        subjects = 'No subjects defined'
        for entry in ls:
            entry = str(entry)  # need to convert from bs4 object to string
            clean = entry.split('<name>')[1].split('</name>')[0]
            subj_list.append(clean)
            subjects = '; '.join(subj_list)  # convert from list to semi-colon separated string
    except:
        subjects = 'No subjects defined'

    # create the record dict
    items = {'billid':billid, 'title':title, 'introDate':intro, 'sponsor':sponsor,
             'lastActionDate':action_date, 'lastAction':action, 'policy':policy, 'subjects': subjects}
    # append the record to the list
    records.append(items)

df_bills = df_bills.append(records, ignore_index = True)

In [53]:
len(df_bills)

2876

In [54]:
df_bills.sample(20)

Unnamed: 0,billid,introDate,lastAction,lastActionDate,policy,sponsor,subjects,title
2697,115-S-2722,2018-04-19,Read twice and referred to the Committee on He...,2018-04-19,Environmental Protection,"Sen. Harris, Kamala D. [D-CA]",Administrative law and regulatory procedures; ...,Environmental Justice Right to Know Act of 2018
153,115-S-311,2017-02-06,Read twice and referred to the Committee on He...,2017-02-06,Health,"Sen. Klobuchar, Amy [D-MN]",Aging; Health care coverage and access; Health...,Alzheimer's Caregiver Support Act
922,115-S-1497,2017-06-29,Read twice and referred to the Committee on En...,2017-06-29,Government Operations and Politics,"Sen. Daines, Steve [R-MT]","Child health; Government buildings, facilities...",Fairness For Breastfeeding Mothers Act of 2017
2009,115-S-1595,2017-07-20,Amendment SA 1110 agreed to in Senate by Unani...,2017-10-05,International Affairs,"Sen. Rubio, Marco [R-FL]","Bank accounts, deposits, capital; Congressiona...",Hizballah International Financing Prevention A...
2730,115-S-2746,2018-04-25,Read twice and referred to the Committee on Fi...,2018-04-25,Labor and Employment,"Sen. Booker, Cory A. [D-NJ]",No subjects defined,Federal Jobs Guarantee Development Act of 2018
2687,115-S-2696,2018-04-18,"Committee on Health, Education, Labor, and Pen...",2018-04-24,Social Welfare,"Sen. Casey, Robert P., Jr. [D-PA]",No subjects defined,Supporting Infant Plans of Safe Care Implement...
2152,115-S-2192,2017-12-05,Read the second time. Placed on Senate Legisla...,2018-02-28,Immigration,"Sen. Grassley, Chuck [R-IA]",Administrative law and regulatory procedures; ...,RAISE Act
731,115-S-825,2017-04-04,"Placed on the Union Calendar, Calendar No. 472.",2018-04-05,Public Lands and Natural Resources,"Sen. Murkowski, Lisa [R-AK]",Alaska; Alaska Natives and Hawaiians; Health f...,Southeast Alaska Regional Health Consortium La...
1333,115-S-2134,2017-11-15,Read twice and referred to the Committee on Ve...,2017-11-15,Armed Forces and National Security,"Sen. Baldwin, Tammy [D-WI]",Congressional oversight; Drug trafficking and ...,Andrew White Veterans Community Care Opioid Sa...
94,115-S-242,2017-01-30,Read twice and referred to the Committee on Ve...,2017-01-30,Armed Forces and National Security,"Sen. Cassidy, Bill [R-LA]",Administrative remedies; Congressional officer...,WINGMAN Act


In [79]:
out_dir = '/Users/carl/python/bradleywoods/bills/test/billzips-2018-05-21'
# zipdir = out_dir + '/billzips-2018-05-21'

In [80]:
os.makedirs(out_dir)

In [75]:
len(out_dir)

62

In [None]:
os.