In [None]:
import csv
import posixpath
import re
from urllib.parse import urlsplit

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm

## Query the API for bills with _election_ or _vote_ in the title

In [None]:
# API key obtained via https://legiscan.com/user/register under My Account.
# The key is read from a local text file so it is never hardcoded in the notebook.
local_file = 'legiscan_auth.txt'
with open(local_file) as txtfile:
    # strip() with no argument removes every kind of surrounding whitespace
    # (spaces, tabs, '\r\n' line endings), any of which would end up inside
    # the request URL if left in place
    my_key = txtfile.read().strip()
if not my_key:
    raise ValueError("No API key found in " + local_file)
# Base URL for every API call; operations are appended as "&op=..."
host = 'https://api.legiscan.com/?key=' + my_key

In [None]:
# Describe the search request: the API operation plus its query string.
# "year" accepts 1=all, 2=current, 3=recent, 4=prior, >1900=exact [Default: 2]
method = "search"
# URL-encoded search clauses, combined with OR below
query_clauses = ["type%3AB+AND+(election)", "(vote)"]
params = "state=ALL&year=3&query=" + "+OR+".join(query_clauses)

In [None]:
# Request the first page of search results; its "summary" entry reports how
# many pages exist in total.
url = host + "&op=" + method + "&" + params
response = requests.get(url)
print(response)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()
data = response.json().get("searchresult")
if data is None:
    # Without this check a missing key surfaces below as an opaque TypeError
    raise RuntimeError(
        "Response has no 'searchresult' entry: " + response.text[:200])
page_total = data["summary"]["page_total"]
print("Processing page 1 of " + str(page_total))
# Drop the summary; the remaining entries are treated as bill records
data.pop("summary", None)
bills = list(data.values())

In [None]:
# Fetch the remaining result pages (2..page_total) and append their records
# to `bills`.
# NOTE: this cell appends to the existing list, so re-running it without
# first re-running the first-page request adds the same pages again.
if page_total > 1:
    for i in tqdm(range(2, page_total + 1)):
        response = requests.get(url + "&page=" + str(i))
        # Stop on an HTTP error rather than parsing an error page as JSON
        response.raise_for_status()
        data = response.json().get("searchresult")
        if data is None:
            raise RuntimeError(
                "Page " + str(i) + " has no 'searchresult' entry: "
                + response.text[:200])
        data.pop("summary", None)
        # extend() grows the list in place instead of copying every
        # previously collected record on each page
        bills.extend(data.values())

print("Query results added: " + str(len(bills)))

In [None]:
# Build a DataFrame from the collected search results (one row per entry in `bills`)
bill_query = pd.DataFrame(bills)
# Preview the first two rows to check the columns that came back
bill_query.head(2)

## Filter out irrelevant results

In [None]:
# Keep only results whose relevance score is 80 or higher. The copy() makes
# top_bills an independent frame, so adding columns to it later does not
# touch bill_query.
is_relevant = bill_query['relevance'] >= 80
top_bills = bill_query[is_relevant].copy()

In [None]:
# examine vocabulary for bill titles
titles = top_bills['title']

# Count word occurrences across the titles, ignoring English stop words.
# min_df=50 drops terms that occur in fewer than 50 titles, which keeps the
# vocabulary short enough to review by hand.
vectorizer = CountVectorizer(stop_words='english', min_df=50)
# dtm is the document-term matrix produced while fitting the vocabulary
dtm = vectorizer.fit_transform(titles)

In [None]:
# Vocabulary learned by the vectorizer, as a plain list of Python strings.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on versions that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

In [None]:
# Write the vocabulary to a one-column CSV (one term per row) for manual review.
# newline='' lets the csv module control line endings itself; without it every
# row is followed by a blank line on Windows.
with open('bill_title_vocab.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Each row must be a sequence, so wrap every term in a one-element list
    writer.writerows([term] for term in vocab)

In [None]:
# Load the manually reviewed copy of bill_title_vocab.csv and keep only the
# terms flagged with keep == 1.
term_selection = pd.read_csv('bill_title_vocab_selection.csv')
keep_mask = term_selection['keep'] == 1
terms = list(term_selection[keep_mask]['term'])
terms

In [None]:
def get_keywords(keywords, source):
    """
    Return the keywords that occur in ``source``.

    Matching is a plain substring test against the lower-cased source, so a
    keyword also matches inside longer words. Keywords themselves are compared
    as given (they are not lower-cased).

    Parameters
    ----------
    keywords : iterable of str
        Candidate terms to look for.
    source : str
        Text to search.

    Returns
    -------
    list of str
        The matching keywords, in the order they appear in ``keywords``.
    """
    # Lower-case once up front rather than once per keyword
    lowered = source.lower()
    return [k for k in keywords if k in lowered]

def count_keywords(keywords, source):
    """
    Count how many of the keywords occur in ``source``.

    Matching is a plain substring test against the lower-cased source; each
    entry of ``keywords`` counts at most once no matter how often it appears
    in the text.

    Parameters
    ----------
    keywords : iterable of str
        Candidate terms to look for.
    source : str
        Text to search.

    Returns
    -------
    int
        Number of entries of ``keywords`` found in ``source``.
    """
    # Lower-case once up front rather than once per keyword, and count
    # matches directly instead of building a list only to measure it
    lowered = source.lower()
    return sum(1 for k in keywords if k in lowered)

In [None]:
# test functions
# Quick manual check of both helpers on a hand-written sentence; what gets
# printed depends on which terms were selected into `terms`.
mystring = 'here is sentence with primary voters'
print(get_keywords(terms, mystring))   # the matching terms
print(count_keywords(terms, mystring))  # how many terms matched

In [None]:
# Record, for every bill, how many of the selected terms occur in its title
top_bills['keywords'] = top_bills['title'].apply(lambda x: count_keywords(terms, x))
# Keep only bills whose title contains at least one selected term
top_bills = top_bills[top_bills['keywords']>0].copy()
# Number of bills remaining after the keyword filter
len(top_bills)

In [None]:
# Save the filtered search results (without the DataFrame index) so the
# detail-fetching step can read them back from disk
top_bills.to_csv('legiscan_query.csv', index=False)

## Query the API for additional details about each bill

In [None]:
def sorter(collection, billNumber):
    '''
    Return the record for ``billNumber`` with the earliest last action date.

    Only the 'bill_number' field is used to select records, so ``collection``
    should not mix unrelated bills that share a number.

    Parameters
    ----------
    collection : iterable of dict
        Bill records; each must provide 'bill_number', 'last_action_date'
        and 'title'.
    billNumber : object
        Value compared with ``==`` against each record's 'bill_number'.

    Returns
    -------
    dict or None
        The matching record with the smallest 'last_action_date' (the first
        such record on ties). None when no record matches, or when the
        selected record's title mentions an appropriation or a budget.
    '''
    candidates = [element for element in collection
                  if element['bill_number'] == billNumber]
    if not candidates:
        # Nothing to choose from; avoids an IndexError on an unknown number
        return None
    # min() returns the first minimal element, matching what a stable sort
    # followed by [0] would select, without sorting the whole list
    oldest = min(candidates, key=lambda x: x['last_action_date'])
    # str() guards against non-string titles (e.g. missing values)
    if re.search("[Aa]ppropriation|[Bb]udget", str(oldest['title'])):
        return None
    return oldest

In [None]:
def get_extension(file_path):
    """
    Determine a document's file type from the extension of its path or URL.

    Only the path component is inspected, so a query string or fragment
    ("...hb12.pdf?session=2021") does not hide the extension, and dots in a
    host name or in a directory name are never mistaken for one.

    Parameters
    ----------
    file_path : str
        File path or URL of the document. Empty or missing values are accepted.

    Returns
    -------
    str
        The extension without its leading dot (case preserved), or "other"
        when there is no extension or it is longer than four characters.
    """
    if not file_path:
        return "other"
    text = str(file_path)
    try:
        path = urlsplit(text).path
    except ValueError:
        # Malformed URL: fall back to treating the whole string as a path
        path = text
    # splitext yields ".pdf" for "hb12.pdf", and "" when the final path
    # segment has no dot; [1:] removes the leading dot
    ext = posixpath.splitext(path)[1][1:]
    if not ext or len(ext) > 4:
        return "other"
    return ext

In [None]:
# Read the filtered search results, keep one record per bill, look up extra
# details for each through the API's getBill operation, and write the
# enriched records to a new CSV.
sourcefile = 'legiscan_query.csv'
fieldnames = ['relevance', 'state', 'bill_number', 'bill_id', 'change_hash', 'url', 'text_url', 'research_url',
              'last_action_date', 'last_action', 'title', 'keywords']
addfields = ['body', 'session_name', 'year_start', 'sponsors', 'votes', 'text_url', 'state_link', 'type', 'format']
destfile = 'legiscan_query_detailed.csv'
# 'text_url' is listed in both fieldnames and addfields; dict.fromkeys drops
# the repeat (keeping first-seen order) so the output does not contain two
# identical columns
destnames = list(dict.fromkeys([*fieldnames, *addfields]))

collec = []
billNum = set()
# Rows grouped by (state, bill_number). The search covers state=ALL, and a
# number such as "HB1" can occur in more than one state, so the number alone
# is not enough to tell bills apart.
bill_groups = {}

# Both files are opened as UTF-8 (the default encoding of DataFrame.to_csv,
# which produced the source file) and with newline='' as the csv module
# requires for correct handling of line endings.
with open(sourcefile, 'r', newline='', encoding='utf-8') as infile, \
        open(destfile, 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.DictReader(infile, fieldnames)
    next(reader, None)  # skip the header row; column names come from `fieldnames`

    writer = csv.DictWriter(outfile, destnames)
    writer.writeheader()

    for row in reader:
        collec.append(row)
        billNum.add(row['bill_number'])
        bill_groups.setdefault((row['state'], row['bill_number']), []).append(row)

    billNum = list(billNum)

    for (state, num), group in tqdm(bill_groups.items()):
        # Pick the record with the earliest last action date within this
        # state's bill; None means the bill is skipped (budget/appropriation)
        billInfo = sorter(group, num)
        if billInfo is None:
            continue
        bill_id = billInfo["bill_id"]
        # Use a dedicated name so the search URL held in `url` (needed by the
        # pagination cell) is not overwritten
        bill_url = host + "&op=getBill&id=" + str(bill_id)
        response = requests.get(bill_url)
        bill = response.json().get("bill")

        if bill is None:
            # No bill payload in the response; skip this bill
            continue

        billInfo["body"] = bill["body"]
        billInfo["session_name"] = bill["session"]["session_name"]
        billInfo["year_start"] = bill["session"]["year_start"]

        # Lists are stored as their string representation so they fit in one CSV cell
        billInfo["sponsors"] = str([p.get("people_id") for p in bill["sponsors"]])
        billInfo["votes"] = str([(v.get("date"), v.get("passed"), v.get("chamber"))
                                 for v in bill["votes"]])

        texts = bill["texts"]
        if texts:
            # Only the last listed text document is recorded. When a bill has
            # no texts, the text columns keep the search-result value
            # ('text_url') or are left empty by the writer.
            last_text = texts[-1]
            billInfo["text_url"] = last_text["url"]
            billInfo["state_link"] = last_text["state_link"]
            billInfo["type"] = last_text["type"]
            billInfo["format"] = get_extension(billInfo["state_link"])

        writer.writerow(billInfo)