In [7]:
# importing libraries
%matplotlib inline
import pandas as pd
import json
import numpy as np
import os

In [38]:
# Initial testing and working around with chamber counts on different congress year sessions
# Documentation on the votes https://github.com/unitedstates/congress/wiki/votes

content_list = []
years = []
yearVotes = {}
congress_no = "data/108/"
congress_votes = congress_no + "votes/"

# Collect the entries of the congress directory, skipping the macOS Finder metadata file.
for content in os.listdir(congress_no):
    if content != '.DS_Store':
        content_list.append(content)

# For each year, count the vote sessions held in the House ("h...") and the Senate ("s...").
for year in os.listdir(congress_votes):
    # Only year directories can be listed; a stray file such as '.DS_Store'
    # would otherwise make the inner os.listdir() call fail.
    if not os.path.isdir(congress_votes + year):
        continue
    years.append(year)
    house_count = 0
    senate_count = 0
    for chamber in os.listdir(congress_votes + year):
        if chamber.startswith("h"):
            house_count += 1
        elif chamber.startswith("s"):
            senate_count += 1
    yearVotes[year] = {'senate_count': senate_count, 'house_count': house_count}

yearVotes

{'2003': {'house_count': 677, 'senate_count': 459},
 '2004': {'house_count': 544, 'senate_count': 216}}

In [44]:
# A look into "Foreign" bills and amendments from the House of Representatives.
# This test run returns an array of vote indices of all Vote Documents from the House
# related to the query param "Foreign".

sample_year= '2003/'
house_path = congress_votes+sample_year+"h"
res_docs = []
wordSearch = "Foreign"
for i in range(1, yearVotes['2003']['house_count']+1):
    # Build the path of the i-th House vote and parse its JSON document.
    path = house_path + str(i) + '/data.json'
    # The context manager closes the handle on every iteration instead of
    # leaving hundreds of files open until garbage collection.
    with open(path, 'r') as f:
        x = json.load(f)
    # Case-sensitive substring match against the vote's "question" text.
    if wordSearch in x["question"]:
        res_docs.append(i)

res_docs

[314, 369, 424, 429, 539]

In [17]:
# Relevant Vote Document parsed
# https://www.govtrack.us/data/congress/108/votes/2003/h424/data.json
# https://www.govtrack.us/congress/bills/108/hres372
class Processor():
    """
        Collects the roll-call votes of one congress whose "question" text contains
        a search word and flattens them into a dictionary of columns (one list
        entry per matching vote), ready to be passed to pandas.DataFrame.

        congress_num -- path prefix of the congress data, ending with a path
                        separator (e.g. "./data/108/"); "votes/" and "bills/"
                        are appended to it as plain strings.
        word_search  -- case-sensitive substring looked for in each vote's "question".
    """
    def __init__(self, congress_num, word_search):
        self.congress_num = congress_num
        self.vote_types = ["Nay", "Not Voting", "Present", "Yea"]
        self.bill_types = ['hconres', 'hjres', 'hres', 'sconres', 'sjres', 'sres', 'hr', 's']
        # One list per output column; every matching vote appends exactly one value to each.
        self.res_dict = {}
        self.res_dict['bill_title'] = []
        self.res_dict['isAmendment'] = []
        self.res_dict['result'] = []
        self.res_dict['date'] = []
        self.res_dict['question'] = []
        self.res_dict['chamber'] = []
        self.res_dict['year'] = []
        self.res_dict['bill_long_text'] = []
        self.res_dict["top_subject"] = []
        self.res_dict['amendment_type_num'] = []
        self.res_dict["bill_sponsor_name"] = []
        self.res_dict["list_of_voters_obj"] = []

        for vote_type in self.vote_types:
            self.res_dict[vote_type + " (Democrats)"] = []
            self.res_dict[vote_type + " (Republicans)"] = []
            self.res_dict[vote_type + " (Others)"] = []
        self.word_search = word_search
        # Directory-name prefix of each chamber -> key used in the per-year counts.
        self.chambers = {"/h":"house_count", "/s":"senate_count"}

    @staticmethod
    def _read_json(path):
        """ Parses the JSON file at `path`; the file handle is closed before returning. """
        with open(path, 'r') as json_file:
            return json.load(json_file)

    def find_chamber_count(self):
        """ 
            Lists all vote sessions completed in the House and the Senate (both chambers) 
            for a given congress and year.

            Entries of the votes directory that are not directories (for example a
            '.DS_Store' file) are ignored.

            Returns dictionary of year and chamber count arrangements 
        """
        self.years = []
        self.count_arrangements = {}
        self.congress_votes = self.congress_num+"votes/"
        for year in os.listdir(self.congress_votes):
            year_path = self.congress_votes + year
            # Listing a plain file would raise, so only descend into year directories.
            if not os.path.isdir(year_path):
                continue
            self.years.append(year)
            self.house_count = 0
            self.senate_count = 0
            for chamber in os.listdir(year_path):
                if chamber.startswith("h"):
                    self.house_count += 1
                elif chamber.startswith("s"):
                    self.senate_count += 1
            self.count_arrangements[year] = {'senate_count':self.senate_count, 'house_count':self.house_count}
        return self.count_arrangements

    def read_chamber_data(self):
        """
            Finds all indices of votes related to the query search word and returns resulting dictionary

            Vote folders are addressed as <prefix>1 .. <prefix>N, where N is the count
            reported by find_chamber_count, so the numbering is expected to be contiguous.
            The indices of matching votes are also kept in self.res_docs.

            Returns self.res_dict once at least one vote matched, otherwise an empty dict.
        """
        chambers_in_years = self.find_chamber_count()
        res_all = {}
        self.res_docs = []
        for chamber in self.chambers:
            count_key = self.chambers[chamber]
            for year in self.years:
                chamber_path = self.congress_votes+year+chamber
                for i in range(1, chambers_in_years[year][count_key]+1):
                    path = chamber_path + str(i) + '/data.json'
                    vote = self._read_json(path)
                    if self.word_search in vote["question"]:
                        self.res_docs.append(i)
                        res_all = self.arrange_dict(i, path, count_key)
        return res_all

    def arrange_dict(self, vote_index, vote_path, chamber):
        """
            Allocates keys and values to the resulting dictionary and populates fields using helper API methods.

            vote_index -- index of the vote within its chamber/year (only forwarded to get_bill_info).
            vote_path  -- path of the vote's data.json file.
            chamber    -- count key of the chamber ("house_count" / "senate_count");
                          the part before "_" is stored in the 'chamber' column.

            All values of the new row are computed before anything is appended, so
            a failure while reading the vote or its bill leaves every column of
            self.res_dict at the same length.

            Returns self.res_dict.
        """
        vote = self._read_json(vote_path)
        votes_info = vote['votes']
        # Some votes record "Aye"/"No" instead of "Yea"/"Nay"; expose them under the latter names.
        if "Aye" in votes_info:
            votes_info["Yea"] = votes_info["Aye"]
        if "No" in votes_info:
            votes_info["Nay"] = votes_info["No"]

        is_amendment = vote["category"] == "amendment"
        if is_amendment:
            amendment = vote['amendment']
            amendment_type_num = str(amendment['type'][0])+"amdt"+str(amendment['number'])
        else:
            amendment_type_num = "no"

        bill_title = str(vote['bill']['type'] + str(vote['bill']['number']))
        # One lookup per vote: the bill file is parsed once instead of three times.
        bill_long_text, top_subject, bill_sponsor_name = self.get_bill_info(vote_index, vote_path)
        result = str(vote['result_text'])
        date = str(vote['date'])
        year = vote['date'].split('-')[0]
        question = (vote['question']).encode('utf-8')
        chamber_name = str(chamber.split('_')[0])

        party_suffixes = (('D', " (Democrats)"), ('R', " (Republicans)"), ('I', " (Others)"))
        tallies = {}
        list_of_people_for_vote = []
        for vote_type in self.vote_types:
            # Only the party codes 'D', 'R' and 'I' are tallied; any other code is
            # listed among the voters but not counted in a column.
            counts = {'D': 0, 'R': 0, 'I': 0}
            for voterObj in votes_info[vote_type]:
                party = voterObj["party"]
                # Compare the parsed text, not its encoded form, so the tally does
                # not depend on how the interpreter compares text with bytes.
                if party in counts:
                    counts[party] += 1
                list_of_people_for_vote.append({
                    "display_name": voterObj["display_name"].encode('utf-8'),
                    "party": party.encode('utf-8'),
                    "state": voterObj["state"].encode('utf-8'),
                    "vote": vote_type,
                })
            for party_code, suffix in party_suffixes:
                tallies[vote_type + suffix] = counts[party_code]

        # Everything succeeded: extend each column by exactly one value.
        self.res_dict['amendment_type_num'].append(amendment_type_num)
        self.res_dict['isAmendment'].append(is_amendment)
        self.res_dict['bill_title'].append(bill_title)
        self.res_dict['bill_long_text'].append(bill_long_text)
        self.res_dict['top_subject'].append(top_subject)
        self.res_dict['bill_sponsor_name'].append(bill_sponsor_name)
        self.res_dict['result'].append(result)
        self.res_dict['date'].append(date)
        self.res_dict['year'].append(year)
        self.res_dict['question'].append(question)
        self.res_dict['chamber'].append(chamber_name)
        for column, count in tallies.items():
            self.res_dict[column].append(count)
        self.res_dict["list_of_voters_obj"].append(list_of_people_for_vote)

        return self.res_dict

    def get_bill_info(self, vote_index, vote_path):
        """
            Extracts information from bills relevant to the votes investigated.
            Return array of information for bill, including its text, top subject and sponsor's name.

            vote_index -- unused; kept so existing callers keep working.
            vote_path  -- path of the vote's data.json file; its "bill" entry
                          gives the type and number of the bill to look up.

            The bill is read from <congress_num>bills/<type>/<type><number>/data.json.
            Returns None when the bill type is not one of self.bill_types.
        """
        vote = self._read_json(vote_path)
        bill_type = str(vote['bill']['type'])
        bill_type_num = bill_type + str(vote['bill']['number'])
        # Use the declared type directly rather than searching for type names
        # inside the "<type><number>" string.
        if bill_type not in self.bill_types:
            return None
        bill_data_path = self.congress_num + "bills/" + bill_type + "/" + bill_type_num + "/data.json"
        bill = self._read_json(bill_data_path)
        long_text_top_subject_array = [bill["summary"]["text"], bill["subjects_top_term"], bill["sponsor"]["name"]]
        return long_text_top_subject_array

In [18]:
# Processor for the 108th congress data, searching vote questions for "Immigration".
pr = Processor(congress_num="./data/108/", word_search="Immigration")

In [51]:
congress_years = ["104", "105", "106", "107", "108"]
# test_congress_years = ["104", "105"]
# Word looked for in each vote's "question"; it is also part of the output file name.
search_word = "Immigration"
# Folder holding one sub-directory per congress; adjust to where the data is mounted.
data_root = "/Volumes/usb1/"
frames = []

# Build one DataFrame of matching votes per congress.
for congress in congress_years:
    # Function-call form prints the same text under Python 2 and Python 3.
    print(congress)
    pr = Processor(data_root+congress+"/", search_word)
    df = pd.DataFrame(data=pr.read_chamber_data())
    frames.append(df)

# Stack the per-congress frames and write them to a single CSV file.
allframes = pd.concat(frames)
filename_dest = "word_%s_congress_%s_%s.csv" % (search_word, congress_years[0], congress_years[-1])
allframes.to_csv(filename_dest, index=False)

104
105
106
107
108


In [12]:
# Load the CSV of "Immigration" votes for congresses 104-108 (same file name as
# the export written earlier in this notebook) and preview its first three rows.
data_info = pd.read_csv("./word_Immigration_congress_104_108.csv")
data_info.head(3)

Unnamed: 0,Nay (Democrats),Nay (Others),Nay (Republicans),Not Voting (Democrats),Not Voting (Others),Not Voting (Republicans),Present (Democrats),Present (Others),Present (Republicans),Yea (Democrats),...,bill_sponsor_name,bill_title,chamber,date,isAmendment,list_of_voters_obj,question,result,top_subject,year
0,35,0,44,1,0,0,0,0,0,11,...,"Hatch, Orrin G.",s1664,senate,1996-04-24T15:44:00-04:00,True,"[{'vote': 'Nay', 'party': 'R', 'state': 'MI', ...",On the Amendment S.Amdt. 3730 to S.Amdt. 3725 ...,Amendment Rejected (20-79),Immigration,1996
1,4,0,2,1,0,1,0,0,0,42,...,"Hatch, Orrin G.",s1664,senate,1996-04-24T18:16:00-04:00,True,"[{'vote': 'Nay', 'party': 'D', 'state': 'NJ', ...",On the Amendment S.Amdt. 3672 to S.Amdt. 3667 ...,Amendment Agreed to (92-6),Immigration,1996
2,43,0,0,0,0,1,0,0,0,4,...,"Hatch, Orrin G.",s1664,senate,1996-04-24T18:42:00-04:00,False,"[{'vote': 'Nay', 'party': 'D', 'state': 'HI', ...",On the Motion to Table S.Amdt. 3667 to S. 1664...,Motion to Table Agreed to (56-43),Immigration,1996


In [13]:
# Number of rows (matching votes) in data_info for each value of the 'year' column.
year_sorted_immigration = data_info.groupby('year').size()
year_sorted_immigration

year
1996    32
1999     1
2001     1
2002     4
dtype: int64

In [14]:
# Load the CSV of "Foreign" votes and preview its first three rows.
# NOTE(review): no cell shown in this notebook writes this file; presumably the
# export cell was re-run with the search word "Foreign" -- confirm before Run All.
data_info_foreign = pd.read_csv("./word_Foreign_congress_104_108.csv")
data_info_foreign.head(3)

Unnamed: 0,Nay (Democrats),Nay (Others),Nay (Republicans),Not Voting (Democrats),Not Voting (Others),Not Voting (Republicans),Present (Democrats),Present (Others),Present (Republicans),Yea (Democrats),...,bill_sponsor_name,bill_title,chamber,date,isAmendment,list_of_voters_obj,question,result,top_subject,year
0,1,0,1,2,0,2,0,0,0,43,...,"Helms, Jesse",s908,senate,1995-07-31T18:47:00-04:00,True,"[{'vote': 'Nay', 'party': 'R', 'state': 'OR', ...",On the Amendment S.Amdt. 2026 to S.Amdt. 2025 ...,Amendment Agreed to (94-2),International affairs,1995
1,44,0,4,1,0,2,0,0,0,1,...,"Helms, Jesse",s908,senate,1995-07-31T19:13:00-04:00,False,"[{'vote': 'Nay', 'party': 'D', 'state': 'HI', ...",On the Motion to Table S.Amdt. 1977 to S. 908 ...,Motion to Table Agreed to (49-48),International affairs,1995
2,0,0,0,3,0,1,0,0,0,44,...,"Callahan, Sonny",hr3540,senate,1996-07-25T10:01:00-04:00,True,"[{'vote': 'Not Voting', 'party': 'R', 'state':...",On the Amendment S.Amdt. 5017 to H.R. 3540 (Fo...,Amendment Agreed to (96-0),International affairs,1996


In [17]:
# Number of rows (matching votes) in data_info_foreign for each value of the 'year' column.
year_sorted_foreign = data_info_foreign.groupby('year').size()
year_sorted_foreign

year
1995    20
1996    11
1997    17
1998     7
1999    11
2000     8
2001     6
2003    13
2004     3
dtype: int64

topic analysis on bills -> set of themes within the immigration bills
late

tf-idf --> dissimilarity of two things --> just like PCA
cluster of bills --> 

clustering on the vote outcome --> two outcomes --> rep and dem origin perhaps
binary vectors of yeas and nays 

https://github.com/brandomr/document_cluster/blob/master/cluster_analysis.ipynb

https://github.com/jakevdp/sklearn_pycon2015/blob/master/notebooks/04.2-Clustering-KMeans.ipynb
