In [1]:
#Imports
import numpy as np
import pandas as pd
import json
import re
from sklearn.model_selection import StratifiedShuffleSplit
from nltk.corpus import stopwords

from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim import corpora



In [2]:
#Load bill status data to obtain the status

with open('..\\Data\\107th-112th Congress\\Bill status\\HR_bill_status_contemporary.json') as f:
    HR_data1= json.load(f)

with open('..\\Data\\107th-112th Congress\\Bill status\\Sen_bill_status_contemporary.json') as f:
    Sen_data1= json.load(f)
    
with open('..\\Data\\113th-114th Congress\\Bill status\\HR_bill_status_modern.json') as f:
    HR_data2= json.load(f)

with open('..\\Data\\113th-114th Congress\\Bill status\\Sen_bill_status_modern.json') as f:
    Sen_data2= json.load(f)    
    

In [3]:
def extract_id_status(bill_list):
    v = {}
    for b in bill_list:
        v.update({b['bill_id']:b['status']})
    return v

In [4]:
#Bill id and status

d1 = extract_id_status(HR_data1)
d2 = extract_id_status(Sen_data1)
d3 = extract_id_status(HR_data2)
d4 = extract_id_status(Sen_data2)

#Concatenate dictionaries into one
status_dict = {}
for d in (d1,d2,d3,d4):
    status_dict.update(d)

In [5]:
#Create dataframe from status_dict
status_df = pd.DataFrame.from_dict(status_dict,orient='index',columns=['Bill Status']).reset_index(level=0)

In [6]:
status_df.head()

Unnamed: 0,index,Bill Status
0,hr1-107,ENACTED:SIGNED
1,hr10-107,ENACTED:SIGNED
2,hr100-107,PASS_OVER:HOUSE
3,hr1000-107,ENACTED:SIGNED
4,hr1001-107,REFERRED


In [7]:
#Map status values to binary

#Once a Congress adjourn at the end of its two-year cycle, all bills that have been introduced in either
#the House or the Senate that have not made it through the entire legislative process and signed into law are dead.

#dictionary which specifies status value to binary
#0 = did not pass in originating chamber (firs pass)
#1 = did pass in originating chamber (first pass)
#survive committee in the originating chamber (doing this for more class balance)

status_binary_dict = {
    'INTRODUCED':0,
    'REFERRED':0,
    'REPORTED':1,
    'PROV_KILL:SUSPENSIONFAILED':1,
    'PROV_KILL:CLOTUREFAILED':1,
    'FAIL:ORIGINATING:HOUSE':1,
    'FAIL:ORIGINATING:SENATE':1,
    'PASSED:SIMPLERES':1,
    'PASSED:CONSTAMEND':1,
    'PASS_OVER:HOUSE':1,
    'PASS_OVER:SENATE':1,
    'PASSED:CONCURRENTRES':1,
    'FAIL:SECOND:HOUSE':1,
    'FAIL:SECOND:SENATE':1,
    'PASS_BACK:HOUSE':1,
    'PASS_BACK:SENATE':1,
    'PROV_KILL:PINGPONGFAIL':1,
    'PASSED:BILL':1,
    'CONFERENCE:PASSED:HOUSE':1,
    'CONFERENCE:PASSED:SENATE':1,
    'ENACTED:SIGNED':1,
    'PROV_KILL:VETO':1,
    'VETOED:POCKET':1,
    'VETOED:OVERRIDE_FAIL_ORIGINATING:HOUSE':1,
    'VETOED:OVERRIDE_FAIL_ORIGINATING:SENATE':1,
    'VETOED:OVERRIDE_PASS_OVER:HOUSE':1,
    'VETOED:OVERRIDE_PASS_OVER:SENATE':1,
    'VETOED:OVERRIDE_FAIL_SECOND:HOUSE':1,
    'VETOED:OVERRIDE_FAIL_SECOND:SENATE':1,
    'ENACTED:VETO_OVERRIDE':1,
    'ENACTED:TENDAYRULE':1,
    
}


In [8]:
#Use status_binary_dict to map values in dataframe
status_df["Bill Status"].replace(status_binary_dict, inplace=True)

In [9]:
status_df.head()

Unnamed: 0,index,Bill Status
0,hr1-107,1
1,hr10-107,1
2,hr100-107,1
3,hr1000-107,1
4,hr1001-107,0


In [10]:
#Load bill text data

with open('..\\Data\\107th-112th Congress\\Bill text\\HR_text_to_114.json') as f:
    HR_data= json.load(f)
    
with open('..\\Data\\107th-112th Congress\\Bill text\\Sen_text_to_114.json') as f:
    Sen_data= json.load(f)

In [11]:
#Merge dicts into one
HR_data.update(Sen_data)

In [12]:
len(HR_data)

77565

In [13]:
#Text Preprocessing
def clean_text(text):
    #Remove underscores
    text = re.sub('\_','',text)
    return text

#Remove extended ellipses


In [14]:
clean_bill_text= {k:clean_text(v) for k,v in HR_data.items()}

In [15]:
#Read clustered data to dataframe
text_df = pd.DataFrame.from_dict(clean_bill_text,orient='index', columns=['Text'])
text_df.reset_index(level=0,inplace=True)

In [16]:
text_df.head()

Unnamed: 0,index,Text
0,107hr1ih,a bill to close the achievement gap with acco...
1,107hr10ih,"to provide for pension reform, and for other ..."
2,107hr100ih,to establish and expand programs relating to ...
3,107hr1000ih,to adjust the boundary of the william howard ...
4,107hr1001ih,to amend title xix of the social security act...


In [17]:
def rename_id(i):
    j=i[:-2]
    k = j[3:]+'-'+j[:3]
    return k

In [18]:
#Rename bill id from dataframe so that it matches dictionary of id and status

text_df['index']=text_df['index'].apply(lambda x: rename_id(x))

In [19]:
text_df.tail()

Unnamed: 0,index,Text
77560,s995-114,to establish congressional trade negotiating ...
77561,s996-114,to facilitate nationwide availability of volu...
77562,s997-114,to extend the authorization for the major med...
77563,s998-114,to establish a process for the consideration ...
77564,s999-114,to amend the small business act to provide fo...


In [20]:
#merge datasets

merged_text = pd.merge(text_df,status_df, on='index',how='inner')

In [21]:
merged_text.head()

Unnamed: 0,index,Text,Bill Status
0,hr1-107,a bill to close the achievement gap with acco...,1
1,hr10-107,"to provide for pension reform, and for other ...",1
2,hr100-107,to establish and expand programs relating to ...,1
3,hr1000-107,to adjust the boundary of the william howard ...,1
4,hr1001-107,to amend title xix of the social security act...,0


In [22]:
#Save merged text to csv
merged_text.to_csv('../Data/Labeled bill documents.csv')

In [23]:
# Stratified Train/Test split
stratified_split = StratifiedShuffleSplit(n_splits=2, test_size=0.3)
for train_index, test_index in stratified_split.split(merged_text['Text'], merged_text['Bill Status']):
    x_train, x_test = merged_text['Text'][train_index], merged_text['Text'][test_index]
    y_train, y_test = merged_text['Bill Status'][train_index], merged_text['Bill Status'][test_index]

In [32]:
#Save training/test sets
x_train.to_csv('../Data/text x_train.csv')
x_test.to_csv('../Data/text x_test.csv')
y_train.to_csv('../Data/text y_train.csv')
y_test.to_csv('../Data/text y_test.csv')

In [26]:
#get the text column 
documents=x_train.tolist()
 

# Create a corpus
***
Tokenize documents and remove stop words. Then fit dictionary from token list and create corpus.

In [27]:
%%time
#tokenize
tokens =[simple_preprocess(doc) for doc in documents]

Wall time: 1min 33s


In [28]:
%%time
#Remove stop words
stops=set(stopwords.words('english'))
filt_docs = [[word for word in token if word not in stops]
            for token in tokens]

Wall time: 7.92 s


In [29]:
#Create dictionary
dct = Dictionary(tokens)  # fit dictionary
dct.save('text-train.dict')  # store the dictionary, for future reference

In [30]:
print(dct)

Dictionary(62347 unique tokens: ['academy', 'act', 'after', 'america', 'among']...)


In [31]:
#Create corpus from the dictionary
corpus = [dct.doc2bow(token) for token in tokens]
corpora.MmCorpus.serialize('text-train-corpus.mm', corpus)  # store to disk, for later use