In [1]:
import os 
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

Using TensorFlow backend.


In [4]:
#import the data 
os.chdir("./data")

def read_data(file_name):
    labels = []
    titles = []
    
    with open(file_name, 'r') as f:
        try:
            for line in f:
                if 'BillID,' in line:
                    pass
                elif '"' in line:
                    split_1 = line.split('"')
                    label = split_1[-1].lstrip(",").rstrip("\n")
                    labels.append(int(label))
                    title = split_1[-2].lstrip(",")
                    title = title.replace(",", "") #here I deleted "," from the titles.
                    titles.append(title)

                else:
                    split = line.split(",")
                    titles.append(split[-2])
                    labels.append(int(split[-1].strip("\n")))
        except:
            print(line)
                
    return ((titles, labels))

train_titles_raw, train_labels_raw = read_data("congress_train.csv")
val_titles_raw, val_labels_raw = read_data("congress_val.csv")
test_titles_raw, test_labels_raw = read_data("congress_test.csv")

114-HR-4322,4322,"To clarify the prohibition on affiliation under the Mentor-Protege Program of the Department of Defense, to amend the Small Business Act to improve cooperation between the mentor-protege programs of the Small Business Administration and the Department of Defense, and for other purposes.",15

114-HR-435,435,"To direct the Secretary of the Interior to sell certain Federal lands in Arizona, Colorado, Idaho, Montana, Nebraska, Nevada, New Mexico, Oregon, Utah, and Wyoming, previously identified as suitable for disposal, and for other purposes.",21



In [5]:
#setting up_data
def setting_data(text_lst,maxlen, max_words):
    maxlen = maxlen
    max_words = max_words

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(text_lst)
    sequences = tokenizer.texts_to_sequences(text_lst)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=maxlen)

    print('Shape of data tensor:', data.shape)
    return (data)

#preprocessing train data 
x_train = setting_data(train_titles_raw, 
                       maxlen = 100,
                       max_words = 10000)

y_train = to_categorical(train_labels_raw)

#preprocessing validation data 
x_val = setting_data(val_titles_raw, 
                     maxlen = 100, 
                     max_words = 10000)

y_val = to_categorical(val_labels_raw)


#preprocessing test data 
x_test = setting_data(test_titles_raw, 
                      maxlen = 100,
                      max_words = 10000)

y_test = to_categorical(test_labels_raw)

Found 42197 unique tokens.
Shape of data tensor: (274482, 100)
Found 24985 unique tokens.
Shape of data tensor: (69649, 100)
Found 19565 unique tokens.
Shape of data tensor: (37733, 100)


In [6]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, GRU, LSTM

###  Estimate five additional neural network models with different configurations of hyperparameters (e.g. number of layers, number of hidden units, dropout, weight regularization, pre-trained word embeddings) 

In [48]:
#Mod_1

maxlen = 100

mod_1 = Sequential()
mod_1.add(Embedding(10000, 32, input_length = maxlen))
mod_1.add(LSTM(32,
          dropout=0.2,
          recurrent_dropout=0.2))
mod_1.add(Dense(24, activation='softmax'))

mod_1.compile(optimizer='rmsprop', 
                   loss='categorical_crossentropy', 
                   metrics=['acc'])
mod_1.summary()

mod_1_history = mod_1.fit(x_train, y_train, epochs= 10, batch_size= 2048,
                         validation_data = (x_val, y_val))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 100, 32)           320000    
_________________________________________________________________
lstm_7 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_16 (Dense)             (None, 24)                792       
Total params: 329,112
Trainable params: 329,112
Non-trainable params: 0
_________________________________________________________________
Train on 274482 samples, validate on 69649 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
#Mod_2
maxlen = 100

mod_2 = Sequential()
mod_2.add(Embedding(10000, 32, input_length = maxlen))
mod_2.add(GRU(32,
          dropout=0.1,
          recurrent_dropout=0.5))
mod_2.add(Dense(24, activation='softmax'))

mod_2.compile(optimizer='rmsprop', 
                   loss='categorical_crossentropy', 
                   metrics=['acc'])
mod_2.summary()

mod_2_history = mod_2.fit(x_train, y_train, epochs= 10, batch_size= 2048,
                         validation_data = (x_val, y_val))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 32)           320000    
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                6240      
_________________________________________________________________
dense_2 (Dense)              (None, 24)                792       
Total params: 327,032
Trainable params: 327,032
Non-trainable params: 0
_________________________________________________________________
Train on 274482 samples, validate on 69649 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [73]:
from keras.layers import Reshape
#Mod_3

maxlen = 100

mod_3 = Sequential()
mod_3.add(Embedding(10000, 16, input_length = maxlen))
mod_3.add(GRU(16, dropout=0.2, 
              recurrent_dropout=0.3,
              return_sequences=True))
mod_3.add(GRU(16, dropout=0.2,
             recurrent_dropout=0.3))
mod_3.add(Dense(24, activation='softmax'))

mod_3.compile(optimizer='rmsprop', 
                   loss='categorical_crossentropy', 
                   metrics=['acc'])
mod_3.summary()

mod_3_history = mod_3.fit(x_train, y_train, epochs= 10, batch_size= 2048,
                         validation_data = (x_val, y_val))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_29 (Embedding)     (None, 100, 16)           160000    
_________________________________________________________________
gru_61 (GRU)                 (None, 100, 16)           1584      
_________________________________________________________________
gru_62 (GRU)                 (None, 16)                1584      
_________________________________________________________________
dense_23 (Dense)             (None, 24)                408       
Total params: 163,576
Trainable params: 163,576
Non-trainable params: 0
_________________________________________________________________
Train on 274482 samples, validate on 69649 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [76]:
#Mod_4
#Mod_4
maxlen = 100

mod_4 = Sequential()
mod_4.add(Embedding(10000, 24, input_length = maxlen))
mod_4.add(GRU(24,
          dropout=0.1,
          recurrent_dropout=0.3,
          return_sequences=True))
mod_4.add(GRU(24,
          dropout=0.1,
          recurrent_dropout=0.3,
          return_sequences=True))
mod_4.add(GRU(24,
          dropout=0.3,
          recurrent_dropout=0.7))
mod_4.add(Dense(24, activation='softmax'))

mod_4.compile(optimizer='rmsprop', 
                   loss='categorical_crossentropy', 
                   metrics=['acc'])
mod_4.summary()

mod_4_history = mod_4.fit(x_train, y_train, epochs= 10, batch_size= 2048,
                         validation_data = (x_val, y_val))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_31 (Embedding)     (None, 100, 24)           240000    
_________________________________________________________________
gru_64 (GRU)                 (None, 100, 24)           3528      
_________________________________________________________________
gru_65 (GRU)                 (None, 100, 24)           3528      
_________________________________________________________________
gru_66 (GRU)                 (None, 24)                3528      
_________________________________________________________________
dense_24 (Dense)             (None, 24)                600       
Total params: 251,184
Trainable params: 251,184
Non-trainable params: 0
_________________________________________________________________
Train on 274482 samples, validate on 69649 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/

In [None]:
#Mod_5
maxlen = 100

mod_5 = Sequential()
mod_5.add(Embedding(10000, 8, input_length = maxlen))
mod_5.add(LSTM(8,
          dropout=0.3,
          recurrent_dropout=0.1, 
          return_sequences=True))
mod_5.add(LSTM(8,
          dropout=0.3,
          recurrent_dropout=0.1, 
          return_sequences=True))
mod_5.add(LSTM(8,
          dropout=0.3,
          recurrent_dropout=0.1))
mod_5.add(Dense(24, activation='softmax'))

mod_5.compile(optimizer='rmsprop', 
                   loss='categorical_crossentropy', 
                   metrics=['acc'])
mod_5.summary()

mod_5_history = mod_5.fit(x_train, y_train, epochs= 10, batch_size= 1024,
                         validation_data = (x_val, y_val))

In [82]:
import pickle 

history_lst = [mod_1_history, mod_2_history, mod_3_history, mod_4_history]
pickle.dump(history_lst, open("history_lst_2.p", "wb"))

### Select the best performing model based on the validation set and evaluate its performance using the test set. Assume that with hand-coding we can achieve a 95% accuracy rate. Would your neural network perform better or worse than hand-coding?

In [None]:
lstm_val_acc = lstm_history.history['val_acc']