In [1]:
import os 
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

Using TensorFlow backend.


In [2]:
#import the data 
os.chdir("./data")

def read_data(file_name):
    labels = []
    titles = []
    with open(file_name, 'r') as f:
        lines = f.readlines()
        for line in lines:
            if 'BillID' not in line:
                split = line.split("\t")
                label = split[-1].rstrip("\n")
                if label == "23":
                    label = "11"
                labels.append(label)
                title = split[-2].replace('"', "").replace(",", "").replace(".", "").lower()
                titles.append(title)
                
    return (titles, np.asarray(labels))

train_titles_raw, train_labels_raw = read_data("congress_train.txt")
val_titles_raw, val_labels_raw = read_data("congress_val.txt")
test_titles_raw, test_labels_raw = read_data("congress_test.txt")

In [3]:
#setting up_data
maxlen = 100
max_words = 10000

#training data
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_titles_raw)
training_sequences = tokenizer.texts_to_sequences(train_titles_raw)
word_index = tokenizer.word_index
x_train = pad_sequences(training_sequences, maxlen=100)

#validation
val_sequences = tokenizer.texts_to_sequences(val_titles_raw)
x_val = pad_sequences(val_sequences, maxlen=100)

#test
test_sequences = tokenizer.texts_to_sequences(test_titles_raw)
x_test = pad_sequences(test_sequences, maxlen =100)


y_train = to_categorical(train_labels_raw)
y_val = to_categorical(val_labels_raw)
y_test = to_categorical(test_labels_raw)

In [4]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, GRU, LSTM

###  Estimate five additional neural network models with different configurations of hyperparameters (e.g. number of layers, number of hidden units, dropout, weight regularization, pre-trained word embeddings) 

In [5]:
#Mod_1

maxlen = 100

mod_1 = Sequential()
mod_1.add(Embedding(10000, 32, input_length = maxlen))
mod_1.add(LSTM(32,
          dropout=0.2,
          recurrent_dropout=0.2))
mod_1.add(Dense(22, activation='softmax'))

mod_1.compile(optimizer='rmsprop', 
                   loss='categorical_crossentropy', 
                   metrics=['acc'])
mod_1.summary()

mod_1_history = mod_1.fit(x_train, y_train, epochs= 20, batch_size= 1024,
                         validation_data = (x_val, y_val))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           320000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 22)                726       
Total params: 329,046
Trainable params: 329,046
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 278612 samples, validate on 69649 samples
Epoch 1/20
  2048/278612 [..............................] - ETA: 7:07 - loss: 3.0887 - acc: 0.0771 

ResourceExhaustedError: OOM when allocating tensor with shape[100,1024,32] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node training/RMSprop/gradients/lstm_1/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGatherV3}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


In [None]:
#Mod_2
maxlen = 100

mod_2 = Sequential()
mod_2.add(Embedding(10000, 32, input_length = maxlen))
mod_2.add(GRU(32,
          dropout=0.1,
          recurrent_dropout=0.5))
mod_2.add(Dense(22, activation='softmax'))

mod_2.compile(optimizer='rmsprop', 
                   loss='categorical_crossentropy', 
                   metrics=['acc'])
mod_2.summary()

mod_2_history = mod_2.fit(x_train, y_train, epochs= 10, batch_size= 1024,
                         validation_data = (x_val, y_val))

In [None]:
from keras.layers import Reshape
#Mod_3

maxlen = 100

mod_3 = Sequential()
mod_3.add(Embedding(10000, 16, input_length = maxlen))
mod_3.add(GRU(16, dropout=0.2, 
              recurrent_dropout=0.3,
              return_sequences=True))
mod_3.add(GRU(16, dropout=0.2,
             recurrent_dropout=0.3))
mod_3.add(Dense(22, activation='softmax'))

mod_3.compile(optimizer='rmsprop', 
                   loss='categorical_crossentropy', 
                   metrics=['acc'])
mod_3.summary()

mod_3_history = mod_3.fit(x_train, y_train, epochs= 20, batch_size= 1024,
                         validation_data = (x_val, y_val))

In [None]:
#Mod_4
#Mod_4
maxlen = 100

mod_4 = Sequential()
mod_4.add(Embedding(10000, 24, input_length = maxlen))
mod_4.add(GRU(24,
          dropout=0.1,
          recurrent_dropout=0.3,
          return_sequences=True))
mod_4.add(GRU(24,
          dropout=0.1,
          recurrent_dropout=0.3,
          return_sequences=True))
mod_4.add(GRU(24,
          dropout=0.3,
          recurrent_dropout=0.7))
mod_4.add(Dense(22, activation='softmax'))

mod_4.compile(optimizer='rmsprop', 
                   loss='categorical_crossentropy', 
                   metrics=['acc'])
mod_4.summary()

mod_4_history = mod_4.fit(x_train, y_train, epochs= 20, batch_size= 1024,
                         validation_data = (x_val, y_val))

In [None]:
#Mod_5
maxlen = 100

mod_5 = Sequential()
mod_5.add(Embedding(10000, 8, input_length = maxlen))
mod_5.add(LSTM(8,
          dropout=0.3,
          recurrent_dropout=0.1, 
          return_sequences=True))
mod_5.add(LSTM(8,
          dropout=0.3,
          recurrent_dropout=0.1, 
          return_sequences=True))
mod_5.add(LSTM(8,
          dropout=0.3,
          recurrent_dropout=0.1))
mod_5.add(Dense(22, activation='softmax'))

mod_5.compile(optimizer='rmsprop', 
                   loss='categorical_crossentropy', 
                   metrics=['acc'])
mod_5.summary()

mod_5_history = mod_5.fit(x_train, y_train, epochs= 20, batch_size= 1024,
                         validation_data = (x_val, y_val))

### Select the best performing model based on the validation set and evaluate its performance using the test set. Assume that with hand-coding we can achieve a 95% accuracy rate. Would your neural network perform better or worse than hand-coding?