## Import the data and tokenize using Keras.

* Keep only the 10000 most frequent words
* Limit each bill's title to a maximum length of 100 words
* Pad each sequence to be of length 100

In [1]:
import csv
import os

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [9]:
#import the data
# Resolve the data directory to an absolute path the first time this cell runs
# and remember it in DATA_DIR. A plain relative os.chdir("./data") only works
# once per kernel: on a re-run the working directory is already ./data, so the
# relative path would point at ./data/data and raise FileNotFoundError.
if "DATA_DIR" not in globals():
    DATA_DIR = os.path.abspath("./data")
os.chdir(DATA_DIR)

def read_data(file_name):
    """Read bill titles and their integer labels from a CSV file.

    The file is parsed with the standard ``csv`` module, so titles that are
    quoted because they contain commas, doubled (escaped) quotes or line
    breaks are recovered in full. For every record the title is taken from
    the second-to-last column and the label from the last column.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.

    Returns
    -------
    tuple of (list of str, list of int)
        ``(titles, labels)``, index-aligned. Commas are removed from every
        title.

    Raises
    ------
    IndexError
        If a non-blank record has fewer than two columns.
    ValueError
        If the last column of a data record cannot be converted to ``int``.
    """
    labels = []
    titles = []

    # newline='' lets the csv module handle line endings itself, including
    # line breaks that occur inside a quoted field.
    with open(file_name, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # Blank line: nothing to parse.
                continue
            if any(cell.endswith('BillID') for cell in row[:-1]):
                # Header record. Matching on the end of the cell also
                # tolerates a byte-order mark in front of the column name.
                continue

            title, label = row[-2], row[-1]
            labels.append(int(label))
            titles.append(title.replace(",", ""))  # commas are deleted from the titles

    return ((titles, labels))

# Parse the three splits in train -> validation -> test order and unpack each
# (titles, labels) pair into its own pair of names.
(
    (train_titles_raw, train_labels_raw),
    (val_titles_raw, val_labels_raw),
    (test_titles_raw, test_labels_raw),
) = map(read_data, ("congress_train.csv", "congress_val.csv", "congress_test.csv"))

In [10]:
# Sanity check: for each split, print the number of titles next to the number
# of labels (one line per split, train / validation / test).
for _titles, _labels in (
    (train_titles_raw, train_labels_raw),
    (val_titles_raw, val_labels_raw),
    (test_titles_raw, test_labels_raw),
):
    print(len(_titles), len(_labels))

278612 278612
69649 69649
38693 38693


In [4]:
#setting up_data
def setting_data(text_lst, label_lst, maxlen, max_words):
    maxlen = maxlen
    max_words = max_words

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(text_lst)
    sequences = tokenizer.texts_to_sequences(text_lst)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=maxlen)
    labels = np.asarray(label_lst)

    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    return ((data, labels))

In [11]:
#preprocessing train data 
x_train, y_train = setting_data(train_titles_raw, 
                                train_labels_raw,
                                maxlen = 100,
                                max_words = 10000)

#preprocessing validation data 
x_val, y_val = setting_data(val_titles_raw, 
                            val_labels_raw, 
                            maxlen = 100, 
                            max_words = 10000)

#preprocessing test data 
x_test, y_test = setting_data(test_titles_raw, 
                              test_labels_raw,
                              maxlen = 100,
                              max_words = 10000)

Found 42449 unique tokens.
Shape of data tensor: (278612, 100)
Shape of label tensor: (278612,)
Found 24985 unique tokens.
Shape of data tensor: (69649, 100)
Shape of label tensor: (69649,)
Found 19813 unique tokens.
Shape of data tensor: (38693, 100)
Shape of label tensor: (38693,)


### Estimate a basic feed-forward network

### Estimate a recurrent neural network (RNN) with a simple RNN layer (`layer_simple_rnn` in R Keras, `SimpleRNN` in Python Keras)

### Estimate an RNN with an LSTM layer

### Estimate an RNN with a GRU layer

###  Estimate five additional neural network models with different configurations of hyperparameters (e.g. number of layers, number of hidden units, dropout, weight regularization, pre-trained word embeddings) 

### Select the best performing model based on the validation set and evaluate its performance using the test set. Assume that with hand-coding we can achieve a 95% accuracy rate. Would your neural network perform better or worse than hand-coding?