## Perspectives on Computational Research -- HW02
## Author: Sanittawan Tan

In [48]:
import random
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [49]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras import models
from keras import layers
from keras import losses
from keras import metrics

In [50]:
# check NAs (do this for 3 files)
# Every column is read as a string here so the only thing being measured is
# how many cells are missing in each column.
types_dict = {'BillID': 'str', 'BillNum': 'str', 'Title': 'str', 'Major': 'str'}
df = pd.read_csv('./data/congress_test.csv', encoding="ISO-8859-1", dtype=types_dict)
for col in types_dict:
    # vectorised count of missing values in this column
    print(df[col].isna().sum())

0
0
0
0


### Process and tokenize the data

In [51]:
def process_file(filepath):
    """Read one bills CSV and return its titles and topic labels.

    The file is decoded as ISO-8859-1. The 'BillID', 'BillNum' and 'Title'
    columns are read as strings and 'Major' as pandas' nullable Int64.

    Parameters
    ----------
    filepath : path of the CSV file to read.

    Returns
    -------
    (text, label) : two lists of equal length -- the values of the 'Title'
        column and the values of the 'Major' column, in file order.
    """
    column_types = {'BillID': 'str', 'BillNum': 'str', 'Title': 'str', 'Major': 'Int64'}
    frame = pd.read_csv(filepath, encoding="ISO-8859-1", dtype=column_types)
    return frame['Title'].tolist(), frame['Major'].tolist()

In [52]:
# Load the raw titles and topic labels for each of the three data splits.
train_text, train_label = process_file(filepath='./data/congress_train.csv')
val_text, val_label = process_file(filepath='./data/congress_val.csv')
test_text, test_label = process_file(filepath='./data/congress_test.csv')

In [53]:
# Vocabulary cap handed to the Keras Tokenizer (num_words).
max_words = 10000
# Length every tokenised title is padded / truncated to (pad_sequences maxlen).
max_len = 100

In [54]:
def tokenize_and_pad(text, tokenizer=None):
    """Turn raw texts into fixed-length integer sequences.

    Parameters
    ----------
    text : list of str
        The documents to encode.
    tokenizer : keras Tokenizer, optional
        An already-fitted tokenizer to encode ``text`` with. When omitted
        (the default) a new ``Tokenizer(num_words=max_words)`` is created
        and fitted on ``text`` itself.

        Pass the tokenizer fitted on the training data when encoding
        validation / test data: a tokenizer fitted separately on each split
        assigns different integer ids to the same word, which makes the
        encoded splits incompatible with each other.

    Returns
    -------
    (data, word_index)
        ``data`` is the array returned by ``pad_sequences`` with
        ``maxlen=max_len``; ``word_index`` is the word -> id mapping of the
        tokenizer that was used.
    """
    if tokenizer is None:
        # Backward-compatible default: learn the vocabulary from `text`.
        tokenizer = Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    data = pad_sequences(sequences, maxlen=max_len)
    return data, tokenizer.word_index

In [55]:
# Fit ONE tokenizer on the training titles only and reuse it for every split.
# Fitting a separate tokenizer per split gives the same word a different
# integer id in each split, so a model trained on the training ids would see
# unrelated ids at validation / test time.
shared_tokenizer = Tokenizer(num_words=max_words)
shared_tokenizer.fit_on_texts(train_text)

# NOTE: these assignments replace the raw title lists with padded id arrays,
# so this cell must be run exactly once after the data-loading cell.
train_text = pad_sequences(shared_tokenizer.texts_to_sequences(train_text), maxlen=max_len)
val_text = pad_sequences(shared_tokenizer.texts_to_sequences(val_text), maxlen=max_len)
test_text = pad_sequences(shared_tokenizer.texts_to_sequences(test_text), maxlen=max_len)

# Later cells refer to per-split word indexes; they all share one vocabulary.
train_word_index = val_word_index = test_word_index = shared_tokenizer.word_index

In [56]:
# Vocabulary used to build the pre-trained embedding matrix.
# Rows of the embedding matrix are looked up by the integer ids found in the
# training sequences, so the mapping must be the training tokenizer's own.
# Merging in a second mapping would let its ids override the training ids for
# every shared word and misalign the matrix.
word_index = dict(train_word_index)

In [57]:
# one hot encoding the labels
# Use a single class count for all three splits. Without `num_classes`,
# to_categorical infers the width from the largest label present in the array
# it is given, so a split that happens to lack the highest topic code would
# get a narrower matrix than the others.
num_classes = max(max(train_label), max(val_label), max(test_label)) + 1
train_label = to_categorical(train_label, num_classes=num_classes)
val_label = to_categorical(val_label, num_classes=num_classes)
test_label = to_categorical(test_label, num_classes=num_classes)

### Prepare GloVe word-embeddings

In [58]:
# parsing the Glove word-embeddings (from the book)
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
glove_dir = '/home/ubuntu/hw02-1'

# Maps each word to its vector (float32 array of the remaining fields on the line).
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse; the
# encoding is given explicitly because the GloVe text files are UTF-8 and the
# platform default encoding is not guaranteed to be.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]  # first field is the token
        coefs = np.asarray(values[1:], dtype='float32')  # the rest is its vector
        embeddings_index[word] = coefs
print('Found {} word vectors.'.format(len(embeddings_index)))

Found 400000 word vectors.


In [59]:
# preparing Glove word-embeddings matrix (from the book)
# Row i holds the pre-trained vector of the word whose tokenizer id is i.
# Rows stay all-zero for ids whose word has no entry in `embeddings_index`.
max_words = 10000
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        # id falls outside the matrix -> skip
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # no pre-trained vector for this word -> leave the zero row
        continue
    embedding_matrix[i] = embedding_vector

### 1) Estimate a basic feed-forward network (without pre-trained word embeddings)

In [60]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten, Dense

In [63]:
# Baseline feed-forward classifier whose embedding is learned from scratch.
# The embedding width (8) is an arbitrary small starting value; treat it as a
# hyperparameter to tune against the validation set.
ff_no_glove = Sequential()
# Vocabulary size comes from the shared `max_words` constant (the same cap
# given to the tokenizer) instead of a repeated literal, so the two cannot drift.
ff_no_glove.add(Embedding(max_words, 8, input_length=max_len))
ff_no_glove.add(Flatten())
# 24 output units -- presumably the width of the one-hot label matrix; confirm
# against train_label.shape[1].
ff_no_glove.add(Dense(24, activation='softmax'))
ff_no_glove.compile(optimizer='rmsprop',
                    loss='categorical_crossentropy',
                    metrics=['acc'])
ff_no_glove.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 8)            80000     
_________________________________________________________________
flatten_3 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 24)                19224     
Total params: 99,224
Trainable params: 99,224
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the baseline network; the returned History object keeps the
# per-epoch metrics for the training and validation data.
ff_no_glove_history = ff_no_glove.fit(
    x=train_text,
    y=train_label,
    batch_size=512,
    epochs=50,
    validation_data=(val_text, val_label),
)

Instructions for updating:
Use tf.cast instead.
Train on 278612 samples, validate on 69649 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50

### 2) Estimate a basic feed-forward network (with GloVe word embeddings)