# L665 ML for NLPSpring 2018 

## Assignment 2 - Task 1 

Author: Carlos Sathler

In [1]:
import numpy as np
import nltk

import matplotlib.pyplot as plt
%matplotlib inline  

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from nltk.corpus import brown
nltk.download('brown')
nltk.download('universal_tagset')

Using TensorFlow backend.


[nltk_data] Downloading package brown to /Users/carlos2/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/carlos2/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [30]:
%%time

MAX_SEQ = 200

# create lists of tokens and tags from corpora
# add UNK to map unknown tag values, PAD to pad sequences for RNN
tokens, tags = zip(*brown.tagged_words(tagset='universal'))
tokens = list(tokens)
tokens.append('PAD')
tags = list(tags)
tags.append('PAD')

# create token embedding dictionary and inverse dictionary
token_dict = {token: no for no, token in enumerate(set(tokens))}
token_dict_inv = {no: token for token, no in token_dict.items()}

# create tag embedding dictionary and inverse dictionary
tag_dict = {tag: no for no, tag in enumerate(set(tags))}
tag_dict_inv = {no: tag for tag, no in tag_dict.items()}

# Create dictionary with sentence number as key, 
# and tuple with lists of token, token numbers, tags, tag numbers  

sents = {}
start = 0
for sent_no, token_list in enumerate(brown.sents()):
    end = start + len(token_list)
    sents[sent_no] = {}
    # get tags for this sentence, and then embedding
    tokens    = tokens[start: end]
    token_nos = [token_dict[token] for token in tokens]
    tags      = tags[start: end]
    tag_nos   = [tag_dict[tag] for tag in tags]
    sents[sent_no] = (tokens, token_nos, tags, tag_nos)
    start = end
    
# pad token and tag sequences to MAX_SEQ with PAD to get X and y for sequence to sequence model

token_seqs = [sents[1][1] for sents in sents.items()]
X = pad_sequences(maxlen=MAX_SEQ, sequences=token_seqs, padding="pre", value=token_dict['PAD'])

tag_seqs = [sents[1][3] for sents in sents.items()]
y = pad_sequences(maxlen=MAX_SEQ, sequences=tag_seqs, padding="pre", value=tag_dict['PAD'])

CPU times: user 13.1 s, sys: 610 ms, total: 13.7 s
Wall time: 15.5 s


In [31]:
# check if X and y were properly created
print(X[0])
print(y[0])

[21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500 21500
 21500 21500 21500 21500 21500 21500 21500 21500 21

In [4]:
# define LSTM bidirectional seq to seq model
# will vectorize word embeddings to vectors of size wordvec_dim

WORDVEC_DIM = 20

def get_model(max_seq, voc_size, wordvec_dim, tag_count):
    
    input = Input(shape=(max_seq,), dtype='float64')
    embed = Embedding(input_dim=voc_size, output_dim=wordvec_dim, input_length=MAX_SEQ, \
                      embeddings_initializer='uniform')(input)
    lstm = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(embed)
    out = TimeDistributed(Dense(tag_count, activation="softmax"))(lstm) 

    model = Model(input, out)
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

    model.summary()
    
    return model

In [33]:
vocabulary_size = len(token_dict)
number_of_classes = len(tag_dict)

# need to change y to categorical 
ycat = [to_categorical(i, num_classes=number_of_classes) for i in y]

# reserve 10% of the dataset for testing after training the model
X_train, X_test, y_train, y_test = train_test_split(X, ycat, test_size=0.1, shuffle=True)

In [6]:
model = get_model(MAX_SEQ, vocabulary_size, WORDVEC_DIM, number_of_classes)

history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=2, validation_split=0.3, verbose=1)

#plt.plot(history.history['loss'], label='train')
#plt.plot(history.history['val_loss'], label='test')   
#plt.legend()
#plt.show()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 20)           1121160   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 200)          96800     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 200, 13)           2613      
Total params: 1,220,573
Trainable params: 1,220,573
Non-trainable params: 0
_________________________________________________________________
Train on 40138 samples, validate on 17202 samples
Epoch 1/2
Epoch 2/2
 7360/40138 [====>.........................] - ETA: 3:44 - loss: 1.1921e-07 - acc: 1.0000

KeyboardInterrupt: 

In [None]:
y_hat = model.predict(X_test)
acc = accuracy_score(np.argmax(np.array(y_test), axis=1), np.argmax(y_hat, axis=1))
print('Accuracy on test partition: {}'.format(acc))

In [None]:
np.argmax(y_train, axis=-1)[150]

In [None]:
np.argmax(np.array(y_test), axis=-1)[0]

In [36]:
X_train[200]

array([21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500,

In [None]:
np.argmax(ycat[0][-25:], axis=-1)

In [None]:
y[0]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, ycat, test_size=0)

In [19]:
X[0]

array([21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500, 21500,
       21500, 21500,

In [37]:
X.shape

(57340, 200)

In [38]:
X_train.shape

(51606, 200)