In [16]:
import collections
import os

#import matplotlib.pyplot as plt
import nltk
import numpy as np
from keras.layers.core import (
    Activation, Dense, Dropout, RepeatVector, SpatialDropout1D)
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

In [2]:
DATA_DIR = '../../data'

In [3]:
fedata = open(os.path.join(DATA_DIR, 'treebank_sents.txt'), 'w')
ffdata = open(os.path.join(DATA_DIR, 'treebank_poss.txt'), 'w')
sents = nltk.corpus.treebank.tagged_sents()
for sent in sents:
    words, poss = [], []
    for word, pos in sent:
        if pos == '-NONE-':
            continue
        words.append(word)
        poss.append(pos)
    fedata.write('{:s}\n'.format(' '.join(words)))
    ffdata.write('{:s}\n'.format(' '.join(poss)))
fedata.close()
ffdata.close()

In [4]:
!head -3 ../../data/treebank_poss.txt
!head -3 ../../data/treebank_sents.txt

NNP NNP , CD NNS JJ , MD VB DT NN IN DT JJ NN NNP CD .
NNP NNP VBZ NN IN NNP NNP , DT NNP VBG NN .
NNP NNP , CD NNS JJ CC JJ NN IN NNP NNP NNP NNP , VBD VBN DT JJ NN IN DT JJ JJ NN .
Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group .
Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named a nonexecutive director of this British industrial conglomerate .


In [5]:
def parse_sentences(filename):
    word_freqs = collections.Counter()
    n_recs, maxlen = 0, 0
    fin = open(filename, 'r')
    for line in fin:
        words = line.strip().lower().split()
        for word in words:
            word_freqs[word] += 1
        if len(words) > maxlen:
            maxlen = len(words)
        n_recs += 1
    fin.close()
    return word_freqs, maxlen, n_recs

In [6]:
s_wordfreqs, s_maxlen, s_nrecs = parse_sentences(
    os.path.join(DATA_DIR, 'treebank_sents.txt'))
t_wordfreqs, t_maxlen, t_nrecs = parse_sentences(
    os.path.join(DATA_DIR, 'treebank_poss.txt'))

print(len(s_wordfreqs), s_maxlen, s_nrecs)
print(len(t_wordfreqs), t_maxlen, t_nrecs)

10947 249 3914
45 249 3914


In [7]:
MAX_SEQLEN = 250
S_MAX_FEATURES = 5000 # use only top 5k
T_MAX_FEATURES = 45

In [8]:
s_vocabsize = min(len(s_wordfreqs), S_MAX_FEATURES) + 2
s_word2index = {
    x[0]: i + 2 
    for i, x in enumerate(s_wordfreqs.most_common(S_MAX_FEATURES))}
s_word2index['PAD'] = 0
s_word2index['UNK'] = 1
s_index2word = {v: k for k, v in s_word2index.items()}

In [9]:
t_vocabsize = len(t_wordfreqs) + 1
t_word2index = {
    x[0]: i
    for i, x in enumerate(t_wordfreqs.most_common(T_MAX_FEATURES))}
t_word2index['PAD'] = 0
t_index2word = {v: k for k, v in t_word2index.items()}

In [10]:
def build_tensor(
        filename, n_recs, word2index, maxlen, make_categorical=False, 
        n_classes=0):
    data = np.empty((n_recs,), dtype=list)
    fin = open(filename, 'r')
    i = 0
    for line in fin:
        wids = []
        for word in line.strip().lower().split():
            if word in word2index:
                wids.append(word2index[word])
            else:
                wids.append(word2index['UNK'])
        if make_categorical:
            data[i] = np_utils.to_categorical(wids, num_classes=n_classes)
        else:
            data[i] = wids
        i += 1
    fin.close()
    pdata = sequence.pad_sequences(data, maxlen=maxlen)
    return pdata

In [11]:
X = build_tensor(os.path.join(DATA_DIR, 'treebank_sents.txt'), 
                 s_nrecs, 
                 s_word2index, 
                 MAX_SEQLEN)
Y = build_tensor(os.path.join(DATA_DIR, 'treebank_poss.txt'),
                 t_nrecs,
                 t_word2index,
                 MAX_SEQLEN,
                 True,
                 t_vocabsize)

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2) # random_state=123

In [13]:
EMBED_SIZE = 128
HIDDEN_SIZE = 64
BATCH = 32
EPOCHS = 1
DROPOUT = 0.2

In [14]:
mod = Sequential()
mod.add(Embedding(s_vocabsize, EMBED_SIZE, input_length=MAX_SEQLEN))
mod.add(SpatialDropout1D(DROPOUT))
mod.add(GRU(HIDDEN_SIZE, dropout=DROPOUT, recurrent_dropout=DROPOUT))
mod.add(RepeatVector(MAX_SEQLEN))
mod.add(GRU(HIDDEN_SIZE, return_sequences=True))
mod.add(TimeDistributed(Dense(t_vocabsize)))
mod.add(Activation('softmax'))
mod.compile(
    loss='categorical_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy'])

In [15]:
mod.fit(
    X_train, 
    Y_train, 
    batch_size=BATCH, 
    epochs=EPOCHS, 
    validation_data=[X_test, Y_test])
score, acc = mod.evaluate(X_test, Y_test, batch_size=BATCH)
print(f'Test Score: {score:.3f}\tAccuracy: {acc:.3f}')

Train on 3131 samples, validate on 783 samples
Epoch 1/1
Test Score: 0.280	Accuracy: 0.862


## Bidirectional Model

In [20]:
mod = Sequential()
mod.add(Embedding(s_vocabsize, EMBED_SIZE, input_length=MAX_SEQLEN))
mod.add(SpatialDropout1D(DROPOUT))
mod.add(Bidirectional(GRU(
    HIDDEN_SIZE, dropout=DROPOUT, recurrent_dropout=DROPOUT)))
mod.add(RepeatVector(MAX_SEQLEN))
mod.add(Bidirectional(GRU(HIDDEN_SIZE, return_sequences=True)))
mod.add(TimeDistributed(Dense(t_vocabsize)))
mod.add(Activation('softmax'))
mod.compile(
    loss='categorical_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy'])

In [21]:
mod.fit(
    X_train, 
    Y_train, 
    batch_size=BATCH, 
    epochs=EPOCHS, 
    validation_data=[X_test, Y_test])
score, acc = mod.evaluate(X_test, Y_test, batch_size=BATCH)
print(f'Test Score: {score:.3f}\tAccuracy: {acc:.3f}')

Train on 3131 samples, validate on 783 samples
Epoch 1/1
Test Score: 0.266	Accuracy: 0.524
