In [1]:
from keras.layers.core import Activation,Dense,Dropout,RepeatVector,SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.utils import np_utils
from keras.utils import plot_model
from sklearn.model_selection import train_test_split
import collections
import nltk
import numpy as np
import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Export the NLTK treebank sample as two parallel, line-aligned text files:
# one sentence of space-separated words per line in treebank_sents.txt and the
# matching space-separated POS tags per line in treebank_poss.txt.
# The corpus is loaded BEFORE the output files are opened so that a missing
# corpus does not truncate previously generated files.
sents=nltk.corpus.treebank.tagged_sents()

# Context managers guarantee both files are flushed and closed even if a
# write fails part-way through.
with open(os.path.join("../data","treebank_sents.txt"),"w") as fedata, \
     open(os.path.join("../data","treebank_poss.txt"),"w") as ffdata:
    for sent in sents:
        words,poss=[],[]
        for word,pos in sent:
            # Tokens tagged "-NONE-" are dropped from both files
            # (presumably empty/trace elements -- confirm against the corpus docs).
            if pos =="-NONE-":
                continue
            words.append(word)
            poss.append(pos)
        fedata.write("{:s}\n".format(" ".join(words)))
        ffdata.write("{:s}\n".format(" ".join(poss)))

In [3]:
def parse_sentences(filename):
    """Collect token statistics from a one-sentence-per-line text file.

    Each line is stripped, lower-cased and split on whitespace.

    Args:
        filename: path of the text file to scan.

    Returns:
        A tuple ``(word_freqs, maxlen, num_recs)`` where ``word_freqs`` is a
        ``collections.Counter`` of lower-cased tokens, ``maxlen`` is the
        largest number of tokens found on a single line (0 for an empty
        file) and ``num_recs`` is the number of lines read (blank lines
        included).
    """
    word_freqs=collections.Counter()
    num_recs,maxlen=0,0
    # "with" closes the handle even if reading raises part-way through.
    with open(filename,"r") as fin:
        for line in fin:
            words=line.strip().lower().split()
            word_freqs.update(words)
            maxlen=max(maxlen,len(words))
            num_recs+=1
    return word_freqs,maxlen,num_recs

# Scan the sentence file first, then the tag file, and unpack each
# (frequency counter, longest line, line count) triple into its own names.
(s_wordfreqs,s_maxlen,s_numrecs),(t_wordfreqs,t_maxlen,t_numrecs)=(
    parse_sentences(os.path.join("../data",fname))
    for fname in ("treebank_sents.txt","treebank_poss.txt")
)
# Report vocabulary size, longest line and line count for each file.
print(len(s_wordfreqs),s_maxlen,s_numrecs)
print(len(t_wordfreqs),t_maxlen,t_numrecs)

10947 249 3914
45 249 3914


In [4]:
# Sequence length used for padding, and vocabulary caps for words and tags.
MAX_SEQLEN=250
S_MAX_FEATURES=5000
T_MAX_FEATURES=45

# Source (word) vocabulary: ids 0 and 1 are reserved for the "PAD" and "UNK"
# pseudo-tokens, so real words are numbered from 2 in frequency order.
# Corpus tokens are lower-cased when counted, so the upper-case pseudo-tokens
# cannot clash with a real word.
s_vocabsize=min(len(s_wordfreqs),S_MAX_FEATURES)+2
s_word2index={x[0]:i+2 for i,x in enumerate(s_wordfreqs.most_common(S_MAX_FEATURES))}
s_word2index["PAD"]=0
s_word2index["UNK"]=1
s_index2word={v:k for k,v in s_word2index.items()}

# Target (tag) vocabulary: id 0 is reserved for "PAD", so tags are numbered
# from 1. Numbering from 0 would give the most frequent tag the same id as
# "PAD", merging the two classes and dropping that tag from t_index2word.
# The "+1" in t_vocabsize accounts for the reserved PAD slot.
t_vocabsize=len(t_wordfreqs)+1
t_word2index={x[0]:i+1 for i,x in enumerate(t_wordfreqs.most_common(T_MAX_FEATURES))}

t_word2index["PAD"]=0
t_index2word={v:k for k,v in t_word2index.items()}

In [5]:
def build_tensor(filename,numrecs,word2index,maxlen,make_categorical=False,num_classes=0):
    """Convert a one-sentence-per-line text file into a padded id tensor.

    Each line is stripped, lower-cased and split on whitespace; every token is
    replaced by ``word2index[token]``, falling back to ``word2index["UNK"]``
    for tokens missing from the mapping.

    Args:
        filename: path of the text file to read.
        numrecs: number of lines expected in the file; sizes the buffer that
            holds one entry per line.
        word2index: mapping from token to integer id.
        maxlen: ``maxlen`` forwarded to ``sequence.pad_sequences``.
        make_categorical: when true, each line's ids are one-hot encoded with
            ``np_utils.to_categorical`` before padding.
        num_classes: ``num_classes`` forwarded to ``to_categorical``; only
            used when ``make_categorical`` is true.

    Returns:
        The result of ``sequence.pad_sequences`` over all encoded lines.

    Raises:
        KeyError: if a token is unknown and ``word2index`` has no "UNK" entry.
        IndexError: if the file contains more than ``numrecs`` lines.
    """
    # Object array with one slot per line; each slot holds that line's ids
    # (or its one-hot matrix).
    data=np.empty((numrecs,),dtype=list)
    # "with" closes the handle even if encoding a line raises.
    with open(filename,"r") as fin:
        for i,line in enumerate(fin):
            wids=[]
            for word in line.strip().lower().split():
                if word in word2index:
                    wids.append(word2index[word])
                else:
                    wids.append(word2index["UNK"])
            if make_categorical:
                data[i]=np_utils.to_categorical(wids,num_classes=num_classes)
            else:
                data[i]=wids
    pdata=sequence.pad_sequences(data,maxlen=maxlen)
    return pdata

# Word-id matrix for the sentences (one row per sentence).
X=build_tensor(
    os.path.join("../data","treebank_sents.txt"),
    s_numrecs,
    s_word2index,
    MAX_SEQLEN,
)
# One-hot encoded tag tensor for the matching POS sequences.
Y=build_tensor(
    os.path.join("../data","treebank_poss.txt"),
    t_numrecs,
    t_word2index,
    MAX_SEQLEN,
    make_categorical=True,
    num_classes=t_vocabsize,
)

In [9]:
# Hold out 20% of the sentences for evaluation; the fixed seed keeps the
# split reproducible across runs.
Xtrain,Xtest,Ytrain,Ytest=train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [91]:
# Hyper-parameters shared by every model in this notebook.
EMBED_SIZE=128
HIDDEN_SIZE=64
BATCH_SIZE=32
NUM_EPOCHS=1

# GRU tagger: embed the word ids, encode the sentence with a GRU into a
# single vector, repeat that vector MAX_SEQLEN times, decode with a second
# GRU that returns the full sequence, and apply a softmax over the tag
# vocabulary at every step.
model=Sequential([
    Embedding(s_vocabsize,EMBED_SIZE,input_length=MAX_SEQLEN),
    SpatialDropout1D(0.2),
    GRU(HIDDEN_SIZE,dropout=0.2,recurrent_dropout=0.2),
    RepeatVector(MAX_SEQLEN),
    GRU(HIDDEN_SIZE,return_sequences=True),
    TimeDistributed(Dense(t_vocabsize)),
    Activation("softmax"),
])

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [92]:
# Write a diagram of the GRU model, including layer shapes, to GRU_POS.png.
plot_model(
    model,
    show_shapes=True,
    to_file="GRU_POS.png",
)

In [93]:
# Train the GRU tagger. validation_data is passed as the documented
# (x_val, y_val) tuple rather than a list, which Keras may not interpret
# as an (inputs, targets) pair.
model.fit(Xtrain,Ytrain,batch_size=BATCH_SIZE,epochs=NUM_EPOCHS,validation_data=(Xtest,Ytest))

Train on 3131 samples, validate on 783 samples
Epoch 1/1


<keras.callbacks.History at 0x7f40fdcfba90>

In [94]:
# Loss and accuracy of the trained GRU model on the held-out split.
score,acc=model.evaluate(
    x=Xtest,
    y=Ytest,
    batch_size=BATCH_SIZE,
)
print(score,acc)

0.2928801686294843 0.9157496759261208


In [101]:
from keras.layers.wrappers import Bidirectional
from keras.layers.recurrent import LSTM

# Bidirectional variant of the GRU tagger: both the encoder and the decoder
# GRU are wrapped in Bidirectional; every other layer and the training setup
# are the same as in the unidirectional model.
model=Sequential()
model.add(Embedding(s_vocabsize,EMBED_SIZE,input_length=MAX_SEQLEN))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(GRU(HIDDEN_SIZE,dropout=0.2,recurrent_dropout=0.2)))
model.add(RepeatVector(MAX_SEQLEN))
model.add(Bidirectional(GRU(HIDDEN_SIZE,return_sequences=True)))
model.add(TimeDistributed(Dense(t_vocabsize)))
model.add(Activation("softmax"))

model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])

plot_model(model,to_file="Bidirectional_POS.png",show_shapes=True)

# validation_data is passed as the documented (x_val, y_val) tuple rather
# than a list, which Keras may not interpret as an (inputs, targets) pair.
model.fit(Xtrain,Ytrain,batch_size=BATCH_SIZE,epochs=NUM_EPOCHS,validation_data=(Xtest,Ytest))

score,acc=model.evaluate(Xtest,Ytest,batch_size=BATCH_SIZE)
print(score,acc)

Train on 3131 samples, validate on 783 samples
Epoch 1/1
0.27818018513925535 0.8367100931979992


In [103]:
from keras.layers.recurrent import LSTM

# LSTM variant of the tagger: same layer stack as the GRU model, with LSTM
# cells for both the encoder and the decoder.
model=Sequential()
model.add(Embedding(s_vocabsize,EMBED_SIZE,input_length=MAX_SEQLEN))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(HIDDEN_SIZE,dropout=0.2,recurrent_dropout=0.2))
model.add(RepeatVector(MAX_SEQLEN))
model.add(LSTM(HIDDEN_SIZE,return_sequences=True))
model.add(TimeDistributed(Dense(t_vocabsize)))
model.add(Activation("softmax"))

model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])

plot_model(model,to_file="LSTM_POS.png",show_shapes=True)

# validation_data is passed as the documented (x_val, y_val) tuple rather
# than a list, which Keras may not interpret as an (inputs, targets) pair.
model.fit(Xtrain,Ytrain,batch_size=BATCH_SIZE,epochs=NUM_EPOCHS,validation_data=(Xtest,Ytest))

score,acc=model.evaluate(Xtest,Ytest,batch_size=BATCH_SIZE)
print(score,acc)

Train on 3131 samples, validate on 783 samples
Epoch 1/1
0.2939024456463859 0.9078876208680495
