# POS tagger model

## Load data from JSON

In [1]:
import numpy as np
import json
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import pickle

Using TensorFlow backend.


In [2]:
# Load the tagged web-text corpus: each JSON file is read as UTF-16 and, for
# every key, the value at doc[key][0][0] is taken as one sentence.
webtext_dir = 'json/gc/webtext/json'
files = [os.path.join(webtext_dir, x) for x in os.listdir(webtext_dir)]
corpus = []

for x in files:
    with open(x, 'r', encoding='utf-16') as f:
        doc = json.load(f)
    for i in doc.keys():
        try:
            sentence = doc[i][0][0]
            # Keep sentences shorter than 200 items under a non-empty key
            # (original note: cut-offs of 150 and 190 gave the same max length).
            keep = len(sentence) < 200 and len(i) > 0
        except (KeyError, IndexError, TypeError):
            # The entry does not have the expected nested shape: skip it.
            # Only lookup/shape errors are swallowed, so Ctrl-C and genuine
            # bugs are no longer hidden the way a bare `except:` hid them.
            continue
        if keep:
            corpus.append(sentence)  # append each sentence to corpus

In [3]:


files = [os.path.join('json/gc/books',x) for x in os.listdir('json/gc/books')]
for x in files:
    with open(x,'r',) as f:
#         print('opening {0}'.format(x))
        doc = json.load(f)
#         for i in doc.keys():
        doc = doc ['ps']
        for i in doc:
            if len(i) <200 and len(i) >0:
                corpus.append(i)
#                 print(i)
            
%reset_selective -f doc
#     break

In [3]:
# Number of sentences collected in `corpus` so far.
print(len(corpus))

1632


In [4]:
# Split every corpus sentence into two parallel lists: the words (element 1 of
# each item) go to `sentences`, the tags (element 0) go to `sentence_tags`.
# Sentences without any item are dropped.
sentences = []
sentence_tags = []

for sentence in corpus:
    words = [item[1] for item in sentence]
    tags = [item[0] for item in sentence]
    if words:
        sentences.append(words)
        sentence_tags.append(tags)

print("Sample sentence: ",sentences[10])
print("Sample sentence tags: ",sentence_tags[10])

Sample sentence:  ['तिब्बती', 'किसान', 'तान', 'चंग', 'को', 'सम्पन्नता', 'को', 'कथा', 'को', 'दोस्रो', 'भाग']
Sample sentence tags:  ['JX', 'NN', 'NN', 'NN', 'IKM', 'NN', 'IKM', 'NN', 'IKM', 'MOM', 'NN']


In [6]:
# Number of tag sequences (one per kept sentence).
print("Tagged_sentences",len(sentence_tags))

Tagged_sentences 8000


In [7]:
# Inspect one more sample (index 112): its words and the matching tags.
print("Sample sentence: ",sentences[112])
print("Sample sentence tags: ",sentence_tags[112])

Sample sentence:  ['कस्मिक', 'को', 'तेस्रो', 'विमान', 'फोकर', '–', 100]
Sample sentence tags:  ['JX', 'IKM', 'MOM', 'NN', 'NN', 'YM', 'MM']


## Convert Labels to numbers

In [8]:
# Collect every distinct tag and give each one an integer id starting at 1,
# which leaves id 0 unused by real tags.
labels = set()
for sentence in sentence_tags:
    labels.update(sentence)

# Number the tags in sorted order. Iterating the raw set would give an order
# that can change between interpreter runs (string hash randomisation), so the
# same tag could get a different id after a kernel restart.
tag2index = {t: i + 1 for i, t in enumerate(sorted(labels, key=str))}
print("Total number of tags: ",len(labels))

Total number of tags:  96


In [5]:
##Load tag2index

# Rebinds `tag2index` to a mapping previously pickled to disk.
# NOTE(review): pickle.load can execute arbitrary code - only load files you
# produced yourself.
with open('withMaskingtag2index.pickle','rb') as f:
    tag2index = pickle.load(f)

In [6]:
# Give the padding label id 0: the tag sequences are padded with pad_sequences
# further down, which fills with 0 unless told otherwise.
tag2index['-PAD-'] = 0


In [7]:
def tagsent2int(sent_tag):
    """Translate a sequence of tags into their integer ids.

    Each tag is looked up in the module-level ``tag2index`` mapping; a tag
    that is missing from the mapping raises ``KeyError``.
    """
    ids = []
    for name in sent_tag:
        ids.append(tag2index[name])
    return ids

# Replace every tag sequence by its sequence of integer ids.
# Not idempotent: `sentence_tags` is overwritten in place, so running this cell
# a second time would feed the integer ids back into tagsent2int.
sentence_tags = [tagsent2int(tags) for tags in sentence_tags]
print("Sample sentence: ",sentences[10])
print("Sample sentence tags: ",sentence_tags[10])

Sample sentence:  ['तिब्बती', 'किसान', 'तान', 'चंग', 'को', 'सम्पन्नता', 'को', 'कथा', 'को', 'दोस्रो', 'भाग']
Sample sentence tags:  [87, 18, 18, 18, 57, 18, 57, 18, 57, 95, 18]


## Load word2vec model

In [11]:
# Load the saved gensim Word2Vec model from the file 'w2vmodel'; its vectors
# are used below to fill the embedding matrix.
import gensim.models.word2vec as w2v
nepw2v=w2v.Word2Vec.load('w2vmodel')

In [8]:
#Load tokenizer

# Restores a previously pickled tokenizer object.
# NOTE(review): pickle.load can execute arbitrary code - only load files you
# produced yourself.
with open('withMaskingTokenizer.pickle','rb') as f:
    tokenizer = pickle.load(f)

In [15]:
# Turn the word lists into integer sequences and pad words and tags to one
# common length.
# The tokenizer comes from the pickle-loading cell; the line below shows how it
# was presumably created originally (kept for reference).
# tokenizer = Tokenizer(lower=False, oov_token='-OOV-')
# NOTE(review): fit_on_texts updates the already-loaded tokenizer with this
# corpus - confirm that refitting is intended when a saved model is reused.
tokenizer.fit_on_texts(sentences)
# One more than the number of known words, leaving room for index 0.
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(sentences)
# The maximum length is hard-coded instead of being derived from the data:
# max_length = len(max(sentences,key=len))
max_length = 197
# Pad at the end ('post') so that words and tags stay aligned position by position.
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
padded_tags = pad_sequences(sentence_tags,maxlen=max_length, padding='post')

In [10]:
# Register a '-PAD-' token under index 0 in the tokenizer's word index, so that
# index 0 has a name when sequences are mapped back to words.
tokenizer.word_index['-PAD-'] = 0

In [14]:

# with open('withMaskingtag2index.pickle','wb') as f:
#     pickle.dump(tag2index,f)
# with open('nepw2v')

In [15]:
# create a weight matrix for words in training docs
# Row i receives the word2vec vector of the word whose tokenizer index is i;
# rows of words that word2vec does not know stay all-zero.
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = nepw2v.wv.get_vector(word)
    except KeyError:
        # Word is not in the word2vec vocabulary: leave its row at zero.
        # Only the lookup miss is caught, so other errors are no longer
        # silently swallowed by a bare `except:`.
        continue
    embedding_matrix[i] = embedding_vector

In [16]:

# with open('nepw2vembeddingmatrix.pickle','wb') as f:
#     pickle.dump(embedding_matrix,f)
# with open('withMaskingTokenizer.pickle','wb') as f:
#     pickle.dump(tokenizer,f)

## Train Test split

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the padded sentences (and their tags) for testing.
# A fixed random_state makes the split identical on every run, so results can
# be reproduced after a kernel restart.
train_sentences,test_sentences,train_sentence_tags,test_sentence_tags = train_test_split(
    padded_docs, padded_tags, test_size=0.2, random_state=42)

## Train

In [18]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Dropout, Bidirectional, TimeDistributed, Embedding, Activation,Masking
from keras.optimizers import Adam

# Sequence tagger: frozen pre-trained embeddings -> masking -> LSTM that emits
# one vector per timestep -> dropout -> per-timestep softmax over all tag ids.
model = Sequential([
    InputLayer(input_shape=(max_length,)),
    Embedding(vocab_size, 300, weights=[embedding_matrix],
              input_length=max_length, trainable=False),
    Masking(),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    TimeDistributed(Dense(len(tag2index))),
    Activation('softmax'),
])

model.compile(
    optimizer=Adam(0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 197, 300)          5382300   
_________________________________________________________________
masking_1 (Masking)          (None, 197, 300)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 197, 256)          570368    
_________________________________________________________________
dropout_1 (Dropout)          (None, 197, 256)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 197, 97)           24929     
_________________________________________________________________
activation_1 (Activat

In [35]:
# Train on one-hot encoded tags. The class count is passed explicitly: without
# it to_categorical sizes the encoding from the largest id present in the data,
# which does not match the model's len(tag2index) outputs whenever the highest
# tag id is missing from the training split.
model.fit(train_sentences,to_categorical(train_sentence_tags, num_classes=len(tag2index)),batch_size=256,epochs=10,validation_split=0.2)

Train on 5120 samples, validate on 1280 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f39dc6afa90>

## Save Model

In [37]:
# Write the trained model to 'withMasking.h5'.
from keras.models import save_model
save_model(model,'withMasking.h5')

## Load Model

In [11]:
# Rebind `model` to the model stored in 'withMasking.h5'.
from keras.models import load_model
model = load_model('withMasking.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


## Test model

In [65]:
# Shape of the held-out tag array.
test_sentence_tags.shape

(1600, 197)

In [16]:
# Evaluation on the held-out split is currently disabled:
# model.evaluate(test_sentences,to_categorical(test_sentence_tags))
# Smoke test: run the model on the first padded sentence, wrapped as a batch of one.
model.predict(np.asarray([padded_docs[0]]))

array([[[1.0000576e-02, 9.7415270e-03, 1.0604734e-02, ...,
         1.0260506e-02, 1.0235737e-02, 9.9062566e-03],
        [1.2381864e-03, 2.7797890e-03, 9.5776487e-03, ...,
         5.6460691e-03, 2.1177186e-03, 1.8547591e-03],
        [1.2381864e-03, 2.7797890e-03, 9.5776487e-03, ...,
         5.6460691e-03, 2.1177186e-03, 1.8547591e-03],
        ...,
        [1.9855240e-06, 3.1439006e-06, 5.4778258e-05, ...,
         8.3695231e-06, 3.6361980e-06, 8.6164812e-07],
        [1.9855240e-06, 3.1439006e-06, 5.4778258e-05, ...,
         8.3695231e-06, 3.6361980e-06, 8.6164812e-07],
        [1.9855240e-06, 3.1439006e-06, 5.4778258e-05, ...,
         8.3695231e-06, 3.6361980e-06, 8.6164812e-07]]], dtype=float32)

In [17]:
def logits_to_tokens(sequences, index):
    """Decode per-timestep score vectors into labels.

    For every sequence in ``sequences`` each score vector is replaced by
    ``index[argmax(vector)]``. All sequences are decoded, but only the
    decoded FIRST sequence is returned.

    Raises ``IndexError`` when ``sequences`` is empty and ``KeyError`` when an
    arg-max position is missing from ``index``.
    """
    decoded = [
        [index[np.argmax(scores)] for scores in sequence]
        for sequence in sequences
    ]
    return decoded[0]

In [18]:
def logits_to_sentence(sequences, index):
    """Map sequences of integer ids to their names through ``index``.

    All sequences are translated, but only the translated FIRST sequence is
    returned.

    Raises ``IndexError`` when ``sequences`` is empty and ``KeyError`` when an
    id is missing from ``index``.
    """
    translated = [
        [index[item] for item in sequence]
        for sequence in sequences
    ]
    return translated[0]

In [51]:
# with open('withMaskingTokenizer.pickle','wb') as f:
#     pickle.dump(tokenizer,f)

In [40]:
# Take sample 127 (padded word ids and padded tag ids), each wrapped as a batch of one.
test = np.asarray([padded_docs[127]])
test_tag = np.asarray([padded_tags[127]])


In [41]:
# Per-timestep tag scores for the selected sample.
predicted =model.predict(test)

In [42]:
# with open('withMaskingTokenizer.pickle','rb') as f:
#     tokenizer = pickle.load(f)
    

# with open('withMaskingtag2index.pickle','rb') as f:
#     tag2index = pickle.load(f)

In [43]:
# Name index 0 (the value used for padding) '-00V-'. Note the key is spelled
# with two zeros; the printing cells below compare against this exact string
# to skip padded positions.
tokenizer.word_index['-00V-'] = 0

# Inverse lookups: id -> tag name and id -> word.
index2tag = {idx: name for name, idx in tag2index.items()}
index2word = {idx: token for token, idx in tokenizer.word_index.items()}

p_sent_tag = logits_to_tokens(predicted, index2tag)   # predicted tags
a_sent_tag = logits_to_sentence(test_tag, index2tag)  # actual tags
p_sent = logits_to_sentence(test, index2word)         # the words themselves

In [44]:
# Print each word with its actual (gold) tag, skipping positions whose word is
# the '-00V-' placeholder that was assigned to index 0.
for word,tag in zip(p_sent,a_sent_tag):
    if word!='-00V-' :
        print((word,tag))

('मेरी', 'PMXKF')
('छोरी', 'NN')
('को', 'IKM')
('देश', 'NN')


In [45]:
# Pair each word with the tag the model predicted for it, skipping positions
# whose word is the '-00V-' placeholder assigned to index 0.
# The result gets its own name: rebinding `predicted` here would replace the
# model output that the decoding cell reads, so that cell could not be re-run.
predicted_pairs = [(word, tag) for word, tag in zip(p_sent, p_sent_tag) if word != '-00V-']
print(predicted_pairs)

[('मेरी', 'RR'), ('छोरी', 'NN'), ('को', 'IKM'), ('देश', 'DKX')]


In [63]:
from keras.models import save_model

# Write the current model to 'withMasking.h5' again, this time explicitly
# including the optimizer.
save_model(model,'withMasking.h5',include_optimizer=True)

In [83]:
from keras import backend as K
 
def ignore_class_accuracy(to_ignore=0):
    """Build a Keras metric that measures accuracy while skipping one class.

    The returned function compares the arg-max class of ``y_true`` and
    ``y_pred`` at every position and averages the matches over the positions
    whose TRUE class differs from ``to_ignore`` (by default class 0).

    Args:
        to_ignore: class id excluded from the accuracy computation.

    Returns:
        A function ``ignore_accuracy(y_true, y_pred)`` usable in
        ``model.compile(metrics=[...])``. It yields 0 when every position
        belongs to the ignored class.
    """
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)

        # Mask on the ground truth, not on the prediction. Masking on the
        # prediction would count ignored-class positions the model gets wrong
        # and would drop real positions the model predicts as the ignored class.
        ignore_mask = K.cast(K.not_equal(y_true_class, to_ignore), 'float32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'float32') * ignore_mask
        # Guard the denominator so an all-ignored batch gives 0 instead of NaN.
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy
 

In [84]:
from keras.models import Sequential
# Masking and Dropout are used below, so they are imported here as well; the
# cell no longer depends on names left over from an earlier cell.
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation, Masking, Dropout
from keras.optimizers import Adam

# Same tagger as before (frozen embeddings -> masking -> LSTM -> dropout ->
# per-timestep softmax), now also reporting accuracy with class 0 ignored.
# NOTE(review): Masking() skips every timestep whose embedding vector is all
# zeros. Words without a word2vec vector also have an all-zero row in
# embedding_matrix, so they appear to be masked like padding - confirm whether
# Embedding(mask_zero=True) is what is actually wanted.
model = Sequential()
model.add(InputLayer(input_shape=(max_length,)))
model.add(Embedding(vocab_size,300,weights=[embedding_matrix],input_length=max_length,trainable=False))
model.add(Masking())
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy', ignore_class_accuracy(0)])

model.summary()
 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 197, 300)          5382300   
_________________________________________________________________
masking_4 (Masking)          (None, 197, 300)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 197, 256)          570368    
_________________________________________________________________
dropout_4 (Dropout)          (None, 197, 256)          0         
_________________________________________________________________
time_distributed_4 (TimeDist (None, 197, 97)           24929     
_________________________________________________________________
activation_4 (Activation)    (None, 197, 97)           0         
Total params: 5,977,597
Trainable params: 595,297
Non-trainable params: 5,382,300
____________________________________________________________

In [85]:
# Train for 5 epochs on one-hot tags; the encoding width is fixed to
# len(tag2index) so it always matches the model's output size.
model.fit(train_sentences, to_categorical(train_sentence_tags, len(tag2index)), batch_size=128, epochs=5, validation_split=0.2)

Train on 5120 samples, validate on 1280 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3a31ad06d8>

In [None]:
from keras.models import load_model
# NOTE(review): this file is not written anywhere in the notebook; going by its
# name it presumably holds the model compiled with the ignore_accuracy metric.
# Keras cannot rebuild a custom metric by itself when loading, so the function
# is supplied through custom_objects under the name it was compiled with.
model = load_model('withMasking_ignore_accuracy.h5',
                   custom_objects={'ignore_accuracy': ignore_class_accuracy(0)})

In [None]:
# Predict tags for the held-out inputs. The notebook never defines
# `test_samples_X`; the held-out inputs produced by train_test_split are
# `test_sentences`.
predictions = model.predict(test_sentences)
# logits_to_tokens returns the decoded tags of the first sequence only.
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))