In [1]:
import numpy as np
import pandas as pd

In [2]:
from pathlib import Path
DATASET_DIR = "output"
data_list = list(Path(DATASET_DIR).rglob("*.[cC][sS][vV]"))
print("Found {0} csv files in {1}".format(len(data_list), DATASET_DIR))

Found 2177 csv files in output


In [3]:
SEQ_LENGTH = 100
def get_total_data_length(d_list):
    tot = 0
    for path in d_list:
        df = pd.read_csv(path)
        df = df[["tags","words"]]
        for t, w in zip(df["tags"], df["words"]):
            sentences = w.split("#") 
            if(len(sentences) > SEQ_LENGTH):
                continue
            
            tot = tot + 1
    return tot

In [4]:
get_total_data_length(data_list[:50])

5627

In [4]:
# remember to make it a class in the library of nlp_preprocessor
TAGSET_URL = "tagset.txt"
t_l = ['M','F','O','X']
with open(TAGSET_URL, 'r') as t:
    tagsets = t.read().split('\n')
    tagsets = set(list(map(lambda x:x[:3],tagsets))) #reduce to 3chars
    tagsets = set(list(map(lambda x:x[:-1] if (len(x)>2 and x[-1]) in t_l else x,tagsets))) #reduce M,F,X,O
    
# zeros are special
ts_sc_num2tags = { i + 1  : tags for i, tags in enumerate(tagsets) }
ts_sc_tags2num = { tags: i + 1 for i, tags in enumerate(tagsets) }

def tags_encode(t_in, source, seq_length):
    t_in = set(list(map(lambda x:x[:3],t_in))) #reduce to 3chars
    t_in = set(list(map(lambda x:x[:-1] if (len(x)>2 and x[-1]) in t_l else x,t_in))) #reduce M,F,X,O
    op_data = np.zeros(seq_length)
    for i, tags in enumerate(t_in):
        if i < seq_length:
            try:
                op_data[i] = source[tags]
            except KeyError:
                continue
    return op_data

def tags_decode(t_in, source):
    dec_data = []
    for tags in t_in:
        if (tags != 0.0):
            dec_data.append(source[tags])
    return dec_data

In [5]:
tags_decode(tags_encode(['KML', 'NP'], ts_sc_tags2num, 100), ts_sc_num2tags)


['NP']

In [6]:
from gensim.models import Word2Vec as w2v
from nlp_processor import NepaliStemmer
SEQUENCE_LENGTH = 100
FEATURE_LENGTH = 100
nep2vec = w2v.load('nep2vec_snowball_stemmer.model')
nepali_stemmer = NepaliStemmer()

def words_encode(words):
    stemmed_words = [words]
    stemmed_words = nepali_stemmer.stem_corpus([words])
    X = np.zeros([SEQUENCE_LENGTH, FEATURE_LENGTH])
    for k, token in enumerate(stemmed_words[0]):
        if(token in nep2vec.wv.vocab):
            index = k
            if index >= SEQUENCE_LENGTH:
                continue
            X[index] = nep2vec.wv[token]
    X = X.reshape(1, SEQUENCE_LENGTH, FEATURE_LENGTH)
    return X

In [7]:
import random
from tensorflow.keras.utils import to_categorical


random.shuffle(data_list)

# x = int(len(data_list) * 0.8)

train_data_list = data_list[:10]
test_data_list = data_list[10:12]

train_data_length = get_total_data_length(train_data_list)
test_data_length = get_total_data_length(test_data_list)

print("Train Data Sentences #: {0}".format(train_data_length))
print("Test Data Sentences #: {0}".format(test_data_length))

SEQ_LENGTH = 100

def make_generator(d_list, print_data=False):
    for path in d_list:
        df = pd.read_csv(path)
        df = df[["tags","words"]]
        for t, w in zip(df["tags"], df["words"]):
            tags = t.split("#")
            sentences = w.split("#")
            
            if(len(sentences) > SEQ_LENGTH):
                continue
                
            if print_data is True:
                print(sentences, tags)
            e_sentences = words_encode(sentences).reshape(1, SEQ_LENGTH, FEATURE_LENGTH)
            e_tags = tags_encode(tags, ts_sc_tags2num, SEQ_LENGTH)
            e_tags = to_categorical(e_tags, num_classes=len(tagsets)+1).reshape(1, SEQ_LENGTH, len(tagsets)+1)
            
            
            yield e_sentences, e_tags

Train Data Sentences #: 2465
Test Data Sentences #: 1753


In [8]:
for s, t in make_generator(train_data_list):
    print(s.shape, t.shape)
    break

(1, 100, 100) (1, 100, 65)


In [34]:
from tensorflow.keras.layers import Dense, Bidirectional, LSTM, Input, Dropout, TimeDistributed, Activation, Masking
from tensorflow.keras.models import Model

LSTM_SHAPE = 100
# try 2 layer lstms 
input_data = Input(shape=(100,100), name="input_layer")
y = Masking()(input_data)
y = Bidirectional(LSTM(LSTM_SHAPE, return_sequences=True, activation='tanh',recurrent_activation='sigmoid', name="lstm_layer_1"), name="bi_lstm_1")(input_data)
y = Dropout(0.5)(y)
# y = Bidirectional(LSTM(LSTM_SHAPE, return_sequences=True, activation='tanh',recurrent_activation='sigmoid', name="lstm_layer_2"), name="bi_lstm_2")(y)
# y = Dropout(0.5)(y)
y = TimeDistributed(Dense(len(tagsets)+1, name="dense_layer"), name="td_dense")(y)
output_data = Activation('softmax', name="activation_layer")(y)

model = Model([input_data], [output_data])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 100, 100)          0         
_________________________________________________________________
bi_lstm_1 (Bidirectional)    (None, 100, 200)          160800    
_________________________________________________________________
dropout_8 (Dropout)          (None, 100, 200)          0         
_________________________________________________________________
td_dense (TimeDistributed)   (None, 100, 65)           13065     
_________________________________________________________________
activation_layer (Activation (None, 100, 65)           0         
Total params: 173,865
Trainable params: 173,865
Non-trainable params: 0
_________________________________________________________________


In [54]:
input_data = Input(shape=(1,100))
ya = Bidirectional(LSTM(LSTM_SHAPE, return_sequences=True, activation='tanh',recurrent_activation='sigmoid'))(input_data)
ya = Dense(3)(ya)
output_data = Activation('softmax', name="activation_layer")(ya)
model1 = Model([input_data], [output_data])

In [35]:
model.load_weights('model.h5')

In [15]:
from tensorflow.keras.callbacks import ModelCheckpoint

# checkpoints
filepath="weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint] 

In [16]:
train_data = make_generator(train_data_list)
validation_data_gen = make_generator(test_data_list)
model.fit_generator(train_data, 
                    epochs=1, 
                    steps_per_epoch = train_data_length, 
                    validation_data = validation_data_gen, 
                    validation_steps=test_data_length,
                    callbacks=callbacks_list)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
 407/2465 [===>..........................] - ETA: 23:52 - loss: 0.4714 - acc: 0.9035

KeyboardInterrupt: 

In [17]:
x , y = [], []
i = 0
for a,b in make_generator(test_data_list, print_data=True):
    x = a
    y = b
    if i == 15:
        break
    i = i + 1

['नेपाली', 'साहित्य', 'को', 'ऐतिहासिक', 'परिचय'] ['JX', 'NN', 'IKM', 'JX', 'NN']
['डा.', 'तारानाथ', 'शमार्'] ['FB', 'NP', 'NN']
['प्रकाशक', ':', 'साझा', 'प्रकाशन'] ['NN', 'YM', 'JX', 'NN']
['संस्करण', ':', 'पहिलो', '२०२९', ',', 'दोस्रो', '२०४०', '(', 'श्याम', 'पुस्तक', 'भण्डार', ')'] ['NN', 'YM', 'MOM', 'MM', 'YM', 'MOM', 'MM', 'YB', 'NP', 'NN', 'NN', 'YB']
['तेस्रो', '२०६३'] ['MM', 'MM']
['(', 'साझा', 'प्रकाशन', 'बाट', 'पहिलो', ',', '११००', 'प्रति', ')'] ['YB', 'JX', 'NN', 'II', 'MOM', 'YM', 'MM', 'II', 'YB']
['आवरणकला', ':', 'टेकवीर', 'मुखिया'] ['NN', 'YM', 'NN', 'NN']
['मूल्य', ':', 'रु.'] ['NN', 'YM', 'FB']
['मुद्रक', ':', 'साझा', 'प्रकाशन', 'को', 'छापाखाना', ',', 'पुलचोक', ',', 'ललितपुर'] ['NP', 'YM', 'JX', 'NN', 'IKM', 'NN', 'YM', 'NN', 'YM', 'NP']
['फोन', '५५२१०२३'] ['NN', 'MM']
['क्ष्क्द्यल्', 'स्', 'ढढढघघ', '(', 'द्द', '(', 'द्धटज्ञ', '(', 'द्द'] ['NN', 'NN', 'NN', 'YB', 'NN', 'YB', 'NN', 'YB', 'NN']
['यस', 'पुस्तक', 'का', 'विषय', 'मा'] ['DDX', 'NN', 'IKO', 'NN', 'II']
['यो', 

In [36]:
len(['फेद', 'देखि', 'पुछार', 'सम्म', 'नै', 'यस', 'लाई', 'नयाँ', 'प्रकार', 'ले', 'लेखिए', 'को', 'छ', '।'])

14

In [None]:
sent = words_encode(['फेद', 'देखि', 'पुछार', 'सम्म', 'नै', 'यस', 'लाई', 'नयाँ', 'प्रकार', 'ले', 'लेखिए', 'को', 'छ', '।'] )
# e_tags = tags_encode(['CC', 'NP', 'IKM', 'NN', 'II', 'NN', 'IKM', 'DKX', 'NN', 'TT', 'VVYN1', 'YF'], ts_sc_tags2num, SEQ_LENGTH)
prediction = model.predict(sent).argmax(axis=2)
prediction
print(tags_decode(prediction[0], ts_sc_num2tags))
# prediction.shape
# print(len(sent))

In [55]:
nep2vec.wv['अब', 'बाटो', 'खोज्न'].shape

(3, 100)

In [70]:
e_sentences[:,6,:]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]])