In [1]:
import numpy as np
import pandas as pd
import ast
import unicodedata

In [2]:
data_train = pd.read_excel("../data/ner-vietnamese/ner_dataset_train.xlsx")
data_train = data_train.dropna()
data_train = data_train.reset_index(drop = True)
data_train.shape

(10, 2)

In [3]:
data_train.head()

Unnamed: 0,Text,Entity
0,Hôm nay tôi mua 100 nghìn tiền gạo,"[[0, 3, ""B-date""], [4, 7, ""I-date""], [16, 19,..."
1,Hôm nay tôi mua 200 nghìn tiền thịt lợn,"[[0, 3, ""B-date""], [4, 7, ""I-date""], [16, 19,..."
2,Hôm nay tôi mua 10 nghìn tiền chanh,"[[0, 3, ""B-date""], [4, 7, ""I-date""], [16, 18,..."
3,Ngày 30/08/2023 tôi mua chai nước hết 10000,"[[5,15,""B-date""],[24, 28, ""B-type""], [29, 33, ..."
4,Ngày hôm nay tôi mua lon bia 15000,"[[5,8,""B-date""],[9,12,""I-date""],[21,24,""B-type..."


In [4]:
data_test = pd.read_excel("../data/ner-vietnamese/ner_dataset_test.xlsx")
data_test = data_test.dropna()
data_test = data_test.reset_index(drop = True)
data_test.shape

(7, 2)

In [5]:
def pre_process(questions, params):
    results = []
    entities = []
    for i, question in enumerate(list(questions)):
        param_list = ast.literal_eval(params[i])
        question_processed = question
        
        for  param in param_list:
            entity_text = question[param[0]:param[1]]
            entity_name = param[2]
            question_processed = question_processed.replace(entity_text, entity_name)

            if entity_name not in entities:
                entities.append(entity_name)
        results.append(unicodedata.normalize('NFKD', question_processed).strip())
        
    return results, entities


In [None]:
texts_train = data_train['Text']
entities_train = data_train["Entity"]
processed_texts_train, entities = pre_process(texts_train, entities_train)
data_train['Processed Text'] = np.array(processed_texts_train)
data_train['Processed Text'][7]

texts_test = data_test['Text']
entities_test = data_test["Entity"]
processed_texts_test, _ = pre_process(texts_test, entities_test)
data_test['Processed Text'] = np.array(processed_texts_test)

: 

In [7]:
entities

['B-date', 'I-date', 'amount', 'B-type', 'I-type']

In [8]:
def preprocess_label(processed_questions):
    Y = []
    questions = processed_questions
    for question in questions:
        question_splited = question.split()
        y = []
        for word in question_splited:
            if word in entities:
                y.append(word)
            else:
                y.append("O")
        Y.append(" ".join(y))
    return Y

In [9]:
Y_train = preprocess_label(processed_texts_train)
X_train = list(data_train['Text'])
print(X_train[0])
print(Y_train[0])

Y_test = preprocess_label(processed_texts_test)
X_test = list(data_test['Text'])

Hôm nay tôi mua 100 nghìn tiền gạo
B-date I-date O O amount O O B-type


In [64]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Input, Conv1D, LSTM, GRU, Dense, Embedding, Bidirectional, GaussianDropout, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow import keras
import tensorflow as tf

In [65]:
tokenizer_X = Tokenizer(oov_token = "<OOV>", filters = '')
tokenizer_X.fit_on_texts(X_train)
tokenized_X_train = tokenizer_X.texts_to_sequences(X_train)
X_train = pad_sequences(tokenized_X_train, padding = "post")

tokenizer_Y = Tokenizer(oov_token = "<OOV>", filters = '',lower = False)
tokenizer_Y.fit_on_texts(Y_train)
tokenized_Y_train = tokenizer_Y.texts_to_sequences(Y_train)
Y_train = pad_sequences(tokenized_Y_train, padding = "post")

X_train_vocab_size = len(tokenizer_X.word_index)
Y_train_vocab_size = len(tokenizer_Y.word_index)+1

X_train = tf.convert_to_tensor(X_train, dtype = tf.int32)
Y_train = tf.stack([tf.keras.utils.to_categorical(i, num_classes = Y_train_vocab_size) for i in Y_train])

print("Training Data's Shape: ", X_train.shape)
print("Training Label's Shape: ", Y_train.shape)

Training Data's Shape:  (7, 9)
Training Label's Shape:  (7, 9, 8)


In [66]:
tokenized_X_test = tokenizer_X.texts_to_sequences(X_test)
X_test = pad_sequences(tokenized_X_test, padding = "post")

tokenized_Y_test = tokenizer_Y.texts_to_sequences(Y_test)
Y_test = pad_sequences(tokenized_Y_test, padding = "post")

X_test = tf.convert_to_tensor(X_test, dtype = tf.int32)
Y_test = tf.stack([tf.keras.utils.to_categorical(i, num_classes = Y_train_vocab_size) for i in Y_test])

print("Training Data's Shape: ", X_test.shape)
print("Training Label's Shape: ", Y_test.shape)

Training Data's Shape:  (7, 9)
Training Label's Shape:  (7, 9, 8)


## Embedding

In [67]:
embedding_map = {}
with open("../embedding/word2vec_vi_words_100dims.txt", "r") as f:
    for line in f:
        text, vec = line.split(maxsplit = 1)
        vec = np.fromstring(vec, "f", sep=" ")
        embedding_map[text] = vec
        
embedding_map


  vec = np.fromstring(vec, "f", sep=" ")


{'1587507': array([100.], dtype=float32),
 ',': array([-0.01344502,  0.08291196,  0.09511289,  0.05249285,  0.06944065,
        -0.01065069,  0.00432103, -0.08282392,  0.05148022, -0.21759155,
        -0.09577844, -0.05494263, -0.0103271 ,  0.07735942,  0.04745318,
        -0.06621594, -0.13393909,  0.01582818, -0.12119   , -0.02294538,
        -0.09091333, -0.01343485,  0.10260719, -0.12268285, -0.05281042,
         0.00319954, -0.09995399, -0.11973237,  0.17736456, -0.04641574,
        -0.00445013,  0.12292004,  0.00212898,  0.07875178, -0.05953704,
        -0.00335201,  0.04510177,  0.03826997,  0.1008976 , -0.03324228,
         0.01321117, -0.06556378, -0.09131269, -0.02638779,  0.10684214,
         0.01374784,  0.04601299, -0.04713417,  0.1408092 , -0.00664291,
         0.01302393,  0.1144252 ,  0.04165214, -0.00536263,  0.03502455,
         0.04084913, -0.01474634,  0.02489359, -0.05261   , -0.08628976,
         0.0391279 , -0.17040002, -0.02041635, -0.07390592, -0.02549585,
    

In [105]:
num_tokens = X_train_vocab_size + 1
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer_X.word_index.items():
    embedding_vector = embedding_map.get(word)
    
    if embedding_vector is not None and embedding_vector.shape[0] > 0:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        embedding_matrix[i] = np.zeros((embedding_dim))
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 24 words (5 misses)


## Model 

In [106]:
input_dim = X_train_vocab_size + 1
output_dim = 100
dropout_threshold = 0.5
n_tags = len(tokenizer_Y.word_index)+1

input_layer = Input(shape = X_train.shape[1])
embedding_layer = Embedding(input_dim = input_dim, output_dim = output_dim, embeddings_initializer=keras.initializers.Constant(embedding_matrix))(input_layer)
lstm_layer = Bidirectional(LSTM(units = output_dim, dropout = dropout_threshold, return_sequences = True), merge_mode = "concat")(embedding_layer)

cnn_layer = Conv1D(40, 3, activation = "relu", padding = "same")(embedding_layer)
cnn_layer = GaussianDropout(dropout_threshold)(cnn_layer)
cnn_layer = Conv1D(80, 5, activation = "relu", padding = "same")(cnn_layer)

concat_feature = tf.keras.layers.Concatenate()([lstm_layer, cnn_layer])

dense_layer = Sequential()
dense_layer.add(Dense(100, activation = "relu"))
dense_layer.add(Dense(50, activation = "relu"))
dense_layer.add(Dense(n_tags, activation = "softmax"))

classifier = TimeDistributed(dense_layer)(concat_feature)

model = Model(inputs = input_layer, outputs = classifier)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 9)]                  0         []                            
                                                                                                  
 embedding (Embedding)       (None, 9, 100)               3000      ['input_1[0][0]']             
                                                                                                  
 conv1d (Conv1D)             (None, 9, 40)                12040     ['embedding[0][0]']           
                                                                                                  
 gaussian_dropout (Gaussian  (None, 9, 40)                0         ['conv1d[0][0]']              
 Dropout)                                                                                     

In [107]:
model.compile(optimizer = Adam(lr = 0.001), loss = "categorical_crossentropy", metrics = ["accuracy"]) 



In [108]:
model.fit(X_train, Y_train, batch_size = 64, epochs = 1, validation_split = 0.1, shuffle = True)



<keras.src.callbacks.History at 0x135e933b2d0>