In [None]:
import os
import random
from random import shuffle

import numpy as np
import tensorflow as tf
import keras
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from pyvi import ViTokenizer, ViPosTagger

In [None]:
# Characters that disqualify a token: the ten ASCII digits, the space,
# and assorted punctuation / quotation marks.
special_char = list('0123456789')
special_char += [
    ' ', '~', '!', '@', '#', '$', '%', '^', '&', '-', '+', '=',
    '{', '}', '[', ']', '\\', '|', '/', '<', '>', '?', '“', '”', '"',
    '‘', '’',
]

def is_valid_word(word):
    """Return True when `word` contains none of the entries of `special_char`."""
    for forbidden in special_char:
        if forbidden in word:
            return False
    return True

def word_tokenize(sentence):
    """Split a sentence into lower-cased tokens that pass `is_valid_word`.

    The sentence is lower-cased, passed through `ViTokenizer.tokenize`, and
    the result is handed to `ViPosTagger.postagging`. Only the words of the
    returned (words, tags) pair are used; the tags are discarded.

    Returns:
        list of the words for which `is_valid_word` is true, in order.
    """
    segmented = ViTokenizer.tokenize(sentence.lower())
    candidates, _tags = ViPosTagger.postagging(segmented)

    kept = []
    for candidate in candidates:
        if is_valid_word(candidate):
            kept.append(candidate)
    return kept

In [None]:
# One row per category, in class-id order:
# (file stem under data/headlines, human-readable label printed for predictions).
_topic_table = (
    ('xahoi', 'Xã hội'),
    ('kinhdoanh', 'Kinh doanh'),
    ('thethao', 'Thể thao'),
    ('vanhoa', 'Văn hóa'),
)

topics = [stem for stem, _ in _topic_table]
topic_names = [label for _, label in _topic_table]

num_classes = len(topics)

In [None]:
# Build the labelled corpus and the per-token document frequencies.
#   data            : list of (tokens, class_id) pairs, one per headline line
#   word_doc_counts : token -> number of headlines in which it appears
word_doc_counts = {}
data = []

for i, topic in enumerate(topics):
    fn = os.path.join('data/headlines', topic + '.txt')

    # `with` closes the file even if reading or tokenizing raises.
    with open(fn, encoding='utf8') as f:
        lines = f.readlines()[:5000]  # at most the first 5000 lines per topic

    for line in lines:
        tokens = word_tokenize(line.strip())
        data.append((tokens, i))  # class id is the topic's position in `topics`

        # set(): a token repeated inside one headline is counted once.
        for token in set(tokens):
            word_doc_counts[token] = word_doc_counts.get(token, 0) + 1

In [None]:
# (token, document_count) pairs ordered by ascending count; the sort is
# stable, so equal counts keep the dictionary's iteration order.
word_items = sorted(word_doc_counts.items(), key=lambda pair: pair[1])

In [None]:
# Token -> integer id. Ids 0-3 are reserved for the markers below, so real
# tokens start at 4. (Starting at 3 made the first token share id 3 with
# "<UNUSED>".)
word_index = {item[0]: i for i, item in enumerate(word_items, start=4)}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

In [None]:
def encode_tokens(tokens):
    """Translate a sequence of tokens into their integer ids from `word_index`.

    Tokens missing from `word_index` are encoded as 0, which is the id also
    assigned to "<PAD>".
    NOTE(review): the "<UNK>" entry of `word_index` is not consulted here --
    confirm that mapping unknown tokens to the padding id is intended.
    """
    ids = []
    for tok in tokens:
        ids.append(word_index.get(tok, 0))
    return ids
    

In [None]:
# Shuffle with a dedicated, seeded generator so that a fresh
# "Restart & Run All" always produces the same train/test split.
# The shuffle is still in place: running this cell again re-permutes `data`.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(data)

# First 70% of the shuffled samples for training, the remainder for testing.
Ntrain = int(0.7*len(data))
train_data = data[:Ntrain]
test_data = data[Ntrain:]

In [None]:
# NOTE(review): the following cell rebuilds this same list before padding it,
# so this cell looks redundant -- confirm before removing.
Xtrain = [encode_tokens(tokens) for tokens, _label in train_data]

In [None]:
# Training inputs: token ids per headline, padded after the tokens with 0
# up to a fixed length of 128.
Xtrain = pad_sequences(
    [encode_tokens(tokens) for tokens, _label in train_data],
    value=0,
    padding='post',
    maxlen=128,
)
# Training targets: one one-hot vector of width num_classes per sample.
ytrain = np.array(
    [to_categorical(label, num_classes) for _tokens, label in train_data]
)

In [None]:
# Test inputs and targets, prepared exactly like the training ones:
# ids padded after the tokens with 0 to length 128, labels one-hot encoded.
Xtest = pad_sequences(
    [encode_tokens(tokens) for tokens, _label in test_data],
    value=0,
    padding='post',
    maxlen=128,
)
ytest = np.array(
    [to_categorical(label, num_classes) for _tokens, label in test_data]
)

In [None]:
# The embedding table gets one row per entry of word_index.
vocab_size = len(word_index)

# Token ids -> 16-d embeddings -> average over the sequence -> small dense
# hidden layer -> softmax over the topic classes.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(num_classes, activation=tf.nn.softmax),
])

model.summary()

In [None]:
# Targets are one-hot vectors, hence categorical cross-entropy; accuracy is
# reported under the name 'acc'.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [None]:
# Train for 60 passes over the training set in batches of 1024 samples.
model.fit(
    Xtrain,
    ytrain,
    batch_size=1024,
    epochs=60,
    verbose=1,
)

In [None]:
# Loss and 'acc' on the held-out test split.
model.evaluate(x=Xtest, y=ytest)

In [None]:
# Classify one free-text sample.
text = """Mới đây, Viện khảo cổ học phối hợp với Trung tâm Bảo tồn Di sản văn hoá thế giới Thành nhà Hồ tổ chức công bố kết quả khai quật thám sát di chỉ khảo cổ học núi Xuân Đài (xã Vĩnh Ninh, huyện Vĩnh Lộc, tỉnh Thanh Hóa)."""

# Same preprocessing as the training data: tokenize -> ids -> pad to 128.
tokens = word_tokenize(text)
words_id = encode_tokens(tokens)
X = pad_sequences([words_id], value=0, padding='post', maxlen=128)

# A single input row is predicted, so the argmax over the flattened output
# is the index of the winning class.
class_scores = model.predict(X)
y = np.argmax(class_scores)
print(topic_names[y])