In [1]:
import os
import re
import numpy as np
import gensim
from gensim import models
from pyvi import ViTokenizer, ViPosTagger
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam
from keras.utils import to_categorical
from random import shuffle

Using TensorFlow backend.


In [2]:
# Characters that disqualify a token: the ten ASCII digits first, then a fixed
# set of whitespace/symbol/quote characters.
# NOTE(review): common punctuation such as ',', '.', '(', ')' and ':' is not in
# this list, so tokens made only of those characters are kept - confirm intended.
special_char = list('0123456789')
special_char += list(' ~!@#$%^&-+={}[]\\|/<>?“”"‘’')

def is_valid_word(word):
    """Return True when `word` contains none of the entries of `special_char`."""
    return not any(c in word for c in special_char)

def word_tokenize(sentence):
    """Lower-case `sentence`, tokenize it with pyvi and drop invalid tokens.

    The text is passed through ``ViTokenizer.tokenize`` and then
    ``ViPosTagger.postagging``; the returned tags are discarded and only the
    tokens accepted by ``is_valid_word`` are returned, in their original order.
    """
    segmented = ViTokenizer.tokenize(sentence.lower())
    tokens, _tags = ViPosTagger.postagging(segmented)
    return list(filter(is_valid_word, tokens))

In [3]:
# Each entry pairs a topic slug (used to build the data file name) with the
# human-readable name printed for a prediction. The position in this table is
# the integer class label.
_TOPIC_TABLE = (
    ('xahoi', 'Xã hội'),
    ('kinhdoanh', 'Kinh doanh'),
    ('thethao', 'Thể thao'),
    ('vanhoa', 'Văn hóa'),
)
topics = [slug for slug, _ in _TOPIC_TABLE]
topic_names = [display for _, display in _TOPIC_TABLE]

num_classes = len(topics)

# Raw headline lines and their class indices, filled in by the loading cell.
documents, labels = [], []

In [4]:
# Load up to MAX_DOCS_PER_TOPIC headline lines from each topic file and record
# one class label (the topic's index in `topics`) per line actually read.
MAX_DOCS_PER_TOPIC = 5000

# Start from empty lists so re-running this cell does not append duplicates.
documents = []
labels = []

for i, topic in enumerate(topics):
    fn = os.path.join('data/headlines', topic + '.txt')
    # `with` closes the file even if reading raises.
    with open(fn, encoding='utf8') as f:
        headlines = f.readlines()[:MAX_DOCS_PER_TOPIC]
    documents.extend(headlines)
    # Label only the lines that were read: a file shorter than the cap would
    # otherwise leave `labels` longer than `documents` and shift every later label.
    labels.extend([i] * len(headlines))

In [5]:
# Tokenize every headline, build the vocabulary, then fit a TF-IDF model on the
# bag-of-words corpus.
# NOTE(review): the vocabulary and IDF weights are fitted on all documents,
# before the train/test split done later in the notebook - confirm this is acceptable.
processed_docs = [word_tokenize(doc) for doc in documents]

dictionary = gensim.corpora.Dictionary(processed_docs)
# Prune the vocabulary in place; dict_size is the number of tokens that remain
# and is used as the length of every feature vector.
dictionary.filter_extremes(no_above=0.5, keep_n=10000)
dict_size = len(dictionary)

bow_corpus = list(map(dictionary.doc2bow, processed_docs))
tfidf = models.TfidfModel(bow_corpus)

In [6]:
# Turn every document into a dense TF-IDF vector, shuffle reproducibly and
# split into train/test arrays with one-hot encoded labels.
SPLIT_SEED = 42        # fixed seed so the train/test partition is identical on every run
TRAIN_FRACTION = 0.7   # share of the samples used for training

# Fail loudly rather than train on misaligned (document, label) pairs.
assert len(labels) == len(bow_corpus), (
    'labels (%d) and documents (%d) are out of sync' % (len(labels), len(bow_corpus))
)

data = []
for doc_bow, label in zip(bow_corpus, labels):
    # tfidf[...] yields sparse (term_id, weight) pairs; scatter them into a
    # zero vector of vocabulary length.
    wordvec = np.zeros(dict_size)
    for index, value in tfidf[doc_bow]:
        wordvec[index] = value
    data.append((wordvec, label))

# Seeded permutation instead of the unseeded random.shuffle.
order = np.random.RandomState(SPLIT_SEED).permutation(len(data))
data = [data[k] for k in order]
Ntrain = int(len(data) * TRAIN_FRACTION)

train_items, test_items = data[:Ntrain], data[Ntrain:]

X_train = np.array([vec for vec, _ in train_items])
# One vectorized one-hot encoding per split instead of one call per sample.
Y_train = to_categorical([label for _, label in train_items], num_classes)

X_test = np.array([vec for vec, _ in test_items])
Y_test = to_categorical([label for _, label in test_items], num_classes)


In [7]:
# Two-layer classifier: a 5-unit sigmoid hidden layer over the TF-IDF vector,
# followed by a softmax over the topic classes.
hidden_layer = Dense(5, input_dim=dict_size, activation='sigmoid')
output_layer = Dense(num_classes, activation='softmax')
model = Sequential([hidden_layer, output_layer])

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-6)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train on the training split for 40 epochs with shuffling enabled.
model.fit(
    x=X_train,
    y=Y_train,
    epochs=40,
    shuffle=True,
)

Epoch 1/40
Epoch 2/40

In [None]:
# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy] on the held-out split. The loss gets a real name rather
# than `_`, because assigning to `_` overrides IPython's last-output variable
# for the rest of the session.
test_loss, score = model.evaluate(X_test, Y_test)
print('score = ', score)

In [None]:
# Classify one example sentence using the dictionary, TF-IDF model and network
# built above.
text = """Mới đây, Viện khảo cổ học phối hợp với Trung tâm Bảo tồn Di sản văn hoá thế giới Thành nhà Hồ tổ chức công bố kết quả khai quật thám sát di chỉ khảo cổ học núi Xuân Đài (xã Vĩnh Ninh, huyện Vĩnh Lộc, tỉnh Thanh Hóa)."""

# Same preprocessing as the training documents: tokenize -> bag of words -> TF-IDF.
processed_text = word_tokenize(text)
bow = dictionary.doc2bow(processed_text)
bow_vector = tfidf[bow]

# Scatter the sparse (term_id, weight) pairs into a dense vocabulary-length vector.
wordvec = np.zeros(dict_size)
for index, value in bow_vector:
    wordvec[index] = value

# The model expects a batch, so present the vector as a single-row 2-D array.
predict = model.predict(wordvec.reshape(1, -1))
categ = predict[0].argmax()
print('Chủ đề : ', topic_names[categ])