In [35]:
import os
from pathlib import Path
import string
import numpy

import pandas
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Corpus locations, resolved against the current working directory.
# Articles and titles live in sibling folders under ./data.
_data_root = Path.cwd() / 'data'
articles_dir = _data_root / 'articles'
titles_dir = _data_root / 'titles'

In [33]:
# Value passed as `num_words` to both Keras tokenizers (65,536).
NUM_WORDS = 1 << 16
# Length every token sequence is padded/truncated to by pad_sequences.
MAX_TEXT_LENGTH = 42

# Data preprocessing (word embedding)

In [46]:
def remove_punc_from_text(text):
    """Return ``text`` with every ASCII punctuation character removed.

    "Punctuation" means the characters in ``string.punctuation``; letters
    (including accented ones), digits and whitespace are kept unchanged.
    """
    punctuation_marks = string.punctuation
    kept_chars = []
    for symbol in text:
        if symbol in punctuation_marks:
            continue
        kept_chars.append(symbol)
    return ''.join(kept_chars)


# TODO: remove stopwords

In [47]:
# Collect every article body and its title (used as the target summary) into a dataframe.
# An article and its title are paired by sharing the same file name in the two folders.
dataset = []
# Sorted because os.listdir returns entries in arbitrary order; a stable row order
# is what makes the seeded train/validation split below reproducible.
list_article_files = sorted(os.listdir(articles_dir))
for filename in list_article_files:
    # Decode explicitly as UTF-8: the corpus is Vietnamese and the platform's
    # default encoding is not guaranteed to handle it.
    with open(Path(articles_dir, filename), 'r', encoding='utf-8') as article_file, \
            open(Path(titles_dir, filename), 'r', encoding='utf-8') as title_file:
        text = article_file.read()
        summary = title_file.read()
        dataset.append({ 'summary': summary, 'text': text })

dataset_df = pandas.DataFrame.from_records(dataset)

# Normalize the data: lower-case, strip punctuation, and wrap each summary in
# start/end markers. The markers are added after punctuation removal; the Keras
# Tokenizer's default filters later turn the underscores into separators, so the
# markers end up as the plain tokens "start" and "end".
dataset_df['summary'] = [
    f'_START_{remove_punc_from_text(summary.lower())}_END_'
    for summary in dataset_df['summary']
]
dataset_df['text'] = [remove_punc_from_text(text.lower()) for text in dataset_df['text']]

# Split into training (90%) and validation (10%) sets
x_train, x_val, y_train, y_val = train_test_split(
    numpy.array(dataset_df['text']),
    numpy.array(dataset_df['summary']),
    test_size=0.1,
    random_state=1,
    shuffle=True
)

# Tokenize the article texts (x); the tokenizer is fitted on training data only
x_tokenizer = Tokenizer(num_words=NUM_WORDS)
x_tokenizer.fit_on_texts(list(x_train))
x_train_sequence = x_tokenizer.texts_to_sequences(x_train)
x_train_padded = pad_sequences(x_train_sequence, maxlen=MAX_TEXT_LENGTH, padding='post')
# +1 because Keras word indices start at 1; index 0 is the padding value
x_vocab_size = len(x_tokenizer.word_index) + 1

# Tokenize the summaries (y) with their own tokenizer
y_tokenizer = Tokenizer(num_words=NUM_WORDS)
y_tokenizer.fit_on_texts(list(y_train))
# Must be y_tokenizer: encoding summaries with x_tokenizer would produce ids from
# the article vocabulary, which do not match y_tokenizer.word_index / y_vocab_size.
y_train_sequence = y_tokenizer.texts_to_sequences(y_train)
y_train_padded = pad_sequences(y_train_sequence, maxlen=MAX_TEXT_LENGTH, padding='post')
y_vocab_size = len(y_tokenizer.word_index) + 1

print(f'[Vocab size] x: {x_vocab_size} <-> y: {y_vocab_size}')

[Vocab size] x: 25966 <-> y: 3803


# Modeling

In [48]:
# Model hyper-parameters.
LATENT_DIM = 240      # presumably the recurrent hidden-state size -- confirm where the model is built
EMBEDDING_DIM = 300   # width of each word vector; passed to get_embedding_matrix
NUM_EPOCHS = 50       # presumably the number of training epochs -- confirm at the fit call

# Location of the pretrained GloVe vectors.
# FIXME(review): the last path component is an empty string, which pathlib drops,
# so this currently resolves to the ./glove directory itself rather than a
# vectors file. Fill in the actual file name before this path is opened.
GLOVE_FILE_PATH = Path.cwd().joinpath('glove', '')

In [49]:
def get_embedding_matrix(tokenizer, embedding_dim, vocab_size, glove_path=None):
    """Build an embedding matrix for a tokenizer's vocabulary from a GloVe text file.

    Each line of the file is expected to be ``<word> <v1> <v2> ... <vN>``.
    Row ``i`` of the result receives the vector of the word whose index in
    ``tokenizer.word_index`` is ``i``. Row 0 (the padding index) and the rows of
    words that do not occur in the file are left as zeros.

    Args:
        tokenizer: fitted tokenizer exposing a ``word_index`` dict (word -> int).
        embedding_dim: expected number of coefficients per word vector.
        vocab_size: number of rows of the matrix (``len(word_index) + 1``).
        glove_path: optional path of the vectors file; defaults to the
            module-level ``GLOVE_FILE_PATH``.

    Returns:
        numpy.ndarray of shape ``(vocab_size, embedding_dim)`` and dtype float32.

    Raises:
        OSError: if the vectors file cannot be opened.
        ValueError: if a line of the expected width holds a non-numeric value.
    """
    if glove_path is None:
        glove_path = GLOVE_FILE_PATH

    word_index = tokenizer.word_index
    embedding_matrix = numpy.zeros((vocab_size, embedding_dim), dtype='float32')

    # GloVe files are UTF-8 text; do not rely on the platform default encoding.
    with open(glove_path, encoding='utf-8') as file:
        for line in file:
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                continue  # blank line or a word with no coefficients
            word, coefs = parts

            row = word_index.get(word)
            if row is None or row >= vocab_size:
                continue  # word not in our vocabulary (or outside the matrix)

            values = coefs.split()
            if len(values) != embedding_dim:
                # Header line, multi-token entry or a file of another dimension.
                continue
            embedding_matrix[row] = numpy.asarray(values, dtype='float32')

    return embedding_matrix


# One embedding matrix per vocabulary: articles (x) first, then summaries (y).
x_embedding_matrix = get_embedding_matrix(
    tokenizer=x_tokenizer,
    embedding_dim=EMBEDDING_DIM,
    vocab_size=x_vocab_size,
)
y_embedding_matrix = get_embedding_matrix(
    tokenizer=y_tokenizer,
    embedding_dim=EMBEDDING_DIM,
    vocab_size=y_vocab_size,
)

# Optional sanity check of the resulting shapes (currently disabled):
# print(x_embedding_matrix.shape)
# print(y_embedding_matrix.shape)

{'start': 1, 'end': 2, 'cổ': 3, 'chứng': 4, 'phiếu': 5, 'khoán': 6, 'thị': 7, 'trường': 8, 'phiên': 9, 'đầu': 10, 'tăng': 11, 'bán': 12, 'mua': 13, 'trong': 14, 'dịch': 15, 'của': 16, 'giao': 17, 'tiếp': 18, 'triệu': 19, 'doanh': 20, 'tỷ': 21, 'nhìn': 22, 'góc': 23, 'vietstock': 24, 'giảm': 25, 'đồng': 26, 'cp': 27, 'lại': 28, 'tích': 29, 'hơn': 30, 'tục': 31, 'trước': 32, 'ngày': 33, 'năm': 34, 'công': 35, 'gì': 36, 'tuần': 37, 'giá': 38, 'phân': 39, 'chiều': 40, 'tư': 41, 'sinh': 42, 'quyền': 43, 'và': 44, 'ròng': 45, 'phái': 46, 'bị': 47, 'nhịp': 48, 'trở': 49, 'điểm': 50, 'động': 51, 'mạnh': 52, 'kỹ': 53, 'thuật': 54, 'có': 55, 'ý': 56, 'top': 57, 'đập': 58, 'đáng': 59, 'chú': 60, 'vnindex': 61, 'hàng': 62, 'tự': 63, 'đọc': 64, 'chủ': 65, 'giờ': 66, 'sẽ': 67, 'việt': 68, 'gần': 69, '10': 70, 'hạn': 71, 'vốn': 72, '2022': 73, 'vào': 74, 'lớn': 75, 'daily': 76, 'thành': 77, 'tại': 78, 'tháng': 79, 'quan': 80, 'quỹ': 81, 'tịch': 82, 'sản': 83, 'nghiệp': 84, 'đông': 85, 'lý': 86, 'từ':