In [1]:
import pandas as pd
import numpy as np
import ast
import re
import string
import unicodedata
import nltk

In [2]:
# Load the input CSV for this step. The raw frame stays available as `root`:
# later cells derive `df` from it via drop(), which returns a new frame.
root = pd.read_csv('./15_clean_link_inside_description.csv')

In [3]:
# Source column -> short name. The key order doubles as the final column order.
col_names = {
    'ID': 'id',
    'title': 'title',
    'Description': 'description',
    'Lượt đọc': 'view',
    'Lượt bình chọn': 'vote',
    'Chương': 'chapter',
    'story_tags_list': 'tag',
}

# Discard the columns this step never uses (drop() raises KeyError if one is absent)
df = root.drop(columns=['depth', 'cover_img', 'recommended_stories', 'badges', 'Thời gian'])

# Parse each stringified tag list and keep only its distinct values.
# np.unique returns a sorted NumPy array, so every cell ends up holding an array.
df['story_tags_list'] = df['story_tags_list'].apply(
    lambda raw_tags: np.unique(ast.literal_eval(raw_tags))
)

# Put the kept columns in a fixed order, then switch to the shorter names
df = df.reindex(columns=list(col_names)).rename(columns=col_names)

In [4]:
# Clean NaN
# Missing titles get the placeholder 'The'. The result is assigned back instead of
# calling fillna(inplace=True) on the column selection: that form is chained
# assignment, which raises a FutureWarning in pandas 2.x and leaves `df` untouched
# once Copy-on-Write is active.
df['title'] = df['title'].fillna('The')

# Merge title and description into text.
# str + NaN is NaN, so a missing description would discard the title too and hand
# a float to the string-only steps further down; treat it as empty text instead.
df['title'] = df['title'] + ' ' + df['description'].fillna('')

# Keep the merged column in the position `title` had, under the name `text`
df = df.drop(columns=['description']).rename(columns={'title': 'text'})

In [5]:
# Cleaning function
def clean(text):
    """Normalise a raw text value for the later tokenisation steps.

    Steps, in order:
      1. NFKC-normalise, which folds styled/compatibility characters
         (e.g. mathematical bold letters) to their plain form.
      2. Lowercase.
      3. Replace newlines and every ASCII punctuation character with a space.
      4. Delete every character that is not a space, an ASCII letter/digit or
         one of the listed Vietnamese letters.
      5. Collapse runs of whitespace and trim both ends.

    Args:
        text: Value to clean.

    Returns:
        The cleaned string. Non-string input (e.g. NaN) is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # unbold all characters
    text = unicodedata.normalize('NFKC', text)

    # lowercase
    text = text.lower()

    # remove newline & punctuation.
    # string.punctuation must be escaped before it goes into a character class:
    # unescaped, its "\]" pair is read as an escaped bracket, so the backslash is
    # never turned into a space and words around it get glued together below.
    text = text.replace('\n', ' ')
    text = re.sub('[' + re.escape(string.punctuation) + ']', ' ', text)

    # apply super regex (keeps only spaces, ASCII alphanumerics and Vietnamese characters)
    super_regex = r'[^ A-Za-z0-9\u00C0\u00C1\u00C2\u00C3\u00C8\u00C9\u00CA\u00CC\u00CD\u00D2\u00D3\u00D4\u00D5\u00D9\u00DA\u00DD\u00E0\u00E1\u00E2\u00E3\u00E8\u00E9\u00EA\u00EC\u00ED\u00F2\u00F3\u00F4\u00F5\u00F9\u00FA\u00FD\u0102\u0103\u0110\u0111\u0128\u0129\u0168\u0169\u01A0\u01A1\u01AF\u01B0\u1EA0\u1EA1\u1EA2\u1EA3\u1EA4\u1EA5\u1EA6\u1EA7\u1EA8\u1EA9\u1EAA\u1EAB\u1EAC\u1EAD\u1EAE\u1EAF\u1EB0\u1EB1\u1EB2\u1EB3\u1EB4\u1EB5\u1EB6\u1EB7\u1EB8\u1EB9\u1EBA\u1EBB\u1EBC\u1EBD\u1EBE\u1EBF\u1EC0\u1EC1\u1EC2\u1EC3\u1EC4\u1EC5\u1EC6\u1EC7\u1EC8\u1EC9\u1ECA\u1ECB\u1ECC\u1ECD\u1ECE\u1ECF\u1ED0\u1ED1\u1ED2\u1ED3\u1ED4\u1ED5\u1ED6\u1ED7\u1ED8\u1ED9\u1EDA\u1EDB\u1EDC\u1EDD\u1EDE\u1EDF\u1EE0\u1EE1\u1EE2\u1EE3\u1EE4\u1EE5\u1EE6\u1EE7\u1EE8\u1EE9\u1EEA\u1EEB\u1EEC\u1EED\u1EEE\u1EEF\u1EF0\u1EF1\u1EF2\u1EF3\u1EF4\u1EF5\u1EF6\u1EF7\u1EF8\u1EF9]'
    text = re.sub(super_regex, '', text)

    # remove redundant space last, so gaps left by the deletions above
    # (e.g. a symbol standing between two spaces) are collapsed as well
    text = ' '.join(text.split())

    return text

In [6]:
# Removing English stopwords
import functools

import spacy


@functools.lru_cache(maxsize=1)
def _load_english_pipeline():
    """Load the spaCy ``en_core_web_sm`` pipeline once and reuse it afterwards.

    Loading the model inside ``remove_eng_stopwords`` would repeat the load for
    every row the function is applied to. The load stays lazy (first call)
    rather than happening when this cell is run.
    """
    return spacy.load("en_core_web_sm")


def remove_eng_stopwords(text):
    """Drop the tokens spaCy flags as English stopwords.

    Args:
        text: String to filter.

    Returns:
        The remaining token texts joined by single spaces.
    """
    doc = _load_english_pipeline()(text)
    return ' '.join(token.text for token in doc if not token.is_stop)

ModuleNotFoundError: No module named 'spacy'

In [None]:
# Removing Vietnamese stopwords
import functools


@functools.lru_cache(maxsize=None)
def _vie_stopword_pattern(stopwords_path):
    """Compile, once per file path, a regex that matches any stopword in the file.

    The file is expected to hold one stopword (possibly a multi-word phrase)
    per line, UTF-8 encoded.
    """
    with open(stopwords_path, 'r', encoding='utf-8') as file:
        stopwords = {line.strip() for line in file}
    # A blank line would add an empty alternative to the pattern
    stopwords.discard('')

    # Regex alternation takes the first alternative that matches, so longer
    # phrases must come before the shorter stopwords they start with; otherwise
    # only the short prefix is removed and the rest of the phrase is left behind.
    ordered = sorted(stopwords, key=lambda word: (-len(word), word))

    # Create a regex pattern to match any word in stopwords
    return re.compile(r'\b(?:{})\b'.format('|'.join(map(re.escape, ordered))))


def remove_vie_stopwords(text, stopwords_path='vietnamese-stopwords.txt'):
    """Delete every whole-word occurrence of a Vietnamese stopword from ``text``.

    Args:
        text: String to filter.
        stopwords_path: Stopword list file, one entry per line. It is read on the
            first call for a given path and the compiled pattern is reused
            afterwards; re-run this cell to pick up changes to the file.

    Returns:
        ``text`` with the matched stopwords removed. Surrounding whitespace is
        left as it was, so consecutive spaces can remain.
    """
    return _vie_stopword_pattern(stopwords_path).sub('', text)

In [None]:
# Set up for Vietnamese Segmentation
from vws import RDRSegmenter, Tokenizer
# Both objects are created once here and shared by every segment() call below
rdrsegment = RDRSegmenter.RDRSegmenter()
tokenizer = Tokenizer.Tokenizer()

# Segmentation
def segment(text):
    """Run the shared RDRSegmenter over ``text`` and return what it produces.

    Uses the module-level ``rdrsegment`` and ``tokenizer`` objects.
    """
    return rdrsegment.segmentRawSentences(tokenizer, text)

In [None]:
# English stemming
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases (3.9+) load it
# from the 'punkt_tab' resource, older ones from the pickled 'punkt' package, so
# fetch both; with only 'punkt' a current NLTK raises LookupError when tokenizing.
nltk.download('punkt')
nltk.download('punkt_tab')

# One Porter stemmer instance, reused by stem() below
stemmer = nltk.PorterStemmer()

# Stemming
def stem(text):
    """Tokenise ``text`` with NLTK and Porter-stem every token.

    Uses the module-level ``stemmer``. Returns the stemmed tokens joined by
    single spaces.
    """
    return ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(text))

In [None]:
# Apply everything: each stage rewrites the text column before the next one runs
for text_stage in (clean, remove_eng_stopwords, remove_vie_stopwords, segment, stem):
    df['text'] = df['text'].apply(text_stage)

In [None]:
# Keep the processed text column under a shorter name
data = df['text']

In [None]:
from collections import Counter

# Split every processed text on whitespace -> one token list per row
title_words = df['text'].str.split()

# Tally each token across all rows
word_counts = Counter(token for tokens in title_words for token in tokens)

# Plain-dict copy of the tallies
word_counts_dict = dict(word_counts)

In [None]:
# (word, count) pairs ordered from most to least frequent.
# Despite the name this is a list of tuples, not a dict.
sorted_word_count_dict = sorted(
    word_counts_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Destination of the word-frequency listing
file_path = 'raw_dictionary.txt'

# Overwrite the file with one "word: count" line per entry, in sorted order
with open(file_path, 'w', encoding='utf-8') as file:
    file.writelines(
        f"{word}: {count}\n"
        for word, count in dict(sorted_word_count_dict).items()
    )

In [None]:
# Persist the processed frame. 'utf-8-sig' prepends a UTF-8 BOM to the file and
# index=False leaves the row index out.
# NOTE(review): the `tag` column holds NumPy arrays (built with np.unique earlier),
# which are written as their string form — confirm downstream readers can parse it.
df.to_csv('./16a_description_cleaned.csv', encoding='utf-8-sig', index=False)

In [None]:
# Reload the CSV that was just written
new_data = pd.read_csv('./16a_description_cleaned.csv')

In [None]:
# Show the reloaded frame as the cell output
new_data