In [3]:
# !gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

In [4]:
import string # Thao tác với chuỗi
import nltk # (Natural Language Toolkit) Thư viện xử lý ngôn ngữ tự nhiên
nltk.download('punkt')
nltk.download('stopwords')

import pandas as pd # Cung cấp các cấu trúc dữ liệu hiệu quả cho xử lý dữ liệu và phân tích dữ liệu
import numpy as np # Thư viện hỗ trợ xử lý dữ liệu số
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import  GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\duchu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\duchu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Read the labelled message dataset from a CSV file next to the notebook.
DATASET_PATH = './2cls_spam_text_cls.csv'
df = pd.read_csv(DATASET_PATH)

# Pull the text column and the label column out as plain Python lists
# ('Message' first, then 'Category').
messages, labels = (
    df[column].values.tolist() for column in ('Message', 'Category')
)


In [6]:
# Tiền xử lý dữ liệu

def lower_case(text):
    """Return a copy of `text` with all cased characters lowercased."""
    lowered = text.lower()
    return lowered

def punctuation_removal(text):
    """Return `text` with every character of `string.punctuation` removed."""
    # Mapping a code point to None tells str.translate to delete it.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

def tokenize(text):
    """Split `text` into a list of tokens with NLTK's `word_tokenize`."""
    tokens = nltk.word_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Drop English stop words from a sequence of tokens.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: The tokens, in their original order, that do not appear in
        NLTK's English stop-word list.
    """
    # The corpus reader returns a list; converting it to a set makes each
    # membership check O(1) instead of a linear scan per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    return [token for token in tokens if token not in stop_words]

def steaming(tokens):
    """Return the Porter stem of every token, preserving order.

    The function name is a misspelling of "stemming"; it is kept as-is
    because other cells call it by this name.
    """
    stem = nltk.PorterStemmer().stem
    return list(map(stem, tokens))

def preprocess_text(text):
    """Turn a raw message into a list of normalized tokens.

    Steps, in order: lowercase, strip punctuation, tokenize, remove
    English stop words, stem.
    """
    cleaned = punctuation_removal(lower_case(text))
    return steaming(remove_stopwords(tokenize(cleaned)))

# NOTE: this rebinds `messages` from raw strings to token lists, so the cell
# cannot be re-run on its own result (preprocess_text calls .lower() on each
# item); re-run the data-loading cell first.
messages = list(map(preprocess_text, messages))
    

In [7]:
def create_dictionary(messages):
    """Build the vocabulary of distinct tokens in order of first appearance.

    Args:
        messages: Iterable of token sequences, one per message. Tokens must
            be hashable (they are strings in this notebook).

    Returns:
        list: Each distinct token exactly once, ordered by first occurrence.
    """
    # A dict keeps insertion order and gives O(1) duplicate checks; testing
    # membership on a growing list made this quadratic in vocabulary size.
    seen = dict.fromkeys(token for tokens in messages for token in tokens)

    return list(seen)

# Vocabulary of distinct tokens across all preprocessed messages; a token's
# position in this list is the feature column it is counted in by
# `create_features`.
dictionary = create_dictionary(messages)

In [8]:
def create_features(tokens, dictionary):
    """Build a bag-of-words count vector for one message.

    Args:
        tokens: Iterable of tokens for a single message.
        dictionary: Sequence of vocabulary tokens; a token's position is the
            index of its count in the returned vector.

    Returns:
        numpy.ndarray: 1-D float array of length `len(dictionary)` holding the
        number of occurrences of each vocabulary token in `tokens`. Tokens
        not present in `dictionary` are ignored.
    """
    features = np.zeros(len(dictionary))

    # Build the token -> column lookup once per call instead of scanning the
    # list twice (`in` + `.index`) for every token. setdefault keeps the
    # first position of a repeated entry, matching list.index.
    column_of = {}
    for column, word in enumerate(dictionary):
        column_of.setdefault(word, column)

    for token in tokens:
        column = column_of.get(token)
        if column is not None:
            features[column] += 1

    return features

# Stack one count vector per message into a dense 2-D array with
# len(messages) rows and len(dictionary) columns.
X = np.array([create_features(tokens, dictionary) for tokens in messages])

In [9]:
# Map the string categories to integer class ids; `le` is kept so predictions
# can later be mapped back to the original category names.
le = LabelEncoder()
y = le.fit_transform(labels)

print(
    f'Classes: {le.classes_}',
    f'Encoded labels: {y}',
    sep='\n',
)


Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


In [10]:
# Fraction of all samples held out for validation.
VAL_SIZE = 0.2
# Fraction of the *remaining* samples held out for testing:
# 0.125 * (1 - 0.2) = 0.1 of the full dataset.
TEST_SIZE = 0.125
# Shared seed so both splits are reproducible.
SEED = 42

# First carve off the validation set ...
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=VAL_SIZE,
    shuffle=True,
    random_state=SEED,
)
# ... then split what is left into the final train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=TEST_SIZE,
    shuffle=True,
    random_state=SEED,
)


In [11]:
# Fit a Gaussian Naive Bayes classifier on the training split.
# NOTE(review): the features are token counts; scikit-learn's MultinomialNB
# is the variant designed for count data and may be worth comparing.
print('Start training...')
model = GaussianNB()
model.fit(X_train, y_train)
print('Training completed!')

Start training...
Training completed!


In [12]:
# Evaluate the model: accuracy on the validation split, then on the test split.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Validation accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')


Validation accuracy: 0.8753363228699551
Test accuracy: 0.8924731182795699


In [13]:
# Thực hiện dự đoán

def predict(text, model, dictionary):
    """Classify a single raw message.

    Args:
        text: Raw message string.
        model: Fitted classifier exposing `predict`.
        dictionary: Vocabulary list used to build the training features.

    Returns:
        The predicted category, decoded back to its original label with the
        notebook-level `LabelEncoder` instance `le`.
    """
    # The features must be built from the preprocessed tokens, exactly as the
    # training data was. Passing the raw string instead makes create_features
    # iterate over single characters, so the counts no longer match the
    # representation the model was trained on.
    processed_tokens = preprocess_text(text)
    features = create_features(processed_tokens, dictionary)

    # The model expects a 2-D array: one row per sample.
    features = np.array(features).reshape(1, -1)
    prediction = model.predict(features)

    # `le` is the module-level encoder fitted on the dataset labels.
    prediction_cls = le.inverse_transform(prediction)[0]

    return prediction_cls

# Run the classifier on one sample message and print the predicted label.
text = '"Free entry in 2 a wkly comp to win FA Cup final'
prediction_cls = predict(text, model, dictionary)
print(f'Prediction: {prediction_cls}')

Prediction: ham
