# Thực hành phân loại văn bản

# 1. Nhập dataset và kiểm tra

In [1]:
import warnings
import pandas as pd

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Read the reviews and keep only the rows that have no missing value.
df = pd.read_csv('./file01.csv').dropna(axis=0)

In [2]:
# Preview five rows; the fixed seed keeps the preview reproducible.
df.sample(n=5, random_state=0)

Unnamed: 0,comments,sentiments
339528,It is fun and time saving to communicate one f...,1
229825,This game is amazing!!! I love playing color s...,1
312241,"No doubt the app is great, and very useful, es...",1
169438,Yep it's pretty good so far! Only been using i...,1
441475,"Impossible de relier un compte, message d'erre...",-1


In [3]:
# Number of rows per sentiment label.
df.groupby('sentiments')['sentiments'].count()

sentiments
-1     78088
 0     74711
 1    322518
Name: sentiments, dtype: int64

positive: 1 \
negative: -1 \
neutral: 0

In [4]:
# Check the size of the dataset as (rows, columns)
df.shape

(475317, 2)

# 2. Làm sạch dữ liệu

1. Chuyển tất cả các từ sang chữ thường.
2. Loại bỏ tất cả các dấu chấm câu, số, từ ít hơn 3 ký tự.
3. Tách từ
4. Loại bỏ stopwords
5. Đưa về từ nguyên gốc

In [5]:
#Import các thư viện cần thiết
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

# English stop words removed from every comment during preprocessing.
# The stop-word corpus is downloaded separately from the nltk package, so
# fetch it on first use instead of failing with LookupError on a machine
# that has never downloaded it.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    STOPWORDS = set(stopwords.words('english'))

In [6]:
def preprocess(sentence):
    """Normalise one raw comment into a space-separated string of stems.

    The text is lowercased, split into runs of three or more ASCII letters
    (which drops punctuation, digits and shorter words), filtered against
    STOPWORDS, and each remaining word is Snowball-stemmed.

    Parameters
    ----------
    sentence : str
        Raw comment text.

    Returns
    -------
    str
        The surviving stems joined by single spaces ("" if none survive).
    """
    # This function runs once per row, so build the stemmer and tokenizer on
    # the first call only and reuse them afterwards.
    tools = getattr(preprocess, "_tools", None)
    if tools is None:
        tools = (SnowballStemmer("english"), RegexpTokenizer(r'[a-z]{3,}'))
        preprocess._tools = tools
    stemmer, tokenizer = tools

    tokens = tokenizer.tokenize(sentence.lower())
    return " ".join(stemmer.stem(w) for w in tokens if w not in STOPWORDS)

In [7]:
# Overwrite each raw comment with its cleaned, stemmed form.
# Caution: this cell is not idempotent - running it a second time feeds the
# already-cleaned text through preprocess() again.
df['comments'] = df['comments'].map(preprocess)
df['comments'].head()

0    outstand app lot show one app problem keep get...
1    use one main media app long time found month b...
2    ive watch netflix year would like see improv u...
3    love use app think best seri movi differ genr ...
4    complaint subtitl size switch phone like small...
Name: comments, dtype: object

# 3. Tạo mô hình Bag of Words sử dụng sklearn

Tách dataset thành tập train và test

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['comments'],
    df['sentiments'],
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each part.
print('Number of rows in the train set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))

Number of rows in the train set: 380253
Number of rows in the test set: 95064


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Create the CountVectorizer (bag-of-words) instance
count_vector = CountVectorizer()

# Fit the vectorizer on the training set and transform it into a count matrix
training_data = count_vector.fit_transform(X_train)

# Transform the test set into a matrix with the already-fitted vectorizer.
# Note: do not fit CountVectorizer() on the test set
testing_data = count_vector.transform(X_test)

# 4. Huấn luyện mô hình sử dụng mô hình Naive Bayes

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Create the multinomial Naive Bayes classifier, then fit it on the training
# count matrix; fit() hands back the fitted estimator itself.
nb_classifier = MultinomialNB()
model = nb_classifier.fit(training_data, y_train)

In [11]:
# Predict a sentiment label for every row of the test matrix.
test_predictions = model.predict(testing_data)

# 5. Đánh giá mô hình Naive Bayes

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score

print('\nTest')
# Confusion matrix: one row per true label, one column per predicted label.
print(confusion_matrix(y_test, test_predictions))
# Precision, recall, F1 and support for each class, plus the averaged rows.
print(classification_report(y_test, test_predictions))
# Micro-averaged F1 over all test predictions.
print(f1_score(y_test, test_predictions, average='micro'))


Test
[[ 9521   224  5820]
 [ 1330  3528 10014]
 [ 4349   453 59825]]
              precision    recall  f1-score   support

          -1       0.63      0.61      0.62     15565
           0       0.84      0.24      0.37     14872
           1       0.79      0.93      0.85     64627

    accuracy                           0.77     95064
   macro avg       0.75      0.59      0.61     95064
weighted avg       0.77      0.77      0.74     95064

0.7665783051417993


# 6. Huấn luyện mô hình sử dụng mô hình SVM

In [None]:
from sklearn.svm import SVC, LinearSVC
# Use the linear SVM rather than kernel SVC: SVC's fit time grows at least
# quadratically with the number of samples, which is impractical for the
# ~380k training rows here. LinearSVC is designed for large sparse inputs.
# The seed is fixed so repeated runs give the same model.
svm = LinearSVC(random_state=42)
svm.fit(training_data, y_train)

In [None]:
# Only the test matrix (testing_data) exists in this notebook; no validation
# matrix is ever built, so predicting on one would raise NameError.
test_predictions = svm.predict(testing_data)

# 7. Đánh giá mô hình SVM

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Evaluate on the held-out test split, the only evaluation split that exists.
# The target has three classes (-1, 0, 1), so precision/recall/F1 need an
# explicit averaging mode: the default average='binary' raises ValueError on
# multiclass targets. 'macro' weights every class equally, which keeps the
# minority classes visible despite the class imbalance.
print('\nTest')
print('Accuracy score: {:.2f}'.format(100*accuracy_score(y_test, test_predictions)))
print('Precision score: {:.2f}'.format(100*precision_score(y_test, test_predictions, average='macro')))
print('Recall score: {:.2f}'.format(100*recall_score(y_test, test_predictions, average='macro')))
print('F1 score: {:.2f}'.format(100*f1_score(y_test, test_predictions, average='macro')))

# 8. Lưu mô hình

In [1]:
import pickle
import pandas as pd

In [17]:
# Persist the fitted classifier and the fitted vectorizer, one pickle file each.
for path, obj in (('text_classifier', model), ('vectorizer', count_vector)):
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)

In [2]:
# Restore the classifier and the vectorizer from their pickle files.
# Only unpickle files you produced yourself: pickle.load can run arbitrary
# code embedded in a crafted file.
with open('text_classifier', 'rb') as clf_file:
    model = pickle.load(clf_file)

with open('vectorizer', 'rb') as vec_file:
    count_vector = pickle.load(vec_file)

In [3]:
# Two sample comments to classify. The recorded output of this notebook shows
# two rows ('love app', 'so bad') and two predictions, so both are listed here
# to keep the code and its outputs in agreement.
data = {'comments': ['love app', 'so bad']}

dfa = pd.DataFrame(data, columns=['comments'])

print(dfa)

   comments
0  love app
1    so bad


In [4]:
# Turn the sample comments into count vectors with the loaded vectorizer,
# then predict one sentiment label per comment.
# NOTE(review): the training text was passed through preprocess() before
# vectorizing, but these comments are vectorized raw; preprocess() does not
# appear to be defined in this session - confirm and apply it for real input.
sample_counts = count_vector.transform(dfa['comments'])
tmp = model.predict(sample_counts)

In [5]:
# Show the predicted labels for all sample comments.
tmp

array([ 1, -1])

In [6]:
# Predicted label for the first sample comment only.
tmp[0]

1