# Thực hành phân loại báo

# 1. Nhập dataset và kiểm tra

In [55]:
import warnings
# Suppress every warning for the rest of the session so cell outputs stay clean.
warnings.filterwarnings('ignore')

import pandas as pd

# Read the news articles from disk and preview the first rows.
DATASET_PATH = '../dataset/dataset.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,text,category
0,Milburn defends poster campaign\n \n Labour's ...,politics
1,'Debate needed' on donations cap\n \n A cap on...,politics
2,UK plan to deport terror suspects\n \n Deals a...,politics
3,Howard rebuts asylum criticisms\n \n Tory lead...,politics
4,UKIP MEP attacked German 'empire'\n \n A UK In...,politics


Chuyển giá trị trong cột `category` (nhãn) về dạng số bằng `map`: {'politics': 0, 'tech': 1}.

In [56]:
# Numeric label assigned to each news category.
CATEGORY_TO_LABEL = {'politics': 0, 'tech': 1}

# Encode only while the column is not yet fully numeric. Series.map turns every
# value that is missing from the dict into NaN, so re-running this cell on the
# already-encoded column would otherwise silently wipe out all labels.
if not df['category'].isin(list(CATEGORY_TO_LABEL.values())).all():
    encoded = df['category'].map(CATEGORY_TO_LABEL)
    unmapped = df.loc[encoded.isna(), 'category'].unique()
    if len(unmapped) > 0:
        # Fail loudly instead of carrying NaN labels into training.
        raise ValueError(
            'Unexpected values in column "category": {}'.format(list(unmapped))
        )
    df['category'] = encoded
df.head()

Unnamed: 0,text,category
0,Milburn defends poster campaign\n \n Labour's ...,0
1,'Debate needed' on donations cap\n \n A cap on...,0
2,UK plan to deport terror suspects\n \n Deals a...,0
3,Howard rebuts asylum criticisms\n \n Tory lead...,0
4,UKIP MEP attacked German 'empire'\n \n A UK In...,0


In [57]:
# Check the size of the dataset as a (rows, columns) tuple
df.shape

(800, 2)

# 2. Làm sạch dữ liệu

1. Chuyển tất cả các từ sang chữ thường.
2. Loại bỏ tất cả các dấu chấm câu, số, từ ít hơn 3 ký tự.
3. Tách từ
4. Loại bỏ stopwords
5. Đưa về từ nguyên gốc

In [58]:
# Import the required libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

# The stop-word corpus is downloaded separately from the nltk package; when it
# is missing the lookup raises LookupError, so fetch it once and retry.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    STOPWORDS = set(stopwords.words('english'))

In [59]:
# Built once when the cell runs: constructing the stemmer and the tokenizer
# inside the function repeats the same work for every row of the dataset.
_STEMMER = SnowballStemmer("english")
_TOKENIZER = RegexpTokenizer(r'[a-z]{3,}')


def preprocess(sentence):
    """Normalize one raw document into a space-separated string of stems.

    Steps: lowercase the text, keep only runs of three or more ASCII letters
    (which drops punctuation, digits and shorter words), remove tokens found
    in ``STOPWORDS``, and stem the remaining tokens with the English Snowball
    stemmer.

    Parameters
    ----------
    sentence : str
        Raw text of one document. Any non-string value (for example a missing
        cell, which pandas represents as NaN) is treated as an empty document.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; ``""`` when no token
        survives or the input is not a string.
    """
    if not isinstance(sentence, str):
        # A missing value has no `.lower()`; return an empty document instead
        # of raising AttributeError in the middle of `Series.apply`.
        return ""

    tokens = _TOKENIZER.tokenize(sentence.lower())
    # Stop words are filtered before stemming, i.e. on the surface form.
    return " ".join(_STEMMER.stem(w) for w in tokens if w not in STOPWORDS)

In [60]:
# Clean every article, store the result back in the same column and preview it.
cleaned_text = df['text'].apply(preprocess)
df['text'] = cleaned_text
cleaned_text.head()

0    milburn defend poster campaign labour elect ch...
1    debat need donat cap cap donat polit parti int...
2    plan deport terror suspect deal sought allow d...
3    howard rebut asylum critic tori leader michael...
4    ukip mep attack german empir independ parti me...
Name: text, dtype: object

# 3. Tạo mô hình Bag of Words sử dụng sklearn

In [61]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vocabulary learned from the whole cleaned corpus.
bow_settings = dict(encoding='utf-8', max_features=3000, min_df=3)
# fit() returns the vectorizer itself, so construction and fitting are chained.
count_vector = CountVectorizer(**bow_settings).fit(df['text'])
count_vector

CountVectorizer(max_features=3000, min_df=3)

In [62]:
# Count the vocabulary terms in every document, then densify the sparse result.
sparse_counts = count_vector.transform(df['text'])
doc_array = sparse_counts.toarray()

In [37]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on versions that predate it.
if hasattr(count_vector, 'get_feature_names_out'):
    feature_names = count_vector.get_feature_names_out()
else:
    feature_names = count_vector.get_feature_names()

# One row per document, one column per vocabulary term.
frequency_matrix = pd.DataFrame(doc_array, columns=feature_names)
# Last expression of the cell so the table is actually displayed.
frequency_matrix

     abandon  abid  abil  abl  abolish  abroad  absolut  abus  academ  \
0          0     0     0    0        0       0        0     0       0   
1          0     0     0    1        0       0        0     0       0   
2          0     0     1    0        0       0        0     0       0   
3          0     0     0    0        0       0        2     0       0   
4          0     0     0    0        0       0        0     0       0   
..       ...   ...   ...  ...      ...     ...      ...   ...     ...   
795        0     0     0    0        0       0        0     0       0   
796        0     0     0    0        0       0        0     0       0   
797        0     0     0    0        0       0        0     0       0   
798        0     0     0    0        0       0        0     0       0   
799        0     0     0    0        0       0        0     0       0   

     acceler  ...  yahoo  year  yes  yet  york  young  younger  youth  zombi  \
0          0  ...      0     1    0    0   

Tách dataset thành tập train, validation và test

In [63]:
from sklearn.model_selection import train_test_split
# Both splits use the same proportion and seed.
split_options = dict(test_size=.2, random_state=42)

# First hold out the test set ...
X_rest, X_test, y_rest, y_test = train_test_split(
    df['text'], df['category'], **split_options
)
# ... then carve the validation set out of what remains.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, **split_options
)

for message, subset in (
    ('Number of rows in the train set: {}', X_train),
    ('Number of rows in the val set: {}', X_val),
    ('Number of rows in the test set: {}', X_test),
):
    print(message.format(subset.shape[0]))

Number of rows in the train set: 512
Number of rows in the val set: 128
Number of rows in the test set: 160


In [71]:
# Create a fresh CountVectorizer with default settings. This rebinds
# `count_vector`, so any vectorizer previously stored under that name is replaced.
count_vector = CountVectorizer()

# Learn the vocabulary from the training split and encode it in one step.
training_data = count_vector.fit_transform(X_train)

# Encode the remaining splits with that same vocabulary.
# Note: the vectorizer must NOT be fitted again on these splits.
val_data, testing_data = (
    count_vector.transform(split) for split in (X_val, X_test)
)

# 4. Huấn luyện mô hình sử dụng mô hình Naive Bayes

In [65]:
from sklearn.naive_bayes import MultinomialNB
# fit() returns the estimator, so training is chained onto construction.
naive_bayes = MultinomialNB().fit(training_data, y_train)
naive_bayes

MultinomialNB()

In [66]:
# Predict labels for the validation and the test matrices, in that order.
val_predictions, test_predictions = (
    naive_bayes.predict(features) for features in (val_data, testing_data)
)

# 5. Đánh giá mô hình Naive Bayes

In [67]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Each metric is paired with the line it prints; scores are shown as percentages.
nb_report_lines = (
    ('Accuracy score: {:.2f}', accuracy_score),
    ('Precision score: {:.2f}', precision_score),
    ('Recall score: {:.2f}', recall_score),
    ('F1 score: {:.2f}', f1_score),
)

# Same report for the validation split and then for the test split.
for heading, y_true, y_pred in (
    ('Val', y_val, val_predictions),
    ('\nTest', y_test, test_predictions),
):
    print(heading)
    for template, metric in nb_report_lines:
        print(template.format(100*metric(y_true, y_pred)))

Val
Accuracy score: 99.22
Precision score: 100.00
Recall score: 98.59
F1 score: 99.29

Test
Accuracy score: 99.38
Precision score: 98.77
Recall score: 100.00
F1 score: 99.38


# 6. Huấn luyện mô hình sử dụng mô hình SVM

In [68]:
from sklearn.svm import SVC, LinearSVC
# Support vector classifier with default hyperparameters; fit() returns the
# estimator, so training is chained onto construction.
svm = SVC().fit(training_data, y_train)
svm

SVC()

In [69]:
# Predict with the SVM; this rebinds both prediction variables.
val_predictions, test_predictions = (
    svm.predict(features) for features in (val_data, testing_data)
)

# 7. Đánh giá mô hình SVM

In [70]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Each metric is paired with the line it prints; scores are shown as percentages.
svm_report_lines = (
    ('Accuracy score: {:.2f}', accuracy_score),
    ('Precision score: {:.2f}', precision_score),
    ('Recall score: {:.2f}', recall_score),
    ('F1 score: {:.2f}', f1_score),
)

# Same report for the validation split and then for the test split.
for heading, y_true, y_pred in (
    ('Val', y_val, val_predictions),
    ('\nTest', y_test, test_predictions),
):
    print(heading)
    for template, metric in svm_report_lines:
        print(template.format(100*metric(y_true, y_pred)))

Val
Accuracy score: 99.22
Precision score: 98.61
Recall score: 100.00
F1 score: 99.30

Test
Accuracy score: 97.50
Precision score: 95.24
Recall score: 100.00
F1 score: 97.56
