# Thực hành phân loại báo sử dụng Naive Bayes

# 1. Nhập dataset và kiểm tra

In [11]:
import warnings
warnings.filterwarnings('ignore')  # keep library warnings out of the cell outputs

import pandas as pd

# Load the dataset from a CSV file addressed by a relative path.
DATASET_PATH = '../dataset/dataset.csv'
df = pd.read_csv(DATASET_PATH)

# Preview the first 5 rows.
df.head()

Unnamed: 0,text,category
0,Milburn defends poster campaign\n \n Labour's ...,politics
1,'Debate needed' on donations cap\n \n A cap on...,politics
2,UK plan to deport terror suspects\n \n Deals a...,politics
3,Howard rebuts asylum criticisms\n \n Tory lead...,politics
4,UKIP MEP attacked German 'empire'\n \n A UK In...,politics


Chuyển giá trị trong cột `category` về dạng số bằng `map`: {'politics': 0, 'tech': 1}

In [12]:
# Encode the string categories as integer class labels.
CATEGORY_TO_LABEL = {'politics': 0, 'tech': 1}

# Only map while the column still holds the original strings: re-running this
# cell on already-encoded labels would otherwise turn every value into NaN,
# because the integers 0/1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(df['category']):
    df['category'] = df['category'].map(CATEGORY_TO_LABEL)

df.head()

Unnamed: 0,text,category
0,Milburn defends poster campaign\n \n Labour's ...,0
1,'Debate needed' on donations cap\n \n A cap on...,0
2,UK plan to deport terror suspects\n \n Deals a...,0
3,Howard rebuts asylum criticisms\n \n Tory lead...,0
4,UKIP MEP attacked German 'empire'\n \n A UK In...,0


In [13]:
# Check the dataset dimensions (rows, columns)
df.shape

(800, 2)

# 2. Làm sạch dữ liệu

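1. Chuyển tất cả các từ sang chữ thường.
2. Loại bỏ tất cả các dấu chấm câu và chữ số (chỉ giữ lại các chuỗi chữ cái).
3. Tách từ, chỉ giữ các từ có từ 3 chữ cái trở lên.
4. Loại bỏ stopwords tiếng Anh.
5. Đưa các từ về dạng gốc (stemming) bằng SnowballStemmer.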

In [14]:
# Import the required libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

# NLTK raises LookupError when the stopwords corpus has not been downloaded
# yet (e.g. on a fresh environment); fetch it once and retry so the notebook
# still runs from a clean kernel on a new machine.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))

In [15]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _text_tools():
    """Build the English stemmer and the word tokenizer once and reuse them."""
    return SnowballStemmer("english"), RegexpTokenizer(r'[a-zA-Z]{3,}')


def preprocess(sentence):
    """Normalise one document into a space-separated string of stemmed tokens.

    The text is lowercased, split into runs of at least three ASCII letters
    (which drops punctuation, digits and very short words), filtered against
    STOPWORDS, and each remaining token is stemmed with the English Snowball
    stemmer.

    Parameters
    ----------
    sentence : str
        Raw text of a document.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # The stemmer and tokenizer are constructed lazily on the first call and
    # cached, instead of being rebuilt for every row passed through .apply().
    stemmer, tokenizer = _text_tools()
    tokens = tokenizer.tokenize(sentence.lower())
    # Stopwords are matched on the surface form, i.e. before stemming.
    return " ".join(stemmer.stem(w) for w in tokens if w not in STOPWORDS)

In [16]:
# Clean every document and replace the raw text column with the result.
cleaned_text = df['text'].apply(preprocess)
df['text'] = cleaned_text

# Show the first few cleaned documents.
cleaned_text.head()

0    milburn defend poster campaign labour elect ch...
1    debat need donat cap cap donat polit parti int...
2    plan deport terror suspect deal sought allow d...
3    howard rebut asylum critic tori leader michael...
4    ukip mep attack german empir independ parti me...
Name: text, dtype: object

# 3. Tạo mô hình Bag of Words sử dụng sklearn

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer capped at the 3000 most frequent terms. Every other
# argument that used to be spelled out here was the library default, so only
# the non-default setting is kept.
count_vector = CountVectorizer(max_features=3000)
print(count_vector)

# Learn the vocabulary from the cleaned documents.
count_vector.fit(df['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. tolist() keeps the displayed
# value a plain Python list of terms.
count_vector.get_feature_names_out().tolist()

CountVectorizer(max_features=3000)


['abandon',
 'abba',
 'abid',
 'abil',
 'abl',
 'abolish',
 'abroad',
 'absolut',
 'abus',
 'academ',
 'acceler',
 'accept',
 'access',
 'accid',
 'accommod',
 'accompani',
 'accord',
 'account',
 'accur',
 'accus',
 'achiev',
 'acknowledg',
 'acquir',
 'acquisit',
 'across',
 'act',
 'action',
 'activ',
 'activist',
 'actual',
 'ad',
 'adam',
 'adapt',
 'add',
 'addit',
 'address',
 'adequ',
 'administr',
 'admit',
 'adopt',
 'adult',
 'advanc',
 'advantag',
 'advert',
 'advertis',
 'advic',
 'advis',
 'advoc',
 'affair',
 'affect',
 'afford',
 'afghanistan',
 'afraid',
 'africa',
 'african',
 'age',
 'agenc',
 'agenda',
 'agent',
 'aggress',
 'ago',
 'agre',
 'agreement',
 'agricultur',
 'ahead',
 'aid',
 'aim',
 'air',
 'airlin',
 'airport',
 'alan',
 'alarm',
 'alastair',
 'album',
 'alcohol',
 'alert',
 'alien',
 'alleg',
 'alli',
 'allianc',
 'allow',
 'almost',
 'alon',
 'along',
 'alongsid',
 'alreadi',
 'also',
 'alter',
 'altern',
 'although',
 'alway',
 'amaz',
 'amazon',
 '

In [19]:
# Encode every document as a row of term counts over the fitted vocabulary,
# then expand the sparse result into a dense array.
doc_term_counts = count_vector.transform(df['text'])
doc_array = doc_term_counts.toarray()
print("Doc array: \n", doc_array)

Doc array: 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 0]]


In [16]:
# Label each count column with its vocabulary term. get_feature_names() was
# removed in scikit-learn 1.2, so its replacement get_feature_names_out() is
# used instead.
frequency_matrix = pd.DataFrame(doc_array, columns=count_vector.get_feature_names_out())
print(frequency_matrix)

      aah  aaniy  aaooooright  aathi  abbey  abdomen  abeg  abel  aberdeen  \
0       0      0            0      0      0        0     0     0         0   
1       0      0            0      0      0        0     0     0         0   
2       0      0            0      0      0        0     0     0         0   
3       0      0            0      0      0        0     0     0         0   
4       0      0            0      0      0        0     0     0         0   
...   ...    ...          ...    ...    ...      ...   ...   ...       ...   
5567    0      0            0      0      0        0     0     0         0   
5568    0      0            0      0      0        0     0     0         0   
5569    0      0            0      0      0        0     0     0         0   
5570    0      0            0      0      0        0     0     0         0   
5571    0      0            0      0      0        0     0     0         0   

      abi  ...  zebra  zed  zero  zhong  zindgi  zoe  zogtorius

Tách dataset thành tập train và test

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the documents for evaluation; the fixed random_state makes
# the split reproducible across runs.
features, labels = df['text'], df['category']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=.3, random_state=42
)

# Report how many documents ended up in each partition.
print('Number of rows in the total set: {}'.format(len(df)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))

Number of rows in the total set: 800
Number of rows in the training set: 560
Number of rows in the test set: 240


In [21]:
# Create a fresh CountVectorizer (this rebinds `count_vector`, replacing the
# 3000-feature vectorizer fitted earlier on the whole dataset)
count_vector = CountVectorizer()

# Fit on the training documents and transform them into a count matrix
training_data = count_vector.fit_transform(X_train)

# Transform the test documents into a count matrix using the same vocabulary.
# Note: the vectorizer is deliberately NOT re-fitted on the test set
testing_data = count_vector.transform(X_test)

# 4. Huấn luyện mô hình Naive Bayes

In [22]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training count matrix and
# its integer class labels.
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB()

In [23]:
# Predict a class label for every document in the test count matrix.
predictions = naive_bayes.predict(testing_data)

# 5. Đánh giá mô hình

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the held-out predictions against the true test labels.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print('Accuracy score: {}'.format(accuracy))
print('Precision score: {}'.format(precision))
print('Recall score: {}'.format(recall))
print('F1 score: {}'.format(f1))

Accuracy score: 0.9875
Precision score: 0.9912280701754386
Recall score: 0.9826086956521739
F1 score: 0.9868995633187774
