In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Relative path of the CSV file that is read into `df` below.
DATA_PATH = 'data/bbc-text.csv'

## Data overview

In [None]:
# Load the dataset and preview ten random rows.
df = pd.read_csv(DATA_PATH)
# A fixed seed keeps the preview identical across kernel restarts
# (same seed value as the train/test split further down).
df.sample(10, random_state=42)

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
df.describe(include='all')

In [None]:
# Number of rows for each distinct value of the 'category' column.
df['category'].value_counts()

In [None]:
# Raw text of the row labelled 0, shown as an example document.
df.loc[0, 'text']

In [None]:
# X holds the 'text' column and Y the 'category' column, both as NumPy arrays.
X, Y = (df[column].to_numpy() for column in ('text', 'category'))

In [None]:
# Distinct label values present in Y (np.unique returns them sorted).
np.unique(Y)

In [None]:
from sklearn.model_selection import train_test_split

# An integer test_size is an absolute sample count: 200 rows are held out for testing.
# The fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=200,
    random_state=42,
)
# Shapes in the order: train inputs, train labels, test inputs, test labels.
print(*(part.shape for part in (x_train, y_train, x_test, y_test)))

## Feature extraction

### Using the sklearn library

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from pprint import pprint
from time import time
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Binary bag-of-words features (presence/absence of a term, not its count).
vectorizer = CountVectorizer(
    lowercase=True,
    analyzer='word',
    binary=True,
    stop_words='english',
    max_df=0.75,
    max_features=500,
    ngram_range=(1, 2),
)
# .toarray() yields a plain ndarray. .todense() would return np.matrix, which
# recent scikit-learn releases refuse as estimator input.
x_train_vec = vectorizer.fit_transform(x_train).toarray()
x_train_vec

In [None]:
# Show the effective stop-word collection the vectorizer is using.
vectorizer.get_stop_words()

In [None]:
# Feature vectors of the first two training documents, as a plain ndarray
# (.toarray()) rather than the legacy np.matrix returned by .todense().
vectorizer.transform(x_train[:2]).toarray()

In [None]:
# Encode the test documents with the vocabulary learned from the training set.
# .toarray() gives an ndarray; the np.matrix produced by .todense() is rejected
# by recent scikit-learn releases when passed to an estimator.
x_test_vec = vectorizer.transform(x_test).toarray()

In [None]:
def train_and_test(clf, x_train, y_train, x_test, y_test):
    """Fit `clf` on the training data and print its train and test scores.

    Args:
        clf: Object exposing `fit(x, y)` and `score(x, y)`.
        x_train: Training inputs passed to `fit` and to the first `score` call.
        y_train: Training targets.
        x_test: Test inputs passed to the second `score` call.
        y_test: Test targets.

    Returns:
        None. The classifier's repr and both scores are written to stdout.
    """
    clf.fit(x_train, y_train)
    print(clf)

    train_accuracy = clf.score(x_train, y_train)
    print(f'Train set accuracy: {train_accuracy}')

    test_accuracy = clf.score(x_test, y_test)
    print(f'Test set accuracy: {test_accuracy}')

In [None]:
# Candidate Naive Bayes classifiers, each left at its default settings.
clfs = [
    MultinomialNB(),
    BernoulliNB(),
    CategoricalNB(),
]

BernoulliNB and CategoricalNB should produce the same results, since all features in the data are binary.

In [None]:
# Fit and score every candidate on the same train/test matrices,
# printing a divider line after each classifier's report.
for clf in clfs:
    train_and_test(clf, x_train_vec, y_train, x_test_vec, y_test)
    print('---------------------')

## Build a pipeline to find the best hyperparameters

In [None]:
def grid_search(pipeline, parameters, x_train, y_train):
    """Grid-search `parameters` over `pipeline` and return the best estimator.

    Runs `GridSearchCV` with its default cross-validation and scoring, and
    prints the pipeline step names, the parameter grid, the elapsed fitting
    time, the best score, and the winning value of each tuned parameter.

    Args:
        pipeline: Estimator exposing `.steps` (e.g. a scikit-learn Pipeline).
        parameters: Parameter grid handed to `GridSearchCV`.
        x_train: Training inputs.
        y_train: Training targets.

    Returns:
        The `best_estimator_` of the fitted search.
    """
    # Find the best parameters for both the feature extraction and the
    # classifier. Named `search` so the local does not shadow this function.
    search = GridSearchCV(pipeline, parameters, verbose=1)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    search.fit(x_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print("Best score: %0.3f" % search.best_score_)
    print("Best parameters set:")
    # Report only the tuned parameters. Dumping get_params() of the whole
    # pipeline would bury them among every untouched default.
    for param_name, value in sorted(search.best_params_.items()):
        print("\t%s: %r" % (param_name, value))
    return search.best_estimator_

In [22]:
# Search space for the pipeline step named 'vectorizer'
# (keys follow the '<step>__<param>' convention).
parameters = dict(
    vectorizer__stop_words=(None, 'english'),
    vectorizer__max_df=(0.5, 0.75, 1.0),
    vectorizer__max_features=(None, 100, 300, 500, 1000, 2000),
    # (1, 1): unigrams only; (1, 2): unigrams plus bigrams
    vectorizer__ngram_range=((1, 1), (1, 2)),
)

In [None]:
# Binary bag-of-words -> dense array -> Bernoulli Naive Bayes.
bernoulli_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(lowercase=True, analyzer='word', binary=True)),
    # .toarray() returns an ndarray; .todense() would hand np.matrix to the
    # classifier, which recent scikit-learn releases reject.
    # NOTE(review): BernoulliNB also accepts sparse input, so this step could
    # probably be dropped to save memory on large vocabularies; confirm.
    ('todense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
    ('clf', BernoulliNB())
])

# Tune the vectorizer settings, then score the winning pipeline on the test split.
best_bernoulli_clf = grid_search(bernoulli_pipeline, parameters, x_train, y_train)
best_bernoulli_clf.score(x_test, y_test)

In [None]:
# Same feature extraction as the Bernoulli pipeline, with a Multinomial Naive Bayes
# classifier. It gets its own names so that `bernoulli_pipeline` and
# `best_bernoulli_clf` are not overwritten by this cell.
# NOTE(review): binary=True means MultinomialNB sees presence/absence rather than
# term counts; confirm this is intended.
multinomial_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(lowercase=True, analyzer='word', binary=True)),
    # .toarray() returns an ndarray; .todense() would hand np.matrix to the
    # classifier, which recent scikit-learn releases reject.
    ('todense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
    ('clf', MultinomialNB())
])

# Tune the vectorizer settings, then score the winning pipeline on the test split.
best_multinomial_clf = grid_search(multinomial_pipeline, parameters, x_train, y_train)
best_multinomial_clf.score(x_test, y_test)