### Different Naive Bayes Classifiers to Predict Categories (Machine Learning vs. Business Software)

In [1]:
import os

# Show the kernel's working directory. A plain function call is used instead of
# the bare `pwd` automagic, which only works when IPython automagic is enabled
# and no variable named `pwd` exists.
os.getcwd()

'/home/jovyan/ipynb'

In [2]:
from os import chdir, environ

# Switch the working directory to the project root (presumably so the `library`
# package imported in the next cell resolves -- confirm).
# The root can be overridden with the NB_PROJECT_ROOT environment variable; the
# default is the original hard-coded location, so existing setups are unaffected.
# An absolute target keeps this cell idempotent when it is re-run.
PROJECT_ROOT = environ.get('NB_PROJECT_ROOT', '/home/jovyan/')
chdir(PROJECT_ROOT)

In [3]:
import library.db_helper as db
import library.functions as fy

In [4]:
import pandas as pd
import numpy as np

#### Set up a train/test split

In [5]:
from sklearn.model_selection import train_test_split

In [8]:
# Load the `text` column of every row in the `page` table through the project
# DB helper; the resulting single-column frame is the model input.
# NOTE(review): the query has no ORDER BY and the labels are fetched by a
# separate query on `category_page`, so row i of X is only assumed to line up
# with row i of y -- confirm, or fetch both columns with one joined query.
text_query = '''
SELECT text
FROM page
'''
X = db.query_to_dataframe(text_query)
X.shape

(2449, 1)

In [26]:
# Check that flattening the single-column frame gives a 1-D array (one entry per row).
X.values.ravel().shape

(2449,)

In [12]:
# Load the `category_cid` column of `category_page`; it is used below as the
# classification target.
# NOTE(review): this query is independent of the text query on `page` and has
# no ORDER BY, so the pairing of texts and labels relies on both queries
# returning rows in the same order -- confirm, or fetch text and label together
# in a single joined query.
catid_query = '''
SELECT category_cid
FROM category_page
'''
y = db.query_to_dataframe(catid_query)

# Fail fast if features and labels cannot be paired one-to-one. Equal counts are
# necessary but not sufficient for correct alignment.
assert len(y) == len(X), (
    'label rows (%d) do not match text rows (%d)' % (len(y), len(X))
)
y.shape

(2449, 1)

In [24]:
# Check that flattening the single-column label frame gives a 1-D array (one entry per row).
y.values.ravel().shape

(2449,)

In [18]:
# Split the flattened texts and labels into train and test sets (default
# test size).
# - random_state pins the shuffle so the split, and every number derived from
#   it, is reproducible under Restart & Run All.
# - stratify keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X.values.ravel(),
    y.values.ravel(),
    random_state=42,
    stratify=y.values.ravel(),
)

#### Build a Pipeline for a Bernoulli NB Classifier

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

In [27]:
# Bernoulli naive Bayes pipeline:
#   cvt - unigram + bigram counts, keeping terms found in at least 2 documents
#   svd - projection of the count matrix onto 400 components
#   clf - Bernoulli naive Bayes on the projected features
# NOTE(review): BernoulliNB models binary features and, with its default
# binarize=0.0, thresholds its input at zero, so after the SVD step it only sees
# the sign of each component. Confirm this is intended; feeding it the
# (binarized) counts directly is the more usual setup.
bnnb_pipeline = Pipeline([
    ('cvt', CountVectorizer(min_df=2, ngram_range=(1,2))),
    # TruncatedSVD uses a randomized solver by default; seed it so repeated
    # fits give the same components.
    ('svd', TruncatedSVD(n_components=400, random_state=42)),
    ('clf', BernoulliNB())
])

In [28]:
# Fit every pipeline stage (vectorizer, SVD, classifier) on the training split only.
bnnb_pipeline.fit(X_train, y_train)

Pipeline(steps=[('cvt', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_...te=None, tol=0.0)), ('clf', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))])

In [29]:
# Predicted class labels for the training documents themselves: an in-sample
# sanity check, not a held-out evaluation (X_test is not used here).
bnnb_pipeline.predict(X_train)

array([0, 0, 0, ..., 0, 0, 0])

In [35]:
# Per-class probability estimates for the training documents (in-sample, as above
# these are not held-out scores).
bnnb_pipeline.predict_proba(X_train)

array([[  9.16353495e-01,   8.36465053e-02],
       [  9.99755189e-01,   2.44811251e-04],
       [  9.89486672e-01,   1.05133281e-02],
       ..., 
       [  9.74827255e-01,   2.51727454e-02],
       [  9.91542148e-01,   8.45785194e-03],
       [  9.84962437e-01,   1.50375628e-02]])

#### Build a Pipeline for a Gaussian NB Classifier

In [41]:
# Gaussian naive Bayes pipeline:
#   tfdif - TF-IDF weights over unigrams + bigrams found in at least 3 documents
#           (step name kept as-is; `tfdif__*` parameter names derive from it)
#   svd   - projection onto 300 dense components
#   clf   - Gaussian naive Bayes on the projected features
gnb_pipeline = Pipeline([
    ('tfdif', TfidfVectorizer(min_df=3, ngram_range=(1,2))),
    # TruncatedSVD uses a randomized solver by default; seed it so repeated
    # fits give the same components.
    ('svd', TruncatedSVD(n_components=300, random_state=42)),
    ('clf', GaussianNB())
])

In [None]:
# Fit the TF-IDF -> SVD -> GaussianNB pipeline on the training split.
gnb_pipeline.fit(X_train, y_train)