# 3.32 Modeling: Naive Bayes

In [56]:
# load the data
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test

In [57]:
# import Naive Bayes models and model-selection utilities
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

### Create Lemmatize and Tokenize rules

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 

In [59]:
# tokenize with a regular expression, then lemmatize each token with WordNet
# pass this tokenizer to CountVectorizer so it tokenizes and lemmatizes; stopwords are excluded via its stop_words option
# cvec = CountVectorizer(tokenizer=LemmaTokenizer()) 
class LemmaTokenizer(object):
    """Callable that splits a document into word tokens and lemmatizes them.

    Meant to be passed as the ``tokenizer`` argument of a scikit-learn text
    vectorizer, e.g. ``CountVectorizer(tokenizer=LemmaTokenizer())``.
    """

    def __init__(self):
        # Build both helpers once per instance; the tokenizer used to be
        # re-created for every single document inside __call__.
        self.wnl = WordNetLemmatizer()
        # A token is a run of two or more word characters (Unicode-aware).
        self.tokenizer = RegexpTokenizer(r'(?u)\b\w\w+\b')

    def __call__(self, doc):
        """Return the list of lemmatized tokens extracted from ``doc``."""
        return [self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc)]

In [60]:
# inspect the shape of the training data
X_train.shape

(1474,)

### CountVectorizer

In [61]:
# Pipeline: CountVectorizer (using the lemmatizing tokenizer) feeding MultinomialNB
pipeline_cv = Pipeline(steps=[
    ('cvec', CountVectorizer(tokenizer=LemmaTokenizer())),
    ('mnb', MultinomialNB()),
])

# Hyper-parameter grid for the CountVectorizer step
pipe_params = dict(
    cvec__max_features=[300, 500, 1000],
    cvec__min_df=[2, 3],
    cvec__max_df=[.9, .95],
    cvec__ngram_range=[(1, 1), (1, 2)],
    cvec__stop_words=[None, 'english'],
)

In [62]:
# Exhaustive 5-fold cross-validated search over the CountVectorizer grid
gs_cv = GridSearchCV(
    estimator=pipeline_cv,
    param_grid=pipe_params,
    n_jobs=4,
    cv=5,
    verbose=1,
)

# fit() hands back the fitted search object
mnb_cv = gs_cv.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed:   10.8s finished
  'stop_words.' % sorted(inconsistent))


In [63]:
# Hyper-parameter combination selected by the grid search
# (cross-validated on the training data):
mnb_cv.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 1000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [64]:
# keep a handle on the best pipeline found by the grid search
best_mnb_cv = mnb_cv.best_estimator_

# report the training score, the best cross-validation score and the test score
print(
    f"training score is {best_mnb_cv.score(X_train, y_train)}",
    f"cross validation score is {mnb_cv.best_score_}",
    f"test score is {best_mnb_cv.score(X_test, y_test)}",
    sep="\n",
)

training score is 0.9613297150610584
cross validation score is 0.9355495251017639
test score is 0.9410569105691057


### Tf-Idf Vectorizer - MultinomialNB

In [82]:
# TfidfVectorizer is otherwise only imported in a later cell of this notebook,
# so import it here to keep this cell runnable on a fresh kernel.
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate Pipeline
# TfidfVectorizer (using the lemmatizing tokenizer) feeding MultinomialNB
pipeline_tfidf = Pipeline([('tfidf', TfidfVectorizer(tokenizer=LemmaTokenizer())),
                           ('mnb', MultinomialNB())
                          ])
# Hyper-parameter grid for the TfidfVectorizer step
pipe_params = {
    'tfidf__max_features': [300, 500, 1000],
    'tfidf__min_df': [2, 3],
    'tfidf__max_df': [.9, .95],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__stop_words': [None, 'english']
}

In [83]:
# Exhaustive 5-fold cross-validated search over the TF-IDF grid
gs_tfidf = GridSearchCV(
    estimator=pipeline_tfidf,
    param_grid=pipe_params,
    n_jobs=4,
    cv=5,
    verbose=1,
)

# fit() hands back the fitted search object
mnb_tfidf = gs_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    9.8s
[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed:   11.6s finished
  'stop_words.' % sorted(inconsistent))


In [84]:
# Hyper-parameter combination selected by the grid search
# (cross-validated on the training data):
mnb_tfidf.best_params_

{'tfidf__max_df': 0.9,
 'tfidf__max_features': 1000,
 'tfidf__min_df': 3,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english'}

In [85]:
# keep a handle on the best pipeline found by the grid search
best_mnb_tfidf = mnb_tfidf.best_estimator_

# report the training score, the best cross-validation score and the test score
print(
    f"training score is {best_mnb_tfidf.score(X_train, y_train)}",
    f"cross validation score is {mnb_tfidf.best_score_}",
    f"test score is {best_mnb_tfidf.score(X_test, y_test)}",
    sep="\n",
)

training score is 0.9640434192672999
cross validation score is 0.9308005427408412
test score is 0.9288617886178862


### Tf-Idf Vectorizer - GaussianNB

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin

In [71]:
class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Used in this notebook between the TF-IDF vectorizer and GaussianNB so
    that the classifier receives dense input.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` converted to a dense ``numpy.ndarray``."""
        # toarray() yields a plain ndarray, whereas todense() yields the legacy
        # numpy.matrix type that recent scikit-learn versions refuse as input.
        return X.toarray()

In [72]:
# Pipeline: TfidfVectorizer -> dense conversion -> GaussianNB
# NOTE: this re-binds pipeline_tfidf and pipe_params from the MultinomialNB section
pipeline_tfidf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(tokenizer=LemmaTokenizer())),
    ('toDense', DenseTransformer()),
    ('gnb', GaussianNB()),
])

# Hyper-parameter grid for the TfidfVectorizer step
pipe_params = dict(
    tfidf__max_features=[300, 500, 1000],
    tfidf__min_df=[2, 3],
    tfidf__max_df=[.9, .95],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__stop_words=[None, 'english'],
)

In [74]:
# Exhaustive 5-fold cross-validated search over the TF-IDF grid (GaussianNB pipeline)
gs_tfidf = GridSearchCV(
    estimator=pipeline_tfidf,
    param_grid=pipe_params,
    n_jobs=4,
    cv=5,
    verbose=1,
)

# fit() hands back the fitted search object
gnb_tfidf = gs_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    9.6s
[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed:   11.6s finished


In [75]:
# Hyper-parameter combination selected by the grid search
# (cross-validated on the training data):
gnb_tfidf.best_params_

{'tfidf__max_df': 0.9,
 'tfidf__max_features': 1000,
 'tfidf__min_df': 2,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': None}

In [79]:
# keep a handle on the best pipeline found by the grid search
best_gnb_tfidf = gnb_tfidf.best_estimator_

# report the training score, the best cross-validation score and the test score
print(
    f"training score is {best_gnb_tfidf.score(X_train, y_train)}",
    f"cross validation score is {gnb_tfidf.best_score_}",
    f"test score is {best_gnb_tfidf.score(X_test, y_test)}",
    sep="\n",
)

training score is 0.9362279511533242
cross validation score is 0.8758480325644504
test score is 0.8455284552845529


### Inspect feature probabilities to see which words carry the most weight

Since the MultinomialNB models built on CountVectorizer (cv) and on TF-IDF perform similarly, I'll use the CountVectorizer model for further analysis.

In [20]:
# rebuild the CountVectorizer with the best parameters reported by the grid search
cvec = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    stop_words='english',
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.9,
    max_features=1000,
)

In [21]:
# fit the CountVectorizer on the training text and vectorize it in one step
X_train_cvec= cvec.fit_transform(X_train)

  'stop_words.' % sorted(inconsistent))


In [22]:
# instantiate a Multinomial Naive Bayes model with default settings
# and fit it on the vectorized training data
mnb = MultinomialNB()
mnb.fit(X_train_cvec, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [34]:
# convert X_train_cvec to a dataframe with one column per vocabulary term
import pandas as pd

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() when it exists and fall back on older versions.
_get_feature_names = getattr(cvec, "get_feature_names_out", None) or cvec.get_feature_names

cvec_words = pd.DataFrame(X_train_cvec.toarray(), columns=_get_feature_names())

In [40]:
# build a table of per-class word probabilities: one row per vocabulary term,
# one column per class (in the order the fitted model stores them)
import numpy as np

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() when it exists and fall back on older versions.
_get_feature_names = getattr(cvec, "get_feature_names_out", None) or cvec.get_feature_names

# feature_log_prob_ holds log-probabilities, so exponentiate before tabulating
cv_table_nb = pd.DataFrame(np.exp(mnb.feature_log_prob_), columns=_get_feature_names()).T

In [41]:
# show the 10 words with the highest probability in column 1 (class 1)
cv_table_nb.sort_values(by=1, ascending=False).head(10)

Unnamed: 0,0,1
spacex,0.002912,0.051804
launch,0.016618,0.029645
falcon,0.001199,0.022458
twitter,0.004797,0.014224
dragon,0.000171,0.013176
starship,0.000343,0.012726
heavy,0.001542,0.01108
elon,0.000343,0.01063
crew,0.001885,0.009582
starlink,0.000171,0.009283


In [42]:
# show the 10 words with the highest probability in column 0 (class 0)
cv_table_nb.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0,1
blue,0.056707,0.000599
origin,0.048655,0.000449
new,0.043687,0.006288
space,0.018331,0.007336
shepard,0.017303,0.00015
bezos,0.017303,0.000449
launch,0.016618,0.029645
jeff,0.015076,0.000449
rocket,0.01422,0.004941
glenn,0.013706,0.00015


In [43]:
# persist the probability table with %store so it can be restored later for visualization
%store cv_table_nb

Stored 'cv_table_nb' (DataFrame)
