In [1]:
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

import pandas as pd

In [2]:
# Fetch the 20 newsgroups corpus using fetch_20newsgroups' default arguments.
# Later cells read its `.data` (documents) and `.target` (labels) attributes.
twenty_train = fetch_20newsgroups()

In [3]:
def processText(raw_texts):
    """Tokenize raw documents into lower-cased word lists without stop words.

    Each document is split into maximal runs of word characters, every token
    is lower-cased, and tokens found in NLTK's English stop-word list are
    dropped.

    Parameters
    ----------
    raw_texts : iterable of str
        The documents to process.

    Returns
    -------
    list of list of str
        One token list per input document, in input order. A document with no
        surviving tokens yields an empty list.
    """
    # Membership is tested once per token, so keep the stop words in a set
    # (constant-time lookup) instead of scanning a list every time.
    stop_words = frozenset(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')

    processed = []
    for raw in raw_texts:
        lowered = (token.lower().strip() for token in tokenizer.tokenize(raw))
        processed.append([w for w in lowered if w and w not in stop_words])
    return processed

In [4]:
def splitData(raw_features, raw_targets, test_fraction=0.2, max_iter=20):
    """Randomly split the data, retrying until both splits contain every class.

    Parameters
    ----------
    raw_features : sequence
        Examples to split.
    raw_targets : sequence
        Class label of each example, aligned with ``raw_features``.
    test_fraction : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    max_iter : int, default 20
        Maximum number of random splits to try.

    Returns
    -------
    tuple
        ``(X_train_raw, X_test_raw, y_train, y_test)``.

        * If some attempt puts every class in both splits, that split is
          returned.
        * Otherwise, if the last attempt's training set contains every class,
          that last split is returned and a warning is printed.
        * Otherwise the entire data set is returned as both the training and
          the test set and a warning is printed.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError('max_iter must be at least 1')

    n_classes = len(np.unique(raw_targets))

    for _ in range(max_iter):
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            raw_features, raw_targets, test_size=test_fraction)

        # Compare the *number* of distinct labels with the number of classes.
        # (Taking len() of the element-wise comparison is always truthy for a
        # non-empty split and would never detect a missing class.)
        train_done = len(np.unique(y_train)) == n_classes
        test_done = len(np.unique(y_test)) == n_classes

        if train_done and test_done:
            return X_train_raw, X_test_raw, y_train, y_test

    # No attempt produced a split with all classes on both sides.
    if train_done:
        print('test set has missing classes')
        return X_train_raw, X_test_raw, y_train, y_test

    print('both train and test sets have missing classes. use entier data instead')
    return raw_features, raw_features, raw_targets, raw_targets
        

In [5]:
def fitFeatures(X_train_raw):
    """Build the vocabulary of the training examples and its index mappings.

    Parameters
    ----------
    X_train_raw : iterable of iterable
        Tokenized training examples; tokens must be hashable.

    Returns
    -------
    feature2idx : dict
        Maps each distinct token to its position in the sorted vocabulary.
    idx2feature : dict
        The inverse mapping, position -> token.

    Notes
    -----
    Tokens that are ``None``, the ``np.nan`` object, or the empty string are
    excluded from the vocabulary.
    """
    # Accumulate into a single set instead of rebuilding a list/set pair for
    # every example, which made vocabulary construction quadratic.
    vocabulary = set()
    for train_example in X_train_raw:
        vocabulary.update(train_example)

    features = sorted(
        feat for feat in vocabulary
        if feat is not None and feat is not np.nan and feat != ''
    )

    feature2idx = {feat: i for i, feat in enumerate(features)}
    idx2feature = {i: feat for i, feat in enumerate(features)}
    return feature2idx, idx2feature

In [6]:
def rawTokens2Vector(raw_example, features2idx):
    """Encode one tokenized example as a binary presence vector.

    Parameters
    ----------
    raw_example : iterable
        Tokens of a single example.
    features2idx : mapping
        Maps a token to its position in the output vector. Positions are
        assumed to lie in ``range(len(features2idx))``, as produced by
        ``fitFeatures`` in this notebook.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(features2idx)`` holding 1.0 at the
        position of every token present in the example and 0.0 elsewhere.
        Tokens missing from ``features2idx`` are ignored; an empty example
        yields an all-zero vector.
    """
    # Size the vector from the mapping that was passed in, not from a
    # similarly named notebook-level global.
    vec = np.zeros(len(features2idx))
    for word in raw_example:
        if word in features2idx:
            vec[features2idx[word]] = 1.0
    return vec

In [7]:
def encodeFeatures(X_train_raw, X_test_raw, feature2idx):
    """Vectorize the tokenized train and test examples with one vocabulary.

    Every example of both collections is passed through ``rawTokens2Vector``
    together with ``feature2idx``, and the resulting vectors are stacked with
    ``np.array``.

    Parameters
    ----------
    X_train_raw, X_test_raw : iterable
        Tokenized training and test examples.
    feature2idx : mapping
        Token -> column index mapping handed to ``rawTokens2Vector``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test)``, one row per example in input order.
    """
    encoded_train = np.array(
        [rawTokens2Vector(tokens, feature2idx) for tokens in X_train_raw]
    )
    encoded_test = np.array(
        [rawTokens2Vector(tokens, feature2idx) for tokens in X_test_raw]
    )
    return encoded_train, encoded_test

In [8]:
# End-to-end preprocessing: tokenize the documents, split them into train and
# test sets, build the vocabulary from the training split only, then encode
# both splits with that vocabulary.
# NOTE(review): splitData passes no random_state to train_test_split, so the
# split (and every number below) changes from run to run.
raw_features = processText(twenty_train.data)    
raw_targets = twenty_train.target
# twenty_train.target_names
X_train_raw, X_test_raw, y_train, y_test = splitData(raw_features, raw_targets)
feature2idx, idx2feature = fitFeatures(X_train_raw)
X_train, X_test = encodeFeatures(X_train_raw, X_test_raw, feature2idx)

### Test scikit-learn Naive Bayes

In [9]:
from sklearn.naive_bayes import MultinomialNB

In [10]:
# Reference model: scikit-learn's MultinomialNB constructed with its default
# arguments and fitted on the encoded training data.
sk_model = MultinomialNB()
sk_model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
# Accuracy = fraction of predictions that equal the true label, computed
# separately on the training and the test split for the scikit-learn model.
accuracy_train = np.mean(sk_model.predict(X_train) == y_train)
accuracy_test = np.mean(sk_model.predict(X_test) == y_test)
print('training accuracy: {0:.3f}, testing accuracy: {1:.3f}'.format(accuracy_train, accuracy_test))

training accuracy: 0.965, testing accuracy: 0.866


In [12]:
# Peek at the first five rows and first three columns of the scikit-learn
# model's predict_proba output on the training data.
sk_model.predict_proba(X_train)[0:5, 0:3]

array([[  1.74000968e-30,   1.12329426e-22,   5.11748499e-29],
       [  6.20388949e-16,   3.91096442e-10,   8.11245011e-17],
       [  6.42836867e-56,   2.45377937e-58,   8.96586584e-71],
       [  1.59216891e-41,   8.40260471e-18,   1.56529743e-22],
       [  1.80079182e-21,   2.78290537e-05,   1.08841080e-04]])

### Test personal package

In [13]:
from importlib import reload

In [14]:
# Import the local naive_bayes module and reload it right away, so that
# re-running this cell re-executes the module before MultinomialNaiveBayes is
# (re)bound from it.
import naive_bayes
reload(naive_bayes)
from naive_bayes import MultinomialNaiveBayes

In [15]:
# Fit the MultinomialNaiveBayes class from the local naive_bayes module on the
# same encoded training data that the scikit-learn model was fitted on.
my_model = MultinomialNaiveBayes()
my_model.fit(X_train, y_train)

In [16]:
# Same accuracy computation as for the scikit-learn model, now for my_model.
# This overwrites accuracy_train / accuracy_test from the earlier cell.
accuracy_train = np.mean(my_model.predict(X_train) == y_train)
accuracy_test = np.mean(my_model.predict(X_test) == y_test)
print('training accuracy: {0:.3f}, testing accuracy: {1:.3f}'.format(accuracy_train, accuracy_test))

training accuracy: 0.965, testing accuracy: 0.866


In [17]:
# Same 5x3 slice of predict_proba as shown for the scikit-learn model, so the
# two outputs can be compared side by side.
my_model.predict_proba(X_train)[0:5, 0:3]

array([[  1.74000968e-30,   1.12329426e-22,   5.11748499e-29],
       [  6.20388949e-16,   3.91096442e-10,   8.11245011e-17],
       [  6.42836867e-56,   2.45377937e-58,   8.96586584e-71],
       [  1.59216891e-41,   8.40260471e-18,   1.56529743e-22],
       [  1.80079182e-21,   2.78290537e-05,   1.08841080e-04]])