In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [4]:
import numpy as np

class MultinomialNaiveBayes():
    """Naive-Bayes-style linear classifier over non-negative count features.

    For every class, ``fit`` stores the log of the fraction of that class's
    training rows in which each feature is nonzero (plus ``1e-10`` so the log
    stays finite).  ``predict_proba`` scores a row as the dot product of its
    feature values with that log vector, and ``predict`` returns the label
    with the highest score.

    NOTE(review): no class prior is used and the per-feature estimate is a
    document frequency rather than a normalised term count, so despite the
    name this is not the textbook multinomial model -- confirm the intent.

    Attributes:
        prob_vec: dict mapping each class label to its 1-D log vector
            (one entry per feature column).
        classes: sorted array of the distinct labels seen by ``fit``
            (an empty list before fitting).
    """

    def __init__(self):
        self.prob_vec = {}
        self.classes = []

    def fit(self, X, y):
        """Estimate the per-class log vectors from training data.

        Args:
            X: 2-D array-like, one row per document and one column per feature.
            y: 1-D array-like of labels, one per row of ``X``.

        Raises:
            ValueError: if ``X`` is not 2-D or ``X`` and ``y`` disagree on the
                number of rows.
        """
        X = np.asarray(X)
        # A real ndarray makes ``y == c`` an element-wise mask for any label type.
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be 2-D (documents x features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same number of rows")

        self.classes = np.unique(y)
        # Start from scratch so a refit does not keep vectors of old labels.
        self.prob_vec = {}

        # 1.0 where a feature occurs in a document, 0.0 otherwise.
        occurrences = np.where(X != 0, 1.0, 0.0)
        for c in self.classes:
            rows = occurrences[y == c]
            # Fraction of the class's documents containing each feature; the
            # small constant keeps log() finite for features never seen.
            self.prob_vec[c] = np.log(np.sum(rows, axis=0) / rows.shape[0] + 1e-10)

    def predict_proba(self, X):
        """Score every row of ``X`` against every fitted class.

        Despite the name, the values are unnormalised log scores (dot product
        of the row with the class log vector), not probabilities.

        Args:
            X: 2-D array-like, one row per document, columns as in ``fit``.

        Returns:
            Array of shape (n_documents, n_classes); column ``i`` belongs to
            ``self.classes[i]``.
        """
        X = np.asarray(X, dtype=float)
        probs = np.zeros((X.shape[0], len(self.classes)))
        for i, k in enumerate(self.classes):
            probs[:, i] = np.dot(X, self.prob_vec[k])
        return probs

    def predict(self, X):
        """Return, for each row of ``X``, the label with the highest score."""
        return self.classes[np.argmax(self.predict_proba(X), axis=1)]

### Mock problem

In [5]:
# Toy corpus: 4 documents (rows) x 6 features (columns), two documents per class.
# np.float64 replaces the np.float alias, which was removed from NumPy.
X = np.array([[1, 2, 3, 0, 0, 0],
              [0, 1, 2, 0, 0, 1],
              [0, 0, 0, 2, 3, 1],
              [1, 0, 1, 2, 3, 0]], dtype=np.float64)
y = np.array([0, 0, 1, 1])

In [7]:
# Fit the classifier defined earlier in this notebook on the toy data.
# (The notebook defines MultinomialNaiveBayes; no BernoulliNaiveBayes exists.)
clf = MultinomialNaiveBayes()
clf.fit(X, y)

In [8]:
# Inspect the class labels stored on the fitted classifier.
clf.classes

array([0, 1])

In [9]:
# Inspect the per-class vectors stored on the fitted classifier (one entry per feature).
clf.prob_vec

{0: array([ -6.93147180e-01,   1.00000008e-10,   1.00000008e-10,
         -2.30258509e+01,  -2.30258509e+01,  -6.93147180e-01]),
 1: array([ -6.93147180e-01,  -2.30258509e+01,  -6.93147180e-01,
          1.00000008e-10,   1.00000008e-10,  -6.93147180e-01])}

In [10]:
# Per-class scores for the training rows: one row per document, one column per class.
clf.predict_proba(X)

array([[  -0.69314718,  -48.82429058],
       [  -0.69314718,  -25.10529247],
       [-115.82240183,   -0.69314718],
       [-115.82240183,   -1.38629436]])

In [11]:
# Predicted label for each training row; compare with y from the mock problem.
clf.predict(X)

array([0, 0, 1, 1])

### NY Times dataset

In [None]:
# Load the NYT front-page CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is relative to the notebook's working directory.
import pandas as pd
data=pd.read_csv('./files/Boydstun_NYT_FrontPage_Dataset_1996-2006_0.csv')
data.head()

In [None]:
import numpy as np
# Chronological hold-out: train the classifier on rows dated before 2004-01-01
# and test its performance on the remaining rows.
# pd.Timestamp replaces pd.datetime, which was removed from pandas.
split = pd.to_datetime(data['Date']) < pd.Timestamp(2004, 1, 1)

# Headlines are the raw text input; the 2-digit topic code is the label.
raw_data = data['Title']
raw_train = raw_data[split]
raw_test = raw_data[~split]
y = data['Topic_2digit']
y_train = y[split]
y_test = y[~split]

print ('Check the split sizes, train, test and total amount of data:')
print (raw_train.shape, raw_test.shape, raw_data.shape)
print ('Display the labels:')
print (np.unique(y))

In [None]:
# Let us tokenize the data
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts. A token must appear in at least two documents
# (min_df=2); English stop words such as 'and', 'the', 'of' are removed and
# accented characters are normalised.
vectorizer = CountVectorizer(
    min_df=2,
    stop_words='english',
    strip_accents='unicode',
)

# Example of the tokenization on the first training headline.
# .iloc is positional, so this does not depend on index label 0 being present.
test_string = raw_train.iloc[0]
print ("Example: " + test_string +"\n")
print ("Preprocessed: " + vectorizer.build_preprocessor()(test_string)+"\n")
print ("Tokenized:" + str(vectorizer.build_tokenizer()(test_string))+"\n")
print ("Analyzed data string:" + str(vectorizer.build_analyzer()(test_string))+"\n")


# Process and convert data: the vocabulary is learned on the training split
# only and then applied unchanged to the test split.
X_train = vectorizer.fit_transform(raw_train)
X_test = vectorizer.transform(raw_test)

# Feature names in column order, taken from the fitted vocabulary_ mapping
# (token -> column index). get_feature_names() was removed from scikit-learn,
# whereas vocabulary_ is available in old and new versions.
vocabulary = vectorizer.vocabulary_
feature_names = sorted(vocabulary, key=vocabulary.get)

print ("Number of tokens: " + str(len(feature_names)) +"\n")
print ("Extract of tokens:")
print (feature_names[1000:1100])

In [None]:
def _to_dense_float(features):
    """Return ``features`` as a dense float64 ndarray.

    Objects exposing ``toarray()`` (as the vectorizer output does) are
    densified through it; anything else is passed to ``np.asarray``, so
    re-running this cell on already dense data is harmless.
    """
    dense = features.toarray() if hasattr(features, "toarray") else features
    return np.asarray(dense, dtype=np.float64)


# Plain ndarrays instead of the np.matrix objects produced by .todense(), and
# np.float64 instead of the np.float alias that was removed from NumPy.
X_train = _to_dense_float(X_train)
X_test = _to_dense_float(X_test)
y_train = np.asarray(y_train)

### Our Multinomial NaiveBayes

In [None]:
# Train the notebook's own classifier on the training split and predict
# a label for every test row.
clf = MultinomialNaiveBayes()
clf.fit(X_train,y_train)
y_hat = clf.predict(X_test)

from sklearn import metrics
import matplotlib.pyplot as plt
def plot_confusion_matrix(y_pred, y):
    """Show the confusion matrix of ``y`` against ``y_pred`` as a grey-scale image.

    Args:
        y_pred: predicted labels.
        y: true labels.

    The matrix is drawn on the current pyplot figure with a colour bar, the
    vertical axis labelled 'true value' and the horizontal axis labelled
    'predicted value'; the figure is then resized to 9 x 9 inches.
    """
    matrix = metrics.confusion_matrix(y, y_pred)
    plt.imshow(matrix, interpolation='nearest', cmap='gray')
    plt.colorbar()
    plt.xlabel('predicted value')
    plt.ylabel('true value')
    plt.gcf().set_size_inches(9, 9)
    
# scikit-learn metrics expect (y_true, y_pred). Accuracy is symmetric, but the
# classification report is not: swapping the arguments swaps precision and recall.
print ("classification accuracy:", metrics.accuracy_score(y_test, y_hat))
plot_confusion_matrix(y_hat, y_test)
print ("Classification Report:")
print (metrics.classification_report(np.asarray(y_test), y_hat))

### Result of sklearn Multinomial

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Reference model: scikit-learn's multinomial naive Bayes trained on the same
# split. fit() returns the estimator, so construction and training are chained.
nb = MultinomialNB().fit(X_train, y_train)
y_hat = nb.predict(X_test)


from sklearn import metrics
import matplotlib.pyplot as plt
# NOTE(review): this repeats the plot_confusion_matrix definition from the
# previous section; consider defining the helper once.
def plot_confusion_matrix(y_pred, y):
    """Show the confusion matrix of ``y`` against ``y_pred`` as a grey-scale image.

    Args:
        y_pred: predicted labels.
        y: true labels.

    The matrix is drawn on the current pyplot figure with a colour bar, the
    vertical axis labelled 'true value' and the horizontal axis labelled
    'predicted value'; the figure is then resized to 9 x 9 inches.
    """
    matrix = metrics.confusion_matrix(y, y_pred)
    plt.imshow(matrix, interpolation='nearest', cmap='gray')
    plt.colorbar()
    plt.xlabel('predicted value')
    plt.ylabel('true value')
    plt.gcf().set_size_inches(9, 9)
    
# scikit-learn metrics expect (y_true, y_pred). Accuracy is symmetric, but the
# classification report is not: swapping the arguments swaps precision and recall.
print ("classification accuracy:", metrics.accuracy_score(y_test, y_hat))
plot_confusion_matrix(y_hat, y_test)
print ("Classification Report:")
print (metrics.classification_report(np.asarray(y_test), y_hat))