# Naive Bayesian - Multinomial

In [1]:
import numpy as np
import matplotlib.pyplot as plt

$$
P(y|w) = \frac{P(w|y)P(y)}{P(w)}
$$

$$ P(w_i \in train \mid y=k) = \frac{count(w_i, k)}{\sum_{j=1}^{n} count(w_j, k)} $$
    
Example:

| | docID  | words in doc    | China?   |    
|---:|:-------------|:-----------|:------|
| Training set | 1  | Chinese Beijing Chinese       | Yes   |
|  | 2  | Chinese Chinese Shanghai    | Yes  |
|  | 3  | Chinese Macao    | Yes   |
|  | 4  | Tokyo Japan Chinese   | No   |
| Test set | 5  | Chinese Chinese Chinese Tokyo Japan    | ?   |


In [2]:
# Labelled training documents (docIDs 1-4 in the example table).
train = np.array(
    ['Chinese Beijing Chinese',     # doc 1
     'Chinese Chinese Shanghai',    # doc 2
     'Chinese Macao',               # doc 3
     'Tokyo Japan Chinese']         # doc 4
)

# One label per training document, in the same order as `train`:
# 1 = "China? Yes", 0 = "China? No".
train_target = np.array([1, 1, 1, 0])

# The single unlabelled document (docID 5) that we want to classify.
test = np.array(['Chinese Chinese Chinese Tokyo Japan'])

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training documents and encode every document
# as a row of per-word counts (bag of words).
vectorizer = CountVectorizer()
X_train    = vectorizer.fit_transform(train)

In [4]:
# The learned vocabulary; the columns of X_train follow this order.
print("Feature names:", vectorizer.get_feature_names_out())

Feature names: ['beijing' 'chinese' 'japan' 'macao' 'shanghai' 'tokyo']


In [5]:
# fit_transform returned a SciPy sparse matrix (see output), not a dense array.
print("Type:", type(X_train))

Type: <class 'scipy.sparse._csr.csr_matrix'>


In [6]:
# Densify for display only: one row per document, one column per vocabulary word.
print(X_train.toarray())

[[1 2 0 0 0 0]
 [0 2 0 0 1 0]
 [0 1 0 1 0 0]
 [0 1 1 0 0 1]]


In [7]:
def likelihood(X_class):
    """Unsmoothed estimate of P(w_i | y=k) for a single class.

    X_class holds the count rows of the documents that belong to one class.
    Each word's total count within the class is divided by the total number
    of word occurrences in that class. No smoothing is applied, so a word
    that never occurs in the class gets probability 0.
    """
    per_word_totals = X_class.sum(axis=0)
    return per_word_totals / np.sum(per_word_totals)

In [8]:
# Boolean-mask the rows: keep only the training documents labelled 1.
X_train_class1 = X_train[train_target==1]
X_train_class1

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [9]:
# Boolean-mask the rows: keep only the training documents labelled 0.
X_train_class0 = X_train[train_target==0]
X_train_class0

<1x6 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [10]:
# Dense view of the class-1 rows, for inspection only.
X_train_class1.toarray()

array([[1, 2, 0, 0, 0, 0],
       [0, 2, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0]])

In [11]:
# Per-word totals over the class-1 documents (sum down each column).
X_train_class1.sum(axis=0)

matrix([[1, 5, 0, 1, 1, 0]])

In [12]:
# Grand total of word occurrences in class 1: the denominator of the likelihood.
np.sum(X_train_class1.sum(axis=0))

8

In [13]:
# Unsmoothed estimate: words that never occur in class 1 come out as exactly 0
# (see output), which would zero out any product they take part in.
likelihood1 = likelihood(X_train_class1)
likelihood1

matrix([[0.125, 0.625, 0.   , 0.125, 0.125, 0.   ]])

In [14]:
# Same unsmoothed estimate for class 0; unseen words again get probability 0.
likelihood0 = likelihood(X_train_class0)
likelihood0

matrix([[0.        , 0.33333333, 0.33333333, 0.        , 0.        ,
         0.33333333]])

## Laplace smoothing

$$ P(w_i \in train \mid y=k) = \frac{count(w_i, k) + 1}{\sum_{j=1}^{n} count(w_j, k) + n} $$

In [15]:
def likelihood(X_class, laplace=1):
    """Laplace-smoothed estimate of P(w_i | y=k) for a single class.

    `laplace` is added to every word's total count in the class before
    normalising, so the denominator grows by `laplace` once per vocabulary
    word and the result still sums to 1. With the default of 1, a word that
    never occurs in the class no longer gets probability 0.
    """
    smoothed_counts = X_class.sum(axis=0) + laplace
    return smoothed_counts / np.sum(smoothed_counts)

In [16]:
# Recompute both class likelihoods with the smoothed likelihood()
# (laplace=1 by default); this overwrites the unsmoothed values.
likelihood1 = likelihood(X_train_class1)
likelihood0 = likelihood(X_train_class0)

In [17]:
# After smoothing no entry is 0, and each row still sums to 1.
likelihood0, likelihood1

(matrix([[0.11111111, 0.22222222, 0.22222222, 0.11111111, 0.11111111,
          0.22222222]]),
 matrix([[0.14285714, 0.42857143, 0.07142857, 0.14285714, 0.14285714,
          0.07142857]]))

## Priors

$$P(y = k) = \frac{\Sigma_{i=1}^{m}1(y=k)}{m} $$

In [18]:
# Class prior = share of training documents that carry the label.
prior1 = np.count_nonzero(train_target == 1) / train_target.shape[0]
prior0 = np.count_nonzero(train_target == 0) / train_target.shape[0]

print("Target: ", train_target)
print("Prior 1 (P(y=1)): ", prior1)
print("Prior 0 (P(y=0)): ", prior0)

Target:  [1 1 1 0]
Prior 1 (P(y=1)):  0.75
Prior 0 (P(y=0)):  0.25


## Total likelihood

$$ P(w \in test \mid y=k) = \prod_{i=1}^{n} P(w_i \mid y=k)^{\text{freq of }w_i}$$

In [19]:
# Encode the test document with the vocabulary already fitted on the training
# set (transform, not fit_transform), so its columns line up with the likelihoods.
X_test = vectorizer.transform(test)
X_test.toarray()

array([[0, 3, 1, 0, 0, 1]])

In [20]:
# Smoothed P(w_i | y=1), shown again for reference.
likelihood1

matrix([[0.14285714, 0.42857143, 0.07142857, 0.14285714, 0.14285714,
         0.07142857]])

In [21]:
# P(test | y=1): raise each word probability to the word's frequency in the
# test document and multiply. likelihood1 is an np.matrix (see its repr above),
# for which `**` would mean matrix power; np.power applies the exponent
# element-wise.
pxtest_y1 = np.prod(np.power(likelihood1, X_test.toarray()))
pxtest_y1

0.0004016183732968405

In [22]:
# P(test | y=0): same element-wise power-and-product, using the class-0 likelihoods.
pxtest_y0 = np.prod(np.power(likelihood0, X_test.toarray()))
pxtest_y0

0.0005419228098697689

In [23]:
# The document being classified.
test

array(['Chinese Chinese Chinese Tokyo Japan'], dtype='<U35')

## Probability

$$P (y = k \mid w \in test) \propto P(y=k)\prod_{i=1}^{n} P(w_i \mid y=k)^{\text{freq of }w_i}$$

In [24]:
# Unnormalised posterior: prior x likelihood. The evidence P(w) is not divided
# out; it is the same for both classes, so it does not affect the comparison.
py1_x = prior1 * pxtest_y1
py0_x = prior0 * pxtest_y0

In [25]:
# The larger of the two scores decides the predicted class.
py1_x, py0_x

(0.00030121377997263036, 0.00013548070246744223)

In [26]:
# The document being classified.
test

array(['Chinese Chinese Chinese Tokyo Japan'], dtype='<U35')

In [27]:
# Conclusion: the score for y=1 is larger than the score for y=0, so the test
# sample "Chinese Chinese Chinese Tokyo Japan" is classified as China (y=1).

In [28]:
# Log is monotonic, so the class ranking is unchanged; these values serve as
# the reference for the log-space computation that follows.
np.log(py1_x), np.log(py0_x)

(-8.107690312843909, -8.906681345001262)

## Log probability
   
$$\log P (y = k \mid w \in test) = \log \ P(y=k) + \sum_{i=1}^{n} (\text{freq of }w_i) * \log \ p(w_i \mid y=k) + \text{const}$$

In [29]:
# Log-space version of the total likelihood: a frequency-weighted sum of
# log P(w_i | y=k) replaces the product of powers.
# NOTE: pxtest_y1 / pxtest_y0 held plain probabilities earlier; from here on
# they hold log-likelihoods, and the printed values are logs even though the
# labels below do not say so.
pxtest_y1 = X_test.toarray() @ np.log(likelihood1.T)
pxtest_y0 = X_test.toarray() @ np.log(likelihood0.T)

print("P(X_test | y = 1): ", pxtest_y1)
print("P(X_test | y = 0): ", pxtest_y0)

P(X_test | y = 1):  [[-7.82000824]]
P(X_test | y = 0):  [[-7.52038698]]


In [30]:
# Log of (prior x likelihood): adding logs replaces the earlier multiplication.
# These names now hold log scores rather than the probabilities computed before.
py1_x = np.log(prior1) + pxtest_y1
py0_x = np.log(prior0) + pxtest_y0

In [31]:
# The printed values are unnormalised log-posteriors (log prior + log-likelihood);
# they equal np.log of the products computed earlier.
print("P(y=1|X_test) = p(y=1) + p(X_test|y=1): ", py1_x)
print("P(y=0|X_test) = p(y=0) + p(X_test|y=0): ", py0_x)

P(y=1|X_test) = p(y=1) + p(X_test|y=1):  [[-8.10769031]]
P(y=0|X_test) = p(y=0) + p(X_test|y=0):  [[-8.90668135]]


## Let's implement

### 1. Prepare some data

In [32]:
from sklearn.datasets import fetch_20newsgroups

# Full training split, loaded only to look up the available category names.
# (A bare expression displays nothing unless it is the last line of a cell.)
data = fetch_20newsgroups()
data.target_names

# Restrict the corpus to four newsgroups.
categories = ['talk.religion.misc', 'soc.religion.christian',
              'sci.space', 'comp.graphics']
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)

print(train.data[0])  # full text of the first training document
# Labels are integer indices into train.target_names, which is ordered
# alphabetically rather than in the order of `categories` above
# (so soc.religion.christian is class 2, not 1).
print("Target: ", train.target[0])

# Transform the raw text into word-frequency features. The vocabulary is
# learned from the training split only and reused for the test split.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train.data)
# Keep X_test sparse: `sparse @ dense` already yields a dense ndarray inside
# predict(), whereas densifying a documents x vocabulary matrix only costs memory.
X_test = vectorizer.transform(test.data)

y_train = train.target
y_test = test.target

print("X_train: ", X_train[0])
print("y_train: ", y_train[0])

From: jono@mac-ak-24.rtsg.mot.com (Jon Ogden)
Subject: Re: Losing your temper is not a Christian trait
Organization: Motorola LPA Development
Lines: 26

In article <Apr.23.02.55.47.1993.3138@geneva.rutgers.edu>, jcj@tellabs.com
(jcj) wrote:

> I'd like to remind people of the withering of the fig tree and Jesus
> driving the money changers et. al. out of the temple.  I think those
> were two instances of Christ showing anger (as part of His human side).
> 
Yes, and what about Paul saying:

26 Be ye angry, and sin not: let not the sun go down upon your wrath:
(Ephesians 4:26).

Obviously then, we can be angry w/o sinning.

Jon

------------------------------------------------
Jon Ogden         - jono@mac-ak-24.rtsg.mot.com
Motorola Cellular - Advanced Products Division
Voice: 708-632-2521      Data: 708-632-6086
------------------------------------------------

They drew a circle and shut him out.
Heretic, Rebel, a thing to flout.
But Love and I had the wit to win;
We drew a circle and 

### 2. Calculating likelihood and prior

In [33]:
def likelihood(X_class, laplace=1):
    """Smoothed per-word probabilities P(w_i | y=k) for the documents of one class.

    X_class: count matrix whose rows are the documents of a single class.
    laplace: constant added to every word's class total before normalising
             (1 gives classic add-one smoothing).

    Returns the smoothed word totals divided by their sum, so the entries
    add up to 1.
    """
    counts = X_class.sum(axis=0)
    smoothed = counts + laplace
    total = np.sum(smoothed)
    return smoothed / total

In [34]:
def prior(X_class, m):
    """Class prior P(y=k): the class's row count as a fraction of `m`.

    X_class: matrix whose rows are the documents of one class.
    m:       total number of training documents.
    """
    n_class_docs = X_class.shape[0]
    return n_class_docs / m

In [35]:
def fit(X_train, y_train):
    """Estimate class priors and per-word likelihoods from labelled counts.

    X_train: (documents x vocabulary) count matrix.
    y_train: one label per row of X_train.

    Returns (priors, likelihoods). Row i of each corresponds to
    np.unique(y_train)[i]: priors has one entry per class, likelihoods has
    one row per class and one column per vocabulary word.
    """
    n_docs, n_words = X_train.shape
    labels = np.unique(y_train)  # sorted distinct class labels
    n_labels = labels.size

    priors = np.zeros(n_labels)
    likelihoods = np.zeros((n_labels, n_words))

    for row, label in enumerate(labels):
        # Rows of the training matrix that belong to this class.
        class_docs = X_train[y_train == label]
        priors[row] = prior(class_docs, n_docs)
        likelihoods[row, :] = likelihood(class_docs)

    return priors, likelihoods

### 3. Predict

In [36]:
def predict(X_test, priors, likelihoods, classes):
    """Unnormalised log-posterior score of every class for every test row.

    Returns a (documents x classes) array: the log prior of each class plus
    the frequency-weighted sum of its log word likelihoods. The caller takes
    the argmax over axis 1 to choose a class. `classes` is accepted but not
    used here.
    """
    log_priors = np.log(priors)
    log_likelihoods_t = np.log(likelihoods.T)  # (vocabulary x classes)
    return log_priors + X_test @ log_likelihoods_t

### 4. Let's use them

In [37]:
# Estimate the class priors and the smoothed per-word likelihoods from the training split.
priors, likelihoods = fit(X_train, y_train)

In [38]:
# Row i of `priors` / `likelihoods` corresponds to np.unique(y_train)[i]
# (that is how fit() orders them), so the label set must come from the
# training labels, not from y_test.
classes = np.unique(y_train)
log_scores = predict(X_test, priors, likelihoods, classes)
# argmax yields a column index; map it back to the actual class label so the
# result stays correct even when labels are not exactly 0..k-1.
yhat = classes[np.argmax(log_scores, axis=1)]

In [39]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import average_precision_score, classification_report

# Derive the label set once and reuse it for both the binarisation and the
# per-class loop, instead of hard-coding [0, 1, 2, 3] in one place and
# counting the classes separately in another.
class_labels = np.unique(y_test)
n_classes = len(class_labels)

print("Accuracy: ", np.sum(yhat == y_test)/len(y_test))

print("=========Average precision score=======")
# One indicator column per class.
# NOTE(review): average_precision_score is fed hard 0/1 predictions here
# rather than per-class scores, so each value reflects a single operating point.
# NOTE(review): with only two classes label_binarize returns a single column,
# so the per-class loop below assumes at least three classes.
y_test_binarized = label_binarize(y_test, classes=class_labels)
yhat_binarized = label_binarize(yhat, classes=class_labels)

for i, label in enumerate(class_labels):
    class_score = average_precision_score(y_test_binarized[:, i], yhat_binarized[:, i])
    print(f"Class {label} score: ", class_score)
    
print("=========Classification report=======")
print("Report: ", classification_report(y_test, yhat))

Accuracy:  0.9168994413407822
Class 0 score:  0.9152047938418233
Class 1 score:  0.9069918620723723
Class 2 score:  0.8429395016564877
Class 3 score:  0.7277310085946386
Report:                precision    recall  f1-score   support

           0       0.95      0.95      0.95       389
           1       0.94      0.96      0.95       394
           2       0.87      0.95      0.91       398
           3       0.92      0.74      0.82       251

    accuracy                           0.92      1432
   macro avg       0.92      0.90      0.91      1432
weighted avg       0.92      0.92      0.92      1432

