# Naive Bayes model  (TP n°1)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from typing import Callable
from termcolor import colored

from datasets import load_dataset

from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer
import re
import spacy

''' set a defined random generator, better for reproducible results. '''
random = np.random.default_rng(42)

## Take a look on IMDB dataset:

In [2]:
imdb = load_dataset('imdb')
print(imdb)

Reusing dataset imdb (/home/cloud441/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


Let's see one example of entry in the IMDB database:

In [3]:
imdb['train'][0]

{'text': 'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!',
 'label': 1}

And we have the following number of entries:

In [4]:
print(f"train entries: {len(imdb['train'])}\ntest entries: {len(imdb['test'])}")

train entries: 25000
test entries: 25000


## Tokenize into a word/label dataset:

In [5]:
''' Train split dataset '''
train_dataset = []

for entry in imdb['train']:
    for word in tokenize.word_tokenize(entry['text']):
        train_dataset += [[word, entry['label']]]

train_dataset = pd.DataFrame(train_dataset, columns=['word', 'label'])

''' Let's see the final form: '''
train_dataset.head()

Unnamed: 0,word,label
0,Bromwell,1
1,High,1
2,is,1
3,a,1
4,cartoon,1


In [6]:
''' Test split dataset '''
test_dataset = []

for entry in imdb['test']:
    for word in tokenize.word_tokenize(entry['text']):
        test_dataset += [[word, entry['label']]]

test_dataset = pd.DataFrame(test_dataset, columns=['word', 'label'])
type(test_dataset)

pandas.core.frame.DataFrame

### Build Vocabulary:

In [7]:
vocabulary = train_dataset['word'].factorize()[1] # pandas does the factorization for us
type(vocabulary)

pandas.core.indexes.base.Index

## Build the Naive Bayes model:

Type aliases:

In [8]:
DataFrame = pd.core.frame.DataFrame
index = pd.core.indexes.base.Index

The Bayes model:

In [9]:
classes = [0, 1] # 0 is the class for negatives review and 1 for positives

def train_naive_bayes(dataset: DataFrame, classes: list[int], vocabulary: index) -> (list[float], dict[(int, str), float]):
    ''' Train a Naive Bayes model using the given dataset and according vocabulary '''
    
    classes_counter = dataset.groupby('label').count().word
    print(f"classes_counter is:\n{classes_counter}")
    
    loglikehood = {}
    logprior = []
    
    ''' computing each class loglikehood '''
    for c in classes:
        c_counter = classes_counter[c]
        
        logprior.append(math.log(c_counter / len(dataset)))
       
        bag_of_word = dataset[dataset.label == c].word
        word_counts = bag_of_word.value_counts()
                                                                                     
        
        ''' compute each word loglikehood by class '''
        for word in vocabulary:
            count_word_c = word_counts[word] if (word in word_counts) else 0
            loglikehood[(c,word)] = math.log((count_word_c + 1)/(len(bag_of_word) + 1))
    
    return (logprior, loglikehood)

So, the training of naive Bayes model on train IMDB dataset gives:

In [10]:
%%time
(logprior, loglikehood) = train_naive_bayes(train_dataset, classes, vocabulary)

classes_counter is:
label
0    3505555
1    3559778
Name: word, dtype: int64
CPU times: user 2.35 s, sys: 62.8 ms, total: 2.42 s
Wall time: 2.43 s


In [11]:
logprior

[-0.700851295611307, -0.6855019654147522]

In [12]:
loglikehood

{(0, 'Bromwell'): -15.069859696398153,
 (0, 'High'): -10.474739846263564,
 (0, 'is'): -4.2263258236334735,
 (0, 'a'): -3.8289792031003067,
 (0, 'cartoon'): -9.486363387616453,
 (0, 'comedy'): -7.8664541753150585,
 (0, '.'): -3.3838552560009028,
 (0, 'It'): -5.9960265690766335,
 (0, 'ran'): -10.369379330605737,
 (0, 'at'): -5.733944501513904,
 (0, 'the'): -3.2222349942130877,
 (0, 'same'): -7.458511978994532,
 (0, 'time'): -6.418310452482837,
 (0, 'as'): -5.215142308231455,
 (0, 'some'): -6.127791003353928,
 (0, 'other'): -6.758952939229703,
 (0, 'programs'): -11.73765518622295,
 (0, 'about'): -5.9783028604119375,
 (0, 'school'): -8.455134096194392,
 (0, 'life'): -7.413995678782096,
 (0, ','): -3.280735339042627,
 (0, 'such'): -7.2623496541819605,
 (0, '``'): -5.318183894451408,
 (0, 'Teachers'): -13.460421783964053,
 (0, "''"): -5.322499776573505,
 (0, 'My'): -8.223979821134103,
 (0, '35'): -11.573352134931673,
 (0, 'years'): -7.6540827209827595,
 (0, 'in'): -4.459864611600243,
 (0, 't

## Test the model:

### Test on training IMDB dataset:

In [13]:
''' Get the best result of each word according to the loglikehood '''
train_y_pred = np.array([0 if (loglikehood[(0, word)] > loglikehood[(1, word)]) else 1 for word in train_dataset['word']])

''' Compare to ground truth '''
train_y_gt = train_dataset['label'].to_numpy()

not_preprocessing_train_accuracy = (train_y_pred == train_y_gt).sum() / len(train_dataset)
print(f"the train accuracy is about: {not_preprocessing_train_accuracy}")

the train accuracy is about: 0.5671796361190619


### Test on test IMDB dataset:

In [14]:
'''
Get the best result of each word according to the loglikehood
we choose the label 0 in case of unknown word (not in vocabulary)
'''
test_y_pred = np.array([0 if (word not in vocabulary or loglikehood[(0, word)] > loglikehood[(1, word)]) else 1 for word in test_dataset['word']])

''' Compare to ground truth '''
test_y_gt = test_dataset['label'].to_numpy()

not_preprocessing_test_accuracy = (test_y_pred == test_y_gt).sum() / len(test_dataset)
print(f"the test accuracy is about: {not_preprocessing_test_accuracy}")

the test accuracy is about: 0.5461215708366166


## Add some pre-processing on IMDB dataset:

### Stop word filtering:

In [15]:
list_of_stop_word = ["the", "and", "a", "of", "to", "is", "it", "in", "this", "that", "s", "was", "as", "for", "with", "but", "then", "an", "at", "who", "when", "than", "where", "which", "with", "on", "t", "are", "by", "so", "from", "have", "be", "or", "just", "about", ""]

In [16]:
cleaned_train_dataset = train_dataset[~train_dataset.word.isin(list_of_stop_word)]
cleaned_test_dataset = test_dataset[~test_dataset.word.isin(list_of_stop_word)]


cleaned_vocabulary = cleaned_train_dataset['word'].factorize()[1]
print(f"Size of base train dataset is: {len(train_dataset)}")
print(f"Size of cleaned train dataset is: {len(cleaned_train_dataset)}")

Size of base train dataset is: 7065333
Size of cleaned train dataset is: 5233280


### Stemming the data:

In [17]:
re_word = re.compile(r"^\w+$")
stemmer = SnowballStemmer("english")

stem_word: Callable[[str], str] = lambda w : stemmer.stem(w.lower()) if re_word.match(w) else w

In [18]:
stemmed_train_dataset = train_dataset.copy()
stemmed_train_dataset['word'] = stemmed_train_dataset['word'].map(stem_word)

''' Let's see what stemming does: '''
stemmed_train_dataset.head()

Unnamed: 0,word,label
0,bromwel,1
1,high,1
2,is,1
3,a,1
4,cartoon,1


In [19]:
stemmed_test_dataset = test_dataset.copy()
stemmed_test_dataset['word'] = stemmed_test_dataset['word'].map(stem_word)

In [20]:
stemmed_vocabulary = stemmed_train_dataset['word'].factorize()[1]

### Lemmatizing the data:

Firstly, we need to download the english model of Spacy lemmatization...

In [21]:
!python -m spacy download en_core_web_sm > output_dl.txt

In [22]:
''' loading the small English model '''
nlp = spacy.load("en_core_web_sm")

In [23]:
%%time

lemmed_train_dataset = []

for entry in imdb['train']:
    for token in nlp(entry['text']):
        lemmed_train_dataset += [[token.lemma_, entry['label']]]

lemmed_train_dataset = pd.DataFrame(lemmed_train_dataset, columns=['word', 'label'])


''' Let's see what stemming does: '''
lemmed_train_dataset.head()

CPU times: user 10min 53s, sys: 746 ms, total: 10min 54s
Wall time: 10min 55s


Unnamed: 0,word,label
0,Bromwell,1
1,High,1
2,be,1
3,a,1
4,cartoon,1


In [24]:
lemmed_test_dataset = []

for entry in imdb['test']:
    for token in nlp(entry['text']):
        lemmed_test_dataset += [[token.lemma_, entry['label']]]

lemmed_test_dataset = pd.DataFrame(lemmed_test_dataset, columns=['word', 'label'])

In [25]:
lemmed_vocabulary = lemmed_train_dataset['word'].factorize()[1]

### Test the model with pre-processing:

Let's begin with stop word filtered dataset:

In [26]:
(cleaned_logprior, cleaned_loglikehood) = train_naive_bayes(cleaned_train_dataset, classes, cleaned_vocabulary)

classes_counter is:
label
0    2605861
1    2627419
Name: word, dtype: int64


In [27]:
''' Get the best result of each word according to the loglikehood '''
cleaned_train_y_pred = np.array([0 if (loglikehood[(0, word)] > loglikehood[(1, word)]) else 1 for word in cleaned_train_dataset['word']])

''' Compare to ground truth '''
cleaned_train_y_gt = cleaned_train_dataset['label'].to_numpy()

cleaned_train_accuracy = (cleaned_train_y_pred == cleaned_train_y_gt).sum() / len(cleaned_train_dataset)
print(f"the cleaned train accuracy is about: {cleaned_train_accuracy}")

the cleaned train accuracy is about: 0.5814871361746362


In [28]:
''' Get the best result of each word according to the loglikehood '''
cleaned_test_y_pred = np.array([0 if (word not in cleaned_vocabulary or loglikehood[(0, word)] > loglikehood[(1, word)]) else 1 for word in cleaned_test_dataset['word']])

''' Compare to ground truth '''
cleaned_test_y_gt = cleaned_test_dataset['label'].to_numpy()

cleaned_test_accuracy = (cleaned_test_y_pred == cleaned_test_y_gt).sum() / len(cleaned_test_dataset)
print(f"the cleaned test accuracy is about: {cleaned_test_accuracy}")

the cleaned test accuracy is about: 0.5541434260378817


then with stemming results:

In [29]:
(stemmed_logprior, stemmed_loglikehood) = train_naive_bayes(stemmed_train_dataset, classes, stemmed_vocabulary)

classes_counter is:
label
0    3505555
1    3559778
Name: word, dtype: int64


In [30]:
''' Get the best result of each word according to the loglikehood '''
stemmed_train_y_pred = np.array([0 if (stemmed_loglikehood[(0, word)] > stemmed_loglikehood[(1, word)]) else 1 for word in stemmed_train_dataset['word']])

''' Compare to ground truth '''
stemmed_train_y_gt = stemmed_train_dataset['label'].to_numpy()

stemmed_train_accuracy = (stemmed_train_y_pred == stemmed_train_y_gt).sum() / len(stemmed_train_dataset)
print(f"the stemmed train accuracy is about: {stemmed_train_accuracy}")

the stemmed train accuracy is about: 0.5607646518571736


In [31]:
'''
Get the best result of each word according to the loglikehood
we choose the label 0 in case of unknown word (not in vocabulary)
'''
stemmed_test_y_pred = np.array([0 if (word not in stemmed_vocabulary or stemmed_loglikehood[(0, word)] > stemmed_loglikehood[(1, word)]) else 1 for word in stemmed_test_dataset['word']])

''' Compare to ground truth '''
stemmed_test_y_gt = stemmed_test_dataset['label'].to_numpy()

stemmed_test_accuracy = (stemmed_test_y_pred == stemmed_test_y_gt).sum() / len(stemmed_test_dataset)
print(f"the stemmed test accuracy is about: {stemmed_test_accuracy}")

the stemmed test accuracy is about: 0.5442021425341607


And now the lemmatization results:

In [32]:
(lemmed_logprior, lemmed_loglikehood) = train_naive_bayes(lemmed_train_dataset, classes, lemmed_vocabulary)

classes_counter is:
label
0    3373917
1    3437630
Name: word, dtype: int64


In [33]:
''' Get the best result of each word according to the loglikehood '''
lemmed_train_y_pred = np.array([0 if (lemmed_loglikehood[(0, word)] > lemmed_loglikehood[(1, word)]) else 1 for word in lemmed_train_dataset['word']])

''' Compare to ground truth '''
lemmed_train_y_gt = lemmed_train_dataset['label'].to_numpy()

lemmed_train_accuracy = (lemmed_train_y_pred == lemmed_train_y_gt).sum() / len(lemmed_train_dataset)
print(f"the lemmed train accuracy is about: {lemmed_train_accuracy}")

the lemmed train accuracy is about: 0.5639915572776639


In [34]:
'''
Get the best result of each word according to the loglikehood
we choose the label 0 in case of unknown word (not in vocabulary)
'''
lemmed_test_y_pred = np.array([0 if (word not in lemmed_vocabulary or lemmed_loglikehood[(0, word)] > lemmed_loglikehood[(1, word)]) else 1 for word in lemmed_test_dataset['word']])

''' Compare to ground truth '''
lemmed_test_y_gt = lemmed_test_dataset['label'].to_numpy()

lemmed_test_accuracy = (lemmed_test_y_pred == lemmed_test_y_gt).sum() / len(lemmed_test_dataset)
print(f"the lemmed test accuracy is about: {lemmed_test_accuracy}")

the lemmed test accuracy is about: 0.5454024420520663


## conclusion about pre-processing:

We have applied two pre-processing strategies: stemming and lemmatization.
Let's make a comparison on accuracy:

In [35]:
print(colored(f"results without pre-processing: train [{round(not_preprocessing_train_accuracy, 3)}] -- test [{round(not_preprocessing_test_accuracy, 3)}]", "grey"))
print(colored(f"results with stop word filtering: train [{round(cleaned_train_accuracy, 3)}] -- test [{round(cleaned_test_accuracy, 3)}]", "green"))
print(colored(f"results with stemming: \t\ttrain [{round(stemmed_train_accuracy, 3)}] -- test [{round(stemmed_test_accuracy, 3)}]", "red"))
print(f"results with lemmatization: \ttrain [{round(lemmed_train_accuracy, 3)}] -- test [{round(lemmed_test_accuracy, 3)}]")

[30mresults without pre-processing: train [0.567] -- test [0.546][0m
[32mresults with stop word filtering: train [0.581] -- test [0.554][0m
[31mresults with stemming: 		train [0.561] -- test [0.544][0m
results with lemmatization: 	train [0.564] -- test [0.545]


_Analysis_:

Pre-processing doesn't look to be efficient on IMDB dataset __with a Naive Bayes model__.
Firstly, It doesn't mean that pre-processing is useless in IMDB dataset. One reason could be that naive bayes results are close to 50%, so the model is slightly better than a randomized predictor. With or without pre-processing will not have a great impact on bad results.

However, the act of filtering stop word is pretty efficient because these words are so much represented that they don't give a lot of informations.
But we can make such a comparison on our two other strategies. Stemming have the worst result on our benchmark, so the strategy looks to be useless in this case. But lemmatization make better prediction than stemming so maybe it will fit better with IMDB dataset.

## Build a binary Naive Bayes model:

In [36]:
def train_binary_naive_bayes(dataset: DataFrame, classes: list[int], vocabulary: index) -> (list[float], dict[(int, str), float]):
    
    classes_counter = dataset.groupby('label').count().word
    print(f"classes_counter is:\n{classes_counter}")
    
    loglikehood = {}
    logprior = []
    

    for c in classes:
        c_counter = classes_counter[c]
        
        logprior.append(math.log(c_counter / len(dataset)))
       
        bag_of_word = dataset[dataset.label == c].word
        word_counts = bag_of_word.value_counts()
                                                                                     
        
        
        for word in vocabulary:
            ''' Here, we just check if the word is present or not '''
            count_word_c = word_counts[word] if (word in word_counts) else 0
            loglikehood[(c,word)] = 1 if (count_word_c > 0) else 0 # math.log((count_word_c + 1)/(len(bag_of_word) + 1))
    
    return (logprior, loglikehood)

In [37]:
(bin_logprior, bin_loglikehood) = train_binary_naive_bayes(train_dataset, classes, vocabulary)

classes_counter is:
label
0    3505555
1    3559778
Name: word, dtype: int64


### Results of binary Naive Bayes model:

In [38]:
''' Get the best result of each word according to the loglikehood '''
bin_train_y_pred = np.array([0 if (bin_loglikehood[(0, word)] == 1) else 1 for word in train_dataset['word']])

''' Compare to ground truth '''
bin_train_y_gt = train_dataset['label'].to_numpy()

bin_train_accuracy = (bin_train_y_pred == bin_train_y_gt).sum() / len(train_dataset)
print(f"the binary model train accuracy is about: {bin_train_accuracy}")

the binary model train accuracy is about: 0.5059353890326188


In [39]:
''' Get the best result of each word according to the loglikehood '''
bin_test_y_pred = np.array([0 if (word not in vocabulary or bin_loglikehood[(0, word)] == 1) else 1 for word in test_dataset['word']])

''' Compare to ground truth '''
bin_test_y_gt = test_dataset['label'].to_numpy()

bin_test_accuracy = (bin_test_y_pred == bin_test_y_gt).sum() / len(test_dataset)
print(f"the binary model test accuracy is about: {bin_test_accuracy}")

the binary model test accuracy is about: 0.5017193943748491


## Conclusion:

Finally, we have a comparison of two Naive Bayes model with two possible pre-processing strategies. The results show that ...

To conclude, let's focus on a wrongly classified set of words:

In [40]:
wrong_set = train_dataset['word'][train_y_pred != train_y_gt][:20]
wrong_set = wrong_set.apply(lambda w : (w, 0 if (loglikehood[(0, w)] > loglikehood[(1, w)]) else 1))
wrong_set

4      (cartoon, 0)
6            (., 0)
9           (at, 0)
11        (same, 0)
14        (some, 0)
17       (about, 0)
18      (school, 0)
21        (such, 0)
23          (``, 0)
24    (Teachers, 0)
25          ('', 0)
26           (., 0)
32    (teaching, 0)
34        (lead, 0)
35          (me, 0)
36          (to, 0)
37     (believe, 0)
38        (that, 0)
44        (much, 0)
46          (to, 0)
Name: word, dtype: object

Why does "cartoon" is classified as __negative__ ?

In [41]:
print(f"the loglikehood of 'cartoon' is:\n\t0 --> {loglikehood[(0, 'cartoon')]}\n\t1 --> {loglikehood[(1, 'cartoon')]}")

the loglikehood of 'cartoon' is:
	0 --> -9.486363387616453
	1 --> -9.68253164037729


The classification is verified by computing loglikehood, but why "cartoon" doesn't seem to be __negative__ ?

"cartoon" is badly represented in our two labels so counting them will not offer a good interpretation of the feeling behind this word. Moreover, all words that are poorly represented in dataset, or equally divided in our labels will give bad predictions with Naive Bayes models. The reason is because we do not make an interpretation on sentences or context of our words.