# Naive Bayes model  (TP n°1)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from typing import Callable
from termcolor import colored

from datasets import load_dataset

from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer
import re
import spacy

''' set a defined random generator, better for reproducible results. '''
random = np.random.default_rng(42)

## Take a look on IMDB dataset:

In [2]:
imdb = load_dataset('imdb')
print(imdb)

Reusing dataset imdb (/home/cloud441/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


Let's see one example of entry in the IMDB database:

In [3]:
imdb['train'][0]

{'text': 'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!',
 'label': 1}

And we have the following number of entries:

In [4]:
print(f"train entries: {len(imdb['train'])}\ntest entries: {len(imdb['test'])}")

train entries: 25000
test entries: 25000


## Tokenize into a word/label dataset:

In [5]:
%%time

''' Train split dataset '''
train_dataset = []
train_sentences = []

for idx, entry in enumerate(imdb['train']):
    for word in tokenize.word_tokenize(entry['text']):
        train_dataset += [[word, idx, entry['label']]]

train_dataset = pd.DataFrame(train_dataset, columns=['word', 'sentence', 'label'])

''' Let's see the final form: '''
train_dataset.head()

CPU times: user 37.8 s, sys: 453 ms, total: 38.2 s
Wall time: 38.2 s


Unnamed: 0,word,sentence,label
0,Bromwell,0,1
1,High,0,1
2,is,0,1
3,a,0,1
4,cartoon,0,1


In [6]:
''' Test split dataset '''
test_dataset = []
test_sentences = []

for idx, entry in enumerate(imdb['test']):    
    for word in tokenize.word_tokenize(entry['text']):
        test_dataset += [[word, idx, entry['label']]]

test_dataset = pd.DataFrame(test_dataset, columns=['word', 'sentence', 'label'])
type(test_dataset)

pandas.core.frame.DataFrame

### Build Vocabulary:

In [7]:
vocabulary = train_dataset['word'].factorize()[1] # pandas does the factorization for us
type(vocabulary)

pandas.core.indexes.base.Index

## Build the Naive Bayes model:

Type aliases:

In [8]:
DataFrame = pd.core.frame.DataFrame
index = pd.core.indexes.base.Index

The Bayes model:

In [9]:
classes = [0, 1] # 0 is the class for negatives review and 1 for positives

def train_naive_bayes(dataset: DataFrame, classes: list[int], vocabulary: index) -> (list[float], dict[(int, str), float]):
    ''' Train a Naive Bayes model using the given dataset and according vocabulary '''
    
    classes_counter = dataset.groupby('label').count().word
    print(f"classes_counter is:\n{classes_counter}")
    
    loglikehood = {}
    logprior = []
    
    ''' computing each class loglikehood '''
    for c in classes:
        c_counter = classes_counter[c]
        
        logprior.append(math.log(c_counter / len(dataset)))
       
        bag_of_word = dataset[dataset.label == c].word
        word_counts = bag_of_word.value_counts()
                                                                                     
        
        ''' compute each word loglikehood by class '''
        for word in vocabulary:
            count_word_c = word_counts[word] if (word in word_counts) else 0
            loglikehood[(c,word)] = math.log((count_word_c + 1)/(len(bag_of_word) + 1))
    
    return (logprior, loglikehood)

So, the training of naive Bayes model on train IMDB dataset gives:

In [10]:
%%time
(logprior, loglikehood) = train_naive_bayes(train_dataset, classes, vocabulary)

classes_counter is:
label
0    3505555
1    3559778
Name: word, dtype: int64
CPU times: user 1.98 s, sys: 66.3 ms, total: 2.05 s
Wall time: 2.06 s


In [11]:
logprior

[-0.700851295611307, -0.6855019654147522]

In [12]:
loglikehood

{(0, 'Bromwell'): -15.069859696398153,
 (0, 'High'): -10.474739846263564,
 (0, 'is'): -4.2263258236334735,
 (0, 'a'): -3.8289792031003067,
 (0, 'cartoon'): -9.486363387616453,
 (0, 'comedy'): -7.8664541753150585,
 (0, '.'): -3.3838552560009028,
 (0, 'It'): -5.9960265690766335,
 (0, 'ran'): -10.369379330605737,
 (0, 'at'): -5.733944501513904,
 (0, 'the'): -3.2222349942130877,
 (0, 'same'): -7.458511978994532,
 (0, 'time'): -6.418310452482837,
 (0, 'as'): -5.215142308231455,
 (0, 'some'): -6.127791003353928,
 (0, 'other'): -6.758952939229703,
 (0, 'programs'): -11.73765518622295,
 (0, 'about'): -5.9783028604119375,
 (0, 'school'): -8.455134096194392,
 (0, 'life'): -7.413995678782096,
 (0, ','): -3.280735339042627,
 (0, 'such'): -7.2623496541819605,
 (0, '``'): -5.318183894451408,
 (0, 'Teachers'): -13.460421783964053,
 (0, "''"): -5.322499776573505,
 (0, 'My'): -8.223979821134103,
 (0, '35'): -11.573352134931673,
 (0, 'years'): -7.6540827209827595,
 (0, 'in'): -4.459864611600243,
 (0, 't

## Test the model:

### Test on training IMDB dataset:

Let's define a function to compute sentences labelisation accuracy:

In [13]:
ArrayNumpy = np.ndarray

def compute_sentences_accuracy(dataset: DataFrame, prediction: ArrayNumpy) -> float:
    dataset_cp = dataset.copy()
    dataset_cp['prediction'] = prediction

    ''' We group by sentence to compute number of positive prediction '''
    grouped_dataset = dataset_cp.groupby(dataset_cp.sentence)

    ''' This mask corresponds to if each sentence is well labelled '''
    mask = (grouped_dataset.sum().prediction > grouped_dataset.count().word / 2) == (grouped_dataset.sum().label != 0)
    
    return mask.sum() / len(grouped_dataset)

In [14]:
''' Get the best result of each word according to the loglikehood '''
train_y_pred = np.array([0 if (loglikehood[(0, word)] > loglikehood[(1, word)]) else 1 for word in train_dataset['word']])

''' Compare to ground truth '''
train_y_gt = train_dataset['label'].to_numpy()

not_preprocessing_train_accuracy = (train_y_pred == train_y_gt).sum() / len(train_dataset)
print(f"the train accuracy on words is about: {not_preprocessing_train_accuracy}")

''' lets compute the prediction of sentences '''

not_preprocessing_train_sentences_accuracy = compute_sentences_accuracy(train_dataset, train_y_pred)
print(f"the train accuracy is about: {not_preprocessing_train_sentences_accuracy}")

the train accuracy on words is about: 0.5671796361190619
the train accuracy is about: 0.77988


### Test on test IMDB dataset:

In [15]:
'''
Get the best result of each word according to the loglikehood
we choose the label 0 in case of unknown word (not in vocabulary)
'''
test_y_pred = np.array([0 if (word not in vocabulary or loglikehood[(0, word)] > loglikehood[(1, word)]) else 1 for word in test_dataset['word']])

''' Compare to ground truth '''
test_y_gt = test_dataset['label'].to_numpy()

not_preprocessing_test_accuracy = (test_y_pred == test_y_gt).sum() / len(test_dataset)
print(f"the test accuracy on words is about: {not_preprocessing_test_accuracy}")

''' lets compute the prediction of sentences '''

not_preprocessing_test_sentences_accuracy = compute_sentences_accuracy(test_dataset, test_y_pred)
print(f"the test accuracy is about: {not_preprocessing_test_sentences_accuracy}")

the test accuracy on words is about: 0.5461215708366166
the test accuracy is about: 0.7116


## Add some pre-processing on IMDB dataset:

### Stop word filtering:

In [16]:
list_of_stop_word = ["the", "and", "a", "of", "to", "is", "it", "in", "this", "that", "s", "was", "as", "for", "with", "but", "then", "an", "at", "who", "when", "than", "where", "which", "with", "on", "t", "are", "by", "so", "from", "have", "be", "or", "just", "about", ""]

In [17]:
cleaned_train_dataset = train_dataset[~train_dataset.word.isin(list_of_stop_word)]
cleaned_test_dataset = test_dataset[~test_dataset.word.isin(list_of_stop_word)]


cleaned_vocabulary = cleaned_train_dataset['word'].factorize()[1]
print(f"Size of base train dataset is: {len(train_dataset)}")
print(f"Size of cleaned train dataset is: {len(cleaned_train_dataset)}")

Size of base train dataset is: 7065333
Size of cleaned train dataset is: 5233280


### Stemming the data:

In [18]:
re_word = re.compile(r"^\w+$")
stemmer = SnowballStemmer("english")

stem_word: Callable[[str], str] = lambda w : stemmer.stem(w.lower()) if re_word.match(w) else w

In [19]:
stemmed_train_dataset = train_dataset.copy()
stemmed_train_dataset['word'] = stemmed_train_dataset['word'].map(stem_word)

''' Let's see what stemming does: '''
stemmed_train_dataset.head()

Unnamed: 0,word,sentence,label
0,bromwel,0,1
1,high,0,1
2,is,0,1
3,a,0,1
4,cartoon,0,1


In [20]:
stemmed_test_dataset = test_dataset.copy()
stemmed_test_dataset['word'] = stemmed_test_dataset['word'].map(stem_word)

In [21]:
stemmed_vocabulary = stemmed_train_dataset['word'].factorize()[1]

### Lemmatizing the data:

Firstly, we need to download the english model of Spacy lemmatization...

In [22]:
!python -m spacy download en_core_web_sm > output_dl.txt

In [23]:
''' loading the small English model '''
nlp = spacy.load("en_core_web_sm")

In [24]:
%%time

lemmed_train_dataset = []

for idx, entry in enumerate(imdb['train']):
    for token in nlp(entry['text']):
        lemmed_train_dataset += [[token.lemma_, idx, entry['label']]]

lemmed_train_dataset = pd.DataFrame(lemmed_train_dataset, columns=['word', 'sentence', 'label'])


''' Let's see what stemming does: '''
lemmed_train_dataset.head()

CPU times: user 9min 56s, sys: 756 ms, total: 9min 57s
Wall time: 9min 58s


Unnamed: 0,word,sentence,label
0,Bromwell,0,1
1,High,0,1
2,be,0,1
3,a,0,1
4,cartoon,0,1


In [25]:
lemmed_test_dataset = []

for idx, entry in enumerate(imdb['test']):
    for token in nlp(entry['text']):
        lemmed_test_dataset += [[token.lemma_, idx, entry['label']]]

lemmed_test_dataset = pd.DataFrame(lemmed_test_dataset, columns=['word', 'sentence', 'label'])

In [26]:
lemmed_vocabulary = lemmed_train_dataset['word'].factorize()[1]

### Test the model with pre-processing:

Let's begin with stop word filtered dataset:

In [27]:
(cleaned_logprior, cleaned_loglikehood) = train_naive_bayes(cleaned_train_dataset, classes, cleaned_vocabulary)

classes_counter is:
label
0    2605861
1    2627419
Name: word, dtype: int64


In [28]:
''' Get the best result of each word according to the loglikehood '''
cleaned_train_y_pred = np.array([0 if (loglikehood[(0, word)] > loglikehood[(1, word)]) else 1 for word in cleaned_train_dataset['word']])

''' Compare to ground truth '''
cleaned_train_y_gt = cleaned_train_dataset['label'].to_numpy()

cleaned_train_accuracy = (cleaned_train_y_pred == cleaned_train_y_gt).sum() / len(cleaned_train_dataset)
print(f"the cleaned train words accuracy is about: {cleaned_train_accuracy}")

''' lets compute the prediction of sentences '''

cleaned_train_sentences_accuracy = compute_sentences_accuracy(cleaned_train_dataset, cleaned_train_y_pred)
print(f"the cleaned train accuracy is about: {cleaned_train_sentences_accuracy}")

the cleaned train words accuracy is about: 0.5814871361746362
the cleaned train accuracy is about: 0.75772


In [29]:
''' Get the best result of each word according to the loglikehood '''
cleaned_test_y_pred = np.array([0 if (word not in cleaned_vocabulary or loglikehood[(0, word)] > loglikehood[(1, word)]) else 1 for word in cleaned_test_dataset['word']])

''' Compare to ground truth '''
cleaned_test_y_gt = cleaned_test_dataset['label'].to_numpy()

cleaned_test_accuracy = (cleaned_test_y_pred == cleaned_test_y_gt).sum() / len(cleaned_test_dataset)
print(f"the cleaned test accuracy on words is about: {cleaned_test_accuracy}")

''' lets compute the prediction of sentences '''

cleaned_test_sentences_accuracy = compute_sentences_accuracy(cleaned_test_dataset, cleaned_test_y_pred)
print(f"the cleaned test accuracy is about: {cleaned_test_sentences_accuracy}")

the cleaned test accuracy on words is about: 0.5541434260378817
the cleaned test accuracy is about: 0.66492


then with stemming results:

In [30]:
(stemmed_logprior, stemmed_loglikehood) = train_naive_bayes(stemmed_train_dataset, classes, stemmed_vocabulary)

classes_counter is:
label
0    3505555
1    3559778
Name: word, dtype: int64


In [31]:
''' Get the best result of each word according to the loglikehood '''
stemmed_train_y_pred = np.array([0 if (stemmed_loglikehood[(0, word)] > stemmed_loglikehood[(1, word)]) else 1 for word in stemmed_train_dataset['word']])

''' Compare to ground truth '''
stemmed_train_y_gt = stemmed_train_dataset['label'].to_numpy()

stemmed_train_accuracy = (stemmed_train_y_pred == stemmed_train_y_gt).sum() / len(stemmed_train_dataset)
print(f"the stemmed train accuracy on words is about: {stemmed_train_accuracy}")

''' lets compute the prediction of sentences '''

stemmed_train_sentences_accuracy = compute_sentences_accuracy(stemmed_train_dataset, stemmed_train_y_pred)
print(f"the stemmed train accuracy is about: {stemmed_train_sentences_accuracy}")

the stemmed train accuracy on words is about: 0.5607646518571736
the stemmed train accuracy is about: 0.7584


In [32]:
'''
Get the best result of each word according to the loglikehood
we choose the label 0 in case of unknown word (not in vocabulary)
'''
stemmed_test_y_pred = np.array([0 if (word not in stemmed_vocabulary or stemmed_loglikehood[(0, word)] > stemmed_loglikehood[(1, word)]) else 1 for word in stemmed_test_dataset['word']])

''' Compare to ground truth '''
stemmed_test_y_gt = stemmed_test_dataset['label'].to_numpy()

stemmed_test_accuracy = (stemmed_test_y_pred == stemmed_test_y_gt).sum() / len(stemmed_test_dataset)
print(f"the stemmed test accuracy on words is about: {stemmed_test_accuracy}")

''' lets compute the prediction of sentences '''

stemmed_test_sentences_accuracy = compute_sentences_accuracy(stemmed_test_dataset, stemmed_test_y_pred)
print(f"the stemmed train accuracy is about: {stemmed_test_sentences_accuracy}")

the stemmed test accuracy on words is about: 0.5442021425341607
the stemmed train accuracy is about: 0.70724


And now the lemmatization results:

In [33]:
(lemmed_logprior, lemmed_loglikehood) = train_naive_bayes(lemmed_train_dataset, classes, lemmed_vocabulary)

classes_counter is:
label
0    3373917
1    3437630
Name: word, dtype: int64


In [34]:
''' Get the best result of each word according to the loglikehood '''
lemmed_train_y_pred = np.array([0 if (lemmed_loglikehood[(0, word)] > lemmed_loglikehood[(1, word)]) else 1 for word in lemmed_train_dataset['word']])

''' Compare to ground truth '''
lemmed_train_y_gt = lemmed_train_dataset['label'].to_numpy()

lemmed_train_accuracy = (lemmed_train_y_pred == lemmed_train_y_gt).sum() / len(lemmed_train_dataset)
print(f"the lemmed train accuracy on words is about: {lemmed_train_accuracy}")

''' lets compute the prediction of sentences '''

lemmed_train_sentences_accuracy = compute_sentences_accuracy(lemmed_train_dataset, lemmed_train_y_pred)
print(f"the lemmed train accuracy is about: {lemmed_train_sentences_accuracy}")

the lemmed train accuracy on words is about: 0.5639915572776639
the lemmed train accuracy is about: 0.77704


In [35]:
'''
Get the best result of each word according to the loglikehood
we choose the label 0 in case of unknown word (not in vocabulary)
'''
lemmed_test_y_pred = np.array([0 if (word not in lemmed_vocabulary or lemmed_loglikehood[(0, word)] > lemmed_loglikehood[(1, word)]) else 1 for word in lemmed_test_dataset['word']])

''' Compare to ground truth '''
lemmed_test_y_gt = lemmed_test_dataset['label'].to_numpy()

lemmed_test_accuracy = (lemmed_test_y_pred == lemmed_test_y_gt).sum() / len(lemmed_test_dataset)
print(f"the lemmed test accuracy on words is about: {lemmed_test_accuracy}")

''' lets compute the prediction of sentences '''

lemmed_test_sentences_accuracy = compute_sentences_accuracy(lemmed_test_dataset, lemmed_test_y_pred)
print(f"the lemmed test accuracy is about: {lemmed_test_sentences_accuracy}")

the lemmed test accuracy on words is about: 0.5454024420520663
the lemmed test accuracy is about: 0.71348


## conclusion about pre-processing:

We have applied two pre-processing strategies: stemming and lemmatization.
Let's make a comparison on accuracy:

In [36]:
print(colored(f"results without pre-processing: train [{round(not_preprocessing_train_sentences_accuracy, 3)}] -- test [{round(not_preprocessing_test_sentences_accuracy, 3)}]", "grey"))
print(colored(f"results with stop word filtering: train [{round(cleaned_train_sentences_accuracy, 3)}] -- test [{round(cleaned_test_sentences_accuracy, 3)}]", "red"))
print(colored(f"results with stemming: \t\ttrain [{round(stemmed_train_sentences_accuracy, 3)}] -- test [{round(stemmed_test_sentences_accuracy, 3)}]", "grey"))
print(colored(f"results with lemmatization: \ttrain [{round(lemmed_train_sentences_accuracy, 3)}] -- test [{round(lemmed_test_sentences_accuracy, 3)}]", "green"))

[30mresults without pre-processing: train [0.78] -- test [0.712][0m
[31mresults with stop word filtering: train [0.758] -- test [0.665][0m
[30mresults with stemming: 		train [0.758] -- test [0.707][0m
[32mresults with lemmatization: 	train [0.777] -- test [0.713][0m


_Analysis_:

During all this notebook, we keep informations about sentences accuracy but also on words accuracy to compare if there is a logic relation between these two metrics. As you can see above, There is no logic between them. Have a good accuracy on word classification is not a clue that this strategy will give a better accuracy on IMDB classification.

Let's compare our per-processing strategies. Stop word filtering looks to be a way to lose needed informations for Bayes Naive model to converge on sentences. So we will consider that this strategy is bad.

Stemming doesn't look to be efficient because we lose a little bit of accuracy on test dataset. Lemmatization seems to be a better fit with our model. The result are not so much better but It doesn't mean that the strategy is useless with IMDB dataset. This conclusion could be the same for stemming strategy. But why do we think that these two strategies couls be a interesting solution ?

The reason is that our Naive Bayes model is not so smart. To classify a sentence by analysing each word separately is a bad approach in NLP problem. Context analysis is needed and have the possibility to reduce complicated word to lemma or reduce stemming fit well this resolution approach.

## Build a binary Naive Bayes model:

In [37]:
def train_binary_naive_bayes(dataset: DataFrame, classes: list[int], vocabulary: index) -> (list[float], dict[(int, str), float]):
    
    classes_counter = dataset.groupby('label').count().word
    print(f"classes_counter is:\n{classes_counter}")
    
    loglikehood = {}
    logprior = []
    

    for c in classes:
        c_counter = classes_counter[c]
        
        logprior.append(math.log(c_counter / len(dataset)))
       
        bag_of_word = dataset[dataset.label == c].word
        word_counts = bag_of_word.value_counts()
                                                                                     
        
        
        for word in vocabulary:
            ''' Here, we just check if the word is present or not '''
            count_word_c = word_counts[word] if (word in word_counts) else 0
            loglikehood[(c,word)] = 1 if (count_word_c > 0) else 0 # math.log((count_word_c + 1)/(len(bag_of_word) + 1))
    
    return (logprior, loglikehood)

In [38]:
(bin_logprior, bin_loglikehood) = train_binary_naive_bayes(train_dataset, classes, vocabulary)

classes_counter is:
label
0    3505555
1    3559778
Name: word, dtype: int64


### Results of binary Naive Bayes model:

In [39]:
''' Get the best result of each word according to the loglikehood '''
bin_train_y_pred = np.array([0 if (bin_loglikehood[(0, word)] == 1) else 1 for word in train_dataset['word']])

''' Compare to ground truth '''
bin_train_y_gt = train_dataset['label'].to_numpy()

bin_train_accuracy = (bin_train_y_pred == bin_train_y_gt).sum() / len(train_dataset)
print(f"the binary model train accuracy is about: {bin_train_accuracy}")

''' lets compute the prediction of sentences '''

bin_train_sentences_accuracy = compute_sentences_accuracy(train_dataset, bin_train_y_pred)
print(f"the binary train accuracy on words is about: {bin_train_sentences_accuracy}")

the binary model train accuracy is about: 0.5059353890326188
the binary train accuracy on words is about: 0.5


In [40]:
''' Get the best result of each word according to the loglikehood '''
bin_test_y_pred = np.array([0 if (word not in vocabulary or bin_loglikehood[(0, word)] == 1) else 1 for word in test_dataset['word']])

''' Compare to ground truth '''
bin_test_y_gt = test_dataset['label'].to_numpy()

bin_test_accuracy = (bin_test_y_pred == bin_test_y_gt).sum() / len(test_dataset)
print(f"the binary model test accuracy on words is about: {bin_test_accuracy}")

''' lets compute the prediction of sentences '''

bin_test_sentences_accuracy = compute_sentences_accuracy(test_dataset, bin_test_y_pred)
print(f"the binary test accuracy on words is about: {bin_test_sentences_accuracy}")

the binary model test accuracy on words is about: 0.5017193943748491
the binary test accuracy on words is about: 0.5


The binary Naive Bayes model doesn't look to be efficient and we are close to random classifier.

## Sample of badly classified words and sentences:

To conclude, let's focus on a wrongly classified set of words:

In [41]:
wrong_set = train_dataset['word'][train_y_pred != train_y_gt][:20]
wrong_set = wrong_set.apply(lambda w : (w, 0 if (loglikehood[(0, w)] > loglikehood[(1, w)]) else 1))
wrong_set

4      (cartoon, 0)
6            (., 0)
9           (at, 0)
11        (same, 0)
14        (some, 0)
17       (about, 0)
18      (school, 0)
21        (such, 0)
23          (``, 0)
24    (Teachers, 0)
25          ('', 0)
26           (., 0)
32    (teaching, 0)
34        (lead, 0)
35          (me, 0)
36          (to, 0)
37     (believe, 0)
38        (that, 0)
44        (much, 0)
46          (to, 0)
Name: word, dtype: object

Why does "cartoon" is classified as __negative__ ?

In [42]:
print(f"the loglikehood of 'cartoon' is:\n\t0 --> {loglikehood[(0, 'cartoon')]}\n\t1 --> {loglikehood[(1, 'cartoon')]}")

the loglikehood of 'cartoon' is:
	0 --> -9.486363387616453
	1 --> -9.68253164037729


The classification is verified by computing loglikehood, but why "cartoon" doesn't seem to be __negative__ ?

"cartoon" is badly represented in our two labels so counting them will not offer a good interpretation of the feeling behind this word. Moreover, all words that are poorly represented in dataset, or equally divided in our labels will give bad predictions with Naive Bayes models. The reason is because we do not make an interpretation on sentences or context of our words.

And for a sentence ?

In [43]:
dataset_cp = train_dataset.copy()
dataset_cp['prediction'] = train_y_pred

''' We group by sentence to compute number of positive prediction '''
grouped_dataset = dataset_cp.groupby(dataset_cp.sentence)

''' This mask corresponds to if each sentence is badly labelled '''
mask = np.array((grouped_dataset.sum().prediction > grouped_dataset.count().word / 2) != (grouped_dataset.sum().label != 0))
idx_error = np.where(mask == True)[0]
imdb['train'][idx_error]['text']

["Yes its an art... to successfully make a slow paced thriller.<br /><br />The story unfolds in nice volumes while you don't even notice it happening.<br /><br />Fine performance by Robin Williams. The sexuality angles in the film can seem unnecessary and can probably affect how much you enjoy the film. However, the core plot is very engaging. The movie doesn't rush onto you and still grips you enough to keep you wondering. The direction is good. Use of lights to achieve desired affects of suspense and unexpectedness is good.<br /><br />Very nice 1 time watch if you are looking to lay back and hear a thrilling short story!",
 "I enjoyed The Night Listener very much. It's one of the better movies of the summer.<br /><br />Robin Williams gives one of his best performances. In fact, the entire cast was very good. All played just the right notes for their characters - not too much and not too little. Sandra Oh adds a wonderful comic touch. Toni Collette is great as the Mom, and never goes 

This badly classified sentence is a good example of a sentence that will cause some problems with other better models. This IMDB comment gives pros and cons about the film so we have negative and positive description on the same comment.