In [2]:
import pandas as pd
import math
from string import punctuation
from datasets import load_dataset_builder
from datasets import load_dataset

from functools import reduce



  from .autonotebook import tqdm as notebook_tqdm


## The dataset

The following cells will execute the code given in the hugging face tutorial to explore the dataset
and answer the different questions around it

In [3]:
ds_builder = load_dataset_builder("rotten_tomatoes")

Downloading builder script: 100%|██████████| 5.03k/5.03k [00:00<00:00, 1.74MB/s]
Downloading metadata: 100%|██████████| 2.02k/2.02k [00:00<00:00, 1.28MB/s]
Downloading readme: 100%|██████████| 7.25k/7.25k [00:00<00:00, 4.31MB/s]


In [4]:
print(ds_builder.info.description)
ds_builder.info.features

Movie Review Dataset.
This is a dataset of containing 5,331 positive and 5,331 negative processed
sentences from Rotten Tomatoes movie reviews. This data was first used in Bo
Pang and Lillian Lee, ``Seeing stars: Exploiting class relationships for
sentiment categorization with respect to rating scales.'', Proceedings of the
ACL, 2005.



{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [13]:
dataset = load_dataset("rotten_tomatoes")

Downloading and preparing dataset rotten_tomatoes/default to /home/aeschylli/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46...


Downloading data: 100%|██████████| 488k/488k [00:00<00:00, 10.9MB/s]
                                                                                     

Dataset rotten_tomatoes downloaded and prepared to /home/aeschylli/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 349.90it/s]


In [14]:
pd.Series(dataset["validation"]["label"]).value_counts()

1    533
0    533
dtype: int64

## Naive Bayes classifier 
The next cells will focus on the Naive Bayes classifier.

First we will preprocces the dataset, lowering every character and removing unwanted ponctuation.


In [34]:
text_data = pd.Series(dataset["train"]["text"])

def clean_data(text_data):
    text_data = text_data.str.lower()
    #display(text_data[0])
    text_data = text_data.str.replace("[" + punctuation + "]( |$)", "", regex=True)
    #display(text_data[0])
    return text_data

text_data = clean_data(text_data=text_data)
data = pd.concat([text_data, pd.Series(dataset["train"]["label"])], axis=1)
data.columns=["text", "label"]
data

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,the gorgeously elaborate continuation of the l...,1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,emerges as something rare an issue movie that'...,1
...,...,...
8525,any enjoyment will be hinge from a personal th...,0
8526,if legendary shlockmeister ed wood had ever ma...,0
8527,hardly a nuanced portrait of a young woman's b...,0
8528,interminably bleak to say nothing of boring,0


In [12]:
def find_vocabulary(text_data):
    return reduce(lambda a,b: a | b, text_data.apply(lambda t: set(t.split(" "))))

find_vocabulary(data["text"])

{'',
 "lead's",
 'distancing',
 'cup',
 'studiously',
 'top-notch',
 "'truthabout",
 'merits',
 'frequent',
 'sorts',
 'non-disney',
 'off-putting',
 'double',
 'razzle-dazzle',
 'tripe',
 'sense',
 'distanced',
 'disclosure',
 'roisterous',
 'grotesque',
 'inexorably',
 'apallingly',
 'victor',
 'ambiguous',
 'well-worn',
 'delightfully',
 'yet',
 'welcomes',
 "nesbitt's",
 'gowns',
 'enjoyed',
 'thing',
 'fictionalize',
 'insipid',
 'popcorn',
 'no',
 'aberration',
 '-dull',
 'off',
 'ver',
 'burlap',
 'rodriguez',
 'situation',
 'teenagers',
 'produce',
 'inquisitiveness',
 'pollute',
 'unapologetically',
 'wattage',
 'van',
 'self-hating',
 'forrest',
 'sleeper',
 'specious',
 'predecesora',
 '7',
 'capitalize',
 'nudity',
 'punches',
 'lighting',
 'governs',
 'surgical',
 'divertida',
 'upends',
 'dulls',
 "'we're",
 'quasi-shakespearean',
 'analyze',
 'commercial',
 'overproduced',
 'stars',
 'one-of-a-kind',
 'dramedy',
 'insufferable',
 'combined',
 'disorienting',
 'impart',
 

In [7]:
def train(data, classes):
    V = find_vocabulary(data["text"])
    
    logprior = {}
    n_data = len(data)
    for c in classes:
        n_c = data["label"].value_counts()[c]
        logprior[c] = math.log(n_c/n_data)
        
        
        
        
train(data, [0, 1])

Next cells build the Naive Bayes Classifier using sklearn

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

estimator = [('Cv', CountVectorizer()), ('Bayes', MultinomialNB())]
pipe = Pipeline(estimator)
pipe.fit(data.text, data.label)

In [48]:
print(f"test : {pipe.score(clean_data(pd.Series(dataset['test']['text'])),dataset['test']['label'])}")
print(f"train : {pipe.score(clean_data(pd.Series(dataset['train']['text'])),dataset['train']['label'])}")


test : 0.797373358348968
train : 0.9327080890973036


**Most likely, the scikit-learn implementation will give better results. Looking at the documentation, explain why it could be the case.**

Sklearn seems better because it uses a Laplace smoothing parameter.

To test this hypotheis I run the same model but with no smoothing parameter

In [49]:
estimator = [('Cv', CountVectorizer()), ('Bayes', MultinomialNB(alpha=0, force_alpha=True))]
pipe = Pipeline(estimator)
pipe.fit(data.text, data.label)
print(f"test : {pipe.score(clean_data(pd.Series(dataset['test']['text'])),dataset['test']['label'])}")
print(f"train : {pipe.score(clean_data(pd.Series(dataset['train']['text'])),dataset['train']['label'])}")

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


test : 0.6951219512195121
train : 0.9626025791324736


The accuracy droped on test and was higher on train, the model seems to overfit.

The smoothing parameter is definitely important to get a better accuracy

**Why is accuracy a sufficient measure of evaluation here?**

It is a sufficient measure because the classes or equally split.

In [57]:
predicator = pipe.fit(clean_data(pd.Series(dataset['test']['text'])),dataset['test']['label'])
prediction = predicator.predict(clean_data(pd.Series(dataset['train']['text'])))

mask = prediction != dataset['train']['label']
# convert the mask to a pandas series
series = pd.Series(mask)

series.value_counts()

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


False    5198
True     3332
dtype: int64

In [61]:
pd.Series(prediction).value_counts()

0    6189
1    2341
dtype: int64

In [60]:
dataset['train']['label']
pd.Series(dataset['train']['label']).value_counts()


1    4265
0    4265
dtype: int64