#### Let's import the necessary libraries

In [None]:
import pandas as pd
import math

from multiset import *
from string import punctuation
from datasets import load_dataset_builder
from datasets import load_dataset

from functools import reduce



## The dataset

The following cells will execute the code given in the hugging face tutorial to explore the dataset
and answer the different questions around it

In [None]:
# Create the builder for the "rotten_tomatoes" dataset; its `.info` is inspected in the next cell.
ds_builder = load_dataset_builder("rotten_tomatoes")

In [None]:
# Print the dataset description, then show the feature schema as the cell output.
print(ds_builder.info.description)
ds_builder.info.features

In [None]:
# Load the dataset; the cells below index it by split name ("train", "validation", "test").
dataset = load_dataset("rotten_tomatoes")

In [None]:
# Class proportions for every split (rows: label, columns: split), so the
# 50/50 statement in the answers can be checked on all splits at once
# instead of on the validation split only.
pd.DataFrame(
    {
        split: pd.Series(dataset[split]["label"]).value_counts(normalize=True)
        for split in dataset.keys()
    }
)

## The dataset

**1. How many splits does the dataset have? (1 point)**

There are 3 splits in the dataset: train, validation and test.

**2. How big are these splits? (1 point)**

The number of rows in each split is the following:
- train : 8530
- validation : 1066
- test : 1066

Each row contains a sentence and a label.

**3. What is the proportion of each class on the supervised splits? (1 point)**

The proportion of each class (neg and pos reviews) is 50 % each in every split.

## Naive Bayes classifier


## Naive Bayes classifier 
The next cells will focus on the Naive Bayes classifier.

### 1. Take a look at the data and create an adapted preprocessing function with at least
First we will preprocess the dataset, lowercasing every character and removing unwanted punctuation.


In [None]:
# Let's pre-process the input by removing the punctuation and lowercasing it

# We also transfer it into pandas Dataframe format to ease handling 
all_data = {}

def clean_data(text_data):
    """Lowercase the text and strip punctuation that ends a token.

    A punctuation character is removed, together with the single space that
    follows it, when it is followed by a space or by the end of the string.
    Punctuation inside a token (e.g. the apostrophe in "it's") is kept.

    Args:
        text_data: pandas Series of strings.

    Returns:
        A new pandas Series with the cleaned strings.
    """
    # Local import: `re` is not imported at the top of the notebook.
    import re

    # Escape the punctuation before putting it in a character class. Unescaped,
    # the sequence `\]` in `string.punctuation` is read as an escaped bracket,
    # so the backslash itself was never part of the class.
    trailing_punctuation = "[" + re.escape(punctuation) + "]( |$)"

    lowered = text_data.str.lower()
    return lowered.str.replace(trailing_punctuation, "", regex=True)

# Clean every split and store it as a DataFrame with "text" and "label" columns.
# The loop variable is called `split` so the builtin `type` is not shadowed
# for the rest of the kernel session.
for split in dataset.keys():
    all_data[split] = pd.DataFrame(
        {
            "text": clean_data(text_data=pd.Series(dataset[split]["text"])),
            "label": pd.Series(dataset[split]["label"]),
        }
    )

# The training split is the frame the classifiers below are fitted on.
data = all_data["train"]
data

### 2. Implement your own naive Bayes classifier from scratch. The pseudo code can be found in the slides or the book reference.
We tokenize words with a multiset data structure (bag-of-words method).

In [None]:
def find_vocabulary(text_data: pd.DataFrame):
    """Build one bag of words per label.

    Each "text" entry is split on single spaces and turned into a Multiset;
    the multisets are then added together within each "label" group. The input
    frame is not modified.

    Returns:
        A Series indexed by label whose values are the summed Multisets.
    """
    bags = text_data.assign(
        text=text_data["text"].map(lambda sentence: Multiset(sentence.split(" ")))
    )
    per_label = bags.groupby(by="label")["text"]
    return per_label.sum()

# Per-label bags of words for the training split, and their sum over all labels.
V_c = find_vocabulary(data)
V = V_c.sum()

# Labels used by the classifier (0 is printed as "negative" and 1 as "positive" further down).
C = [0, 1]

We implement the Train Naive Bayes function described in the lecture

In [None]:
def train(data: pd.DataFrame, classes: list):
    """Train a multinomial Naive Bayes classifier with add-one smoothing.

    Args:
        data: frame with a "text" column (space-separated tokens) and a
            "label" column.
        classes: labels to train for; each must occur in ``data["label"]``.

    Returns:
        (logprior, logLikelihood): ``logprior`` maps each class to log P(c);
        ``logLikelihood`` is a DataFrame indexed by vocabulary word with one
        column per class holding log P(w | c).

    Raises:
        KeyError: if a class in ``classes`` has no row in ``data``.
    """
    V_c = find_vocabulary(data)  # called bigdoc in pseudo code

    # Distinct words only: iterating a Multiset repeats each word once per
    # occurrence, which would recompute the same likelihood many times.
    vocabulary = sorted(set(V_c.sum()))
    vocabulary_size = len(vocabulary)

    # Loop invariants, computed once.
    class_counts = data["label"].value_counts()
    n_data = len(data)

    logprior = {}
    columns = {}
    for c in classes:
        logprior[c] = math.log(class_counts[c] / n_data)

        class_bag = V_c[c]
        # Add-one smoothing: the denominator is the number of tokens seen in
        # class c (len of a Multiset counts multiplicities) plus |V|. Using the
        # token count of all classes here would under-estimate every P(w | c).
        denominator = len(class_bag) + vocabulary_size

        columns[c] = [
            math.log((class_bag[w] + 1) / denominator) for w in vocabulary
        ]

    logLikelihood = pd.DataFrame(columns, index=pd.Index(vocabulary), columns=classes)
    return logprior, logLikelihood
            


# Fit the from-scratch model on the training split for the labels in C.
logprior, logLikelihood = train(data, C)

In [None]:
# Trained parameters bundled in the positional order expected by predict(),
# predict_dataset() and evaluate(); unpacked with *predict_args below.
predict_args = (logprior, logLikelihood, C, V)

#Implements Test Naive Bayes function
def predict(doc, logprior, logLikelihood, C, V):
    """Return the class of C with the highest Naive Bayes score for `doc`.

    `doc` is split on single spaces. Tokens that are not in V are ignored;
    for the others, logLikelihood.loc[token, class] is added to the class
    score, which starts at logprior[class]. Ties go to the class that comes
    first in C.
    """
    scores = {label: logprior[label] for label in C}

    for token in doc.split(" "):
        if token not in V:
            continue
        # Same per-class accumulation order as summing token by token.
        for label in scores:
            scores[label] += logLikelihood.loc[token, label]

    return max(scores, key=scores.get)

# Applies prediction to a whole df
def predict_dataset(data, logprior, logLikelihood, C, V):
    """Add a boolean "results" column to `data`, in place.

    The column is True on rows where predict() applied to the "text" value
    equals the "label" value. Nothing is returned.
    """
    predicted = data["text"].map(
        lambda text: predict(text, logprior, logLikelihood, C, V)
    )
    data["results"] = predicted.eq(data["label"])

# Print the percent of right answer
def evaluate(data, name, logprior, logLikelihood, C, V):
    """Print the share of correct predictions stored in data["results"].

    Args:
        data: frame with a boolean "results" column (see predict_dataset).
        name: label used in the printed message.
        logprior, logLikelihood, C, V: unused; kept so the function can be
            called with the same *predict_args as predict_dataset.
    """
    # The mean of a boolean column is the fraction of True values. Unlike
    # value_counts()[True], it does not raise KeyError when nothing is correct.
    accuracy = data["results"].mean()
    print(f"results on {name} dataset : ", accuracy)

# Predicts and evaluate on all our dataset
def accuracyScratch():
    """Run the from-scratch classifier on every split and print its accuracy.

    Reads the notebook globals `dataset`, `all_data` and `predict_args`; each
    frame in `all_data` gets a "results" column as a side effect.
    """
    for split_name in dataset.keys():
        split_frame = all_data[split_name]
        predict_dataset(split_frame, *predict_args)
        evaluate(split_frame, split_name, *predict_args)

### 3. Implement a naive Bayes classifier using scikit-learn.
Next cells build the Naive Bayes Classifier using sklearn

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words counts feeding a multinomial Naive Bayes model, fitted on the training split.
estimator = [
    ("Cv", CountVectorizer()),
    ("Bayes", MultinomialNB()),
]
pipe = Pipeline(steps=estimator)
pipe.fit(data["text"], data["label"])

### 4. Report the accuracy on both training and test set, for both your implementation and the scikit-learn one.

In [None]:
# Accuracy of the from-scratch model, then of the scikit-learn pipeline, on every split.
print('Accuracy of our own model : \n')
accuracyScratch()
print('\nAccuracy of scikit model : \n')

# `split` rather than `type`, so the builtin is not shadowed at kernel scope.
for split in dataset.keys():
    print(f"{split} : {pipe.score(all_data[split]['text'], all_data[split]['label'])}")


### 5. Most likely, the scikit-learn implementation will give better results. Looking at the documentation, explain why it could be the case.

Sklearn seems better because it uses an auto Laplace smoothing parameter, to better fit the data.

To test this hypothesis I run the same model but with no smoothing parameter

In [None]:
# Same pipeline with smoothing disabled (alpha=0 is enforced through force_alpha=True).
estimator = [('Cv', CountVectorizer()), ('Bayes', MultinomialNB(alpha=0, force_alpha=True))]
pipeNoalpha = Pipeline(estimator)
pipeNoalpha.fit(data.text, data.label)

# `split` rather than `type`, so the builtin is not shadowed at kernel scope.
for split in dataset.keys():
    print(f"{split} : {pipeNoalpha.score(all_data[split]['text'], all_data[split]['label'])}")


The accuracy dropped on test and was higher on train, so the model seems to overfit.

The smoothing parameter is definitely important to get a better accuracy, but we should have fixed its value and not removed it completely.

### 6. Why is accuracy a sufficient measure of evaluation here?

It is a sufficient measure because the classes are equally split.

### 7. Using one of the implementation, take at least 2 wrongly classified example from the test set and try explaining why the model failed.

In [None]:
# Flag the reviews of the train split that the scikit-learn pipeline gets wrong.
# NOTE(review): the question above asks about the test set, but this cell
# works on the train split; confirm which one is intended.
prediction = pipe.predict(all_data["train"]["text"])

mask = all_data["train"]["label"] != prediction
# Explicit Series conversion, keeping the name `series` available to other cells.
series = pd.Series(mask)

# False = correctly classified, True = misclassified.
series.value_counts()

### 7. Using one of the implementation, take at least 2 wrongly classified example from the test set and try explaining why the model failed.

In [None]:
# The question asks for misclassified examples from the test set, so the
# errors are selected from the test split here (self-contained: the mask is
# recomputed instead of reusing one built on another split).
# NOTE(review): the interpretation written below this cell should be
# re-checked against the examples printed here.
test_split = all_data["test"]
wrong = test_split.loc[pipe.predict(test_split["text"]) != test_split["label"]]

def printWrong(row : pd.Series):
    """Print the text and the true label of one misclassified review."""
    print(f"We were wrong on  : {row['text']}\nReal label : {'positive' if row['label'] else 'negative'}")

wrong.head().apply(printWrong, axis=1)
# Trailing None hides the Series of None values returned by apply().
None


The model seems to misclassify data where negative words are used not to describe the movie itself but other elements around it, like the movies of the past few years.

### 8. [Bonus] What are the top 10 most important words (features) for each class?

In [None]:
# For each class, rank the vocabulary by log-likelihood and show the 10 highest values.
for c in [0, 1]:
    print(f"Most impactfull words in class {c} :")
    ranked_words = logLikelihood.sort_values(by=c, ascending=False)
    display(ranked_words[c].head(10))
    print("\n")

In [None]:
# Download the NLTK English stopword list and build the `stops` set from it.
import ssl
import nltk

# NOTE(review): when the private helper exists, this replaces the default HTTPS
# context factory with the unverified one, which turns off TLS certificate
# verification for every default HTTPS context created afterwards in this
# kernel, not only for the download below. Presumably a workaround for local
# certificate errors; confirm it is still needed.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))
# The empty string is also treated as a stopword (it is added to the set here).
stops.add('')
print(stops)


In [None]:

# Same ranking as before, after dropping every word that is in `stops`.
for c in [0, 1]:
    print(f"Most impactfull words in class {c} (exepting stopwords):")
    most_impact = logLikelihood.sort_values(by=c, ascending=False)[c]
    is_stopword = most_impact.index.isin(stops)
    display(most_impact[~is_stopword].head(10))
    print("\n")


Now let's look at scikit's log_probs

In [None]:

# One row per vocabulary word, one column per class, holding the fitted feature_log_prob_ values.
bayes_step = pipe["Bayes"]
names = pd.Series(pipe["Cv"].get_feature_names_out(), name="name")
scikit_log_prob = pd.DataFrame(
    bayes_step.feature_log_prob_.T,
    index=names,
    columns=bayes_step.classes_,
)

for c in bayes_step.classes_:
    print(f"Most impactfull words in class {c} in scikit learn :")
    most_impact = scikit_log_prob.sort_values(by=c, ascending=False)[c]
    display(most_impact.head(10))

    print("And without stop words :")

    not_stopword = ~most_impact.index.isin(stops)
    display(most_impact.loc[not_stopword].head(10))

### 9. [Bonus] Play with scikit-learn's version parameters. For example, see if you can consider unigram and bigram instead of only unigrams.

In [None]:
# Same pipeline, with the vectorizer counting unigrams and bigrams.
estimator = [
    ("Cv", CountVectorizer(ngram_range=(1, 2))),
    ("Bayes", MultinomialNB()),
]
pipeBonus = Pipeline(steps=estimator)
pipeBonus.fit(data["text"], data["label"])

In [None]:
# Accuracy of the unigram+bigram pipeline on every split.
# `split` rather than `type`, so the builtin is not shadowed at kernel scope.
for split in dataset.keys():
    print(f"{split} : {pipeBonus.score(all_data[split]['text'], all_data[split]['label'])}")


### We tried multiple values for the ngram-range parameter, like only bigrams or unigrams to trigrams, and this one seems to be the best. But the accuracy does not increase much

### We also tried adding the stop words, but it doesn't increase the accuracy

# **Stemming**

In [None]:
# Set-up for stemming: NLTK tokenizer data, a word-only token filter and an English Snowball stemmer.
import nltk
import ssl
import re

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize 

# NOTE(review): same TLS workaround as in the stopwords cell. It turns off
# certificate verification for default HTTPS contexts created afterwards in
# this kernel; confirm it is still needed.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# We need to download a package for word tokenization
nltk.download('punkt')


# Matches tokens made only of word characters (\w), i.e. no punctuation inside the token.
re_word = re.compile(r"^\w+$")
stemmer = SnowballStemmer("english")

Running stemmer to preprocess the data

In [None]:
all_data_stemmed = {}


def stem_sentence(sentence):
    """Lowercase and tokenize a sentence, keep word-only tokens and stem each of them.

    Returns the stemmed tokens joined by single spaces.
    """
    tokens = word_tokenize(sentence.lower())
    return " ".join(stemmer.stem(token) for token in tokens if re_word.match(token))


# Build one stemmed DataFrame ("text", "label") per split from the raw dataset.
# `split` rather than `type`, so the builtin is not shadowed at kernel scope.
for split in dataset.keys():
    all_data_stemmed[split] = pd.DataFrame(
        {
            "text": pd.Series(dataset[split]["text"]).apply(stem_sentence),
            "label": pd.Series(dataset[split]["label"]),
        }
    )

data_stemmed = all_data_stemmed["train"]
data_stemmed

Retrain model on stemmed data

In [None]:
# NOTE(review): this refits `pipe` on the unstemmed `data` frame; the stemmed
# training frame built above is `data_stemmed`. Confirm which one is intended.
pipe.fit(data.text, data.label)

Evaluate new model :

In [None]:
print('\nAccuracy of scikit model with stemming: \n')

# Fit a dedicated pipeline on the stemmed training text and score it on the
# stemmed splits, so that training and evaluation both really use stemming.
# The original `pipe` is left untouched.
pipe_stemmed = Pipeline([('Cv', CountVectorizer()), ('Bayes', MultinomialNB())])
pipe_stemmed.fit(data_stemmed["text"], data_stemmed["label"])

for split in dataset.keys():
    stemmed_split = all_data_stemmed[split]
    print(f"{split} : {pipe_stemmed.score(stemmed_split['text'], stemmed_split['label'])}")

The accuracy seems to go down when using stemming. It may be caused by the quality of the stemmer or by the nature of the reviews, which don't really benefit from stemming.