# Text Classification using  Naive Bayes
### Based on IMDB dataset

Source Annexe : https://web.stanford.edu/~jurafsky/slp3/slides/7_NB.pdf

In [26]:
import numpy as np
import pandas as pd
import string
from collections import Counter
from sklearn.model_selection import train_test_split
import glob #list .txt file

In [27]:
pos_list=glob.glob("./data/movie-reviews-en/train/pos/*.txt")
neg_list=glob.glob("./data/movie-reviews-en/train/neg/*.txt")

In [28]:
n_word = 10000    #Nb of words to keep in the vocabulary

pos_text = ''
#Creating Positive Vocabulary
for file in pos_list:
    f = open(file, "r")
    pos_text += f.read()
    f.close()


neg_text = ''
#Creating Negative Vocabulary
for file in neg_list:
    f = open(file, "r")
    neg_text += f.read() 
    f.close()

#Pre Processing and saving as dict to keep only n_word most common


### Small Preprocessing

Remove punctuation and line escape char '\n'

In [29]:
pos_text = pos_text.translate(str.maketrans('','', string.punctuation)).replace('\n','')
pos_count = dict(Counter(pos_text.split()).most_common(n_word))

neg_text = neg_text.translate(str.maketrans('','', string.punctuation)).replace('\n','')
neg_count = dict(Counter(neg_text.split()).most_common(n_word))

In [30]:
#Return the probabilty to have the input word knowing the class
#P(wi|Cj)
#P('nice'|Positive)

def proba_word(word,counter):    
    try: 
        # If the word is in our Vocabulary
        r = (counter[word]+1)/(sum(counter.values())+len(counter))
    except KeyError:
        # Else counter[word] = 0 
        r = (1)/(sum(counter.values())+len(counter))
    return(np.float64(r))

In [31]:
#Compute the sum of the log probabilities of each word in the input text
def proba_text(text,counter):
    probs=[]
    for word in text.split():
        probs.append(np.log(proba_word(word,counter)))
    return(np.sum(probs))

### Testing on the positive test set

In [32]:
pos_test_list=glob.glob("./data/movie-reviews-en/test/pos/*.txt")
pos_results=[]

for file in pos_test_list:
    f = open(file,"r")
    text = f.read()
    f.close()
    text = text.translate(str.maketrans('','', string.punctuation)).replace('\n','')
    probs = [proba_text(text,neg_count),proba_text(text,pos_count)]
    pos_results.append(np.argmax(probs))

In [33]:
f"Score : {np.sum(pos_results)/(len(pos_results))}"

'Score : 0.73'

### Testing on the negative test set

In [34]:
neg_test_list=glob.glob("./data/movie-reviews-en/test/neg/*.txt")
neg_results=[]

for file in neg_test_list:
    f = open(file,"r")
    text = f.read()
    f.close()
    text = text.translate(str.maketrans('','', string.punctuation)).replace('\n','')
    probs = [proba_text(text,neg_count),proba_text(text,pos_count)]
    neg_results.append(np.argmax(probs))

In [35]:
f"Score : {(len(neg_results)-np.sum(neg_results))/(len(neg_results))} "

'Score : 0.9 '