# Text Classification using  Naive Bayes
### Based on IMDB dataset

Source Annexe : https://web.stanford.edu/~jurafsky/slp3/slides/7_NB.pdf

In [1]:
import numpy as np
import pandas as pd
import string
from collections import Counter
from sklearn.model_selection import train_test_split
import glob

In [2]:
pos_list=glob.glob("./data/movie-reviews-en/train/pos/*.txt")
neg_list=glob.glob("./data/movie-reviews-en/train/neg/*.txt")

In [3]:
n_word = 10000    #Nb of words to keep in the vocabulary

In [4]:
# Function to get all text files as a list of string
def get_text_list(file_list):
    text_list = []
    
    for file in file_list:
        
        with open(file,'r') as f:
            text_list.append(f.read())
            
    return(text_list)
        
pos_text = ' '.join(get_text_list(pos_list))
neg_text = ' '.join(get_text_list(neg_list))

### Small Preprocessing

Remove punctuation and line escape char '\n'

Then we only keep the n_word most occuring word across all text

In [5]:
pos_text = pos_text.translate(str.maketrans('','', string.punctuation)).replace('\n','')
pos_count = dict(Counter(pos_text.split()).most_common(n_word))

neg_text = neg_text.translate(str.maketrans('','', string.punctuation)).replace('\n','')
neg_count = dict(Counter(neg_text.split()).most_common(n_word))

### Probabilty functions

In [6]:
#Return the probabilty to have the input word knowing the class
#P(wi|Cj)
#P('nice'|Positive)

def proba_word(word,counter):    
    
    try: 
        # If the word is in our Vocabulary
        r = (counter[word]+1)/(sum(counter.values())+len(counter))
        
    except KeyError:
        # Else counter[word] = 0 
        r = (1)/(sum(counter.values())+len(counter))
        
    return(np.float64(r))

In [7]:
#Compute the sum of the log probabilities of each word in the input text

def proba_text(text,counter):   
    
    probs=[]
    
    for word in text.split():
        probs.append(np.log(proba_word(word,counter)))
        
    return(np.sum(probs))

### Building our model

In [8]:
def NBmodel(text_array):
    
    predictions = []
    
    for text in text_array:
        
        text = text.translate(str.maketrans('','', string.punctuation)).replace('\n','')
        probs = [proba_text(text,neg_count),proba_text(text,pos_count)]
        predictions.append(np.argmax(probs))
        
    return(predictions)

### Testing on the test set

In [9]:
pos_test_list = glob.glob("./data/movie-reviews-en/test/pos/*.txt")
neg_test_list = glob.glob("./data/movie-reviews-en/test/neg/*.txt")

In [10]:
pos_test_text = get_text_list(pos_test_list)      
neg_test_text = get_text_list(neg_test_list)

In [11]:
pos_results = NBmodel(pos_test_text)
neg_results = NBmodel(neg_test_text)

In [12]:
f" Negative Accuracy : {(len(neg_results)-np.sum(neg_results))/(len(neg_results))} "

' Negative Accuracy : 0.9 '

In [13]:
f"Positive Accuracy : {np.sum(pos_results)/(len(pos_results))}"

'Positive Accuracy : 0.73'