# Text Classification using  Naive Bayes
### Based on IMDB dataset

Source Annexe : https://web.stanford.edu/~jurafsky/slp3/slides/7_NB.pdf

In [1]:
import numpy as np
import pandas as pd
import string
from collections import Counter
from sklearn.model_selection import train_test_split
import glob

In [2]:
pos_list=glob.glob(pathname="./data/movie-reviews-en/train/pos/*.txt")
neg_list=glob.glob(pathname="./data/movie-reviews-en/train/neg/*.txt")

In [3]:
n_word = 10000    #Nb of words to keep in the vocabulary

In [4]:
# Function to get all text files as a list of string
def get_text_list(file_list):
    text_list = []
    
    for file in file_list:
        
        with open(file,'r') as f:
            text_list.append(f.read())
            
    return(text_list)
        
pos_text = ' '.join(get_text_list(pos_list))
neg_text = ' '.join(get_text_list(neg_list))

### Small Preprocessing

Remove punctuation and line escape char '\n'

Then we only keep the n_word most occuring word across all text

In [5]:
pos_text = pos_text.translate(str.maketrans('','', string.punctuation))
pos_text = pos_text.replace('\n','')
pos_count = dict(Counter(pos_text.split()).most_common(n_word))

neg_text = neg_text.translate(str.maketrans('','', string.punctuation))
neg_text = neg_text.replace('\n','')
neg_count = dict(Counter(neg_text.split()).most_common(n_word))

### Probabilty functions

In [6]:
#Return the probabilty to have the input word knowing the class
#P(wi|Cj)
#P('nice'|Positive)
# With Laplace Smoothing

def proba_word(word,counter):    
    
    try: 
        
        # If the word is in our Vocabulary
        r = (counter[word]+1)/(sum(counter.values())+len(counter))
        
    except KeyError:
        # Else counter[word] = 0 
        r = (1)/(sum(counter.values())+len(counter))
        
    return(np.float64(r))

In [7]:
#Compute the sum of the log probabilities of each word in the input text

def proba_text(text,counter):   
    
    probs=[]
    
    for word in text.split():
        probs.append(np.log(proba_word(word,counter)))
        
    return(np.sum(probs))

### Building our model

In [8]:
def NBmodel(text_array,counter):
    
    predictions = []
    neg_count,pos_count = counter
    
    for text in text_array:
        
        probs = [proba_text(text,neg_count),proba_text(text,pos_count)]
        predictions.append(np.argmax(probs))
        
    return(predictions)

### Testing on the test set

In [9]:
pos_test_list = glob.glob(pathname="./data/movie-reviews-en/test/pos/*.txt")
neg_test_list = glob.glob(pathname="./data/movie-reviews-en/test/neg/*.txt")

In [10]:
pos_test_text = get_text_list(pos_test_list)      
neg_test_text = get_text_list(neg_test_list)

In [11]:
for i in range(0,len(pos_test_text)):
    pos_test_text[i] = pos_test_text[i].translate(str.maketrans('','', string.punctuation))
    pos_test_text[i] = pos_test_text[i].replace('\n','')

for i in range(0,len(neg_test_text)):
    neg_test_text[i] = neg_test_text[i].translate(str.maketrans('','', string.punctuation))
    neg_test_text[i] = neg_test_text[i].replace('\n','')

In [12]:
pos_results = NBmodel(pos_test_text,[neg_count,pos_count])
neg_results = NBmodel(neg_test_text,[neg_count,pos_count])

In [13]:
f" Negative Accuracy : {(len(neg_results)-np.sum(neg_results))/(len(neg_results))} "

' Negative Accuracy : 0.9 '

In [14]:
f"Positive Accuracy : {np.sum(pos_results)/(len(pos_results))}"

'Positive Accuracy : 0.73'

### Now with a bit of Preprocessing

Removing stop words using ntlk package

In [15]:
import nltk
from nltk.corpus import stopwords
import re 
stopwords = set(stopwords.words('english'))

for word in stopwords:
    pos_text = re.sub(' '+ word+' ',' ',pos_text)
    neg_text = re.sub(' '+ word+' ',' ',neg_text)


In [16]:
len(pos_test_text),len(neg_test_text)

(100, 100)

In [17]:
pos_count = dict(Counter(pos_text.split()).most_common(n_word))
neg_count = dict(Counter(neg_text.split()).most_common(n_word))

In [18]:
for word in stopwords:
    for i in range(0,len(pos_test_text)):
        pos_test_text[i] = re.sub(' '+ word+' ',' ',pos_test_text[i])
        neg_test_text[i] = re.sub(' '+ word+' ',' ',neg_test_text[i])

In [19]:
pos_results = NBmodel(pos_test_text,[neg_count,pos_count])

In [22]:
f"Positive Accuracy : {np.sum(pos_results)/(len(pos_results))}"

'Positive Accuracy : 0.73'

In [20]:
neg_results = NBmodel(neg_test_text,[neg_count,pos_count])

In [21]:
f" Negative Accuracy : {(len(neg_results)-np.sum(neg_results))/(len(neg_results))} "

' Negative Accuracy : 0.89 '

In [23]:
from nltk.stem.snowball import SnowballStemmer

In [29]:
 print(SnowballStemmer("english").stem("generously"))

generous


In [31]:
pos_test_text[0]

'in 1912  ship set sail maiden voyage across atlantic america  ship built largest ship world   also build one luxurious   finally  built unsinkable unfortunately  get ticket voyage either  spent lifes savings get america start life anew  part upper class money spare  finally lucky enough full house poker match docks like jack dawson  jack dawson makes trip  happens right place right time  rose dewitt bukater  first class passenger  climbs railings aft ship thoughts jumping  thus started tale romance intrigue  tale death tragedy    movie tragic event took place great many years ago  even taken lightly bit historical trivia  movie titanic shows happened  maybe 100 degree accuracy  still shows realisticaly  titanic story backdrop story  serves admirably  brining forth interesting story although simple simple premise captivating  movie emotional simply  alone enough  story brought certain style makes much emotional much effective  movies forgotten quickly unfortunately something produced h

In [36]:
for w in pos_test_text[0].split():
    print(w)
    print(SnowballStemmer("english").stem(w))

in
in
1912
1912
ship
ship
set
set
sail
sail
maiden
maiden
voyage
voyag
across
across
atlantic
atlant
america
america
ship
ship
built
built
largest
largest
ship
ship
world
world
also
also
build
build
one
one
luxurious
luxuri
finally
final
built
built
unsinkable
unsink
unfortunately
unfortun
get
get
ticket
ticket
voyage
voyag
either
either
spent
spent
lifes
life
savings
save
get
get
america
america
start
start
life
life
anew
anew
part
part
upper
upper
class
class
money
money
spare
spare
finally
final
lucky
lucki
enough
enough
full
full
house
hous
poker
poker
match
match
docks
dock
like
like
jack
jack
dawson
dawson
jack
jack
dawson
dawson
makes
make
trip
trip
happens
happen
right
right
place
place
right
right
time
time
rose
rose
dewitt
dewitt
bukater
bukat
first
first
class
class
passenger
passeng
climbs
climb
railings
rail
aft
aft
ship
ship
thoughts
thought
jumping
jump
thus
thus
started
start
tale
tale
romance
romanc
intrigue
intrigu
tale
tale
death
death
tragedy
tragedi
movie
movi
trag