# CREATING CLEAN DATA

In [2]:
import nltk
import numpy as np

In [3]:
#Tokenization using regular expression
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"\w+")

#Stopword Removal
from nltk.corpus import stopwords
st= set(stopwords.words('english'))

#Stemming
from nltk.stem.snowball import SnowballStemmer
ss = SnowballStemmer('english')



In [4]:
def get_stem_data(review):
    review=review.lower()
    review=review.replace("<br /><br />"," ")
    
    #Tokenize
    tokens = tokenizer.tokenize(review)
    n_token = [w for w in tokens if w not in st]
    stemm_token= [ss.stem(token) for token in n_token]
    
    cleaned_review=' '.join(stemm_token)
    return cleaned_review

In [None]:
out = open('Cleaned_file.txt','w',encoding="utf8")

with open("imdb_trainX.txt",encoding="ISO-8859-1") as f:
    reviews = f.readlines()

for review in reviews:
    cleaned_review = get_stem_data(review)
    print((cleaned_review),file=out)

out.close()

# From scratch

 # Buiding Multinomial Naive Bayes Classifier

In [5]:
#Creating A dictionary to store word count
from nltk.tokenize import word_tokenize
def dic_rating(x):
    a={}
    for i in range(len(x)):
        for j in word_tokenize(x[i]):
            try:
                a[j]+=1
            except:
                a[j]=1
    return a
        

In [6]:
def prior_prob(y_train,rating):
    total_example=len(y_train)
    class_example = np.sum(y_train==rating)
    
    return float((class_example)/(total_example))

In [7]:
def cond_prob(x_train,y_train,word,rating):
    x_filter= x_train[y_train==rating]
    
    a=dic_rating(x_filter)
    try:
        numerator = a[word] + 1
    except:
        numerator = 1 
    s=0
    for i in a:
        s+=a[i]
    denominator = s+ len(a)
    return  float(numerator/(denominator))

In [8]:
def predict(x_train,y_train,x_test):
    
    post_prob=[]
    
    
    for rating in np.unique(y_train):
        likehood=1
        
        for word in word_tokenize(x_test):
            cond= cond_prob(x_train,y_train,word,rating)
            likehood*=cond
       
        prior= prior_prob(y_train,rating)
        prob= float(likehood*prior)
        post_prob.append(prob)
     
    pred = np.argmax(post_prob)
    return pred
        

# Creating training and test dataset in numpy array

In [10]:
with open("Cleaned_file.text",encoding="ISO-8859-1") as f:
    reviews = f.readlines()
x_train=[]
for review in reviews:
    x_train.append(review)

In [11]:
with open("imdb_trainY.txt",encoding="utf8") as m:
    ratings= m.readlines()
y_train=[]
for rating in ratings:
    y_train.append(int(rating))

In [12]:
y_train=np.array(y_train)
x_train=np.array(x_train)

In [13]:
x_test=[]
with open("imdb_testX.txt",encoding="ISO-8859-1") as b:
    reviews = b.readlines()

for review in reviews:
    x_test.append(get_stem_data(review))

y_test=[]
with open("imdb_testY.txt",encoding="ISO-8859-1") as h:
    reviews = h.readlines()

for rating in ratings:
    y_test.append(int(rating))
y_test=np.array(y_test)

In [None]:
for i in x_test:
    predicted_array=[]
    predicted_array.append(predict(x_train,y_train,str(i)))

# Using Sklearn 

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
cv = CountVectorizer()

In [16]:
x_vec = cv.fit_transform(x_train[:5000]).toarray()

In [21]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB, GaussianNB

In [18]:
mb = MultinomialNB()
mb.fit(x_vec,y_train[:5000])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
xt_vec = cv.transform(x_test[:5000]).toarray()

In [20]:
mb.score(xt_vec,y_test[:5000])

0.33000000000000002

# Multivariate Bernoulli Event Model Naive Bayes

In [22]:

bnb = BernoulliNB(binarize=0.0)

In [31]:

bnb.fit(x_vec,y_train[:5000])
y_predict=bnb.predict(xt_vec)
print(y_predict)

[10 10  7 ..., 10 10 10]


In [25]:
bnb.score(xt_vec,y_test[:5000])

0.35299999999999998

In [26]:
bnb.predict(xt_vec)

array([10, 10,  7, ..., 10, 10, 10])

# Confusion Matrix

In [30]:
from sklearn.metrics import confusion_matrix

In [32]:
cnf_matrix = confusion_matrix(y_train[:5000],y_predict)

In [33]:
print(cnf_matrix)

[[  89  158   13  734]
 [  96  136   13  932]
 [  63  112    6  674]
 [ 160  261   19 1534]]
