In [30]:
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from random import shuffle 
import string
import re
from nltk.stem import WordNetLemmatizer 
from nltk import FreqDist
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
len(movie_reviews.fileids())

2000

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
def preprocess(allWords):
    # preprocess
    #remove stop words
    stopword = stopwords.words('english')
    all_words_without_stopwords = [word.lower() for word in allWords if word not in stopword]
    
    # preprocess
    #remove punctuation
    punctuations = '''!()-[]--'{}،;:'"\,<>./?@#$%^&*_~'''
    all_words_without_punctuation1 = [word for word in all_words_without_stopwords if word not in string.punctuation]
    all_words_without_punctuation = [word for word in all_words_without_punctuation1 if word not in punctuations]
    
    # preprocess
    #remove numbers
    all_words_without_numbers = []
    for word in all_words_without_punctuation:
        x = re.sub(r"\d+", "", word)
        if(x != ''):
            all_words_without_numbers.append(x)
            
    # preprocess
    #lemmatization
    lemmatizer = WordNetLemmatizer() 
    all_words_lemmatized = [lemmatizer.lemmatize(word) for word in all_words_without_numbers]
    
    return all_words_lemmatized

In [19]:
def classification(all_train_words):
    words = preprocess(all_train_words)
    words_frequency = FreqDist(words)
#     len(all_words_frequency)
#     print(all_words_frequency.most_common(len(all_words_frequency))[len(all_words_frequency)-10000])
#     print(all_words_frequency.most_common(len(all_words_frequency))[len(all_words_frequency)-20000])
#     print(all_words_frequency.most_common(len(all_words_frequency))[len(all_words_frequency)-25000])
#     print(all_words_frequency.most_common(len(all_words_frequency))[len(all_words_frequency)-27000])
#     print(all_words_frequency.most_common(len(all_words_frequency))[len(all_words_frequency)-30000])
#     print(all_words_frequency.most_common(len(all_words_frequency))[len(all_words_frequency)-31000])
    features = words_frequency.most_common(1500)
    feature_set = []
    labels = []
    for review,category in reviews:
        row = []
        for feature,freq in features:
            if feature in review:
                row.append(1)
            else:
                row.append(0)

        feature_set.append(row)
        labels.append(category)
    gnb = GaussianNB()
    gnb.fit(feature_set,labels)
    return gnb,feature_set,labels

In [20]:
reviews = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        reviews.append((preprocess(movie_reviews.words(fileid)), category))
shuffle(reviews)
all_train_words = []


In [32]:
for review,count in reviews[0:1600]:
    for word in review:
        all_train_words.append(word)
gnb,feature_set,labels = classification(all_train_words)
y_pred = gnb.predict(feature_set[1601:])
print(accuracy_score(labels[1601:], y_pred))
print(confusion_matrix(labels[1601:], y_pred))

0.8395989974937343
[[182  24]
 [ 40 153]]


In [33]:
rev = reviews[:1200]+reviews[1601:]
for review,count in rev:
    for word in review:
        all_train_words.append(word)
gnb,feature_set,labels = classification(all_train_words)
y_pred = gnb.predict(feature_set[1201:1600])
print(accuracy_score(labels[1201:1600], y_pred))
print(confusion_matrix(labels[1201:1600], y_pred))

0.8421052631578947
[[181  23]
 [ 40 155]]


In [31]:
rev = reviews[:800]+reviews[1201:]
for review,count in rev:
    for word in review:
        all_train_words.append(word)
gnb,feature_set,labels = classification(all_train_words)
y_pred = gnb.predict(feature_set[801:1200])
print(accuracy_score(labels[801:1200], y_pred))
print(confusion_matrix(labels[801:1200], y_pred))

0.8571428571428571
[[164  22]
 [ 35 178]]


In [34]:
rev = reviews[:400]+reviews[801:]
for review,count in rev:
    for word in review:
        all_train_words.append(word)
gnb,feature_set,labels = classification(all_train_words)
y_pred = gnb.predict(feature_set[401:800])
print(accuracy_score(labels[401:800], y_pred))
print(confusion_matrix(labels[401:800], y_pred))

0.8646616541353384
[[188  14]
 [ 40 157]]


In [35]:
for review,count in reviews[401:]:
    for word in review:
        all_train_words.append(word)
gnb,feature_set,labels = classification(all_train_words)
y_pred = gnb.predict(feature_set[0:400])
print(accuracy_score(labels[0:400], y_pred))
print(confusion_matrix(labels[0:400], y_pred))

0.8225
[[177  24]
 [ 47 152]]


In [36]:
print('accuracy = ',(0.8395989974937343+0.8421052631578947+0.8571428571428571+0.8646616541353384+0.8225)/5)

accuracy =  0.8452017543859649


In [37]:
print('F1 = ',(0.83+0.87+0.84+0.84+0.84)/5)

F1 =  0.844
