In [6]:
import nltk
from nltk.corpus import movie_reviews

In [7]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

#fileids= Return a list of file identifiers for the fileids that make up this corpus.
#categories= a list specifying the categories whose words have to be returned.

In [8]:
len(documents)

2000

In [10]:

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

In [11]:
#converting all words to features
word_features = list(all_words.keys()) #[:3000]

len(word_features)

39768

In [12]:
#https://pythonprogramming.net/words-as-features-nltk-tutorial/
#This function is returning a boolean of the word from document (corpus) is within the first 3000 words
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [13]:
#This line is going to build a dictionary with the words from each document in the corpus, whether it is 
#in the list of Features  (TRUE or FALSE) and the category of the review

#[({'plot': True,
#    ':': True,
#    'two': True,
# ...},
#   'neg'),
#  ...]
   
# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [14]:
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)

In [15]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 74.0


In [16]:
classifier.show_most_informative_features(15)

Most Informative Features
               ludicrous = True              neg : pos    =     13.9 : 1.0
                  avoids = True              pos : neg    =     13.7 : 1.0
              astounding = True              pos : neg    =     13.7 : 1.0
             fascination = True              pos : neg    =     12.2 : 1.0
                 idiotic = True              neg : pos    =     12.1 : 1.0
             outstanding = True              pos : neg    =     11.0 : 1.0
                  annual = True              pos : neg    =     10.7 : 1.0
               atrocious = True              neg : pos    =     10.5 : 1.0
                  hatred = True              pos : neg    =     10.0 : 1.0
                seamless = True              pos : neg    =     10.0 : 1.0
                   dread = True              pos : neg    =     10.0 : 1.0
               addresses = True              pos : neg    =     10.0 : 1.0
                   mulan = True              pos : neg    =     10.0 : 1.0

In [17]:
import pickle

save_classifier = open("pickled/SentimentAnalysisTrainClassifierNLTK.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [27]:
import pickle
classifier_f = open("pickled/SentimentAnalysisTrainClassifierNLTK.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

# classifier.show_most_informative_features(15)

In [24]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

In [19]:
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 74.0


In [22]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf


In [26]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

# LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
# LogisticRegression_classifier.train(training_set)
# print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

# SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
# SGDClassifier_classifier.train(training_set)
# print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

# LinearSVC_classifier = SklearnClassifier(LinearSVC())
# LinearSVC_classifier.train(training_set)
# print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

# NuSVC_classifier = SklearnClassifier(NuSVC())
# NuSVC_classifier.train(training_set)
# print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)


MNB_classifier accuracy percent: 77.0
BernoulliNB_classifier accuracy percent: 67.0


In [29]:
voted_classifier = VoteClassifier(classifier,
#                                   NuSVC_classifier,
#                                   LinearSVC_classifier,
#                                   SGDClassifier_classifier,
#                                   LogisticRegression_classifier,
                                  BernoulliNB_classifier,
                                  MNB_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)


voted_classifier accuracy percent: 74.0


In [30]:
print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)

Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 100.0
Classification: pos Confidence %: 100.0
Classification: pos Confidence %: 66.66666666666666
Classification: pos Confidence %: 100.0


In [None]:
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

def sentiment(text):
    feats = find_features(text)
    return voted_classifier.classify(feats),voted_classifier.confidence(feats)

In [None]:
print(sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"))