#### Author : Dipjyoti Das (https://www.linkedin.com/in/dipjyotidas)

### This script is used to Train classifiers on the dataset and all the classifers are saved as pickle files.

### The pickled classfiers are used in the sentiment_analysis.py file.

###### Import all the libraries

In [3]:
import nltk
import random
#from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

from nltk.tokenize import word_tokenize


class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf
        
        
        
short_pos = open("data/positive.txt","r").read()
short_neg = open("data/negative.txt","r").read()


# using POS -parts of speech tag - allow only specific words
#pos - tuple- word, parts of speech

all_words = []
documents = []

#  j is adject, r is adverb, and v is verb
#allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"] # allowing only Adjectives

for p in short_pos.split('\n'):
    documents.append((p, "pos") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types: # w - tuple, not getting Nouns, commas
            all_words.append(w[0].lower())

            
for p in short_neg.split('\n'):
    documents.append((p, "neg"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())


# pickle and store documents
# pickled algos - folder created to store all the pickled objects :


save_documents = open("pickled_algos/documents.pickle", "wb")
pickle.dump(documents, save_documents)
save_documents.close()


all_words = nltk.FreqDist(all_words)


word_features = list(all_words.keys())[:5000]

# pickle and store word features
save_word_features = open("pickled_algos/word_features5k.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()


def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Pickle and store - featuresets :

save_featuresets = open("pickled_algos/featuresets.pickle", "wb")
pickle.dump(featuresets, save_featuresets)
save_featuresets.close()

random.shuffle(featuresets)
print(len(featuresets))


# Train and Test set:


testing_set = featuresets[10000:]
training_set = featuresets[:10000]


## List of Classifiers :

## Naive Bayes classifier:

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

## pickle and store - Naive Bayes classifier
save_classifier = open("pickled_algos/originalnaivebayes5k.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()


## MNB classifier :


MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)


# pickle and store MNB classifier:

save_classifier = open("pickled_algos/MNB_classifier5k.pickle","wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()

## BernoulliNB classifier:

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

#pickle and store BernoulliNB classifier:

save_classifier = open("pickled_algos/BernoulliNB_classifier5k.pickle","wb")
pickle.dump(BernoulliNB_classifier, save_classifier)
save_classifier.close()


## Logistic Regression classifier:


LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

#pickle and store Logistic Regression classifier:

save_classifier = open("pickled_algos/LogisticRegression_classifier5k.pickle","wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()


## LinearSVC classifier


LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)


# pickle and store LinearSVC classifier:

save_classifier = open("pickled_algos/LinearSVC_classifier5k.pickle","wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

## SGDC classifier:

SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:",nltk.classify.accuracy(SGDC_classifier, testing_set)*100)

# pickle and store SGDC classifier:

save_classifier = open("pickled_algos/SGDC_classifier5k.pickle","wb")
pickle.dump(SGDC_classifier, save_classifier)
save_classifier.close()


## Can't pickle the Voted Classifier - class of its own


10664
Original Naive Bayes Algo accuracy percent: 73.64457831325302
Most Informative Features
               wonderful = True              pos : neg    =     21.8 : 1.0
              engrossing = True              pos : neg    =     19.7 : 1.0
                 generic = True              neg : pos    =     16.9 : 1.0
                mediocre = True              neg : pos    =     16.9 : 1.0
               inventive = True              pos : neg    =     15.7 : 1.0
                 routine = True              neg : pos    =     14.9 : 1.0
                    flat = True              neg : pos    =     14.9 : 1.0
              refreshing = True              pos : neg    =     14.4 : 1.0
                  boring = True              neg : pos    =     13.8 : 1.0
                    warm = True              pos : neg    =     13.1 : 1.0
                intimate = True              pos : neg    =     11.7 : 1.0
               realistic = True              pos : neg    =     11.7 : 1.0
      



SGDClassifier accuracy percent: 71.83734939759037


We can run upto this cell one time. The sentiment analysis module uses the saved pickle objects and it also has the voting classfier and the sentiment function. The module saved is sentiment_analysis.py

#### After importing the sentiment analysis module :

### We can use this to check if any sentiment is positive or negative with the confidence level.

#### Examples :

In [1]:
import sentiment_analysis as s

# referencing the sentiment function of the sentiment_analysis.py script

# Example -  Pass through our own positive review
print(s.sentiment("This movie was awesome! The story was great and performances were amazing, I really liked it!"))

# Example - Pass through a negative review
print(s.sentiment("This movie was junk. No story at all and acting sucked. Horrible movie, 1/10"))



('pos', 1.0)
('neg', 1.0)


In [None]:
## Both are at 100% confidence level

#### This module can be used to perform live sentiment analysis from Twitter!