# Sentiment Analysis with NLTK

## Get the Data Source and Prepare the Training and Test Sets

In [1]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag, FreqDist
import random
import pickle

In [2]:
# read the labelled positive and negative text
def DataSources():
    """Read the labelled positive and negative corpora from disk.

    Returns:
        A ``(positiveData, negativeData)`` tuple holding the complete text of
        ``resources/positive.txt`` and ``resources/negative.txt``.

    Raises:
        OSError: if either file cannot be opened.
    """
    # errors='replace' substitutes undecodable bytes instead of raising.
    # The context managers make sure both handles are closed after reading.
    with open("resources/positive.txt", "r", encoding='utf-8', errors='replace') as pos_file:
        positiveData = pos_file.read()
    with open("resources/negative.txt", "r", encoding='utf-8', errors='replace') as neg_file:
        negativeData = neg_file.read()
    return positiveData, negativeData


In [3]:
# prepare the data
def PrepareData():
    """Build labelled feature sets from the positive and negative corpora.

    Each line of a corpus becomes one document labelled "pos" or "neg".
    Words whose POS tag starts with an allowed letter are lower-cased and
    counted; the 5000 most frequent of them form the feature vocabulary.

    Side effects:
        Pickles the document list to ``saved/documents.p`` and the
        vocabulary to ``saved/word_features5k.p``.

    Returns:
        A list of ``(features, category)`` pairs, where ``features`` is the
        dict produced by ``find_features`` and ``category`` is the label.
    """
    train_pos, train_neg = DataSources()
    documents = []
    all_words = []

    # First letter of the POS tag: J is adjective, R is adverb, V is verb.
    # Only adjectives are used; add "R" and/or "V" to widen the vocabulary.
    # A tuple (not a string) so that an empty tag prefix never matches.
    allowed_word_types = ("J",)

    # Positive documents are added first, then negative ones.
    for corpus, label in ((train_pos, "pos"), (train_neg, "neg")):
        for line in corpus.split('\n'):
            documents.append((line, label))
            for word, tag in pos_tag(word_tokenize(line)):
                if tag[:1] in allowed_word_types:
                    all_words.append(word.lower())

    with open("saved/documents.p", "wb") as save_documents:
        pickle.dump(documents, save_documents)

    # most_common() orders by descending frequency; slicing keys() would
    # only keep the first 5000 distinct words encountered.
    word_features = [word for word, _ in FreqDist(all_words).most_common(5000)]

    with open("saved/word_features5k.p", "wb") as save_word_features:
        pickle.dump(word_features, save_word_features)

    features = [(find_features(rev, word_features), category) for (rev, category) in documents]
    return features


def find_features(document, word_features):
    """Map each vocabulary word to whether it occurs in *document*.

    Args:
        document: Raw text; it is tokenized with ``word_tokenize``.
        word_features: Iterable of vocabulary words to look for.

    Returns:
        A dict with one key per word in ``word_features`` and a boolean
        value telling whether that exact token appears in the document
        (the comparison is case-sensitive).
    """
    # Tokenize once into a set so each lookup is O(1) instead of scanning
    # the token list again for every vocabulary word.
    words = set(word_tokenize(document))
    return {w: (w in words) for w in word_features}


def TestTrainData(train_size=10000):
    """Shuffle the prepared feature sets and split them into train/test.

    Args:
        train_size: Number of shuffled feature sets placed in the training
            set; everything after that goes to the testing set. Defaults to
            10000, the split this function always used.

    Returns:
        A ``(training_set, testing_set)`` tuple of lists. The testing set is
        empty when there are no more than ``train_size`` feature sets.
    """
    featuresets = PrepareData()
    # In-place shuffle so the pos/neg ordering from PrepareData is mixed.
    random.shuffle(featuresets)
    training_set = featuresets[:train_size]
    testing_set = featuresets[train_size:]
    return training_set, testing_set

## Train Classifiers and store them 

In [4]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from nltk import NaiveBayesClassifier, classify
# from gsu.data import TestTrainData
import pickle


def TrainClassifiers():
    """Train every classifier of the ensemble and pickle them to disk.

    Trains NLTK's NaiveBayesClassifier plus five scikit-learn estimators
    wrapped in ``SklearnClassifier`` on the training split returned by
    ``TestTrainData``, then hands them to ``SaveClassifiers``.

    Returns:
        The list of trained classifiers, in the same order as the names
        passed to ``SaveClassifiers``.
    """
    training_set, _ = TestTrainData()

    classifiers = [NaiveBayesClassifier.train(training_set)]
    classifier_name = ["NaiveBayesClassifier"]

    # (saved name, scikit-learn estimator) pairs, trained in this order.
    sklearn_models = [
        ("MultinomialNBClassifier", MultinomialNB()),
        ("BernoulliNBClassifier", BernoulliNB()),
        ("LogisticRegressionClassifier", LogisticRegression()),
        ("LinearSVCClassifier", LinearSVC()),
        ("SGDClassifier", SGDClassifier()),
    ]
    for name, estimator in sklearn_models:
        wrapped = SklearnClassifier(estimator)
        wrapped.train(training_set)
        # Each name is paired with the classifier that was just trained, so
        # the LinearSVC entry really holds the LinearSVC model.
        classifiers.append(wrapped)
        classifier_name.append(name)

    SaveClassifiers(classifiers, classifier_name)

    return classifiers


def SaveClassifiers(classifiers, classifier_name):
    """Pickle each classifier and the list of their names under ``saved/``.

    Args:
        classifiers: Sequence of objects to pickle.
        classifier_name: Sequence of names; ``classifier_name[i]`` is the
            file stem used for ``classifiers[i]``.

    Side effects:
        Writes ``saved/<name>.p`` for every classifier and
        ``saved/classifier_name.p`` holding the name list.

    Raises:
        IndexError: if there are fewer names than classifiers.
        OSError: if a file cannot be opened for writing.
    """
    for i, classifier in enumerate(classifiers):
        with open("saved/" + classifier_name[i] + ".p", "wb") as save_classifier_path:
            pickle.dump(classifier, save_classifier_path)

    # The name list is what LoadClassifiers reads to find the files above;
    # the context manager makes sure it is flushed and closed.
    with open("saved/classifier_name.p", "wb") as save_classifier_path:
        pickle.dump(classifier_name, save_classifier_path)

## Load the trained classifiers

In [5]:
import random
import pickle
from nltk.tokenize import word_tokenize
from nltk import classify



def find_features(document, word_features):
    """Map each vocabulary word to whether it occurs in *document*.

    Args:
        document: Raw text; it is tokenized with ``word_tokenize``.
        word_features: Iterable of vocabulary words to look for.

    Returns:
        A dict with one key per word in ``word_features`` and a boolean
        value telling whether that exact token appears in the document
        (the comparison is case-sensitive).
    """
    # Tokenize once into a set so each lookup is O(1) instead of scanning
    # the token list again for every vocabulary word.
    words = set(word_tokenize(document))
    return {w: (w in words) for w in word_features}


def LoadData(shuffle=False):
    """Rebuild the train/test feature sets from the pickled artefacts.

    Reads ``saved/documents.p`` and ``saved/word_features5k.p``, converts
    every document to a feature dict with ``find_features`` and splits the
    result at index 10000.

    Args:
        shuffle: When true, the feature sets are shuffled in place before
            the split; otherwise the stored document order is kept.

    Returns:
        A ``(training_set, testing_set)`` tuple: the first 10000 feature
        sets and everything after them.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that
    # this project wrote itself.
    with open("saved/documents.p", "rb") as documents_f:
        documents = pickle.load(documents_f)

    with open("saved/word_features5k.p", "rb") as features_f:
        word_features = pickle.load(features_f)

    features = [(find_features(rev, word_features), category) for (rev, category) in documents]

    if shuffle:
        random.shuffle(features)

    training_set = features[:10000]
    testing_set = features[10000:]

    return training_set, testing_set


def LoadClassifiers():
    """Load every pickled classifier listed in ``saved/classifier_name.p``.

    Returns:
        A list of unpickled classifiers, in the order their names appear in
        the stored name list.

    Raises:
        OSError: if the name list or any ``saved/<name>.p`` file is missing.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that
    # this project wrote itself.
    with open("saved/classifier_name.p", "rb") as names_file:
        classifier_name = pickle.load(names_file)

    classifiers = []
    for name in classifier_name:
        with open("saved/" + name + ".p", "rb") as classifier_file:
            classifiers.append(pickle.load(classifier_file))

    return classifiers


def LoadFeatures():
    """Load the pickled feature vocabulary.

    Returns:
        The object stored in ``saved/word_features5k.p``.

    Raises:
        OSError: if the file cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that
    # this project wrote itself.
    with open("saved/word_features5k.p", "rb") as documents_f:
        word_features = pickle.load(documents_f)
    return word_features

## Classifier Confidence

In [6]:
from nltk.classify import ClassifierI
from statistics import mode, StatisticsError


class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over a list of classifiers.

    Every wrapped classifier votes by calling its ``classify`` method. When
    there is no single most-voted label (a tie, or no classifiers at all),
    ``classify`` falls back to "neg" and ``confidence`` to 0.5.

    The tally is done explicitly rather than through ``statistics.mode``,
    because ``mode`` only raises ``StatisticsError`` on ties before
    Python 3.8, which made the fallback depend on the interpreter version.
    """

    def __init__(self, classifiers):
        """Store the classifiers whose votes are combined."""
        self._classifiers = classifiers

    def _majority(self, features):
        """Return ``(label, share)`` for the unique winner, else ``None``.

        ``share`` is the winner's number of votes divided by the total
        number of votes. ``None`` means no votes or a tie for first place.
        """
        votes = [c.classify(features) for c in self._classifiers]
        if not votes:
            return None

        tally = {}
        for vote in votes:
            tally[vote] = tally.get(vote, 0) + 1

        top = max(tally.values())
        leaders = [label for label, count in tally.items() if count == top]
        if len(leaders) != 1:
            return None
        return leaders[0], top / len(votes)

    def classify(self, features):
        """Return the label chosen by most classifiers ("neg" if undecided)."""
        outcome = self._majority(features)
        if outcome is None:
            return "neg"
        return outcome[0]

    def confidence(self, features):
        """Return the fraction of votes for the winner (0.5 if undecided)."""
        outcome = self._majority(features)
        if outcome is None:
            return 0.5
        return outcome[1]

## Sentiment Classifier : Positive or Negative

In [10]:
# # from gsu.train import TrainClassifiers
# from gsu.load import LoadClassifiers
# from gsu.load import LoadFeatures
# from gsu.data import find_features
# from gsu.VoteClassifier import VoteClassifier


class Sentiment:
    """Positive/negative sentiment analyser backed by a voting ensemble."""

    def __init__(self):
        """Train the classifiers and load the feature vocabulary."""
        # Training runs on every construction; calling LoadClassifiers()
        # here instead would reuse the previously pickled models.
        trained = TrainClassifiers()
        self.votedClassifier = VoteClassifier(trained)
        self.new_features = LoadFeatures()

    def Analyse(self, text):
        """Return ``(label, confidence)`` for *text*.

        The text is turned into a feature dict over the loaded vocabulary,
        then classified by the voting ensemble.
        """
        feature_map = find_features(text, self.new_features)
        voter = self.votedClassifier
        label = voter.classify(feature_map)
        certainty = voter.confidence(feature_map)
        return label, certainty

## Prediction for positive or negative sentiments

In [8]:
# suppress all Python warnings in this notebook (originally added to hide pickle warnings)
import warnings
warnings.filterwarnings('ignore')

In [9]:
# from gsu.Sentiment import Sentiment

# Build the analyser once; later cells reuse the global `s`.
s = Sentiment()

# Sample sentences, each printed as a (label, confidence) tuple.
samples = (
    "This movie was awesome! The acting was great, plot was wonderful!",
    "I am happy and awesome",
    "This movie was boring",
    "Evrything is bad. Movie sucks!",
    "Amazing movie, my kids love it",
)
for sample in samples:
    print(s.Analyse(sample))

FileNotFoundError: [Errno 2] No such file or directory: 'saved/classifier_name.p'

In [31]:
# Two-word negative phrase; the recorded output below is ('neg', 1.0).
print(s.Analyse("lousy acting"))

('neg', 1.0)


In [32]:
# Short phrase check; the recorded output below is ('neg', 1.0).
print(s.Analyse("go to hell"))

('neg', 1.0)
