In [1]:
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize

In [2]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [3]:
all_words = []
documents = []

In [4]:
#  j is adject, r is adverb, and v is verb
#allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"]

In [5]:
import pandas as pd
data = pd.read_csv('EqualTrainTest.csv')

In [6]:
for i, w in enumerate(data['Content']):
    
    if(data.at[i, 'Sentiment'] == 0):
        documents.append((w, "neg"))
        words = word_tokenize(w)
        pos = nltk.pos_tag(words)
        for w in pos:
            if w[1][0] in allowed_word_types:
                all_words.append(w[0].lower())
    else:
        documents.append((w, "pos"))
        words = word_tokenize(w)
        pos = nltk.pos_tag(words)
        for w in pos:
            if w[1][0] in allowed_word_types:
                all_words.append(w[0].lower())

In [7]:
save_documents = open("documents.pickle","wb")
pickle.dump(documents, save_documents)

In [8]:
all_words = nltk.FreqDist(all_words)

In [9]:
word_features = list(all_words.keys())[:5500]

In [10]:
save_word_features = open("word_features5.5k.pickle","wb")
pickle.dump(word_features, save_word_features)

In [11]:
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [12]:
#documents

In [13]:
featuresets = [(find_features(article), sentiment) for (article, sentiment) in documents]

random.shuffle(featuresets)
print(len(featuresets))

training_set = featuresets[:2500]
testing_set = featuresets[2500:]

3158


In [14]:
pos = 0
neg = 0

for i in training_set:
    temp = i[1]
    
    if(temp == 'pos'):
        pos += 1
    else:
        neg += 1

In [15]:
print(neg)

1254


In [16]:
print(neg)

1254


In [17]:
save_features = open("featuresets.pickle","wb")
pickle.dump(documents, save_features)

In [18]:
#featuresets

In [19]:
# Naive Bayes Classifier

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

save_classifier = open("originalnaivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)

Original Naive Bayes Algo accuracy percent: 81.0030395136778
Most Informative Features
                arrested = True              neg : pos    =     49.5 : 1.0
                  prison = True              neg : pos    =     32.0 : 1.0
              attractive = True              pos : neg    =     31.2 : 1.0
                    fake = True              neg : pos    =     30.1 : 1.0
                  arrest = True              neg : pos    =     27.4 : 1.0
                  issuer = True              pos : neg    =     26.5 : 1.0
              optimistic = True              pos : neg    =     24.7 : 1.0
              enthusiasm = True              pos : neg    =     21.1 : 1.0
                 digital = True              pos : neg    =     19.8 : 1.0
               2020-2021 = True              neg : pos    =     18.2 : 1.0
                detained = True              neg : pos    =     18.0 : 1.0
                 garbage = True              neg : pos    =     17.6 : 1.0
             

In [21]:
#Test
from pyroc import ROC

roc_data = ROC((label, naive_bayes.prob_classify(featuresets).prob(1)) for featuresets, label in documents)
roc_data.plot()

TypeError: __init__() missing 1 required positional argument: 'preds'

In [36]:
# Multinomial Naive Bayes

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

save_classifier = open("MNB_classifier.pickle","wb")
pickle.dump(MNB_classifier, save_classifier)

MNB_classifier accuracy percent: 84.65045592705167


In [37]:
# Bernoulli Naive Bayes

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

save_classifier = open("BernoulliNB_classifier.pickle","wb")
pickle.dump(BernoulliNB_classifier, save_classifier)

BernoulliNB_classifier accuracy percent: 84.34650455927051


In [38]:
# Logistic Regression

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

save_classifier = open("LogisticRegression_classifier.pickle","wb")
pickle.dump(LogisticRegression_classifier, save_classifier)

LogisticRegression_classifier accuracy percent: 89.8176291793313


In [39]:
# Linear Support Vector Classification

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

save_classifier = open("LinearSVC_classifier.pickle","wb")
pickle.dump(LinearSVC_classifier, save_classifier)

LinearSVC_classifier accuracy percent: 89.209726443769


In [40]:
# Stochastic Gradient Descent CLassifier

SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:",nltk.classify.accuracy(SGDC_classifier, testing_set)*100)

save_classifier = open("SGDC_classifier.pickle","wb")
pickle.dump(SGDC_classifier, save_classifier)

SGDClassifier accuracy percent: 89.51367781155015
