# 11. Text Classification

In [2]:
import nltk
import random
from nltk.corpus import movie_reviews

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

In [3]:
all_words.most_common(15)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595)]

In [4]:
all_words["stupid"]

253

In [5]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories() # ['neg', 'pos']
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

In [6]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]

In [7]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [8]:
find_features(movie_reviews.words('neg/cv000_29416.txt'))

{'plot': True,
 ':': True,
 'two': True,
 'teen': True,
 'couples': True,
 'go': True,
 'to': True,
 'a': True,
 'church': True,
 'party': True,
 ',': True,
 'drink': True,
 'and': True,
 'then': True,
 'drive': True,
 '.': True,
 'they': True,
 'get': True,
 'into': True,
 'an': True,
 'accident': True,
 'one': True,
 'of': True,
 'the': True,
 'guys': True,
 'dies': True,
 'but': True,
 'his': True,
 'girlfriend': True,
 'continues': True,
 'see': True,
 'him': True,
 'in': True,
 'her': True,
 'life': True,
 'has': True,
 'nightmares': True,
 'what': True,
 "'": True,
 's': True,
 'deal': True,
 '?': True,
 'watch': True,
 'movie': True,
 '"': True,
 'sorta': True,
 'find': True,
 'out': True,
 'critique': True,
 'mind': True,
 '-': True,
 'fuck': True,
 'for': True,
 'generation': True,
 'that': True,
 'touches': True,
 'on': True,
 'very': True,
 'cool': True,
 'idea': True,
 'presents': True,
 'it': True,
 'bad': True,
 'package': True,
 'which': True,
 'is': True,
 'makes': True

do it for all document

In [9]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

### Run Naive Bayes Classifier

In [10]:
training_set = featuresets[:1900]
testing_set = featuresets[1900:]
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 82.0


In [11]:
classifier.show_most_informative_features(15)

Most Informative Features
               atrocious = True              neg : pos    =     10.5 : 1.0
                   sucks = True              neg : pos    =     10.0 : 1.0
                 frances = True              pos : neg    =      8.8 : 1.0
             silverstone = True              neg : pos    =      7.8 : 1.0
           unimaginative = True              neg : pos    =      7.8 : 1.0
              schumacher = True              neg : pos    =      7.5 : 1.0
                  shoddy = True              neg : pos    =      7.1 : 1.0
              uninspired = True              neg : pos    =      6.9 : 1.0
                  regard = True              pos : neg    =      6.5 : 1.0
                 idiotic = True              neg : pos    =      6.5 : 1.0
                    mena = True              neg : pos    =      6.5 : 1.0
                 kidding = True              neg : pos    =      6.5 : 1.0
                  suvari = True              neg : pos    =      6.5 : 1.0

### Save python object with pickle

In [12]:
import pickle
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [13]:
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 82.0


### Using nltk with Sckitlearn

In [14]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

def run_scikitlearn_algo(scikitlearnAlgo, name):
    sklearnClassifier = SklearnClassifier(scikitlearnAlgo())
    sklearnClassifier.train(training_set)
    print(f'{name} accuracy percent: ',nltk.classify.accuracy(sklearnClassifier, testing_set))
    return sklearnClassifier

print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = run_scikitlearn_algo(MultinomialNB, 'MultinomialNB')
BernoulliNB_classifier = run_scikitlearn_algo(BernoulliNB, 'BernoulliNB')
LogisticRegression_classifier = run_scikitlearn_algo(LogisticRegression, 'LogisticRegression')
SGDClassifier_classifier = run_scikitlearn_algo(SGDClassifier, 'SGDClassifier')
SVC_classifier = run_scikitlearn_algo(SVC, 'SVC_classifier')
LinearSVC_classifier = run_scikitlearn_algo(LinearSVC, 'LinearSVC')
NuSVC_classifier = run_scikitlearn_algo(NuSVC, 'NuSVC')

Original Naive Bayes Algo accuracy percent: 82.0
Most Informative Features
               atrocious = True              neg : pos    =     10.5 : 1.0
                   sucks = True              neg : pos    =     10.0 : 1.0
                 frances = True              pos : neg    =      8.8 : 1.0
             silverstone = True              neg : pos    =      7.8 : 1.0
           unimaginative = True              neg : pos    =      7.8 : 1.0
              schumacher = True              neg : pos    =      7.5 : 1.0
                  shoddy = True              neg : pos    =      7.1 : 1.0
              uninspired = True              neg : pos    =      6.9 : 1.0
                  regard = True              pos : neg    =      6.5 : 1.0
                 idiotic = True              neg : pos    =      6.5 : 1.0
                    mena = True              neg : pos    =      6.5 : 1.0
                 kidding = True              neg : pos    =      6.5 : 1.0
                  suvari 



SGDClassifier accuracy percent:  0.85
SVC_classifier accuracy percent:  0.78
LinearSVC accuracy percent:  0.8
NuSVC accuracy percent:  0.85


### Vote for all the classifier

In [15]:
from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [16]:
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

voted_classifier accuracy percent: 87.0


In [17]:
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)

voted_classifier accuracy percent: 87.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 85.71428571428571
Classification: pos Confidence %: 71.42857142857143
Classification: neg Confidence %: 57.14285714285714
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
