> Notes: [natural language processing](https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/)

In [1]:
# Corpus - Body of text. Corpora - plural.
# Lexicon - Words and their meanings.
# Token - Each "entity" that is a part of whatever was split up based on rules. 

In [2]:
# sentence and word tokens

from nltk.tokenize import sent_tokenize, word_tokenize

def get_tokens():
    example_text = "Hello Mr. Smith, how are you doing today? The weather is great,\
    and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."
    print('sentence tokens: {}'.format(sent_tokenize(example_text)))
    print('\nword tokens: {}'.format(word_tokenize(example_text)))

get_tokens()

sentence tokens: ['Hello Mr. Smith, how are you doing today?', 'The weather is great,    and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]

word tokens: ['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


In [3]:
# word tokens without stopwords

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def get_tokens_sans_stopwords():
    example_sent = "This is a sample sentence, showing off the stop words filtration."
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(example_sent)
    filtered_word_tokens = [w for w in word_tokens if not w in stop_words]
    print('word tokens: {}'.format(word_tokens))
    print('\nfiltered word tokens: {}'.format(filtered_word_tokens))

get_tokens_sans_stopwords()

word tokens: ['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']

filtered word tokens: ['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [4]:
# word stemming

# Many variations of words carry the same meaning, other than when tense is involved.
# Stemming helps to shorten the lookup, and normalize sentences.
# Consider:
# I was taking a ride in the car.
# I was riding in the car.
# This sentence means the same thing. in the car is the same. I was is the same. 
# the ing denotes a clear past-tense in both cases, so is it truly necessary to differentiate 
# between ride and riding, in the case of just trying to figure out the meaning of what this 
# past-tense activity was?
# No, not really.
# Imagine every word in the English language, every possible tense and affix you can put on 
# a word. Having individual dictionary entries per version would be highly redundant and 
# inefficient, especially since, once we convert to numbers, the "value" is going to be 
# identical.

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

def get_stem_words():
    ps = PorterStemmer()
    example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
    print('example words: {}'.format(example_words))
    print('\nstemmed example words:')
    for w in example_words:
        print(ps.stem(w))
        
    new_text = "It is important to by very pythonly while you are pythoning with python. \
                All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    print('\nnew_text word tokens: {}'.format(words))
    print('\nstemmed new_text word tokens:')
    for w in words:
        print(ps.stem(w))

get_stem_words()

example words: ['python', 'pythoner', 'pythoning', 'pythoned', 'pythonly']

stemmed example words:
python
python
python
python
pythonli

new_text word tokens: ['It', 'is', 'important', 'to', 'by', 'very', 'pythonly', 'while', 'you', 'are', 'pythoning', 'with', 'python', '.', 'All', 'pythoners', 'have', 'pythoned', 'poorly', 'at', 'least', 'once', '.']

stemmed new_text word tokens:
It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
All
python
have
python
poorli
at
least
onc
.


In [5]:
# part of speech (pos) tagging

"""
POS tag list:
CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent's
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-abverb	where, when
"""

# PunktSentenceTokenizer is capable of unsupervised machine learning, so you can actually 
# train it on any body of text that you use. 

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize


def get_pos_tags():
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = state_union.raw("2006-GWBush.txt")
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text) # train
    tokenized = custom_sent_tokenizer.tokenize(sample_text) # test
    try:
        for i in tokenized[:5]:
            words = word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))

get_pos_tags()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

In [6]:
# chunking - group words into chunks based on some reg-ex
# for reg-ex: https://pythonprogramming.net/regular-expressions-regex-tutorial-python-3/
# noun-phrases: group nouns and related words (verbs, adverbs, adjectives)

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

def get_chunks():
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = state_union.raw("2006-GWBush.txt")
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    try:
        for idx, i in enumerate(tokenized[:5]):
            print(idx)
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # 0 or more adverbs, verbs, nouns
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)           
            print('chunked: {}'.format(chunked))
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print('subtree: {}'.format(subtree))
            # doesn't work in jupyter
            #chunked.draw()
    except Exception as e:
        print(str(e))

get_chunks()

0
chunked: (S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
subtree: (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
subtree: (Chunk ADDRESS/NNP)
subtree: (Chunk A/NNP JOINT/NNP SESSION/NNP)
subtree: (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
subtree: (Chunk THE/NNP UNION/NNP January/NNP)
subtree: (Chunk THE/NNP PRESIDENT/NNP)
subtree: (Chunk Thank/NNP)
1
chunked: (S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/

In [7]:
# chinking - removing a chunk from a chunk
# chink - chunk removed from a chunk

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

def chink_chunk():
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = state_union.raw("2006-GWBush.txt")
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    try:
        for idx, i in enumerate(tokenized[:5]):
            print(idx)
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # expression b/w } { will be chinked (removed) from the chunk
            # verbs, prepositions, determiners, or the word 'to' 
            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print('chunked: {}'.format(chunked))
            #chunked.draw()
    except Exception as e:
        print(str(e))

chink_chunk()

0
chunked: (S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk
    THE/NNP
    UNION/NNP
    January/NNP
    31/CD
    ,/,
    2006/CD
    THE/NNP
    PRESIDENT/NNP
    :/:
    Thank/NNP
    you/PRP)
  all/DT
  (Chunk ./.))
1
chunked: (S
  (Chunk
    Mr./NNP
    Speaker/NNP
    ,/,
    Vice/NNP
    President/NNP
    Cheney/NNP
    ,/,
    members/NNS)
  of/IN
  (Chunk Congress/NNP ,/, members/NNS)
  of/IN
  the/DT
  (Chunk
    Supreme/NNP
    Court/NNP
    and/CC
    diplomatic/JJ
    corps/NN
    ,/,
    distinguished/JJ
    guests/NNS
    ,/,
    and/CC
    fellow/JJ
    citizens/NNS
    :/:)
  Today/VB
  (Chunk our/PRP$ nation/NN)
  lost/VBD
  a/DT
  beloved/VBN
  (Chunk ,/, graceful/JJ ,/, courageous/JJ woman/NN who/WP)
  called/VBD
  (Chunk America/NNP)
  to/TO
  (Chunk its/PRP$ founding/NN ideals/NNS and/CC)
  carried/VBD
  on/IN


In [8]:
# Named Entity Recognition: recognize "entities" like people, places, things, locations, 
# monetary figures, and more.
# Two options with NLTK's named entity recognition: 
# (1) Recognize all named entities
# (2) Recognize named entities as their respective type, like people, places, locations, etc.

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

def get_ne():
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = state_union.raw("2006-GWBush.txt")
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    try:
        for idx, i in enumerate(tokenized[:2]):
            print('idx: {}'.format(idx))
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEntT = nltk.ne_chunk(tagged, binary=True)
            print('namedEntT, bin=True: {}'.format(namedEntT))
            namedEntF = nltk.ne_chunk(tagged, binary=False)
            print('namedEntF, bin=True: {}'.format(namedEntF))
            #namedEnt.draw()
    except Exception as e:
        print(str(e))
get_ne()

# NE Type and Examples - when binary=False
# ORGANIZATION - Georgia-Pacific Corp., WHO
# PERSON - Eddy Bonte, President Obama
# LOCATION - Murray River, Mount Everest
# DATE - June, 2008-06-29
# TIME - two fifty a m, 1:30 p.m.
# MONEY - 175 million Canadian Dollars, GBP 10.40
# PERCENT - twenty pct, 18.75 %
# FACILITY - Washington Monument, Stonehenge
# GPE - South East Asia, Midlothian

idx: 0
namedEntT, bin=True: (S
  PRESIDENT/NNP
  (NE GEORGE/NNP)
  W./NNP
  BUSH/NNP
  'S/POS
  (NE ADDRESS/NNP)
  BEFORE/IN
  A/NNP
  JOINT/NNP
  SESSION/NNP
  OF/IN
  (NE THE/NNP)
  (NE CONGRESS/NNP)
  ON/NNP
  THE/NNP
  STATE/NNP
  OF/IN
  (NE THE/NNP UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  THE/NNP
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)
namedEntF, bin=True: (S
  PRESIDENT/NNP
  (PERSON GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (ORGANIZATION ADDRESS/NNP)
  BEFORE/IN
  A/NNP
  (ORGANIZATION JOINT/NNP)
  SESSION/NNP
  OF/IN
  (ORGANIZATION THE/NNP)
  (ORGANIZATION CONGRESS/NNP)
  ON/NNP
  THE/NNP
  (ORGANIZATION STATE/NNP OF/IN)
  (ORGANIZATION THE/NNP)
  (ORGANIZATION UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  (ORGANIZATION THE/NNP)
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)
idx: 1
namedEntT, bin=True: (S
  (NE Mr./NNP Speaker/NNP)
  ,/,
  Vice/NNP
  President/NNP
  (NE Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (NE Congress/NNP)
  

In [9]:
# lemmatizing - similar to stemming.
# stemmed root word might not be in a dictionary (not always a real word), but lemmatized 
# words can always be found in a dictionary (real words).

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
# takes part of speech (pos) as argument
# default = noun
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))

cat
cactus
goose
rock
python
good
best
run
run


In [10]:
# nltk corpora

import nltk
print('nltk location: {}'.format(nltk.__file__)) # shows location of nltk modules __init__.py

# look into data.py for locations of nltk data

from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer
from nltk.corpus import gutenberg

def get_corpora():
    sample = gutenberg.raw("bible-kjv.txt")
    tok = sent_tokenize(sample)
    for x in range(5):
        print('\ntoken #: {}\n{}'.format(x,tok[x]))
get_corpora()

nltk location: /Users/faameem/anaconda/lib/python3.5/site-packages/nltk/__init__.py

token #: 0
[The King James Bible]

The Old Testament of the King James Bible

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.

token #: 1
1:2 And the earth was without form, and void; and darkness was upon
the face of the deep.

token #: 2
And the Spirit of God moved upon the face of the
waters.

token #: 3
1:3 And God said, Let there be light: and there was light.

token #: 4
1:4 And God saw the light, that it was good: and God divided the light
from the darkness.


In [11]:
# WordNet (https://wordnet.princeton.edu/) is a lexical database for the English language, 
# which was created by Princeton, and is part of the NLTK corpus.
# You can use WordNet alongside the NLTK module to find the meanings of words, synonyms, 
# antonyms, and more.

from nltk.corpus import wordnet

# synonym set
syns = wordnet.synsets("program")

print('synonym:\n{}'.format(syns[0].name()))
print('synonym word:\n{}'.format(syns[0].lemmas()[0].name()))
print('synonym definition:\n{}'.format(syns[0].definition()))
print('synonym examples:\n{}'.format(syns[0].examples()))

#The lemmas will be synonyms, and then you can use .antonyms to find the antonyms to the lemmas. 
synonyms = []
antonyms = []
for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            for la in l.antonyms():
                antonyms.append(la.name())
                #antonyms.append(l.antonyms()[0].name())
print('\ngood: synonyms:\n{}'.format(set(synonyms)))
print('good: antonyms:\n{}'.format(set(antonyms)))  


# using WordNet to compare the similarity of two words and their tenses, 
# by employing the Wu and Palmer method for semantic related-ness.
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print('\nship / boat similarity: {}'.format(w1.wup_similarity(w2)))

w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('car.n.01')
print('ship / car similarity: {}'.format(w1.wup_similarity(w2)))

w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')
print('ship / cat similarity: {}'.format(w1.wup_similarity(w2)))

synonym:
plan.n.01
synonym word:
plan
synonym definition:
a series of steps to be carried out or goals to be accomplished
synonym examples:
['they drew up a six-step plan', 'they discussed plans for a new bond issue']

good: synonyms:
{'expert', 'salutary', 'soundly', 'adept', 'in_effect', 'just', 'goodness', 'serious', 'unspoilt', 'right', 'unspoiled', 'trade_good', 'skillful', 'honest', 'secure', 'honorable', 'well', 'proficient', 'sound', 'dependable', 'dear', 'commodity', 'safe', 'ripe', 'undecomposed', 'practiced', 'near', 'respectable', 'skilful', 'in_force', 'upright', 'thoroughly', 'estimable', 'full', 'effective', 'beneficial', 'good'}
good: antonyms:
{'badness', 'bad', 'ill', 'evil', 'evilness'}

ship / boat similarity: 0.9090909090909091
ship / car similarity: 0.6956521739130435
ship / cat similarity: 0.32


In [23]:
import nltk
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
print(len(documents))
random.shuffle(documents)

#print(documents[1])

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
#print(all_words.most_common(15))
#print(all_words["stupid"])

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

#print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

featuresets = [(find_features(rev), category) for (rev, category) in documents]
#print(type(featuresets))
#print(featuresets[0])
print(len(featuresets))
training_set = featuresets[:1900]
testing_set = featuresets[1900:]
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

import pickle
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

2000
2000
Classifier accuracy percent: 70.0
Most Informative Features
                    3000 = True              neg : pos    =     10.4 : 1.0
              accessible = True              pos : neg    =      9.6 : 1.0
                  doubts = True              pos : neg    =      8.9 : 1.0
                 frances = True              pos : neg    =      8.9 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
                 tribute = True              pos : neg    =      6.9 : 1.0
                    jude = True              pos : neg    =      6.3 : 1.0
                bothered = True              neg : pos    =      5.8 : 1.0
                   plods = True              neg : pos    =      5.7 : 1.0
                   vapid = True              neg : pos    =      5.7 : 1.0
                    lame = True              neg : pos    =      5.7 : 1.0
                 crowded = True              pos : neg    =      5.6 : 1.0
                pfeiffer = Tru

In [27]:
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

In [31]:
# employing scikit-learn with nltk to get more algorithms

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

print("Original Naive Bayes Algo accuracy percent:", \
      (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", \
      (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", \
      (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", \
      (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", \
      (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", \
      (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", \
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", \
      (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 70.0
Most Informative Features
                    3000 = True              neg : pos    =     10.4 : 1.0
              accessible = True              pos : neg    =      9.6 : 1.0
                  doubts = True              pos : neg    =      8.9 : 1.0
                 frances = True              pos : neg    =      8.9 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
                 tribute = True              pos : neg    =      6.9 : 1.0
                    jude = True              pos : neg    =      6.3 : 1.0
                bothered = True              neg : pos    =      5.8 : 1.0
                   plods = True              neg : pos    =      5.7 : 1.0
                   vapid = True              neg : pos    =      5.7 : 1.0
                    lame = True              neg : pos    =      5.7 : 1.0
                 crowded = True              pos : neg    =      5.6 : 1.0
                pfeiffer 

In [42]:
# combining algorithms with nltk
# combining classifier algorithms is done by creating a voting system, where each algorithm 
# gets one vote, and the classification that has the most votes is the chosen one.

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            #print('classifier: {}'.format(c))
            v = c.classify(features)
            #print('vote: {}'.format(v))
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        #print('choice_votes {}'.format(choice_votes))
        conf = choice_votes / len(votes)
        return conf
    
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

#print(len(testing_set))
print("voted_classifier accuracy percent:", \
      (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]), \
      "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), \
      "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), \
      "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), \
      "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), \
      "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), \
      "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)
print("Classification:", voted_classifier.classify(testing_set[6][0]), \
      "Confidence %:",voted_classifier.confidence(testing_set[6][0])*100)
print("Classification:", voted_classifier.classify(testing_set[7][0]), \
      "Confidence %:",voted_classifier.confidence(testing_set[7][0])*100)
print("Classification:", voted_classifier.classify(testing_set[8][0]), \
      "Confidence %:",voted_classifier.confidence(testing_set[8][0])*100)
print("Classification:", voted_classifier.classify(testing_set[9][0]), \
      "Confidence %:",voted_classifier.confidence(testing_set[9][0])*100)
print("Classification:", voted_classifier.classify(testing_set[10][0]), \
      "Confidence %:",voted_classifier.confidence(testing_set[10][0])*100)

voted_classifier accuracy percent: 67.0
Classification: pos Confidence %: 57.14285714285714
Classification: neg Confidence %: 85.71428571428571
Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 71.42857142857143
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 57.14285714285714
