In [1]:
import nltk
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC
import random
from nltk.corpus import stopwords
from string import punctuation

In [2]:
pos = open('short_reviews/positive.txt', 'r', encoding='utf8', errors='ignore').read()
neg = open('short_reviews/negative.txt', 'r', encoding='utf8', errors='ignore').read()

all_words = []
documents = []

pos = pos.split('\n')
neg = neg.split('\n')
for p, n in zip(pos, neg):
    documents.append((p, 'pos'))
    documents.append((n, 'neg'))
    
## preprocess words
table = str.maketrans('', '', punctuation)
## tokenize pos and neg documents and remove all stopwords and punctuation
pos = [[w.translate(table) for w in sent.split() if w not in set(stopwords.words('english'))] for sent in pos]
neg = [[w.translate(table) for w in sent.split() if w not in set(stopwords.words('english'))] for sent in neg]
display(len(pos), len(neg))

5332

5332

In [3]:

display('Pos: ', pos[:3], 'Neg: ', neg[:3])

'Pos: '

[['rock',
  'destined',
  '21st',
  'centurys',
  'new',
  '',
  'conan',
  '',
  'hes',
  'going',
  'make',
  'splash',
  'even',
  'greater',
  'arnold',
  'schwarzenegger',
  '',
  'jeanclaud',
  'van',
  'damme',
  'steven',
  'segal',
  ''],
 ['gorgeously',
  'elaborate',
  'continuation',
  '',
  'lord',
  'rings',
  '',
  'trilogy',
  'huge',
  'column',
  'words',
  'cannot',
  'adequately',
  'describe',
  'cowriterdirector',
  'peter',
  'jacksons',
  'expanded',
  'vision',
  'j',
  '',
  'r',
  '',
  'r',
  '',
  'tolkiens',
  'middleearth',
  ''],
 ['effective', 'tootepid', 'biopic']]

'Neg: '

[['simplistic', '', 'silly', 'tedious', ''],
 ['laddish',
  'juvenile',
  '',
  'teenage',
  'boys',
  'could',
  'possibly',
  'find',
  'funny',
  ''],
 ['exploitative',
  'largely',
  'devoid',
  'depth',
  'sophistication',
  'would',
  'make',
  'watching',
  'graphic',
  'treatment',
  'crimes',
  'bearable',
  '']]

In [4]:
corpus = pos + neg
corpus = [[w for w in sent if len(w) > 1] for sent in corpus]

## collect only J (adjective) word type
word_type = ['J']
corpus = [[w for w in nltk.pos_tag(sent) if w[1][0] in word_type] for sent in corpus]
display(corpus[:10])

[[('centurys', 'JJ'),
  ('new', 'JJ'),
  ('splash', 'JJ'),
  ('greater', 'JJR'),
  ('arnold', 'JJ'),
  ('steven', 'JJ')],
 [('elaborate', 'JJ'), ('huge', 'JJ'), ('describe', 'JJ')],
 [('effective', 'JJ')],
 [('good', 'JJ')],
 [('rare', 'JJ'), ('honest', 'JJS'), ('observed', 'JJ')],
 [('great', 'JJ'),
  ('insight', 'JJ'),
  ('neurotic', 'JJ'),
  ('absolute', 'JJ'),
  ('top', 'JJ')],
 [('rare', 'JJ')],
 [('showed', 'JJ'), ('good', 'JJ')],
 [('snappy', 'JJ')],
 [('different', 'JJ'), ('asian', 'JJ')]]

In [8]:
all_words = []
# take out words only
for s in corpus:
    for w in s:
        all_words.append(w[0])

## saves the corpus into disk
file = open('pickled_data/corpus.pickle', 'wb')
pickle.dump(all_words, file)
file.close()

## words frequency and word_features is the keywords
all_words = nltk.FreqDist(all_words)
word_features = list(k for k, v in all_words.items() if v > 2)
display(word_features[:10])

['new',
 'greater',
 'arnold',
 'steven',
 'elaborate',
 'huge',
 'describe',
 'effective',
 'good',
 'rare']

In [10]:
del corpus
display(all_words)

FreqDist({'good': 369, 'much': 288, 'little': 281, 'bad': 235, 'funny': 229, 'new': 206, 'many': 183, 'best': 182, 'great': 161, 'big': 156, ...})

In [13]:
def find_features(doc):
    words = word_tokenize(doc)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [15]:
features_sets = [(find_features(doc), category) for doc, category in documents]

In [16]:
display(features_sets[0])

({'new': True,
  'greater': True,
  'arnold': True,
  'steven': True,
  'elaborate': False,
  'huge': False,
  'describe': False,
  'effective': False,
  'good': False,
  'rare': False,
  'honest': False,
  'observed': False,
  'great': False,
  'insight': False,
  'neurotic': False,
  'absolute': False,
  'top': False,
  'snappy': False,
  'different': False,
  'asian': False,
  'lowkey': False,
  'greatest': False,
  'famous': False,
  'talky': False,
  'willing': False,
  'fresh': False,
  'true': False,
  'thoughtful': False,
  'provocative': False,
  'independent': False,
  'incisive': False,
  'evocative': False,
  'hypnotic': False,
  'connect': False,
  'nice': False,
  'melodramatic': False,
  'idealistic': False,
  'romantic': False,
  'treasure': False,
  'brisk': False,
  'familiar': False,
  'real': False,
  'masterful': False,
  'unique': False,
  'light': False,
  'forgettable': False,
  'funeral': False,
  'smart': False,
  'static': False,
  'spooky': False,
  'best': 

In [17]:
from sklearn.model_selection import train_test_split


random.shuffle(features_sets)
display(len(features_sets))

10664

In [24]:
X, y = train_test_split(features_sets, test_size=0.06)
display(len(X), len(y))

10024

640

In [25]:
## Naive Bayesian classifier
classifier = nltk.NaiveBayesClassifier.train(X)
acc = nltk.classify.accuracy(classifier, y)
print(f'Naive Bayes acc= {acc  * 100}')
classifier.show_most_informative_features(15)

Naive Bayes acc= 73.125
Most Informative Features
                  boring = True              neg : pos    =     19.4 : 1.0
                powerful = True              pos : neg    =     16.6 : 1.0
                 generic = True              neg : pos    =     16.3 : 1.0
                 routine = True              neg : pos    =     15.7 : 1.0
               inventive = True              pos : neg    =     15.0 : 1.0
                mediocre = True              neg : pos    =     14.3 : 1.0
                  unique = True              pos : neg    =     14.3 : 1.0
                    flat = True              neg : pos    =     13.0 : 1.0
               wonderful = True              pos : neg    =     13.0 : 1.0
                mindless = True              neg : pos    =     11.7 : 1.0
                   stale = True              neg : pos    =     11.7 : 1.0
                delicate = True              pos : neg    =     11.7 : 1.0
               realistic = True              pos :

In [26]:
## Support Vector Machine, SVC (C-Support)
svc_classifier = SklearnClassifier(SVC())
svc_classifier.train(X)
acc = nltk.classify.accuracy(svc_classifier, y)
print(f'SVC acc= {acc * 100}')

SVC acc= 68.59375


In [27]:
mnb_classifier = SklearnClassifier(MultinomialNB())
mnb_classifier.train(X)
acc = nltk.classify.accuracy(mnb_classifier, y)
print(f'MultinomialNB acc= {acc * 100}')

MultinomialNB acc= 71.5625


In [28]:
bnb_classifier = SklearnClassifier(BernoulliNB())
bnb_classifier.train(X)
acc = nltk.classify.accuracy(bnb_classifier, y)
print(f'BernoulliNB acc= {acc * 100}')

BernoulliNB acc= 72.34375


In [30]:
test_sent = "This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"
test_features = find_features(test_sent.lower())
test_cat = classifier.classify(test_features)
display('Naive Bayes classification', test_cat)

'Naive Bayes classification'

'pos'

In [31]:
print(f'Multinomial NB classification: {mnb_classifier.classify(test_features)}')

Multinomial NB classification: pos


In [32]:
print(f'SVC : {svc_classifier.classify(test_features)}')

SVC : pos


In [33]:
test_sent = "This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"
test_features = find_features(test_sent.lower())
print(f'Naive Bayes: {classifier.classify(test_features)}')
print(f'SVC : {svc_classifier.classify(test_features)}')
print(f'Multinomial NB: {mnb_classifier.classify(test_features)}')
print(f'Bernoulli NB: {bnb_classifier.classify(test_features)}')

Naive Bayes: neg
SVC : neg
Multinomial NB: neg
Bernoulli NB: neg


In [34]:
test_sent = """Although Reminiscence doesn’t try to hide any inherent metaphors — what are most movies
these days, really, but nostalgia machines, designed for those stuck in the past? 
it doesn’t do much with the material besides fashion something like a a dull-edged Blade Runner. 
All that’s left is irony: For a would-be brainteasing thriller so obsessed with memories, 
Reminiscence is almost painfully, instantly forgettable."""
test_features = find_features(test_sent.lower())
print(classifier.classify(test_features))

neg
