In [1]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

import nltk
import numpy as np
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [2]:
wordnet_lemmatizer = WordNetLemmatizer()

# Stop words
# From http://www.lextek.com/manuals/onix/stopwords1.html
# Read inside a context manager so the file handle is closed as soon as
# the set has been built (the bare open(...) leaked the handle).
with open('data/stopwords.txt') as stopwords_file:
    stopwords = set(w.rstrip() for w in stopwords_file)


def _load_review_texts(path):
    """Parse the review file at `path` and return its <review_text> elements.

    The file is read in full, parsed with BeautifulSoup's "html.parser"
    backend, and closed before returning. The return value is whatever
    `find_all('review_text')` yields for the parsed document.
    """
    with open(path) as review_file:
        soup = BeautifulSoup(review_file.read(), features="html.parser")
    # find_all is the current name; findAll is only a legacy alias.
    return soup.find_all('review_text')


# Load the reviews
# Data from http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
positive_reviews = _load_review_texts('data/positive.review')
negative_reviews = _load_review_texts('data/negative.review')

In [3]:
def my_tokenizer(s):
    """Turn a raw review string into a list of normalized tokens.

    Steps, applied to each token in order of appearance:
      1. lowercase the whole string and split it with NLTK's word tokenizer;
      2. drop tokens of length 2 or less (probably not useful);
      3. lemmatize the token with the module-level WordNet lemmatizer;
      4. drop the lemma if it is in the module-level `stopwords` set.

    Returns the surviving lemmas as a list, in their original order.
    """
    kept = []
    for raw_token in nltk.tokenize.word_tokenize(s.lower()):
        # The length filter is applied to the raw token, before lemmatizing.
        if len(raw_token) <= 2:
            continue
        lemma = wordnet_lemmatizer.lemmatize(raw_token)
        # The stopword filter is applied to the lemma, not the raw token.
        if lemma in stopwords:
            continue
        kept.append(lemma)
    return kept

# Build the vocabulary: every distinct token gets the next free integer
# index, which later addresses its column in the word-frequency vectors.
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

# Positive reviews are processed first, then negative ones, so that
# orig_reviews lines up with the row order used when the data matrix is built.
for reviews, tokenized_bucket in (
    (positive_reviews, positive_tokenized),
    (negative_reviews, negative_tokenized),
):
    for review in reviews:
        orig_reviews.append(review.text)
        tokens = my_tokenizer(review.text)
        tokenized_bucket.append(tokens)
        for token in tokens:
            if token in word_index_map:
                continue  # already has an index
            word_index_map[token] = current_index
            current_index += 1

print("len(word_index_map):", len(word_index_map))

# Input matrices
def tokens_to_vector(tokens, label, vocab=None):
    """Build a normalized word-frequency vector with the label appended.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one review; each must be a key of the vocabulary.
    label : number
        Class label, stored in the last element of the result.
    vocab : mapping of str -> int, optional
        Token-to-column-index map. Defaults to the module-level
        `word_index_map` when omitted.

    Returns
    -------
    numpy.ndarray of length ``len(vocab) + 1``
        Elements ``[:-1]`` are token counts divided by the total count
        (all zeros if `tokens` is empty); element ``[-1]`` is `label`.

    Raises
    ------
    KeyError
        If a token is missing from a dict vocabulary.
    """
    if vocab is None:
        vocab = word_index_map
    x = np.zeros(len(vocab) + 1)  # last element is for the label
    for t in tokens:
        x[vocab[t]] += 1
    total = x.sum()
    # A review whose tokens were all filtered out has total == 0; dividing
    # would fill the row with NaN, so leave it as all zeros instead.
    if total > 0:
        x = x / total  # normalize it before setting label
    x[-1] = label
    return x

N = len(positive_tokenized) + len(negative_tokenized)
# N x (D+1) matrix: features and label are kept together for now so that
# they can be shuffled as one unit later.
data = np.zeros((N, len(word_index_map) + 1))
i = 0
# Positive rows (label 1) are written first, followed by negative rows (label 0).
for label, tokenized_reviews in ((1, positive_tokenized), (0, negative_tokenized)):
    for tokens in tokenized_reviews:
        data[i, :] = tokens_to_vector(tokens, label)
        i += 1

# Shuffle the data and create train/test splits.
# A fixed seed keeps the split, and therefore the reported accuracies,
# reproducible across kernel restarts.
RANDOM_STATE = 42
TEST_SIZE = 100  # number of trailing rows held out as the test set
orig_reviews, data = shuffle(orig_reviews, data, random_state=RANDOM_STATE)

# Last column of `data` is the label; everything before it is the features.
X = data[:, :-1]
Y = data[:, -1]

# last TEST_SIZE rows will be test
Xtrain, Ytrain = X[:-TEST_SIZE], Y[:-TEST_SIZE]
Xtest, Ytest = X[-TEST_SIZE:], Y[-TEST_SIZE:]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

# Show every word whose learned coefficient is larger in magnitude than
# the threshold, together with that coefficient.
threshold = 0.5
coefficients = model.coef_[0]  # first (only used) row of the coefficient matrix
print("\n----- Words Weights -----")
for word, index in iteritems(word_index_map):
    weight = coefficients[index]
    if abs(weight) > threshold:
        print(word, weight)
print("-------------------------\n")

# Misclassified examples (predictions are made over all of X, train and test)
preds = model.predict(X)
P = model.predict_proba(X)[:,1] # p(y = 1 | x)

# Find the "most" wrong sample of each class: the positive review with the
# lowest p(y = 1 | x) and the negative review with the highest one.
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None
for i, (p, y) in enumerate(zip(P, Y)):
    if y == 1 and p < 0.5 and p < minP_whenYis1:
        # new worst-scored positive review
        minP_whenYis1 = p
        wrong_positive_review = orig_reviews[i]
        wrong_positive_prediction = preds[i]
    elif y == 0 and p > 0.5 and p > maxP_whenYis0:
        # new worst-scored negative review
        maxP_whenYis0 = p
        wrong_negative_review = orig_reviews[i]
        wrong_negative_prediction = preds[i]

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)

len(word_index_map): 10950
Train accuracy: 0.7657894736842106
Test accuracy: 0.76

----- Words Weights -----
unit -0.718425820841696
bad -0.7739053338263261
've 0.785663126062034
month -0.7862026387524567
sound 1.1025351585966663
lot 0.7409307889899085
you 0.9291162215397254
n't -1.8767251688524043
easy 1.663903937244084
quality 1.3603029675844118
company -0.6227284927072501
item -0.9534661604728312
wa -1.6215623316142094
perfect 1.0467941002663599
fast 0.9969736340373115
ha 0.7256967105956321
price 2.6937071485478175
money -1.1371149097225066
memory 0.9206525404019182
buy -0.7982394894666592
bit 0.5986849050112657
happy 0.5912266904353017
pretty 0.7023172249278895
doe -1.1922683753612142
pleased 0.5006318171486007
highly 0.9420639209158304
recommend 0.618744968526597
fit 0.5043716945260018
customer -0.6737686722098588
support -0.9109061501617451
little 1.0166396244456146
returned -0.743233578926042
excellent 1.3614115255292343
love 1.2036407588037232
feature 0.5365939491468267
home 0.