In [95]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load data

In [96]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

stopwords = set(w.rstrip() for w in open('/content/gdrive/MyDrive/Colab Notebooks/lazyprogrammer/data/electronics/stopwords.txt'))
# note: an alternative source of stopwords
# from nltk.corpus import stopwords
# stopwords.words('english')

positive_reviews = BeautifulSoup(open('/content/gdrive/MyDrive/Colab Notebooks/lazyprogrammer/data/electronics/positive.review').read(), features="html5lib")
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('/content/gdrive/MyDrive/Colab Notebooks/lazyprogrammer/data/electronics/negative.review').read(), features="html5lib")
negative_reviews = negative_reviews.findAll('review_text')

orig_reviews = []
for r in positive_reviews:
  orig_reviews.append(r.text)
for r in negative_reviews:
  orig_reviews.append(r.text)

len(orig_reviews)

Mounted at /content/gdrive/


2000

# Utils

In [97]:
def tokenize_review(txt, lemmatizer):
  tokens = nltk.tokenize.word_tokenize(txt.lower())
  tokens = [t for t in tokens if len(t) > 2]
  tokens = [lemmatizer.lemmatize(t) for t in tokens]
  tokens = [t for t in tokens if t not in stopwords]
  return tokens

def build_token2idx(rewiews_collections, lemmatizer):
  token_id = 0
  token2idx = {}
  for collection in rewiews_collections:
    for review in collection:
      for token in tokenize_review(review.text, lemmatizer):
        if token not in token2idx:
          token2idx[token] = token_id
          token_id += 1
  return token2idx

def preprocess_reviews(rewiews, org_txt: list, lemmatizer):
  processed_txt = []
  for review in rewiews:
    org_txt.append(review.text)
    processed_txt.append(tokenize_review(review.text, lemmatizer))
  return processed_txt, org_txt

def build_count_vectorizer(tokens, token2idx):
  x = np.zeros(V)
  for t in tokens:
    token_id = token2idx[t]
    x[token_id] += 1
  return x / x.sum()

# Process data

In [98]:
lemmatizer = WordNetLemmatizer()
token2idx = build_token2idx([positive_reviews, negative_reviews], lemmatizer)
V = len(token2idx)

print('vocab size:', V, '\n')
print('Example of oryginal reviews:')
[r.text for r in positive_reviews[:3]]

vocab size: 11092 

Example of oryginal reviews:


['\nI purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.\n\nI feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.\n\nAs always, Amazon had it to me in <2 business days\n',
 "\nI ordered 3 APC Back-UPS ES 500s on the recommendation of an employee of mine who used to work at APC. I've had them for about a month now without any problems. They've functioned properly through a few unexpected power interruptions. I'll gladly order more if the need arises.\n\nPros:\n - Large plug spacing, good for power adapters\n - Simple design\n - Long cord\n\nCons:\n - No line conditioning (usually an expensive option\n",

In [99]:
positive_tokenized = [tokenize_review(r.text, lemmatizer) for r in positive_reviews]
negative_tokenized = [tokenize_review(r.text, lemmatizer) for r in negative_reviews]

print('Tokenized reviews:')
[' '.join(t) for t in positive_tokenized[:3]]

Tokenized reviews:


['purchased this unit due frequent blackout power supply bad run cable modem router lcd monitor minute this time save shut equally electronics receiving clean power feel this investment minor compared loss valuable data failure equipment due power spike irregular power supply amazon business day',
 "apc back-ups 500 recommendation employee mine apc 've month 've functioned properly unexpected power interruption 'll gladly arises pro plug spacing power adapter simple design cord con line conditioning usually expensive option",

In [100]:
positive_vectorized = [build_count_vectorizer(r, token2idx) for r in positive_tokenized]
negative_vectorized = [build_count_vectorizer(r, token2idx) for r in negative_tokenized]

print('Vectorised reviews:')
print(positive_vectorized[0])
print(positive_vectorized[1])
print(positive_vectorized[2])

Vectorised reviews:
[0.02272727 0.06818182 0.02272727 ... 0.         0.         0.        ]
[0. 0. 0. ... 0. 0. 0.]
[0.         0.         0.08333333 ... 0.         0.         0.        ]


# Modeling part

In [101]:
X = np.array(positive_vectorized + negative_vectorized)
y = np.array([1] * len(positive_vectorized) + [0] * len(negative_vectorized))

X, Y, orig_reviews = shuffle(X, Y, orig_reviews)

Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [102]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.6484210526315789
Test accuracy: 0.47


In [107]:
# let's look at the weights for each word
# try it with different threshold values!
threshold = 0.5
for word, index in token2idx.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

this 0.8187479814453458
amazon 0.5248338995303248
you 0.7807399525169034
price 1.2421120658787344
recommend -0.6895846436414396
excellent -0.5040373005209356


In [108]:
# check misclassified examples
preds = model.predict(X)
P = model.predict_proba(X)[:,1] # p(y = 1 | x)

In [109]:
# since there are many, just print the "most" wrong samples
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None
for i in range(len(orig_reviews)):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

print(f'Most wrong positive review (prob = {minP_whenYis1}, pred = {wrong_positive_prediction}, true label = 1')
print(wrong_positive_review)
print(f'Most wrong negative review (prob = {maxP_whenYis0}, pred = {wrong_negative_prediction}, true label = 0')
print(wrong_negative_review)

Most wrong positive review (prob = 0.4327359830655592, pred = 0, true label = 1

I am happy to have several Sandisk products, and all of them are excellent.

Most wrong negative review (prob = 0.6260722682056727, pred = 1, true label = 0

I love these speakers and the price was great

