In [1]:
# Reviews about electronic products sentiment analysis 

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import download
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup

# Download stopwords
download('stopwords')
download('punkt')
download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/atissera/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/atissera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/atissera/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
word_index_map = {}
current_index = 0

# Custom tokenizer
def my_tokenizer(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

# Construct vectors
def tokens_to_vextor(tokens, label):
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum()
    x[-1] = label
    return x

In [3]:
# Lematizer is like a word normalizer, converts words to its base form. 
# Words like 'dogs' -> 'dog'
wordnet_lemmatizer = WordNetLemmatizer()

stopwords = set(stopwords.words('english'))

positive_reviews = BeautifulSoup(open('dataset/electronics/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('dataset/electronics/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

In [4]:
positive_tokenized = []
negative_tokenized = []

for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index +=1
        
for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index +=1



In [5]:
N = len(positive_tokenized) + len(negative_tokenized)

data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vextor(tokens, 1)
    data[i, :] = xy
    i += 1
    
for tokens in negative_tokenized:
    xy = tokens_to_vextor(tokens, 0)
    data[i, :] = xy
    i += 1

X = data[:, :-1]
y = data[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=0, 
                                                    shuffle=True)

In [6]:
clf = LogisticRegression(random_state=0, solver='lbfgs')
clf.fit(X_train, y_train)
print('Classification rate using Logistic Regression: ', clf.score(X_test, y_test))

Classification rate using Logistic Regression:  0.7525


In [7]:
# Check the sentiment value for words above the threshold
threshold = 0.5
for word, index in word_index_map.items():
    weight = clf.coef_[0][index]
    if abs(weight) > threshold:
        print(word, weight)

time -0.5144916927408415
used 0.660849933673563
've 0.5680812605907062
need 0.5337997860528654
good 1.6530157927786526
sound 0.769016913090303
like 0.5557346109636143
n't -1.5479801167539364
easy 1.1513953442451175
get -0.7130665021709531
use 1.4436948166965653
quality 1.1443826954207381
best 0.8567242424719295
item -0.9572426473949455
well 0.9595405830784492
wa -0.936713512291071
perfect 0.9018938027631505
fast 0.6145687617755199
price 1.7983290325683774
great 2.984832193586256
money -0.7277022524366462
memory 0.6254305223878249
would -0.7493284066592155
buy -0.8798767529822638
worked -0.6723029280919447
doe -0.6841910628536952
highly 0.7192355008114237
recommend 0.5970071914703003
first -0.5915455849188291
support -0.7195952355178907
little 0.6463614654577288
returned -0.5908530320915524
excellent 1.0242130710782529
love 0.8550367450904176
mouse 0.5742760634612201
thing -0.6913874624088223
even -0.5602511574326824
poor -0.5239571208700422
back -1.3161404494454627
speaker 0.5878486398