In [1]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier


with open('data/datatrain.json', 'r') as fp:
    cl = NaiveBayesClassifier(fp, format="json")

In [2]:
# import libraries
import pandas as pd
import numpy as np

# get the data from csv file to dataframe
data_test_filter = pd.read_csv('data/data_prepared_before_sentiment.csv', low_memory=False)

In [3]:
data_test_filter.columns

Index(['Unnamed: 0', 'itemId', 'userId', 'rating', 'comment'], dtype='object')

In [4]:
# check unique username and goods name
user_count = data_test_filter['userId'].unique().shape[0]
item_count = data_test_filter['itemId'].unique().shape[0]

print("user count: " + str(user_count) + ", item count: " + str(item_count))

user count: 234, item count: 23


In [5]:
data_test_filter.comment.apply(str)

0      I thought it would be as big as small paper bu...
1      This kindle is light and easy to use especiall...
2      Didnt know how much i'd use a kindle so went f...
3      I am 100 happy with my purchase. I caught it o...
4      Solid entry level Kindle. Great for kids. Gift...
5      This make an excellent ebook reader. Don't exp...
6      I ordered this for my daughter, as I have the ...
7      I bought my Kindle about 2 months ago and the ...
8      amazon kindle is always the best ebook, upgrad...
9      It's beyond my expectation, and it can even sh...
10     If you really want to have a good read on some...
11     Great Gift for anyone. Very easy to setup. Coe...
12     Super excited to give this as a gift. It's sup...
13     We bought this for mother in law, buying anoth...
14     Well designed, good sound, has everything Alex...
15     I have one Alexa and three Echo dots and havin...
16     Omg I love my amazon show it is sooo cool and ...
17     Great looking design. Am

In [9]:
data_test_filter.shape

(250, 5)

In [10]:
# remove unwanted characters, numbers and symbols

data_test_filter['comment'] = data_test_filter['comment'].str.replace("[^a-zA-Z#]", " ")

# remove stopwords and short words (<2 letters)
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# function to remove stopwords
def remove_stopwords(rev):
    rev_new = " ".join([i for i in rev if i not in stop_words])
    return rev_new

# remove short words(length<3)
data_test_filter['comment'] = data_test_filter['comment'].apply(lambda x: ' '.join([w for w in x.split() if len (w)>2]))

# remove stopwords from the text
reviews = [remove_stopwords(r.split()) for r in data_test_filter['comment']]

# make entire text lowercase
reviews = [r.lower() for r in reviews]

data_test_filter['comment']=reviews

In [12]:
data_test_filter.head()

Unnamed: 0.1,Unnamed: 0,itemId,userId,rating,comment
0,0,1,1,3,thought would big small paper turn like palm t...
1,1,1,2,5,this kindle light easy use especially beach
2,2,1,3,4,didnt know much use kindle went lower end happ...
3,3,1,4,5,happy purchase caught sale really good price n...
4,4,1,5,5,solid entry level kindle great kids gifted kid...


In [13]:
sentiment = []
for x in data_test_filter['comment']:
    blob = TextBlob(x, classifier=cl)
    r = blob.classify()
    if blob.sentiment[0] == 0:
        r = 'neutral'
    sentiment.append(r)

In [14]:
sentiment_label = []
for x in sentiment:
    if x == 'pos':
        v = 3
    elif x == 'neg':
        v = 1
    else:
        v = 2
    sentiment_label.append(v)
data_test_filter['label']=sentiment_label    

In [15]:
data_test_filter.head()

Unnamed: 0.1,Unnamed: 0,itemId,userId,rating,comment,label
0,0,1,1,3,thought would big small paper turn like palm t...,3
1,1,1,2,5,this kindle light easy use especially beach,3
2,2,1,3,4,didnt know much use kindle went lower end happ...,3
3,3,1,4,5,happy purchase caught sale really good price n...,3
4,4,1,5,5,solid entry level kindle great kids gifted kid...,3
