In [1]:
# import libraries
import pandas as pd
import numpy as np
import string

# Load the training and test review tables from their CSV files into DataFrames
DataTrain = pd.read_csv('data/reviews/reviews.csv', low_memory=False)
DataTest = pd.read_csv('data/reviews/all_reviews_test.csv', low_memory=False)


In [2]:
############################ PREPARATION #########################

In [3]:
# Load the stopword, positive, negative and negation lexicons.
# Every file is read with header=None, so its first row is treated as data,
# and the single supplied name becomes the column label.
s_col_names = ['stp']
p_col_names = ['pos']
n_col_names = ['negve']
nt_col_names = ['negtion']

s_words = pd.read_csv('data/dictionary/stopwords.csv', header=None, names=s_col_names, low_memory=False)
p_words = pd.read_csv('data/dictionary/positive_words.csv', header=None, names=p_col_names, low_memory=False)
n_words = pd.read_csv('data/dictionary/negative_words.csv', header=None, names=n_col_names, low_memory=False)
nt_words = pd.read_csv('data/dictionary/negation_words.csv', header=None, names=nt_col_names, low_memory=False)

# Flatten each lexicon column into a plain Python list of strings
stopwords = list(map(str, s_words.stp))
positive_words = list(map(str, p_words.pos))
negative_words = list(map(str, n_words.negve))
negation_words = list(map(str, nt_words.negtion))


In [4]:
# replace the apostrophe from the lists
def ReplaceApostrophe(thelist):
    """Return a new list with every ASCII apostrophe (') removed from each word.

    The input list is not modified; order and length are preserved.
    """
    return [entry.replace("'", '') for entry in thelist]

# Strip apostrophes from every lexicon and drop duplicate entries.
# Going through a set means the resulting list order is arbitrary.
stopwords, negative_words, negation_words, positive_words = (
    list(set(ReplaceApostrophe(lexicon)))
    for lexicon in (stopwords, negative_words, negation_words, positive_words)
)

In [5]:
############################ THE SENTIMENT CLASS #########################

In [6]:
# The Sentiment Analysis Class
import re
# import enchant
# weng = enchant.Dict("en_US")

def reduce_lengthening(text):
    """Collapse any run of three or more identical characters down to two.

    For example "sooooo" becomes "soo". Runs of one or two characters are
    left untouched.
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

from nltk.tokenize import word_tokenize 
def SentimentAnalysis(testimony):
    """Score each review with a lexicon-based, negation-aware word count.

    Parameters
    ----------
    testimony : iterable of str
        Review texts. The notebook passes them already lower-cased and with
        punctuation stripped.

    Returns
    -------
    list of float
        One raw score per review, in input order. Positive totals mean more
        positive evidence, negative totals more negative evidence.

    Notes
    -----
    Relies on the module-level ``stopwords``, ``positive_words``,
    ``negative_words`` and ``negation_words`` lists and on nltk's
    ``word_tokenize``.

    Scoring rules, applied word by word after stopword removal:

    * positive word: -1 if a negation is pending (the negation is consumed),
      nothing if the ``pos`` flag is already set, otherwise +1. Sets ``pos``.
    * negative word: +1 if the ``neg`` flag is already set, +1 if a negation
      is pending (consumed), nothing if ``pos`` is set (``pos`` is cleared),
      otherwise -1. Sets ``neg``.
    * negation word: marks a negation as pending.
    * any other word while a negation is pending: the first such word is
      tolerated, the second cancels the pending negation.
    * any other word otherwise: clears ``pos`` and ``neg``.
    """
    # Build hash sets once per call: each token is tested against up to four
    # lexicons, and membership tests on the original lists are linear scans.
    stop_set = set(stopwords)
    positive_set = set(positive_words)
    negative_set = set(negative_words)
    negation_set = set(negation_words)

    # value given by each word type
    score_pos = 1
    score_neg = -1

    testimony_value = []
    for review in testimony:
        # tokenize the review
        word_tokens = word_tokenize(review)

        # remove stopwords
        # NOTE(review): if the stopword list contains negation words they are
        # dropped here, before the negation handling below can see them.
        # Confirm against the lexicon files.
        filtered_sentence = [w for w in word_tokens if w not in stop_set]

        negation = pos = neg = passing = False
        sum_score = 0

        for word in filtered_sentence:
            # collapse elongated spellings ("goooood" -> "good") before lookup
            word = reduce_lengthening(word)

            if word in positive_set:
                if negation:
                    sum_score += score_neg
                    negation = False
                elif not pos:
                    sum_score += score_pos
                # when pos is already set the word adds nothing
                pos = True

            elif word in negative_set:
                if neg:
                    sum_score += score_pos
                elif negation:
                    sum_score += score_pos
                    negation = False
                elif pos:
                    pos = False
                else:
                    sum_score += score_neg
                neg = True

            elif word in negation_set:
                negation = True

            elif negation:
                # a pending negation survives one non-lexicon word only
                if not passing:
                    passing = True
                else:
                    negation = passing = False
            else:
                pos = neg = False

        testimony_value.append(float(sum_score))
    return testimony_value

In [7]:
def SentimentPolarity(listscore):
    """Translate numeric sentiment scores into polarity labels.

    Returns a list, in input order, holding 'pos' for scores above zero,
    'neg' for scores below zero and 'neu' for everything else.
    """
    return [
        'pos' if value > 0 else ('neg' if value < 0 else 'neu')
        for value in listscore
    ]

In [22]:
def ScoreNormalization(listscore):
    """Map raw sentiment scores onto the 1.0-5.0 rating scale.

    A score of 0 maps to 3.0 and each point moves the result by 0.5, clamped
    to the range [1.0, 5.0]. So -3 -> 1.5, 0 -> 3.0, 3 -> 4.5, and any score
    of 4 or more gives 5.0.

    The previous if/elif chain tested ``score < -4`` first, so a score of
    exactly -4 matched no branch and fell through to the final ``else``,
    returning 5.0 for a strongly negative review. Non-integral scores fell
    through in the same way. The clamped linear map gives -4 -> 1.0 and
    handles fractional scores, while returning the same values as before for
    every other integral score.

    Parameters
    ----------
    listscore : iterable of int or float
        Raw sentiment scores.

    Returns
    -------
    list of float
        Normalised scores, in input order.
    """
    return [max(1.0, min(5.0, 3.0 + 0.5 * score)) for score in listscore]

In [8]:
############################ TRAINING #########################

In [9]:
# Normalise the training review text in one pass:
# cast to str, delete every character in string.punctuation, then lowercase.
# NOTE(review): string.punctuation is ASCII-only, so non-ASCII quote
# characters survive this step.
TrainTestimony = [
    str(raw).translate(str.maketrans('', '', string.punctuation)).lower()
    for raw in DataTrain.testimony
]

In [10]:
# Preview only the first few cleaned reviews; displaying the whole list
# floods the cell output and bloats the saved notebook.
TrainTestimony[:5]

['i thought it would be as big as small paper but turn out to be just like my palm i think it is too small to read on it not very comfortable as regular kindle would definitely recommend a paperwhite instead',
 'this kindle is light and easy to use especially at the beach',
 'didnt know how much id use a kindle so went for the lower end im happy with it even if its a little dark',
 'i am 100 happy with my purchase i caught it on sale at a really good price i am normally a real book person but i have a 1 year old who loves ripping up pages the kindle prevents that its extremely portable it fits better in my purse than a giant book and i have it loaded with lots of books i finish one and start another without having to go store it serves all my needs i picked this one over the paperwhite because the price was unbeatable and the only difference that i could see was this one wasnt backlit a simple book light from the dollar tree solves that issue this is my second kindle the first being th

In [11]:
# Score every training review; keep the scores as a list and as a new column
DataTrain['sentiment_score'] = train_sentiment_score = SentimentAnalysis(TrainTestimony)
DataTrain.head()

Unnamed: 0,reviewId,rating,testimony,itemId,userId,sentiment_score
0,0,3,I thought it would be as big as small paper bu...,19,2976,1.0
1,1,5,This kindle is light and easy to use especiall...,19,3339,1.0
2,2,4,Didnt know how much i'd use a kindle so went f...,19,1472,0.0
3,3,5,I am 100 happy with my purchase. I caught it o...,19,603,8.0
4,4,5,Solid entry level Kindle. Great for kids. Gift...,19,332,5.0


In [23]:
# Rescale the raw training scores with ScoreNormalization and store them as a column
DataTrain['sentiment_score_norm'] = train_sentiment_score_norm = ScoreNormalization(train_sentiment_score)
DataTrain.head()

Unnamed: 0,reviewId,rating,testimony,itemId,userId,sentiment_score,sentiment_score_norm
0,0,3,I thought it would be as big as small paper bu...,19,2976,1.0,3.5
1,1,5,This kindle is light and easy to use especiall...,19,3339,1.0,3.5
2,2,4,Didnt know how much i'd use a kindle so went f...,19,1472,0.0,3.0
3,3,5,I am 100 happy with my purchase. I caught it o...,19,603,8.0,5.0
4,4,5,Solid entry level Kindle. Great for kids. Gift...,19,332,5.0,5.0


In [25]:
# Export the selected training-set columns to CSV (the row index is written too)
header = ["reviewId", "itemId", "userId", "rating", "sentiment_score_norm"]
DataTrain.loc[:, header].to_csv('data/upload/DataTrain.csv')

In [15]:
############################ TESTING #########################

In [12]:
# Normalise the test review text in one pass:
# cast to str, delete every character in string.punctuation, then lowercase.
# NOTE(review): string.punctuation is ASCII-only, so non-ASCII quote
# characters survive this step.
TestTestimony = [
    str(raw).translate(str.maketrans('', '', string.punctuation)).lower()
    for raw in DataTest.testimony
]

In [13]:
# Preview only the first few cleaned reviews; displaying the whole list
# floods the cell output and bloats the saved notebook.
TestTestimony[:5]

['a lazy mans drean when it is combined with alexa if you get the harmony hub you can really impress with home automation',
 'i really enjoy my fire stick its really easy to use',
 'really cool device instantly noticed the difference in quality when i switched from the regular fire stick to the fire tv with 4k',
 'love it works great one in each of the main rooms',
 'this is a great addition to the other alexa products by amazon it works great in the kitchen better voice recognition and sound the onscreen display and information are very helpful the video capability is watching instructional videos or even playing music videos while workingcooking it can also scroll personal photos as a picture displayall in all echo show is a wonderful addition well worth it especially if you get it on sale',
 'it‚äôs good i don‚äôt know what i am doing can u say you tube',
 'my wife and i enjoy using alexa for weather information and also for the great music that we are able to access we also have th

In [14]:
# Score every test review; keep the scores as a list and as a new column
DataTest['sentiment_score'] = sentiment_score = SentimentAnalysis(TestTestimony)
DataTest.head()

Unnamed: 0,reviewId,rating,hum_sentiment,testimony,itemId,userId,sentiment_score
0,941,5,pos,A lazy mans drean when it is combined with Ale...,2,3266,1.0
1,942,5,pos,I really enjoy my Fire stick. It's really easy...,2,3665,2.0
2,943,5,pos,Really cool device! Instantly noticed the diff...,2,2944,2.0
3,944,5,pos,Love it! Works great. One in each of the main ...,2,2232,1.0
4,765,5,pos,This is a great addition to the other Alexa pr...,23,3157,6.0


In [24]:
# Rescale the raw test scores with ScoreNormalization and store them as a column
DataTest['sentiment_score_norm'] = test_sentiment_score_norm = ScoreNormalization(sentiment_score)
DataTest.head()

Unnamed: 0,reviewId,rating,hum_sentiment,testimony,itemId,userId,sentiment_score,sentiment_polarity,check_mechine,sentiment_score_norm
0,941,5,pos,A lazy mans drean when it is combined with Ale...,2,3266,1.0,pos,True,3.5
1,942,5,pos,I really enjoy my Fire stick. It's really easy...,2,3665,2.0,pos,True,4.0
2,943,5,pos,Really cool device! Instantly noticed the diff...,2,2944,2.0,pos,True,4.0
3,944,5,pos,Love it! Works great. One in each of the main ...,2,2232,1.0,pos,True,3.5
4,765,5,pos,This is a great addition to the other Alexa pr...,23,3157,6.0,pos,True,5.0


In [16]:
# Label each test review 'pos' / 'neg' / 'neu' from the sign of its raw score
DataTest['sentiment_polarity'] = sentiment_pol = SentimentPolarity(sentiment_score)
DataTest.head()

Unnamed: 0,reviewId,rating,hum_sentiment,testimony,itemId,userId,sentiment_score,sentiment_polarity
0,941,5,pos,A lazy mans drean when it is combined with Ale...,2,3266,1.0,pos
1,942,5,pos,I really enjoy my Fire stick. It's really easy...,2,3665,2.0,pos
2,943,5,pos,Really cool device! Instantly noticed the diff...,2,2944,2.0,pos
3,944,5,pos,Love it! Works great. One in each of the main ...,2,2232,1.0,pos
4,765,5,pos,This is a great addition to the other Alexa pr...,23,3157,6.0,pos


In [17]:
# True where the machine polarity agrees with the human label, False otherwise;
# the cell output is the number of disagreements.
DataTest['check_mechine'] = DataTest['sentiment_polarity'] == DataTest['hum_sentiment']
(~DataTest['check_mechine']).sum()

34

In [18]:
# Show the rows where machine polarity and human label disagree
DataTest.loc[~DataTest['check_mechine']]

Unnamed: 0,reviewId,rating,hum_sentiment,testimony,itemId,userId,sentiment_score,sentiment_polarity,check_mechine
15,3026,3,neu,"This product, while the speaker is great, fail...",11,1589,1.0,pos,False
139,830,4,pos,"a bit difficult to set up, but it is amazing. ...",23,1374,0.0,neu,False
194,1496,4,pos,The Echo Plus is great. Alexa is the best feat...,20,1943,0.0,neu,False
207,1079,5,neu,Got this as a present for my fiance for Christ...,20,589,1.0,pos,False
211,3117,5,pos,Most speaker like this will be over USD150.00 ...,11,2531,0.0,neu,False
218,508,5,pos,Love this thing. Little apprehensive at first....,23,1788,0.0,neu,False
286,170,5,pos,"I find myself saying to my phone, Alexa, play....",23,3121,0.0,neu,False
349,47,5,pos,Amazon kindle products have always been reliab...,19,3726,0.0,neu,False
354,4434,5,pos,This is a grate product and the price very far...,1,3452,0.0,neu,False
369,4609,5,neg,"Nice camera. Learning how to use it, but it is...",1,2107,2.0,pos,False


In [26]:
# Export the selected test-set columns to CSV (the row index is written too)
header = ["reviewId", "itemId", "userId", "rating", "sentiment_score_norm"]
DataTest.loc[:, header].to_csv('data/upload/DataTest.csv')

In [19]:
############# evaluation ###########

In [20]:
# Compare the machine polarity with the human label, treating 'pos' as the
# positive class:
#   tp  - predicted 'pos', human label 'pos'
#   fp  - predicted 'pos', human label anything else
#   tn  - predicted 'neg', human label 'neg'
#   fn  - predicted 'neg', human label anything else
#   net - predicted 'neu' (counted in the total only)
#
# Columns are read by name, not by position. The earlier row[7] / row[2]
# lookups depended on the order in which columns had been added to DataTest,
# so running the cells in a different order pointed them at the wrong columns.
tp = tn = fp = fn = net = 0

for predicted, actual in zip(DataTest['sentiment_polarity'], DataTest['hum_sentiment']):
    if predicted == 'pos':
        if predicted == actual:
            tp += 1
        else:
            fp += 1
    elif predicted == 'neg':
        if predicted == actual:
            tn += 1
        else:
            fn += 1
    elif predicted == 'neu':
        net += 1
total = tp + fp + tn + fn + net
print("net = " + str(net))
print("tp = " + str(tp))
print("fp = " + str(fp))
print("tn = " + str(tn))
print("fn = " + str(fn))
print("total = "  + str(total))

# Report NaN instead of raising ZeroDivisionError when a denominator is empty
# (e.g. an empty test set or no 'pos' predictions).
accuracy = (tp + tn) / total if total else float('nan')
precision = tp / (tp + fp) if (tp + fp) else float('nan')
recall = tp / (tp + fn) if (tp + fn) else float('nan')
print("accuracy = "  + str(accuracy))
print("precision = "  + str(precision))
print("recall = "  + str(recall))

net = 104
tp = 928
fp = 14
tn = 65
fn = 5
total = 1116
accuracy = 0.8897849462365591
precision = 0.9851380042462845
recall = 0.9946409431939979
