In [4]:
# import libraries
import pandas as pd
import numpy as np
import string

# get the data from csv file to dataframe
DataTrain = pd.read_csv(
    'data/data_training.csv', 
    low_memory=False)
DataTest = pd.read_csv(
    'data/data_testing_phuman_update.csv', 
    low_memory=False)

In [43]:
############################ PREPARATION #########################

In [6]:
# read stopwords, positive, negative, negation wordlist
s_col_names = ['stp']
s_words = pd.read_csv(
    'data/stopwords.csv',
    header=None,
    names=s_col_names,
    low_memory=False)

p_col_names = ['pos']
p_words = pd.read_csv(
    'data/positive_words.csv',
    header=None,
    names=p_col_names,
    low_memory=False)

n_col_names = ['negve']
n_words = pd.read_csv(
    'data/negative_words.csv',
    header=None,
    names=n_col_names,
    low_memory=False)

nt_col_names = ['negtion']
nt_words = pd.read_csv(
    'data/negation_words.csv',
    header=None,
    names=nt_col_names,
    low_memory=False)

# make list from dataframes
stopwords = [str(y) for y in s_words.stp]
positive_words = [str(x) for x in p_words.pos]
negative_words = [str(y) for y in n_words.negve]
negation_words = [str(y) for y in nt_words.negtion]


In [7]:
# replace the apostrophe from the lists
def ReplaceApostrophe(thelist):
    newlist = []
    for word in thelist:
        word = word.replace("'", '')
        newlist.append(word)
    return newlist

stopwords = list(set(ReplaceApostrophe(stopwords)))
negative_words = list(set(ReplaceApostrophe(negative_words)))
negation_words = list(set(ReplaceApostrophe(negation_words)))
positive_words = list(set(ReplaceApostrophe(positive_words)))

In [None]:
############################ THE SENTIMENT CLASS #########################

In [23]:
# The Sentiment Analysis Class
from nltk.tokenize import word_tokenize 
def SentimentAnalysis(testimony):
    testimony_value = []
    for review in testimony:    
        # tokenize the review
        word_tokens = word_tokenize(review)

        # remove stopwords
        filtered_sentence = [w for w in word_tokens if not w in stopwords]

        # value gived by words type
        score_pos = 1
        score_neg = -1

        negation  = pos = neg = passing = False
        sum_score = 0

        for word in filtered_sentence:
            if word in positive_words:
                if negation is True:
                    sum_score += score_neg
                    negation = False
                elif pos is True:
                    pos = False 
                else:
                    sum_score += score_pos
                pos = True

            elif word in negative_words:
                if neg is True:
                    sum_score += score_pos
                elif negation is True:
                    sum_score += score_pos
                    negation = False
                elif pos is True:
                    pos = False
                else:
                    sum_score += score_neg
                neg = True

            elif word in negation_words:
                negation = True

            elif negation is True:
                if passing is False:
                    passing = True
                else:
                    negation = passing = False
            else:
                pos = neg = False
        if sum_score > 0:
            # positive
            testimony_value.append(float(3))
        elif sum_score < 0:
            # negative
            testimony_value.append(float(1))
        else:
            # neutral
            testimony_value.append(float(2))
    return testimony_value

In [9]:
############################ TRAINING #########################

In [19]:
# save testimony in a list
TrainTestimony = [str(x) for x in DataTrain.testimony]
# remove all punctuation
TrainTestimony = [testimony.translate(str.maketrans('','',string.punctuation)) for testimony in TrainTestimony ]
# make entire text lowercase
TrainTestimony = [t.lower() for t in TrainTestimony]

In [20]:
TrainTestimony

['i have really enjoyed it so far while not perfect by any means it is certainly worth the price',
 'i purchased this tablet for my 3 year old and he loves it very easy to use and setup',
 'i love my tablet it is super easy and loads fast',
 'battery dies after an hourshould last longer for the money will not bold charge',
 'it is great if you are a prime member  case is durable',
 'since i purchased mine the wife thought she would also like one so we picked up her one and she is using it a lot',
 'my son is constantly distracted and loves technology him being only two years old this proved to be the perfect thing to keep him occupied and out of trouble',
 'this was a gift for my kids they love them easy to use and lots of great apps',
 'i was initially going to just get the amazon kindle but i thought this gives me a tablet and a easy way to read books i buy ecollege texts it has helped me so much im so happy i invested my money in this',
 'good product great price big improvement ove

In [21]:
len(TrainTestimony)

1920

In [24]:
testimony_value = SentimentAnalysis(TrainTestimony)
len(testimony_value)

1920

In [25]:
testimony_value = SentimentAnalysis(TrainTestimony)
DataTrain['testimony_value'] = testimony_value
DataTrain.head()

Unnamed: 0.1,Unnamed: 0,rating,testimony,itemId,userId,testimony_value
0,3979,1.0,I have really enjoyed it so far. While not per...,20,3585,3.0
1,2963,1.0,I purchased this tablet for my 3 year old and ...,7,1256,3.0
2,3899,2.0,I love my tablet. It is super easy and loads f...,20,1506,3.0
3,2326,1.5,Battery dies after an hour...should last longe...,7,188,1.0
4,2385,2.0,"It is great if you are a prime member , case i...",2,3023,3.0


In [35]:
# rename the columns
DataTrain.rename(
    columns={
        'Unnamed: 0':'reviewId'}, 
    inplace=True)

DataTrain.head()

Unnamed: 0,reviewId,rating,testimony,itemId,userId,testimony_value
0,3979,1.0,I have really enjoyed it so far. While not per...,20,3585,3.0
1,2963,1.0,I purchased this tablet for my 3 year old and ...,7,1256,3.0
2,3899,2.0,I love my tablet. It is super easy and loads f...,20,1506,3.0
3,2326,1.5,Battery dies after an hour...should last longe...,7,188,1.0
4,2385,2.0,"It is great if you are a prime member , case i...",2,3023,3.0


In [36]:
DataTrain.to_csv("data/SentimentDataTrain.csv")

In [3]:
############################ TESTING #########################

In [26]:
# save testimony in a list
TestTestimony = [str(x) for x in DataTest.testimony]
# remove all punctuation
TestTestimony = [testimony.translate(str.maketrans('','',string.punctuation)) for testimony in TestTestimony ]
# make entire text lowercase
TestTestimony = [t.lower() for t in TestTestimony]

In [27]:
TestTestimony

['excellent reader and product for the price very easy to use',
 'i bought this tablet for my 13 yr old godchild and she loves it this was her christmas present',
 'i have an lg tablet but picked up this one for my dad and after messing with it for a while its very nice i think i like it more the my lg g pad',
 'i bought 2 of these 1 for each of my 2 youngest grandaughters wanted something that would be made well easy to use and as they get older the range of use could expand not only is it a good activity but a great learning tool i will download fun and game activities but learning also even though many of the fun and game are learning tools their ages are 3 12 and 5 and they love them easy to download items save to home screen or remove items their are several parental controls and options so you can feel worry free can add more than one user and profile',
 'i got this tablet so i wouldnt have to pack up my laptop when i went to my boyfriends when i want to do my online class this i

In [29]:
test_testimony_value = SentimentAnalysis(TestTestimony)
DataTest['testimony_value'] = test_testimony_value
DataTest.head()

Unnamed: 0.1,Unnamed: 0,rating,human,testimony,itemId,userId,testimony_value
0,2800,2.0,3,Excellent Reader and Product for the price. Ve...,8,3599,3.0
1,4019,2.0,3,I bought this tablet for my 13 yr. old God-chi...,20,2757,3.0
2,3778,2.0,3,I have an LG tablet but picked up this one for...,20,977,3.0
3,2368,2.0,3,"I bought 2 of these, 1 for each of my 2 younge...",7,1166,3.0
4,4912,1.0,3,I got this tablet so I wouldn't have to pack u...,19,302,3.0


In [30]:
# convert human type
human_list = [float(x) for x in DataTest['human']]
DataTest['human'] =  human_list
DataTest.head()

Unnamed: 0.1,Unnamed: 0,rating,human,testimony,itemId,userId,testimony_value
0,2800,2.0,3.0,Excellent Reader and Product for the price. Ve...,8,3599,3.0
1,4019,2.0,3.0,I bought this tablet for my 13 yr. old God-chi...,20,2757,3.0
2,3778,2.0,3.0,I have an LG tablet but picked up this one for...,20,977,3.0
3,2368,2.0,3.0,"I bought 2 of these, 1 for each of my 2 younge...",7,1166,3.0
4,4912,1.0,3.0,I got this tablet so I wouldn't have to pack u...,19,302,3.0


In [31]:
DataTest['check_label'] = np.where(DataTest['human'] != DataTest['testimony_value'], False , True) 
(DataTest['check_label']==False).sum()

61

In [32]:
DataTest[DataTest['check_label']==False]

Unnamed: 0.1,Unnamed: 0,rating,human,testimony,itemId,userId,testimony_value,check_label
7,4999,2.0,3.0,"this is a steal, have 8 gb model as well.This ...",19,764,1.0,False
14,4719,2.0,2.0,I bought this as an affordable alternative to ...,1,3582,3.0,False
36,4820,2.0,2.0,"Not to big, not to small. Just right for for t...",17,3445,3.0,False
38,2965,1.0,3.0,Great tablet other than the photo image qualit...,5,2040,1.0,False
46,2731,2.0,2.0,It was all that my grandson wanted and didn't ...,8,1043,3.0,False
56,1927,1.5,2.0,I was completely under the impression that thi...,2,2935,3.0,False
57,1692,1.5,2.0,You get what you pay for Was not to happy but ...,7,3352,1.0,False
58,4088,1.0,3.0,I purchased this when my last tablet died. It ...,20,715,2.0,False
63,3984,1.0,1.0,"I got this for reading books, I still like ipa...",20,3481,3.0,False
67,4204,2.0,3.0,"No complaints. I just wish it played .avi, but...",20,3614,2.0,False


In [37]:
# rename the columns
DataTest.rename(
    columns={
        'Unnamed: 0':'reviewId'}, 
    inplace=True)

DataTest.head()

Unnamed: 0,reviewId,rating,human,testimony,itemId,userId,testimony_value,check_label
0,2800,2.0,3.0,Excellent Reader and Product for the price. Ve...,8,3599,3.0,True
1,4019,2.0,3.0,I bought this tablet for my 13 yr. old God-chi...,20,2757,3.0,True
2,3778,2.0,3.0,I have an LG tablet but picked up this one for...,20,977,3.0,True
3,2368,2.0,3.0,"I bought 2 of these, 1 for each of my 2 younge...",7,1166,3.0,True
4,4912,1.0,3.0,I got this tablet so I wouldn't have to pack u...,19,302,3.0,True


In [41]:
DataTest[['reviewId', 'rating', 'testimony', 'itemId', 'userId','testimony_value' ]].to_csv("data/SentimentDataTest.csv")

In [None]:
############# evaluation ###########

In [34]:
tp = tn = fp = fn = net = 0
# for index, row in itemdf.iterrows():
#     itemname = str(row[1].lower())

for index, row in DataTest.iterrows():
    if row[6]==3:
        if row[6]==row[2]:
            tp +=1
        else:
            fp +=1
    elif row[6]==1:
        if row[6]==row[2]:
            tn +=1
        else:
            fn +=1
    elif row[6]==2:
        net += 1
total = tp + fp + tn + fn + net
print("tp = " + str(tp))
print("fp = " + str(fp))
print("tn = " + str(tn))
print("fn = " + str(fn))
print("total = "  + str(total))

accuracy = (tp + tn)/ total
precision = tp /(tp + fp)
recall = tp /(tp + fn)
print("accuracy = "  + str(accuracy))
print("precision = "  + str(precision))
print("recall = "  + str(recall))

tp = 381
fp = 22
tn = 12
fn = 11
total = 480
accuracy = 0.81875
precision = 0.9454094292803971
recall = 0.9719387755102041
