In [2]:
import json
import pandas as pd
import gzip
import math
from datetime import datetime

# To count frequency in lists
import collections

# For graphing
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

import spacy
from spacy import displacy

In [37]:
# Load data
df = pd.read_csv('/Users/yujinglai/Dropbox/Eugenie/data/processed_julian_amazon_data/merged_review_lemma.csv', index_col=0, low_memory=False)

In [3]:
df.shape

(244775, 2)

In [4]:
# Set up NLP
nlp = spacy.load("en_core_web_sm")

In [120]:
x = df.loc[df['reviewText'].str.contains("incentive"), 'reviewText']
print(len(list(x)))

28


In [5]:
def getToken(doc):
    for token in doc:
        print(token.lemma_, token.dep_, token.head.text)

In [6]:
doc = nlp("I received no incentives")
getToken(doc)

-PRON- nsubj received
receive ROOT received
no det incentives
incentive dobj received


In [7]:
df.head()

Unnamed: 0,reviewText,lemma
0,I got the charger 2 weeks ago and one week lat...,-PRON- get the charger 2 week ago and one wee...
1,Not only does it take 20 minutes to charge a s...,not only do -PRON- take 20 minute to charge a...
2,This charger was great while it lasted. Howeve...,this charger be great while -PRON- last . how...
3,cable is too short,cable be too short
4,Metal piece came out. I pushed it back in but...,metal piece come out . -PRON- push -PRON- b...


In [40]:
##### PLAN #####
# 1. Get lemma
# 2. Identify reviews with incentive termination strings
# 3. Find negation 
# 4. Compare the positive and negative reviews

In [38]:
# 2. Identify reviews with incentive termination strings
incentivized_flags = ['incentivize', 'incentive', 'discount', 'affiliate', 'promote', 'promotion', 'sponsor', 'sponsorship', 'discount']
reviews = df.loc[df['lemma'].apply(lambda x: any([k in x for k in incentivized_flags]))]

In [9]:
reviews.head()

Unnamed: 0,reviewText,lemma
774,I've been using it in my car for 3 months and ...,-PRON- have be use -PRON- in -PRON- car for 3...
825,I went to a discount store near my home to pur...,-PRON- go to a discount store near -PRON- hom...
1042,"I received my 5-Port USB Wall Charger, EZOPowe...",-PRON- receive -PRON- 5-port usb Wall Charger...
1147,My Iphones have always said that this device i...,-PRON- iphone have always say that this devic...
1217,With me always carrying an Android device and ...,with -PRON- always carry an Android device an...


In [51]:
# 3. Find negation 
# Construct a string array
doc = nlp("-PRON- be not incentivized.")
displacy.serve(doc, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [68]:
negation_tokens = [tok for tok in doc if tok.text in ['no','not']]
negation_head_tokens = [token.head.text for token in negation_tokens]

In [69]:
negation_head_tokens

['incentivized']

In [62]:
doc = nlp("This was not given to me or discounted in anyway")
it_ancestors = doc[2].ancestors
for i in it_ancestors:
    print(i.text)
    for j in i.children:
        print(j.text)

given
This
was
not
to
or
discounted
anyway


In [82]:
getNegation_head_tokens("This was not given to me or discounted in anyway")

not


['or', 'not', 'discounted', 'to', 'anyway', 'was', 'given']

In [6]:
def getNegation_head_tokens(x):
    doc = nlp(x)
    negation_tokens = [tok for tok in doc if tok.text in ['not','no']]
    negation_head_tokens = [token.head.text for token in negation_tokens]
    return negation_head_tokens

In [7]:
def getNegation_tree(x):
    doc = nlp(x)
    negation_tokens = [tok for tok in doc if tok.text in ['not','no']]
    negation_head_tokens = [token.head.text for token in negation_tokens]
    negation_ancestors = []
    for n in negation_tokens:
        n_tokens = n.ancestors
        # The first jump
        for i in n_tokens:
            negation_ancestors.append(i.text)
            children = i.children
            # The second jump
            for j in children:
                negation_ancestors.extend([t.text for t in children])
    result = negation_ancestors + negation_head_tokens
    if len(result) > 0:
        return list(set(result))
    return []

In [8]:
def ifNegation_incentivized(x):
    negation_head_tokens = getNegation_tree(x)
    for i in negation_head_tokens:
        if i in incentivized_flags:
            return 1
    return 0

In [39]:
reviews['if_neg'] = reviews['lemma'].apply(ifNegation_incentivized)

In [42]:
reviews.head()

Unnamed: 0,incentivized,if_neg
774,0,1
825,1,0
1042,1,0
1147,1,0
1217,0,1


In [41]:
reviews['incentivized'] = 1
reviews['incentivized'] = reviews['incentivized'] - reviews['if_neg']
reviews = reviews[['incentivized','if_neg']]

In [94]:
reviews.groupby(['if_neg','if_neg_tree']).count()
#before/after
#apply it to other datasets
#plot the histogram on the rating (1-5): a way to reduce the sample size again to enlarge the affect of 
#incentivized revies because they are such a small portion

Unnamed: 0_level_0,Unnamed: 1_level_0,reviewText,lemma
if_neg,if_neg_tree,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,537,537
0,1,67,67
1,1,30,30


In [44]:
df_incent = pd.merge(merged_reviews, reviews, left_index=True, right_index=True, how='left')
df_incent['incentivized'] = df_incent['incentivized'].fillna(int(0))
df_incent['if_neg'] = df_incent['if_neg'].fillna(int(0))

In [43]:
# Load data
merged_reviews = pd.read_csv('/Users/yujinglai/Dropbox/Eugenie/data/processed_julian_amazon_data/reviews_merged_with_similar_amazon_products_cellphone.csv', index_col=0, low_memory=False)

In [49]:
df_incent['if_neg'] = df_incent['if_neg'].fillna(int(0))
df_incent.head()

Unnamed: 0,asin,verified,reviewText,overall,summary,unixReviewTime,reviewerName,reviewTime,vote,image,...,sim2,sim3,sim4,sim5,sim6,sim7,sim8,sim9,incentivized,if_neg
0,9713957334,True,I got the charger 2 weeks ago and one week lat...,1.0,This is a bad product. At least the charger is...,1489968000,oswaldo lopez,2017-03-20,2.0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
1,9713957334,True,Not only does it take 20 minutes to charge a s...,1.0,Trash,1489622400,Ted Curtas,2017-03-16,0.0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
2,9713957334,True,This charger was great while it lasted. Howeve...,1.0,Does not last,1489190400,Emily,2017-03-11,0.0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
3,9713957334,True,cable is too short,3.0,Three Stars,1489104000,Rodney,2017-03-10,0.0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
4,9713957334,True,Metal piece came out. I pushed it back in but...,1.0,Don't trust,1486944000,Alicia,2017-02-13,0.0,0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [50]:
df_incent.to_csv('/Users/yujinglai/Dropbox/Eugenie/data/processed_julian_amazon_data/reviews_merged_with_similar_amazon_products_cellphone.csv')

In [33]:
df_incent.head()

Unnamed: 0,reviewText,lemma,incentivized
0,I got the charger 2 weeks ago and one week lat...,-PRON- get the charger 2 week ago and one wee...,0.0
1,Not only does it take 20 minutes to charge a s...,not only do -PRON- take 20 minute to charge a...,0.0
2,This charger was great while it lasted. Howeve...,this charger be great while -PRON- last . how...,0.0
3,cable is too short,cable be too short,0.0
4,Metal piece came out. I pushed it back in but...,metal piece come out . -PRON- push -PRON- b...,0.0


In [104]:
if_neg_tree1 = reviews[(reviews['if_neg'] ==0) & (reviews['if_neg_tree'] ==1)]
for i in list(if_neg_tree1['reviewText'].head()):
    print('Review:')
    print(i)
    print("")

Review:
With me always carrying an Android device and my wife going over to the dark side and getting an iPhone, we couldn't both charge at the same time in a vehicle. There are thousands of car chargers out there, but having a USB option allows for future cables as well that don't exist yet, like Apple that changes their format frequently.
I have mad many chargers break, fail, or plain fall apart over the years and I have had great luck with Amazon Basic batteries in the past, so I thought I would give these a try. So far so good!
I did not receive a discount on this product for my honest opinion, I paid full price for the item.

Review:
Love this little gadget! Whilst most cases/covers can form a folding stand, sometimes they just don't sit at the right angle, especially when using Skype or other video chat apps. Sometimes you need to have your iPad in portrait view, perhaps when using website based recipes... In any case, this little stand is perfect. Clean lines, easily but securel

In [97]:
for i in list(reviews.loc[reviews['if_neg'] == 0,'reviewText'].head()):
    print('Review:')
    print(i)
    print("")

Review:
I went to a discount store near my home to purchase a power adapter for my auto cigarette port. While there I was amazed to see that the pricing was so high. I pulled out my trusty iPhone I learned that Amazon had a similar adapter for about 50% less. I left the store and ordered the adapter from my car. I expected something less than the picture revealed. I was surprise to see the power adapter arrive nicer than I anticipated and included double stick take to hold the adapter in place. Great buy...lesson learned, shop with your smart phone and you will likely save money. I saved about $10 and received a device nicer than expected.

Review:
I received my 5-Port USB Wall Charger, EZOPower 36W 7.2A 5-Port USB Desktop Travel Charger Adapter with 8ft Extension Power Cord- Black, from Amazon, and wasted no time in testing it out. I had just gotten home from a bit of PokimonGo, so my phone was below 20%!

The item came in a great package, I am sure the items inside were well protecte

In [115]:
def getSent(x):
    doc = nlp(x)
    incen_sent = ''
    incen_tokens = [tok for tok in doc if tok.text in incentivized_flags]
    for i in incen_tokens:
        incen_sent += ' ' + str(i.sent)
    return incen_sent

In [116]:
getSent("I be given incentive. I be incentivize.")

' I be given incentive. I be incentivize.'

In [117]:
incen_sents = pd.DataFrame(reviews['lemma'].apply(getSent))
incen_sents.head()

Unnamed: 0,lemma
774,"accept no discount , trade or payment of any ..."
825,-PRON- go to a discount store near -PRON- ho...
1042,do receive this product at a discount or for ...
1147,-PRON- buy a Belkin specifically promote by A...
1217,do not receive a discount on this product for...


In [118]:
incen_sents.to_csv('incen_sents.csv')