# Exploratory Analysis: Helpful Reviews

1. Helpful vs. Not Helpful Reviews
2. Sentence Generation
3. 
    
TODO:
+ Can review headline indicate helpful rating?
  + using the few words in headline, indicate a good rating?
+ Helpful reviews length
+ Helpeful votes with Nutrients/Categories?
  + maybe some products require more helpful votes?
+ model helpful reviews by review words...etc
  + since helpful reviews are likel to influence other users
+ helpful reivews
  + would some categories have more helpful reviews?
  + would some nutrients product have more helpful reviews?

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [2]:
import os
from sqlalchemy import create_engine

from dotenv import load_dotenv # env variables
load_dotenv(verbose=True)

True

In [3]:
SQLALCHEMY_DATABASE_URI = os.getenv('DATABASE_URL')
engine = create_engine(SQLALCHEMY_DATABASE_URI)

use merged data from file
```python
data_path = 'D:\DATA\OurFoods'
df = pd.read_csv(os.path.join(data_path, 'merged_amz-off_3.csv.gz'),\
                 dtype={'customer_id': 'object', 'product_parent': 'object', \
                        'star_rating': pd.Int64Dtype(), 'helpful_votes': pd.Int64Dtype(), 
                        'total_votes': pd.Int64Dtype(), \
                        'code': 'object'},
                 compression='gzip')
# convert reivew_date to datetime object
df.review_date = pd.to_datetime(df.review_date)
df.shape
```

### 1. Helpful vs. Not Helpful
+ using threshold of 0.5 to evalutate helpful or not

In [4]:
sql = \
"""
SELECT 
    review_id, 
    helpful_votes, total_votes,
    review_headline, review_body
FROM 
    food_reviews
WHERE 
    energy_100g IS NOT NULL
    AND energy_100g < 3000
    AND review_date >= '2010-01-01'
    AND verified_purchase LIKE 'Y'
    AND total_votes > 0
"""
# assume helpful_votes / total_votes > .5 is a helpful review
df = pd.read_sql(sql, con=engine)\
    .assign(helpful=lambda df: df.helpful_votes/df.total_votes >= .5)
df.shape

(44363, 6)

In [18]:
df.total_votes.value_counts().describe()

count      173.000000
mean       256.433526
std       1722.682708
min          1.000000
25%          1.000000
50%          2.000000
75%         11.000000
max      20236.000000
Name: total_votes, dtype: float64

### 2. Helpful vs. Not Helpful: Sentence Generation
+ What are helpful and non-helpful review saying?
+ Using Trigram model for sentence generations
+ Choose different start word to generate

In [19]:
import re
import json
import operator
# import pprint
from nltk.util import ngrams
from collections import Counter

In [20]:
sql = \
"""
SELECT 
    review_id, helpful_votes, total_votes,
    review_headline, review_body
FROM 
    food_reviews
WHERE 
    energy_100g IS NOT NULL
    AND energy_100g < 3000
    AND review_date >= '2010-01-01'
    AND verified_purchase LIKE 'Y'
    AND total_votes > 0
"""
# assume helpful_votes / total_votes > .5 is a helpful review
df = pd.read_sql(sql, con=engine)\
    .assign(helpful=lambda df: df.helpful_votes/df.total_votes >= .5)
df.shape

(44363, 6)

In [21]:
# given a review string, return list of list of tokens
def reviewTokenize(review):
    # remove line breaker, non-word, except space and period,
    review =  re.sub(r'<br />|[^A-Za-z. ]', '', review.lower())
    # remove space at setence end
    review =  re.sub(r'\s$', '', review)
    # reduce multiple space to single
    review = re.sub(' +', ' ', review) 
    # split by end of sentence 
    sents = review.split('.')
    # split into tokens, ignore empty string
    return [[token for token in sent.split(' ') if token != ''] \
            for sent in sents if sent != '' ] 

In [22]:
# using dict, then convert to Counter
def build_trigrams(review_bodys):
    model = {}
    for review in review_bodys:
        for sent in reviewTokenize(review):
            trigrams = ngrams(sent, 3, pad_left=True, pad_right=True, \
                             left_pad_symbol='<s>', right_pad_symbol='</s>')
            for trigram in trigrams:
                if model.get(trigram) is not None:
                    model[trigram] += 1
                else:
                    model[trigram] = 1
    return model

In [23]:
# turn the result into dict 
def trigram_to_dict(trigrams):
    d = {}
    for words, p in trigrams: 
        w1, w2, w3 = words
        if d.get(w1) is not None:
            if d[w1].get(w2) is not None:
                d[w1][w2][w3] = p
            else:
                d[w1][w2] = {w3: p}
        else:
            d[w1] = {w2: {w3: p}}
    return d
# pprint.pprint(d)

### Helpful Reviews

In [30]:
modelHelpful = build_trigrams(df[df.helpful == True].review_body)
freqHelpful = [(k, v / len(modelHelpful) * 100.0) for k, v in modelHelpful.items()]
helpful = trigram_to_dict(freqHelpful)

In [31]:
textLen = 20
sentence = ['<s>', '<s>']
while len(sentence) < textLen:
    w1, w2 = sentence[-2:]
    if (w1 == '</s>') & (w2 == '</s>'): 
        break
    next_word = max(helpful[w1][w2].items(), key=operator.itemgetter(1))[0]
    sentence.append(next_word)
' '.join(sentence)

'<s> <s> i have been using this product </s> </s>'

### Non-helpful Reviews

In [26]:
mdlNotHelpful = build_trigrams(df[df.helpful == False].review_body)
freqNotHelpful = [(k, v / len(mdlNotHelpful) * 100.0) for k, v in mdlNotHelpful.items()]
notHelpful = trigram_to_dict(freqNotHelpful)

In [27]:
textLen = 20
sentence = ['<s>', '<s>']
while len(sentence) < textLen:
    w1, w2 = sentence[-2:]
    if (w1 == '</s>') & (w2 == '</s>'): 
        break
    next_word = max(notHelpful[w1][w2].items(), key=operator.itemgetter(1))[0]
    sentence.append(next_word)
' '.join(sentence)

'<s> <s> i have to say i am not a good price </s> </s>'

### Export both dict

In [28]:
# reduce frequency dict size
def reduce_dictSize(freqDict, size=5, startingSize=30): 
    for w1 in freqDict.keys():
        for w2 in freqDict[w1].keys():
            # if sentence starting tokens
            if (w1 == '<s>') & (w2 == '<s>'): 
                tmp = dict(sorted(freqDict[w1][w2].items(), \
                                  key=lambda kv: kv[1], reverse=True)[:startingSize])
            # if other tokens
            else: 
                tmp = dict(sorted(freqDict[w1][w2].items(), \
                                  key=lambda kv: kv[1], reverse=True)[:size])
            freqDict[w1][w2] = tmp
    return freqDict

In [22]:
d = {'helpful': reduce_dictSize(helpful, 3, 10), \
     'notHelpful': reduce_dictSize(notHelpful, 3, 10) }

In [23]:
# with open('helpful_reviews.json', 'w', encoding='utf-8') as f:
#     json.dump(d, f, ensure_ascii=False, indent=4)

In [24]:
len(d['helpful']), len(d['notHelpful'])

(33921, 14493)

In [84]:
def recursion(sent, sents, trigrams):
    w1, w2 = sent[-2:]
     # terminate condition: end of sentence or sentence length
    if (w1 == '</s>') | (w2 == '</s>'): 
        sents.append(sent[1:])
        return sents
    if len(sent) >= 12: # limit setence length
        return sents
    
    # iterate each word on subset 
    for word in trigrams[w1][w2].keys():
        sents = recursion(sent+[word], sents, trigrams)
    return sents

In [86]:
# build sentences to visualization
helpfulSents = recursion(['<s>', '<s>', 'i'], [], helpful)
notHelpfulSents = recursion(['<s>', '<s>', 'i'], [], notHelpful)
len(helpfulSents), len(notHelpfulSents)

(1601, 1352)

In [None]:
for sent in hel

In [85]:
v = {'helpful': helpfulSents, 
    'notHelpful': notHelpfulSents}
with open('helpful_sents.json', 'w', encoding='utf-8') as f:
    json.dump(v, f, ensure_ascii=False, indent=4)