In [1]:
from lxml import html  
import requests
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import pygal
from IPython.display import SVG, HTML
from pygal.style import DarkSolarizedStyle
from pygal import Config
import datetime
import math
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error

In [2]:
def date_to_mjd(year,month,day):
    """Convert a calendar date to a Modified Julian Date (MJD).

    Parameters
    ----------
    year, month, day : numbers
        Calendar date. Dates before 15 October 1582 are handled without the
        Gregorian correction term.

    Returns
    -------
    float
        The Julian Date of the given day minus 2400000.5.
    """
    # January and February are counted as months 13 and 14 of the previous year.
    in_jan_or_feb = month == 1 or month == 2
    yearp = year - 1 if in_jan_or_feb else year
    monthp = month + 12 if in_jan_or_feb else month

    # Lexicographic comparison against the first day of the Gregorian calendar.
    before_gregorian = (year, month, day) < (1582, 10, 15)
    if before_gregorian:
        gregorian_term = 0
    else:
        century = math.trunc(yearp / 100.)
        gregorian_term = 2 - century + math.trunc(century / 4.)

    # Whole days contributed by the (shifted) year; negative years need the
    # extra 0.75 so truncation rounds in the right direction.
    year_days = math.trunc((365.25 * yearp) - 0.75) if yearp < 0 else math.trunc(365.25 * yearp)
    # Whole days contributed by the (shifted) month.
    month_days = math.trunc(30.6001 * (monthp + 1))

    julian_date = gregorian_term + year_days + month_days + day + 1720994.5
    return julian_date - 2400000.5

In [3]:
def get_total_pages(asin):
    """Return the number of review pages Amazon lists for a product.

    Fetches the first "most recent" review page for ``asin``, reads the
    pagination buttons and the total review count, prints both and returns
    the page count.

    Parameters
    ----------
    asin : str
        Amazon product identifier, inserted verbatim into the review URL.

    Returns
    -------
    int
        The last numeric pagination label, or 1 when the page shows no
        pagination buttons (e.g. a product with a single page of reviews).

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request fails or exceeds the timeout.
    """
    # A recent desktop Chrome user agent keeps Amazon from rejecting the request.
    # More strings: https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'}
    XPATH_REVIEW_PAGENUM = './/li[@class="page-button"]//text()'
    XPATH_TOTAL_REVIEWS = './/span[@data-hook="total-review-count"]//text()'
    amazon_url = 'https://www.amazon.com/product-reviews/' + asin + '/ref=cm_cr_arp_d_paging_btm_1?pageNumber=1&sortBy=recent'

    # Without a timeout a stalled connection would hang the notebook forever.
    page = requests.get(amazon_url, headers=headers, timeout=30)
    parser = html.fromstring(page.text.encode('utf-8'))

    # Keep only labels that are plain numbers once thousands separators are removed.
    page_labels = [label.replace(',', '').strip() for label in parser.xpath(XPATH_REVIEW_PAGENUM)]
    page_numbers = [int(label) for label in page_labels if label.isdigit()]
    # No pagination buttons means everything fits on one page.
    maxpage = page_numbers[-1] if page_numbers else 1

    count_labels = [label.replace(',', '').strip() for label in parser.xpath(XPATH_TOTAL_REVIEWS)]
    count_numbers = [int(label) for label in count_labels if label.isdigit()]
    totalreviews = count_numbers[0] if count_numbers else 'unknown'

    print('Total pages of reviews: {}'.format(maxpage))
    print('Total number of reviews: {}'.format(totalreviews))
    return maxpage

In [4]:
def scrape_reviews(asins):
    """Scrape every review page of each product into one DataFrame.

    Parameters
    ----------
    asins : iterable of str
        Amazon product identifiers to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per review. Each cell holds the raw list of text nodes matched
        by the corresponding XPath (columns ``review_text``,
        ``review_posted_date``, ``review_header``, ``review_rating``,
        ``review_helpful``, ``review_author``). Empty when nothing was scraped.

    Raises
    ------
    requests.exceptions.RequestException
        If an HTTP request fails or exceeds the timeout.
    """
    # A recent desktop Chrome user agent keeps Amazon from rejecting the request.
    # More strings: https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'}
    XPATH_REVIEWS = '//div[@data-hook="review"]'
    XPATH_REVIEW_RATING = './/i[@data-hook="review-star-rating"]//text()'
    XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
    XPATH_REVIEW_AUTHOR = './/a[@data-hook="review-author"]//text()'
    XPATH_REVIEW_DATE = './/span[@data-hook="review-date"]//text()'
    XPATH_REVIEW_BODY = './/span[@data-hook="review-body"]//text()'
    XPATH_REVIEW_HELPFUL = './/span[@data-hook="helpful-vote-statement"]//text()'

    # Rows are collected in a plain list and turned into a frame once at the
    # end; growing a DataFrame row by row is quadratic and DataFrame.append
    # no longer exists in pandas 2.x.
    review_records = []
    for asin in asins:
        totalpages = get_total_pages(asin)
        # Page numbering restarts at 1 for every product. At least one page is
        # always requested, matching the previous loop's behaviour.
        for p_num in range(1, max(totalpages, 1) + 1):
            print('Scraping review page nr. {}'.format(p_num))
            amazon_url = 'https://www.amazon.com/product-reviews/' + asin + '/ref=cm_cr_arp_d_paging_btm_' +str(p_num) + '?pageNumber=' + str(p_num) + '&sortBy=recent'
            page = requests.get(amazon_url, headers=headers, timeout=30)
            parser = html.fromstring(page.text.encode('utf-8'))
            reviews = parser.xpath(XPATH_REVIEWS)

            # An empty page means we ran past the last review (or were blocked).
            if not reviews:
                break

            for review in reviews:
                review_records.append({
                    'review_text': review.xpath(XPATH_REVIEW_BODY),
                    'review_posted_date': review.xpath(XPATH_REVIEW_DATE),
                    'review_header': review.xpath(XPATH_REVIEW_HEADER),
                    'review_rating': review.xpath(XPATH_REVIEW_RATING),
                    'review_helpful': review.xpath(XPATH_REVIEW_HELPFUL),
                    'review_author': review.xpath(XPATH_REVIEW_AUTHOR),
                })
    return pd.DataFrame(review_records)

In [5]:
# Pickle file used as an on-disk cache, and the product(s) to scrape.
filename = 'cellphone_1_reviews.pickle'
asins = ['B06Y6J869C']

# Hit the network only when no cached pickle exists yet; otherwise reuse it.
if not os.path.isfile(filename):
    print('Scraping reviews for item...')
    reviews_df = scrape_reviews(asins)
    print('Scraped {} reviews'.format(len(reviews_df)))
    print('Saving reviews to disk')
    reviews_df.to_pickle(filename)
else:
    print('Loading reviews from disk')
    reviews_df = pd.read_pickle(filename)
    print('Loaded {} reviews'.format(len(reviews_df)))

Loading reviews from disk
Loaded 346 reviews


In [6]:
def format_reviews(reviews_df):
    """Turn the raw scraped review frame into typed, de-duplicated columns.

    Every cell of the input is expected to be a list of text fragments (as
    produced by the scraper). The input frame is left untouched; a formatted
    copy is returned.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Raw reviews with at least the columns ``review_helpful``,
        ``review_posted_date``, ``review_rating`` and ``review_text``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input where
        * every column has been joined into a single string,
        * ``review_helpful`` is an int (0 when no vote statement was present),
        * ``review_posted_date`` is a datetime,
        * ``review_rating`` is the float star value,
        * ``review_length`` holds the character count of ``review_text``,
        * exact duplicate rows are removed (the index is not reset).
    """
    # Work on a copy so re-running the calling cell never sees half-formatted data.
    reviews_df = reviews_df.copy()

    # Each cell is a list of text nodes; collapse it into one string.
    for col in reviews_df.columns:
        reviews_df[col] = reviews_df[col].apply(lambda parts: '\n'.join(parts))

    # "One person found this helpful" -> "1", "12 people ..." -> "12", "" -> "0".
    # regex= is passed explicitly because its default differs between pandas versions.
    helpful = (reviews_df['review_helpful']
               .str.replace('One', '1', regex=False)
               .str.replace(r'[^0-9]', '', regex=True))
    reviews_df['review_helpful'] = helpful.replace('', '0').astype(int)

    # Drop the leading "on " in front of the date text before parsing it.
    date_text = (reviews_df['review_posted_date']
                 .str.replace(r'^\s*on\s+', '', regex=True)
                 .str.strip())
    reviews_df['review_posted_date'] = pd.to_datetime(date_text)

    # Take the first number of "5.0 out of 5 stars". str.strip() with that
    # phrase removes a *set of characters* and eats the leading "5" of a
    # five-star rating, so the number is extracted explicitly instead.
    reviews_df['review_rating'] = (reviews_df['review_rating']
                                   .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
                                   .astype(float))

    reviews_df['review_length'] = reviews_df['review_text'].str.len()
    return reviews_df.drop_duplicates()

In [7]:
# HTML page template used to embed a rendered pygal chart in a notebook output.
# `{pygal_render}` is the only placeholder; the plotting functions fill it with
# `html_pygal.format(pygal_render=...)`. The two <script> tags load
# svg.jquery.js and pygal-tooltips.js from kozea.github.com over plain http.
html_pygal = """
<!DOCTYPE html>
<html>
  <head>
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/svg.jquery.js"></script>
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/pygal-tooltips.js"></script>
    <!-- Bar Graph Demo -->
  </head>
  <body>
    <figure width="150" height="150">
      {pygal_render}
    </figure>
  </body>
</html>
"""

In [8]:
def plot_review_length_hist(reviews_df):
    """Render a histogram of review lengths as an embeddable HTML object.

    Lengths are grouped into 50-character bins covering (0, 1000]; reviews
    outside that range are not counted. The input frame is not modified.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Must contain a numeric ``review_length`` column.

    Returns
    -------
    IPython.display.HTML
        The pygal histogram wrapped in the ``html_pygal`` page template.
    """
    bins = np.arange(0, 1001, 50)
    # Bin into a local Series instead of adding a helper column to the
    # caller's frame.
    length_bins = pd.cut(reviews_df['review_length'], bins, right=True)
    # value_counts on a categorical keeps empty bins and, with sort=False,
    # returns them in bin order.
    bin_counts = length_bins.value_counts(sort=False)
    # pygal.Histogram expects (height, left edge, right edge) tuples.
    hist_bin = [(int(count), int(interval.left), int(interval.right))
                for interval, count in bin_counts.items()]

    review_length_hist = pygal.Histogram(show_legend=False, title=u'Review Length Histogram', dynamic_print_values=True,  style=DarkSolarizedStyle(
                      value_font_family='googlefont:Raleway',
                      value_font_size=30,
                      value_colors=('white',)), x_title='Review length in number of character', y_title='Number of Reviews')
    review_length_hist.add('# of reviews',  hist_bin)
    return HTML(html_pygal.format(pygal_render=review_length_hist.render(is_unicode=True)))

In [9]:
# Rebinds `reviews_df` to the formatted frame. Re-running this cell without
# reloading the raw data passes already-formatted data back into format_reviews.
reviews_df = format_reviews(reviews_df)

In [10]:
# Display the review-length histogram for the formatted reviews.
plot_review_length_hist(reviews_df)

In [11]:
def plot_review_stars(reviews_df):
    """Render a bar chart of star ratings for reviews longer than 200 characters.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Must contain ``review_length``, ``review_rating`` and
        ``review_author`` columns.

    Returns
    -------
    IPython.display.HTML
        The pygal bar chart wrapped in the ``html_pygal`` page template.

    Notes
    -----
    Exactly five bars are drawn, one per whole-star rating 1-5. Ratings with
    no long review are shown as 0, so every bar lines up with its label.
    Rating values other than 1-5 are not plotted.
    """
    star_values = [1.0, 2.0, 3.0, 4.0, 5.0]
    long_reviews = reviews_df[reviews_df['review_length'] > 200]
    # groupby only returns ratings that actually occur; reindexing onto the
    # full 1-5 scale keeps the counts aligned with the fixed x labels below.
    star_counts = (long_reviews.groupby('review_rating')['review_author']
                   .count()
                   .reindex(star_values, fill_value=0))

    review_star_bar = pygal.Bar(show_legend=False,dynamic_print_values=True,  style=DarkSolarizedStyle(
                      value_font_family='googlefont:Raleway',
                      value_font_size=30,
                      value_colors=('white',)))
    review_star_bar.title = 'Review Stars for Review Length > 200'
    review_star_bar.x_labels = ['1 Star', '2 Star', '3 Star', '4 Star', '5 Star']
    review_star_bar.add('# of Stars', star_counts.tolist())
    return HTML(html_pygal.format(pygal_render=review_star_bar.render(is_unicode=True)))

In [12]:
# Display the star-rating bar chart for reviews longer than 200 characters.
plot_review_stars(reviews_df)

In [13]:
def plot_monthly_sales(reviews_df):
    """Render a line chart of the number of reviews posted per calendar month.

    The monthly review count is used as a stand-in for monthly sales. Months
    without any review do not appear on the axis. The input frame is not
    modified.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Must contain a datetime ``review_posted_date`` column.

    Returns
    -------
    IPython.display.HTML
        The pygal line chart wrapped in the ``html_pygal`` page template.
    """
    posted = reviews_df['review_posted_date']
    # assign() returns a new frame, so the caller's frame gains no helper columns.
    monthly_reviews_counts = (
        reviews_df.assign(year=posted.dt.year, month=posted.dt.month)
        .groupby(['year', 'month'])['review_posted_date']
        .count()
        .reset_index(name='counts')
    )
    # Each month is represented by its first day.
    monthly_reviews_counts['day'] = 1
    monthly_reviews_counts['date'] = pd.to_datetime(monthly_reviews_counts[['year','month','day']])

    monthly_review_series = pygal.Line(x_label_rotation=20, show_legend=False,dynamic_print_values=True, style=DarkSolarizedStyle(
                      value_font_family='googlefont:Raleway',
                      value_font_size=30,
                      value_colors=('white',)))
    # A concrete list rather than a one-shot map iterator.
    monthly_review_series.x_labels = [d.strftime('%Y-%m-%d') for d in monthly_reviews_counts['date']]
    monthly_review_series.title = 'Estimated Monthly Sales Based on Number of Reviews'
    monthly_review_series.add("Sales", monthly_reviews_counts['counts'].tolist())
    return HTML(html_pygal.format(pygal_render=monthly_review_series.render(is_unicode=True)))

In [14]:
# Display the reviews-per-month line chart.
plot_monthly_sales(reviews_df)

In [30]:
# Hyperparameters passed to fit_poly_model_1 in this cell:
# polynomial degree and the `alpha` given to Ridge.
poly_degree = 6
ridge_alpha = 0.5
# scale first then polynomial
def fit_poly_model_1(reviews_df, poly_degree, ridge_alpha):
    """Fit a ridge polynomial to monthly review counts, scaling before expanding.

    The date of each month (as a Modified Julian Date) is standardised first
    and then expanded into polynomial features. Scaler statistics, the feature
    matrix and the fitted coefficients are printed. The input frame is not
    modified.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Must contain a datetime ``review_posted_date`` column.
    poly_degree : int
        Degree passed to ``PolynomialFeatures``.
    ridge_alpha : float
        ``alpha`` passed to ``Ridge``.

    Returns
    -------
    float
        Mean squared error of the fit on the training data itself.
    """
    posted = reviews_df['review_posted_date']
    # assign() returns a new frame, so the caller's frame gains no helper columns.
    monthly_reviews_counts = (
        reviews_df.assign(year=posted.dt.year, month=posted.dt.month)
        .groupby(['year', 'month'])['review_posted_date']
        .count()
        .reset_index(name='counts')
    )
    # Each month is represented by its first day.
    monthly_reviews_counts['day'] = 1
    monthly_reviews_counts['date'] = pd.to_datetime(monthly_reviews_counts[['year','month','day']])
    monthly_reviews_counts['mjd'] = monthly_reviews_counts.apply(
        lambda row: date_to_mjd(row['year'], row['month'], row['day']), axis=1)

    x_training = monthly_reviews_counts['mjd'].values.reshape(-1, 1)
    x_training_scaled = preprocessing.scale(x_training)
    # This scaler is fitted only to report the mean / scale used above.
    scaler = preprocessing.StandardScaler().fit(x_training)
    print(scaler.mean_)
    print(scaler.scale_)
    get_poly = PolynomialFeatures(poly_degree)
    x_training_poly = get_poly.fit_transform(x_training_scaled)
    print(x_training_poly)
    y_training = monthly_reviews_counts['counts'].values.reshape(-1, 1)
    model = Ridge(alpha=ridge_alpha)
    model.fit(x_training_poly, y_training)
    print(model.intercept_)
    print(model.coef_)
    y_predicted = model.predict(x_training_poly)
    return mean_squared_error(y_training, y_predicted)
# Run the scale-then-polynomial fit; the returned training-data MSE is the cell's output.
fit_poly_model_1(reviews_df, poly_degree, ridge_alpha)

[58072.4375]
[140.21321654]
[[ 1.00000000e+00 -1.62921517e+00  2.65434208e+00 -4.32449440e+00
   7.04553190e+00 -1.14786875e+01  1.87012518e+01]
 [ 1.00000000e+00 -1.41525532e+00  2.00294761e+00 -2.83468225e+00
   4.01179912e+00 -5.67772003e+00  8.03542345e+00]
 [ 1.00000000e+00 -1.19416346e+00  1.42602637e+00 -1.70290859e+00
   2.03355122e+00 -2.42839256e+00  2.89989766e+00]
 [ 1.00000000e+00 -9.80203603e-01  9.60799103e-01 -9.41778742e-01
   9.23134916e-01 -9.04860170e-01  8.86947199e-01]
 [ 1.00000000e+00 -7.59111749e-01  5.76250647e-01 -4.37438636e-01
   3.32064808e-01 -2.52074297e-01  1.91352561e-01]
 [ 1.00000000e+00 -5.38019895e-01  2.89465407e-01 -1.55738148e-01
   8.37902219e-02 -4.50808064e-02  2.42543707e-02]
 [ 1.00000000e+00 -3.24060036e-01  1.05014907e-01 -3.40311345e-02
   1.10281307e-02 -3.57377642e-03  1.15811812e-03]
 [ 1.00000000e+00 -1.02968182e-01  1.06024465e-02 -1.09171464e-03
   1.12411872e-04 -1.15748461e-05  1.19184086e-06]
 [ 1.00000000e+00  1.10991677e-01  1

20.251261938363328

In [31]:
# Hyperparameters passed to fit_poly_model_2 in this cell. These rebind the
# notebook-level names `poly_degree` and `ridge_alpha`.
poly_degree = 15
ridge_alpha = 0.5
# polynomial first then scale
def fit_poly_model_2(reviews_df, poly_degree, ridge_alpha):
    """Fit a ridge polynomial to monthly review counts, expanding before scaling.

    The date of each month (as a Modified Julian Date) is expanded into
    polynomial features first, and the resulting columns are standardised
    afterwards. Scaler statistics, the feature matrix and the fitted
    coefficients are printed. The input frame is not modified.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Must contain a datetime ``review_posted_date`` column.
    poly_degree : int
        Degree passed to ``PolynomialFeatures``.
    ridge_alpha : float
        ``alpha`` passed to ``Ridge``.

    Returns
    -------
    float
        Mean squared error of the fit on the training data itself.
    """
    posted = reviews_df['review_posted_date']
    # assign() returns a new frame, so the caller's frame gains no helper columns.
    monthly_reviews_counts = (
        reviews_df.assign(year=posted.dt.year, month=posted.dt.month)
        .groupby(['year', 'month'])['review_posted_date']
        .count()
        .reset_index(name='counts')
    )
    # Each month is represented by its first day.
    monthly_reviews_counts['day'] = 1
    monthly_reviews_counts['date'] = pd.to_datetime(monthly_reviews_counts[['year','month','day']])
    monthly_reviews_counts['mjd'] = monthly_reviews_counts.apply(
        lambda row: date_to_mjd(row['year'], row['month'], row['day']), axis=1)

    x_training = monthly_reviews_counts['mjd'].values.reshape(-1, 1)
    get_poly = PolynomialFeatures(poly_degree)
    x_training_poly = get_poly.fit_transform(x_training)
    x_training_scaled = preprocessing.scale(x_training_poly)
    # Column 0 is the constant bias term; restore it to 1 after scaling.
    x_training_scaled[:,0] = 1
    # This scaler is fitted only to report the per-column mean / scale.
    scaler = preprocessing.StandardScaler().fit(x_training_poly)
    print(scaler.mean_)
    print(scaler.scale_)
    print(x_training_scaled)
    y_training = monthly_reviews_counts['counts'].values.reshape(-1, 1)
    model = Ridge(alpha=ridge_alpha)
    model.fit(x_training_scaled, y_training)
    print(model.intercept_)
    print(model.coef_)
    y_predicted = model.predict(x_training_scaled)
    return mean_squared_error(y_training, y_predicted)
# Run the polynomial-then-scale fit; the returned training-data MSE is the cell's output.
fit_poly_model_2(reviews_df, poly_degree, ridge_alpha)

[1.00000000e+00 5.80724375e+04 3.37242766e+09 1.95847378e+14
 1.13735335e+19 6.60504214e+23 3.83582076e+28 2.22763252e+33
 1.29369329e+38 7.51314257e+42 4.36329390e+47 2.53401880e+52
 1.47166083e+57 8.54689081e+61 4.96376384e+66 2.88281383e+71]
[1.00000000e+00 1.40213217e+02 1.62849016e+07 1.41854984e+12
 1.09838235e+17 7.97325736e+21 5.55637132e+26 3.76456061e+31
 2.49852957e+36 1.63236502e+41 1.05331203e+46 6.72874411e+50
 4.26293653e+55 2.68200311e+60 1.67739086e+65 1.04373397e+70]
[[ 1.         -1.62921517 -1.62723249 -1.62525063 -1.62326961 -1.62128944
  -1.61931012 -1.61733167 -1.6153541  -1.61337741 -1.61140161 -1.60942672
  -1.60745273 -1.60547967 -1.60350753 -1.60153634]
 [ 1.         -1.41525532 -1.41405711 -1.41285784 -1.41165751 -1.41045613
  -1.4092537  -1.40805025 -1.40684577 -1.40564027 -1.40443377 -1.40322627
  -1.40201778 -1.40080832 -1.39959788 -1.39838649]
 [ 1.         -1.19416346 -1.19365977 -1.19315385 -1.19264569 -1.1921353
  -1.19162269 -1.19110787 -1.19059085 -



68.8706950618056

## Conclusion: scale the feature first, then generate the high-degree polynomial features from the scaled values

In [32]:
def add_3_months(datetime_df):
    """Extend a year/month/day frame by the three months after its last row.

    Parameters
    ----------
    datetime_df : pandas.DataFrame
        Must contain ``year``, ``month`` and ``day`` columns; the last row is
        taken as the most recent month. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The input rows followed by three new rows (first day of each of the
        next three months), re-indexed 0..n-1, with a ``date`` column
        (datetime) and an ``mjd`` column computed by ``date_to_mjd``.

    Raises
    ------
    ValueError
        If ``datetime_df`` has no rows.
    """
    if len(datetime_df.index) == 0:
        raise ValueError('add_3_months needs at least one row to extend from')

    # Positional access, so the frame does not need a 0..n-1 index.
    last_row = datetime_df.iloc[-1]
    year, month = int(last_row['year']), int(last_row['month'])

    future_rows = []
    for _ in range(3):
        # Roll over into January of the next year after December.
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1
        future_rows.append({'year': year, 'month': month, 'day': 1})

    # One concat instead of repeated DataFrame.append, which pandas 2.x removed.
    extended = pd.concat(
        [datetime_df, pd.DataFrame(future_rows, columns=['year', 'month', 'day'])],
        ignore_index=True)
    extended['date'] = pd.to_datetime(extended[['year','month','day']])
    extended['mjd'] = extended.apply(
        lambda row: date_to_mjd(row['year'], row['month'], row['day']), axis=1)
    return extended

In [34]:
def scale_features(input_feature_array, means, scales):
    """Standardise a 2-D feature matrix, leaving column 0 unscaled.

    Column 0 is treated as a constant bias column: its mean is forced to 0
    and its scale to 1, so it passes through unchanged. Every other column
    becomes ``(value - mean) / scale``.

    Parameters
    ----------
    input_feature_array : array-like, shape (n_samples, n_features)
        Features to scale. Not modified.
    means, scales : array-like, shape (n_features,)
        Per-column statistics, e.g. ``StandardScaler.mean_`` / ``.scale_``.
        Not modified.

    Returns
    -------
    numpy.ndarray
        New float array with the scaled features.

    Notes
    -----
    Do not use this for a matrix whose first column is a real feature: that
    column would be returned unscaled.
    """
    # Copy the statistics: overwriting them in place would silently corrupt
    # the caller's scaler (mean_ / scale_ are passed by reference).
    means = np.array(means, dtype=float)
    scales = np.array(scales, dtype=float)
    means[0] = 0
    scales[0] = 1
    features = np.asarray(input_feature_array, dtype=float)
    # Broadcasting applies the per-column statistics to every row at once.
    return (features - means) / scales

In [40]:
# Hyperparameters passed to predict_poly_model in this cell. These rebind the
# notebook-level names `poly_degree` and `ridge_alpha`.
poly_degree = 5
ridge_alpha = 0.5
def predict_poly_model(reviews_df, poly_degree, ridge_alpha):
    """Fit the scale-then-polynomial ridge model and chart a 3-month forecast.

    Monthly review counts are fitted against the standardised Modified Julian
    Date of each month. The fitted model is then evaluated on the observed
    months plus the three following months, and that curve is plotted.
    Scaler statistics, the training feature matrix, the fitted coefficients
    and the training MSE are printed. The input frame is not modified.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Must contain a datetime ``review_posted_date`` column.
    poly_degree : int
        Degree passed to ``PolynomialFeatures``.
    ridge_alpha : float
        ``alpha`` passed to ``Ridge``.

    Returns
    -------
    IPython.display.HTML
        The pygal line chart wrapped in the ``html_pygal`` page template.
    """
    posted = reviews_df['review_posted_date']
    # assign() returns a new frame, so the caller's frame gains no helper columns.
    monthly_reviews_counts = (
        reviews_df.assign(year=posted.dt.year, month=posted.dt.month)
        .groupby(['year', 'month'])['review_posted_date']
        .count()
        .reset_index(name='counts')
    )
    # Each month is represented by its first day.
    monthly_reviews_counts['day'] = 1
    monthly_reviews_counts['date'] = pd.to_datetime(monthly_reviews_counts[['year','month','day']])
    monthly_reviews_counts['mjd'] = monthly_reviews_counts.apply(
        lambda row: date_to_mjd(row['year'], row['month'], row['day']), axis=1)

    # --- fit on the observed months -------------------------------------
    x_training = monthly_reviews_counts['mjd'].values.reshape(-1, 1)
    # One scaler for training and forecasting, so both see the same transform.
    scaler = preprocessing.StandardScaler().fit(x_training)
    print(scaler.mean_)
    print(scaler.scale_)
    x_training_scaled = scaler.transform(x_training)
    get_poly = PolynomialFeatures(poly_degree)
    x_training_poly = get_poly.fit_transform(x_training_scaled)
    print(x_training_poly)
    y_training = monthly_reviews_counts['counts'].values.reshape(-1, 1)
    model = Ridge(alpha=ridge_alpha)
    model.fit(x_training_poly, y_training)
    print(model.intercept_)
    print(model.coef_)
    y_modeled = model.predict(x_training_poly)
    mse = mean_squared_error(y_training, y_modeled)
    print('MSE:{}'.format(mse))

    # --- evaluate on observed months + 3 future months --------------------
    prediction_time_add_3_months = add_3_months(monthly_reviews_counts[['year', 'month', 'day']])
    feature_to_be_feeded = prediction_time_add_3_months['mjd'].values.reshape(-1, 1)
    # scaler.transform is used here rather than scale_features: the latter
    # leaves column 0 untouched, and column 0 is the only (MJD) feature here.
    input_feature_array_scaled = scaler.transform(feature_to_be_feeded)
    # transform (not fit_transform): the expansion was already fitted above.
    input_feature_array_poly = get_poly.transform(input_feature_array_scaled)
    output_predicted = model.predict(input_feature_array_poly)
    # Flatten the (n, 1) prediction array into a plain list for pygal.
    output_predicted_tolist = output_predicted.ravel().tolist()

    sales_prediction = pygal.Line(x_label_rotation=20, show_legend=False,dynamic_print_values=True, style=DarkSolarizedStyle(
                  value_font_family='googlefont:Raleway',
                  value_font_size=30,
                  value_colors=('white',)))
    # Labels cover the observed months and the three forecast months, so they
    # line up one-to-one with the plotted values.
    sales_prediction.x_labels = [d.strftime('%Y-%m-%d') for d in prediction_time_add_3_months['date']]
    sales_prediction.title = 'Monthly Sales Prediction Using Polynomial Regression'
    sales_prediction.add("Sales", output_predicted_tolist)
    return HTML(html_pygal.format(pygal_render=sales_prediction.render(is_unicode=True)))
# Fit the model and display the resulting chart as the cell's output.
predict_poly_model(reviews_df, poly_degree, ridge_alpha)

[58072.4375]
[140.21321654]
[[ 1.00000000e+00 -1.62921517e+00  2.65434208e+00 -4.32449440e+00
   7.04553190e+00 -1.14786875e+01]
 [ 1.00000000e+00 -1.41525532e+00  2.00294761e+00 -2.83468225e+00
   4.01179912e+00 -5.67772003e+00]
 [ 1.00000000e+00 -1.19416346e+00  1.42602637e+00 -1.70290859e+00
   2.03355122e+00 -2.42839256e+00]
 [ 1.00000000e+00 -9.80203603e-01  9.60799103e-01 -9.41778742e-01
   9.23134916e-01 -9.04860170e-01]
 [ 1.00000000e+00 -7.59111749e-01  5.76250647e-01 -4.37438636e-01
   3.32064808e-01 -2.52074297e-01]
 [ 1.00000000e+00 -5.38019895e-01  2.89465407e-01 -1.55738148e-01
   8.37902219e-02 -4.50808064e-02]
 [ 1.00000000e+00 -3.24060036e-01  1.05014907e-01 -3.40311345e-02
   1.10281307e-02 -3.57377642e-03]
 [ 1.00000000e+00 -1.02968182e-01  1.06024465e-02 -1.09171464e-03
   1.12411872e-04 -1.15748461e-05]
 [ 1.00000000e+00  1.10991677e-01  1.23191523e-02  1.36732337e-03
   1.51761513e-04  1.68442648e-05]
 [ 1.00000000e+00  3.32083531e-01  1.10279471e-01  3.66219962e-