In [399]:
import pandas as pd
import numpy as np
import requests
from selenium import webdriver
import time
import bs4
import re
import os 
import math
from tqdm import tqdm_notebook as tqdm
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

## import datasets
1. review_dataframe_mega_ALL_New.csv : the web-scraped dataset of Reviewmeta
2. wrong_link.csv: After about two months, we checked the reviews in the web-scraped RM dataset and found out that actually a certain amount of them have been already deleted.
check this link: https://www.amazon.com/gp/customer-reviews/R1KIX5COX51UFL?pldnSite=1.
So we re-scraped and identify the review links that shows "Sorry, we couldn't find that page." All the links are in this csv file.

In [36]:
reviewmeta = pd.read_csv('review_dataframe_mega_ALL_New.csv', index_col=0)
wrong_link = pd.read_csv('wrong_link.csv')

In [186]:
reviewmeta[reviewmeta['Amazon_ID'] == 'R141L3QCLC6H6H']

Unnamed: 0,product,trust,Unnamed: 3,review_rating,review_title,reviewer_details,reviewer_link_RM,rvwr_text_Amazon,rvwr_link_Amazon,Amazon_ID,...,Easy_grade_rating,Overlapping_rev_history,Brand_Rep_freq,Brand_rep_rating,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day
15338,B00Q8QOZPG,0.73,0.742984,1,One Star,\n Verified PurchaserReviewer: Jamil Husain\n\...,https://reviewmeta.com/profile/amazon-uk/A37FW...,\n\t\t\t\t2nd one has failed and will be sent ...,https://smile.amazon.co.uk/gp/customer-reviews...,R141L3QCLC6H6H,...,5.0,0,0,0.0,0,0,0,0,0,0


In [37]:
deleted_reviews = pd.merge(reviewmeta,wrong_link,left_on='rvwr_link_Amazon',right_on='wrong link')
deleted_reviews['rvwr_link_Amazon'].nunique()

921

In [38]:
# There are some duplicate rows with the same amazon review link but with different product ASIN. Since they are the same review,
# We decided to remove the duplicates according to the review link.
deleted_reviews = deleted_reviews.drop_duplicates(subset='rvwr_link_Amazon', keep="first")

## Data cleaning

In [39]:
deleted_reviews.rename(columns={"trust": "RM_Score", "Unnamed: 3": "RB_Score"}, inplace = True)
deleted_reviews.drop(['wrong link'], axis=1, inplace = True)
deleted_reviews = deleted_reviews.drop(['RB_Score','Critical_Rev_rating','Take_backs_rating','Easy_grade_rating','Brand_Rep_freq','Brand_rep_rating','product', 'RM_Score', 'review_title', 'reviewer_details', 'rvwr_text_Amazon','rvwr_link_Amazon'], axis=1)
deleted_reviews = deleted_reviews.reset_index(drop = True)


## Get good reviews 
Get good reviews by subtracting the datasets. full - bad reviews = good reviews

In [40]:
good_reviews = pd.merge(reviewmeta, wrong_link, left_on='rvwr_link_Amazon',right_on='wrong link', how = "outer", indicator=True)
good_reviews = good_reviews[good_reviews['_merge'] == 'left_only']
good_reviews = good_reviews.drop_duplicates(subset='rvwr_link_Amazon', keep="first")
good_reviews.drop(['wrong link','_merge'], axis=1, inplace = True)

good_reviews.rename(columns={"trust": "RM_Score", "Unnamed: 3": "RB_Score"}, inplace = True)
good_reviews = good_reviews.drop(['RB_Score','Critical_Rev_rating','Take_backs_rating','Easy_grade_rating','Brand_Rep_freq','Brand_rep_rating','product', 'review_title', 'reviewer_details', 'rvwr_text_Amazon','rvwr_link_Amazon'], axis=1).reset_index(drop = True)


In [30]:
# # We get deleted reviews a score of 1 and other reviews a score of 0
# deleted_reviews['RM_Score'] = 1
# good_reviews['RM_Score'] = 0

## Sampling the good reviews
Since we have 13559 good reviews and 912 bad reviews, the modeling dataset would be extremely unbalanced. So we decide to do a stratified sampling on the good review dataset.

Except for scores ranging from 0.5 and 0.7, we select 450 reviews from each bin to make the sampled dataset not that skewed.

In [138]:
good_reviews

Unnamed: 0,RM_Score,review_rating,reviewer_link_RM,Amazon_ID,Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Average_Rating,Take_backs,Overrep_part,Overrep_wrd_cnt,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day,categories
0,0,2,https://reviewmeta.com/profile/amazon-de/A291K...,RB8O5NGJMI0KN,1,0,0,0,5.0,0,0,0,0,0,0,0,0,0,0,"(0.9, 1.0]"
1,0,5,https://reviewmeta.com/profile/amazon-de/A12RI...,R3PWIWOZ36AHAB,1,0,0,0,4.0,0,0,0,0,0,0,0,0,0,0,"(0.9, 1.0]"
2,0,1,https://reviewmeta.com/profile/amazon-de/AELCA...,R3MS9TWCGVYCIL,1,0,0,0,5.0,0,0,0,0,0,0,0,0,0,0,"(0.9, 1.0]"
3,0,1,https://reviewmeta.com/profile/amazon-de/A44CR...,R1HTSZPEKJTW5A,1,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,"(0.9, 1.0]"
4,0,1,https://reviewmeta.com/profile/amazon-de/AHMUV...,RCXPXSEUFKKQZ,1,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,"(0.9, 1.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13554,0,4,https://reviewmeta.com/profile/amazon/A2IQFJRN...,R1K3LD26UE9GEL,1,0,0,0,4.0,0,0,0,0,0,0,0,0,0,0,"(0.9, 1.0]"
13555,0,5,https://reviewmeta.com/profile/amazon/A2VH68NC...,R7SGZELW92PCN,1,0,0,0,4.3,1,0,1,0,0,0,0,0,0,0,"(0.9, 1.0]"
13556,0,5,https://reviewmeta.com/profile/amazon/AECBKDIS...,R12XO05CXGZHJ5,1,0,0,0,5.0,0,0,0,0,1,0,0,0,0,0,"(0.9, 1.0]"
13557,0,1,https://reviewmeta.com/profile/amazon/AHDJSTJS...,R3CBBHN9RQ0ABA,1,0,0,0,1.0,0,0,0,0,1,0,0,0,0,0,"(0.9, 1.0]"


In [41]:
bins = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
good_reviews['categories'] = pd.cut(good_reviews['RM_Score'], bins)
good_reviews.groupby('categories').count()

Unnamed: 0_level_0,RM_Score,review_rating,reviewer_link_RM,Amazon_ID,Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Average_Rating,Take_backs,Overrep_part,Overrep_wrd_cnt,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
"(0.1, 0.3]",472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472
"(0.3, 0.5]",985,985,985,985,985,985,985,985,985,985,985,985,985,985,985,985,985,985,985
"(0.5, 0.7]",154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154
"(0.7, 0.9]",1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118
"(0.9, 1.0]",7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277


In [44]:
RM_Score = good_reviews['RM_Score']
sample_df1 = good_reviews[(RM_Score <= 0.3) & (RM_Score > 0.1)].sample(n = 450)
sample_df2 = good_reviews[(RM_Score <= 0.5) & (RM_Score > 0.3)].sample(n = 450)
sample_df3 = good_reviews[(RM_Score <= 0.7) & (RM_Score > 0.5)]
sample_df4 = good_reviews[(RM_Score <= 0.9) & (RM_Score > 0.7)].sample(n = 450)
sample_df5 = good_reviews[(RM_Score <= 1.0) & (RM_Score > 0.9)].sample(n = 450)

In [45]:
good_reviews_sample = pd.concat([sample_df1, sample_df2, sample_df3, sample_df4, sample_df5], ignore_index = True).reset_index(drop = True)
good_reviews_sample['RM_Score'] = 0

In [46]:
good_reviews_sample = good_reviews_sample.drop('categories', 1)
good_reviews_sample

Unnamed: 0,RM_Score,review_rating,reviewer_link_RM,Amazon_ID,Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Average_Rating,Take_backs,Overrep_part,Overrep_wrd_cnt,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day
0,0,1,https://reviewmeta.com/profile/amazon-uk/A3JKG...,R3AGJG4I6N2UND,1,0,0,0,1.0,0,1,0,0,1,0,0,0,0,0
1,0,5,https://reviewmeta.com/profile/amazon/AFVMYCAJ...,R2GPL8QSPZX9I5,1,0,0,0,5.0,0,1,1,0,0,0,0,0,0,0
2,0,5,https://reviewmeta.com/profile/amazon-de/A13ZX...,R15VBAX5TXYA1N,0,0,0,0,4.9,1,0,1,1,0,0,1,0,0,0
3,0,5,https://reviewmeta.com/profile/amazon-de/AELOK...,R857Y335SVT8F,1,0,1,0,5.0,0,1,1,0,0,0,0,0,1,0
4,0,4,https://reviewmeta.com/profile/amazon-uk/A22OS...,R1VMG2C0M6SXJT,1,0,1,1,4.2,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1949,0,4,https://reviewmeta.com/profile/amazon-fr/AFOGV...,R2MTHQBG0UJLRP,1,0,0,0,4.0,0,0,0,0,0,0,0,0,0,1
1950,0,4,https://reviewmeta.com/profile/amazon-ca/AIW0E...,R1YBSOKI8ZRCPM,1,0,0,0,4.3,0,0,0,0,0,0,0,0,0,0
1951,0,1,https://reviewmeta.com/profile/amazon-in/AFJRE...,RQULO1ZXI1J9K,0,1,0,0,1.0,0,0,0,0,0,0,0,0,0,0
1952,0,5,https://reviewmeta.com/profile/amazon-de/A3SOL...,R1G74H468ZU8X0,1,0,0,0,4.6,0,0,0,0,0,0,0,0,0,0


In [48]:
# We get deleted reviews a score of 1 and other reviews a score of 0
deleted_reviews['RM_Score'] = 1
good_reviews['RM_Score'] = 0

In [49]:
deleted_reviews.columns

Index(['review_rating', 'reviewer_link_RM', 'Amazon_ID', 'Verified_Purchases',
       'Nvr_verified_reviewer', 'Contains_rep_phrases', 'high_vol_day_rev',
       'Average_Rating', 'Take_backs', 'Overrep_part', 'Overrep_wrd_cnt',
       'Overlapping_rev_history', 'One_hit', 'incentivized', 'Brand_repeater',
       'Brand_Loyalist', 'Brand_Monogamist', 'single_day', 'RM_Score'],
      dtype='object')

In [50]:
deleted_reviews = deleted_reviews[['RM_Score','review_rating','reviewer_link_RM', 'Amazon_ID', 'Verified_Purchases', 'Nvr_verified_reviewer',
       'Contains_rep_phrases', 'high_vol_day_rev', 'Take_backs',
       'Overrep_part', 'Overrep_wrd_cnt','Average_Rating', 'Overlapping_rev_history', 'One_hit',
       'incentivized', 'Brand_repeater', 'Brand_Loyalist', 'Brand_Monogamist',
       'single_day']]
deleted_reviews = deleted_reviews.reset_index(drop = True)

## Combining deleted reviews and good reviews

In [62]:
final_df = pd.concat([deleted_reviews, good_reviews_sample], sort = False).reset_index(drop = True)

In [64]:
# Since all the flags have 1 as not good and 0 as good, we change the column Verified_Purchases to Non_Verified_Purchases.
final_df['Verified_Purchases'] = 1-final_df['Verified_Purchases']
final_df = final_df.rename(columns = {'Verified_Purchases': 'Non_Verified_Purchases'})

In [65]:
final_df

Unnamed: 0,RM_Score,review_rating,reviewer_link_RM,Amazon_ID,Non_Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Take_backs,Overrep_part,Overrep_wrd_cnt,Average_Rating,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day
0,1,5,https://reviewmeta.com/profile/amazon/AFG4VMDI...,RXDGH790RKPUF,1,1,0,0,0,0,0,5.0,0,1,0,0,0,0,0
1,1,1,https://reviewmeta.com/profile/amazon/AHWBOFLE...,RVJE4LSV9ZWLK,1,1,0,0,0,1,0,1.0,0,1,0,0,0,0,0
2,1,2,https://reviewmeta.com/profile/amazon/A12K842R...,RV3XIX9GL0RTH,1,1,1,0,0,1,0,2.0,0,1,0,0,0,0,0
3,1,5,https://reviewmeta.com/profile/amazon/A18LBGL7...,R8P2NMWQ7HZFO,1,0,1,0,1,0,0,4.6,1,0,0,1,0,0,0
4,1,5,https://reviewmeta.com/profile/amazon/A315QJ0Z...,R1OF6OLI5LWG8T,0,0,0,1,1,0,0,4.9,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2870,0,4,https://reviewmeta.com/profile/amazon-fr/AFOGV...,R2MTHQBG0UJLRP,0,0,0,0,0,0,0,4.0,0,0,0,0,0,0,1
2871,0,4,https://reviewmeta.com/profile/amazon-ca/AIW0E...,R1YBSOKI8ZRCPM,0,0,0,0,0,0,0,4.3,0,0,0,0,0,0,0
2872,0,1,https://reviewmeta.com/profile/amazon-in/AFJRE...,RQULO1ZXI1J9K,1,1,0,0,0,0,0,1.0,0,0,0,0,0,0,0
2873,0,5,https://reviewmeta.com/profile/amazon-de/A3SOL...,R1G74H468ZU8X0,0,0,0,0,0,0,0,4.6,0,0,0,0,0,0,0


## Scrape and get reviewer profile link

In [210]:
user_agent = {'User-agent': 'Mozilla/5.0'}
# for i in tqdm(range(1975,2875)):
#     url = final_df.loc[i, 'reviewer_link_RM']
#     response=requests.get(url,headers = user_agent)
#     soup = bs4.BeautifulSoup(response.text)
#     profile_url = soup.find_all('div', class_ = 'col-md-8')[2].find('a').get('href')
#     final_df.loc[i,'profile_url'] = profile_url
profile_url = final_df['profile_url']
profile_url.to_csv('profile_url.csv')


  if __name__ == '__main__':


In [209]:
final_df['profile_url']

0       https://smile.amazon.com/gp/profile/amzn1.acco...
1       https://smile.amazon.com/gp/profile/amzn1.acco...
2       https://smile.amazon.com/gp/profile/amzn1.acco...
3       https://smile.amazon.com/gp/profile/amzn1.acco...
4       https://smile.amazon.com/gp/profile/amzn1.acco...
                              ...                        
2870    https://www.amazon.fr/gp/profile/amzn1.account...
2871    https://www.amazon.ca/gp/profile/amzn1.account...
2872    https://www.amazon.in/gp/profile/amzn1.account...
2873    https://smile.amazon.de/gp/profile/amzn1.accou...
2874    https://www.amazon.fr/gp/profile/amzn1.account...
Name: profile_url, Length: 2875, dtype: object

In [78]:
profile_url = pd.read_csv('profile_url.csv', header = None, index_col = 0)
final_df['profile_url'] = profile_url

In [211]:
final_df[['reviewer_link_RM','profile_url']].to_csv('check_url.csv')

## Web-scrape reviewer profile

In [159]:
final_df.loc[2,'profile_url']

'https://smile.amazon.com/gp/profile/amzn1.account.AGGBYXLIZ6IRV63SAPNWMO4D4VBQ'

In [212]:
%time
from selenium.webdriver.common.keys import Keys


d = webdriver.Chrome(executable_path=os.path.abspath('chromedriver'))   
#d = webdriver.Chrome(executable_path=os.path.abspath('chromedriver')) 
for i in tqdm(range(2475, 2875)):
    time.sleep(3) #Hold 1 seconds before the next scrape.
    num=str(i)
    newurl = final_df.loc[i,'profile_url']
    Amazon_ID = final_df.loc[i,'Amazon_ID']
 
    
    body = d.find_element_by_tag_name("body")
    body.send_keys(Keys.CONTROL + 't')
    
    d.get(newurl)
    d.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 'w') 
    #d.execute_script('document.documentElement.scollTop=10000')# To make the webdriver scroll down to fetch as many as the reviews.
    d.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    lenOfPage = d.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    match=False
    counter=0
    while(match==False):
            counter=counter+1
            if(counter>=10):
                break
            lastCount = lenOfPage
            time.sleep(3)
            lenOfPage = d.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
            if lastCount==lenOfPage:
                match=True
    
    
    #time.sleep(2) # sleep again the let the page load
    path = os.getcwd() +"/profile_RM/"
    name= Amazon_ID +'.txt' #The new file name. 
    with open(path + name, 'w') as file:
        file.write(d.page_source)
        file.close()
  

    #Close the google webpage that webdriver open for you, otherwise it will be crazy.
d.close()

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 1.05 ms


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))




In [213]:
soup = {}
for i in tqdm(range(2475, 2875)):
    try:
        Amazon_ID = final_df.loc[i,'Amazon_ID']
        slash = '/'
        name = Amazon_ID +'.txt'
        path = os.getcwd() + '/profile_RM/'
        f = open("{}{}{}".format(path,slash,name),"r", encoding="utf-8").read()

        #text=open(name,'r').read() #Read the text from the file.
        soup[i]=bs4.BeautifulSoup(f) #Create a beautifulsoup object using the txt we got.
    except:
        print(i)

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))




In [214]:
# extracting more features
for i in tqdm(range(2475, 2875)):
    try:
        tag0 = soup[i].find_all('div', class_='dashboard-desktop-stat-value')[0] 
        final_df.loc[i,'helpful_votes'] = tag0.find('span', class_='a-size-large a-color-base').get_text() 

        for tag in soup[i].find_all('div', class_='a-row a-spacing-none name-container'):    
            final_df.loc[i,'name'] = tag.find('span', class_='a-size-extra-large').get_text() 

        tag1 = soup[i].find_all('div', class_='dashboard-desktop-stat-value')[1]    
        final_df.loc[i,'num_of_reviews'] = int(tag1.find('span', class_='a-size-large a-color-base').get_text())
        
        final_df.loc[i,'num_of_reviews_count'] = len(soup[i].find_all('div', class_='a-section profile-at-content'))
        
        
        # verified
        verified = []
        if len(soup[i].find_all('div', class_='a-row a-spacing-mini')) == 0:
            final_df.loc[i,'num_of_verified'] = 0 
        else:
            for tag in soup[i].find_all('div', class_='a-row a-spacing-mini'): 
                try:   
                    verified.append(tag.find('span', class_='a-size-small a-color-state profile-at-review-badge a-text-bold').get_text())
                    final_df.loc[i,'num_of_verified'] = len(verified)
                except:
                    continue 


        final_df.loc[i,'num_of_unverified'] = final_df.loc[i,'num_of_reviews_count'] - final_df.loc[i,'num_of_verified']

        
        date_mode_number = []
        # mode_number means if one person has many purchases on one day, how many purchases? I found out the date that appears most times.
        if len(soup[i].find_all('div', class_='a-profile-content')) == 0:
            final_df.loc[i,'mode_number'] = 0
        else:
            for tag in soup[i].find_all('div', class_='a-profile-content'):
                date_mode_number.append(tag.find('span', class_='a-profile-descriptor').get_text())
                final_df.loc[i,'mode_number'] = len([j for j, review in enumerate(date_mode_number) if review == max(set(date_mode_number), key=date_mode_number.count)])
        if final_df.loc[i,'mode_number'] > 20:
            final_df.loc[i,'samedate_20'] = 1
        else:
            final_df.loc[i,'samedate_20'] = 0

        # reviewer anonymous
        if ('Customer' in final_df.loc[i,'name']) | ('customer' in final_df.loc[i,'name']):
            final_df.loc[i,'anonymous'] = 1
        else:
            final_df.loc[i,'anonymous'] = 0
        
        # only 5 star reviews
        star5 = []
        if soup[i].find_all('div',class_='a-section a-spacing-mini') == 0:
            final_df.loc[i,'only_5star'] = 0
        else:
            for tag in soup[i].find_all('div',class_='a-section a-spacing-mini'):
                star5.append(tag.find('span',class_='a-icon-alt').text)
            if (len(set(star5)) == 1) & ('5 out of five stars' in set(star5)):
                final_df.loc[i,'only_5star'] = 1
            else:
                final_df.loc[i,'only_5star'] = 0
    except:
        print(i)
  

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))

2530
2604
2649
2694
2700
2730
2745
2765
2778
2791
2808
2841
2851
2868



In [228]:
pd.concat([final_df[0:1875], final_df[2475:2875]],ignore_index = True).to_csv('scraped_Iris.csv')

## Read the scraped csv

In [451]:
scraped_Iris = pd.read_csv('scraped_Iris.csv',index_col = 0)
scraped_KK = pd.read_csv('scraped_KK.csv', index_col = 0)
scraped_KK = scraped_KK[1875:2475]
model_data = pd.concat([scraped_Iris, scraped_KK], ignore_index = True)
model_data.to_csv('model_data.csv')

In [453]:
model_data.columns

Index(['RM_Score', 'review_rating', 'reviewer_link_RM', 'Amazon_ID',
       'Non_Verified_Purchases', 'Nvr_verified_reviewer',
       'Contains_rep_phrases', 'high_vol_day_rev', 'Take_backs',
       'Overrep_part', 'Overrep_wrd_cnt', 'Average_Rating',
       'Overlapping_rev_history', 'One_hit', 'incentivized', 'Brand_repeater',
       'Brand_Loyalist', 'Brand_Monogamist', 'single_day', 'profile_url',
       'helpful_votes', 'name', 'num_of_reviews', 'num_of_reviews_count',
       'num_of_verified', 'num_of_unverified', 'mode_number', 'samedate_20',
       'anonymous', 'only_5star'],
      dtype='object')

In [454]:
# data cleaning
for i in range(len(model_data)):
    if (model_data.loc[i,'num_of_reviews_count'] == 0) | math.isnan(model_data.loc[i,'num_of_reviews_count']) == True:
        model_data.loc[i,'0_review'] = 1
    else:
        model_data.loc[i,'0_review'] = 0

In [455]:
model_data['num_of_unverified'] = model_data['num_of_unverified'].fillna(model_data['num_of_reviews_count'])

In [456]:
model_data = model_data.drop(['reviewer_link_RM','Amazon_ID','profile_url','name','num_of_reviews', 'num_of_reviews_count','num_of_verified','helpful_votes'],1)

In [457]:
# model_data['num_of_reviews'] = model_data['num_of_reviews'].apply(lambda x: int(x))
#model_data['helpful_votes'] = 
#model_data['helpful_votes'] = model_data['helpful_votes'].apply(lambda x: float(x.replace(',','').replace('\xa0','')))


In [462]:
model_data = model_data.fillna(0)

In [463]:
model_data

Unnamed: 0,RM_Score,review_rating,Non_Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Take_backs,Overrep_part,Overrep_wrd_cnt,Average_Rating,...,Brand_Loyalist,Brand_Monogamist,single_day,num_of_unverified,mode_number,samedate_20,anonymous,only_5star,0_review,helpful_votes
0,1,5,1,1,0,0,0,0,0,5.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,1,1,1,1,0,0,0,1,0,1.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0
2,1,2,1,1,1,0,0,1,0,2.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,1,5,1,0,1,0,1,0,0,4.6,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,1,5,0,0,0,1,1,0,0,4.9,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2870,0,4,0,0,0,0,0,0,0,4.5,...,0,0,0,1.0,3.0,0.0,0.0,0.0,0.0,0
2871,0,1,0,0,0,0,0,0,0,1.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0
2872,0,5,0,0,0,0,1,0,0,4.6,...,0,0,0,0.0,6.0,0.0,0.0,0.0,0.0,0
2873,0,5,0,0,0,0,0,0,1,4.4,...,0,0,0,4.0,4.0,0.0,0.0,0.0,0.0,0


## Random Forest

In [537]:
model_data['RM_Score'].value_counts()

0    1954
1     921
Name: RM_Score, dtype: int64

In [580]:
x = model_data.loc[:,'review_rating':] # roc_auc_score: 87.8%
#x = model_data.loc[:,'review_rating':'single_day'] # roc_auc_score: 86%
y = model_data['RM_Score']

In [586]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 1, train_size = 0.7)

In [587]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((2012, 23), (863, 23), (2012,), (863,))

In [588]:
y_test.value_counts()

0    585
1    278
Name: RM_Score, dtype: int64

In [589]:
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
test_pred = clf.predict(x_test)
train_pred = clf.predict(x_train)
print ('RF result: %.3f/%.3f' % (roc_auc_score(y_train, train_pred), roc_auc_score(y_test, test_pred)))

RF result: 0.971/0.876




In [590]:
print ("=== Confusion Matrix ===")
print (confusion_matrix(y_test, test_pred))
print ('\n')
print ("=== Classification Report ===")
print (classification_report(y_test, test_pred))
print ('\n')

=== Confusion Matrix ===
[[545  40]
 [ 50 228]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.92      0.93      0.92       585
           1       0.85      0.82      0.84       278

    accuracy                           0.90       863
   macro avg       0.88      0.88      0.88       863
weighted avg       0.89      0.90      0.90       863





In [591]:
feat_labels = x_train.columns
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(x_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))


 1) Average_Rating                 0.206966
 2) high_vol_day_rev               0.112546
 3) mode_number                    0.102291
 4) review_rating                  0.093383
 5) Overlapping_rev_history        0.071357
 6) Take_backs                     0.053711
 7) num_of_unverified              0.052817
 8) Contains_rep_phrases           0.041648
 9) 0_review                       0.040786
10) Overrep_part                   0.033927
11) Non_Verified_Purchases         0.033034
12) Overrep_wrd_cnt                0.031822
13) Nvr_verified_reviewer          0.031084
14) One_hit                        0.020928
15) only_5star                     0.019363
16) Brand_repeater                 0.014085
17) single_day                     0.013472
18) anonymous                      0.011541
19) Brand_Monogamist               0.005472
20) samedate_20                    0.004298
21) Brand_Loyalist                 0.003269
22) incentivized                   0.001617
23) helpful_votes               

In [239]:
x_train.columns

Index(['review_rating', 'Verified_Purchases', 'Nvr_verified_reviewer',
       'Contains_rep_phrases', 'high_vol_day_rev', 'Take_backs',
       'Overrep_part', 'Overrep_wrd_cnt', 'Overlapping_rev_history', 'One_hit',
       'incentivized', 'Brand_repeater', 'Brand_Loyalist', 'Brand_Monogamist',
       'single_day'],
      dtype='object')

In [243]:
x_train.describe()

Unnamed: 0,review_rating,Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Take_backs,Overrep_part,Overrep_wrd_cnt,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day
count,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0
mean,4.249503,0.882207,0.073559,0.259443,0.195328,0.334493,0.418489,0.23161,0.32008,0.179423,0.00497,0.064612,0.005964,0.006461,0.03827
std,1.329157,0.322443,0.261116,0.438438,0.396552,0.47193,0.493434,0.421966,0.466623,0.383802,0.070342,0.245902,0.077017,0.080142,0.191896
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Logistic Regression

In [221]:
x

Unnamed: 0,review_rating,Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Take_backs,Overrep_part,Overrep_wrd_cnt,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day
0,5,0,1,0,0,0,0,0,0,1,0,0,0,0,0
1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0
2,2,0,1,1,0,0,1,0,0,1,0,0,0,0,0
3,5,0,0,1,0,1,0,0,1,0,0,1,0,0,0
4,5,1,0,0,1,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2870,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2871,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2872,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0
2873,5,1,0,0,0,1,0,0,0,0,0,0,0,0,0
