# Documentation
1. In the second and third section we are doing data preparation including web-scraping. If there's no new data coming in, just use the modeling related code starting from section 4 'Use pre-prepared dataset to do the modeling'.

In [1]:
import pandas as pd
import numpy as np
import requests
from selenium import webdriver
import time
import bs4
import re
import os 
import math
import datetime
from tqdm import tqdm_notebook as tqdm
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from numpy import loadtxt
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tnrange

# Preparing data for this new approach
Don't need to run unless there's new data coming in.
## import datasets
1. review_dataframe_mega_ALL_New.csv : the web-scraped dataset of Reviewmeta
2. wrong_link.csv: After about two months, we checked the reviews in the web-scraped RM dataset and found out that actually a certain amount of them have been already deleted.
check this link: https://www.amazon.com/gp/customer-reviews/R1KIX5COX51UFL?pldnSite=1.
So we re-scraped and identify the review links that shows "Sorry, we couldn't find that page." All the links are in this csv file.

In [2]:
reviewmeta = pd.read_csv('review_dataframe_mega_ALL_New.csv', index_col=0)
wrong_link = pd.read_csv('wrong_link.csv')

In [3]:
reviewmeta.head()

Unnamed: 0,product,trust,Unnamed: 3,review_rating,review_title,reviewer_details,reviewer_link_RM,rvwr_text_Amazon,rvwr_link_Amazon,Amazon_ID,...,Easy_grade_rating,Overlapping_rev_history,Brand_Rep_freq,Brand_rep_rating,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day
0,B00SMMDNCA,1.0,0.970929,2,Nicht gut verpackt!,\n Verified PurchaserReviewer: Ulrike Kohlhaas...,https://reviewmeta.com/profile/amazon-de/A291K...,\n\t\t\t\tAn sich ein tolles Teil. Hatte mich ...,https://smile.amazon.de/gp/customer-reviews/RB...,RB8O5NGJMI0KN,...,5.0,0,0,0.0,0,0,0,0,0,0
1,B00SMMDNCA,1.0,0.966222,5,Würde ich wieder kaufen.,\n Verified PurchaserReviewer: M. Fritzsch\n\n...,https://reviewmeta.com/profile/amazon-de/A12RI...,\n\t\t\t\tIch war wirklich sehr sehr skeptisch...,https://smile.amazon.de/gp/customer-reviews/R3...,R3PWIWOZ36AHAB,...,0.0,0,0,0.0,0,0,0,0,0,0
2,B00SMMDNCA,1.0,0.970929,1,Fehlkauf,\n Verified PurchaserReviewer: Barbara\n\n Eas...,https://reviewmeta.com/profile/amazon-de/AELCA...,\n\t\t\t\tLeider hat es unser yorkshire terrie...,https://smile.amazon.de/gp/customer-reviews/R3...,R3MS9TWCGVYCIL,...,5.0,0,0,0.0,0,0,0,0,0,0
3,B00SMMDNCA,1.0,0.947291,1,Plastikschrott in grottenschlechter Qualität,\n Verified PurchaserReviewer: Martin Zimmerma...,https://reviewmeta.com/profile/amazon-de/A44CR...,\n\t\t\t\tVöllig ausgefranster Kunstrasen. Da ...,https://smile.amazon.de/gp/customer-reviews/R1...,R1HTSZPEKJTW5A,...,0.0,0,0,0.0,0,0,0,0,0,0
4,B00SMMDNCA,1.0,0.947291,1,Keine Funktion,\n Verified PurchaserReviewer: Ahmet Sahin\n\n...,https://reviewmeta.com/profile/amazon-de/AHMUV...,\n\t\t\t\tHilft nichts [Go to full review]\n,https://smile.amazon.de/gp/customer-reviews/RC...,RCXPXSEUFKKQZ,...,0.0,0,0,0.0,0,0,0,0,0,0


In [4]:
wrong_link.head()

Unnamed: 0,wrong link
0,https://smile.amazon.com/gp/customer-reviews/R...
1,https://smile.amazon.com/gp/customer-reviews/R...
2,https://smile.amazon.com/gp/customer-reviews/R...
3,https://smile.amazon.com/gp/customer-reviews/R...
4,https://smile.amazon.com/gp/customer-reviews/R...


In [5]:
deleted_reviews = pd.merge(reviewmeta,wrong_link,left_on='rvwr_link_Amazon',right_on='wrong link')
deleted_reviews['rvwr_link_Amazon'].nunique()

921

In [6]:
# There are some duplicate rows with the same amazon review link but with different product ASIN. Since they are the same review,
# We decided to remove the duplicates according to the review link.
deleted_reviews = deleted_reviews.drop_duplicates(subset='rvwr_link_Amazon', keep="first")

## Data cleaning

In [7]:
deleted_reviews.rename(columns={"trust": "RM_Score", "Unnamed: 3": "RB_Score"}, inplace = True)
deleted_reviews.drop(['wrong link'], axis=1, inplace = True)
deleted_reviews = deleted_reviews.drop(['RB_Score','Critical_Rev_rating','Take_backs_rating','Easy_grade_rating','Brand_Rep_freq','Brand_rep_rating','product', 'RM_Score', 'review_title', 'reviewer_details', 'rvwr_text_Amazon','rvwr_link_Amazon'], axis=1)
deleted_reviews = deleted_reviews.reset_index(drop = True)


In [8]:
# Since all the flags have 1 as not good and 0 as good, we change the column Verified_Purchases to Non_Verified_Purchases.
def modify_column_veri_purchase(df):
    df['Verified_Purchases'] = 1-df['Verified_Purchases']
    df.rename(columns = {'Verified_Purchases': 'Non_Verified_Purchases'}, inplace = True)

In [9]:
modify_column_veri_purchase(deleted_reviews)

## Get good reviews 
Get good reviews by subtracting the datasets. full - bad reviews = good reviews

In [10]:
good_reviews = pd.merge(reviewmeta, wrong_link, left_on='rvwr_link_Amazon',right_on='wrong link', how = "outer", indicator=True)
good_reviews = good_reviews[good_reviews['_merge'] == 'left_only']
good_reviews = good_reviews.drop_duplicates(subset='rvwr_link_Amazon', keep="first")
good_reviews.drop(['wrong link','_merge'], axis=1, inplace = True)

good_reviews.rename(columns={"trust": "RM_Score", "Unnamed: 3": "RB_Score"}, inplace = True)
good_reviews = good_reviews.drop(['RB_Score','Critical_Rev_rating','Take_backs_rating','Easy_grade_rating','Brand_Rep_freq','Brand_rep_rating','product', 'review_title', 'reviewer_details', 'rvwr_text_Amazon','rvwr_link_Amazon'], axis=1).reset_index(drop = True)


In [11]:
# We get deleted reviews a score of 1 and other reviews a score of 0
# deleted_reviews['RM_Score'] = 1
# good_reviews['RM_Score'] = 0

## Sampling the good reviews
Since we have 13559 good reviews and 912 bad reviews, the modeling dataset would be extremely unbalanced. So we decide to do a stratified sampling on the good review dataset.

Except for scores ranging from 0.5 and 0.7, we select 450 reviews from each bin to make the sampled dataset not that skewed.

Note that sample function will sample out different sets of reviews every time. If one doesn't want random sampling, remember to set a seed.

In [12]:
bins = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
good_reviews['categories'] = pd.cut(good_reviews['RM_Score'], bins)
good_reviews.groupby('categories').count()

Unnamed: 0_level_0,RM_Score,review_rating,reviewer_link_RM,Amazon_ID,Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Average_Rating,Take_backs,Overrep_part,Overrep_wrd_cnt,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
"(0.1, 0.3]",472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472
"(0.3, 0.5]",985,985,985,985,985,985,985,985,985,985,985,985,985,985,985,985,985,985,985
"(0.5, 0.7]",154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154
"(0.7, 0.9]",1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118
"(0.9, 1.0]",7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277,7277


In [13]:
RM_Score = good_reviews['RM_Score']
sample_df1 = good_reviews[(RM_Score <= 0.3) & (RM_Score > 0.1)].sample(n = 450)
sample_df2 = good_reviews[(RM_Score <= 0.5) & (RM_Score > 0.3)].sample(n = 450)
sample_df3 = good_reviews[(RM_Score <= 0.7) & (RM_Score > 0.5)]
sample_df4 = good_reviews[(RM_Score <= 0.9) & (RM_Score > 0.7)].sample(n = 450)
sample_df5 = good_reviews[(RM_Score <= 1.0) & (RM_Score > 0.9)].sample(n = 450)

In [14]:
good_reviews_sample = pd.concat([sample_df1, sample_df2, sample_df3, sample_df4, sample_df5], ignore_index = True).reset_index(drop = True)


In [15]:
good_reviews_sample = good_reviews_sample.drop('categories', 1)
modify_column_veri_purchase(good_reviews_sample)

In [16]:
# We get deleted reviews a score of 1 and other reviews a score of 0
deleted_reviews['RM_Score'] = 1
good_reviews_sample['RM_Score'] = 0

In [17]:
deleted_reviews.columns

Index(['review_rating', 'reviewer_link_RM', 'Amazon_ID',
       'Non_Verified_Purchases', 'Nvr_verified_reviewer',
       'Contains_rep_phrases', 'high_vol_day_rev', 'Average_Rating',
       'Take_backs', 'Overrep_part', 'Overrep_wrd_cnt',
       'Overlapping_rev_history', 'One_hit', 'incentivized', 'Brand_repeater',
       'Brand_Loyalist', 'Brand_Monogamist', 'single_day', 'RM_Score'],
      dtype='object')

In [18]:
# make the order of columns the same for two datasets.
deleted_reviews = deleted_reviews[['RM_Score','review_rating','reviewer_link_RM', 'Amazon_ID', 'Non_Verified_Purchases', 'Nvr_verified_reviewer',
       'Contains_rep_phrases', 'high_vol_day_rev', 'Take_backs',
       'Overrep_part', 'Overrep_wrd_cnt','Average_Rating', 'Overlapping_rev_history', 'One_hit',
       'incentivized', 'Brand_repeater', 'Brand_Loyalist', 'Brand_Monogamist',
       'single_day']]
deleted_reviews = deleted_reviews.reset_index(drop = True)

## Combining deleted reviews and good reviews

In [19]:
final_df = pd.concat([deleted_reviews, good_reviews_sample], sort = False).reset_index(drop = True)

In [20]:
final_df

Unnamed: 0,RM_Score,review_rating,reviewer_link_RM,Amazon_ID,Non_Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Take_backs,Overrep_part,Overrep_wrd_cnt,Average_Rating,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day
0,1,5,https://reviewmeta.com/profile/amazon/AFG4VMDI...,RXDGH790RKPUF,1,1,0,0,0,0,0,5.0,0,1,0,0,0,0,0
1,1,1,https://reviewmeta.com/profile/amazon/AHWBOFLE...,RVJE4LSV9ZWLK,1,1,0,0,0,1,0,1.0,0,1,0,0,0,0,0
2,1,2,https://reviewmeta.com/profile/amazon/A12K842R...,RV3XIX9GL0RTH,1,1,1,0,0,1,0,2.0,0,1,0,0,0,0,0
3,1,5,https://reviewmeta.com/profile/amazon/A18LBGL7...,R8P2NMWQ7HZFO,1,0,1,0,1,0,0,4.6,1,0,0,1,0,0,0
4,1,5,https://reviewmeta.com/profile/amazon/A315QJ0Z...,R1OF6OLI5LWG8T,0,0,0,1,1,0,0,4.9,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2870,0,5,https://reviewmeta.com/profile/amazon-uk/AF7CM...,R2JA7ZQJ5GAAL7,0,0,0,0,0,0,0,5.0,0,0,0,0,0,0,0
2871,0,5,https://reviewmeta.com/profile/amazon-ca/AFFS3...,R3ILCM2P4VBAIX,0,0,0,0,0,0,0,5.0,0,1,0,0,0,0,0
2872,0,5,https://reviewmeta.com/profile/amazon-ca/AHVO5...,R2FJK4UAX0DKL0,0,0,0,0,0,0,0,5.0,0,0,0,0,0,0,0
2873,0,5,https://reviewmeta.com/profile/amazon-ca/A1NG4...,RWSHGX9FG78EH,0,0,0,0,0,0,0,4.3,0,0,0,0,0,0,0


# Web-scraping reviewer profiles
1. We are doing this because we want to generate more features to help predict the score (0/1)
2. Don't run this whole section unless there's new data coming in and you need profile pages.

## Scrape and get reviewer profile link

In [21]:
user_agent = {'User-agent': 'Mozilla/5.0'}
for i in tqdm(range(1975,2875)):
    url = final_df.loc[i, 'reviewer_link_RM']
    response=requests.get(url,headers = user_agent)
    soup = bs4.BeautifulSoup(response.text)
    profile_url = soup.find_all('div', class_ = 'col-md-8')[2].find('a').get('href')
    final_df.loc[i,'profile_url'] = profile_url
profile_url = final_df['profile_url']
profile_url.to_csv('profile_url.csv')


In [21]:
profile_url = pd.read_csv('profile_url.csv', header = None, index_col = 0)
final_df['profile_url'] = profile_url

## Web-scrape reviewer profile
1. We are using webdriver because we want the page to automatically scroll down and scrape all the reviews. Otherwise, it will only scrape around 10 reviews.

In [159]:
final_df.loc[2,'profile_url']

'https://smile.amazon.com/gp/profile/amzn1.account.AGGBYXLIZ6IRV63SAPNWMO4D4VBQ'

In [212]:
%time
from selenium.webdriver.common.keys import Keys


d = webdriver.Chrome(executable_path=os.path.abspath('chromedriver'))   
#d = webdriver.Chrome(executable_path=os.path.abspath('chromedriver')) 
for i in tqdm(range(2475, 2875)):
    time.sleep(3) #Hold 1 seconds before the next scrape.
    num=str(i)
    newurl = final_df.loc[i,'profile_url']
    Amazon_ID = final_df.loc[i,'Amazon_ID']
 
    
    body = d.find_element_by_tag_name("body")
    body.send_keys(Keys.CONTROL + 't')
    
    d.get(newurl)
    d.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 'w') 
    d.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    lenOfPage = d.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    match=False
    counter=0
    while(match==False):
            counter=counter+1
            if(counter>=10):
                break
            lastCount = lenOfPage
            time.sleep(3)
            lenOfPage = d.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
            if lastCount==lenOfPage:
                match=True
    
    
    #time.sleep(2) # sleep again the let the page load
    path = os.getcwd() +"/profile_RM/"
    name= Amazon_ID +'.txt' #The new file name. 
    with open(path + name, 'w') as file:
        file.write(d.page_source)
        file.close()
  

    #Close the google webpage that webdriver open for you, otherwise it will be crazy.
d.close()

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 1.05 ms


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))




In [213]:
# Save all the profile txts in a dictionary.
soup = {}
for i in tqdm(range(2475, 2875)):
    try:
        Amazon_ID = final_df.loc[i,'Amazon_ID']
        slash = '/'
        name = Amazon_ID +'.txt'
        path = os.getcwd() + '/profile_RM/'
        f = open("{}{}{}".format(path,slash,name),"r", encoding="utf-8").read()
        soup[i]=bs4.BeautifulSoup(f) #Create a beautifulsoup object using the txt we got.
    except:
        print(i)

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))




In [676]:
# extracting more features
for i in tqdm(range(0, 2875)):
    try:
        tag0 = soup[i].find_all('div', class_='dashboard-desktop-stat-value')[0] 
        final_df.loc[i,'helpful_votes'] = tag0.find('span', class_='a-size-large a-color-base').get_text() 

        for tag in soup[i].find_all('div', class_='a-row a-spacing-none name-container'):    
            final_df.loc[i,'name'] = tag.find('span', class_='a-size-extra-large').get_text() 

        tag1 = soup[i].find_all('div', class_='dashboard-desktop-stat-value')[1]    
        final_df.loc[i,'num_of_reviews'] = int(tag1.find('span', class_='a-size-large a-color-base').get_text())
        
        final_df.loc[i,'num_of_reviews_count'] = len(soup[i].find_all('div', class_='a-section profile-at-content'))
        
        
        # verified
        verified = []
        if len(soup[i].find_all('div', class_='a-row a-spacing-mini')) == 0:
            final_df.loc[i,'num_of_verified'] = 0 
        else:
            for tag in soup[i].find_all('div', class_='a-row a-spacing-mini'): 
                try:   
                    verified.append(tag.find('span', class_='a-size-small a-color-state profile-at-review-badge a-text-bold').get_text())
                    final_df.loc[i,'num_of_verified'] = len(verified)
                except:
                    continue 


        final_df.loc[i,'num_of_unverified'] = final_df.loc[i,'num_of_reviews_count'] - final_df.loc[i,'num_of_verified']

        
        date_mode_number = []
        
        # mode_number means if one person has many purchases on one day, how many purchases? I found out the date that appears most times.
        if len(soup[i].find_all('div', class_='a-profile-content')) == 0:
            final_df.loc[i,'mode_number'] = 0
        else:
            for tag in soup[i].find_all('div', class_='a-profile-content'):
                date_mode_number.append(tag.find('span', class_='a-profile-descriptor').get_text())
                final_df.loc[i,'mode_number'] = len([j for j, review in enumerate(date_mode_number) if review == max(set(date_mode_number), key=date_mode_number.count)])
        if final_df.loc[i,'mode_number'] > 20:
            final_df.loc[i,'samedate_20'] = 1
        else:
            final_df.loc[i,'samedate_20'] = 0

        # reviewer anonymous
        if ('Customer' in final_df.loc[i,'name']) | ('customer' in final_df.loc[i,'name']):
            final_df.loc[i,'anonymous'] = 1
        else:
            final_df.loc[i,'anonymous'] = 0
        
        # only 5 star reviews
        star5 = []
        if soup[i].find_all('div',class_='a-section a-spacing-mini') == 0:
            final_df.loc[i,'only_5star'] = 0
        else:
            for tag in soup[i].find_all('div',class_='a-section a-spacing-mini'):
                star5.append(tag.find('span',class_='a-icon-alt').text)
            if (len(set(star5)) == 1) & ('5 out of five stars' in set(star5)):
                final_df.loc[i,'only_5star'] = 1
            else:
                final_df.loc[i,'only_5star'] = 0
    except:
        print(i)
  

In [228]:
pd.concat([final_df[0:1875], final_df[2475:2875]],ignore_index = True).to_csv('scraped_Iris.csv')

# Use pre-prepared dataset to do the modeling 
1. If there's no new data, you only need to run this section to train the model.
2. There are two csv called 'scraped_Iris.csv' and 'scraped_KK.csv' because we split the scraping task.


## Read the csv with new profile-related features

In [1063]:
scraped_Iris = pd.read_csv('scraped_Iris.csv',index_col = 0)
scraped_KK = pd.read_csv('scraped_KK.csv', index_col = 0)
scraped_KK = scraped_KK[1875:2475]
model_data = pd.concat([scraped_Iris, scraped_KK], ignore_index = True)
model_data.to_csv('model_data.csv')

In [1064]:
model_data.shape

(2875, 30)

In [1065]:
model_data.columns

Index(['RM_Score', 'review_rating', 'reviewer_link_RM', 'Amazon_ID',
       'Non_Verified_Purchases', 'Nvr_verified_reviewer',
       'Contains_rep_phrases', 'high_vol_day_rev', 'Take_backs',
       'Overrep_part', 'Overrep_wrd_cnt', 'Average_Rating',
       'Overlapping_rev_history', 'One_hit', 'incentivized', 'Brand_repeater',
       'Brand_Loyalist', 'Brand_Monogamist', 'single_day', 'profile_url',
       'helpful_votes', 'name', 'num_of_reviews', 'num_of_reviews_count',
       'num_of_verified', 'num_of_unverified', 'mode_number', 'samedate_20',
       'anonymous', 'only_5star'],
      dtype='object')

## adding new columns
### repetitive phrase within the reviewer history
1. relative code: rep_reviewer_D.ipynb
2. Now the csv includes every review as a row, with a column 'repetitive phrase' indicating whether this review contain repetitive phrase compared with other reviews of this reviewer.
3. To combine it with the full dataset, we need to do aggregation. We calculate the perentage of reviews that contain repetitive phrase and assign it as 1 if the percentage is above 0.4. This threshold can be tuned.

In [1066]:
rep_reviewer = pd.read_csv('rep_reviewer_text_Repetitive_phrase_v2.csv', index_col = 0)

In [1067]:
rep_reviewer.head()

Unnamed: 0,Amazon_ID,review_text,ID,repetitive phrase
0,R1AMHAPDS0T3B8,"I don't want to return the whole order, but on...",0,0
1,R1AMHAPDS0T3B8,Beautiful lights!! I love them. They go perfec...,1,0
2,R1AMHAPDS0T3B8,Does not work. Bought. Not working at first use.,2,0
3,R1AMHAPDS0T3B8,Broke after one day. The outside plastic broke...,3,0
4,R1AMHAPDS0T3B8,I sent an email and never heard back. We run a...,4,0


In [1068]:
rep_group = rep_reviewer.groupby('Amazon_ID')['repetitive phrase']
rep_group_perc = pd.DataFrame(rep_group.sum() / rep_group.count()).reset_index()

In [1069]:
rep_group_perc['rep_reviewer_flag'] = np.where(rep_group_perc['repetitive phrase'] > 0.4, 1, 0)

In [1070]:
model_data = model_data.merge(rep_group_perc, left_on = 'Amazon_ID', right_on = 'Amazon_ID', how = 'left')

In [1071]:
model_data = model_data.drop(['repetitive phrase'],1)

In [1072]:
# There are many NAs, because some reviewer don't have any review on their page now. We will impute them as 0 later.
model_data.head()

Unnamed: 0,RM_Score,review_rating,reviewer_link_RM,Amazon_ID,Non_Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Take_backs,Overrep_part,Overrep_wrd_cnt,Average_Rating,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day,profile_url,helpful_votes,name,num_of_reviews,num_of_reviews_count,num_of_verified,num_of_unverified,mode_number,samedate_20,anonymous,only_5star,rep_reviewer_flag
0,1,5,https://reviewmeta.com/profile/amazon/AFG4VMDI...,RXDGH790RKPUF,1,1,0,0,0,0,0,5.0,0,1,0,0,0,0,0,https://smile.amazon.com/gp/profile/amzn1.acco...,0,PSPP Inc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,1,1,https://reviewmeta.com/profile/amazon/AHWBOFLE...,RVJE4LSV9ZWLK,1,1,0,0,0,1,0,1.0,0,1,0,0,0,0,0,https://smile.amazon.com/gp/profile/amzn1.acco...,0,Philip Powell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,1,2,https://reviewmeta.com/profile/amazon/A12K842R...,RV3XIX9GL0RTH,1,1,1,0,0,1,0,2.0,0,1,0,0,0,0,0,https://smile.amazon.com/gp/profile/amzn1.acco...,0,K. Jan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,1,5,https://reviewmeta.com/profile/amazon/A18LBGL7...,R8P2NMWQ7HZFO,1,0,1,0,1,0,0,4.6,1,0,0,1,0,0,0,https://smile.amazon.com/gp/profile/amzn1.acco...,0,Larry,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,1,5,https://reviewmeta.com/profile/amazon/A315QJ0Z...,R1OF6OLI5LWG8T,0,0,0,1,1,0,0,4.9,1,0,0,0,0,0,0,https://smile.amazon.com/gp/profile/amzn1.acco...,0,Pete Ramos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


### adding a new column: 0_review
Some reviewers have had reviews before but now there isn't any review on their profile page.

In [1073]:
for i in range(len(model_data)):
    if (model_data.loc[i,'num_of_reviews_count'] == 0) | math.isnan(model_data.loc[i,'num_of_reviews_count']) == True:
        model_data.loc[i,'0_review'] = 1
    else:
        model_data.loc[i,'0_review'] = 0

## removing reviews with only 1/2 flags
In order to make our model stricter / decrease the false positives, we remove reviews that only have 1 flag or 2 flags and have been deleted by Amazon.

In [1074]:
num_flags = pd.DataFrame(model_data[['Non_Verified_Purchases','Nvr_verified_reviewer', 'Contains_rep_phrases', 'high_vol_day_rev','Take_backs', 'Overrep_part', 'Overrep_wrd_cnt',
       'Overlapping_rev_history', 'One_hit', 'incentivized', 'Brand_repeater',
       'Brand_Loyalist', 'Brand_Monogamist', 'single_day']].apply(lambda x: x.sum(), axis = 1), columns = ['num_flags'])



In [1075]:
model_data_flag = model_data.merge(num_flags, left_index = True, right_index = True)

In [1076]:
model_data_flag.drop(model_data_flag[model_data_flag['RM_Score'] == 1][model_data_flag['num_flags'] < 3].index, inplace=True)

  """Entry point for launching an IPython kernel.


In [1077]:
model_data_flag = model_data_flag.reset_index(drop = True)
model_data = model_data_flag

In [1078]:
model_data.columns

Index(['RM_Score', 'review_rating', 'reviewer_link_RM', 'Amazon_ID',
       'Non_Verified_Purchases', 'Nvr_verified_reviewer',
       'Contains_rep_phrases', 'high_vol_day_rev', 'Take_backs',
       'Overrep_part', 'Overrep_wrd_cnt', 'Average_Rating',
       'Overlapping_rev_history', 'One_hit', 'incentivized', 'Brand_repeater',
       'Brand_Loyalist', 'Brand_Monogamist', 'single_day', 'profile_url',
       'helpful_votes', 'name', 'num_of_reviews', 'num_of_reviews_count',
       'num_of_verified', 'num_of_unverified', 'mode_number', 'samedate_20',
       'anonymous', 'only_5star', 'rep_reviewer_flag', '0_review',
       'num_flags'],
      dtype='object')

## Data cleaning

In [1079]:
# For this column, NA exists because all of this reviewer's reviews are unverified.
# should be fixed in the feature generation chunk.
model_data['num_of_unverified'] = model_data['num_of_unverified'].fillna(model_data['num_of_reviews_count'])

In [1080]:
model_data['rep_reviewer_flag'] = model_data['rep_reviewer_flag'].fillna(0)

In [1081]:
model_data = model_data.drop(['reviewer_link_RM','Amazon_ID','profile_url','name','num_of_reviews', 'num_of_reviews_count','num_of_verified','helpful_votes'],1)

In [1082]:
# model_data['num_of_reviews'] = model_data['num_of_reviews'].apply(lambda x: int(x))
#model_data['helpful_votes'] = 
#model_data['helpful_votes'] = model_data['helpful_votes'].apply(lambda x: float(x.replace(',','').replace('\xa0','')))


In [1083]:
# These NAs are there because there are no reviews on the reviewer page. 0_review is 0 for all these rows.
pd.options.display.max_columns = None
model_data[model_data.isnull().values == True]

Unnamed: 0,RM_Score,review_rating,Non_Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Take_backs,Overrep_part,Overrep_wrd_cnt,Average_Rating,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day,num_of_unverified,mode_number,samedate_20,anonymous,only_5star,rep_reviewer_flag,0_review,num_flags
64,1,5,1,1,1,1,0,1,0,5.0,0,0,0,0,0,0,0,,,,,,0.0,1.0,5
64,1,5,1,1,1,1,0,1,0,5.0,0,0,0,0,0,0,0,,,,,,0.0,1.0,5
64,1,5,1,1,1,1,0,1,0,5.0,0,0,0,0,0,0,0,,,,,,0.0,1.0,5
64,1,5,1,1,1,1,0,1,0,5.0,0,0,0,0,0,0,0,,,,,,0.0,1.0,5
64,1,5,1,1,1,1,0,1,0,5.0,0,0,0,0,0,0,0,,,,,,0.0,1.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2664,0,1,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,,,,,,0.0,1.0,0
2664,0,1,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,,,,,,0.0,1.0,0
2664,0,1,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,,,,,,0.0,1.0,0
2664,0,1,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,,,,,,0.0,1.0,0


In [1084]:
model_data = model_data.fillna(0)

In [1085]:
model_data = model_data.drop('num_flags',1)

In [1086]:
model_data = model_data.drop('review_rating', 1)

In [1087]:
# When you run section 1 and do the modeling, you need to run this chunk.
# final_df = final_df.drop(['reviewer_link_RM','Amazon_ID'],1)
# model_data = final_df

# Modeling
## Random Forest

In [1088]:
model_data.columns

Index(['RM_Score', 'Non_Verified_Purchases', 'Nvr_verified_reviewer',
       'Contains_rep_phrases', 'high_vol_day_rev', 'Take_backs',
       'Overrep_part', 'Overrep_wrd_cnt', 'Average_Rating',
       'Overlapping_rev_history', 'One_hit', 'incentivized', 'Brand_repeater',
       'Brand_Loyalist', 'Brand_Monogamist', 'single_day', 'num_of_unverified',
       'mode_number', 'samedate_20', 'anonymous', 'only_5star',
       'rep_reviewer_flag', '0_review'],
      dtype='object')

In [1114]:
model_data = model_data[['RM_Score', 'Non_Verified_Purchases', 'Nvr_verified_reviewer',
       'Contains_rep_phrases', 'high_vol_day_rev', 'Take_backs', 'Overrep_wrd_cnt', 'Average_Rating',
       'Overlapping_rev_history', 'One_hit', 'incentivized',
        'single_day','0_review']]

In [1115]:
model_data['RM_Score'].value_counts()

0    1954
1     714
Name: RM_Score, dtype: int64

In [1116]:
x = model_data.loc[:,'Non_Verified_Purchases':] # roc_auc_score: 87.8%
#x = model_data.loc[:,'review_rating':'single_day'] # roc_auc_score: 86%
y = model_data['RM_Score']

In [1121]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 1, train_size = 0.7)

In [1122]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1867, 12), (801, 12), (1867,), (801,))

In [1123]:
y_test.value_counts()

0    589
1    212
Name: RM_Score, dtype: int64

In [1124]:
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
test_pred = clf.predict(x_test)
train_pred = clf.predict(x_train)
print ('RF result: %.3f/%.3f' % (roc_auc_score(y_train, train_pred), roc_auc_score(y_test, test_pred)))

RF result: 0.951/0.905




In [1125]:
print ("=== Confusion Matrix ===")
print (confusion_matrix(y_test, test_pred))
print ('\n')
print ("=== Classification Report ===")
print (classification_report(y_test, test_pred))
print ('\n')

=== Confusion Matrix ===
[[563  26]
 [ 31 181]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       589
           1       0.87      0.85      0.86       212

    accuracy                           0.93       801
   macro avg       0.91      0.90      0.91       801
weighted avg       0.93      0.93      0.93       801





In [1126]:
feat_labels = x_train.columns
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(x_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))


 1) Average_Rating                 0.277215
 2) high_vol_day_rev               0.218143
 3) Overlapping_rev_history        0.102867
 4) 0_review                       0.097624
 5) Contains_rep_phrases           0.068200
 6) Non_Verified_Purchases         0.058047
 7) Take_backs                     0.056211
 8) Overrep_wrd_cnt                0.049228
 9) Nvr_verified_reviewer          0.032072
10) single_day                     0.018292
11) One_hit                        0.018066
12) incentivized                   0.004036


### Gridsearch 

In [312]:
clf = RandomForestClassifier(n_jobs=-1)

param_grid = {
    'min_samples_split': [3, 5, 10], 
    'n_estimators' : [100, 300],
    'max_depth': [3, 5, 15, 25],
    'max_features': [3, 5, 10, 20]
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

In [236]:
def grid_search_wrapper(refit_score='recall_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(x_train.values, y_train.values)

    # make the predictions
    y_pred = grid_search.predict(x_test.values)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search

In [281]:
#grid_search_clf = grid_search_wrapper(refit_score='recall_score')

## XG Boost

In [1127]:
xg_model = XGBRegressor(objective = 'binary:logistic')
xg_model.fit(x_train, y_train)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='binary:logistic',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [1128]:
y_pred_xg = xg_model.predict(x_test)
predictions = [round(value) for value in y_pred_xg]

In [1129]:
test_pred = xg_model.predict(x_test)
test_pred = [round(value) for value in test_pred]
train_pred = xg_model.predict(x_train)
train_pred = [round(value) for value in train_pred]
print ('xgboost result: %.3f/%.3f' % (roc_auc_score(y_train, train_pred), roc_auc_score(y_test, test_pred)))

xgboost result: 0.930/0.913


In [1130]:
print ("=== Confusion Matrix ===")
print (confusion_matrix(y_test, test_pred))
print ('\n')
print ("=== Classification Report ===")
print (classification_report(y_test, test_pred))
print ('\n')

=== Confusion Matrix ===
[[564  25]
 [ 28 184]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       589
           1       0.88      0.87      0.87       212

    accuracy                           0.93       801
   macro avg       0.92      0.91      0.91       801
weighted avg       0.93      0.93      0.93       801





### Gridsearch parameter

In [1131]:
params = {"objective":["binary:logistic"],'colsample_bytree': [0.2,0.3,0.4,0.5],'learning_rate': [0.1,0.2,0.3],
                'max_depth': [5,6,7], 'alpha': [10,11,12]}

In [1132]:
best_xgb = GridSearchCV(
    xg_model, param_grid=params, cv=10, verbose=0, n_jobs=-1)

In [1133]:
best_xgb.fit(x_train, y_train)

  if getattr(data, 'base', None) is not None and \


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='binary:logistic', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'alpha': [10, 11, 12],
                         'colsample_bytree': [0.2, 0.3, 0.4, 0.5],
   

In [1134]:
y_pred_grid = best_xgb.predict(x_test)
y_pred_grid = [round(value) for value in y_pred_grid]

In [1135]:
# 93.1

test_pred_grid = best_xgb.predict(x_test)
test_pred_grid = [round(value) for value in test_pred_grid]
train_pred_grid = best_xgb.predict(x_train)
train_pred_grid = [round(value) for value in train_pred_grid]
print ('xgboost result: %.3f/%.3f' % (roc_auc_score(y_train, train_pred_grid), roc_auc_score(y_test, test_pred_grid)))

xgboost result: 0.935/0.913


In [1136]:
print ("=== Confusion Matrix ===")
print (confusion_matrix(y_test, test_pred_grid))
print ('\n')
print ("=== Classification Report ===")
print (classification_report(y_test, test_pred_grid))
print ('\n')

=== Confusion Matrix ===
[[564  25]
 [ 28 184]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       589
           1       0.88      0.87      0.87       212

    accuracy                           0.93       801
   macro avg       0.92      0.91      0.91       801
weighted avg       0.93      0.93      0.93       801





In [1113]:
best_xgb.best_estimator_.get_booster().get_score(importance_type="gain")

{'Average_Rating': 4.636798201983143,
 'Non_Verified_Purchases': 3.6503660924046675,
 'Overrep_wrd_cnt': 1.7129237141402869,
 'One_hit': 1.388065913941471,
 'single_day': 1.4423004249384903,
 'Contains_rep_phrases': 2.3345694951427096,
 '0_review': 2.7861918207078658,
 'rep_reviewer_flag': 0.7117294659300001,
 'Overlapping_rev_history': 3.6115753920935982,
 'high_vol_day_rev': 9.765007309375697,
 'Take_backs': 5.436684159427658,
 'Nvr_verified_reviewer': 2.0688814594300164,
 'incentivized': 0.743902064}

# Verify Amazon's deletion
For this part, you need to re-train the model on only the features given by Reviewmeta
## KK's approach: 
use the above model to predict the 4000 reviews which have a RM_score < 0.1 but are not deleted by Amazon.
Look at the results to check.

In [256]:
reviewmeta_5k = reviewmeta[reviewmeta['trust'] <= 0.1]
reviewmeta_5k.shape

(5236, 30)

In [257]:
deleted_reviews = pd.merge(reviewmeta,wrong_link,left_on='rvwr_link_Amazon',right_on='wrong link')
deleted_reviews['rvwr_link_Amazon'].nunique()

921

In [258]:
reviewmeta_4k_full = reviewmeta_5k[~reviewmeta_5k.rvwr_link_Amazon.isin(deleted_reviews.rvwr_link_Amazon)]

In [259]:
reviewmeta_4k_full = reviewmeta_4k_full.drop_duplicates(subset='rvwr_link_Amazon', keep="first")
reviewmeta_4k_full.rename(columns={"trust": "RM_Score", "Unnamed: 3": "RB_Score"}, inplace = True)

reviewmeta_4k = reviewmeta_4k_full.drop(['RB_Score','Critical_Rev_rating','Take_backs_rating','Easy_grade_rating','Brand_Rep_freq','Brand_rep_rating','product', 'RM_Score', 'review_title', 'reviewer_details', 'rvwr_text_Amazon','rvwr_link_Amazon'], axis=1)
reviewmeta_4k = reviewmeta_4k.drop(['reviewer_link_RM','Amazon_ID'],1)
reviewmeta_4k = reviewmeta_4k.reset_index(drop = True)

In [260]:
reviewmeta_4k

Unnamed: 0,review_rating,Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Average_Rating,Take_backs,Overrep_part,Overrep_wrd_cnt,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day
0,3,1,0,1,0,3.0,0,0,1,0,1,0,0,0,0,0
1,2,0,0,1,0,4.0,1,1,1,0,1,0,0,0,0,0
2,1,1,0,0,0,4.2,0,1,0,0,0,0,0,0,0,0
3,3,1,0,0,0,4.3,0,0,0,1,0,0,0,0,0,1
4,5,1,0,0,0,4.9,1,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3548,3,1,0,0,1,4.4,1,0,0,1,0,0,0,0,0,0
3549,3,0,0,0,0,4.2,1,0,0,0,0,0,1,0,0,0
3550,5,0,0,0,0,4.3,0,0,0,0,0,0,0,0,0,0
3551,4,0,0,0,0,3.3,0,0,0,0,0,0,0,0,0,0


In [261]:
modify_column_veri_purchase(reviewmeta_4k)

In [262]:
reviewmeta_4k.columns

Index(['review_rating', 'Non_Verified_Purchases', 'Nvr_verified_reviewer',
       'Contains_rep_phrases', 'high_vol_day_rev', 'Average_Rating',
       'Take_backs', 'Overrep_part', 'Overrep_wrd_cnt',
       'Overlapping_rev_history', 'One_hit', 'incentivized', 'Brand_repeater',
       'Brand_Loyalist', 'Brand_Monogamist', 'single_day'],
      dtype='object')

In [263]:
reviewmeta_4k = reviewmeta_4k[['review_rating', 'Non_Verified_Purchases', 'Nvr_verified_reviewer',
       'Contains_rep_phrases', 'high_vol_day_rev', 'Take_backs',
       'Overrep_part', 'Overrep_wrd_cnt', 'Average_Rating',
       'Overlapping_rev_history', 'One_hit', 'incentivized', 'Brand_repeater',
       'Brand_Loyalist', 'Brand_Monogamist', 'single_day']]
x_test_4k = reviewmeta_4k

In [1137]:
y_pred_4k = best_xgb.predict(x_test_4k)
y_pred_4k = [round(value) for value in y_pred_4k]

In [492]:
reviewmeta_4k_full = reviewmeta_4k_full.reset_index(drop = True)

In [493]:
x_test_4k_check = reviewmeta_4k_full.merge(pd.DataFrame(y_pred_4k, columns = ['pred']), left_index = True, right_index = True)



In [497]:
x_test_4k_check['pred'].sum() / 3553

0.3861525471432592

In [494]:
x_test_4k_check.to_csv('x_test_4k_check.csv')

### Iris's approach

In [256]:
reviewmeta_5k = reviewmeta[reviewmeta['trust'] <= 0.1]
reviewmeta_5k = reviewmeta_5k.reset_index(drop = True)

In [257]:
for i in range(len(reviewmeta_5k)):
    if reviewmeta_5k.loc[i,'rvwr_link_Amazon'] in wrong_link['wrong link'].values:
        reviewmeta_5k.loc[i,'RM_Score'] = 1
    else:
        reviewmeta_5k.loc[i,'RM_Score'] = 0

In [258]:
reviewmeta_5k = reviewmeta_5k.drop_duplicates(subset='rvwr_link_Amazon', keep="first")
reviewmeta_5k.rename(columns={"Unnamed: 3": "RB_Score"}, inplace = True)
reviewmeta_5k['RM_Score'].value_counts()

0.0    3553
1.0     921
Name: RM_Score, dtype: int64

In [259]:
reviewmeta_5k = reviewmeta_5k.drop(['trust','RB_Score','Critical_Rev_rating','Take_backs_rating','Easy_grade_rating','Brand_Rep_freq','Brand_rep_rating','product', 'review_title', 'reviewer_details', 'rvwr_text_Amazon','rvwr_link_Amazon'], axis=1)
reviewmeta_5k = reviewmeta_5k.drop(['reviewer_link_RM','Amazon_ID'],1)
reviewmeta_5k = reviewmeta_5k.reset_index(drop = True)

In [260]:
reviewmeta_5k

Unnamed: 0,review_rating,Verified_Purchases,Nvr_verified_reviewer,Contains_rep_phrases,high_vol_day_rev,Average_Rating,Take_backs,Overrep_part,Overrep_wrd_cnt,Overlapping_rev_history,One_hit,incentivized,Brand_repeater,Brand_Loyalist,Brand_Monogamist,single_day,RM_Score
0,3,1,0,1,0,3.0,0,0,1,0,1,0,0,0,0,0,0.0
1,2,0,0,1,0,4.0,1,1,1,0,1,0,0,0,0,0,0.0
2,1,1,0,0,0,4.2,0,1,0,0,0,0,0,0,0,0,0.0
3,3,1,0,0,0,4.3,0,0,0,1,0,0,0,0,0,1,0.0
4,5,1,0,0,0,4.9,1,0,1,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4469,3,1,0,0,1,4.4,1,0,0,1,0,0,0,0,0,0,0.0
4470,3,0,0,0,0,4.2,1,0,0,0,0,0,1,0,0,0,0.0
4471,5,0,0,0,0,4.3,0,0,0,0,0,0,0,0,0,0,0.0
4472,4,0,0,0,0,3.3,0,0,0,0,0,0,0,0,0,0,0.0


In [262]:
x = reviewmeta_5k.loc[:,'review_rating':'single_day'] 
y = reviewmeta_5k['RM_Score']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 1, train_size = 0.7)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3131, 16), (1343, 16), (3131,), (1343,))

In [263]:
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
test_pred = clf.predict(x_test)
train_pred = clf.predict(x_train)
print ('RF result: %.3f/%.3f' % (roc_auc_score(y_train, train_pred), roc_auc_score(y_test, test_pred)))

RF result: 0.808/0.693




In [265]:
print ("=== Confusion Matrix ===")
print (confusion_matrix(y_test, test_pred))
print ('\n')
print ("=== Classification Report ===")
print (classification_report(y_test, test_pred))
print ('\n')

=== Confusion Matrix ===
[[989  77]
 [150 127]]


=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.87      0.93      0.90      1066
         1.0       0.62      0.46      0.53       277

    accuracy                           0.83      1343
   macro avg       0.75      0.69      0.71      1343
weighted avg       0.82      0.83      0.82      1343





# Apply model on Reviewbox Data
all the features:

1. Iris:
'Non_Verified_Purchases', 
'Nvr_verified_reviewer',
'Take_backs',
'Average_Rating',
'One_hit', 
'single_day',
'0_review',

2. Daviid:
'Contains_rep_phrases', 
'incentivized',

3. Yvette: 'high_vol_day_rev',  
4. KK: 'Overrep_wrd_cnt', 
5. Jordan: 'Overlapping_rev_history'

Note:

Since it takes a long time to scrape the profiles, our team divided the task within 7 of us. Each time we use the index 1, 2.. as the file name and that's why we need to run the feature generation code for 7 times and then complie the csv. Next time, it's better to name the file using unique id such as reviewid.

In [7]:
# Read the dataset with the profile links
profile_urls = pd.read_csv('RSC reviews with profile ids.csv')

## Basic Features: reviewer profile info (verified, average rating, one-hit, tack-back, never-verified, single day, brand-related)

*Note that in this part we would use the profile pages scrapped to retrieve information*

In [8]:
# Keep columns for later aggregation
profile_urls = profile_urls[['author','source','reviewid','product','profile','verified']]
profile_urls['source_product'] = profile_urls['source'] +' '+ profile_urls['product']
profile_urls = profile_urls.rename(columns={'verified': 'Verified_Purchases'})
profile_urls.Verified_Purchases = profile_urls.Verified_Purchases.astype(int)
profile_urls_useful = profile_urls.dropna()
profile_urls_useful = profile_urls_useful[profile_urls_useful['profile'].str.contains('account')].reset_index(drop = True)

In [12]:
profile_urls_useful

Unnamed: 0,author,source,reviewid,product,profile,Verified_Purchases,source_product
0,Amazon Customer,amazon.ca,R31B5G60GS531M,B078N8NR7G,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B078N8NR7G
1,nathalie,amazon.ca,RRCY4V48RQBXG,B01HO8U5NC,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B01HO8U5NC
2,Conure Mum,amazon.ca,R18076F5C879LP,B01HO8U5NC,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B01HO8U5NC
3,Wayne Smith,amazon.ca,RLA1DFN3DCSFJ,B01HO8U5NC,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B01HO8U5NC
4,Rob Self,amazon.ca,R3F4GS6FDS5ALH,B01HO8U5NC,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B01HO8U5NC
5,Richard Goods,amazon.ca,R1KDTNCKJ3DCV2,B01HO8U5NC,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B01HO8U5NC
6,Jerome Tanguay,amazon.ca,R10TY9YVK98S85,B01HO8U5NC,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B01HO8U5NC
7,Tammy Roode,amazon.ca,R1ZRSF0QANSTRY,B01HO8U5NC,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B01HO8U5NC
8,Lisa,amazon.ca,R3HA5C606GWKN,B01HO8U5NC,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B01HO8U5NC
9,mika jiang,amazon.ca,R2OHNULSZ4ARHS,B01HO8U5NC,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B01HO8U5NC


In [13]:
folder = os.getcwd() + '/profiles/'
if not os.path.exists(folder):
    os.mkdir(folder)

# Later this part can be integrated into one:
name = input("enter your first name initial in caps")
if name == "T":
    profile_urls_useful_T = profile_urls_useful[530:4280]
if name == "N":
    profile_urls_useful_N = profile_urls_useful[4280:8030]
if name == "I":
    profile_urls_useful_I = profile_urls_useful[8030:11780]
if name == "J":
    profile_urls_useful_J = profile_urls_useful[11780:15530]
if name == "Y":
    profile_urls_useful_Y = profile_urls_useful[15530:19280]
if name == "D":
    profile_urls_useful_D = profile_urls_useful[19280:23030]
if name == "K":
    profile_urls_useful_K = profile_urls_useful[23030:len(profile_urls_useful)-1]

enter your first name initial in capsN


In [14]:
name

'N'

In [16]:
# reset index for further use(can be deleted later)
profile_urls_useful_N = profile_urls_useful_N.reset_index(drop = True)
profile_urls_useful_N.head()

Unnamed: 0,author,source,reviewid,product,profile,Verified_Purchases,source_product
0,DawneJ,amazon.ca,R33QE9N6XL7AIV,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B000GEZCJ4
1,Kristy,amazon.ca,R20D0Y8GCSRJ47,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B000GEZCJ4
2,Nsf,amazon.ca,R42B0PDV3LNNM,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B000GEZCJ4
3,HJK,amazon.ca,R1TAXPAVCAIH2Q,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B000GEZCJ4
4,Iris Reid,amazon.ca,R1QODMNSBYI8NF,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B000GEZCJ4


In [11]:
soup_I = soup_K = soup_Y = soup_J = soup_T = soup_N = soup_D = {}

In [19]:
def save_in_dict(folder_name, df_name, soup):
    for i in tqdm(range(len(df_name))):
        try:
            num = str(i)
            slash = '/'
            name = 'profile'+num+'.txt'
            path = os.getcwd() + '/' + folder_name + '/'
            f = open("{}{}{}".format(path,slash,name),"r", encoding="utf-8").read()

            #text=open(name,'r').read() #Read the text from the file.
            soup[i]=bs4.BeautifulSoup(f) #Create a beautifulsoup object using the txt we got.
        except:
            print(i)

In [20]:
# generating new columns: one reviewer one row
def generate_features(df_name, soup):
    for i in tqdm(range(len(df_name))):
        try:
            tag0 = soup[i].find_all('div', class_='dashboard-desktop-stat-value')[0] 
            df_name.loc[i,'helpful_votes'] = tag0.find('span', class_='a-size-large a-color-base').get_text() 

            for tag in soup[i].find_all('div', class_='a-row a-spacing-none name-container'):    
                df_name.loc[i,'name'] = tag.find('span', class_='a-size-extra-large').get_text() 

            tag1 = soup[i].find_all('div', class_='dashboard-desktop-stat-value')[1]    
            df_name.loc[i,'num_of_reviews'] = int(tag1.find('span', class_='a-size-large a-color-base').get_text())

            df_name.loc[i,'num_of_reviews_count'] = len(soup[i].find_all('div', class_='a-section profile-at-content'))


            # 0_review
            if df_name.loc[i,'num_of_reviews_count'] == 0:
                df_name.loc[i,'0_review'] = 1
            else:
                df_name.loc[i,'0_review'] = 0
            
            # One-Hit Wonder
            if df_name.loc[i,'num_of_reviews'] == 1:
                df_name.loc[i,'One_hit'] = 1
            else:
                df_name.loc[i,'One_hit'] = 0

            # take back
            df_name['take_back'] = df_name.apply(lambda x: x['num_of_reviews'] - x['num_of_reviews_count'], axis=1)
            if df_name.loc[i,'take_back'] > 0:
                df_name.loc[i,'Take_backs'] = 1
            else:
                df_name.loc[i,'Take_backs'] = 0

            # never verified
            verified = []
            for tag in soup[i].find_all('div', class_='a-row a-spacing-mini'): 
                try:   
                    verified.append(tag.find('span', class_='a-size-small a-color-state profile-at-review-badge a-text-bold').get_text())
                except:
                    continue  
            df_name.loc[i,'num_of_verified'] = len(verified)
            df_name.loc[i,'num_of_unverified'] = df_name.loc[i,'num_of_reviews_count'] - df_name.loc[i,'num_of_verified']
            if (df_name.loc[i,'num_of_unverified'] == df_name.loc[i,'num_of_reviews_count']) & (df_name.loc[i,'num_of_unverified'] > 0):
                df_name.loc[i,'Nvr_verified_reviewer'] = 1
            else:
                df_name.loc[i,'Nvr_verified_reviewer'] = 0

            
            # single day
            date_mode_number = []
            for tag in soup[i].find_all('div', class_='a-profile-content'):
                date_mode_number.append(tag.find('span', class_='a-profile-descriptor').get_text())
                if len(set(date_mode_number)) == 1:
                    df_name.loc[i,'single_day'] = 1
                else:
                    df_name.loc[i,'single_day'] = 0
                    
            # avg rating 
            stars = []
            for tag in soup[i].find_all('div',class_='a-section a-spacing-mini'):
                stars.append(int(tag.find('span',class_='a-icon-alt').text[0]) )
            df_name.loc[i,'avg_rating'] = sum(stars)/len(stars)  


        except:
            continue

In [19]:
# Change captical leter accordingly
save_in_dict('profiles',profile_urls_useful_N, soup_N)
generate_features(profile_urls_useful_N, soup_N)

HBox(children=(IntProgress(value=0, max=3750), HTML(value='')))




In [26]:
# Check the features
print(profile_urls_useful_N.loc[0,'profile'])

https://www.amazon.ca/gp/profile/amzn1.account.AHJBVGCBBQYCWRCMMZTRCI2I6ZCQ/ref=cm_cr_arp_d_gw_btm?ie=UTF8


In [28]:
profile_urls_useful_N.head()

Unnamed: 0,author,source,reviewid,product,profile,Verified_Purchases,source_product,helpful_votes,name,num_of_reviews,num_of_reviews_count,0_review,One_hit,take_back,Take_backs,num_of_verified,num_of_unverified,Nvr_verified_reviewer,single_day,avg_rating
0,DawneJ,amazon.ca,R33QE9N6XL7AIV,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B000GEZCJ4,,,,,,,,,,,,,
1,Kristy,amazon.ca,R20D0Y8GCSRJ47,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B000GEZCJ4,41.0,Kristy,57.0,57.0,0.0,0.0,0.0,0.0,50.0,7.0,0.0,0.0,4.210526
2,Nsf,amazon.ca,R42B0PDV3LNNM,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B000GEZCJ4,45.0,Nsf,56.0,56.0,0.0,0.0,0.0,0.0,54.0,2.0,0.0,0.0,4.035714
3,HJK,amazon.ca,R1TAXPAVCAIH2Q,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B000GEZCJ4,17.0,HJK,85.0,85.0,0.0,0.0,0.0,0.0,82.0,3.0,0.0,0.0,3.964706
4,Iris Reid,amazon.ca,R1QODMNSBYI8NF,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,1,amazon.ca B000GEZCJ4,1.0,Iris Roskewich,17.0,13.0,0.0,0.0,4.0,1.0,12.0,1.0,0.0,0.0,3.769231


In [29]:
# Change captical leter accordingly
profile_urls_useful_N.to_csv('profile_urls_useful_N_0506.csv')

### Combine the datasets

In [39]:
profile_urls_useful_I = pd.read_csv('profile_urls_useful_D.csv', index_col = 0)
profile_urls_useful_K = pd.read_csv('profile_urls_useful_K.csv', index_col = 0)
profile_urls_useful_J = pd.read_csv('profile_urls_useful_J.csv', index_col = 0)
profile_urls_useful_T = pd.read_csv('profile_urls_useful_T.csv', index_col = 0)
profile_urls_useful_Y = pd.read_csv('profile_urls_useful_Y.csv', index_col = 0)
profile_urls_useful_D = pd.read_csv('profile_urls_useful_D.csv', index_col = 0)

frame = [profile_urls_useful_N, profile_urls_useful_I, profile_urls_useful_K, 
         profile_urls_useful_J, profile_urls_useful_T, profile_urls_useful_Y,
        profile_urls_useful_D]

In [34]:
final_output_with_all = pd.concat([frame], ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [35]:
final_output_with_all.head()

Unnamed: 0,0_review,Nvr_verified_reviewer,One_hit,Take_backs,Verified_Purchases,author,avg_rating,helpful_votes,name,num_of_reviews,num_of_reviews_count,num_of_unverified,num_of_verified,product,profile,reviewid,single_day,source,source_product,take_back
0,,,,,1,DawneJ,,,,,,,,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,R33QE9N6XL7AIV,,amazon.ca,amazon.ca B000GEZCJ4,
1,0.0,0.0,0.0,0.0,1,Kristy,4.210526,41.0,Kristy,57.0,57.0,7.0,50.0,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,R20D0Y8GCSRJ47,0.0,amazon.ca,amazon.ca B000GEZCJ4,0.0
2,0.0,0.0,0.0,0.0,1,Nsf,4.035714,45.0,Nsf,56.0,56.0,2.0,54.0,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,R42B0PDV3LNNM,0.0,amazon.ca,amazon.ca B000GEZCJ4,0.0
3,0.0,0.0,0.0,0.0,1,HJK,3.964706,17.0,HJK,85.0,85.0,3.0,82.0,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,R1TAXPAVCAIH2Q,0.0,amazon.ca,amazon.ca B000GEZCJ4,0.0
4,0.0,0.0,0.0,1.0,1,Iris Reid,3.769231,1.0,Iris Roskewich,17.0,13.0,1.0,12.0,B000GEZCJ4,https://www.amazon.ca/gp/profile/amzn1.account...,R1QODMNSBYI8NF,0.0,amazon.ca,amazon.ca B000GEZCJ4,4.0


In [36]:
final_output_with_all.shape

(26238, 20)

In [37]:
final_output_with_all.to_csv('final_output_with_all.csv', index=False)

## More Features

Here we generated more features for the model, the data we based on is '**RSC reviews with profile ids.csv**'( = profile_urls in the previous session)

### High Volumn Day

In [2]:
reviews = profile_urls.copy()

In [3]:
reviews.head()

Unnamed: 0,source,product,PART NUMBER_custom,SKU_custom,analysis_purpose_custom_custom,flag_custom,special_name_custom,test_field2_custom,test_field3_custom,name,...,statusdt,statustime,helpfulcount,commenttext,commentauthor,officialcomment,totalcomments,commentts,commentdatestring,inputtime
0,amazon.ca,B078N8NR7G,,,,,,,,PetSafe 900 Meter Remote Trainer,...,,,,,,,,,,2018-12-22 06:24
1,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,,2019-04-15 10:32
2,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,,2019-04-04 12:02
3,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,,2019-04-02 20:35
4,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,,2018-07-31 08:00


In [5]:
# Set product as index for later merging tasks
reviews.set_index('product',drop=True,inplace=True)

In [6]:
# create review dictionary
reviews_dict = {}
for i in reviews.index.unique():
    reviews_dict[i] = reviews.loc[i, ['source','date','reviewid', 'text']]

In [7]:
# check the date column
pd.to_datetime(reviews_dict["B01HO8U5NC"]['date'])

product
B01HO8U5NC   2019-04-13 08:00:00
B01HO8U5NC   2019-04-02 08:00:00
B01HO8U5NC   2019-03-31 08:00:00
B01HO8U5NC   2018-07-31 08:00:00
B01HO8U5NC   2018-07-29 08:00:00
B01HO8U5NC   2018-04-12 08:00:00
B01HO8U5NC   2018-01-21 08:00:00
B01HO8U5NC   2018-01-08 08:00:00
B01HO8U5NC   2017-10-21 08:00:00
B01HO8U5NC   2017-10-16 08:00:00
B01HO8U5NC   2017-08-18 08:00:00
B01HO8U5NC   2017-08-14 08:00:00
B01HO8U5NC   2017-05-24 08:00:00
B01HO8U5NC   2017-04-24 08:00:00
B01HO8U5NC   2017-04-09 08:00:00
B01HO8U5NC   2017-03-15 08:00:00
B01HO8U5NC   2017-03-13 08:00:00
B01HO8U5NC   2017-02-17 08:00:00
B01HO8U5NC   2017-02-14 08:00:00
B01HO8U5NC   2017-02-01 08:00:00
B01HO8U5NC   2017-01-17 08:00:00
B01HO8U5NC   2017-01-12 08:00:00
B01HO8U5NC   2017-01-10 08:00:00
B01HO8U5NC   2017-01-07 08:00:00
B01HO8U5NC   2017-01-01 08:00:00
B01HO8U5NC   2016-12-29 08:00:00
B01HO8U5NC   2016-12-28 08:00:00
B01HO8U5NC   2016-12-18 08:00:00
B01HO8U5NC   2016-12-18 08:00:00
B01HO8U5NC   2016-12-16 08:00:00
  

In [8]:
# Create a 'new_date' column for date comparison
for i in reviews_dict:
    if type(reviews_dict[i]) != pd.core.series.Series:
        reviews_dict[i] = reviews_dict[i].sort_values('date').drop_duplicates()
        datetime = pd.to_datetime(reviews_dict[i]['date'])
        reviews_dict[i]['new_date'] = datetime.dt.strftime('%Y-%m-%d')
        reviews_dict[i]['new_date'] = pd.to_datetime(reviews_dict[i]['new_date'])

In [9]:
# Generate the high volumn day feature
for i in reviews_dict:
    if type(reviews_dict[i]) != pd.core.series.Series:
        num_review_per_day = reviews_dict[i][["text", "new_date"]].groupby(by = "new_date", as_index =False).count()
        if len(num_review_per_day) > 1:
            num_avg = num_review_per_day["text"].mean(axis = 0)
            num_std = num_review_per_day["text"].std(axis = 0)
            num_limit = math.ceil(num_avg + num_std)
            high_volumes_day = num_review_per_day[num_review_per_day['text'] > num_limit]['new_date'] 
            reviews_dict[i]['whether_high_volume'] = reviews_dict[i]['new_date'].isin(high_volumes_day)
        else:
            reviews_dict[i]['whether_high_volume'] = True # because all the reviews were left on the same date
    else:
        reviews_dict[i]['whether_high_volume'] = False

In [10]:
reviews_dict

{'B078N8NR7G': source                                                     amazon.ca
 date                                                2018-12-20 08:00
 author                                               Amazon Customer
 reviewid                                              R31B5G60GS531M
 text                   Produit disfonctionnel. J'exige remboursement
 whether_high_volume                                            False
 Name: B078N8NR7G, dtype: object,
 'B01HO8U5NC':                source              date                 author  \
 product                                                          
 B01HO8U5NC     amazon  2016-09-09 08:00   Catherine Mulholland   
 B01HO8U5NC     amazon  2016-09-12 08:00             Dani-jewel   
 B01HO8U5NC     amazon  2016-09-13 08:00               Jennifer   
 B01HO8U5NC     amazon  2016-09-18 08:00          Susan Garrett   
 B01HO8U5NC     amazon  2016-09-20 08:00               Cavalier   
 B01HO8U5NC     amazon  2016-09-23 08:00         

In [None]:
# combine the feature back to the dataframe
reviews_high_volume = pd.DataFrame()
for i in reviews_dict:
    if type(reviews_dict[i]) != pd.core.series.Series:
        df_bin = reviews_dict[i]
    else:
        df_bin = reviews_dict[i].to_frame().T
    reviews_high_volume = pd.concat([reviews_high_volume, df_bin], ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [None]:
reviews_high_volume['whether_high_volume'] =reviews_high_volume['whether_high_volume'].apply(lambda x: 1 if x==True else 0)

In [None]:
# We can save the table for further use
reviews_high_volume.to_csv('reviews_high_volume.csv')

In [None]:
# To compile we just keep the feature we need:
reviews_high_volume.set_index('reviewid',drop=True,inplace=True)
reviews_high_volume = reviews_high_volume[['new_date', 'whether_high_volume']]

### Word Count Comparison

To build our word count distribution, we start by putting every single review for a product into a “word count group”.  For example, a 23 word review would fall into the “21-25 word count group”, a 109 word review would fall into the “101-125 word count group”, and a 600 word review would fall into the “201+ word count group”.  This gives us the product’s word count distribution.  But just a product’s  word count distribution doesn’t really tell us that much: we need something to compare it to. That is why we grab the word count distribution for all of the reviews in the products category (category2) to get the expected word count distribution.  

Once we have the word count distribution of the product and the expected distribution of the category we compare the two distributions and identify product word count groups that are higher in concentration than we’d expect to see. For each of the larger groups we run a significance test to ensure that it isn’t due to random chance or lack of data points but rather that they are substantially overrepresented. If a product doesn’t have that many reviews, we are likely to see more variance due to random chance.  However, if our formula determines the difference is statistically significant, we’ll label that group as an Overrepresented Word Count Group.

In [3]:
# Load the sales dataset
sales = pd.read_csv("SalesRankExport_f0337c16-d7f3-4fc0-a46b-a0e14f18b595.csv")
sales.shape

  interactivity=interactivity, compiler=compiler, result=result)


(2258613, 17)

In [4]:
# Check column names
sales.columns

Index(['source', 'id', 'start_ts', 'end_ts', 'date', 'category_id1',
       'category_name1', 'category_rank1', 'category_id2', 'category_name2',
       'category_rank2', 'category_id3', 'category_name3', 'category_rank3',
       'category_id4', 'category_name4', 'category_rank4'],
      dtype='object')

In [5]:
# Extract only columns of interest
sales = sales[['id','category_id2']]

# Take only the unique product id
sales = sales.drop_duplicates('id')
sales.shape

In [7]:
# Now let's compile the reviews and sales dataframes to identify the category of each product in the reviews dataset
compiled = pd.merge(reviews,sales, how = 'inner', left_on = "product", right_on="id")
compiled.head()

Unnamed: 0,source,product,PART NUMBER_custom,SKU_custom,analysis_purpose_custom_custom,flag_custom,special_name_custom,test_field2_custom,test_field3_custom,name,...,helpfulcount,commenttext,commentauthor,officialcomment,totalcomments,commentts,commentdatestring,inputtime,id,category_id2
0,amazon.ca,B078N8NR7G,,,,,,,,PetSafe 900 Meter Remote Trainer,...,,,,,,,,2018-12-22 06:24,B078N8NR7G,pet-supplies
1,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,2019-04-15 10:32,B01HO8U5NC,pet-supplies
2,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,2019-04-04 12:02,B01HO8U5NC,pet-supplies
3,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,2019-04-02 20:35,B01HO8U5NC,pet-supplies
4,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,2018-07-31 08:00,B01HO8U5NC,pet-supplies


In [8]:
# Check information 
print(compiled.shape)
print("\n")
print(compiled.info())

(65451, 41)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 65451 entries, 0 to 65450
Data columns (total 41 columns):
source                            65451 non-null object
product                           65451 non-null object
PART NUMBER_custom                0 non-null float64
SKU_custom                        0 non-null float64
analysis_purpose_custom_custom    0 non-null float64
flag_custom                       0 non-null float64
special_name_custom               0 non-null float64
test_field2_custom                0 non-null float64
test_field3_custom                0 non-null float64
name                              65451 non-null object
date                              65451 non-null object
status                            65451 non-null object
sentiment                         65451 non-null object
topic                             65451 non-null object
notes                             0 non-null float64
profile                           32187 non-null object
autho

In [9]:
# Take only columns of interest
## We will select category_id2 where we will be comparing the word count of the individual products with the word count
## of this category level
compiled = compiled[['source','product','text','category_id2','reviewid']]
compiled.head()

Unnamed: 0,source,product,text,category_id2,reviewid
0,amazon.ca,B078N8NR7G,Produit disfonctionnel. J'exige remboursement,pet-supplies,R31B5G60GS531M
1,amazon.ca,B01HO8U5NC,J’ai adorer,pet-supplies,RRCY4V48RQBXG
2,amazon.ca,B01HO8U5NC,Bought this as a running bird bath for my two ...,pet-supplies,R18076F5C879LP
3,amazon.ca,B01HO8U5NC,Lid is easily knocked off but it’s still a gre...,pet-supplies,RLA1DFN3DCSFJ
4,amazon.ca,B01HO8U5NC,Works okay but is VERY NOISY.,pet-supplies,R3F4GS6FDS5ALH


In [10]:
# Let's create the word count column
compiled['totalwords'] = compiled['text'].str.split().str.len()

In [11]:
# Create word bins with appropriate ranges
compiled['word_bins'] = pd.cut(x=compiled['totalwords'], bins=[0, 5, 15, 25, 40, 65, 100, 200, 100000])
compiled['word_bins'] = pd.cut(x=compiled['totalwords'], bins=[0, 5, 15, 25, 40, 65, 100, 200, 100000], labels=['0 - 5 words', '6 - 15 words', '16 - 25 words', '26 - 40 words', '41 - 65 words', '66 - 100 words', '101 - 200 words','200+'])
compiled.head()

Unnamed: 0,source,product,text,category_id2,reviewid,totalwords,word_bins
0,amazon.ca,B078N8NR7G,Produit disfonctionnel. J'exige remboursement,pet-supplies,R31B5G60GS531M,4.0,0 - 5 words
1,amazon.ca,B01HO8U5NC,J’ai adorer,pet-supplies,RRCY4V48RQBXG,2.0,0 - 5 words
2,amazon.ca,B01HO8U5NC,Bought this as a running bird bath for my two ...,pet-supplies,R18076F5C879LP,36.0,26 - 40 words
3,amazon.ca,B01HO8U5NC,Lid is easily knocked off but it’s still a gre...,pet-supplies,RLA1DFN3DCSFJ,11.0,6 - 15 words
4,amazon.ca,B01HO8U5NC,Works okay but is VERY NOISY.,pet-supplies,R3F4GS6FDS5ALH,6.0,6 - 15 words


In [12]:
# Create a dataframe to aggregate word bins across products
# Normalize to get proportions
product_aggregation = pd.crosstab(compiled["product"], compiled["word_bins"], margins=True, normalize='index')
product_aggregation.head()

word_bins,0 - 5 words,6 - 15 words,16 - 25 words,26 - 40 words,41 - 65 words,66 - 100 words,101 - 200 words,200+
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
B0000AVVPU,0.181818,0.272727,0.181818,0.272727,0.0,0.0,0.090909,0.0
B0000BYCM0,0.4,0.292308,0.092308,0.076923,0.0,0.107692,0.030769,0.0
B0000DAPGK,0.411765,0.176471,0.235294,0.117647,0.058824,0.0,0.0,0.0
B0001ZWZ9S,0.052632,0.263158,0.0,0.157895,0.0,0.368421,0.157895,0.0
B00023N7TG,0.189873,0.303797,0.113924,0.160338,0.122363,0.067511,0.037975,0.004219


In [13]:
# Create a dataframe to aggregate word bins across categories
# Normalize to get proportions
category_aggregation = pd.crosstab(compiled["category_id2"], compiled["word_bins"], margins=True, normalize='index')
category_aggregation.head()

word_bins,0 - 5 words,6 - 15 words,16 - 25 words,26 - 40 words,41 - 65 words,66 - 100 words,101 - 200 words,200+
category_id2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ce-de/3578331,0.334525,0.235883,0.173695,0.128306,0.083631,0.029664,0.012152,0.002144
diy,0.054878,0.073171,0.097561,0.146341,0.152439,0.182927,0.146341,0.146341
garden/4339577031,0.254054,0.275676,0.237838,0.118919,0.081081,0.021622,0.010811,0.0
industrial/4546048031,0.252708,0.296029,0.148014,0.111913,0.101083,0.057762,0.021661,0.01083
pet-supplies,0.121428,0.168432,0.147719,0.152633,0.146834,0.107325,0.108578,0.047053


In [14]:
# Next we need to merge product_aggregation and category_aggregation!
# To do that we first merge category_id to product_aggregation on id
product_aggregation = pd.merge(product_aggregation,sales, how = 'inner', left_on = "product", right_on="id")
product_aggregation.head()

Unnamed: 0,0 - 5 words,6 - 15 words,16 - 25 words,26 - 40 words,41 - 65 words,66 - 100 words,101 - 200 words,200+,id,category_id2
0,0.181818,0.272727,0.181818,0.272727,0.0,0.0,0.090909,0.0,B0000AVVPU,pet-supplies/2975425011
1,0.4,0.292308,0.092308,0.076923,0.0,0.107692,0.030769,0.0,B0000BYCM0,pet-supplies
2,0.411765,0.176471,0.235294,0.117647,0.058824,0.0,0.0,0.0,B0000DAPGK,pet-supplies/2975425011
3,0.052632,0.263158,0.0,0.157895,0.0,0.368421,0.157895,0.0,B0001ZWZ9S,pet-supplies
4,0.189873,0.303797,0.113924,0.160338,0.122363,0.067511,0.037975,0.004219,B00023N7TG,pet-supplies/2975349011


In [15]:
# Next we merge the the category_aggregation table by joining it on category_id2
product_aggregation = pd.merge(product_aggregation,category_aggregation, how = 'inner', left_on = "category_id2", right_on="category_id2")
product_aggregation.head()

Unnamed: 0,0 - 5 words_x,6 - 15 words_x,16 - 25 words_x,26 - 40 words_x,41 - 65 words_x,66 - 100 words_x,101 - 200 words_x,200+_x,id,category_id2,0 - 5 words_y,6 - 15 words_y,16 - 25 words_y,26 - 40 words_y,41 - 65 words_y,66 - 100 words_y,101 - 200 words_y,200+_y
0,0.181818,0.272727,0.181818,0.272727,0.0,0.0,0.090909,0.0,B0000AVVPU,pet-supplies/2975425011,0.135165,0.174176,0.134066,0.143956,0.157692,0.092308,0.101099,0.061538
1,0.411765,0.176471,0.235294,0.117647,0.058824,0.0,0.0,0.0,B0000DAPGK,pet-supplies/2975425011,0.135165,0.174176,0.134066,0.143956,0.157692,0.092308,0.101099,0.061538
2,0.087324,0.135211,0.135211,0.16338,0.205634,0.101408,0.126761,0.04507,B00062F6HE,pet-supplies/2975425011,0.135165,0.174176,0.134066,0.143956,0.157692,0.092308,0.101099,0.061538
3,0.0,0.0,0.571429,0.0,0.285714,0.142857,0.0,0.0,B00062F6OM,pet-supplies/2975425011,0.135165,0.174176,0.134066,0.143956,0.157692,0.092308,0.101099,0.061538
4,0.173077,0.298077,0.153846,0.173077,0.125,0.038462,0.038462,0.0,B00068R98C,pet-supplies/2975425011,0.135165,0.174176,0.134066,0.143956,0.157692,0.092308,0.101099,0.061538


In [16]:
# Now let's compile the word count comparison to our original dataframe to begin comparing on a review basis
compiled_word_count = pd.merge(compiled,product_aggregation, how = 'inner', left_on = "product", right_on="id")
compiled_word_count.head()

Unnamed: 0,source,product,text,category_id2_x,reviewid,totalwords,word_bins,0 - 5 words_x,6 - 15 words_x,16 - 25 words_x,...,id,category_id2_y,0 - 5 words_y,6 - 15 words_y,16 - 25 words_y,26 - 40 words_y,41 - 65 words_y,66 - 100 words_y,101 - 200 words_y,200+_y
0,amazon.ca,B078N8NR7G,Produit disfonctionnel. J'exige remboursement,pet-supplies,R31B5G60GS531M,4.0,0 - 5 words,1.0,0.0,0.0,...,B078N8NR7G,pet-supplies,0.121428,0.168432,0.147719,0.152633,0.146834,0.107325,0.108578,0.047053
1,amazon.ca,B01HO8U5NC,J’ai adorer,pet-supplies,RRCY4V48RQBXG,2.0,0 - 5 words,0.163636,0.139394,0.145455,...,B01HO8U5NC,pet-supplies,0.121428,0.168432,0.147719,0.152633,0.146834,0.107325,0.108578,0.047053
2,amazon.ca,B01HO8U5NC,Bought this as a running bird bath for my two ...,pet-supplies,R18076F5C879LP,36.0,26 - 40 words,0.163636,0.139394,0.145455,...,B01HO8U5NC,pet-supplies,0.121428,0.168432,0.147719,0.152633,0.146834,0.107325,0.108578,0.047053
3,amazon.ca,B01HO8U5NC,Lid is easily knocked off but it’s still a gre...,pet-supplies,RLA1DFN3DCSFJ,11.0,6 - 15 words,0.163636,0.139394,0.145455,...,B01HO8U5NC,pet-supplies,0.121428,0.168432,0.147719,0.152633,0.146834,0.107325,0.108578,0.047053
4,amazon.ca,B01HO8U5NC,Works okay but is VERY NOISY.,pet-supplies,R3F4GS6FDS5ALH,6.0,6 - 15 words,0.163636,0.139394,0.145455,...,B01HO8U5NC,pet-supplies,0.121428,0.168432,0.147719,0.152633,0.146834,0.107325,0.108578,0.047053


In [17]:
# Rename columns to make them look a little prettier
compiled_word_count.rename(columns={'0 - 5 words_x':'product_0-5',
                                   '6 - 15 words_x':'product_6-15',
                                   '16 - 25 words_x':'product_16-25',
                                   '26 - 40 words_x':'product_26-40',
                                   '41 - 65 words_x':'product_41-65',
                                   '66 - 100 words_x':'product_66-100',
                                   '101 - 200 words_x':'product_101-200',
                                   '200+_x':'product_200+',
                                   '0 - 5 words_y':'category_0-5',
                                   '6 - 15 words_y':'category_6-15',
                                   '16 - 25 words_y':'category_16-25',
                                   '26 - 40 words_y':'category_26-40',
                                   '41 - 65 words_y':'category_41-65',
                                   '66 - 100 words_y':'category_66-100',
                                   '101 - 200 words_y':'category_101-200',
                                   '200+_y':'category_200+'}, inplace=True)

In [18]:
# Include the number of reviews per product as this will be one of our thresholds 
## We will only look at overrepresented word category for products having > 10 reviews; otherwise the results could
## be due to lack of data
compiled_word_count['number_of_reviews'] = compiled_word_count['product'].map(compiled_word_count['product'].value_counts())

In [19]:
# Create functions that will output a value of 1 for products with overrepresented word category counts (>10 %)
## 0 will be shown for products that are not within the overrepresented word category
## This is applied across all the word bins

def a(row):
    if (row['product_0-5'] > 0.1 + row['category_0-5']) and (row['number_of_reviews'] > 10):
        val = 1
    elif (row['product_0-5'] < 0.1 + row['category_0-5']) and (row['number_of_reviews'] > 10):
        val = 0
    else:
        val = 0
    return val

def b(row):
    if (row['product_6-15'] > 0.1 + row['category_6-15']) and (row['number_of_reviews'] > 10):
        val = 1
    elif (row['product_6-15'] < 0.1 + row['category_6-15']) and (row['number_of_reviews'] > 10):
        val = 0
    else:
        val = 0
    return val

def c(row):
    if (row['product_16-25'] > 0.1 + row['category_16-25']) and (row['number_of_reviews'] > 10):
        val = 1
    elif (row['product_16-25'] < 0.1 + row['category_16-25']) and (row['number_of_reviews'] > 10):
        val = 0
    else:
        val = 0
    return val

def d(row):
    if (row['product_26-40'] > 0.1 + row['category_26-40']) and (row['number_of_reviews'] > 10):
        val = 1
    elif (row['product_26-40'] < 0.1 + row['category_26-40']) and (row['number_of_reviews'] > 10):
        val = 0
    else:
        val = 0
    return val

def e(row):
    if (row['product_41-65'] > 0.1 + row['category_41-65']) and (row['number_of_reviews'] > 10):
        val = 1
    elif (row['product_41-65'] < 0.1 + row['category_41-65']) and (row['number_of_reviews'] > 10):
        val = 0
    else:
        val = 0
    return val

def f(row):
    if (row['product_66-100'] > 0.1 + row['category_66-100']) and (row['number_of_reviews'] > 10):
        val = 1
    elif (row['product_66-100'] < 0.1 + row['category_66-100']) and (row['number_of_reviews'] > 10):
        val = 0
    else:
        val = 0
    return val

def g(row):
    if (row['product_101-200'] > 0.1 + row['category_101-200']) and (row['number_of_reviews'] > 10):
        val = 1
    elif (row['product_101-200'] < 0.1 + row['category_101-200']) and (row['number_of_reviews'] > 10):
        val = 0
    else:
        val = 0
    return val

def h(row):
    if (row['product_200+'] > 0.1 + row['category_200+']) and (row['number_of_reviews'] > 10):
        val = 1
    elif (row['product_200+'] < 0.1 + row['category_200+']) and (row['number_of_reviews'] > 10):
        val = 0
    else:
        val = 0
    return val

In [21]:
# Create a new column showing the overrepresented word bins for each product
compiled_word_count['0-5_OR'] = compiled_word_count.apply(a, axis=1)
compiled_word_count['6-15_OR'] = compiled_word_count.apply(b, axis=1)
compiled_word_count['16-25_OR'] = compiled_word_count.apply(c, axis=1)
compiled_word_count['26-40_OR'] = compiled_word_count.apply(d, axis=1)
compiled_word_count['41-65_OR'] = compiled_word_count.apply(e, axis=1)
compiled_word_count['66-100_OR'] = compiled_word_count.apply(f, axis=1)
compiled_word_count['101-200_OR'] = compiled_word_count.apply(g, axis=1)
compiled_word_count['200+_OR'] = compiled_word_count.apply(h, axis=1)

In [22]:
# Check what it looks like!
compiled_word_count.head()

Unnamed: 0,source,product,text,category_id2_x,reviewid,totalwords,word_bins,product_0-5,product_6-15,product_16-25,...,category_200+,number_of_reviews,0-5_OR,6-15_OR,16-25_OR,26-40_OR,41-65_OR,66-100_OR,101-200_OR,200+_OR
0,amazon.ca,B078N8NR7G,Produit disfonctionnel. J'exige remboursement,pet-supplies,R31B5G60GS531M,4.0,0 - 5 words,1.0,0.0,0.0,...,0.047053,1,0,0,0,0,0,0,0,0
1,amazon.ca,B01HO8U5NC,J’ai adorer,pet-supplies,RRCY4V48RQBXG,2.0,0 - 5 words,0.163636,0.139394,0.145455,...,0.047053,165,0,0,0,0,0,0,0,0
2,amazon.ca,B01HO8U5NC,Bought this as a running bird bath for my two ...,pet-supplies,R18076F5C879LP,36.0,26 - 40 words,0.163636,0.139394,0.145455,...,0.047053,165,0,0,0,0,0,0,0,0
3,amazon.ca,B01HO8U5NC,Lid is easily knocked off but it’s still a gre...,pet-supplies,RLA1DFN3DCSFJ,11.0,6 - 15 words,0.163636,0.139394,0.145455,...,0.047053,165,0,0,0,0,0,0,0,0
4,amazon.ca,B01HO8U5NC,Works okay but is VERY NOISY.,pet-supplies,R3F4GS6FDS5ALH,6.0,6 - 15 words,0.163636,0.139394,0.145455,...,0.047053,165,0,0,0,0,0,0,0,0


In [23]:
# Create a function that will check if the individual review in the row with subject word bin is within the overrepresented criteria
def i(row):
    if row['word_bins'] == "0 - 5 words":
        val =  row['0-5_OR']
    elif row['word_bins'] == "6 - 15 words":
        val =  row['6-15_OR']
    elif row['word_bins'] == "16 - 25 words":
        val =  row['16-25_OR']
    elif row['word_bins'] == "26 - 40 words":
        val =  row['26-40_OR']
    elif row['word_bins'] == "41 - 65 words":
        val =  row['41-65_OR']
    elif row['word_bins'] == "66 - 100 words":
        val =  row['66-100_OR']
    elif row['word_bins'] == "101 - 200 words":
        val =  row['101-200_OR']
    elif row['word_bins'] == "200+":
        val =  row['200+_OR']
    else:
        val = 0
    return val

In [24]:
# Apply function for every row and create new column
compiled_word_count['OR'] = compiled_word_count.apply(i, axis=1)

In [25]:
# Check what it looks like!
compiled_word_count.head()

Unnamed: 0,source,product,text,category_id2_x,reviewid,totalwords,word_bins,product_0-5,product_6-15,product_16-25,...,number_of_reviews,0-5_OR,6-15_OR,16-25_OR,26-40_OR,41-65_OR,66-100_OR,101-200_OR,200+_OR,OR
0,amazon.ca,B078N8NR7G,Produit disfonctionnel. J'exige remboursement,pet-supplies,R31B5G60GS531M,4.0,0 - 5 words,1.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,amazon.ca,B01HO8U5NC,J’ai adorer,pet-supplies,RRCY4V48RQBXG,2.0,0 - 5 words,0.163636,0.139394,0.145455,...,165,0,0,0,0,0,0,0,0,0
2,amazon.ca,B01HO8U5NC,Bought this as a running bird bath for my two ...,pet-supplies,R18076F5C879LP,36.0,26 - 40 words,0.163636,0.139394,0.145455,...,165,0,0,0,0,0,0,0,0,0
3,amazon.ca,B01HO8U5NC,Lid is easily knocked off but it’s still a gre...,pet-supplies,RLA1DFN3DCSFJ,11.0,6 - 15 words,0.163636,0.139394,0.145455,...,165,0,0,0,0,0,0,0,0,0
4,amazon.ca,B01HO8U5NC,Works okay but is VERY NOISY.,pet-supplies,R3F4GS6FDS5ALH,6.0,6 - 15 words,0.163636,0.139394,0.145455,...,165,0,0,0,0,0,0,0,0,0


In [26]:
# Delete any unneccessary columns
compiled_word_count.drop(['category_id2_x', 'id','category_id2_y'], axis=1, inplace = True)

In [27]:
compiled_word_count.head()

Unnamed: 0,source,product,text,reviewid,totalwords,word_bins,product_0-5,product_6-15,product_16-25,product_26-40,...,number_of_reviews,0-5_OR,6-15_OR,16-25_OR,26-40_OR,41-65_OR,66-100_OR,101-200_OR,200+_OR,OR
0,amazon.ca,B078N8NR7G,Produit disfonctionnel. J'exige remboursement,R31B5G60GS531M,4.0,0 - 5 words,1.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,amazon.ca,B01HO8U5NC,J’ai adorer,RRCY4V48RQBXG,2.0,0 - 5 words,0.163636,0.139394,0.145455,0.115152,...,165,0,0,0,0,0,0,0,0,0
2,amazon.ca,B01HO8U5NC,Bought this as a running bird bath for my two ...,R18076F5C879LP,36.0,26 - 40 words,0.163636,0.139394,0.145455,0.115152,...,165,0,0,0,0,0,0,0,0,0
3,amazon.ca,B01HO8U5NC,Lid is easily knocked off but it’s still a gre...,RLA1DFN3DCSFJ,11.0,6 - 15 words,0.163636,0.139394,0.145455,0.115152,...,165,0,0,0,0,0,0,0,0,0
4,amazon.ca,B01HO8U5NC,Works okay but is VERY NOISY.,R3F4GS6FDS5ALH,6.0,6 - 15 words,0.163636,0.139394,0.145455,0.115152,...,165,0,0,0,0,0,0,0,0,0


In [29]:
# We can save the table for further use
compiled_word_count.to_csv('compiled_word_count_0506.csv',index=False)

In [None]:
# To compile we just keep the feature we need:
compiled_word_count.set_index('reviewid',drop=True,inplace=True)
word_count_labeled = compiled_word_count[['totalwords', 'OR']]

Compiling the above 2 features with the main table **"final_output_with_all"**:

In [None]:
merged_highvol_wordcnt = pd.merge(compiled_word_count,reviews_high_volume,how='left', left_index=True, right_index=True)

In [None]:
final_output_with_all.set_index('reviewid',drop=True,inplace=True)
merged_basic_highvol_wordcnt = pd.merge(final_output_with_all,merged_highvol_wordcnt,how='left', left_index=True, right_index=True)

In [None]:
# save the table for further use
merged_basic_highvol_wordcnt.to_csv('merged_basic_highvol_wordcnt.csv',index=True)

### Text-related: Repetitive phrases and incentivized reviews

These two features are created based on the review text analysis.

**Incentivized reviews**: Phrases that have a potential to indicate incentivized behaviors are selected to help detect reviews.<br>
**Repetitive phrases**:


In [None]:
#load_NLP_packages
def normalize_document(doc):
    # doc is one Review text

    # Lemmatizer, tokenizer, stop_words
    lemmatizer = WordNetLemmatizer() 
    stop_words = nltk.corpus.stopwords.words('english')

    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()

    # tokenize & lemmatize document
    tokens = [lemmatizer.lemmatize(word,pos="v") for word in word_tokenize(doc)]
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc


def NLP_models(norm_corpus,option=0):
    ### Modeling 
    # 1. Define CountVectorizer
    # 2. Get word Matrix: count vectorizer transform text document

    ### 1.  Bag of Word Model
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    if(option==1):
        # Bag of Words
        cv = CountVectorizer(min_df=0.02,max_df=0.99,max_features=300)
    
    elif(option==2):
        # Bag of 2-grams
        cv = CountVectorizer(ngram_range=(2,2))
    else:
        # TF-IDF
        cv = TfidfVectorizer(min_df=0.02,max_df=0.99,max_features=300, use_idf=True)

    cv_fit=cv.fit_transform(norm_corpus)
    cv_matrix = cv.fit_transform(norm_corpus)
    cv_matrix = cv_matrix.toarray()

    # get all unique words in the corpus
    vocab = cv.get_feature_names()

    # show document feature vectors
    cv_matrix_df=pd.DataFrame(np.round(cv_matrix,2), columns=vocab)
    return cv_matrix_df

In [None]:
texts=merged_basic_highvol_wordcnt['text'].apply(str)
normalize_corpus=np.vectorize(normalize_document)
texts_clean= normalize_corpus(texts)

#### Incentivized Review

In [None]:
def create_incentivized_words():
    
    incentivized_words=["Free collar"
    ,"Free collar offer"
    ,"Free one"
    ,"Free product"
    ,"Free dog collar for a positive review"
    ,"Free second collar"
    ,"Free gift"
    ,"Additional free chargers for a positive review"
    ,"Promised a free collar"
    ,"Another free"
    ,"In exchange for a positive review"
    ,"In exchange for a review"
    ,"If you review"
    ,"If I reviewed the product"
    ,"Write a review"
    ,"Writing a review"
    ,"Leave us a review"
    ,"Leave a review"
    ,"Positive review"
    ,"If I Left a review"
    ,"Reviews are paid"
    ,"Review in return"
    ,"For a review"
    ,"For our review"
    ,"For my review"
    ,"Leave a 5 star review"
    ,"Incentive"
    ,"Incentivized"
    ,"Gift card"
    ,"Inside the packaging was a flyer"
    ,"Flyer"
    ,"Bribe"]
    cleaned= normalize_corpus(incentivized_words)
    cleaned=cleaned[cleaned!='review']

    incentivized_words_list=[]
    [incentivized_words_list.append(x) for x in cleaned if x not in incentivized_words_list]
    print(incentivized_words_list)
    return(incentivized_words_list)


incentivized_words_cleaned=create_incentivized_words()

vector=[]
for text in texts_clean:
    if any(word in text for word in incentivized_words_cleaned):
        vector.append(1)
    else:
        vector.append(0)
print("Total incentivzed reviews= {} ".format(sum(vector)))

print("{} percent of the reviews are incentivized ".format(sum(vector)/merged_basic_highvol_wordcnt.shape[0]*100))
merged_basic_highvol_wordcnt['incenvized_reviews']=vector

In [None]:
incentivized_review = merged_basic_highvol_wordcnt[['reviewid','incenvized_reviews']]
incentivized_review.to_csv("incentivized_review.csv")

#### Phrase repetition

In [None]:
df = profile_urls.copy()
df = df.dropna(axis=1,thresh=len(df)*0.9)
print(df.head(3))

In [None]:
# Check number of products
num_products= len(df['product'].value_counts())
print('there are {} products'.format(num_products))
unique_product_list=df['product'].unique()

# Dictionary to Store Products
product_dict={}
for product in unique_product_list:
    product_dict[product]= df.loc[df['product']==product,]

print("The product we are interested is {}".format(product))
print("\n")
print(product_dict[product].head(3))
sub_df= product_dict[product]
text=sub_df['text']
text.reset_index(drop=True, inplace=True)
###    Finish Preparing Text

normalize_corpus = np.vectorize(normalize_document)
norm_corpus = normalize_corpus(text)

# Run TF-IDF model, get 
cv_matrix_df=NLP_models(norm_corpus,option=0)

# Use cosine similarity
similarity_matrix = cosine_similarity(cv_matrix_df)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df.head(3)

In [15]:
merged_basic_highvol_wordcnt['text_len']=merged_basic_highvol_wordcnt['text'].apply(str).apply(lambda x:len(x))
temp=merged_basic_highvol_wordcnt.copy()

In [16]:
merged_basic_highvol_wordcnt=merged_basic_highvol_wordcnt[merged_basic_highvol_wordcnt['text_len']>=30]
print("{} percentage of reviews keeped".format(merged_basic_highvol_wordcnt.shape[0]/temp.shape[0]))

0.8539624648582934 percentage of reviews keeped


In [17]:
def find_text_similarity(norm_corpus,model_option=2,bench_mark=0.5):
    cv_matrix_df=NLP_models(norm_corpus,option=model_option)
    similarity_matrix = cosine_similarity(cv_matrix_df)
    similarity_df=pd.DataFrame(cosine_similarity(similarity_matrix))
    index= (similarity_df[similarity_df>bench_mark].notna().sum() !=1)==True
    true_index=index[index==True]
    
    return(true_index)

#index=find_text_similarity(norm_corpus)
#index

In [18]:
# Find reviews with problematic phrase repetition
problem_review_id=[]

for product in unique_product_list:
    try:
        print(product)
        product_dict[product]= merged_basic_highvol_wordcnt.loc[df['product']==product,]
        sub_df= product_dict[product]
        text=sub_df['text'].apply(str)
        text.reset_index(drop=True, inplace=True)
        norm_corpus = normalize_corpus(text)
        index=find_text_similarity(norm_corpus)
        problem_review_id.append(sub_df.iloc[index.index,][["reviewid"]].values.tolist()) 
    except:
        continue

B078N8NR7G
B01HO8U5NC
B01HI5ZXN8
B01HB7N5ZQ
B078N83GS4
B078N564WT
B078N3JVYV
B01GCAS5SK
B01GCAS4VS
B01GCAS4RM
B01J18Z1BO
B01J18Z1AU
B01E6TI2DC
B01E6TI1Q0
B01GCAS4JA
B01EYK74FK
B01ESR0PT6
B078N35M1S
B01GCAS4MC
B01GCAS4KE
B01GCAS4K4
B01ESR0O5G
B01ESR0OAQ
B01EA7E88I
B01EA7E766
B01ESR0MAI
B01ATSHB5E
B01ESR0MSU
B01ESR0MR6
B01CZ6VENI
B01B1FT4H2
B01E6THUR6
B01E6THUK8
B01E6THUJE
B01ATS8NUQ
B01E6THUIU
B01E6THU30
B01DGEGIPW
B015TNVVGY
B014COTASW
B0188Y676U
B017N6IF5U
B0167GU9AG
B015TNW2FS
B015TNW12C
B015TNW0Z0
B019I1ZTXY
B019I1ZTKC
B0188Y67J2
B015TNVZEW
B015TNVYRU
B015TNVYP2
B012F869RM
B010E08V06
B015TNW0GE
B015TNW01O
B015TNVZHE
B014COTAK0
B014COTA6E
B014COTA46
B00ZCFPHO2
B00ZCFPH56
B00YHPNWWC
B015TNVY0M
B015TNVXQ2
B015TNVXAI
B00ZEGHU8A
B00ZEGHS4G
B00ZEGHR10
B015TNVWWM
B015TNVWEK
B00VIXRB6O
B00UTIASZ0
B00T88U5DC
B00YHPNS8U
B00MPE5KFY
B00MPE5KCM
B00VPYYR9A
B00VPYYR8G
B00VPYYQZA
B00QV5GF34
B00QTCUV0C
B00Q52H0DW
B00OZMOR26
B00OZMOQM2
B00OH46TSW
B00MPE5U2W
B00MPE5PAO
B00MPE5P5O
B00MPE5JZA
B00MPE5FUY

B00B17ETJE
B00CW9XWX4
B00IAOB50C
B00CMLS0VG
B000A27NGW
B000LXY3CC
B00F0JD184
B0007RD9O0
B00B17ETPI
B000LXVYM4
B00CW9XWTI
B000LXW0YA
B000LXU3N0
B000LXU3NA
B01171OR6I
B000RXY4H0
B00IAOB4VC
B00B732D2W
B00CW9XWXE
B000RXVJEQ
B00B17ETPS
B00B17ETR6
B000LY0XWU
B0016HNU12
B00LPFP31A
B000241NRI
B00CZ7HP4A
B00CZ7HP5O
B00CZ7HP68
B00QGYMAIY
B00VPYYY16
B00CZ7HO1Y
B00CZ7HO3W
B00CZ7HOS2
B0016HPTFW
B01MYBV6FN
B00CZ7HE4Q
B00CZ7HE9G
B00CZ7HFBS
B01K4KYZL0
B00B23AUVS
B00B17ETNU
B00C1FI63A
B00LHUWS6Q
B008LUKBGE
B008LUKC7W
B00I04Y7RA
B00L51ZQHU
B0752XP3R5
B073FV5LVW
B0011F4WWK
B004WO90E2
B008LUKARE
B075T6VM7W
B01ATS8NY2
B01ATS8OP0
B01ATS8EVY
B01ATS8JFU
B00WFKJWNY
B01ATS8JH8
B01ATS8ESM
B00U2P342E
B00US6U6ZU
B00W8GDDBM
B01ATS8JKU
B01ATS8OK0
B00VKW57VE
B00VPVJKMM
B00W8GGDQ4
B00W8GJK64
B00VKW6Y2U
B00SX8JQR4
B00T3X1W52
B00VKVZAZI
B00VV5TG08
B00W8G9NBG
B00VKW1R2C
B00VKW3GIA
B00QRSA540
B00QHID8VC
B00QHID92K
B00VKVHBT6
B00PJ8RQR8
B00PJ8RFI8
B00PJ8RGNW
B00RKFK3Q4
B00S8JW1T8
B00PJ8NDB6
B00DQXR42U
B00DSOMPEY
B00E1T0CAO

In [19]:
def get_flattened_list(lst):
    flattened_list = []
    #flatten the lis
    for x in lst:
        for y in x:
            flattened_list.append(y)
    return(flattened_list)

problem_review_id=get_flattened_list(get_flattened_list(problem_review_id))

In [20]:
# Create the new column
new_col=merged_basic_highvol_wordcnt['reviewid'].apply(lambda x: 1 if x in problem_review_id else 0)
merged_basic_highvol_wordcnt['repetitive phrase']=new_col

In [22]:
merged_basic_highvol_wordcnt.head()

Unnamed: 0,reviewid,0_review,Nvr_verified_reviewer,One_hit,Take_backs,Verified_Purchases,avg_rating,helpful_votes,name,num_of_reviews_count,...,OR,author,date,new_date,source,text,whether_high_volume,incenvized_reviews,text_len,repetitive phrase
0,R1001WAW3T7HTQ,0.0,0.0,0.0,0.0,1,3.333333,30.0,gilles gaujard,18.0,...,0.0,gilles gaujard,2019-08-25 08:00,2019-08-25,amazon.fr,Achetée pour empêcher le chat de mes voisins ...,False,0,355,0
2,R1007O54FB5M3J,0.0,0.0,0.0,1.0,1,3.8,2.0,Amazon Kunde,10.0,...,0.0,Amazon Kunde,2019-07-31 08:00,2019-07-31,amazon.de,"Naja, es geht...die Bürsten verbiegen sich seh...",False,0,96,0
3,R100JBMVROD5NL,0.0,0.0,0.0,1.0,1,3.75,4.0,James Palovich,4.0,...,0.0,James Palovich,2019-10-08 08:00,2019-10-08,amazon,I am not impressed with the recharge time.,False,0,42,0
4,R100Z5EA9UVROI,,,,,1,,,,,...,0.0,Ishmael Smith,2018-03-24 08:00,2018-03-24,amazon.ca,Works great for our big male. He figured out h...,False,0,179,0
5,R101IHLFAECMI9,,,,,1,,,,,...,1.0,john vaughan,2018-03-07 08:00,2018-03-07,amazon.uk,Awesome!! Fits easily and arrived on time.,False,0,42,0


### Overlap History

This feature is created to detect whether certain reviewers have overlapping review histories with others. If a reviewer has more than 2 items(products) in his/her history that are same as another reviewer, than we will flag **the reviewer**.

Possible bias: without the full review history of a certain reviewer, we can only tag according to the reviews we scrapped for a certain brand. The result will be more reliable if the full review page of a customer is available.

In [52]:
# use the dataset with the profile links
profiles = profile_urls.copy()

# Create a subset to contain only reviewer and the products they reviewed
sub=profiles[['author','product']]
sub.groupby('author').agg({'product':len}).sort_values('product',ascending=False)

Unnamed: 0_level_0,product
author,Unnamed: 1_level_1
Amazon Customer,6290
Client d'Amazon,734
Amazon Kunde,411
Kindle Customer,158
Chris,77
Sarah,73
Mike,61
Jennifer,60
Lisa,58
Michelle,53


In [53]:
# Get names of indexes of anonymous customers
indexName1 = sub[sub['author'] == 'Amazon Customer' ].index
indexName2 = sub[sub['author'] == "Client d'Amazon" ].index
indexName3 = sub[sub['author'] == "Kindle Customer" ].index
indexName4 = sub[sub['author'] == "Amazon Kunde" ].index  #There might be other anonymous customers for other datasets

# Delete these row indexes from dataFrame
sub.drop(indexName1 , inplace=True)
sub.drop(indexName2 , inplace=True)
sub.drop(indexName3 , inplace=True)
sub.drop(indexName4 , inplace=True)
sub.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [224]:
# Create an author list
author_list = list(sub['author'].unique())

In [134]:
# Create a product list that shows the set of products that an author reviewed
product_list = []
for i in range(0,len(author_list)):
    author_name = sub['author'][i]
    products = list(sub[sub['author']==author_name]['product'].unique())
    product_list.append(products)

In this part we will compare each random 2 pairs of product lists from different authors, <br>and get all the authors that have more than 2 overlapping review histories with others.

In [137]:
# Define a function to find out for each pair of author, how many products overlap
def find_overlay(s1,s2):
    return len([x for x in list(s1) if x in list(s2)])

# Choose only the pairs with more than two products overlapping
pairs = []
for i in tnrange(-1,len(author_list)-2):
    i += 1
    j = i + 1
    for j in range(i+1,len(author_list)):
        if find_overlay(product_list[i], product_list[j]) >= 2:
            pairs.append([i,j])
        else:
            pass
        j += 1

  import sys


HBox(children=(FloatProgress(value=0.0, max=45209.0), HTML(value='')))




In [157]:
# Retrieve only the unique reviewers in the pairs
pairs = pd.DataFrame(pairs)
pairs.columns = [['author','overlap_author']]

list1 = pairs.overlap_author.values.tolist()
list2 = pairs.author.values.tolist()
list1.extend(list2)
row = list(np.unique(list1))

In [231]:
len(row)

14321

In [244]:
# Change from number to the respective names of authors
overlap_author_names = [author_list[x] for x in row]
overlap_author_names

['nathalie',
 'Richard Goods',
 'Jerome Tanguay',
 'Lisa',
 'roger',
 'Francine',
 'Kat',
 'Marie-christine Lapierre',
 'CanadianBob',
 'rex',
 'Rick G.',
 'Deb',
 'Angie',
 'Jay and Leah',
 'S. Westley',
 'Leighthecat',
 'CL',
 'Evan',
 'Randalin G.',
 'Kate',
 'Momo',
 'Lisa W.',
 'Ann',
 'Katherine Down',
 'stephanie',
 'Relax Already',
 'KEM',
 'Derek',
 'Danielle',
 'Jason Tran',
 'Jayar123',
 'Cat',
 'Jeremy',
 'Kevin',
 'Mila',
 'Korvida',
 'TheBookChick',
 'ItTakesFour',
 'Fenn',
 'Booklover',
 'Fat Rabbit',
 'Lazy Lazer',
 'Jared Gushattey',
 'im board',
 'El buyer',
 'Teri Derrick',
 'Telsa',
 'MBW',
 'Mr Z',
 'Hannah Hollett',
 'North.ca',
 'Teena',
 'Maude',
 'Sam M',
 'Lisa G',
 'Catherine Coste',
 'Sophie Cormier',
 'Toni',
 'karyn baker',
 'Bill Rockwell',
 'Yves',
 'Katie Toews',
 'Felicia',
 'Rachel',
 'Christina',
 'Lori Shore',
 'Mary A Smith',
 'jack',
 'Linda Jenkins',
 'Melanie',
 'Robert K',
 'Tree',
 'Lindsey',
 'Corina Lockyer',
 'Kimanh',
 'Minerva',
 'Sandra 

In [247]:
# Tag these authors with overlapping history labels
overlap_to_join = pd.DataFrame(overlap_author_names)
overlap_to_join.columns = ['author']
overlap_to_join['overlap_history'] = 1

In [251]:
# join with the review table
overlaps = pd.merge(profiles, overlap_to_join, how='left', on='author')
overlaps.overlap_history = overlaps.overlap_history.fillna('0')
overlaps

Unnamed: 0,source,product,PART NUMBER_custom,SKU_custom,analysis_purpose_custom_custom,flag_custom,special_name_custom,test_field2_custom,test_field3_custom,name,...,statustime,helpfulcount,commenttext,commentauthor,officialcomment,totalcomments,commentts,commentdatestring,inputtime,overlap_history
0,amazon.ca,B078N8NR7G,,,,,,,,PetSafe 900 Meter Remote Trainer,...,,,,,,,,,2018-12-22 06:24,0
1,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,2019-04-15 10:32,1
2,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,2019-04-04 12:02,0
3,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,2019-04-02 20:35,0
4,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,2018-07-31 08:00,0
5,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,2018-07-29 08:00,1
6,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,2018-04-12 08:00,1
7,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,2018-01-22 08:34,0
8,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,2018-01-08 18:43,1
9,amazon.ca,B01HO8U5NC,,,,,,,,Drinkwell Platinum Pet Fountain 168oz,...,,,,,,,,,2017-10-23 06:09,0


In [252]:
overlaps.to_csv('overlapping_0506.csv',index=False)

In [None]:
# Keep only the columns in interest:
overlap_labeled = overlaps[['reviewid','overlap_history']]
overlap_labeled.set_index('reviewid',drop=True,inplace=True)

In [None]:
# Merge with the full dataset
merged_basic_highvol_wordcnt.set_index('reviewid',drop=True,inplace=True)
full_merged_data = pd.merge(merged_basic_highvol_wordcnt,overlap_labeled,how='left', left_index=True, right_index=True)

# Export data
full_merged_data.to_csv('full_merged_data.csv',index=True)