In [13]:
import os
import time
import datetime
import pandas as pd
import numpy as np

In [14]:
path = 'graph_Amazon_Ali/'

In [21]:
################################# FUNCTIONS ###################################

def get_category_freq(x_reviews):
	return dict(x_reviews.groupby('product_category_condensed')['review_text'].count() / x_reviews.shape[0])

def get_month_freq(x_reviews):
	return dict(x_reviews.groupby('month_posted')['review_text'].count() / x_reviews.shape[0])

In [16]:
################################# Creating a map for Review, Reviewers, and Products ###################################
r_df = pd.read_csv(path + 'raw/reviews_to_be_shared.csv')
r_df.drop(columns=['reviewer_id', 'product_id'], inplace=True)
r_df.rename(columns={"asin": "product_asin", "reviewer_ID": "reviewer_id"}, inplace=True)


# Review
r_df[['review_id']].to_csv(path + 'clean/review_map.csv', index=False)

# Reviewer
pd.DataFrame({"reviewer_id": r_df.reviewer_id.unique()}).to_csv(path + 'clean/reviewer_map.csv', index=False)

# Product
pd.DataFrame({"product_asin": r_df.product_asin.unique()}).to_csv(path + 'clean/product_map.csv', index=False)

In [17]:
################################# All Product Categories ###################################
all_products = pd.read_csv(path + 'raw/all_product_categories.csv')
all_products

Unnamed: 0,product_asin,product_category_condensed
0,B07K7MSV25,Other
1,B07H3PDWVZ,Sports and Outdoors
2,B07Z3FG46V,Electronics
3,B07RT5DZ2Y,Sports and Outdoors
4,B07WHGZLK8,Personal Care
...,...,...
4225,B0822W9HHH,Electronics
4226,B083LTRNHV,"Clothing, Shoes, and Jewelry"
4227,B085QKTJNF,Home Improvement
4228,B085XH2WGR,Sports and Outdoors


In [18]:
################################# Focal Products ###################################
focal_products = pd.read_csv(path + 'raw/focal_products.csv')

focal_products.drop(columns=['Facebook Group name', 'Product Category'], inplace=True)
focal_products.rename(columns={"asin": "product_asin"}, inplace=True)

focal_products['first_day_date'] = pd.to_datetime(focal_products['first_day_date'])
focal_products['last_day_date'] = pd.to_datetime(focal_products['last_day_date'])

focal_products

Unnamed: 0,product_asin,first_day_date,last_day_date
0,B003MROHZW,2020-02-08,2020-02-25
1,B00GIHF6AS,2020-02-14,2020-03-05
2,B00PAOFGY6,2019-11-20,2019-11-20
3,B00PH1MQV8,2019-11-20,2019-11-20
4,B00SGF5N1M,2020-03-15,2020-05-29
...,...,...,...
1488,B088LSQLJ3,2020-06-01,2020-06-01
1489,B088TCYV9F,2020-06-05,2020-06-05
1490,B088TJZ4PS,2020-06-03,2020-06-19
1491,B08935J6YY,2020-06-07,2020-06-11


In [19]:
################################# DATA (reviews) ###################################
reviews = pd.read_csv(path + 'raw/reviews_to_be_shared.csv')

reviews.drop(columns=['reviewer_id', 'product_id'], inplace=True)
reviews.rename(columns={"asin": "product_asin", "reviewer_ID": "reviewer_id"}, inplace=True)

# Handling null review title and body
reviews.loc[reviews['review_title'].isnull(), ['review_title']] = ''
reviews.loc[reviews['review_body'].isnull(), ['review_body']] = ''

# Counting words in review title, body, and as a whole
reviews['len_review_title'] = [len(x.split()) for x in reviews['review_title'].tolist()]
reviews['len_review_title'] = reviews['len_review_title'].astype(float)
reviews['len_review_body'] = [len(x.split()) for x in reviews['review_body'].tolist()]
reviews['len_review_body'] = reviews['len_review_body'].astype(float)
reviews['len_review_text'] = reviews['len_review_title'] + reviews['len_review_body']

# Converting review title and body to a single review_text column
reviews["review_text"] = reviews["review_title"].astype(str) + " " + reviews["review_body"].astype(str)
reviews.drop(columns=['review_title', 'review_body'], inplace=True)

# Converting number_of_photos -> has_photos
reviews['has_photos'] = 0.0
reviews.loc[reviews.number_of_photos >= 1, 'has_photos'] = 1.0

# Converting number_of_helpful -> has_helpful
reviews['has_helpful'] = 0.0
reviews.loc[reviews.number_of_helpful >= 1, 'has_helpful'] = 1.0

# Converting review_data to actual datetime
reviews['review_date'] = pd.to_datetime(reviews['review_date'])

# Review month
reviews['month_posted'] = reviews['review_date'].dt.month

# Appending product categories
reviews = reviews.merge(all_products, left_on='product_asin', right_on='product_asin', how='left')

# If the review was posted within the FaceBook review buying campaign window (+2 weeks after the campaign ended), then it was posted 'during_campaign'
reviews = reviews.merge(focal_products, left_on='product_asin', right_on='product_asin', how='left')
reviews['during_campaign'] = 0
criteria = (reviews.fake_asin == 1) & (reviews['review_date'] >= reviews['first_day_date']) & (reviews['review_date'] <= (reviews['last_day_date'] + pd.to_timedelta(14, unit='D')))
reviews.loc[criteria, 'during_campaign'] = 1

# Defining train and test observations
review_train_test_dict = pd.read_pickle(path + 'raw/train_test_dict_reviews.pkl')
reviews['is_train'] = reviews.review_id.map(review_train_test_dict)
reviews.loc[reviews.is_train == 'train', 'is_train'] = 1
reviews.loc[reviews.is_train == 'test', 'is_train'] = 0

# Cleaned reviews dataframe
reviews = reviews[['reviewer_id', 'product_asin', 'fake_asin', 'during_campaign', 'is_train', 'removed_by_Amazon', 'number_of_photos', 'number_of_helpful', 'review_date', 'month_posted', 'review_rating', 'has_photos', 'has_helpful', 'len_review_title', 'len_review_body', 'len_review_text', 'review_text', 'product_category_condensed']]
reviews

Unnamed: 0,reviewer_id,product_asin,fake_asin,during_campaign,is_train,removed_by_Amazon,number_of_photos,number_of_helpful,review_date,month_posted,review_rating,has_photos,has_helpful,len_review_title,len_review_body,len_review_text,review_text,product_category_condensed
0,723414,B07K7MSV25,0,0,1,0,0,0,2020-05-07,5,1.0,0.0,0.0,1.0,3.0,4.0,unfriendly behave badly vx:hytfy89usjmys3548,Other
1,1034254,B07H3PDWVZ,0,0,1,1,2,94,2020-06-22,6,5.0,1.0,1.0,3.0,79.0,82.0,waterproof and lightweight. This bottle is 1...,Sports and Outdoors
2,1027740,B07Z3FG46V,1,0,1,1,0,0,2020-05-30,5,5.0,0.0,0.0,4.0,16.0,20.0,Great for multiple devices Awesome to charge a...,Electronics
3,146762,B07RT5DZ2Y,0,0,1,1,0,0,2020-09-23,9,5.0,0.0,0.0,4.0,17.0,21.0,Easy and saves room So simple to use! When d...,Sports and Outdoors
4,627419,B07WHGZLK8,1,1,1,1,0,0,2020-05-21,5,5.0,0.0,0.0,3.0,51.0,54.0,Light house cleaning. Really easy to assemble....,Personal Care
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1398069,921326,B07X8V3SLM,0,0,0,0,0,0,2020-01-12,1,5.0,0.0,0.0,6.0,93.0,99.0,Very Happy with this USB-C Hub I am using th...,Electronics
1398070,572598,B082DBC2X1,1,0,0,0,0,0,2019-12-30,12,5.0,0.0,0.0,7.0,153.0,160.0,"Brilliant design, quality build, looks great t...",Home Improvement
1398071,210516,B084ZDPNTT,1,0,1,0,0,0,2020-09-04,9,5.0,0.0,0.0,2.0,15.0,17.0,Real nice Great quality and feels really nic...,Electronics
1398072,500627,B071PDY5DS,0,0,1,0,0,0,2019-07-29,7,5.0,0.0,0.0,12.0,41.0,53.0,There is a slight taste to them but you barely...,Personal Care


In [20]:
reviews.to_csv(path + 'clean/processed_reviews.csv', index=False)

In [8]:
################################# DATA (review labels) ###################################
# We don't have ground-truth labels here
# These labels are proxies
# The code used allows us to test accuracy with review labels 
# Once we have some ground-truth labels, these will be changed
# For now, we determine a fake review that was:
# 1. posted on a product that is known to buy reviews
# 2. posted within the FaceBook review buying campaign window (+2 weeks after the campaign ended)
reviews[['during_campaign']]

Unnamed: 0,during_campaign
0,0
1,0
2,0
3,0
4,1
...,...
1398069,0
1398070,0
1398071,0
1398072,0


In [9]:
np.save(path + 'labels-review.npy', reviews[['during_campaign']].to_numpy())

In [10]:
################################# DATA (review train test split) ###################################
reviews[['is_train']]

Unnamed: 0,is_train
0,1
1,1
2,1
3,1
4,1
...,...
1398069,0
1398070,0
1398071,1
1398072,1


In [11]:
reviews[['is_train']].to_csv(path + 'review_train_test.csv', index=False)

In [22]:
################################# DATA (products) ###################################
products_df = pd.DataFrame({"product_asin": reviews.product_asin.unique()})


# Number of reviews
print("Number of reviews: ", end='')
t0 = time.time()
n_reviews_gb = reviews.groupby('product_asin')['review_date'].count()
products_df['n_of_reviews'] = [n_reviews_gb[i] for i in products_df.product_asin]
print(time.time() - t0, 'seconds')


# Proportion of reviews with photos
print("Proportion of reviews with photos: ", end='')
t0 = time.time()
share_n_photos_gb = reviews.groupby('product_asin')['has_photos'].mean()
products_df['share_photos'] = [share_n_photos_gb[i] for i in products_df.product_asin]
print(time.time() - t0, 'seconds')

# Proportion of reviews with helpful votes
print("Proportion of reviews with helpful votes: ", end='')
t0 = time.time()
share_n_helpful_gb = reviews.groupby('product_asin')['has_helpful'].mean()
products_df['share_helpful'] = [share_n_helpful_gb[i] for i in products_df.product_asin]
print(time.time() - t0, 'seconds')

# Proportion of 1-star reviews
print("Proportion of 1-star reviews: ", end='')
t0 = time.time()
share_1star_gb = reviews.groupby('product_asin').apply(lambda x : sum(x['review_rating'] == 1.0) / x['review_date'].count())
products_df['share_1star'] = [share_1star_gb[i] for i in products_df.product_asin]
print(time.time() - t0, 'seconds')

# Proportion of 5-star reviews
print("Proportion of 5-star reviews: ", end='')
t0 = time.time()
share_5star_gb = reviews.groupby('product_asin').apply(lambda x : sum(x['review_rating'] == 5.0) / x['review_date'].count())
products_df['share_5star'] = [share_5star_gb[i] for i in products_df.product_asin]
print(time.time() - t0, 'seconds')


# Mean review rating
print("Mean review rating: ", end='')
t0 = time.time()
avg_rev_rate_gb = reviews.groupby('product_asin')['review_rating'].mean()
products_df['avg_review_rating'] = [avg_rev_rate_gb[i] for i in products_df.product_asin]
print(time.time() - t0, 'seconds')

# Variation in review rating
print("Variation in review rating: ", end='')
t0 = time.time()
sd_rev_rate_gb = reviews.groupby('product_asin')['review_rating'].std()
products_df['stdev_review_rating'] = [sd_rev_rate_gb[i] for i in products_df.product_asin]
products_df.loc[products_df['stdev_review_rating'].isnull(), ['stdev_review_rating']] = 0.0
print(time.time() - t0, 'seconds')

# Min in review rating
print("Min in review rating: ", end='')
t0 = time.time()
min_rev_rate_gb = reviews.groupby('product_asin')['review_rating'].min()
products_df['min_review_rating'] = [min_rev_rate_gb[i] for i in products_df.product_asin]
print(time.time() - t0, 'seconds')

# Max in review rating
print("Max in review rating: ", end='')
t0 = time.time()
max_rev_rate_gb = reviews.groupby('product_asin')['review_rating'].max()
products_df['max_review_rating'] = [max_rev_rate_gb[i] for i in products_df.product_asin]
print(time.time() - t0, 'seconds')


# Mean review length
print("Mean review length: ", end='')
t0 = time.time()
avg_rev_length_gb = reviews.groupby('product_asin')['len_review_text'].mean()
products_df['avg_review_length'] = [avg_rev_length_gb[i] for i in products_df.product_asin]
print(time.time() - t0, 'seconds')

# Variation in review length
print("Variation in review length: ", end='')
t0 = time.time()
sd_rev_length_gb = reviews.groupby('product_asin')['len_review_text'].std()
products_df['stdev_review_length'] = [sd_rev_length_gb[i] for i in products_df.product_asin]
products_df.loc[products_df['stdev_review_length'].isnull(), ['stdev_review_length']] = 0.0
print(time.time() - t0, 'seconds')

# Min in review length
print("Min in review length: ", end='')
t0 = time.time()
min_rev_length_gb = reviews.groupby('product_asin')['len_review_text'].min()
products_df['min_review_length'] = [min_rev_length_gb[i] for i in products_df.product_asin]
print(time.time() - t0, 'seconds')

# Max in review length
print("Max in review length: ", end='')
t0 = time.time()
max_rev_length_gb = reviews.groupby('product_asin')['len_review_text'].max()
products_df['max_review_length'] = [max_rev_length_gb[i] for i in products_df.product_asin]
print(time.time() - t0, 'seconds')


# Mean days between reviews
print("Mean days between reviews: ", end='')
t0 = time.time()
avg_days_bw_revs_gb = reviews.groupby('product_asin').apply(lambda x : x['review_date'].sort_values().diff().mean())
products_df['avg_days_between_reviews'] = [avg_days_bw_revs_gb[i] for i in products_df.product_asin]
products_df['avg_days_between_reviews'] = products_df['avg_days_between_reviews'] / datetime.timedelta(days=1)
products_df.loc[products_df['avg_days_between_reviews'].isnull(), ['avg_days_between_reviews']] = 0.0
print(time.time() - t0, 'seconds')

# Variation of time between reviews
print("Variation of time between reviews: ", end='')
t0 = time.time()
sd_days_bw_revs_gb = reviews.groupby('product_asin').apply(lambda x : x['review_date'].sort_values().diff().std())
products_df['stdev_days_between_reviews'] = [sd_days_bw_revs_gb[i] for i in products_df.product_asin]
products_df['stdev_days_between_reviews'] = products_df['stdev_days_between_reviews'] / datetime.timedelta(days=1)
products_df.loc[products_df['stdev_days_between_reviews'].isnull(), ['stdev_days_between_reviews']] = 0.0
print(time.time() - t0, 'seconds')

# Min of time between reviews
print("Min of time between reviews: ", end='')
t0 = time.time()
min_days_bw_revs_gb = reviews.groupby('product_asin').apply(lambda x : x['review_date'].sort_values().diff().min())
products_df['min_days_between_reviews'] = [min_days_bw_revs_gb[i] for i in products_df.product_asin]
products_df['min_days_between_reviews'] = products_df['min_days_between_reviews'] / datetime.timedelta(days=1)
products_df.loc[products_df['min_days_between_reviews'].isnull(), ['min_days_between_reviews']] = 0.0
print(time.time() - t0, 'seconds')

# Max of time between reviews
print("Max of time between reviews: ", end='')
t0 = time.time()
max_days_bw_revs_gb = reviews.groupby('product_asin').apply(lambda x : x['review_date'].sort_values().diff().max())
products_df['max_days_between_reviews'] = [max_days_bw_revs_gb[i] for i in products_df.product_asin]
products_df['max_days_between_reviews'] = products_df['max_days_between_reviews'] / datetime.timedelta(days=1)
products_df.loc[products_df['max_days_between_reviews'].isnull(), ['max_days_between_reviews']] = 0.0
print(time.time() - t0, 'seconds')


# Getting month post frequencies
print("Getting month post frequencies: ", end='')
t0 = time.time()
out = reviews.groupby('product_asin').apply(lambda x: get_month_freq(x))
out = pd.DataFrame({'months': out})
out = pd.concat([out.drop(['months'], axis=1), out['months'].apply(pd.Series)], axis=1)
out.fillna(0.0, inplace=True)
products_df = products_df.merge(out, left_on='product_asin', right_index=True)
print(time.time() - t0, 'seconds')


products_df

Number of reviews: 0.08234763145446777 seconds
Proportion of reviews with photos: 0.07711291313171387 seconds
Proportion of reviews with helpful votes: 0.07714653015136719 seconds
Proportion of 1-star reviews: 1.4356088638305664 seconds
Proportion of 5-star reviews: 1.425964593887329 seconds
Mean review rating: 0.07457232475280762 seconds
Variation in review rating: 0.0776064395904541 seconds
Min in review rating: 0.07725811004638672 seconds
Max in review rating: 0.08048725128173828 seconds
Mean review length: 0.13979172706604004 seconds
Variation in review length: 0.08522367477416992 seconds
Min in review length: 0.07723379135131836 seconds
Max in review length: 0.07600593566894531 seconds
Mean days between reviews: 2.1124801635742188 seconds
Variation of time between reviews: 2.3922293186187744 seconds
Min of time between reviews: 2.031331777572632 seconds
Max of time between reviews: 2.041660785675049 seconds
Getting month post frequencies: 3.2796502113342285 seconds


Unnamed: 0,product_asin,n_of_reviews,share_photos,share_helpful,share_1star,share_5star,avg_review_rating,stdev_review_rating,min_review_rating,max_review_rating,...,3,4,5,6,7,8,9,10,11,12
0,B07K7MSV25,1265,0.126482,0.104348,0.101976,0.662451,4.192885,1.343331,1.0,5.0,...,0.050593,0.077470,0.120158,0.111462,0.132016,0.110672,0.081423,0.069565,0.040316,0.065613
1,B07H3PDWVZ,286,0.080420,0.199301,0.139860,0.583916,3.961538,1.485104,1.0,5.0,...,0.076923,0.080420,0.069930,0.104895,0.104895,0.059441,0.083916,0.118881,0.083916,0.083916
2,B07Z3FG46V,1519,0.201448,0.094799,0.150099,0.698486,4.105991,1.512342,1.0,5.0,...,0.144174,0.103357,0.174457,0.090849,0.079658,0.046083,0.032258,0.088874,0.023700,0.042791
3,B07RT5DZ2Y,131,0.038168,0.091603,0.091603,0.564885,4.076336,1.316279,1.0,5.0,...,0.068702,0.022901,0.053435,0.007634,0.030534,0.053435,0.061069,0.083969,0.106870,0.183206
4,B07WHGZLK8,671,0.187779,0.156483,0.101341,0.766021,4.323398,1.348754,1.0,5.0,...,0.053651,0.068554,0.119225,0.099851,0.129657,0.084948,0.108793,0.144560,0.032787,0.058122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3403,B0881NL6HH,1,0.000000,0.000000,0.000000,1.000000,5.000000,0.000000,5.0,5.0,...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3404,B082F65KF8,2,0.000000,0.500000,0.500000,0.000000,2.500000,2.121320,1.0,4.0,...,0.000000,0.000000,0.500000,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3405,B0822QT8PG,2,0.500000,0.500000,0.500000,0.500000,3.000000,2.828427,1.0,5.0,...,0.500000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000
3406,B07L5P4HPR,1,0.000000,0.000000,0.000000,1.000000,5.000000,0.000000,5.0,5.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [23]:
products_df.to_csv(path + 'clean/processed_products.csv', index=False)

In [19]:
################################# DATA (product labels) ###################################
product_labels_df = pd.DataFrame({"product_asin": reviews.product_asin.unique()})
product_labels_df = product_labels_df.merge(reviews[['product_asin', 'fake_asin']], left_on='product_asin', right_on='product_asin', how='left').drop_duplicates(ignore_index=True)
product_labels_df

Unnamed: 0,product_asin,fake_asin
0,B07K7MSV25,0
1,B07H3PDWVZ,0
2,B07Z3FG46V,1
3,B07RT5DZ2Y,0
4,B07WHGZLK8,1
...,...,...
3403,B0881NL6HH,1
3404,B082F65KF8,1
3405,B0822QT8PG,1
3406,B07L5P4HPR,0


In [20]:
np.save(path + 'labels-product.npy', product_labels_df.drop(columns=['product_asin']).to_numpy())

In [21]:
################################# DATA (product train test split) ###################################
product_train_test_dict = pd.read_pickle(path + 'raw/train_test_dict_products.pkl')
product_train_test_df = pd.DataFrame({"product_asin": reviews.product_asin.unique()})
product_train_test_df['is_train'] = product_train_test_df.product_asin.map(product_train_test_dict)
product_train_test_df.loc[product_train_test_df.is_train == 'train', 'is_train'] = 1
product_train_test_df.loc[product_train_test_df.is_train == 'test', 'is_train'] = 0
product_train_test_df

Unnamed: 0,product_asin,is_train
0,B07K7MSV25,1
1,B07H3PDWVZ,0
2,B07Z3FG46V,1
3,B07RT5DZ2Y,1
4,B07WHGZLK8,0
...,...,...
3403,B0881NL6HH,0
3404,B082F65KF8,1
3405,B0822QT8PG,1
3406,B07L5P4HPR,0


In [22]:
product_train_test_df.to_csv(path + 'product_train_test.csv', index=False)

In [23]:
################################# DATA (reviewers) ###################################
reviewers_df = pd.DataFrame({"reviewer_id": reviews.reviewer_id.unique()})


# Number of reviews
print("Number of reviews: ", end='')
t0 = time.time()
n_reviews_gb = reviews.groupby('reviewer_id')['review_date'].count()
reviewers_df['n_of_reviews'] = [n_reviews_gb[i] for i in reviewers_df.reviewer_id]
reviewers_df['n_of_reviews'] = reviewers_df['n_of_reviews'].astype(float)
print(time.time() - t0, 'seconds')


# Proportion of reviews with photos
print("Proportion of reviews with photos: ", end='')
t0 = time.time()
share_n_photos_gb = reviews.groupby('reviewer_id')['has_photos'].mean()
reviewers_df['share_photos'] = [share_n_photos_gb[i] for i in reviewers_df.reviewer_id]
print(time.time() - t0, 'seconds')

# Proportion of reviews with helpful votes
print("Proportion of reviews with helpful votes: ", end='')
t0 = time.time()
share_n_helpful_gb = reviews.groupby('reviewer_id')['has_helpful'].mean()
reviewers_df['share_helpful'] = [share_n_helpful_gb[i] for i in reviewers_df.reviewer_id]
print(time.time() - t0, 'seconds')

# Proportion of 1-star reviews
print("Proportion of 1-star reviews: ", end='')
t0 = time.time()
share_1star_gb = reviews.groupby('reviewer_id').apply(lambda x : sum(x['review_rating'] == 1.0) / x['review_date'].count())
reviewers_df['share_1star'] = [share_1star_gb[i] for i in reviewers_df.reviewer_id]
print(time.time() - t0, 'seconds')

# Proportion of 5-star reviews
print("Proportion of 5-star reviews: ", end='')
t0 = time.time()
share_5star_gb = reviews.groupby('reviewer_id').apply(lambda x : sum(x['review_rating'] == 5.0) / x['review_date'].count())
reviewers_df['share_5star'] = [share_5star_gb[i] for i in reviewers_df.reviewer_id]
print(time.time() - t0, 'seconds')


# Mean review rating
print("Mean review rating: ", end='')
t0 = time.time()
avg_rev_rate_gb = reviews.groupby('reviewer_id')['review_rating'].mean()
reviewers_df['avg_review_rating'] = [avg_rev_rate_gb[i] for i in reviewers_df.reviewer_id]
print(time.time() - t0, 'seconds')

# Variation in review rating
print("Variation in review rating: ", end='')
t0 = time.time()
sd_rev_rate_gb = reviews.groupby('reviewer_id')['review_rating'].std()
reviewers_df['stdev_review_rating'] = [sd_rev_rate_gb[i] for i in reviewers_df.reviewer_id]
reviewers_df.loc[reviewers_df['stdev_review_rating'].isnull(), ['stdev_review_rating']] = 0.0
print(time.time() - t0, 'seconds')

# Min in review rating
print("Min in review rating: ", end='')
t0 = time.time()
min_rev_rate_gb = reviews.groupby('reviewer_id')['review_rating'].min()
reviewers_df['min_review_rating'] = [min_rev_rate_gb[i] for i in reviewers_df.reviewer_id]
print(time.time() - t0, 'seconds')

# Max in review rating
print("Max in review rating: ", end='')
t0 = time.time()
max_rev_rate_gb = reviews.groupby('reviewer_id')['review_rating'].max()
reviewers_df['max_review_rating'] = [max_rev_rate_gb[i] for i in reviewers_df.reviewer_id]
print(time.time() - t0, 'seconds')


# Mean review length
print("Mean review length: ", end='')
t0 = time.time()
avg_rev_length_gb = reviews.groupby('reviewer_id')['len_review_text'].mean()
reviewers_df['avg_review_length'] = [avg_rev_length_gb[i] for i in reviewers_df.reviewer_id]
print(time.time() - t0, 'seconds')

# Variation in review length
print("Variation in review length: ", end='')
t0 = time.time()
sd_rev_length_gb = reviews.groupby('reviewer_id')['len_review_text'].std()
reviewers_df['stdev_review_length'] = [sd_rev_length_gb[i] for i in reviewers_df.reviewer_id]
reviewers_df.loc[reviewers_df['stdev_review_length'].isnull(), ['stdev_review_length']] = 0.0
print(time.time() - t0, 'seconds')

# Min in review length
print("Min in review length: ", end='')
t0 = time.time()
min_rev_length_gb = reviews.groupby('reviewer_id')['len_review_text'].min()
reviewers_df['min_review_length'] = [min_rev_length_gb[i] for i in reviewers_df.reviewer_id]
print(time.time() - t0, 'seconds')

# Max in review length
print("Max in review length: ", end='')
t0 = time.time()
max_rev_length_gb = reviews.groupby('reviewer_id')['len_review_text'].max()
reviewers_df['max_review_length'] = [max_rev_length_gb[i] for i in reviewers_df.reviewer_id]
print(time.time() - t0, 'seconds')


# Mean days between reviews
print("Mean days between reviews: ", end='')
t0 = time.time()
avg_days_bw_revs_gb = reviews.groupby('reviewer_id').apply(lambda x : x['review_date'].sort_values().diff().mean())
reviewers_df['avg_days_between_reviews'] = [avg_days_bw_revs_gb[i] for i in reviewers_df.reviewer_id]
reviewers_df['avg_days_between_reviews'] = reviewers_df['avg_days_between_reviews'] / datetime.timedelta(days=1)
reviewers_df.loc[reviewers_df['avg_days_between_reviews'].isnull(), ['avg_days_between_reviews']] = 0.0
print(time.time() - t0, 'seconds')

# Variation of time between reviews
print("Variation of time between reviews: ", end='')
t0 = time.time()
sd_days_bw_revs_gb = reviews.groupby('reviewer_id').apply(lambda x : x['review_date'].sort_values().diff().std())
reviewers_df['stdev_days_between_reviews'] = [sd_days_bw_revs_gb[i] for i in reviewers_df.reviewer_id]
reviewers_df['stdev_days_between_reviews'] = reviewers_df['stdev_days_between_reviews'] / datetime.timedelta(days=1)
reviewers_df.loc[reviewers_df['stdev_days_between_reviews'].isnull(), ['stdev_days_between_reviews']] = 0.0
print(time.time() - t0, 'seconds')

# Min of time between reviews
print("Min of time between reviews: ", end='')
t0 = time.time()
min_days_bw_revs_gb = reviews.groupby('reviewer_id').apply(lambda x : x['review_date'].sort_values().diff().min())
reviewers_df['min_days_between_reviews'] = [min_days_bw_revs_gb[i] for i in reviewers_df.reviewer_id]
reviewers_df['min_days_between_reviews'] = reviewers_df['min_days_between_reviews'] / datetime.timedelta(days=1)
reviewers_df.loc[reviewers_df['min_days_between_reviews'].isnull(), ['min_days_between_reviews']] = 0.0
print(time.time() - t0, 'seconds')

# Max of time between reviews
print("Max of time between reviews: ", end='')
t0 = time.time()
max_days_bw_revs_gb = reviews.groupby('reviewer_id').apply(lambda x : x['review_date'].sort_values().diff().max())
reviewers_df['max_days_between_reviews'] = [max_days_bw_revs_gb[i] for i in reviewers_df.reviewer_id]
reviewers_df['max_days_between_reviews'] = reviewers_df['max_days_between_reviews'] / datetime.timedelta(days=1)
reviewers_df.loc[reviewers_df['max_days_between_reviews'].isnull(), ['max_days_between_reviews']] = 0.0
print(time.time() - t0, 'seconds')


# Getting category post frequencies
print("Getting category post frequencies: ", end='')
t0 = time.time()
out = reviews.groupby('reviewer_id').apply(lambda x: get_category_freq(x))
out = pd.DataFrame({'categories': out})
out = pd.concat([out.drop(['categories'], axis=1), out['categories'].apply(pd.Series)], axis=1)
out.fillna(0.0, inplace=True)
reviewers_df = reviewers_df.merge(out, left_on='reviewer_id', right_index=True)
print(time.time() - t0, 'seconds')


reviewers_df

Number of reviews: 8.455201625823975 seconds
Proportion of reviews with photos: 8.66696572303772 seconds
Proportion of reviews with helpful votes: 8.537378072738647 seconds
Proportion of 1-star reviews: 203.23395419120789 seconds
Proportion of 5-star reviews: 200.44415616989136 seconds
Mean review rating: 8.164793252944946 seconds
Variation in review rating: 8.10870099067688 seconds
Min in review rating: 8.083131074905396 seconds
Max in review rating: 8.133288145065308 seconds
Mean review length: 8.096458911895752 seconds
Variation in review length: 8.155420541763306 seconds
Min in review length: 8.12654447555542 seconds
Max in review length: 8.077555418014526 seconds
Mean days between reviews: 437.6379425525665 seconds
Variation of time between reviews: 534.6954321861267 seconds
Min of time between reviews: 412.288871049881 seconds
Max of time between reviews: 409.38910484313965 seconds
Getting category post frequencies: 810.1958801746368 seconds


Unnamed: 0,reviewer_id,n_of_reviews,share_photos,share_helpful,share_1star,share_5star,avg_review_rating,stdev_review_rating,min_review_rating,max_review_rating,...,Other,Home Improvement,Kids and Baby,Office and Industry,"Toys, Instruments, and Games","Clothing, Shoes, and Jewelry",Sports and Outdoors,Automotive,Arts and Crafts,Pets
0,723414,1.0,0.0,0.0,1.0,0.0,1.0,0.000000,1.0,1.0,...,1.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0
1,1034254,2.0,0.5,0.5,0.0,1.0,5.0,0.000000,5.0,5.0,...,0.0,0.0,0.000,0.0,0.0,0.0,0.5,0.000,0.0,0.5
2,1027740,5.0,0.4,0.0,0.2,0.8,4.2,1.788854,1.0,5.0,...,0.0,0.6,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0
3,146762,1.0,0.0,0.0,0.0,1.0,5.0,0.000000,5.0,5.0,...,0.0,0.0,0.000,0.0,0.0,0.0,1.0,0.000,0.0,0.0
4,627419,8.0,0.0,0.0,0.0,1.0,5.0,0.000000,5.0,5.0,...,0.0,0.5,0.125,0.0,0.0,0.0,0.0,0.125,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1281354,844045,1.0,0.0,0.0,0.0,1.0,5.0,0.000000,5.0,5.0,...,0.0,1.0,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0
1281355,921326,1.0,0.0,0.0,0.0,1.0,5.0,0.000000,5.0,5.0,...,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0
1281356,210516,1.0,0.0,0.0,0.0,1.0,5.0,0.000000,5.0,5.0,...,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0
1281357,500627,1.0,0.0,0.0,0.0,1.0,5.0,0.000000,5.0,5.0,...,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0


In [39]:
reviewers_df.set_index('reviewer_id', inplace=True)
reviewers_df.sort_index(inplace=True)
reviewers_df

Unnamed: 0_level_0,n_of_reviews,share_photos,share_helpful,share_1star,share_5star,avg_review_rating,stdev_review_rating,min_review_rating,max_review_rating,avg_review_length,...,Other,Home Improvement,Kids and Baby,Office and Industry,"Toys, Instruments, and Games","Clothing, Shoes, and Jewelry",Sports and Outdoors,Automotive,Arts and Crafts,Pets
reviewer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,1.0,5.0,0.0,5.0,5.0,28.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,5.0,0.0,5.0,5.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,5.0,0.0,5.0,5.0,58.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,0.0,0.5,0.0,1.0,5.0,0.0,5.0,5.0,47.5,...,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,11.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1281354,1.0,0.0,0.0,0.0,1.0,5.0,0.0,5.0,5.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1281355,1.0,0.0,0.0,0.0,1.0,5.0,0.0,5.0,5.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1281356,1.0,0.0,1.0,0.0,1.0,5.0,0.0,5.0,5.0,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1281357,1.0,1.0,0.0,0.0,1.0,5.0,0.0,5.0,5.0,28.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
reviewers_df.to_csv(path + 'clean/processed_reviewers.csv', index=False)

In [48]:
################################# DATA (reviewer labels) ###################################
reviewer_labels_df = reviewers_df.copy()
reviewer_labels_df['fake_asin'] = 0
reviewer_labels_df = reviewer_labels_df[['fake_asin']]
reviewer_labels_df

Unnamed: 0_level_0,fake_asin
reviewer_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0
...,...
1281354,0
1281355,0
1281356,0
1281357,0


In [49]:
np.save(path + 'labels-reviewer.npy', reviewer_labels_df.to_numpy())

In [61]:
################################# DATA (reviewer train test split) ###################################
def get_train_test_split(length, percentage):
    # ones = train
    # zeros = test
    # You define the amount of ones
    trains = np.ones(round(length*percentage))
    tests = np.zeros(round(length*(1-percentage)))
    result = np.concatenate((trains, tests), axis=None)
    
    # Set shuffle stage
    np.random.seed(42)
    np.random.shuffle(result)
    
    return result


reviewer_train_test_dict = reviewers_df.copy()
reviewer_train_test_dict['is_train'] = get_train_test_split(reviewers_df.shape[0], 0.7)
reviewer_train_test_dict = reviewer_train_test_dict[['is_train']]
reviewer_train_test_dict

Unnamed: 0_level_0,is_train
reviewer_id,Unnamed: 1_level_1
0,1.0
1,1.0
2,0.0
3,1.0
4,1.0
...,...
1281354,1.0
1281355,1.0
1281356,1.0
1281357,1.0


In [62]:
reviewer_train_test_dict.to_csv(path + 'reviewer_train_test.csv', index=False)