-
Notifications
You must be signed in to change notification settings - Fork 1
/
rating_product_sorting_reviews_in_amazon.py
239 lines (183 loc) · 10.7 KB
/
rating_product_sorting_reviews_in_amazon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
"""
Author : Mustafa Gürkan Çanakçi
LinkedIn : https://www.linkedin.com/in/mgurkanc/
"""
# Project Name : Rating Product & Sorting Reviews in Amazon
#############################################################################################################
# BUSINESS PROBLEM #
#############################################################################################################
# In e-commerce, one of the most important problems is correctly calculating the ratings given to products after
# the sale.
# Solving this problem brings the following benefits:
# * Providing greater customer satisfaction for the e-commerce site
# * Bringing a product into prominence for the sellers
# * Unproblematic shopping experience for the buyers
# Another issue is the correct ordering of the comments given to the products. The misleading comments will directly
# affect the sale of the product so it will cause both financial loss and loss of customers. The solution of these 2
# basic situations will increase the sales of the e-commerce site and the sellers, while the customers will complete
# the purchasing journey without any problems.
#############################################################################################################
# THE STORY OF DATASET #
#############################################################################################################
# reviewerID : User ID
# asin : Product ID
# reviewerName : Username
# helpful : Useful rating
# reviewText : Assessment
# overall : Product Rating
# summary : Evaluation summary
# unixReviewTime : Evaluation time
# reviewTime : Evaluation time raw
# day_diff : Number of days since evaluation
# helpful_yes : Number of useful votes
# total_vote : Number of votes
#############################################################################################################
# Mission 1 : Calculate the "Average Rating" weighted toward recent reviews and compare it with the existing average rating.
#############################################################################################################
import pandas as pd
import math
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler
# Console display settings: show every column and row, keep wide frames on one
# line, and print floats with five decimal places.
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.width', 500),
    ('display.expand_frame_repr', False),
    ('display.float_format', lambda number: '%.5f' % number),
):
    pd.set_option(_option, _value)
# Import dataset
# Load the review data into the module-level frame `df`, which every later step reads and mutates.
# NOTE(review): this is an absolute, machine-specific Windows path — adjust it before running elsewhere.
df = pd.read_csv("C:/Users/mgurk/PycharmProjects/pythonProject1/datasets/amazon_review.csv")
# Preview the first rows (the value is only displayed when run interactively).
df.head()
# Plain, unweighted mean of the product ratings; the baseline that Mission 1 compares against.
df["overall"].mean()
# Out[34]: 4.587589013224822
# Print column names, non-null counts and dtypes.
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 4915 entries, 0 to 4914
# Data columns (total 12 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 reviewerID 4915 non-null object
# 1 asin 4915 non-null object
# 2 reviewerName 4914 non-null object
# 3 helpful 4915 non-null object
# 4 reviewText 4914 non-null object
# 5 overall 4915 non-null float64
# 6 summary 4915 non-null object
# 7 unixReviewTime 4915 non-null int64
# 8 reviewTime 4915 non-null object
# 9 day_diff 4915 non-null int64
# 10 helpful_yes 4915 non-null int64
# 11 total_vote 4915 non-null int64
# dtypes: float64(1), int64(4), object(7)
# memory usage: 460.9+ KB
# Convert some variables to date format
# Parse the raw review dates into pandas datetimes so they support date arithmetic.
df["reviewTime"] = pd.to_datetime(df["reviewTime"])
# Fixed reference date that plays the role of "today" in this analysis.
current_date =pd.to_datetime('2014-12-10')
# Age of every review in whole days, measured back from the reference date.
df["days"] = (current_date - df["reviewTime"]).dt.days
df.head()
# Numerical analysis of the "days" variable: summary statistics at the listed percentiles
df["days"].describe([0.10, 0.25, 0.50, 0.75, 0.80, 0.90, 0.95, 0.99]).T
# Out[67]:
# count 4915.00000
# mean 439.36704
# std 209.43987
# min 3.00000
# 10% 169.00000
# 25% 283.00000
# 50% 433.00000
# 75% 603.00000
# 80% 640.00000
# 90% 710.00000
# 95% 750.00000
# 99% 945.00000
# max 1066.00000
# Name: days, dtype: float64
# Quartile cut points of review age ("days"); they split the reviews into four recency segments.
q1 = df["days"].quantile(0.25)
q2 = df["days"].quantile(0.50)
q3 = df["days"].quantile(0.75)
# Mean rating of the most recent segment (days <= q1).
df.loc[df["days"]<= q1,"overall"].mean()
# Out[30]: 4.6957928802588995
# Mean rating of the second segment (q1 < days <= q2).
df.loc[(df["days"] > q1) & (df["days"] <= q2) ,"overall"].mean()
# Out[31]: 4.636140637775961
# Mean rating of the third segment (q2 < days <= q3).
df.loc[(df["days"] > q2) & (df["days"] <= q3) ,"overall"].mean()
# Out[32]: 4.571661237785016
# Mean rating of the oldest segment (days > q3).
df.loc[df["days"] > q3,"overall"].mean()
# Out[33]: 4.4462540716612375
# Definition of Time-Based Weighted Average Function
def time_based_weighted_average(dataframe, w1, w2, w3, w4):
    """Return the mean rating weighted by review recency.

    The rows of ``dataframe`` are split into four segments at the quartiles of
    its ``days`` column. The mean of ``overall`` inside each segment is
    multiplied by that segment's weight and the four products are summed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a numeric ``days`` column and an ``overall`` column.
    w1, w2, w3, w4 : float
        Percentage weights for the segments, ordered from the smallest
        ``days`` values (``days <= q1``) to the largest (``days > q3``).
        Each weight is divided by 100 and the weights are not normalised,
        so they are expected to sum to 100.

    Returns
    -------
    float
        The weighted average. The result is NaN when a segment has no rows,
        because the mean of an empty selection is NaN.
    """
    days = dataframe["days"]
    ratings = dataframe["overall"]

    # Derive the cut points from the frame that was passed in instead of
    # module-level state, so the function works for any frame.
    q1 = days.quantile(0.25)
    q2 = days.quantile(0.50)
    q3 = days.quantile(0.75)

    return (
        ratings[days <= q1].mean() * w1 / 100
        + ratings[(days > q1) & (days <= q2)].mean() * w2 / 100
        + ratings[(days > q2) & (days <= q3)].mean() * w3 / 100
        + ratings[days > q3].mean() * w4 / 100
    )
# The weights are percentages for the newest-to-oldest segments (30 + 26 + 24 + 20 = 100), so recent reviews count more.
time_based_weighted_average(df,30,26,24,20)
# Out[39]: 4.600583941300071
#############################################################################################################
# Mission 2 : Specify 20 reviews for the product to be displayed on the product detail page.
#############################################################################################################
df.head()
# The number of unhelpful votes
df["helpful_no"] = df["total_vote"] - df["helpful_yes"]
# Keep only the columns needed to rank the reviews. The explicit .copy() makes the
# result an independent frame, so adding score columns to it later does not raise
# pandas' SettingWithCopyWarning.
df = df[["reviewerID","overall","reviewTime","day_diff","total_vote","helpful_yes","helpful_no"]].copy()
df.head(10)
# Define score_pos_neg_diff, score_average_rating and wilson_lower_bound functions
def score_pos_neg_diff(up, down):
    """Return the net vote score: positive votes minus negative votes.

    Parameters
    ----------
    up : int
        Number of positive votes.
    down : int
        Number of negative votes.

    Returns
    -------
    int
        ``up - down``. The value depends only on the difference, so the share
        of positive votes is not reflected (600 up / 400 down scores 200,
        while 10 up / 0 down scores 10).
    """
    return up - down
def score_average_rating(up, down):
    """Return the share of positive votes, ``up / (up + down)``.

    Parameters
    ----------
    up : int
        Number of positive votes.
    down : int
        Number of negative votes.

    Returns
    -------
    float
        Ratio of positive votes to all votes, or 0 when there are no votes
        at all (which also avoids a division by zero).
    """
    total = up + down
    if total == 0:
        return 0
    return up / total
def wilson_lower_bound(up, down, confidence=0.95):
    """
    Compute the Wilson Lower Bound score.

    - The lower bound of the confidence interval computed for the Bernoulli
      parameter p is taken as the WLB score.
    - The resulting score is used for product ranking.
    - Note:
      If the scores are on a 1-5 scale, 1-3 can be labelled negative and 4-5
      positive so that the data fits the Bernoulli setting. That brings some
      problems with it, so a Bayesian average rating should be used in that
      case.

    Parameters
    ----------
    up: int
        up count
    down: int
        down count
    confidence: float
        confidence

    Returns
    -------
    wilson score: float
        0 when there are no votes at all.
    """
    n = up + down
    if n == 0:
        return 0
    # Two-sided critical value of the standard normal for the given confidence.
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    # Observed share of positive votes.
    phat = 1.0 * up / n
    return (phat + z * z / (2 * n) - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)
# Calculate new scores by these functions and create new variables
# Score every review row by row (axis=1) from its helpful / unhelpful vote counts.
df["score_pos_neg_diff"] = df.apply(lambda x: score_pos_neg_diff(x["helpful_yes"], x["helpful_no"]), axis=1)
df["score_average_rating"] = df.apply(lambda x: score_average_rating(x["helpful_yes"], x["helpful_no"]), axis=1)
df["wilson_lower_bound"] = df.apply(lambda x: wilson_lower_bound(x["helpful_yes"], x["helpful_no"]), axis=1)
df.head()
# Show the 20 reviews with the highest "wilson_lower_bound" score
df.sort_values("wilson_lower_bound", ascending=False).head(20)
# Out[54]:
# reviewerID overall reviewTime day_diff total_vote helpful_yes helpful_no score_pos_neg_diff score_average_rating wilson_lower_bound
# 2031 A12B7ZMXFI6IXY 5.00000 2013-01-05 702 2020 1952 68 1884 0.96634 0.95754
# 3449 AOEAD7DPLZE53 5.00000 2012-09-26 803 1505 1428 77 1351 0.94884 0.93652
# 4212 AVBMZZAFEKO58 1.00000 2013-05-08 579 1694 1568 126 1442 0.92562 0.91214
# 317 A1ZQAQFYSXL5MQ 1.00000 2012-02-09 1033 495 422 73 349 0.85253 0.81858
# 4672 A2DKQQIZ793AV5 5.00000 2014-07-03 158 49 45 4 41 0.91837 0.80811
# 1835 A1J6VSUM80UAF8 5.00000 2014-02-28 283 68 60 8 52 0.88235 0.78465
# 3981 A1K91XXQ6ZEBQR 5.00000 2012-10-22 777 139 112 27 85 0.80576 0.73214
# 3807 AFGRMORWY2QNX 3.00000 2013-02-27 649 25 22 3 19 0.88000 0.70044
# 4306 AOHXKM5URSKAB 5.00000 2012-09-06 823 65 51 14 37 0.78462 0.67033
# 4596 A1WTQUOQ4WG9AI 1.00000 2012-09-22 807 109 82 27 55 0.75229 0.66359
# 315 A2J26NNQX6WKAU 5.00000 2012-08-13 847 48 38 10 28 0.79167 0.65741
# 1465 A6I8KXYK24RTB 4.00000 2014-04-14 238 7 7 0 7 1.00000 0.64567
# 1609 A2TPXOZSU1DACQ 5.00000 2014-03-26 257 7 7 0 7 1.00000 0.64567
# 4302 A2EL2GWJ9T0DWY 5.00000 2014-03-21 262 16 14 2 12 0.87500 0.63977
# 4072 A22GOZTFA02O2F 5.00000 2012-11-09 759 6 6 0 6 1.00000 0.60967
# 1072 A2O96COBMVY9C4 5.00000 2012-05-10 942 5 5 0 5 1.00000 0.56552
# 2583 A3MEPYZVTAV90W 5.00000 2013-08-06 489 5 5 0 5 1.00000 0.56552
# 121 A2Z4VVF1NTJWPB 5.00000 2012-05-09 943 5 5 0 5 1.00000 0.56552
# 1142 A1PLHPPAJ5MUXG 5.00000 2014-02-04 307 5 5 0 5 1.00000 0.56552
# 1753 ALPLKR59QMBUX 5.00000 2012-10-22 777 5 5 0 5 1.00000 0.56552