-
Notifications
You must be signed in to change notification settings - Fork 1
/
flo_rfm_project.py
712 lines (602 loc) · 36.1 KB
/
flo_rfm_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
# CUSTOMER SEGMENTATION WITH RFM ANALYSIS PROJECT
# BUSINESS PROBLEM:
# FLO,an online shoe store,wants to segment its customers and determine marketing strategies
# according to these segments. In this regard, the behavior of customers will be defined and
# groups will be formed according to the clustering in these behaviors.
# Story of Dataset
# The dataset shows the customers which last purchases from the FLO store on Omnichannel(both online and offline
# shopping store)in 2020-2021. However, these customers have consist of infomation from their past shopping behavior.
# master_id : Unique Customer Number
# order_channel : Which channel of the shopping platform is used (Android, ios, Desktop, Mobile))
# last_order_channel : The channel where the most recent purchase was made
# first_order_date : The customer's first purchase date
# last_order_date : The customer's last purchase date
# last_order_date_online : The customer's last purchase date in online shopping platform
# last_order_date_offline : The customer's last purchase date in offline shopping platform
# order_num_total_ever_online : The customer's total purchases in online shopping platform
# order_num_total_ever_offline : The customer's total purchases in offline shopping platform
# customer_value_total_ever_offline : The total expenditure by customer in offline shopping platform
# customer_value_total_ever_online : The total expenditure by customer in online shopping platform
# interested_in_categories_12 : List of categories the customer has shopped in the last 12 months
##############################################################################################
# MISSION 1 : DATA UNDERSTANDING AND PREPARATION #
##############################################################################################
# STEP 1 - READ DATASET
import datetime as dt
import numpy as np
import pandas as pd
pd.set_option('display.max_columns',1000)
# pd.set_option('display_max_rows',None)
pd.set_option('display.float_format',lambda x : '%.3f' % x)
df_= pd.read_csv("/Users/mgurk/PycharmProjects/pythonProject1/datasets/flo_data_20k.csv")
df = df_.copy()
# STEP 2
df.head(10)
# Out[79]:
# master_id order_channel last_order_channel \
# 0 cc294636-19f0-11eb-8d74-000d3a38a36f Android App Offline
# 1 f431bd5a-ab7b-11e9-a2fc-000d3a38a36f Android App Mobile
# 2 69b69676-1a40-11ea-941b-000d3a38a36f Android App Android App
# 3 1854e56c-491f-11eb-806e-000d3a38a36f Android App Android App
# 4 d6ea1074-f1f5-11e9-9346-000d3a38a36f Desktop Desktop
# 5 e585280e-aae1-11e9-a2fc-000d3a38a36f Desktop Offline
# 6 c445e4ee-6242-11ea-9d1a-000d3a38a36f Android App Android App
# 7 3f1b4dc8-8a7d-11ea-8ec0-000d3a38a36f Mobile Offline
# 8 cfbda69e-5b4f-11ea-aca7-000d3a38a36f Android App Android App
# 9 1143f032-440d-11ea-8b43-000d3a38a36f Mobile Mobile
# first_order_date last_order_date last_order_date_online \
# 0 2020-10-30 2021-02-26 2021-02-21
# 1 2017-02-08 2021-02-16 2021-02-16
# 2 2019-11-27 2020-11-27 2020-11-27
# 3 2021-01-06 2021-01-17 2021-01-17
# 4 2019-08-03 2021-03-07 2021-03-07
# 5 2018-11-18 2021-03-13 2018-11-18
# 6 2020-03-04 2020-10-18 2020-10-18
# 7 2020-05-15 2020-08-12 2020-05-15
# 8 2020-01-23 2021-03-07 2021-03-07
# 9 2019-07-30 2020-10-04 2020-10-04
# last_order_date_offline order_num_total_ever_online \
# 0 2021-02-26 4.000
# 1 2020-01-10 19.000
# 2 2019-12-01 3.000
# 3 2021-01-06 1.000
# 4 2019-08-03 1.000
# 5 2021-03-13 1.000
# 6 2020-03-04 3.000
# 7 2020-08-12 1.000
# 8 2020-01-25 3.000
# 9 2019-07-30 1.000
# order_num_total_ever_offline customer_value_total_ever_offline \
# 0 1.000 139.990
# 1 2.000 159.970
# 2 2.000 189.970
# 3 1.000 39.990
# 4 1.000 49.990
# 5 2.000 150.870
# 6 1.000 59.990
# 7 1.000 49.990
# 8 2.000 120.480
# 9 1.000 69.980
# customer_value_total_ever_online interested_in_categories_12
# 0 799.380 [KADIN]
# 1 1853.580 [ERKEK, COCUK, KADIN, AKTIFSPOR]
# 2 395.350 [ERKEK, KADIN]
# 3 81.980 [AKTIFCOCUK, COCUK]
# 4 159.990 [AKTIFSPOR]
# 5 49.990 [KADIN]
# 6 315.940 [AKTIFSPOR]
# 7 113.640 [COCUK]
# 8 934.210 [ERKEK, COCUK, KADIN]
# 9 95.980 [KADIN, AKTIFSPOR]
# Değişken İsimleri :
df.columns
# Out[80]:
# Index(['master_id', 'order_channel', 'last_order_channel', 'first_order_date',
# 'last_order_date', 'last_order_date_online', 'last_order_date_offline',
# 'order_num_total_ever_online', 'order_num_total_ever_offline',
# 'customer_value_total_ever_offline', 'customer_value_total_ever_online',
# 'interested_in_categories_12'],
# dtype='object')
df.interested_in_categories_12.value_counts()
# [AKTIFSPOR] 3464
# [KADIN] 2158
# [] 2135
# [ERKEK] 1973
# [KADIN, AKTIFSPOR] 1352
# [ERKEK, AKTIFSPOR] 1178
# [ERKEK, KADIN] 848
# [COCUK] 836
# [ERKEK, KADIN, AKTIFSPOR] 775
# [AKTIFCOCUK] 679
# [COCUK, KADIN] 443
# [AKTIFCOCUK, COCUK] 349
# [AKTIFCOCUK, AKTIFSPOR] 317
# [COCUK, AKTIFSPOR] 317
# [COCUK, KADIN, AKTIFSPOR] 241
# [AKTIFCOCUK, ERKEK, COCUK, KADIN, AKTIFSPOR] 223
# [ERKEK, COCUK] 215
# [ERKEK, COCUK, KADIN, AKTIFSPOR] 213
# [AKTIFCOCUK, COCUK, KADIN] 213
# [AKTIFCOCUK, KADIN] 210
# [ERKEK, COCUK, KADIN] 204
# [AKTIFCOCUK, COCUK, KADIN, AKTIFSPOR] 203
# [AKTIFCOCUK, COCUK, AKTIFSPOR] 202
# [AKTIFCOCUK, KADIN, AKTIFSPOR] 184
# [ERKEK, COCUK, AKTIFSPOR] 156
# [AKTIFCOCUK, ERKEK] 152
# [AKTIFCOCUK, ERKEK, AKTIFSPOR] 142
# [AKTIFCOCUK, ERKEK, KADIN, AKTIFSPOR] 132
# [AKTIFCOCUK, ERKEK, COCUK] 122
# [AKTIFCOCUK, ERKEK, COCUK, KADIN] 115
# [AKTIFCOCUK, ERKEK, COCUK, AKTIFSPOR] 105
# [AKTIFCOCUK, ERKEK, KADIN] 89
# Name: interested_in_categories_12, dtype: int64
df.order_channel.value_counts()
# Android App 9495
# Mobile 4882
# Ios App 2833
# Desktop 2735
# Name: order_channel, dtype: int64
df.nunique()
# master_id 19945
# order_channel 4
# last_order_channel 5
# first_order_date 2465
# last_order_date 366
# last_order_date_online 1743
# last_order_date_offline 738
# order_num_total_ever_online 57
# order_num_total_ever_offline 32
# customer_value_total_ever_offline 6097
# customer_value_total_ever_online 11292
# interested_in_categories_12 32
# order_num_total_online_offline 63
# customer_onoff_total_expense 16277
# Betimsel İstatistik
df.describe().T
# count mean std min 25% \
# order_num_total_ever_online 19945.000 3.111 4.226 1.000 1.000
# order_num_total_ever_offline 19945.000 1.914 2.063 1.000 1.000
# customer_value_total_ever_offline 19945.000 253.923 301.533 10.000 99.990
# customer_value_total_ever_online 19945.000 497.322 832.602 12.990 149.980
# 50% 75% max
# order_num_total_ever_online 2.000 4.000 200.000
# order_num_total_ever_offline 1.000 2.000 109.000
# customer_value_total_ever_offline 179.980 319.970 18119.140
# customer_value_total_ever_online 286.460 578.440 45220.130
# Boş Değerler
df.isnull().sum()
# Out[82]:
# master_id 0
# order_channel 0
# last_order_channel 0
# first_order_date 0
# last_order_date 0
# last_order_date_online 0
# last_order_date_offline 0
# order_num_total_ever_online 0
# order_num_total_ever_offline 0
# customer_value_total_ever_offline 0
# customer_value_total_ever_online 0
# interested_in_categories_12 0
# dtype: int64
# Değişken Tipleri
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 19945 entries, 0 to 19944
# Data columns (total 12 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 master_id 19945 non-null object
# 1 order_channel 19945 non-null object
# 2 last_order_channel 19945 non-null object
# 3 first_order_date 19945 non-null object
# 4 last_order_date 19945 non-null object
# 5 last_order_date_online 19945 non-null object
# 6 last_order_date_offline 19945 non-null object
# 7 order_num_total_ever_online 19945 non-null float64
# 8 order_num_total_ever_offline 19945 non-null float64
# 9 customer_value_total_ever_offline 19945 non-null float64
# 10 customer_value_total_ever_online 19945 non-null float64
# 11 interested_in_categories_12 19945 non-null object
# dtypes: float64(4), object(8)
# memory usage: 1.8+ MB
# Function of Checking Dataframe and Missing Values
# Summary of Dataset Variables
def missing_values_analysis(df):
na_columns_ = [col for col in df.columns if df[col].isnull().sum() > 0]
n_miss = df[na_columns_].isnull().sum().sort_values(ascending=True)
ratio_ = (df[na_columns_].isnull().sum() / df.shape[0] * 100).sort_values(ascending=True)
missing_df = pd.concat([n_miss, np.round(ratio_, 2)], axis=1, keys=['Total Missing Values', 'Ratio'])
missing_df = pd.DataFrame(missing_df)
return missing_df
def check_df(df, head=5):
print("--------------------- Shape ---------------------")
print(df.shape)
print("--------------------- Types ---------------------")
print(df.dtypes)
print("--------------------- Head ---------------------")
print(df.head(head))
print("--------------------- Missing Values Analysis ---------------------")
print(missing_values_analysis(df))
print("--------------------- Quantiles ---------------------")
print(df.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)
check_df(df, head=10)
# --------------------- Shape ---------------------
# (19945, 14)
# --------------------- Types ---------------------
# master_id object
# order_channel object
# last_order_channel object
# first_order_date datetime64[ns]
# last_order_date datetime64[ns]
# last_order_date_online datetime64[ns]
# last_order_date_offline datetime64[ns]
# order_num_total_ever_online float64
# order_num_total_ever_offline float64
# customer_value_total_ever_offline float64
# customer_value_total_ever_online float64
# interested_in_categories_12 object
# order_num_total_online_offline float64
# customer_onoff_total_expense float64
# dtype: object
# --------------------- Head ---------------------
# master_id order_channel last_order_channel \
# 0 cc294636-19f0-11eb-8d74-000d3a38a36f Android App Offline
# 1 f431bd5a-ab7b-11e9-a2fc-000d3a38a36f Android App Mobile
# 2 69b69676-1a40-11ea-941b-000d3a38a36f Android App Android App
# 3 1854e56c-491f-11eb-806e-000d3a38a36f Android App Android App
# 4 d6ea1074-f1f5-11e9-9346-000d3a38a36f Desktop Desktop
# 5 e585280e-aae1-11e9-a2fc-000d3a38a36f Desktop Offline
# 6 c445e4ee-6242-11ea-9d1a-000d3a38a36f Android App Android App
# 7 3f1b4dc8-8a7d-11ea-8ec0-000d3a38a36f Mobile Offline
# 8 cfbda69e-5b4f-11ea-aca7-000d3a38a36f Android App Android App
# 9 1143f032-440d-11ea-8b43-000d3a38a36f Mobile Mobile
# first_order_date last_order_date last_order_date_online \
# 0 2020-10-30 2021-02-26 2021-02-21
# 1 2017-02-08 2021-02-16 2021-02-16
# 2 2019-11-27 2020-11-27 2020-11-27
# 3 2021-01-06 2021-01-17 2021-01-17
# 4 2019-08-03 2021-03-07 2021-03-07
# 5 2018-11-18 2021-03-13 2018-11-18
# 6 2020-03-04 2020-10-18 2020-10-18
# 7 2020-05-15 2020-08-12 2020-05-15
# 8 2020-01-23 2021-03-07 2021-03-07
# 9 2019-07-30 2020-10-04 2020-10-04
# last_order_date_offline order_num_total_ever_online \
# 0 2021-02-26 4.000
# 1 2020-01-10 19.000
# 2 2019-12-01 3.000
# 3 2021-01-06 1.000
# 4 2019-08-03 1.000
# 5 2021-03-13 1.000
# 6 2020-03-04 3.000
# 7 2020-08-12 1.000
# 8 2020-01-25 3.000
# 9 2019-07-30 1.000
# order_num_total_ever_offline customer_value_total_ever_offline \
# 0 1.000 139.990
# 1 2.000 159.970
# 2 2.000 189.970
# 3 1.000 39.990
# 4 1.000 49.990
# 5 2.000 150.870
# 6 1.000 59.990
# 7 1.000 49.990
# 8 2.000 120.480
# 9 1.000 69.980
# customer_value_total_ever_online interested_in_categories_12 \
# 0 799.380 [KADIN]
# 1 1853.580 [ERKEK, COCUK, KADIN, AKTIFSPOR]
# 2 395.350 [ERKEK, KADIN]
# 3 81.980 [AKTIFCOCUK, COCUK]
# 4 159.990 [AKTIFSPOR]
# 5 49.990 [KADIN]
# 6 315.940 [AKTIFSPOR]
# 7 113.640 [COCUK]
# 8 934.210 [ERKEK, COCUK, KADIN]
# 9 95.980 [KADIN, AKTIFSPOR]
# order_num_total_online_offline customer_onoff_total_expense
# 0 5.000 939.370
# 1 21.000 2013.550
# 2 5.000 585.320
# 3 2.000 121.970
# 4 2.000 209.980
# 5 3.000 200.860
# 6 4.000 375.930
# 7 2.000 163.630
# 8 5.000 1054.690
# 9 2.000 165.960
# --------------------- Missing Values Analysis ---------------------
# Empty DataFrame
# Columns: [Total Missing Values, Ratio]
# Index: []
# --------------------- Quantiles ---------------------
# 0.000 0.050 0.500 0.950 0.990 \
# order_num_total_ever_online 1.000 1.000 2.000 10.000 20.000
# order_num_total_ever_offline 1.000 1.000 1.000 4.000 7.000
# customer_value_total_ever_offline 10.000 39.990 179.980 694.222 1219.947
# customer_value_total_ever_online 12.990 63.990 286.460 1556.726 3143.810
# order_num_total_online_offline 2.000 2.000 4.000 12.000 22.000
# customer_onoff_total_expense 44.980 175.480 545.270 1921.924 3606.356
# 1.000
# order_num_total_ever_online 200.000
# order_num_total_ever_offline 109.000
# customer_value_total_ever_offline 18119.140
# customer_value_total_ever_online 45220.130
# order_num_total_online_offline 202.000
# customer_onoff_total_expense 45905.100
# STEP 3 # Total Purchases Number and Total Expense for Omnichannel Customers
# Total purchases for omnichannel customers
df["order_num_total_online_offline"] = df["order_num_total_ever_online"] + df["order_num_total_ever_offline"]
# Total expense for omnichannel customers
df["customer_onoff_total_expense"] = df["customer_value_total_ever_offline"] + df["customer_value_total_ever_online"]
df.head()
# Yeni Değişkenler oluşturuldu.
# STEP 4 Review the data type for variable expressing the date
df.info()
# 3 first_order_date 19945 non-null object
# 4 last_order_date 19945 non-null object
# 5 last_order_date_online 19945 non-null object
# 6 last_order_date_offline 19945 non-null object
dates = ["first_order_date","last_order_date","last_order_date_online","last_order_date_offline"]
df[dates] = df[dates].apply(pd.to_datetime)
df["last_order_date"].max()
# Out[90]: '2021-05-30'
last_date = dt.datetime(2021,5,30)
type(last_date)
# Out[98]: datetime.datetime
today_date = dt.datetime(2011, 6, 2)
df.info()
# 3 first_order_date 19945 non-null datetime64[ns]
# 4 last_order_date 19945 non-null datetime64[ns]
# 5 last_order_date_online 19945 non-null datetime64[ns]
# 6 last_order_date_offline 19945 non-null datetime64[ns]
# STEP 5 #
# Distribution of the number of Customers in the Shopping Channels,
# the total number of products purchased and their total expenditures
df.groupby('order_channel').agg({'order_num_total_online_offline': 'sum',
'customer_onoff_total_expense':'count'}).sort_values(by="customer_onoff_total_expense",ascending=False)
# order_num_total_online_offline customer_onoff_total_expense
# order_channel
# Android App 52269.000 9495
# Mobile 21679.000 4882
# Ios App 15351.000 2833
# Desktop 10920.000 2735
# STEP 6 # Top 10 Customers with the Most Profits
df.groupby('master_id').agg({'customer_onoff_total_expense':'sum'}).\
sort_values(by="customer_onoff_total_expense",ascending=False).head(10)
# customer_onoff_total_expense
# master_id
# 5d1c466a-9cfd-11e9-9897-000d3a38a36f 45905.100
# d5ef8058-a5c6-11e9-a2fc-000d3a38a36f 36818.290
# 73fd19aa-9e37-11e9-9897-000d3a38a36f 33918.100
# 7137a5c0-7aad-11ea-8f20-000d3a38a36f 31227.410
# 47a642fe-975b-11eb-8c2a-000d3a38a36f 20706.340
# a4d534a2-5b1b-11eb-8dbd-000d3a38a36f 18443.570
# d696c654-2633-11ea-8e1c-000d3a38a36f 16918.570
# fef57ffa-aae6-11e9-a2fc-000d3a38a36f 12726.100
# cba59206-9dd1-11e9-9897-000d3a38a36f 12282.240
# fc0ce7a4-9d87-11e9-9897-000d3a38a36f 12103.150
# STEP 7 # Top 10 Customers with Most Orders
df.groupby('master_id').agg({'order_num_total_online_offline':'sum'}).\
sort_values(by="order_num_total_online_offline",ascending=False).head(10)
# order_num_total_online_offline
# master_id
# 5d1c466a-9cfd-11e9-9897-000d3a38a36f 202.000
# cba59206-9dd1-11e9-9897-000d3a38a36f 131.000
# a57f4302-b1a8-11e9-89fa-000d3a38a36f 111.000
# fdbe8304-a7ab-11e9-a2fc-000d3a38a36f 88.000
# 329968c6-a0e2-11e9-a2fc-000d3a38a36f 83.000
# 73fd19aa-9e37-11e9-9897-000d3a38a36f 82.000
# 44d032ee-a0d4-11e9-a2fc-000d3a38a36f 77.000
# b27e241a-a901-11e9-a2fc-000d3a38a36f 75.000
# d696c654-2633-11ea-8e1c-000d3a38a36f 70.000
# a4d534a2-5b1b-11eb-8dbd-000d3a38a36f 70.000
# Step 8 # Data Preparation Process Function
def data_preparation(dataframe):
dataframe["order_num_total_online_offline"] = dataframe["order_num_total_ever_online"] + dataframe["order_num_total_ever_offline"]
dataframe["customer_onoff_total_expense"] = dataframe["customer_value_total_ever_offline"] + dataframe["customer_value_total_ever_online"]
dates = ["first_order_date", "last_order_date", "last_order_date_online", "last_order_date_offline"]
dataframe[dates] = dataframe[dates].apply(pd.to_datetime)
return dataframe
data_preparation(df)
### BONUS
df.groupby("master_id")["customer_onoff_total_expense"].sum().sort_values(ascending=False).head()
total_categories_expense = df.groupby("interested_in_categories_12")["customer_onoff_total_expense"].sum().sort_values(
ascending=False).reset_index().head()
total_categories_expense.head()
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 8))
sns.barplot(data=total_categories_expense, x='interested_in_categories_12', y='customer_onoff_total_expense')
plt.show(block=True)
##############################################################################################
# MISSION 4 : CALCULATING RFM METRICS #
##############################################################################################
df["last_order_date"].max()
# Out[90]: '2021-05-30'
last_date = dt.datetime(2021,5,30)
type(last_date)
today_date = dt.datetime(2021, 6, 2)
rfm = df.groupby('master_id').agg( {'last_order_date': lambda last_order_date : (today_date- last_order_date.max()).days,
'order_num_total_online_offline': lambda total_purchases : total_purchases.sum(),
'customer_onoff_total_expense': lambda total_expense: total_expense.sum(),
})
rfm.columns = ['Recency', 'Frequency', 'Monetary']
rfm.head(10)
# Recency Frequency Monetary
# master_id
# 00016786-2f5a-11ea-bb80-000d3a38a36f 11 5.000 776.070
# 00034aaa-a838-11e9-a2fc-000d3a38a36f 299 3.000 269.470
# 000be838-85df-11ea-a90b-000d3a38a36f 214 4.000 722.690
# 000c1fe2-a8b7-11ea-8479-000d3a38a36f 28 7.000 874.160
# 000f5e3e-9dde-11ea-80cd-000d3a38a36f 21 7.000 1620.330
# 00136ce2-a562-11e9-a2fc-000d3a38a36f 204 2.000 359.450
# 00142f9a-7af6-11eb-8460-000d3a38a36f 26 3.000 404.940
# 0014778a-5b11-11ea-9a2c-000d3a38a36f 27 3.000 727.430
# 0018c6aa-ab6c-11e9-a2fc-000d3a38a36f 127 2.000 317.910
# 0022f41e-5597-11eb-9e65-000d3a38a36f 13 2.000 154.980
##############################################################################################
# MISSION 3 : CALCULATING RFM SCORES #
##############################################################################################
# MISSION 3 CALCULATING RFM SCORES
rfm['Recency_score'] = pd.qcut(rfm['Recency'],5, labels = [5,4,3,2,1] )
rfm["Frequency_score"] = pd.qcut(rfm['Frequency'].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])
rfm["Monetary_score"] = pd.qcut(rfm['Monetary'], 5, labels=[1, 2, 3, 4, 5])
rfm.head()
# Recency Frequency Monetary \
# master_id
# 00016786-2f5a-11ea-bb80-000d3a38a36f 11 5.000 776.070
# 00034aaa-a838-11e9-a2fc-000d3a38a36f 299 3.000 269.470
# 000be838-85df-11ea-a90b-000d3a38a36f 214 4.000 722.690
# 000c1fe2-a8b7-11ea-8479-000d3a38a36f 28 7.000 874.160
# 000f5e3e-9dde-11ea-80cd-000d3a38a36f 21 7.000 1620.330
# Recency_score Frequency_score \
# master_id
# 00016786-2f5a-11ea-bb80-000d3a38a36f 5 4
# 00034aaa-a838-11e9-a2fc-000d3a38a36f 1 2
# 000be838-85df-11ea-a90b-000d3a38a36f 2 3
# 000c1fe2-a8b7-11ea-8479-000d3a38a36f 5 4
# 000f5e3e-9dde-11ea-80cd-000d3a38a36f 5 4
# Monetary_score
# master_id
# 00016786-2f5a-11ea-bb80-000d3a38a36f 4
# 00034aaa-a838-11e9-a2fc-000d3a38a36f 1
# 000be838-85df-11ea-a90b-000d3a38a36f 4
# 000c1fe2-a8b7-11ea-8479-000d3a38a36f 4
# 000f5e3e-9dde-11ea-80cd-000d3a38a36f 5
rfm['RFM_score'] = (rfm['Recency_score'].astype(str)+ rfm['Frequency_score'].astype(str))
rfm[rfm["RFM_score"] == "55"].head(10)
# Recency Frequency Monetary \
# master_id
# 004d5204-2037-11ea-87bf-000d3a38a36f 28 8.000 1170.760
# 00736820-a834-11e9-a2fc-000d3a38a36f 27 9.000 714.530
# 00b3ee24-aa44-11e9-a2fc-000d3a38a36f 25 8.000 2027.780
# 00cf8494-9da2-11e9-9897-000d3a38a36f 6 53.000 6275.330
# 0151bbee-a7de-11e9-a2fc-000d3a38a36f 15 18.000 2649.020
# 016521aa-aa88-11e9-a2fc-000d3a38a36f 28 10.000 1691.280
# 020fdc82-a8d3-11e9-a2fc-000d3a38a36f 7 10.000 1735.080
# 024da65a-5b36-11ea-b7e2-000d3a38a36f 16 11.000 2312.010
# 025b8bb6-ac28-11e9-a2fc-000d3a38a36f 7 11.000 820.880
# 02a5bc6c-d663-11e9-93bc-000d3a38a36f 20 11.000 1578.700
# Recency_score Frequency_score \
# master_id
# 004d5204-2037-11ea-87bf-000d3a38a36f 5 5
# 00736820-a834-11e9-a2fc-000d3a38a36f 5 5
# 00b3ee24-aa44-11e9-a2fc-000d3a38a36f 5 5
# 00cf8494-9da2-11e9-9897-000d3a38a36f 5 5
# 0151bbee-a7de-11e9-a2fc-000d3a38a36f 5 5
# 016521aa-aa88-11e9-a2fc-000d3a38a36f 5 5
# 020fdc82-a8d3-11e9-a2fc-000d3a38a36f 5 5
# 024da65a-5b36-11ea-b7e2-000d3a38a36f 5 5
# 025b8bb6-ac28-11e9-a2fc-000d3a38a36f 5 5
# 02a5bc6c-d663-11e9-93bc-000d3a38a36f 5 5
# Monetary_score RFM_score
# master_id
# 004d5204-2037-11ea-87bf-000d3a38a36f 5 55
# 00736820-a834-11e9-a2fc-000d3a38a36f 4 55
# 00b3ee24-aa44-11e9-a2fc-000d3a38a36f 5 55
# 00cf8494-9da2-11e9-9897-000d3a38a36f 5 55
# 0151bbee-a7de-11e9-a2fc-000d3a38a36f 5 55
# 016521aa-aa88-11e9-a2fc-000d3a38a36f 5 55
# 020fdc82-a8d3-11e9-a2fc-000d3a38a36f 5 55
# 024da65a-5b36-11ea-b7e2-000d3a38a36f 5 55
# 025b8bb6-ac28-11e9-a2fc-000d3a38a36f 4 55
##############################################################################################
# MISSION 4 : DEFINING RFM SCORE AS A SEGMENT #
##############################################################################################
seg_map = {
r'[1-2][1-2]': 'hibernating',
r'[1-2][3-4]': 'at_Risk',
r'[1-2]5': 'cant_loose',
r'3[1-2]': 'about_to_sleep',
r'33': 'need_attention',
r'[3-4][4-5]': 'loyal_customers',
r'41': 'promising',
r'51': 'new_customers',
r'[4-5][2-3]': 'potential_loyalists',
r'5[4-5]': 'champions'
}
seg_map
rfm['segment'] = rfm['RFM_score'].replace(seg_map, regex=True)
rfm[["segment", "RFM_score"]].groupby("segment").agg(["mean", "count"])
rfm[["segment", "Recency", "Frequency", "Monetary"]].groupby("segment").agg(["mean", "count"])
# Recency Frequency Monetary
# mean count mean count mean count
# segment
# about_to_sleep 115.032 1643 2.407 1643 361.649 1643
# at_Risk 243.329 3152 4.470 3152 648.325 3152
# cant_loose 236.159 1194 10.717 1194 1481.652 1194
# champions 18.142 1920 8.965 1920 1410.709 1920
# hibernating 248.426 3589 2.391 3589 362.583 3589
# loyal_customers 83.558 3375 8.356 3375 1216.257 3375
# need_attention 114.037 806 3.739 806 553.437 806
# new_customers 18.976 673 2.000 673 344.049 673
# potential_loyalists 37.870 2925 3.311 2925 533.741 2925
# promising 59.695 668 2.000 668 334.153 668
rfm[rfm["segment"] == "need_attention"].head(10)
# Recency Frequency Monetary \
# master_id
# 0033a502-5bf1-11ea-829b-000d3a38a36f 106 4.000 788.730
# 003c4ebc-aa23-11e9-a2fc-000d3a38a36f 109 4.000 360.760
# 00f53518-ab9e-11e9-a2fc-000d3a38a36f 89 4.000 349.940
# 012fe082-b134-11e9-9757-000d3a38a36f 137 4.000 609.940
# 019443fe-ab05-11e9-a2fc-000d3a38a36f 89 4.000 317.450
# 023db43a-aa05-11e9-a2fc-000d3a38a36f 100 4.000 489.870
# 02bf2f16-ad15-11e9-a2fc-000d3a38a36f 100 4.000 330.850
# 030b5cca-ab25-11e9-a2fc-000d3a38a36f 92 4.000 477.950
# 0393a9d8-541f-11ea-b1db-000d3a38a36f 100 4.000 535.900
# 03b39e0a-e205-11e9-957d-000d3a38a36f 115 4.000 429.960
# Recency_score Frequency_score \
# master_id
# 0033a502-5bf1-11ea-829b-000d3a38a36f 3 3
# 003c4ebc-aa23-11e9-a2fc-000d3a38a36f 3 3
# 00f53518-ab9e-11e9-a2fc-000d3a38a36f 3 3
# 012fe082-b134-11e9-9757-000d3a38a36f 3 3
# 019443fe-ab05-11e9-a2fc-000d3a38a36f 3 3
# 023db43a-aa05-11e9-a2fc-000d3a38a36f 3 3
# 02bf2f16-ad15-11e9-a2fc-000d3a38a36f 3 3
# 030b5cca-ab25-11e9-a2fc-000d3a38a36f 3 3
# 0393a9d8-541f-11ea-b1db-000d3a38a36f 3 3
# 03b39e0a-e205-11e9-957d-000d3a38a36f 3 3
# Monetary_score RFM_score segment
# master_id
# 0033a502-5bf1-11ea-829b-000d3a38a36f 4 33 need_attention
# 003c4ebc-aa23-11e9-a2fc-000d3a38a36f 2 33 need_attention
# 00f53518-ab9e-11e9-a2fc-000d3a38a36f 2 33 need_attention
# 012fe082-b134-11e9-9757-000d3a38a36f 3 33 need_attention
# 019443fe-ab05-11e9-a2fc-000d3a38a36f 2 33 need_attention
# 023db43a-aa05-11e9-a2fc-000d3a38a36f 3 33 need_attention
# 02bf2f16-ad15-11e9-a2fc-000d3a38a36f 2 33 need_attention
# 030b5cca-ab25-11e9-a2fc-000d3a38a36f 3 33 need_attention
# 0393a9d8-541f-11ea-b1db-000d3a38a36f 3 33 need_attention
# 03b39e0a-e205-11e9-957d-000d3a38a36f 2 33 need_attention
##############################################################################################
# CASES 1 #
##############################################################################################
# A new women's shoe brand will be included. The target audience (champions,
# loyal_customers) and women are determined as shoppers.
# We need access to the id numbers of these customers.
vip_cust = (rfm[(rfm["segment"]=="champions") | (rfm["segment"]=="loyal_customers")])
women_cat = df[(df["interested_in_categories_12"]).str.contains("KADIN")]
women_vip_cust = pd.merge(vip_cust,women_cat[["interested_in_categories_12","master_id"]],on=["master_id"])
women_vip_cust.columns
# Index(['master_id', 'Recency', 'Frequency', 'Monetary', 'Recency_score',
# 'Frequency_score', 'Monetary_score', 'RFM_score', 'segment',
# 'interested_in_categories_12'],
# dtype='object')
women_vip_cust_ = women_vip_cust.drop(women_vip_cust.loc[:, 'Recency':'interested_in_categories_12'].columns, axis=1)
women_vip_cust_.to_csv("women_vip_customer_info.csv")
##############################################################################################
# CASES 2 #
##############################################################################################
# For the men and children category, customers who have not been shopping for a long time, those who are asleep,
# and new customers should not be lost. It is desired to be specially selected for the customers with this feature.
# It is planned to apply a 40% discount to these customers.
cus_profile = rfm[(rfm["segment"]=="cant_loose") | (rfm["segment"]=="about_to_sleep") | (rfm["segment"]=="new_customers")]
man_boy_cus =df[(df["interested_in_categories_12"]).str.contains("ERKEK|COCUK")]
man_boy_cus_profile = pd.merge(cus_profile,man_boy_cus[["interested_in_categories_12","master_id"]],on=["master_id"])
man_boy_cus_profile = man_boy_cus_profile.drop(man_boy_cus_profile.loc[:,'Recency':'interested_in_categories_12'].columns, axis=1)
man_boy_cus_profile
man_boy_cus_profile.to_csv("man_boy_customer_profile.csv")