In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm #進度條
from collections import Counter
import pickle
import os
import gzip

# 讀取資料
需要資料: 用戶ID、項目ID、購買日期<br>
將一個用戶在同一天購買的項目集合作為一個購物籃<br>
另外新增一個欄位，叫做購物籃ID。(同用戶同天透買的商品視為一個購物籃)

## Ta Feng Grocery Dataset

In [3]:
df = pd.read_csv('../raw_data/ta_feng_all_months_merged.csv')
df.head(20)

Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,ASSET,SALES_PRICE
0,11/1/2000,1104905,45-49,115,110411,4710199010372,2,24,30
1,11/1/2000,418683,45-49,115,120107,4710857472535,1,48,46
2,11/1/2000,1057331,35-39,115,100407,4710043654103,2,142,166
3,11/1/2000,1849332,45-49,Others,120108,4710126092129,1,32,38
4,11/1/2000,1981995,50-54,115,100205,4710176021445,1,14,18
5,11/1/2000,1741797,35-39,115,110122,78895770025,1,54,75
6,11/1/2000,308359,60-64,115,110507,4710192225520,1,85,105
7,11/1/2000,1607000,35-39,221,520503,4712936888817,1,45,68
8,11/1/2000,1057331,35-39,115,320203,4715398106864,2,70,78
9,11/1/2000,236645,35-39,Unknown,120110,4710126091870,1,43,53


In [3]:
# 取出其中3個欄位
ta_feng_df = df.loc[:, ["CUSTOMER_ID", "PRODUCT_ID", "TRANSACTION_DT"]]
# 給每個項目新的索引(把 PRODUCT_ID 從0開始計)
ta_feng_df.loc[:, ["CART_ID", "NEW_ITEM_ID"]] = ""
ta_feng_df.head(5)
# 此資料集沒有空值跟重複的資料，所以沒有進行 drop_na 跟 drop_duplicates

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
0,1104905,4710199010372,11/1/2000,,
1,418683,4710857472535,11/1/2000,,
2,1057331,4710043654103,11/1/2000,,
3,1849332,4710126092129,11/1/2000,,
4,1981995,4710176021445,11/1/2000,,


In [4]:
ta_feng_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 817741 entries, 0 to 817740
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   CUSTOMER_ID     817741 non-null  int64 
 1   PRODUCT_ID      817741 non-null  int64 
 2   TRANSACTION_DT  817741 non-null  object
 3   CART_ID         817741 non-null  object
 4   NEW_ITEM_ID     817741 non-null  object
dtypes: int64(2), object(3)
memory usage: 31.2+ MB


In [5]:
# 檢查
CUSTOMER_ID_u = ta_feng_df["CUSTOMER_ID"].unique()
PRODUCT_ID_u = ta_feng_df["PRODUCT_ID"].unique()
TRANSACTION_DT_u = ta_feng_df["TRANSACTION_DT"].unique()

print("CUSTOMER_ID:", CUSTOMER_ID_u, "\nPRODUCT_ID:", PRODUCT_ID_u, "\nTRANSACTION_DT:", TRANSACTION_DT_u[:6])
print("\nCustomer length =", len(CUSTOMER_ID_u))
print("Product length =", len(PRODUCT_ID_u))
print("Transaction length =", len(TRANSACTION_DT_u))

CUSTOMER_ID: [1104905  418683 1057331 ...  324434 1538359 1023602] 
PRODUCT_ID: [4710199010372 4710857472535 4710043654103 ...      20192310 4973167117040
 8801008120415] 
TRANSACTION_DT: ['11/1/2000' '11/2/2000' '11/3/2000' '11/4/2000' '11/5/2000' '11/6/2000']

Customer length = 32266
Product length = 23812
Transaction length = 120


In [6]:
ta_feng_df["TRANSACTION_DT"] = ta_feng_df["TRANSACTION_DT"].astype("datetime64[ns]")
ta_feng_df = ta_feng_df.sort_values(["CUSTOMER_ID", "TRANSACTION_DT"])
ta_feng_df

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
89853,1069,9556439880610,2000-11-13,,
90164,1069,4710176008699,2000-11-13,,
542122,1069,4710320224661,2001-01-21,,
542544,1069,4710022101208,2001-01-21,,
542959,1069,4712603661644,2001-01-21,,
...,...,...,...,...,...
529754,20002000,4714800731229,2001-01-20,,
531237,20002000,4714541091071,2001-01-20,,
532062,20002000,4710018008634,2001-01-20,,
661318,20002000,4710085120680,2001-02-05,,


## 處理1: 刪除項目比數小於5的項目

In [7]:
count_item = Counter(df["PRODUCT_ID"])
count_item

Counter({4714981010038: 8476,
         4711271000014: 6120,
         4719090900065: 2444,
         4711080010112: 2251,
         4710114128038: 2181,
         4710265849066: 2018,
         4713985863121: 1976,
         4710088410139: 1870,
         4710583996008: 1840,
         4710908131589: 1680,
         4710291112172: 1641,
         4710011401128: 1637,
         4710088410610: 1632,
         4710036003581: 1622,
         4712425010712: 1620,
         4710421090059: 1564,
         4710094097768: 1448,
         4719090900058: 1408,
         4710018004605: 1391,
         4710114105046: 1384,
         4710085120628: 1357,
         4710114606048: 1294,
         4711001302104: 1242,
         20557003: 1202,
         37000440147: 1196,
         4710104111569: 1166,
         4714381003128: 1147,
         4710908131534: 1143,
         8888021200256: 1119,
         4710032501791: 1113,
         4710054380619: 1084,
         4712162000038: 1083,
         4710363352000: 1056,
         37000445

In [8]:
items_keep_list = [] # 要保留項目清單
for i, (keys, values) in enumerate(count_item.items()):
    if(values>=5):
        items_keep_list.append(keys)
items_keep_list

[4710199010372,
 4710857472535,
 4710043654103,
 4710126092129,
 4710176021445,
 78895770025,
 4710192225520,
 4712936888817,
 4715398106864,
 4710126091870,
 4711713290201,
 4710314491017,
 4710088410207,
 4710154012144,
 4710126091849,
 20000102818,
 4710583910004,
 4902520162847,
 4710734001346,
 8801055190294,
 4710323168054,
 4710054139804,
 4712031000060,
 4902181016893,
 4710043004090,
 4710131130311,
 4710088424655,
 4714981010038,
 4710088410443,
 4710868501019,
 4710583110015,
 4710985010272,
 20546601,
 4710012122121,
 4710088436498,
 4710011405133,
 4710085120628,
 4714125961004,
 4710011401128,
 4719090900058,
 4710018008634,
 4710088412201,
 4710290004171,
 4716349008084,
 4710088443328,
 4710022102892,
 4710364142068,
 20411909,
 4711524000457,
 4710134023276,
 4710011409056,
 4711271000014,
 4710098162806,
 4710088432674,
 4714617943044,
 20549404,
 4711713390031,
 4710357308303,
 4710057870148,
 4712500125010,
 4710018031632,
 4710109770402,
 4710543310066,
 4710011402

In [9]:
len(items_keep_list)

15786

In [10]:
print(items_keep_list)
ta_feng_df = ta_feng_df[ta_feng_df.PRODUCT_ID.isin(items_keep_list)]
ta_feng_df

[4710199010372, 4710857472535, 4710043654103, 4710126092129, 4710176021445, 78895770025, 4710192225520, 4712936888817, 4715398106864, 4710126091870, 4711713290201, 4710314491017, 4710088410207, 4710154012144, 4710126091849, 20000102818, 4710583910004, 4902520162847, 4710734001346, 8801055190294, 4710323168054, 4710054139804, 4712031000060, 4902181016893, 4710043004090, 4710131130311, 4710088424655, 4714981010038, 4710088410443, 4710868501019, 4710583110015, 4710985010272, 20546601, 4710012122121, 4710088436498, 4710011405133, 4710085120628, 4714125961004, 4710011401128, 4719090900058, 4710018008634, 4710088412201, 4710290004171, 4716349008084, 4710088443328, 4710022102892, 4710364142068, 20411909, 4711524000457, 4710134023276, 4710011409056, 4711271000014, 4710098162806, 4710088432674, 4714617943044, 20549404, 4711713390031, 4710357308303, 4710057870148, 4712500125010, 4710018031632, 4710109770402, 4710543310066, 4710011402019, 4710362020740, 2250062000090, 4711080040195, 4902704244567

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
89853,1069,9556439880610,2000-11-13,,
90164,1069,4710176008699,2000-11-13,,
542122,1069,4710320224661,2001-01-21,,
542544,1069,4710022101208,2001-01-21,,
542959,1069,4712603661644,2001-01-21,,
...,...,...,...,...,...
529754,20002000,4714800731229,2001-01-20,,
531237,20002000,4714541091071,2001-01-20,,
532062,20002000,4710018008634,2001-01-20,,
661318,20002000,4710085120680,2001-02-05,,


## 處理2: 刪除購物籃中項目筆數小於3的購物籃

In [11]:
# 將購物籃中的項目筆數少於 3 的購物籃資料刪除
ta_feng_df = ta_feng_df.groupby(["CUSTOMER_ID", "TRANSACTION_DT"]).filter(lambda x: len(x) >= 3)
ta_feng_df

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
542122,1069,4710320224661,2001-01-21,,
542544,1069,4710022101208,2001-01-21,,
542959,1069,4712603661644,2001-01-21,,
686679,1069,4710088620156,2001-02-03,,
687898,1069,4710176008699,2001-02-03,,
...,...,...,...,...,...
528125,20002000,4710339772139,2001-01-20,,
528162,20002000,20513184,2001-01-20,,
529754,20002000,4714800731229,2001-01-20,,
531237,20002000,4714541091071,2001-01-20,,


## 處理3: 刪除購物籃筆數少於3的用戶資料

In [12]:
CUSTOMER_ID_u = ta_feng_df["CUSTOMER_ID"].unique()

In [13]:
# 按照 user_id 排序
CUSTOMER_ID_u.sort()
print(len(CUSTOMER_ID_u))
CUSTOMER_ID_u

28182


array([    1069,     1113,     1250, ...,  2179605,  2179643, 20002000],
      dtype=int64)

In [14]:
# 將購物籃筆數少於 3 的用戶資料刪除
count = 0
user_keep_list = []
for i in tqdm(CUSTOMER_ID_u):
    if len(ta_feng_df[ta_feng_df.CUSTOMER_ID == i].TRANSACTION_DT.unique()) >= 3:
        count = count + 1
        user_keep_list.append(i)
print("\ncount=", count) # 剩下的用戶數

  0%|          | 0/28182 [00:00<?, ?it/s]


count= 10654


In [15]:
print(user_keep_list)
ta_feng_df = ta_feng_df[ta_feng_df.CUSTOMER_ID.isin(user_keep_list)]
ta_feng_df

[1113, 5241, 5517, 6668, 7795, 10801, 11235, 12249, 13697, 18173, 18524, 19545, 19750, 19873, 19903, 20220, 20459, 20596, 20695, 20794, 20800, 20817, 20879, 21128, 21203, 21449, 21951, 22736, 22934, 23078, 23474, 23672, 23771, 23795, 23863, 23917, 24259, 24532, 25058, 25324, 25362, 25386, 25553, 25683, 25805, 25973, 26000, 26062, 26192, 26260, 26307, 26321, 26468, 26499, 26550, 26666, 26925, 27199, 27243, 27304, 27335, 27540, 27632, 27908, 28516, 28547, 28899, 28967, 29018, 29209, 29377, 29421, 29964, 30045, 30106, 30267, 30281, 30502, 30908, 32100, 32674, 32827, 32896, 33268, 33510, 33770, 33862, 35866, 36016, 36450, 38386, 38782, 39635, 39642, 39932, 40129, 40150, 40181, 40198, 40242, 40488, 40662, 41133, 41171, 41188, 41263, 41607, 41737, 41928, 42017, 42093, 42208, 42666, 42857, 42963, 43113, 43359, 43472, 43502, 43748, 43816, 43854, 43861, 44363, 44431, 44875, 45377, 45506, 45674, 45902, 45957, 45971, 45995, 46015, 46039, 46145, 46466, 46633, 46855, 46923, 46992, 47067, 47197, 472

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
186918,1113,4902105011621,2000-11-26,,
188130,1113,7616100830794,2000-11-26,,
188131,1113,4710892632017,2000-11-26,,
190132,1113,4710905340113,2000-11-27,,
193725,1113,4717362901277,2000-11-27,,
...,...,...,...,...,...
528125,20002000,4710339772139,2001-01-20,,
528162,20002000,20513184,2001-01-20,,
529754,20002000,4714800731229,2001-01-20,,
531237,20002000,4714541091071,2001-01-20,,


## 處理4: 重新計算商品編號

In [16]:
itemid_dict = {}
new_id = 0
def give_item_id(x):
    if x not in itemid_dict.keys():
        if itemid_dict:
            new_id = max(itemid_dict.values()) + 1
        else:
            new_id = 0
            
        itemid_dict[x] = new_id
    return itemid_dict[x]

In [17]:
tqdm.pandas(desc = 'apply')
new_itemID_series = ta_feng_df["PRODUCT_ID"].progress_apply(give_item_id)
ta_feng_df["NEW_ITEM_ID"] = new_itemID_series
ta_feng_df

apply:   0%|          | 0/533059 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ta_feng_df["NEW_ITEM_ID"] = new_itemID_series


Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
186918,1113,4902105011621,2000-11-26,,0
188130,1113,7616100830794,2000-11-26,,1
188131,1113,4710892632017,2000-11-26,,2
190132,1113,4710905340113,2000-11-27,,3
193725,1113,4717362901277,2000-11-27,,4
...,...,...,...,...,...
528125,20002000,4710339772139,2001-01-20,,4546
528162,20002000,20513184,2001-01-20,,1351
529754,20002000,4714800731229,2001-01-20,,2946
531237,20002000,4714541091071,2001-01-20,,7382


## 處理5: 新增購物籃編號

In [18]:
user_id_u = ta_feng_df["CUSTOMER_ID"].unique()
print(len(user_id_u))
user_id_u

10654


array([    1113,     5241,     5517, ...,  2176161,  2177199, 20002000],
      dtype=int64)

In [19]:
# 使用同個使用者、同一天購買日期進行分群。(中括號內為 index)
df_gp = ta_feng_df.groupby(["CUSTOMER_ID", "TRANSACTION_DT"]).groups
df_gp

{(1113, 2000-11-26 00:00:00): [186918, 188130, 188131], (1113, 2000-11-27 00:00:00): [190132, 193725, 194153, 194573, 194970, 194974], (1113, 2001-01-06 00:00:00): [457372, 458108, 458469, 460077, 460863, 461655, 462278], (5241, 2000-12-16 00:00:00): [330329, 330745, 331143, 333833, 335022, 337296], (5241, 2001-01-07 00:00:00): [465517, 465849, 466354, 466674, 467096, 468381, 469098, 469111, 469520, 469922], (5241, 2001-01-16 00:00:00): [396542, 408292, 408566, 409846, 409851, 409854, 409855, 410140, 411191, 416751, 418426, 427846, 430664, 445640, 450125, 450970], (5241, 2001-02-19 00:00:00): [737728, 738147, 740859, 740880, 741664], (5517, 2000-12-09 00:00:00): [229648, 229650, 230055, 230062, 247544, 247636, 247745, 248577, 248579, 248584, 248906, 248911, 249186, 249206, 249248, 249600, 249611, 249958, 249976, 250606, 251303, 251434, 251436, 251479, 251857, 251916, 251937, 252723, 252746, 252965, 252973, 253155, 253183, 253407, 253413, 253578], (5517, 2000-12-23 00:00:00): [358446, 3

In [20]:
df_gp_list = []
for k, v in tqdm(df_gp.items()):
    df_gp_list.append(list(pd.Series(v)))
df_gp_list

  0%|          | 0/62361 [00:00<?, ?it/s]

[[186918, 188130, 188131],
 [190132, 193725, 194153, 194573, 194970, 194974],
 [457372, 458108, 458469, 460077, 460863, 461655, 462278],
 [330329, 330745, 331143, 333833, 335022, 337296],
 [465517,
  465849,
  466354,
  466674,
  467096,
  468381,
  469098,
  469111,
  469520,
  469922],
 [396542,
  408292,
  408566,
  409846,
  409851,
  409854,
  409855,
  410140,
  411191,
  416751,
  418426,
  427846,
  430664,
  445640,
  450125,
  450970],
 [737728, 738147, 740859, 740880, 741664],
 [229648,
  229650,
  230055,
  230062,
  247544,
  247636,
  247745,
  248577,
  248579,
  248584,
  248906,
  248911,
  249186,
  249206,
  249248,
  249600,
  249611,
  249958,
  249976,
  250606,
  251303,
  251434,
  251436,
  251479,
  251857,
  251916,
  251937,
  252723,
  252746,
  252965,
  252973,
  253155,
  253183,
  253407,
  253413,
  253578],
 [358446,
  359275,
  359277,
  361928,
  363154,
  363170,
  363173,
  363178,
  363895,
  363904,
  363918,
  364311,
  364326,
  364337,
  3654

In [21]:
cart_id_list = []
for items_list in tqdm(df_gp_list):
    cart_id = df_gp_list.index(items_list)
    for item in items_list:
        cart_id_list.append(cart_id)
cart_id_list  

  0%|          | 0/62361 [00:00<?, ?it/s]

[0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 13,
 13,
 13,
 14,
 14,
 14,
 14,
 15,
 15,
 15,
 15,
 16,
 16,
 16,
 16,
 16,
 17,
 17,
 17,
 18,
 18,
 18,
 18,
 18,
 18,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20

In [22]:
ta_feng_df["CART_ID"] = cart_id_list
ta_feng_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ta_feng_df["CART_ID"] = cart_id_list


Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
186918,1113,4902105011621,2000-11-26,0,0
188130,1113,7616100830794,2000-11-26,0,1
188131,1113,4710892632017,2000-11-26,0,2
190132,1113,4710905340113,2000-11-27,1,3
193725,1113,4717362901277,2000-11-27,1,4
...,...,...,...,...,...
528125,20002000,4710339772139,2001-01-20,62360,4546
528162,20002000,20513184,2001-01-20,62360,1351
529754,20002000,4714800731229,2001-01-20,62360,2946
531237,20002000,4714541091071,2001-01-20,62360,7382


In [23]:
ta_feng_df

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
186918,1113,4902105011621,2000-11-26,0,0
188130,1113,7616100830794,2000-11-26,0,1
188131,1113,4710892632017,2000-11-26,0,2
190132,1113,4710905340113,2000-11-27,1,3
193725,1113,4717362901277,2000-11-27,1,4
...,...,...,...,...,...
528125,20002000,4710339772139,2001-01-20,62360,4546
528162,20002000,20513184,2001-01-20,62360,1351
529754,20002000,4714800731229,2001-01-20,62360,2946
531237,20002000,4714541091071,2001-01-20,62360,7382


In [24]:
# 輸出整理後的檔案
cleaned_folder = "./cleaned_dataset"
if not os.path.exists(cleaned_folder):
    os.mkdir(cleaned_folder)
    
ta_feng_clean = os.path.join(cleaned_folder, "TaFeng_clean.csv")
ta_feng_df.to_csv(ta_feng_clean, sep=",", index=False, header=True)

In [25]:
df = pd.read_csv(ta_feng_clean)
df.head(20)

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
0,1113,4902105011621,2000-11-26,0,0
1,1113,7616100830794,2000-11-26,0,1
2,1113,4710892632017,2000-11-26,0,2
3,1113,4710905340113,2000-11-27,1,3
4,1113,4717362901277,2000-11-27,1,4
5,1113,4712076000650,2000-11-27,1,5
6,1113,4710114105046,2000-11-27,1,6
7,1113,4712425010712,2000-11-27,1,7
8,1113,4710114606048,2000-11-27,1,8
9,1113,4710254015014,2001-01-06,2,9


In [26]:
Counter(ta_feng_df["PRODUCT_ID"]).most_common

<bound method Counter.most_common of Counter({4714981010038: 4735, 4711271000014: 3428, 4711080010112: 1633, 4710114128038: 1411, 4713985863121: 1402, 4719090900065: 1382, 4710583996008: 1271, 4710011401128: 1261, 4710088410139: 1254, 4710088410610: 1147, 4710908131589: 1112, 4710291112172: 1040, 4710094097768: 1028, 4712425010712: 1013, 4710265849066: 969, 4710036003581: 969, 4710018004605: 956, 4710085120628: 913, 4719090900058: 904, 4710114105046: 895, 4710421090059: 846, 4710114606048: 832, 20557003: 824, 4711001302104: 814, 37000440147: 802, 4710011406123: 773, 4710011405133: 756, 4710104111569: 752, 4714381003128: 745, 4710908131534: 744, 4711258001256: 738, 8888021200256: 723, 4710011401135: 715, 4710105015118: 713, 4712019100591: 711, 37000445111: 706, 4710583110015: 700, 4710085172696: 698, 4710032501791: 693, 20332433: 681, 37000329169: 679, 4711022100017: 673, 4710105015125: 664, 4712162000038: 662, 4712425010255: 659, 4710254049521: 659, 4711271000472: 649, 4710011402019: 6

## 處理成NBR-WBS主程式需要的格式

In [27]:
df = pd.read_csv(ta_feng_clean)
df

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
0,1113,4902105011621,2000-11-26,0,0
1,1113,7616100830794,2000-11-26,0,1
2,1113,4710892632017,2000-11-26,0,2
3,1113,4710905340113,2000-11-27,1,3
4,1113,4717362901277,2000-11-27,1,4
...,...,...,...,...,...
533054,20002000,4710339772139,2001-01-20,62360,4546
533055,20002000,20513184,2001-01-20,62360,1351
533056,20002000,4714800731229,2001-01-20,62360,2946
533057,20002000,4714541091071,2001-01-20,62360,7382


In [28]:
df[df["CUSTOMER_ID"]==1113]["TRANSACTION_DT"].unique().astype("datetime64")

array(['2000-11-26', '2000-11-27', '2001-01-06'], dtype='datetime64[D]')

In [29]:
df

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
0,1113,4902105011621,2000-11-26,0,0
1,1113,7616100830794,2000-11-26,0,1
2,1113,4710892632017,2000-11-26,0,2
3,1113,4710905340113,2000-11-27,1,3
4,1113,4717362901277,2000-11-27,1,4
...,...,...,...,...,...
533054,20002000,4710339772139,2001-01-20,62360,4546
533055,20002000,20513184,2001-01-20,62360,1351
533056,20002000,4714800731229,2001-01-20,62360,2946
533057,20002000,4714541091071,2001-01-20,62360,7382


In [30]:
# 計算此用戶最後一個購物籃與所有購物籃的時間差(不包含最後一個購物籃)
def get_deltaT(userID):
    baskets_times = df[df["CUSTOMER_ID"]==userID]["TRANSACTION_DT"].unique().astype("datetime64")
    delta_t = baskets_times[-2] - baskets_times[:-1]
    # min-max Normalization
    delta_t_min = delta_t.min()
    delta_t_max = delta_t.max()
    delta_t = 1 - (delta_t - delta_t_min) / (delta_t_max - delta_t_min)
    return delta_t

get_deltaT(1113)

array([0., 1.])

In [31]:
# 取的所有用戶的所有購物籃項目id(輸出是一個三維串列)、用戶ID、購物籃項目id、購物籃
def get_users_cartitemid(df_group):
    
    get_cart_itemid = lambda cart: [df["NEW_ITEM_ID"][index] for index in cart]
    
    last_user_id = 0
    user_cart_itemid_list = []
    cart_itemid_list = []
    cart_size_list = []
    for index, (userid, cartid) in enumerate(df_group.groups):
        if last_user_id == 0 or last_user_id == userid:
            item_list = list(df_group.groups[(userid, cartid)])
            cart_itemid_list.append(get_cart_itemid(item_list))
            cart_size_list.append(len(item_list))
        else:
            user_cart_itemid_list.append((last_user_id, cart_itemid_list, cart_size_list))
            cart_itemid_list = []
            cart_size_list = []
            item_list = list(df_group.groups[(userid, cartid)])
            cart_itemid_list.append(get_cart_itemid(item_list))
            cart_size_list.append(len(item_list))
        last_user_id = userid
    user_cart_itemid_list.append((last_user_id, cart_itemid_list, cart_size_list))
    return user_cart_itemid_list

df_group = df.groupby(["CUSTOMER_ID", "CART_ID"])
user_cart_itemid_list = get_users_cartitemid(df_group)
user_cart_itemid_list[:2]

[(1113,
  [[0, 1, 2], [3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15]],
  [3, 6, 7]),
 (5241,
  [[16, 17, 18, 19, 20, 21],
   [22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
   [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
   [48, 49, 50, 51, 52]],
  [6, 10, 16, 5])]

In [32]:
preprocessing_folder = "../preprocessing-data"
if not os.path.exists(preprocessing_folder):
    os.mkdir(preprocessing_folder)
with gzip.GzipFile("../preprocessing-data/user_cart_item_list/TaFeng_user_cart_itemid_list.gz", "wb") as fp:
    pickle.dump(user_cart_itemid_list, fp)