In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm  # 進度條
from collections import Counter
import pickle
import os
import gzip

# 讀取資料

### Instacart Dataset

In [2]:
user_order_d = pd.read_csv("../raw_data/instacart/orders.csv", usecols=["user_id", "order_number", "order_id"])
order_item_train = pd.read_csv("../raw_data/instacart/order_products__train.csv", usecols = ["order_id", "product_id"])
order_item_prior = pd.read_csv("../raw_data/instacart/order_products__prior.csv", usecols = ["order_id", "product_id"])
order_item = pd.concat([order_item_prior, order_item_train], ignore_index = True)

user_order = pd.merge(user_order_d, order_item, on = "order_id", how = "left")
user_order = user_order.dropna(how = "any")
user_order = user_order.loc[:, ["user_id", "order_number", "product_id"]]

In [3]:
# 縮小資料集
user_num = len(set(user_order["user_id"].tolist()))
user_num = int(user_num * 0.1)
user_order = user_order[user_order["user_id"] <= user_num]

In [4]:
user_order

Unnamed: 0,user_id,order_number,product_id
0,1,1,196.0
1,1,1,14084.0
2,1,1,12427.0
3,1,1,26088.0
4,1,1,26405.0
...,...,...,...
3355239,20620,5,22963.0
3355240,20620,5,12204.0
3355241,20620,5,21543.0
3355242,20620,5,23541.0


In [5]:
# 欄位重新命名
user_order.columns = ["CUSTOMER_ID", "ORDER_NUMBER", "PRODUCT_ID"]
#  給每個項目新的索引(把PRODUCT_ID從0開始計)
user_order.loc[:, ["CART_ID", "NEW_ITEM_ID"]] = ""
user_order.head(5)
# 此資料集沒有空值和重複資料，所以沒有進行drop_na 跟 drop_duplicates

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196.0,,
1,1,1,14084.0,,
2,1,1,12427.0,,
3,1,1,26088.0,,
4,1,1,26405.0,,


In [6]:
user_order["PRODUCT_ID"] = user_order["PRODUCT_ID"].astype("int64")
user_order.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3347805 entries, 0 to 3355243
Data columns (total 5 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   CUSTOMER_ID   int64 
 1   ORDER_NUMBER  int64 
 2   PRODUCT_ID    int64 
 3   CART_ID       object
 4   NEW_ITEM_ID   object
dtypes: int64(3), object(2)
memory usage: 153.3+ MB


In [7]:
user_order = user_order.sort_values(["CUSTOMER_ID", "ORDER_NUMBER"])
user_order

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196,,
1,1,1,14084,,
2,1,1,12427,,
3,1,1,26088,,
4,1,1,26405,,
...,...,...,...,...,...
3355239,20620,5,22963,,
3355240,20620,5,12204,,
3355241,20620,5,21543,,
3355242,20620,5,23541,,


# 處理1. 刪除項目筆數小於5的項目

In [8]:
count_item = Counter(user_order["PRODUCT_ID"])
count_item

Counter({24852: 47560,
         13176: 38621,
         21137: 27146,
         21903: 25131,
         47209: 22915,
         47766: 17776,
         47626: 16073,
         16797: 14832,
         27966: 14517,
         26209: 14448,
         27845: 13645,
         22935: 11741,
         24964: 11390,
         45007: 10959,
         39275: 10460,
         49683: 10430,
         28204: 9028,
         5876: 8796,
         40706: 8386,
         30391: 8321,
         4920: 8298,
         8277: 8297,
         44632: 8215,
         42265: 8097,
         45066: 8095,
         17794: 7782,
         19057: 7718,
         21616: 7709,
         49235: 7639,
         37646: 7574,
         4605: 7530,
         27104: 7491,
         30489: 7423,
         31717: 7417,
         46979: 7312,
         27086: 7244,
         28985: 7181,
         44359: 7059,
         8518: 6975,
         41950: 6606,
         26604: 6551,
         34126: 6170,
         21938: 6167,
         43352: 6163,
         35951: 6098,

In [9]:
items_keep_list = [] # 要保留項目清單
for i, (keys, values) in enumerate(count_item.items()):
    if(values>=5):
        items_keep_list.append(keys)
items_keep_list

[196,
 14084,
 12427,
 26088,
 26405,
 10258,
 13176,
 13032,
 25133,
 30450,
 10326,
 17122,
 41787,
 46149,
 49235,
 39657,
 38928,
 35951,
 27845,
 32792,
 47766,
 20574,
 12000,
 48110,
 22474,
 16589,
 35917,
 27344,
 30489,
 27966,
 45066,
 16797,
 47526,
 8479,
 19051,
 8138,
 7781,
 28874,
 49451,
 32139,
 34688,
 36735,
 37646,
 22829,
 24852,
 47209,
 33276,
 45613,
 9681,
 21150,
 47144,
 5322,
 17224,
 38656,
 21376,
 48210,
 5907,
 14553,
 47553,
 22124,
 19156,
 1559,
 32052,
 46676,
 33754,
 17872,
 18523,
 24954,
 4957,
 40571,
 46886,
 40198,
 17758,
 28918,
 22963,
 23,
 20084,
 2002,
 5212,
 14306,
 13742,
 18961,
 15841,
 13351,
 5450,
 48099,
 49273,
 47792,
 9124,
 22559,
 33957,
 27737,
 2573,
 4071,
 8296,
 21227,
 3151,
 19240,
 21709,
 10305,
 24990,
 36287,
 42356,
 12258,
 42342,
 30908,
 79,
 5869,
 44303,
 16521,
 39877,
 19057,
 45948,
 22825,
 27413,
 39928,
 20785,
 24768,
 7963,
 13640,
 9387,
 24838,
 38547,
 19019,
 12007,
 26352,
 31883,
 12324,
 56

In [10]:
len(items_keep_list)

27096

In [11]:
print(items_keep_list)
instacart_df = user_order[user_order.PRODUCT_ID.isin(items_keep_list)]
instacart_df

[196, 14084, 12427, 26088, 26405, 10258, 13176, 13032, 25133, 30450, 10326, 17122, 41787, 46149, 49235, 39657, 38928, 35951, 27845, 32792, 47766, 20574, 12000, 48110, 22474, 16589, 35917, 27344, 30489, 27966, 45066, 16797, 47526, 8479, 19051, 8138, 7781, 28874, 49451, 32139, 34688, 36735, 37646, 22829, 24852, 47209, 33276, 45613, 9681, 21150, 47144, 5322, 17224, 38656, 21376, 48210, 5907, 14553, 47553, 22124, 19156, 1559, 32052, 46676, 33754, 17872, 18523, 24954, 4957, 40571, 46886, 40198, 17758, 28918, 22963, 23, 20084, 2002, 5212, 14306, 13742, 18961, 15841, 13351, 5450, 48099, 49273, 47792, 9124, 22559, 33957, 27737, 2573, 4071, 8296, 21227, 3151, 19240, 21709, 10305, 24990, 36287, 42356, 12258, 42342, 30908, 79, 5869, 44303, 16521, 39877, 19057, 45948, 22825, 27413, 39928, 20785, 24768, 7963, 13640, 9387, 24838, 38547, 19019, 12007, 26352, 31883, 12324, 5699, 31612, 34284, 48523, 2361, 48821, 11913, 45645, 1757, 21329, 17668, 15143, 39190, 21903, 39922, 24810, 32402, 38596, 248, 40

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196,,
1,1,1,14084,,
2,1,1,12427,,
3,1,1,26088,,
4,1,1,26405,,
...,...,...,...,...,...
3355239,20620,5,22963,,
3355240,20620,5,12204,,
3355241,20620,5,21543,,
3355242,20620,5,23541,,


# 處理2: 刪除購物籃中項目筆數小於3的購物籃

In [12]:
# 將購物籃中項目筆數少於 3 的購物籃資料刪除
instacart_df = instacart_df.groupby(["CUSTOMER_ID","ORDER_NUMBER"]).filter(lambda x : len(x) >= 3)
instacart_df

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196,,
1,1,1,14084,,
2,1,1,12427,,
3,1,1,26088,,
4,1,1,26405,,
...,...,...,...,...,...
3355239,20620,5,22963,,
3355240,20620,5,12204,,
3355241,20620,5,21543,,
3355242,20620,5,23541,,


#  處理3:  刪除購物籃筆數少於3的用戶資料

In [13]:
CUSTOMER_ID_u = instacart_df["CUSTOMER_ID"].unique()

In [14]:
# 按照user_id排序
CUSTOMER_ID_u.sort()
print(len(CUSTOMER_ID_u))
CUSTOMER_ID_u

20365


array([    1,     2,     3, ..., 20618, 20619, 20620], dtype=int64)

In [15]:
# 將購物籃筆數少於3的用戶資料刪除
count = 0
user_keep_list = []
for i in tqdm(CUSTOMER_ID_u):
    if len(instacart_df[instacart_df.CUSTOMER_ID == i].ORDER_NUMBER.unique()) >= 3:
        count = count + 1
        user_keep_list.append(i)
         
print("\ncount=", count) # 剩下用戶數

  0%|          | 0/20365 [00:00<?, ?it/s]


count= 19485


In [16]:
print(user_keep_list)
instacart_df = instacart_df[instacart_df.CUSTOMER_ID.isin(user_keep_list)]
instacart_df

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 89, 90, 91, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 197, 198, 199, 200, 201, 202, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196,,
1,1,1,14084,,
2,1,1,12427,,
3,1,1,26088,,
4,1,1,26405,,
...,...,...,...,...,...
3355239,20620,5,22963,,
3355240,20620,5,12204,,
3355241,20620,5,21543,,
3355242,20620,5,23541,,


# 處理4: 重新計算商品編號

In [17]:
itemid_dict = {}
new_id = 0
def give_item_id(x):
    if x not in itemid_dict.keys():
        if itemid_dict:
            new_id = max(itemid_dict.values())+1
        else:
            new_id = 0
        itemid_dict[x] = new_id
    return itemid_dict[x]

In [18]:
tqdm.pandas(desc="apply")
new_itemID_series = instacart_df["PRODUCT_ID"].progress_apply(give_item_id)
instacart_df["NEW_ITEM_ID"] = new_itemID_series
instacart_df

apply:   0%|          | 0/3252976 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  instacart_df["NEW_ITEM_ID"] = new_itemID_series


Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196,,0
1,1,1,14084,,1
2,1,1,12427,,2
3,1,1,26088,,3
4,1,1,26405,,4
...,...,...,...,...,...
3355239,20620,5,22963,,74
3355240,20620,5,12204,,1604
3355241,20620,5,21543,,3295
3355242,20620,5,23541,,417


# 處理5: 新增購物籃編號

In [19]:
user_id_u = instacart_df["CUSTOMER_ID"].unique()
print(len(user_id_u))
user_id_u

19485


array([    1,     2,     3, ..., 20617, 20619, 20620], dtype=int64)

In [20]:
# 使用同個使用者、同一個購買編號進行分群。(中括號內為index)
df_gp = instacart_df.groupby(["CUSTOMER_ID", "ORDER_NUMBER"]).groups
df_gp

{(1, 1): [0, 1, 2, 3, 4], (1, 2): [5, 6, 7, 8, 9, 10], (1, 3): [11, 12, 13, 14, 15], (1, 4): [16, 17, 18, 19, 20], (1, 5): [21, 22, 23, 24, 25, 26, 27, 28], (1, 6): [29, 30, 31, 32], (1, 7): [33, 34, 35, 36, 37], (1, 8): [38, 39, 40, 41, 42, 43], (1, 9): [44, 45, 46, 47, 48, 49], (1, 10): [50, 51, 52, 53, 54, 55, 56, 57, 58], (1, 11): [59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], (2, 1): [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82], (2, 2): [83, 84, 85, 86, 87, 88], (2, 3): [89, 90, 91, 92, 93], (2, 4): [94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106], (2, 5): [107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119], (2, 6): [120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140], (2, 7): [141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154], (2, 8): [155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170], (2, 9): [171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181

In [21]:
df_gp_list = []
for k, v in tqdm(df_gp.items()):
    df_gp_list.append(list(pd.Series(v)))
df_gp_list

  0%|          | 0/295154 [00:00<?, ?it/s]

[[0, 1, 2, 3, 4],
 [5, 6, 7, 8, 9, 10],
 [11, 12, 13, 14, 15],
 [16, 17, 18, 19, 20],
 [21, 22, 23, 24, 25, 26, 27, 28],
 [29, 30, 31, 32],
 [33, 34, 35, 36, 37],
 [38, 39, 40, 41, 42, 43],
 [44, 45, 46, 47, 48, 49],
 [50, 51, 52, 53, 54, 55, 56, 57, 58],
 [59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
 [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82],
 [83, 84, 85, 86, 87, 88],
 [89, 90, 91, 92, 93],
 [94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106],
 [107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119],
 [120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140],
 [141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154],
 [155,
  156,
  157,
  158,
  159,
  160,
  161,
  162,
  163,
  164,
  165,
  166,
  167,
  168,
  169,
  170],
 [171,
  172,
  173,
  174,
  175,
  176,
  177,
  178,
  179,
  180,
  181,
  182,
  183,
  184,
  185,
  186,
  187,
  188,
  

In [22]:
cart_id_list = []
for items_list in tqdm(df_gp_list):
    cart_id = df_gp_list.index(items_list)
    for item in items_list:
        cart_id_list.append(cart_id)
cart_id_list

  0%|          | 0/295154 [00:00<?, ?it/s]

[0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 13,
 13,
 13,
 13,
 13,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 21,
 21,
 21,
 21,
 21,
 21,

In [23]:
instacart_df["CART_ID"] = cart_id_list
instacart_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  instacart_df["CART_ID"] = cart_id_list


Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196,0,0
1,1,1,14084,0,1
2,1,1,12427,0,2
3,1,1,26088,0,3
4,1,1,26405,0,4
...,...,...,...,...,...
3355239,20620,5,22963,295153,74
3355240,20620,5,12204,295153,1604
3355241,20620,5,21543,295153,3295
3355242,20620,5,23541,295153,417


In [24]:
instacart_df

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196,0,0
1,1,1,14084,0,1
2,1,1,12427,0,2
3,1,1,26088,0,3
4,1,1,26405,0,4
...,...,...,...,...,...
3355239,20620,5,22963,295153,74
3355240,20620,5,12204,295153,1604
3355241,20620,5,21543,295153,3295
3355242,20620,5,23541,295153,417


In [25]:
# 輸出整理後的檔案
cleaned_folder = "../cleaned_dataset"
if not os.path.exists(cleaned_folder):
    os.mkdir(cleaned_folder)
    
instacart_clean = os.path.join(cleaned_folder, "Instacart_clean.csv")
instacart_df.to_csv(instacart_clean, sep=",", index=False, header=True)

In [26]:
df = pd.read_csv(instacart_clean)
df.head(20)

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196,0,0
1,1,1,14084,0,1
2,1,1,12427,0,2
3,1,1,26088,0,3
4,1,1,26405,0,4
5,1,2,196,1,0
6,1,2,10258,1,5
7,1,2,12427,1,2
8,1,2,13176,1,6
9,1,2,26088,1,3


In [27]:
Counter(instacart_df["PRODUCT_ID"]).most_common

<bound method Counter.most_common of Counter({24852: 46782, 13176: 37453, 21137: 26655, 21903: 24663, 47209: 22666, 47766: 17567, 47626: 15907, 27966: 14378, 16797: 14372, 26209: 14302, 27845: 13268, 22935: 11625, 24964: 11299, 45007: 10835, 49683: 10343, 39275: 10152, 28204: 8907, 5876: 8711, 40706: 8336, 30391: 8269, 8277: 8210, 4920: 8171, 44632: 8027, 45066: 8004, 42265: 7954, 17794: 7715, 19057: 7643, 21616: 7608, 37646: 7508, 4605: 7489, 27104: 7427, 30489: 7375, 31717: 7366, 49235: 7268, 46979: 7225, 28985: 7112, 27086: 7110, 44359: 7000, 8518: 6940, 41950: 6541, 26604: 6458, 21938: 6138, 34126: 6095, 22035: 6046, 10749: 5989, 39877: 5967, 5077: 5960, 35951: 5940, 24184: 5930, 9076: 5775, 43352: 5762, 43961: 5724, 34969: 5472, 39928: 5436, 19660: 5324, 25890: 5262, 31506: 5247, 46667: 5170, 24838: 5149, 12341: 5133, 48679: 5121, 5450: 4976, 5785: 4939, 35221: 4912, 22825: 4751, 28842: 4707, 8424: 4559, 27521: 4542, 33731: 4460, 44142: 4440, 8174: 4382, 29487: 4381, 15290: 4353, 

## 處理成NBR-WBS主程式需要的格式

In [28]:
df = pd.read_csv(instacart_clean)
df

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196,0,0
1,1,1,14084,0,1
2,1,1,12427,0,2
3,1,1,26088,0,3
4,1,1,26405,0,4
...,...,...,...,...,...
3252971,20620,5,22963,295153,74
3252972,20620,5,12204,295153,1604
3252973,20620,5,21543,295153,3295
3252974,20620,5,23541,295153,417


In [29]:
# 取得所有用戶的所有購物籃項目id(輸出是一個三維串列)、用戶ID、購物籃項目id、購物籃大小
def get_users_cartitemid(df_group):
    
    get_cart_itemid = lambda cart: [df["NEW_ITEM_ID"][index] for index in cart]
    
    last_user_id = 0
    user_cart_itemid_list = []
    cart_itemid_list = []
    cart_size_list = []
    for index, (userid, cartid) in enumerate(df_group.groups):
        if last_user_id == 0 or last_user_id == userid:
            item_list = list(df_group.groups[(userid, cartid)])
            cart_itemid_list.append(get_cart_itemid(item_list))
            cart_size_list.append(len(item_list))
        else:
            user_cart_itemid_list.append((last_user_id, cart_itemid_list, cart_size_list))
            cart_itemid_list = []
            cart_size_list = []
            item_list = list(df_group.groups[(userid, cartid)])
            cart_itemid_list.append(get_cart_itemid(item_list))
            cart_size_list.append(len(item_list))
        last_user_id = userid
    user_cart_itemid_list.append((last_user_id, cart_itemid_list, cart_size_list))
    return user_cart_itemid_list

df_group = df.groupby(["CUSTOMER_ID", "ORDER_NUMBER"])
user_cart_itemid_list = get_users_cartitemid(df_group)
user_cart_itemid_list[:2]

[(1,
  [[0, 1, 2, 3, 4],
   [0, 5, 2, 6, 3, 7],
   [0, 2, 5, 8, 9],
   [0, 2, 5, 8, 4],
   [0, 2, 5, 8, 10, 11, 12, 6],
   [0, 2, 5, 8],
   [0, 5, 2, 8, 7],
   [2, 0, 5, 8, 13, 14],
   [14, 13, 8, 0, 5, 2],
   [0, 13, 15, 16, 8, 5, 17, 7, 2],
   [0, 8, 16, 4, 15, 5, 7, 3, 18, 14, 13]],
  [5, 6, 5, 5, 8, 4, 5, 6, 6, 9, 11]),
 (2,
  [[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 6, 30],
   [31, 32, 33, 20, 34, 35],
   [20, 19, 21, 36, 37],
   [38, 19, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
   [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 20],
   [19,
    22,
    48,
    39,
    62,
    27,
    44,
    30,
    29,
    45,
    41,
    63,
    64,
    65,
    66,
    67,
    68,
    69,
    12,
    34,
    46],
   [19, 22, 25, 62, 34, 39, 45, 44, 70, 71, 72, 30, 12, 40],
   [19, 73, 44, 40, 23, 45, 62, 72, 71, 70, 74, 75, 76, 77, 78, 79],
   [19,
    44,
    22,
    54,
    77,
    36,
    80,
    81,
    82,
    64,
    59,
    60,
    61,
    83,
    66,
    65,
    84,
    24,
  

In [30]:
preprocessing_folder = "../preprocessing-data"
if not os.path.exists(preprocessing_folder):
    os.mkdir(preprocessing_folder)
with gzip.GzipFile("../preprocessing-data/Instacart_user_cart_itemid_list.gz", "wb") as fp:
    pickle.dump(user_cart_itemid_list, fp)