## 1. 載入套件

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm, trange # 進度條
import gzip
import pickle
import os

# 計算信賴度使用
from collections import defaultdict
from itertools import combinations, permutations # 排列組合

## 2. 讀取資料
- TaFeng Dataset
- Dunnhumby Dataset
- Instacart Dataset

In [2]:
# DATASET_NAME = "TaFeng"     # 讀取TaFeng資料
# DATASET_NAME = "Dunnhumby"  # 讀取Dunnhumby資料
DATASET_NAME = "Instacart"  # 讀取Instacart資料    
EMBEDDING_DIMENSION = 32

In [3]:
df = pd.read_csv(os.path.join("../cleaned_dataset", DATASET_NAME+"_clean.csv"))
df

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196,0,0
1,1,1,14084,0,1
2,1,1,12427,0,2
3,1,1,26088,0,3
4,1,1,26405,0,4
...,...,...,...,...,...
3252971,20620,5,22963,295153,74
3252972,20620,5,12204,295153,1604
3252973,20620,5,21543,295153,3295
3252974,20620,5,23541,295153,417


In [4]:
# load word2Vec pre_train model
model_filename = f"../preprocessing-data/item2vec_models/item2vec_{DATASET_NAME}.{EMBEDDING_DIMENSION}d.model"
# {DATASET}
with open(model_filename, "rb") as fp:
    model = pickle.load(fp)
model.wv.vectors

array([[ 0.08183119, -0.22947407, -0.11485998, ..., -0.10110876,
         0.01599585,  0.3459958 ],
       [-0.4406482 , -0.3032823 , -0.00721561, ..., -0.18979977,
         0.35160244,  0.14411661],
       [-0.42942336, -0.5231408 ,  0.05393182, ..., -0.35660285,
         0.3834403 ,  0.29964426],
       ...,
       [ 0.00766997,  0.1052394 , -0.04556641, ...,  0.02579874,
        -0.07378694,  0.34006742],
       [ 0.02208109,  0.09017415, -0.01387959, ..., -0.01254665,
        -0.09366735,  0.33281934],
       [ 0.01265153, -0.04201095, -0.01140495, ...,  0.00250275,
         0.0037039 ,  0.16943507]], dtype=float32)

In [5]:
# 定義資料轉換函數(於collate_batch函式中使用)
item_index_pipeline = lambda x : [[model.wv.key_to_index[j] for j in i] for i in x] # 取得購物籃中，項目的索引值(只有使用Item2Vec時會需要)

In [6]:
model.wv.key_to_index

{44: 0,
 6: 1,
 149: 2,
 141: 3,
 45: 4,
 20: 5,
 402: 6,
 29: 7,
 31: 8,
 266: 9,
 18: 10,
 501: 11,
 1001: 12,
 209: 13,
 151: 14,
 216: 15,
 959: 16,
 486: 17,
 196: 18,
 236: 19,
 247: 20,
 225: 21,
 456: 22,
 30: 23,
 159: 24,
 283: 25,
 111: 26,
 190: 27,
 42: 28,
 1275: 29,
 308: 30,
 28: 31,
 184: 32,
 14: 33,
 362: 34,
 273: 35,
 521: 36,
 396: 37,
 182: 38,
 679: 39,
 185: 40,
 1699: 41,
 404: 42,
 158: 43,
 1610: 44,
 110: 45,
 544: 46,
 17: 47,
 410: 48,
 636: 49,
 257: 50,
 163: 51,
 1289: 52,
 115: 53,
 2914: 54,
 1805: 55,
 393: 56,
 681: 57,
 121: 58,
 1538: 59,
 206: 60,
 84: 61,
 2158: 62,
 945: 63,
 113: 64,
 405: 65,
 214: 66,
 204: 67,
 443: 68,
 518: 69,
 1827: 70,
 2313: 71,
 373: 72,
 264: 73,
 27: 74,
 1562: 75,
 202: 76,
 290: 77,
 520: 78,
 1248: 79,
 437: 80,
 267: 81,
 1601: 82,
 698: 83,
 3150: 84,
 296: 85,
 98: 86,
 1674: 87,
 1298: 88,
 36: 89,
 120: 90,
 292: 91,
 298: 92,
 50: 93,
 12: 94,
 371: 95,
 401: 96,
 1771: 97,
 515: 98,
 364: 99,
 242: 100,


## 3. 後端關聯：Confidence Matrix
- TaFeng Dataset
- Dunnhumby Dataset
- Instacart Dataset

In [7]:
# 所有I2V項目集合
item_u = list(model.wv.key_to_index.values())
# 購物籃數量
cart_max = df["CART_ID"].max()+1
# 項目數量
item_max = max(item_u)+1
print("item_u =", item_u)
print("cart_max =", cart_max)
print("item_max =", item_max)

item_u = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 

In [8]:
# 取出所有購物籃-項目欄位的資料
cart_item_df = df[["CART_ID", "NEW_ITEM_ID"]]
cart_item_df

Unnamed: 0,CART_ID,NEW_ITEM_ID
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
3252971,295153,74
3252972,295153,1604
3252973,295153,3295
3252974,295153,417


In [9]:
def I2V_cart(cart):
    item_list = []
    for item in cart["NEW_ITEM_ID"]:
        item_list.append(model.wv.key_to_index[item])
    return item_list

In [10]:
cart_list = []
for i in trange(cart_max):
    cart_list.append(I2V_cart(cart_item_df[cart_item_df.CART_ID == i]))
    # cart_list.append(cart_item_df[cart_item_df.CART_ID == i]["NEW_ITEM_ID"].tolist())
cart_list

100%|██████████| 295154/295154 [17:22<00:00, 283.08it/s]


[[113, 274, 1057, 2268, 5734],
 [113, 3117, 1057, 1, 2268, 1800],
 [113, 1057, 3117, 893, 154],
 [113, 1057, 3117, 893, 5734],
 [113, 1057, 3117, 893, 1156, 405, 94, 1],
 [113, 1057, 3117, 893],
 [113, 3117, 1057, 893, 1800],
 [1057, 113, 3117, 893, 804, 33],
 [33, 804, 893, 113, 3117, 1057],
 [113, 804, 1187, 441, 893, 3117, 47, 1800, 1057],
 [113, 893, 441, 5734, 1187, 3117, 1800, 2268, 10, 33, 804],
 [3925, 5, 448, 4432, 1951, 440, 754, 8610, 74, 31, 7, 1, 23],
 [8, 795, 640, 5, 1631, 698],
 [5, 3925, 448, 89, 14320],
 [3504, 3925, 1313, 2053, 383, 28, 5073, 0, 4, 13720, 1040, 3737, 2184],
 [93, 508, 705, 837, 284, 2753, 7789, 1468, 4562, 494, 994, 954, 5],
 [3925,
  4432,
  3737,
  1313,
  1160,
  74,
  0,
  23,
  7,
  4,
  383,
  143,
  110,
  264,
  157,
  518,
  179,
  223,
  94,
  1631,
  13720],
 [3925, 4432, 754, 1160, 1631, 1313, 4, 0, 693, 501, 525, 23, 94, 2053],
 [3925,
  1666,
  0,
  2053,
  1951,
  4,
  1160,
  525,
  501,
  693,
  119,
  5466,
  1711,
  1912,
  167,
  

####  匯出list 至 pickle 壓縮檔

In [11]:
confidences_folder = "../preprocessing-data/confidences"
if not os.path.exists(confidences_folder):
    os.mkdir(confidences_folder)
with gzip.GzipFile(os.path.join(confidences_folder, DATASET_NAME + "_cart_list.gz"), "wb") as fp:  # Pickling & gzip
    pickle.dump(cart_list, fp)

#### 從 pickle 檔匯入

In [12]:
with gzip.open(os.path.join("../preprocessing-data/confidences", DATASET_NAME + "_cart_list.gz"), "rb") as fp:
    cart_list = pickle.load(fp)

### 信賴度矩陣計算

**使用矩陣紀錄每個項目出現的次數**\
index代表他的item_id，value代表他的出現次數\
**共現次數用二維列陣存**\
第一維的index是第一個項目id，第二維的index是第二個項目的id。

In [13]:
# P(item_b | item a) = P(item_b ∩ item_a) / P(item_a)
def compute_all_count(data, item_max):
    column, row = item_max, item_max
    item_count_array = np.zeros(row, dtype=int)
    item_2_array = np.zeros((column, row), dtype=int)
    
    for transaction in tqdm(data):
        trans_set = set(transaction)
        # 計算項目(item_a, item_b)共現次數
        for item_a, item_b in combinations(trans_set, 2):
            item_2_array[item_a][item_b] += 1
            item_2_array[item_b][item_a] += 1
        
        # 項目item_a出現次數
        for x in trans_set:
            item_count_array[x] += 1
    
    return item_2_array, item_count_array

**1.TaFeng Dataset\
2.Dunnhumby Dataset\
3.Instacart Dataset**

In [14]:
# 計算項目的出現次數(item_count_array)，項目的共現次數(item_2_array)
item_2_array_df, item_count_array_df = compute_all_count(cart_list, item_max)

100%|██████████| 295154/295154 [00:21<00:00, 13810.41it/s]


In [15]:
confidences_array_df = np.zeros((item_max, item_max), dtype=float)
for i in item_u:
    confidences_array_df[i] = item_2_array_df[i] / item_count_array_df[i]

#### 匯出 array 至 pickle 檔

In [16]:
with gzip.GzipFile(os.path.join("../preprocessing-data/confidences", DATASET_NAME +"_confidences_array.gz"), "wb") as fp:  # Pickling & gzip
    pickle.dump(confidences_array_df, fp)

#### 從 pickle 檔匯入

In [17]:
with gzip.open(os.path.join("../preprocessing-data/confidences", DATASET_NAME+"_confidences_array.gz"), "rb") as fp:
    confidences_array_df = pickle.load(fp)