## 1. 載入套件

In [16]:
import numpy as np
import pandas as pd
from tqdm import tqdm, trange # 進度條
import gzip
import pickle
import os

# 計算信賴度使用
from collections import defaultdict
from itertools import combinations, permutations # 排列組合

## 2. 讀取資料
- TaFeng Dataset
- Dunnhumby Dataset
- Instacart Dataset

In [17]:
# DATASET_NAME = "TaFeng"     # 讀取TaFeng資料
DATASET_NAME = "Dunnhumby"  # 讀取Dunnhumby資料
# DATASET_NAME = "Instacart"  # 讀取Instacart資料    
EMBEDDING_DIMENSION = 32

In [18]:
df = pd.read_csv(os.path.join("../cleaned_dataset", DATASET_NAME+"_clean.csv"))
df

Unnamed: 0,TRANSACTION_DT,PRODUCT_ID,CUSTOMER_ID,CART_ID,NEW_ITEM_ID
0,20060416,PRD0900173,CUST0000000031,0,0
1,20060416,PRD0900199,CUST0000000031,0,1
2,20060416,PRD0900867,CUST0000000031,0,2
3,20060416,PRD0901294,CUST0000000031,0,3
4,20060416,PRD0901986,CUST0000000031,0,4
...,...,...,...,...,...
1486181,20060604,PRD0901722,CUST0000999976,147357,222
1486182,20060604,PRD0901732,CUST0000999976,147357,2457
1486183,20060604,PRD0902897,CUST0000999976,147357,1904
1486184,20060604,PRD0903032,CUST0000999976,147357,223


In [19]:
# load word2Vec pre_train model
model_filename = f"../preprocessing-data/item2vec_models/item2vec_{DATASET_NAME}.{EMBEDDING_DIMENSION}d.model"
# {DATASET}
with open(model_filename, "rb") as fp:
    model = pickle.load(fp)
model.wv.vectors

array([[-0.05077608, -0.41115162,  0.20189342, ..., -0.04871842,
        -0.04989515, -0.08554478],
       [ 0.09421023, -0.33858928,  0.17217717, ..., -0.04710799,
        -0.21103606, -0.0095089 ],
       [ 0.05844222, -0.4244002 ,  0.1448443 , ..., -0.00409105,
        -0.10781361, -0.09229282],
       ...,
       [ 0.11391113, -0.23380217,  0.05809765, ...,  0.02625999,
        -0.09182655,  0.21177351],
       [ 0.11442229, -0.40926394,  0.0098845 , ..., -0.10613459,
        -0.01305813,  0.16375196],
       [ 0.27400592, -0.29986933, -0.18476827, ..., -0.08571988,
         0.00500092,  0.2051328 ]], dtype=float32)

In [20]:
# 定義資料轉換函數(於collate_batch函式中使用)
item_index_pipeline = lambda x : [[model.wv.key_to_index[j] for j in i] for i in x] # 取得購物籃中，項目的索引值(只有使用Item2Vec時會需要)

In [24]:
model.wv.key_to_index

{13: 0,
 56: 1,
 38: 2,
 211: 3,
 227: 4,
 344: 5,
 21: 6,
 142: 7,
 0: 8,
 111: 9,
 148: 10,
 481: 11,
 250: 12,
 214: 13,
 64: 14,
 339: 15,
 9: 16,
 290: 17,
 115: 18,
 70: 19,
 283: 20,
 104: 21,
 102: 22,
 169: 23,
 660: 24,
 424: 25,
 569: 26,
 213: 27,
 610: 28,
 43: 29,
 57: 30,
 281: 31,
 505: 32,
 208: 33,
 225: 34,
 268: 35,
 815: 36,
 123: 37,
 1247: 38,
 877: 39,
 277: 40,
 279: 41,
 468: 42,
 41: 43,
 246: 44,
 348: 45,
 157: 46,
 1593: 47,
 407: 48,
 375: 49,
 93: 50,
 455: 51,
 456: 52,
 1072: 53,
 217: 54,
 653: 55,
 153: 56,
 96: 57,
 1454: 58,
 22: 59,
 65: 60,
 994: 61,
 230: 62,
 908: 63,
 103: 64,
 272: 65,
 280: 66,
 496: 67,
 878: 68,
 234: 69,
 1012: 70,
 453: 71,
 378: 72,
 637: 73,
 337: 74,
 493: 75,
 172: 76,
 810: 77,
 926: 78,
 965: 79,
 276: 80,
 146: 81,
 1011: 82,
 2161: 83,
 2: 84,
 470: 85,
 976: 86,
 6: 87,
 350: 88,
 565: 89,
 1079: 90,
 1493: 91,
 2242: 92,
 641: 93,
 31: 94,
 323: 95,
 266: 96,
 880: 97,
 1562: 98,
 956: 99,
 321: 100,
 242: 101,

## 3. 後端關聯：Confidence Matrix
- TaFeng Dataset
- Dunnhumby Dataset
- Instacart Dataset

In [27]:
# 所有I2V項目集合
item_u = model.wv.key_to_index.value()
# 購物籃數量
cart_max = df["CART_ID"].max()+1
# 項目數量
item_max = item_u.max()+1
print("item_u =", item_u)
print("cart_max =", cart_max)
print("item_max =", item_max)

dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 21

In [6]:
# # 所有項目集合
# item_u = df["NEW_ITEM_ID"].unique()
# # 購物籃數量
# cart_max = df["CART_ID"].max() + 1
# # 項目數量
# item_max = item_u.max()+1
# print("item_u =", item_u)
# print("cart_max =", cart_max)
# print("item_max =", item_max)

item_u = [   0    1    2 ... 3974 3975 3976]
cart_max = 147358
item_max = 3977


In [7]:
# 取出所有購物籃-項目欄位的資料
cart_item_df = df[["CART_ID", "NEW_ITEM_ID"]]
cart_item_df

Unnamed: 0,CART_ID,NEW_ITEM_ID
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
1486181,147357,222
1486182,147357,2457
1486183,147357,1904
1486184,147357,223


In [8]:
cart_list = []
for i in trange(cart_max):
    cart_list.append(cart_item_df[cart_item_df.CART_ID == i]["NEW_ITEM_ID"].tolist())
cart_list

100%|██████████| 147358/147358 [02:12<00:00, 1109.52it/s]


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
 [12, 13, 14, 15],
 [16, 17, 18, 19, 4, 20, 21],
 [22, 23, 24],
 [1, 25, 26, 6, 27, 28, 29, 30],
 [31, 2, 32, 13, 33, 6, 10, 34, 21],
 [35, 36, 37],
 [38, 25, 39, 40, 41, 32, 42, 20, 43, 29, 44],
 [45, 46, 47, 48],
 [49, 50, 51, 52],
 [53, 50, 54, 55, 56],
 [57, 58, 59, 60, 61, 62, 63, 64, 54, 65, 47, 66, 67, 68],
 [57, 60, 50, 69, 13, 47, 70, 71, 72, 68, 73],
 [74, 50, 75],
 [76, 77, 78, 59, 79, 80, 45, 54, 81, 82, 13, 47, 83, 84, 85, 68],
 [63, 54, 86, 13, 87],
 [88, 89, 77, 78, 90, 91, 81, 92, 65, 47, 66, 93, 94, 68],
 [90, 95, 96],
 [97, 88, 98, 99, 78, 100, 50, 54, 101, 102, 47, 103, 104, 105],
 [106,
  107,
  88,
  108,
  45,
  64,
  50,
  54,
  109,
  110,
  111,
  65,
  112,
  47,
  84,
  113,
  85,
  68],
 [114, 59, 115, 116, 117, 118, 85, 119],
 [120, 121, 122, 123, 116, 13, 124, 125, 126, 127, 56, 128, 129, 130],
 [131,
  132,
  133,
  134,
  135,
  123,
  136,
  137,
  13,
  138,
  139,
  140,
  141,
  56,
  128,
  142,
  143],
 [144

####  匯出list 至 pickle 壓縮檔

In [9]:
confidences_folder = "../preprocessing-data/confidences"
if not os.path.exists(confidences_folder):
    os.mkdir(confidences_folder)
with gzip.GzipFile(os.path.join(confidences_folder, DATASET_NAME + "_cart_list_I2V.gz"), "wb") as fp:  # Pickling & gzip
    pickle.dump(cart_list, fp)

#### 從 pickle 檔匯入

In [10]:
with gzip.open(os.path.join("../preprocessing-data/confidences", DATASET_NAME + "_cart_list_I2V.gz"), "rb") as fp:
    cart_list = pickle.load(fp)

### 信賴度矩陣計算

**使用矩陣紀錄每個項目出現的次數**\
index代表他的item_id，value代表他的出現次數\
**共現次數用二維列陣存**\
第一維的index是第一個項目id，第二維的index是第二個項目的id。

In [11]:
# P(item_b | item a) = P(item_b ∩ item_a) / P(item_a)
def compute_all_count(data, item_max):
    column, row = item_max, item_max
    item_count_array = np.zeros(row, dtype=int)
    item_2_array = np.zeros((column, row), dtype=int)
    
    for transaction in tqdm(data):
        trans_set = set(transaction)
        # 計算項目(item_a, item_b)共現次數
        for item_a, item_b in combinations(trans_set, 2):
            item_2_array[item_a][item_b] += 1
            item_2_array[item_a][item_b] += 1
        
        # 項目item_a出現次數
        for x in trans_set:
            item_count_array[x] += 1
    
    return item_2_array, item_count_array

**1.TaFeng Dataset\
2.Dunnhumby Dataset\
3.Instacart Dataset**

In [12]:
# 計算項目的出現次數(item_count_array)，項目的共現次數(item_2_array)
item_2_array_df, item_count_array_df = compute_all_count(cart_list, item_max)

100%|██████████| 147358/147358 [00:04<00:00, 31516.79it/s]


In [13]:
confidences_array_df = np.zeros((item_max, item_max), dtype=float)
for i in item_u:
    confidences_array_df[i] = item_2_array_df[i] / item_count_array_df[i]

#### 匯出 array 至 pickle 檔

In [14]:
with gzip.GzipFile(os.path.join("../preprocessing-data/confidences", DATASET_NAME +"_confidences_array_I2V.gz"), "wb") as fp:  # Pickling & gzip
    pickle.dump(confidences_array_df, fp)

#### 從 pickle 檔匯入

In [15]:
with gzip.open(os.path.join("../preprocessing-data/confidences", DATASET_NAME+"_confidences_array_I2V.gz"), "rb") as fp:
    confidences_array_df = pickle.load(fp)