## 1. 載入套件

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm, trange # 進度條
import gzip
import pickle
import os

# 計算信賴度使用
from collections import defaultdict
from itertools import combinations, permutations # 排列組合

## 2. 讀取資料
- TaFeng Dataset
- Dunnhumby Dataset
- Instacart Dataset

In [2]:
DATASET_NAME = "TaFeng"     # 讀取TaFeng資料
# DATASET_NAME = "Dunnhumby"  # 讀取Dunnhumby資料
# DATASET_NAME = "Instacart"  # 讀取Instacart資料    
EMBEDDING_DIMENSION = 32

In [3]:
df = pd.read_csv(os.path.join("../cleaned_dataset", DATASET_NAME+"_clean.csv"))
df

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
0,1113,4902105011621,2000-11-26,0,0
1,1113,7616100830794,2000-11-26,0,1
2,1113,4710892632017,2000-11-26,0,2
3,1113,4710905340113,2000-11-27,1,3
4,1113,4717362901277,2000-11-27,1,4
...,...,...,...,...,...
533054,20002000,4710339772139,2001-01-20,62360,4546
533055,20002000,20513184,2001-01-20,62360,1351
533056,20002000,4714800731229,2001-01-20,62360,2946
533057,20002000,4714541091071,2001-01-20,62360,7382


In [4]:
# load word2Vec pre_train model
model_filename = f"../preprocessing-data/item2vec_models/item2vec_{DATASET_NAME}.{EMBEDDING_DIMENSION}d.model"
# {DATASET}
with open(model_filename, "rb") as fp:
    model = pickle.load(fp)
model.wv.vectors

array([[-0.6380839 , -0.16559337,  0.44700974, ..., -0.26519555,
        -0.21365549, -0.43370908],
       [ 0.23654032, -0.4052614 , -0.15312208, ...,  0.16291003,
        -0.3654118 , -0.11965667],
       [ 0.04307808, -0.00726012,  0.02145827, ..., -0.5144637 ,
        -0.27971143,  0.37542278],
       ...,
       [ 0.17551799, -0.32207224,  0.02814815, ..., -0.04330132,
        -0.1767716 ,  0.06817531],
       [ 0.06556011, -0.21100514, -0.01509141, ..., -0.02504948,
        -0.10826813,  0.08305621],
       [ 0.27763417, -0.7832042 , -0.09279366, ..., -0.17076828,
        -0.38461998,  0.15063186]], dtype=float32)

In [5]:
# 定義資料轉換函數(於collate_batch函式中使用)
item_index_pipeline = lambda x : [[model.wv.key_to_index[j] for j in i] for i in x] # 取得購物籃中，項目的索引值(只有使用Item2Vec時會需要)

In [6]:
model.wv.key_to_index

{151: 0,
 129: 1,
 749: 2,
 287: 3,
 466: 4,
 1261: 5,
 120: 6,
 955: 7,
 707: 8,
 469: 9,
 365: 10,
 178: 11,
 906: 12,
 7: 13,
 348: 14,
 183: 15,
 917: 16,
 172: 17,
 477: 18,
 6: 19,
 886: 20,
 8: 21,
 625: 22,
 1311: 23,
 12: 24,
 76: 25,
 2939: 26,
 356: 27,
 1353: 28,
 943: 29,
 415: 30,
 283: 31,
 959: 32,
 680: 33,
 861: 34,
 498: 35,
 53: 36,
 409: 37,
 883: 38,
 2115: 39,
 501: 40,
 2183: 41,
 763: 42,
 492: 43,
 449: 44,
 347: 45,
 1048: 46,
 1108: 47,
 401: 48,
 361: 49,
 2629: 50,
 291: 51,
 298: 52,
 374: 53,
 1260: 54,
 414: 55,
 787: 56,
 1465: 57,
 1987: 58,
 960: 59,
 2567: 60,
 429: 61,
 59: 62,
 464: 63,
 2143: 64,
 644: 65,
 216: 66,
 460: 67,
 481: 68,
 1403: 69,
 1763: 70,
 167: 71,
 502: 72,
 1484: 73,
 687: 74,
 3225: 75,
 994: 76,
 1767: 77,
 1348: 78,
 478: 79,
 215: 80,
 1630: 81,
 923: 82,
 565: 83,
 164: 84,
 1443: 85,
 305: 86,
 311: 87,
 2295: 88,
 1005: 89,
 762: 90,
 765: 91,
 250: 92,
 1613: 93,
 376: 94,
 1993: 95,
 388: 96,
 493: 97,
 950: 98,
 991

## 3. 後端關聯：Confidence Matrix
- TaFeng Dataset
- Dunnhumby Dataset
- Instacart Dataset

In [7]:
# 所有I2V項目集合
item_u = list(model.wv.key_to_index.values())
# 購物籃數量
cart_max = df["CART_ID"].max()+1
# 項目數量
item_max = max(item_u)+1
print("item_u =", item_u)
print("cart_max =", cart_max)
print("item_max =", item_max)

item_u = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 

In [8]:
# 取出所有購物籃-項目欄位的資料
cart_item_df = df[["CART_ID", "NEW_ITEM_ID"]]
cart_item_df

Unnamed: 0,CART_ID,NEW_ITEM_ID
0,0,0
1,0,1
2,0,2
3,1,3
4,1,4
...,...,...
533054,62360,4546
533055,62360,1351
533056,62360,2946
533057,62360,7382


In [9]:
def I2V_cart(cart):
    item_list = []
    for item in cart["NEW_ITEM_ID"]:
        item_list.append(model.wv.key_to_index[item])
    return item_list

In [10]:
cart_list = []
for i in trange(cart_max):
    cart_list.append(I2V_cart(cart_item_df[cart_item_df.CART_ID == i]))
    # cart_list.append(cart_item_df[cart_item_df.CART_ID == i]["NEW_ITEM_ID"].tolist())
cart_list

100%|██████████| 62361/62361 [00:17<00:00, 3512.40it/s]


[[4278, 5761, 136],
 [1595, 10098, 665, 19, 13, 21],
 [863, 590, 901, 24, 10745, 963, 2159],
 [3481, 3579, 2476, 3998, 2151, 470],
 [2299, 3468, 14289, 1559, 2307, 4660, 7177, 6689, 2925, 996],
 [2742,
  1146,
  8121,
  132,
  5092,
  1193,
  11958,
  2498,
  6661,
  321,
  3558,
  4818,
  7529,
  2176,
  899,
  698],
 [492, 1147, 282, 3278, 2361],
 [36,
  5599,
  414,
  10759,
  2931,
  1276,
  62,
  8283,
  11951,
  558,
  1934,
  1184,
  191,
  6733,
  2466,
  1791,
  2640,
  7121,
  239,
  2631,
  1704,
  2280,
  1672,
  25,
  1254,
  385,
  956,
  1084,
  740,
  3428,
  2854,
  10687,
  1051,
  1857,
  2545,
  1729],
 [5085,
  13188,
  8230,
  524,
  3387,
  3886,
  3884,
  3479,
  11889,
  10007,
  3870,
  4694,
  125,
  757,
  3323,
  162,
  703,
  789,
  5048,
  1426,
  3048,
  1995,
  2212,
  1209,
  6791,
  3057,
  1971,
  4488,
  800,
  925,
  1926],
 [6,
  1856,
  6559,
  1743,
  866,
  1158,
  663,
  7297,
  1778,
  1729,
  1,
  1254,
  2353,
  1236,
  2769,
  5891,
  3293

####  匯出list 至 pickle 壓縮檔

In [11]:
confidences_folder = "../preprocessing-data/confidences"
if not os.path.exists(confidences_folder):
    os.mkdir(confidences_folder)
with gzip.GzipFile(os.path.join(confidences_folder, DATASET_NAME + "_cart_list_I2V.gz"), "wb") as fp:  # Pickling & gzip
    pickle.dump(cart_list, fp)

#### 從 pickle 檔匯入

In [12]:
with gzip.open(os.path.join("../preprocessing-data/confidences", DATASET_NAME + "_cart_list_I2V.gz"), "rb") as fp:
    cart_list = pickle.load(fp)

### 信賴度矩陣計算

**使用矩陣紀錄每個項目出現的次數**\
index代表他的item_id，value代表他的出現次數\
**共現次數用二維列陣存**\
第一維的index是第一個項目id，第二維的index是第二個項目的id。

In [13]:
# P(item_b | item a) = P(item_b ∩ item_a) / P(item_a)
def compute_all_count(data, item_max):
    column, row = item_max, item_max
    item_count_array = np.zeros(row, dtype=int)
    item_2_array = np.zeros((column, row), dtype=int)
    
    for transaction in tqdm(data):
        trans_set = set(transaction)
        # 計算項目(item_a, item_b)共現次數
        for item_a, item_b in combinations(trans_set, 2):
            item_2_array[item_a][item_b] += 1
            item_2_array[item_a][item_b] += 1
        
        # 項目item_a出現次數
        for x in trans_set:
            item_count_array[x] += 1
    
    return item_2_array, item_count_array

**1.TaFeng Dataset\
2.Dunnhumby Dataset\
3.Instacart Dataset**

In [14]:
# 計算項目的出現次數(item_count_array)，項目的共現次數(item_2_array)
item_2_array_df, item_count_array_df = compute_all_count(cart_list, item_max)

100%|██████████| 62361/62361 [00:01<00:00, 35826.52it/s]


In [15]:
confidences_array_df = np.zeros((item_max, item_max), dtype=float)
for i in item_u:
    confidences_array_df[i] = item_2_array_df[i] / item_count_array_df[i]

#### 匯出 array 至 pickle 檔

In [16]:
with gzip.GzipFile(os.path.join("../preprocessing-data/confidences", DATASET_NAME +"_confidences_array_I2V.gz"), "wb") as fp:  # Pickling & gzip
    pickle.dump(confidences_array_df, fp)

#### 從 pickle 檔匯入

In [17]:
with gzip.open(os.path.join("../preprocessing-data/confidences", DATASET_NAME+"_confidences_array_I2V.gz"), "rb") as fp:
    confidences_array_df = pickle.load(fp)