# 1. 載入套件

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm # 進度條
import os

from gensim.models import Word2Vec
import multiprocessing as mp
print("Number of CPU cores:", mp.cpu_count())

Number of CPU cores: 20


# 2.讀取資料

- TaFeng Dataset
- Dunnhumby Dataset
- Instacart Dataset

In [2]:
# dataset_name = "TaFeng"
# dataset_name = "Dunnhumby"
dataset_name = "Instacart"
df = pd.read_csv("../cleaned_dataset/" + dataset_name + "_clean.csv")
df

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,PRODUCT_ID,CART_ID,NEW_ITEM_ID
0,1,1,196,0,0
1,1,1,14084,0,1
2,1,1,12427,0,2
3,1,1,26088,0,3
4,1,1,26405,0,4
...,...,...,...,...,...
3252971,20620,5,22963,295153,74
3252972,20620,5,12204,295153,1604
3252973,20620,5,21543,295153,3295
3252974,20620,5,23541,295153,417


# 3. Item2Vec方法

## 使用gensim 的 Word2Vec (將window sizes設為很大)

https://github.com/ikatsov/tensor-house/blob/master/recommendations/item2vec.ipynb

In [3]:
# 建立一個cart_list(相當於句子)，其中每個 cart 還包含此購物車中所有item ID(相當於單詞) list。
def make_item_corpus(df_list):
    print("建立項目語料庫中...")
    item_list = []
    cart_list = []
    new_cart_id = df_list[0][0]
    for (CART_ID, ITEM_ID) in tqdm(df_list):
        if new_cart_id != CART_ID:
            cart_list.append(item_list)
            item_list = []
            new_cart_id = CART_ID
        item_list.append(ITEM_ID)
    cart_list.append
    print(cart_list[:10])
    return cart_list

In [4]:
def word2vec(cart_list, DATASET_NAME, TRAIN_ITEM_MODEL):
    # TRAIN_ITEM_MODEL: True - create a new model, False - Load a previously created model
    LOGGING_ENABLED = True
    MODEL_DIR = "../preprocessing-data/item2vec_models"
    
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)
        
    if LOGGING_ENABLED:
        import logging
        logging.basicConfig(format="%(levelname)s - %(asctime)s : %(message)s", datefmt="%H:%M:%S", level=logging.INFO)
    
    # 設定嵌入維度
    WORD_DIM = 16
    model_filename = f"../preprocessing-data/item2vec_models/item2vec_{DATASET_NAME}.{WORD_DIM}d.model"
    if TRAIN_ITEM_MODEL:
        model = Word2Vec(sentences = cart_list, 
                         vector_size = WORD_DIM,
                         window = 500, 
                         sg = 1,
                         hs = 0, 
                         negative = 5,
                         ns_exponent = 0.75,
                         workers = 4,
                         min_count = 1)
        model.save(model_filename)
        print(f"Model saved to [{model_filename}]")
        
    else:
        model = Word2Vec.load(model_filename)
        print(f"Model loaded from[{model_filename}]")
    return model

**1.TaFeng Dataset\
2.Dunnhumby Dataset\
3.Instacart Dataset**

In [5]:
# 購物車, 新項目id, 串列
df_list = df[["CART_ID", "NEW_ITEM_ID"]].values.tolist()
df_list[:10]

[[0, 0],
 [0, 1],
 [0, 2],
 [0, 3],
 [0, 4],
 [1, 0],
 [1, 5],
 [1, 2],
 [1, 6],
 [1, 3]]

In [6]:
cart_list = make_item_corpus(df_list)
model = word2vec(cart_list, dataset_name, True) 

建立項目語料庫中...


  0%|          | 0/3252976 [00:00<?, ?it/s]

INFO - 22:33:42 : collecting all words and their counts


[[0, 1, 2, 3, 4], [0, 5, 2, 6, 3, 7], [0, 2, 5, 8, 9], [0, 2, 5, 8, 4], [0, 2, 5, 8, 10, 11, 12, 6], [0, 2, 5, 8], [0, 5, 2, 8, 7], [2, 0, 5, 8, 13, 14], [14, 13, 8, 0, 5, 2], [0, 13, 15, 16, 8, 5, 17, 7, 2]]


INFO - 22:33:42 : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 22:33:42 : PROGRESS: at sentence #10000, processed 109482 words, keeping 12258 word types
INFO - 22:33:42 : PROGRESS: at sentence #20000, processed 216112 words, keeping 16480 word types
INFO - 22:33:42 : PROGRESS: at sentence #30000, processed 325883 words, keeping 18874 word types
INFO - 22:33:43 : PROGRESS: at sentence #40000, processed 439103 words, keeping 20694 word types
INFO - 22:33:43 : PROGRESS: at sentence #50000, processed 543593 words, keeping 21941 word types
INFO - 22:33:43 : PROGRESS: at sentence #60000, processed 654215 words, keeping 22896 word types
INFO - 22:33:43 : PROGRESS: at sentence #70000, processed 764399 words, keeping 23691 word types
INFO - 22:33:43 : PROGRESS: at sentence #80000, processed 878877 words, keeping 24279 word types
INFO - 22:33:43 : PROGRESS: at sentence #90000, processed 986768 words, keeping 24747 word types
INFO - 22:33:43 : PROGRESS: at sentence #10

INFO - 22:34:13 : EPOCH 2: training on 3252957 raw words (3124177 effective words) took 10.7s, 292923 effective words/s
INFO - 22:34:14 : EPOCH 3 - PROGRESS: at 9.32% examples, 285931 words/s, in_qsize 7, out_qsize 0
INFO - 22:34:15 : EPOCH 3 - PROGRESS: at 19.28% examples, 293732 words/s, in_qsize 7, out_qsize 0
INFO - 22:34:16 : EPOCH 3 - PROGRESS: at 28.34% examples, 285694 words/s, in_qsize 7, out_qsize 0
INFO - 22:34:17 : EPOCH 3 - PROGRESS: at 38.44% examples, 289282 words/s, in_qsize 8, out_qsize 0
INFO - 22:34:18 : EPOCH 3 - PROGRESS: at 47.70% examples, 288371 words/s, in_qsize 7, out_qsize 0
INFO - 22:34:19 : EPOCH 3 - PROGRESS: at 56.83% examples, 288221 words/s, in_qsize 7, out_qsize 0
INFO - 22:34:20 : EPOCH 3 - PROGRESS: at 65.57% examples, 286719 words/s, in_qsize 7, out_qsize 0
INFO - 22:34:21 : EPOCH 3 - PROGRESS: at 74.30% examples, 285474 words/s, in_qsize 7, out_qsize 0
INFO - 22:34:22 : EPOCH 3 - PROGRESS: at 83.61% examples, 285419 words/s, in_qsize 7, out_qsize 0

Model saved to [../preprocessing-data/item2vec_models/item2vec_Instacart.16d.model]


In [7]:
# Prepare the embeddings
word_vectors = model.wv
vocab = list(model.wv.key_to_index.keys())
item2vector_dict = {arg:model.wv[arg] for arg in vocab}
X = pd.DataFrame(item2vector_dict).T.values
X.shape, len(vocab), vocab[0]

((27095, 16), 27095, 44)

## 檢查

In [8]:
model.wv.key_to_index

{44: 0,
 6: 1,
 149: 2,
 141: 3,
 45: 4,
 20: 5,
 402: 6,
 29: 7,
 31: 8,
 266: 9,
 18: 10,
 501: 11,
 1001: 12,
 209: 13,
 151: 14,
 216: 15,
 959: 16,
 486: 17,
 196: 18,
 236: 19,
 247: 20,
 225: 21,
 456: 22,
 30: 23,
 159: 24,
 283: 25,
 111: 26,
 190: 27,
 42: 28,
 1275: 29,
 308: 30,
 28: 31,
 184: 32,
 14: 33,
 362: 34,
 273: 35,
 521: 36,
 396: 37,
 182: 38,
 679: 39,
 185: 40,
 1699: 41,
 404: 42,
 158: 43,
 1610: 44,
 110: 45,
 544: 46,
 17: 47,
 410: 48,
 636: 49,
 257: 50,
 163: 51,
 1289: 52,
 115: 53,
 2914: 54,
 1805: 55,
 393: 56,
 681: 57,
 121: 58,
 1538: 59,
 206: 60,
 84: 61,
 2158: 62,
 945: 63,
 113: 64,
 405: 65,
 214: 66,
 204: 67,
 443: 68,
 518: 69,
 1827: 70,
 2313: 71,
 373: 72,
 264: 73,
 27: 74,
 1562: 75,
 202: 76,
 290: 77,
 520: 78,
 1248: 79,
 437: 80,
 267: 81,
 1601: 82,
 698: 83,
 3150: 84,
 296: 85,
 98: 86,
 1674: 87,
 1298: 88,
 36: 89,
 120: 90,
 292: 91,
 298: 92,
 50: 93,
 12: 94,
 371: 95,
 401: 96,
 1771: 97,
 515: 98,
 364: 99,
 242: 100,
