# 下載套件

In [2]:
import torch
import torch.nn as nn
from torch import nn, matmul, softmax
from torch.nn.init import xavier_uniform_
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable

import numpy as np
import pandas as pd
import gensim
from gensim.models import KeyedVectors
import pickle
import gzip 
import gc
import random

In [3]:
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import random_split

from tqdm import tqdm
from collections import Counter

import math

# 參數設置

In [4]:
# Hyperparameters
EPOCHS = 50
LEARNING_RATE = 0.0001
BATCH_SIZE = 8                                # batch size passed to every DataLoader below
BETA = 0.5
ALPHA = 0.01
EMBEDDING_DIMENSION = 32                      # embedding dimension; also selects which Item2Vec model file is loaded
MODEL_DIMENSION = EMBEDDING_DIMENSION         # model dimension (kept equal to the embedding dimension)
HIDDEN_DIMENSION = 128                        # hidden-layer dimension (original note said "NLP"; presumably MLP -- confirm)
NUM_HEAD = 4
NUM_LAYER = 4

isI2V = 1        # 1: use Item2Vec     # 0: do not use Item2Vec (read by collate_batch)

# 匯入檔案
- item2vec_TaFeng.32d.model
- TaFeng_user_cart_Itemid_list.gz
- TaFeng_confidences_array.gz
- ta_feng_clean.csv

In [5]:
# Load the pre-trained Item2Vec (word2vec-style) model for TaFeng; the file
# name is keyed by EMBEDDING_DIMENSION.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
model_filename = f"../preprocessing-data/item2vec_models/item2vec_TaFeng.{EMBEDDING_DIMENSION}d.model"
with open(model_filename, "rb") as model_file:
    model = pickle.load(model_file)

# Turn the learned item vectors into a float tensor and display its shape.
item_vectors = model.wv.vectors
weights = torch.FloatTensor(item_vectors)
weights.shape

torch.Size([15764, 32])

In [6]:
# TaFeng user cart list. Per the original note each record holds: user id,
# time gaps between baskets (not used), and the user's list of baskets (each
# basket is a list of item ids). The preview below also shows a fourth field
# that looks like the per-basket item counts -- confirm against preprocessing.
with gzip.open("../preprocessing-data/TaFeng_user_cart_Itemid_list.gz", "rb") as cart_file:
    user_cart_itemid_list = pickle.load(cart_file)

# Preview the first two user records.
user_cart_itemid_list[:2]

[(1113,
  array([0., 1.]),
  [[0, 1, 2], [3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15]],
  [3, 6, 7]),
 (5241,
  array([0.        , 0.70967742, 1.        ]),
  [[16, 17, 18, 19, 20, 21],
   [22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
   [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
   [48, 49, 50, 51, 52]],
  [6, 10, 16, 5])]

In [7]:
# Read the pickled, gzip-compressed TaFeng confidences array from disk.
with gzip.open("../preprocessing-data/confidences/TaFeng_confidences_array.gz", "rb") as confidences_file:
    TaFeng_confidences_array = pickle.load(confidences_file)

In [8]:
# Ta Feng transactions (cleaned CSV).
TaFeng = pd.read_csv("../cleaned_dataset/ta_feng_clean.csv")

# Largest number of distinct carts held by any single customer.
carts_per_customer = TaFeng.groupby('CUSTOMER_ID')['CART_ID'].nunique()
max_cart_count = carts_per_customer.max()
print(max_cart_count)

TaFeng

72


Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,NEW_ITEM_ID
0,1113,4902105011621,2000-11-26,0,0
1,1113,7616100830794,2000-11-26,0,1
2,1113,4710892632017,2000-11-26,0,2
3,1113,4710905340113,2000-11-27,1,3
4,1113,4717362901277,2000-11-27,1,4
...,...,...,...,...,...
533054,20002000,4710339772139,2001-01-20,62360,4546
533055,20002000,20513184,2001-01-20,62360,1351
533056,20002000,4714800731229,2001-01-20,62360,2946
533057,20002000,4714541091071,2001-01-20,62360,7382


# 切分資料集
- 分成輸入資料與標籤資料
- 訓練集:驗證集:測試集 = 7:1:2

In [9]:
# Split the user records into train / validation / test subsets (7 : 1 : 2).
# A fixed seed makes the partition identical after every kernel restart;
# without it each run trains and evaluates on a different split.
SPLIT_SEED = 42
train_set_size = int(len(user_cart_itemid_list) * 0.7)
valid_set_size = int(len(user_cart_itemid_list) * 0.1)
# The test split takes the remainder so the three sizes always sum to the total.
test_set_size = len(user_cart_itemid_list) - train_set_size - valid_set_size
train_set, valid_set, test_set = random_split(
    user_cart_itemid_list,
    [train_set_size, valid_set_size, test_set_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print(len(train_set))
print(len(valid_set))
print(len(test_set))

7457
1065
2132


In [10]:
# 將切割好的資料集暫存起來

# # 訓練集
# filepath = "../preprocessing-data/TaFeng_dataset/train_set.pkl"
# with open(filepath, "wb") as f:
#     pickle.dump(train_set, f, pickle.HIGHEST_PROTOCOL)
# # 驗證集
# filepath = "../preprocessing-data/TaFeng_dataset/valid_set.pkl"
# with open(filepath, "wb") as f:
#     pickle.dump(valid_set, f, pickle.HIGHEST_PROTOCOL)
# # 測試集
# filepath = "../preprocessing-data/TaFeng_dataset/test_set.pkl"
# with open(filepath, "wb") as f:
#     pickle.dump(test_set, f, pickle.HIGHEST_PROTOCOL)

In [11]:
# 讀取之前暫存的資料集

# # 載入訓練、驗證、測試集
# with open("../preprocessing-data/TaFeng_dataset/train_set.pkl", "rb") as fp:
#     train_set = pickle.load(fp)
# with open("../preprocessing-data/TaFeng_dataset/valid_set.pkl", "rb") as fp:
#     valid_set = pickle.load(fp)
# with open("../preprocessing-data/TaFeng_dataset/test_set.pkl", "rb") as fp:
#     test_set = pickle.load(fp)

# Batch

In [12]:
# Run on the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [13]:
# Data-conversion helper (used inside collate_batch).
def item_index_pipeline(x):
    """Replace every item key in every basket by its Item2Vec vocabulary index.

    Only needed when Item2Vec embeddings are used (``isI2V`` is non-zero).

    Args:
        x: sequence of baskets; each basket is a sequence of item keys that
            exist in ``model.wv.key_to_index``.

    Returns:
        A list of lists with the same nesting as ``x`` in which each key has
        been replaced by ``model.wv.key_to_index[key]``.

    Raises:
        Whatever the ``key_to_index`` lookup raises for an unknown key
        (``KeyError`` if it is a plain dict).
    """
    # `model` is the notebook-level Item2Vec model; resolve the mapping once
    # per call instead of once per item.
    key_to_index = model.wv.key_to_index
    return [[key_to_index[item] for item in basket] for basket in x]

In [14]:
class TensorDataset(Dataset):
    """Minimal ``Dataset`` wrapper around an indexable, sized collection.

    Subclasses ``Dataset`` and overrides ``__init__``, ``__getitem__`` and
    ``__len__`` so the wrapped data can be fetched by index and measured
    with ``len``.
    """

    def __init__(self, data_tensor):
        # Keep a reference to the wrapped collection; nothing is converted here.
        self.data_tensor = data_tensor

    def __len__(self):
        """Return the number of samples in the wrapped collection."""
        return len(self.data_tensor)

    def __getitem__(self, index):
        """Return the sample stored at ``index`` in the wrapped collection."""
        return self.data_tensor[index]

def collate_batch(batch):
    """Collate user records into per-user inputs and last-basket labels.

    Each record is indexed positionally: ``[0]`` user id, ``[2]`` list of
    baskets (lists of item ids), ``[3]`` list of basket sizes. The last
    basket of every user is the label; all earlier baskets are the input.

    Returns:
        A 6-tuple ``(userID, input_list, label_list, input_size_list,
        label_size_list, offsets)`` where
        - ``userID`` lists the user ids,
        - ``input_list`` holds each user's baskets except the last one
          (mapped through ``item_index_pipeline`` unless ``isI2V == 0``),
        - ``label_list`` holds the last basket of each user as a tensor,
        - ``input_size_list`` holds the sizes of the input baskets,
        - ``label_size_list`` holds the size of the last basket as a tensor,
        - ``offsets`` starts with 0 followed by each user's number of input
          baskets (the counts are not accumulated).
    """
    userID, input_list, label_list = [], [], []
    input_size_list, label_size_list = [], []
    offsets = [0]

    for record in batch:
        baskets, basket_sizes = record[2], record[3]

        userID.append(record[0])
        # The final basket (and its size) is the prediction target.
        label_list.append(torch.tensor(baskets[-1]))
        label_size_list.append(torch.tensor(basket_sizes[-1]))

        # Everything before the final basket is the model input.
        history = baskets[:-1]
        if isI2V != 0:
            # Item2Vec embedding in use: translate item ids to vocabulary indices.
            history = item_index_pipeline(history)
        input_size_list.append(basket_sizes[:-1])

        input_list.append(history)
        offsets.append(len(history))

    return userID, input_list, label_list, input_size_list, label_size_list, offsets

In [15]:
# Wrap each split in the indexable TensorDataset defined above.
split_train_, split_valid_, split_test_ = (
    TensorDataset(subset) for subset in (train_set, valid_set, test_set)
)

In [16]:
# DataLoaders: all three splits share the same batching options
# (shuffled, custom collate function, incomplete final batch dropped).
loader_options = dict(
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch,
    drop_last=True,
)
train_dataloader = DataLoader(split_train_, **loader_options)
valid_dataloader = DataLoader(split_valid_, **loader_options)
test_dataloader = DataLoader(split_test_, **loader_options)

# MLP層

In [None]:
class SizeEmbedding(nn.Module):
    """Linear projection intended for embedding basket sizes.

    The original definition inherited from the misspelled ``nn.Moudle`` and
    called ``super`` with a class name that does not exist, so it could not
    be created. The constructor arguments were unused; a single linear layer
    is assumed to be the intent -- TODO confirm.

    Args:
        input_size: number of input features per element.
        output_size: number of output features per element.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Apply the linear projection to ``x`` (last dimension ``input_size``)."""
        return self.linear(x)

In [None]:
class MyModel01(nn.Module):
    """Work-in-progress model; at this stage it only embeds basket sizes.

    The original cell did not parse (``self.size_embed =`` had no right-hand
    side) and its ``forward`` looped over the global ``test_dataloader``
    while using the undefined names ``i`` and ``self.linear_transform``.
    This version keeps the signatures and implements the one step the draft
    showed: projecting every basket size through a linear layer.

    Args:
        embed_dim: output width of the size embedding.
        model_dim: stored for later layers (not used yet).
        hidden_dim: stored for later layers (not used yet).
        items_count: stored for later layers (not used yet).
    """

    def __init__(self, embed_dim, model_dim, hidden_dim, items_count):
        super(MyModel01, self).__init__()
        self.embed_dim = embed_dim
        self.model_dim = model_dim
        self.hidden_dim = hidden_dim
        self.items_count = items_count
        # Each basket size is one scalar, hence a single input feature.
        self.size_embed = nn.Linear(1, embed_dim)

    def forward(self, basket_input, size_input, lengths):
        """Embed the basket sizes of every user in the batch.

        Args:
            basket_input: per-user basket lists (not used yet -- TODO).
            size_input: per-user sequences of basket sizes.
            lengths: per-user basket counts (not used yet -- TODO).

        Returns:
            A list with one tensor per user, shaped
            ``(number_of_baskets, embed_dim)``.
        """
        weight = self.size_embed.weight
        size_embeddings = []
        for sizes in size_input:
            # Build a (n_baskets, 1) column on the layer's own device and dtype;
            # reshape keeps an empty history valid as a (0, 1) tensor.
            column = torch.as_tensor(
                sizes, dtype=weight.dtype, device=weight.device
            ).reshape(-1, 1)
            size_embeddings.append(self.size_embed(column))
        return size_embeddings

In [19]:
# Inspect the per-user basket-size lists produced by collate_batch.
# The previous body referenced `self` and `i`, neither of which exists at
# notebook top level, so the cell raised NameError. The recorded output of
# this cell shows the raw size lists per batch, so print those.
for batch_idx, (userID, basket_input, basket_label, size_input, size_label, offsets) in enumerate(tqdm(test_dataloader)):
    print(size_input)

100%|██████████| 266/266 [00:00<00:00, 5782.58it/s]

[[12, 18], [8, 4, 3, 16, 7, 6], [14, 5, 8, 5, 4, 3], [5, 7, 4, 3, 5, 4], [5, 6, 5, 9, 3, 4, 9, 5, 12, 3, 5, 7, 6], [3, 11, 10, 8], [3, 15, 4, 8, 8, 24, 10, 12], [18, 7, 3]]
[[8, 13, 9, 27], [3, 5], [4, 8, 6, 8, 13, 6, 7, 5], [3, 4, 6, 6, 4], [7, 22, 6, 4], [7, 4], [16, 22], [3, 9]]
[[3, 5, 5], [19, 11, 10, 4, 5], [5, 10], [10, 7, 4, 7, 4, 9, 3, 4, 5, 7, 8, 10, 3, 4, 15, 7, 7], [4, 6, 4, 4, 4, 8], [11, 11], [9, 6, 11, 8, 20, 6], [3, 9, 8, 4, 5, 3]]
[[3, 7, 10, 17, 16], [6, 10], [5, 8], [5, 3, 6, 4, 6], [9, 17], [6, 17], [5, 4, 5, 8], [8, 7]]
[[8, 26, 5, 14, 4, 26, 5, 18, 4, 16, 16, 4, 21, 6, 23, 16, 3, 30, 8, 6, 12, 9], [15, 8, 4, 11], [15, 14, 19], [15, 3], [8, 10, 6, 7, 10, 11, 3, 4], [4, 3, 10], [25, 19, 22, 13, 15, 11, 13, 31, 12, 13, 4, 6, 10, 33, 4], [5, 10]]
[[31, 26], [5, 7, 5, 10, 5], [11, 11, 17, 15, 11, 4, 8], [7, 5, 12, 7], [4, 3, 6, 3], [26, 8, 8, 11, 8], [16, 40, 3], [3, 3, 5, 4, 4, 5, 3, 5, 3, 3]]
[[27, 10], [14, 12, 3], [9, 3, 4, 4, 7], [16, 9, 3, 4], [5, 7, 4, 6, 6], [6


