In [1]:
import numpy as np
import pandas as pd
# sparse 행렬을 만들어야 하기 때문에 다음과 같이 import
import scipy.sparse as sp

import math
import tqdm as tqdm
import random
from datetime import datetime
from time import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

# from box import Box

import warnings
warnings.filterwarnings('ignore')


from collections import defaultdict
import os

  from .autonotebook import tqdm as notebook_tqdm


# Preprocess

In [115]:
# https://github.com/gusye1234/LightGCN-PyTorch/blob/master/code/dataloader.py
# https://github.com/SeongBeomLEE/RecsysTutorial/blob/main/LightGCN/LightGCN.ipynb
# https://radish-greens.tistory.com/1
## 참고하여 재정의

class Preprocess :
    def __init__(self, data_path, config) : 
        self.data = pd.read_csv(data_path, names=['user_id', 'item_id','rating','timestamp'], skiprows=1)
        self.config = config
        # bpr을 이용하기 때문에 encoder, decoder의 구조가 필요
        ## 유저, 아이템 행렬에 대한 각각의 encoder, decoder가 필요
        self.user_encoder, self.user_decoder, self.num_users = self._encode_user()
        self.item_encoder, self.item_decoder, self.num_items = self._encode_item()

        # 유저와 아이템의 관계에 대한 인접행렬 생산
        #  build a graph in torch.sparse.IntTensor.
        # Details in NGCF's matrix form
        # A = 
        #     |I,   R|
        #     |R^T, I|
        # """
        self.user_item_matrix = self._generate_user_item_matrix()
        self.adjacency_matrix = self._generate_adjacency_matrix() 


        self.exist_users = list(self.user_encoder.values())  # 혹은 다른 방식으로 존재하는 사용자 ID 목록 생성
        self.exist_items = list(self.item_encoder.values())  # 존재하는 아이템 ID 목록
        self.user_train = self._generate_user_train()  # 사용자별 긍정 아이템 목록을 생성하는 메서드 필요

    def _encode_user(self) :
        unique_users = self.data['user_id'].unique()
        user_encoder = {user_id:idx for idx, user_id in enumerate(unique_users)}
        user_decoder = {idx:user_id for user_id, idx in user_encoder.items()}
        return user_encoder,user_decoder,len(unique_users)

    def _encode_item(self) : 
        unique_items = self.data['item_id'].unique()
        item_encoder = {item_id:idx for idx, item_id in enumerate(unique_items)}
        item_decoder = {idx:item_id for item_id, idx in item_encoder.items()}
        
        return item_encoder,item_decoder,len(unique_items)

    def _generate_user_item_matrix(self) :
        # matrix에 들어갈 내용을 정합니다.
        # rows, cols, value로 구성됨 
        rows = self.data['user_id'].map(self.user_encoder)
        cols = self.data['item_id'].map(self.item_encoder)
        values = np.ones(len(self.data))
        user_item_matrix = sp.csr_matrix((values, (rows,cols)), shape=(self.num_users, self.num_items))
        return user_item_matrix

    def _generate_adjacency_matrix(self) : 
        # user에 대한 그래프 + 아이템에 대한 그래프에 대한 인접그래프 생성
        user_item_matrix = self.user_item_matrix
        item_user_matrix = self.user_item_matrix.transpose()

        zero_user_to_user = sp.csr_matrix((self.num_users, self.num_users))
        zero_item_to_item = sp.csr_matrix((self.num_items, self.num_items))

        # 상단 블록 (사용자-사용자 연결 및 사용자-아이템 연결)
        upper_block = sp.hstack([zero_user_to_user, user_item_matrix], format='csr')
        # 하단 블록 (아이템-사용자 연결 및 아이템-아이템 연결)
        lower_block = sp.hstack([item_user_matrix, zero_item_to_item], format='csr')

        adjacency_matrix = sp.vstack([upper_block, lower_block], format='csr')

        return adjacency_matrix

    def _generate_user_train(self) :
        user_train = {}
        for _, row in self.data.iterrows() :
            user_id = self.user_encoder[row.user_id]
            item_id = self.item_encoder[row.item_id]
            if user_id not in user_train :
                user_train[user_id] = []
            user_train[user_id].append(item_id)
        return user_train

    def sampling(self):
        users = random.sample(self.exist_users, self.config['n_batch'])

        def sample_pos_items_for_u(u, num):
            pos_items = self.user_train[u]
            pos_batch = random.sample(pos_items, num)
            return pos_batch

        def sample_neg_items_for_u(u, num):
            neg_items = list(set(self.exist_items) - set(self.user_train[u]))
            neg_batch = random.sample(neg_items, num)
            return neg_batch

        pos_items, neg_items = [], []
        for user in users:
            pos_items += sample_pos_items_for_u(user, 1)
            neg_items += sample_neg_items_for_u(user, 1)

        return users, pos_items, neg_items

In [4]:
data_path = '/home/siyun/ephemeral/lightgcn/LightGCN-PyTorch/data/amazon/amazon.csv'

In [8]:
# data = pd.read_csv(data_path, names=['user_id', 'item_id','rating','timestamp'], skiprows=1)
# data.shape

(24139, 4)

Lightgcn의 구조

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

원저자와의 차이

1. 모델 초기화 차이
- xavier -> normal
2. dropout 처리
- 원저자 : dropout 생략
- 내 구조 : 추가
3. forward
- 원저자 : graph convolution 직접 계산, 모든 layer의 임베딩을 평균 -> 최종 임베딩
- 내 구조 : 간략한 계산
4. loss function
- 원저자 : BPR class 직접 사용
    - 파라미터 업데이트에 사용되는 optimizer 관리
- 내 구조 : BPR 직접계산

In [None]:
class LightGCN(nn.Module):
    def __init__(self, n_users, n_items, emb_dim, n_layers, reg, adj_mtx, device):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.emb_dim = emb_dim
        
        self.l = adj_mtx
        self.graph = self._convert_sp_mat_to_sp_tensor(self.l)

        self.reg = reg
        self.n_layers = n_layers
        self.device = device
        
        self.user_embedding = torch.nn.Embedding(num_embeddings=n_users, embedding_dim=emb_dim).to(device)
        self.item_embedding = torch.nn.Embedding(num_embeddings=n_items, embedding_dim=emb_dim).to(device)
        
        # Initialize embeddings
        torch.nn.init.normal_(self.user_embedding.weight, std=0.1)
        torch.nn.init.normal_(self.item_embedding.weight, std=0.1)

        self.adj_mtx = self._convert_sp_mat_to_sp_tensor(adj_mtx).to(device)
        
    def _convert_sp_mat_to_sp_tensor(self, X):
        coo = X.tocoo().astype(np.float32)
        i = torch.LongTensor(np.mat([coo.row, coo.col]))
        v = torch.FloatTensor(coo.data)
        res = torch.sparse.FloatTensor(i, v, coo.shape).to(device)
        return res
    def forward(self, user, pos_item, neg_item):
        """
        Computes the forward pass
        
        Arguments:
        ---------
        user = user
        pos_item = positive item (user interacted with item)
        neg_item = negative item (user did not interact with item)
        """
        
        all_embeddings = self.compute_embeddings()
        
        u_embeddings = all_embeddings[:self.n_users]
        i_embeddings = all_embeddings[self.n_users:]

        users_emb = u_embeddings[user]
        pos_emb = i_embeddings[pos_item]
        neg_emb = i_embeddings[neg_item]

        pos_scores = torch.sum(users_emb * pos_emb, dim=-1)
        neg_scores = torch.sum(users_emb * neg_emb, dim=-1)

        loss = -torch.mean(F.logsigmoid(pos_scores - neg_scores))

        # Regularization term
        if self.reg > 0:
            reg_term = (1/2)*(users_emb.norm(2).pow(2) + 
                              pos_emb.norm(2).pow(2) + 
                              neg_emb.norm(2).pow(2))/float(len(user))
            loss += self.reg * reg_term

        return loss

    def compute_embeddings(self):
        users_emb = self.user_embedding.weight
        items_emb = self.item_embedding.weight
        all_emb = torch.cat([users_emb, items_emb])

        embs = [all_emb]
        for _ in range(self.n_layers):
            all_emb = torch.sparse.mm(self.adj_mtx, all_emb)
            embs.append(all_emb)

        embs = torch.stack(embs, dim=1)
        light_out = torch.mean(embs, dim=1)
        return light_out

In [65]:
# class LightGCN(nn.Module):
#     # dropout 제거
#     # def __init__(self, n_users, n_items, emb_dim, n_layers, reg, node_dropout, adj_mtx):
#     def __init__(self, n_users, n_items, emb_dim, n_layers, reg, adj_mtx, device):
#         super().__init__()
#         # initialize Class attributes
#         self.n_users = n_users
#         self.n_items = n_items
#         self.emb_dim = emb_dim
        
#         self.l = adj_mtx
#         self.graph = self._convert_sp_mat_to_sp_tensor(self.l)

#         self.reg = reg
#         self.n_layers = n_layers
#         self.device = device
#         # --------------------------------
#         # 제거
#         # self.node_dropout = node_dropout

#         # Initialize weights
#         # self.weight_dict = self._init_weights()
#         # print("Weights initialized.")

#     # # initialize weights
#     # def _init_weights(self):
#     #     print("Initializing weights...")
#     #     weight_dict = nn.ParameterDict()

#     #     # initializer = torch.nn.init.xavier_uniform_
#     #     initializer = torch.nn.init.normal_
        
#     #     weight_dict['user_embedding'] = nn.Parameter(initializer(torch.empty(self.n_users, self.emb_dim).to(device)))
#     #     weight_dict['item_embedding'] = nn.Parameter(initializer(torch.empty(self.n_items, self.emb_dim).to(device)))

#     #     return weight_dict
#     # --------------------------------
#         self.user_embedding = torch.nn.Embedding(num_embeddings=n_users, embedding_dim=emb_dim).to(device)
#         self.item_embedding = torch.nn.Embedding(num_embeddings=n_items, embedding_dim=emb_dim).to(device)
        
#         # Initialize embeddings
#         torch.nn.init.normal_(self.user_embedding.weight, std=0.1)
#         torch.nn.init.normal_(self.item_embedding.weight, std=0.1)

#         self.adj_mtx = self._convert_sp_mat_to_sp_tensor(adj_mtx).to(device)


#     # convert sparse matrix into sparse PyTorch tensor
#     def _convert_sp_mat_to_sp_tensor(self, X):
#         """
#         Convert scipy sparse matrix to PyTorch sparse matrix

#         Arguments:
#         ----------
#         X = Adjacency matrix, scipy sparse matrix
#         """
#         coo = X.tocoo().astype(np.float32)
#         i = torch.LongTensor(np.mat([coo.row, coo.col]))
#         v = torch.FloatTensor(coo.data)
#         res = torch.sparse.FloatTensor(i, v, coo.shape).to(device)
#         return res
#     # 드랍아웃을 사용하지 않는 경우 주석처리
#     # apply node_dropout
#     # def _droupout_sparse(self, X):
#     #     """
#     #     Drop individual locations in X
        
#     #     Arguments:
#     #     ---------
#     #     X = adjacency matrix (PyTorch sparse tensor)
#     #     dropout = fraction of nodes to drop
#     #     noise_shape = number of non non-zero entries of X
#     #     """
#     #     node_dropout_mask = ((self.node_dropout) + torch.rand(X._nnz())).floor().bool().to(device)
#     #     i = X.coalesce().indices()
#     #     v = X.coalesce()._values()
#     #     i[:,node_dropout_mask] = 0
#     #     v[node_dropout_mask] = 0
#     #     X_dropout = torch.sparse.FloatTensor(i, v, X.shape).to(X.device)

#     #     return  X_dropout.mul(1/(1-self.node_dropout))

#     def forward(self, user, pos_item, neg_item):
#         """
#         Computes the forward pass
        
#         Arguments:
#         ---------
#         user = user
#         pos_item = positive item (user interacted with item)
#         neg_item = negative item (user did not interact with item)
#         """
        
#         all_embeddings = self.compute_embeddings()
        
#         u_embeddings = all_embeddings[:self.n_users]
#         i_embeddings = all_embeddings[self.n_users:]

#         users_emb = u_embeddings[user]
#         pos_emb = i_embeddings[pos_item]
#         neg_emb = i_embeddings[neg_item]

#         pos_scores = torch.sum(users_emb * pos_emb, dim=-1)
#         neg_scores = torch.sum(users_emb * neg_emb, dim=-1)

#         loss = -torch.mean(F.logsigmoid(pos_scores - neg_scores))

#         # Regularization term
#         if self.reg > 0:
#             reg_term = (1/2)*(users_emb.norm(2).pow(2) + 
#                               pos_emb.norm(2).pow(2) + 
#                               neg_emb.norm(2).pow(2))/float(len(user))
#             loss += self.reg * reg_term

#         return loss

#     def compute_embeddings(self):
#         users_emb = self.user_embedding.weight
#         items_emb = self.item_embedding.weight
#         all_emb = torch.cat([users_emb, items_emb])

#         embs = [all_emb]
#         for _ in range(self.n_layers):
#             all_emb = torch.sparse.mm(self.adj_mtx, all_emb)
#             embs.append(all_emb)

#         embs = torch.stack(embs, dim=1)
#         light_out = torch.mean(embs, dim=1)
#         return light_out
#         # # apply drop-out mask
#         # graph = self._droupout_sparse(self.graph) if self.node_dropout > 0 else self.graph
#         # ego_embeddings = torch.cat([self.weight_dict['user_embedding'], self.weight_dict['item_embedding']], 0)
#         # final_embeddings = [ego_embeddings]

#         # for k in range(self.n_layers):
#         #     ego_embeddings = torch.sparse.mm(graph, final_embeddings[k])
#         #     final_embeddings.append(ego_embeddings)                                       

#         # final_embeddings = torch.stack(final_embeddings, dim=1)
#         # final_embeddings = torch.mean(final_embeddings, dim=1)
        
#         # u_final_embeddings, i_final_embeddings = final_embeddings.split([self.n_users, self.n_items], 0)

#         # self.u_final_embeddings = nn.Parameter(u_final_embeddings)
#         # self.i_final_embeddings = nn.Parameter(i_final_embeddings)
        
#         # # loss 계산
#         # u_emb = u_final_embeddings[u] # user embeddings
#         # p_emb = i_final_embeddings[i] # positive item embeddings
#         # n_emb = i_final_embeddings[j] # negative item embeddings
        
#         # y_ui = torch.sum(torch.mul(u_emb, p_emb), dim = 1)                        
#         # y_uj = torch.sum(torch.mul(u_emb, n_emb), dim = 1)
        
#         # log_prob = torch.mean(torch.log(torch.sigmoid(y_ui - y_uj))) 
#         # bpr_loss = -log_prob        
#         # if self.reg > 0.:
#         #     l2norm = (torch.sum(u_emb**2)/2. + torch.sum(p_emb**2)/2. + torch.sum(n_emb**2)/2.) / u_emb.shape[0]
#         #     l2reg = self.reg * l2norm
#         #     bpr_loss += l2reg

#         # return bpr_loss


In [122]:
def train(model, make_graph_data_set, optimizer, n_batch, device):
    model.train()
    loss_val = 0
    for _ in tqdm(range(n_batch), desc="Training..."):
        user, pos, neg = make_graph_data_set.sampling()
        user = torch.LongTensor(user).to(device)
        pos = torch.LongTensor(pos).to(device)
        neg = torch.LongTensor(neg).to(device)
        optimizer.zero_grad()
        loss = model(user, pos, neg)
        loss.backward()
        optimizer.step()
        loss_val += loss.item()
    return loss_val / n_batch

def split_matrix(X, n_splits=10):
    splits = []
    chunk_size = X.shape[0] // n_splits
    for i in range(n_splits):
        start = i * chunk_size
        end = X.shape[0] if i == n_splits - 1 else (i + 1) * chunk_size
        splits.append(X[start:end])
    return splits

# def compute_ndcg_k(pred_items, test_items, test_indices, k):

#     r = (test_items * pred_items).gather(1, test_indices)
#     f = torch.from_numpy(np.log2(np.arange(2, k+2))).float().to(device)

#     dcg = (r[:, :k]/f).sum(1)                                               
#     dcg_max = (torch.sort(r, dim=1, descending=True)[0][:, :k]/f).sum(1)   
#     ndcg = dcg/dcg_max                                                     

#     ndcg[torch.isnan(ndcg)] = 0
#     return ndcg

def compute_ndcg_k(pred_items, test_items, k):
    # DCG 계산
    # print(f"test_items : {test_items.shape}")
    topk_preds = pred_items[:, :k]
    # print(f"topk_preds shape : {topk_preds.shape}")
    tp = (test_items * topk_preds).sum(dim=1)
    log_pos = torch.log2(torch.arange(2, k + 2, device=pred_items.device).float())
    dcg = (tp / log_pos).sum(dim=1)
    # IDCG 계산
    ideal_tp = torch.sort(test_items, dim=1, descending=True)[0][:, :k]
    idcg = (ideal_tp / log_pos).sum(dim=1)
    idcg[idcg == 0] = 1  # 0으로 나누는 것을 방지
    ndcg = dcg / idcg
    return ndcg.mean().item()

# def calculate_metrics(test_items, pred_items,test_indices, k, num_items):

#     # ground_truth = torch.gather(test_items, 1, test_indices)
#     ground_truth = torch.zeros(test_items.shape[0], num_items).to(test_items.device)
#     ground_truth.scatter_(1, test_indices, 1)
#     print(pred_items.shape)
#     print(ground_truth.shape)
#     tp = (ground_truth * pred_items).sum(dim=1)
#     precision = tp / k
#     recall = tp / test_items.sum(dim=1)
#     f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
#     accuracy = tp / (pred_items.sum(dim=1) + 1e-8)

#     return recall.mean().item(), precision.mean().item(), f1.mean().item(), accuracy.mean().item()




# def calculate_metrics(test_items, topk_preds):
#     # True Positives
#     TP = (test_items * topk_preds).sum(dim=1)
#     # Recall@k: TP / (TP + FN)
#     recall = TP / test_items.sum(dim=1)
#     # Precision@k: TP / (TP + FP)
#     precision = TP / topk_preds.sum(dim=1)
#     # F1 Score: 2 * (Precision * Recall) / (Precision + Recall)
#     f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
#     f1[torch.isnan(f1)] = 0.0  # handle NaN
#     # Accuracy: TP / Total number of items
#     accuracy = TP / test_items.shape[0]
    
#     return recall.mean().item(), precision.mean().item(), f1.mean().item(), accuracy.mean().item()


# def evaluate(model, u_emb, i_emb, Rtr, Rte, k=10, device=None):
    
#     u_embeddings = all_embeddings[:model.n_users]
#     i_embeddings = all_embeddings[model.n_users:]

#     # split matrices
#     ue_splits = split_matrix(u_emb)
#     tr_splits = split_matrix(Rtr)
#     te_splits = split_matrix(Rte)

#     recall_k, ndcg_k, precision_list, f1_list, accuracy_list = [], [], [], [], []
#     # compute results for split matrices
#     for ue_f, tr_f, te_f in zip(ue_splits, tr_splits, te_splits):
#         scores = torch.mm(ue_f.to(device), i_emb.to(device).t())

#         test_items = torch.from_numpy(te_f.todense()).float().to(device)
#         non_train_items = torch.from_numpy(1-(tr_f.todense())).float().to(device)
#         scores = scores * non_train_items

#         _, test_indices = torch.topk(scores, dim=1, k=k)
        
#         pred_items = torch.zeros_like(scores).float().to(device)
#         pred_items.scatter_(dim=1, index=test_indices, src=torch.ones_like(test_indices).float().to(device))
        
#         topk_preds = torch.zeros_like(scores).float().to(device)
#         topk_preds.scatter_(dim=1, index=test_indices[:, :k], value=1)

#         recall, precision, f1, accuracy = calculate_metrics(test_items, topk_preds)
#         recall_k.append(recall)
#         precision_list.append(precision)
#         f1_list.append(f1)
#         accuracy_list.append(accuracy)

#         ndcg_score = compute_ndcg_k(pred_items, test_items, test_indices, k)
#         ndcg_k.append(ndcg_score.mean().item())

#     # 리스트의 평균을 계산
#     mean_recall = np.mean(recall_k)
#     mean_precision = np.mean(precision_list)
#     mean_f1 = np.mean(f1_list)
#     mean_accuracy = np.mean(accuracy_list)
#     mean_ndcg = np.mean(ndcg_k)

#     return mean_ndcg, mean_recall, mean_precision, mean_f1, mean_accuracy

In [146]:
import torch
from sklearn.metrics import roc_auc_score
import numpy as np

def compute_metrics(pred_items, test_items, k):
    """
    예측된 아이템 점수와 실제 테스트 아이템 간의 Recall, Precision, F1 점수를 계산합니다.

    :param pred_items: 예측된 아이템 점수 (사용자 수 x 아이템 수)
    :param test_items: 실제 테스트 아이템 (사용자 수 x 아이템 수), 1과 0으로 이루어진 행렬
    :param k: 상위 k 개의 아이템을 고려
    :return: recall@k, precision@k, F1@k
    """
    _, topk_indices = torch.topk(pred_items, k=k, dim=1)
    topk_preds = torch.zeros_like(pred_items).float()
    topk_preds.scatter_(1, topk_indices, 1)

    # Recall@k: (TP) / (TP + FN)
    tp = (test_items * topk_preds).sum(1)
    recall = (tp / test_items.sum(1)).mean()

    # Precision@k: (TP) / (TP + FP)
    precision = (tp / k).mean()

    # F1 Score: 2 * (precision * recall) / (precision + recall)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    f1 = torch.where(torch.isnan(f1), torch.zeros_like(f1), f1)  # NaN 값 처리

    return recall.item(), precision.item(), f1.mean().item()
def evaluate(model, Rtr, Rte, k, device):
    model.eval()
    with torch.no_grad():
        all_embeddings = model.compute_embeddings()
        u_embeddings = all_embeddings[:model.n_users]
        i_embeddings = all_embeddings[model.n_users:]

        scores = torch.matmul(u_embeddings, i_embeddings.T)

        # 실제 테스트 데이터와 비교할 수 있도록 변환
        test_items = torch.FloatTensor(Rte.toarray()).to(device)
        
        recall, precision, f1 = compute_metrics(scores, test_items, k)

    return recall, precision, f1


In [127]:
# def evaluate(model, Rtr, Rte, k, device):
#     model.eval()
#     with torch.no_grad():
#         all_embeddings = model.compute_embeddings()
#         u_embeddings = all_embeddings[:model.n_users].to(device)
#         i_embeddings = all_embeddings[model.n_users:].to(device)

#         scores = torch.matmul(u_embeddings, i_embeddings.T)

#         test_items = torch.FloatTensor(Rte.toarray()).to(device)
#         non_train_items = torch.FloatTensor((~Rtr.toarray().astype(bool)).astype(int)).to(device)
#         scores = scores * non_train_items  # Train에 포함되지 않은 아이템에 대해서만 점수를 계산

#         _, topk_indices = torch.topk(scores, k=k, dim=1)
#         topk_preds = torch.zeros(scores.shape).to(device)
#         topk_preds.scatter_(1, topk_indices, 1)

#         # Recall@10 계산
#         hit = (test_items * topk_preds).sum(dim=1)
#         recall_at_k = (hit / test_items.sum(dim=1)).mean()

#         # Precision@10 계산
#         precision_at_k = (hit / k).mean()

#         # F1-Score 계산
#         f1_score = 2 * (precision_at_k * recall_at_k) / (precision_at_k + recall_at_k + 1e-10)

#         # Accuracy 계산
#         accuracy = hit.sum() / (k * test_items.shape[0])

#     return recall_at_k.item(), precision_at_k.item(), f1_score.item(), accuracy.item()


In [126]:
# def evaluate(model, Rtr, Rte, k=10, device=None):
#     model.eval()
#     with torch.no_grad():
#         all_embeddings = model.compute_embeddings()
#         u_embeddings = all_embeddings[:model.n_users].to(device)
#         i_embeddings = all_embeddings[model.n_users:].to(device)

#         # 테스트 데이터에 대한 행렬을 미리 계산
#         test_items_matrix = torch.from_numpy(Rte.toarray()).float().to(device)
#         non_train_items_matrix = torch.from_numpy((~Rtr.toarray().astype(bool)).astype(int)).float().to(device)

#         scores = torch.matmul(u_embeddings, i_embeddings.T)
#         scores = scores * non_train_items_matrix  # Train에 포함되지 않은 아이템에 대해서만 점수를 계산

#         _, test_indices = torch.topk(scores, k=k, dim=1)
#         pred_items = torch.zeros_like(scores).float()
#         pred_items.scatter_(1, test_indices, 1)

#         recall, precision, f1, accuracy = calculate_metrics(test_items_matrix, pred_items)
#         ndcg_score = compute_ndcg_k(pred_items, test_items_matrix, test_indices, k)

#     mean_recall = recall
#     mean_precision = precision
#     mean_f1 = f1
#     mean_accuracy = accuracy
#     mean_ndcg = ndcg_score.mean().item()

#     return mean_ndcg, mean_recall, mean_precision, mean_f1, mean_accuracy


In [147]:
config = {
    "emb_dim": 64,
    "n_layers": 3,
    "reg": 0.00001,
    "node_dropout": 0.1,
    "lr": 0.001,
    "num_epochs": 50,
    "n_batch": 256,  
    "model_path": "./models",
    "model_name": "lightgcn_model.pt",
    'device' : 'device',
    'weight_decay' : 0.0001,
}

In [148]:
from torch.optim import Adam

data_path = '/home/siyun/ephemeral/lightgcn/LightGCN-PyTorch/data/amazon/amazon.csv'  
preprocess = Preprocess(data_path=data_path, config=config)

# 인접 행렬 및 사용자, 아이템 수 등 필요한 정보 가져오기
adjacency_matrix = preprocess.adjacency_matrix
num_users = preprocess.num_users
num_items = preprocess.num_items

model = LightGCN(
    n_users=num_users,
    n_items=num_items,
    emb_dim=config['emb_dim'],  # 임베딩 차원
    n_layers=config['n_layers'],  # GCN 레이어
    reg=config['reg'],  # 정규화 
    # node_dropout=0.1,  # 드롭아웃 
    adj_mtx=adjacency_matrix,  # 인접 행렬
    device = device
)

optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay']) 


In [17]:
# from tqdm import tqdm

In [None]:
# 학습 및 평가
best_f1 = 0
for epoch in range(1, config["num_epochs"] + 1):
    train_loss = train(
        model=model,
        make_graph_data_set=preprocess,
        optimizer=optimizer,
        n_batch=config["n_batch"],
        device=device
    )

    # with torch.no_grad():
    #     mean_ndcg, mean_recall, mean_precision, mean_f1, mean_accuracy = evaluate(
    #     # u_emb=model.u_final_embeddings.detach(),
    #     # i_emb=model.i_final_embeddings.detach(),
    #     Rtr=preprocess.user_item_matrix,
    #     Rte=preprocess.user_item_matrix,
    #     k=10,
    #     device=device
    # )
    recall, precision, f1 = evaluate(model, 
                                       preprocess.user_item_matrix, 
                                       preprocess.user_item_matrix, 
                                       k=10, 
                                       device=device)

    if best_f1 < f1:
        best_f1 = f1
        model_save_path = config["model_path"]
        if not os.path.exists(model_save_path):
            os.mkdir(model_save_path)

        torch.save(model.state_dict(), os.path.join(model_save_path, config["model_name"]))
        print(f"Epoch {epoch}: New best model saved with F1: {best_f1:.4f}")

    print(f"Recall@10: {recall}, Precision@10: {precision}, f1: {f1}")

inference관련

In [199]:
# https://github.com/tm1897/mlg_cs224w_project/tree/main

def load_model(model_path, model, device):
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    return model

def encode_session_items(preprocess, session_items):
    # 새로운 사용자 세션의 아이템을 인코딩
    encoded_items = [preprocess.item_encoder[item] for item in session_items if item in preprocess.item_encoder]
    return encoded_items

def infer_embeddings(model, encoded_session_items, num_users, num_items, device):
    # 모든 사용자와 아이템의 임베딩을 계산
    with torch.no_grad():
        all_embeddings = model.compute_embeddings()
        user_embeddings = all_embeddings[:num_users]
        item_embeddings = all_embeddings[num_users:]
        
        # 새로운 사용자 세션의 아이템 임베딩을 평균내어 사용자 임베딩을 대체
        if encoded_session_items:
            print(f"if encoded_session_items : 1")
            session_item_embeddings = item_embeddings[encoded_session_items].mean(dim=0).unsqueeze(0)
        else:
            # 세션에 아이템이 없는 경우 무작위 사용자 임베딩 사용
            print(f"if not encoded_session_items : 2")
            session_item_embeddings = user_embeddings[torch.randint(0, num_users, (1,))]
    return session_item_embeddings, item_embeddings

def recommend_items(session_user_embedding, item_embeddings, top_k):
    # 사용자 임베딩과 모든 아이템 임베딩 간의 유사도를 계산
    scores = torch.matmul(session_user_embedding, item_embeddings.T)
    top_scores, top_indices = torch.topk(scores, k=top_k, dim=1)
    return top_indices.squeeze().tolist()

In [None]:
data_path = '/home/siyun/ephemeral/lightgcn/LightGCN-PyTorch/data/amazon/'

data = pd.read_csv(data_path + 'amazon.csv')


In [191]:
data[data['user_id'] == 'AZZXCFBNEWIBQ'].item_id.tolist()

['B01GQE9CU2',
 'B018V7WYZM',
 'B01A96Y9RY',
 'B00FXKK44E',
 'B01D3GZUVQ',
 'B012P0TZC6',
 'B018FLQS24',
 'B016QMOA38',
 'B014JPKXW6',
 'B015JBOMYE',
 'B016LPMOW4']

In [167]:
user_item_counts = data.groupby('user_id')['item_id'].count()

# item_id 개수가 가장 많은 user_id 찾기
most_frequent_user_id = user_item_counts.idxmax()

print("가장 많은 item_id를 구매한 user_id:", most_frequent_user_id)

가장 많은 item_id를 구매한 user_id: A3G5KDMFNRUXHB


In [None]:
list(data[data['user_id'] == 'A3G5KDMFNRUXHB'].item_id)

In [164]:
# 모델과 데이터 로드
model_path = config["model_path"] + '/' + config["model_name"]
model = LightGCN(num_users, num_items, config['emb_dim'], config['n_layers'], config['reg'], adjacency_matrix, device)
model = load_model(model_path, model, device)


In [200]:
# 새로운 유저 입력
session_items = ['B01GQE9CU2',
 'B018V7WYZM',
 'B01A96Y9RY',
 'B00FXKK44E',
 'B01D3GZUVQ',
 'B012P0TZC6',
 'B018FLQS24',
 'B016QMOA38',
 'B014JPKXW6',
 'B015JBOMYE',
 'B016LPMOW4']  
encoded_session_items = encode_session_items(preprocess, session_items)

# 사용자와 아이템의 임베딩 계산
session_user_embedding, item_embeddings = infer_embeddings(model, encoded_session_items, num_users, num_items, device)

# 상위 N개의 추천 아이템 생성
top_k = 10  # 추천하고 싶은 아이템의 수
recommended_item_indices = recommend_items(session_user_embedding, item_embeddings, top_k)

# 인코딩된 아이템 인덱스를 실제 아이템 ID로 디코딩
recommended_item_ids = [preprocess.item_decoder[idx] for idx in recommended_item_indices if idx in preprocess.item_decoder]
print("추천 아이템 ID:", recommended_item_ids)

if encoded_session_items : 1
추천 아이템 ID: ['B0014F8TIU', 'B000YFSR5G', 'B0017LD0BM', 'B0017LGD34', 'B000YFSR4W', 'B00I0VHS10', 'B00ZOXFI1E', 'B0017U1KBK', 'B00201ER88', 'B000P0X15G']


In [195]:
user_id_data = data[data['user_id'] == 'AZZXCFBNEWIBQ']['item_id']
user_id_data

24128    B01GQE9CU2
24129    B018V7WYZM
24130    B01A96Y9RY
24131    B00FXKK44E
24132    B01D3GZUVQ
24133    B012P0TZC6
24134    B018FLQS24
24135    B016QMOA38
24136    B014JPKXW6
24137    B015JBOMYE
24138    B016LPMOW4
Name: item_id, dtype: object

In [198]:
# 추천된 아이템 목록에 있는 아이템의 등장 횟수 확인

user_id_data = data[data['user_id'] == 'AZZXCFBNEWIBQ']

count_recommended_items = user_id_data[user_id_data['item_id'].isin(recommended_item_ids)].shape[0]

print("추천된 아이템 목록에 있는 아이템의 수:", count_recommended_items)

추천된 아이템 목록에 있는 아이템의 수: 0
