In [1]:
import numpy as np
import pandas as pd
# sparse 행렬을 만들어야 하기 때문에 다음과 같이 import
import scipy.sparse as sp

import math
from tqdm import tqdm
import random
from datetime import datetime
from time import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

# from box import Box

import warnings
warnings.filterwarnings('ignore')

from collections import defaultdict
import os

  from .autonotebook import tqdm as notebook_tqdm


# 데이터 전처리
- whole data -> filtering

# Preprocess
- 모델에 넣기 위한 dataset 정리

In [2]:
# https://github.com/gusye1234/LightGCN-PyTorch/blob/master/code/dataloader.py
# https://github.com/SeongBeomLEE/RecsysTutorial/blob/main/LightGCN/LightGCN.ipynb
# https://radish-greens.tistory.com/1
## 참고하여 재정의

class Preprocess :
    def __init__(self, data_path, config) : 
        self.data = pd.read_csv(data_path, usecols=['user_id', 'product_id','rating','unixReviewTime'], header=0)
        # self.data = self.data[['user_id', 'product_id','interaction','timestamp']]
        self.config = config
        # bpr을 이용하기 때문에 encoder, decoder의 구조가 필요
        ## 유저, 아이템 행렬에 대한 각각의 encoder, decoder가 필요
        self.user_encoder, self.user_decoder, self.num_users = self._encode_user()
        self.item_encoder, self.item_decoder, self.num_items = self._encode_item()

        # 유저와 아이템의 관계에 대한 인접행렬 생산
        #  build a graph in torch.sparse.IntTensor.
        # Details in NGCF's matrix form
        # A = 
        #     |I,   R|
        #     |R^T, I|
        # """
        self.user_item_matrix = self._generate_user_item_matrix()
        self.adjacency_matrix = self._generate_adjacency_matrix() 


        self.exist_users = list(self.user_encoder.values())  # 혹은 다른 방식으로 존재하는 사용자 ID 목록 생성
        self.exist_items = list(self.item_encoder.values())  # 존재하는 아이템 ID 목록
        self.user_train = self._generate_user_train()  # 사용자별 긍정 아이템 목록을 생성하는 메서드 필요

    def _encode_user(self) :
        unique_users = self.data['user_id'].unique()
        user_encoder = {user_id:idx for idx, user_id in enumerate(unique_users)}
        user_decoder = {idx:user_id for user_id, idx in user_encoder.items()}
        return user_encoder,user_decoder,len(unique_users)

    def _encode_item(self) : 
        unique_items = self.data['product_id'].unique()
        item_encoder = {item_id:idx for idx, item_id in enumerate(unique_items)}
        item_decoder = {idx:item_id for item_id, idx in item_encoder.items()}
        
        return item_encoder,item_decoder,len(unique_items)

    def _generate_user_item_matrix(self) :
        # matrix에 들어갈 내용을 정합니다.
        # rows, cols, value로 구성됨 
        rows = self.data['user_id'].map(self.user_encoder)
        cols = self.data['product_id'].map(self.item_encoder)
        values = np.ones(len(self.data))
        user_item_matrix = sp.csr_matrix((values, (rows,cols)), shape=(self.num_users, self.num_items))
        return user_item_matrix

    def _generate_adjacency_matrix(self) : 
        # user에 대한 그래프 + 아이템에 대한 그래프에 대한 인접그래프 생성
        user_item_matrix = self.user_item_matrix
        item_user_matrix = self.user_item_matrix.transpose()

        zero_user_to_user = sp.csr_matrix((self.num_users, self.num_users))
        zero_item_to_item = sp.csr_matrix((self.num_items, self.num_items))

        # 상단 블록 (사용자-사용자 연결 및 사용자-아이템 연결)
        upper_block = sp.hstack([zero_user_to_user, user_item_matrix], format='csr')
        # 하단 블록 (아이템-사용자 연결 및 아이템-아이템 연결)
        lower_block = sp.hstack([item_user_matrix, zero_item_to_item], format='csr')

        adjacency_matrix = sp.vstack([upper_block, lower_block], format='csr')

        return adjacency_matrix

    def _generate_user_train(self) :
        user_train = {}
        for _, row in self.data.iterrows() :
            user_id = self.user_encoder[row.user_id]
            item_id = self.item_encoder[row.product_id]
            if user_id not in user_train :
                user_train[user_id] = []
            user_train[user_id].append(item_id)
        return user_train

    def sampling(self):
        users = random.sample(self.exist_users, self.config['n_batch'])

        def sample_pos_items_for_u(u, num):
            pos_items = self.user_train[u]
            pos_batch = random.sample(pos_items, num)
            return pos_batch

        def sample_neg_items_for_u(u, num):
            neg_items = list(set(self.exist_items) - set(self.user_train[u]))
            neg_batch = random.sample(neg_items, num)
            return neg_batch

        pos_items, neg_items = [], []
        for user in users:
            pos_items += sample_pos_items_for_u(user, 1)
            neg_items += sample_neg_items_for_u(user, 1)

        return users, pos_items, neg_items

Lightgcn의 구조

원저자와의 차이

1. 모델 초기화 차이
- xavier -> normal
2. dropout 처리
- 원저자 : dropout 생략
- 내 구조 : 추가
3. forward
- 원저자 : graph convolution 직접 계산, 모든 layer의 임베딩을 평균 -> 최종 임베딩
- 내 구조 : 간략한 계산
4. loss function
- 원저자 : BPR class 직접 사용
    - 파라미터 업데이트에 사용되는 optimizer 관리
- 내 구조 : BPR 직접계산

In [3]:
class LightGCN(nn.Module):
    def __init__(self, n_users, n_items, emb_dim, n_layers, reg, adj_mtx, device):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.emb_dim = emb_dim

        self.l = adj_mtx
        self.graph = self._convert_sp_mat_to_sp_tensor(self.l)

        self.reg = reg
        self.n_layers = n_layers
        self.device = device
        
        self.user_embedding = torch.nn.Embedding(num_embeddings=n_users, embedding_dim=emb_dim).to(device)
        self.item_embedding = torch.nn.Embedding(num_embeddings=n_items, embedding_dim=emb_dim).to(device)

        # Initialize embeddings
        torch.nn.init.normal_(self.user_embedding.weight, std=0.1)
        torch.nn.init.normal_(self.item_embedding.weight, std=0.1)

        self.adj_mtx = self._convert_sp_mat_to_sp_tensor(adj_mtx).to(device)

    def _convert_sp_mat_to_sp_tensor(self, X):
        coo = X.tocoo().astype(np.float32)
        i = torch.LongTensor(np.mat([coo.row, coo.col]))
        v = torch.FloatTensor(coo.data)
        res = torch.sparse.FloatTensor(i, v, coo.shape).to(device)
        return res
    def forward(self, user, pos_item, neg_item):
        """
        Computes the forward pass
        
        Arguments:
        ---------
        user = user
        pos_item = positive item (user interacted with item)
        neg_item = negative item (user did not interact with item)
        """

        all_embeddings = self.compute_embeddings()

        u_embeddings = all_embeddings[:self.n_users]
        i_embeddings = all_embeddings[self.n_users:]

        users_emb = u_embeddings[user]
        pos_emb = i_embeddings[pos_item]
        neg_emb = i_embeddings[neg_item]

        pos_scores = torch.sum(users_emb * pos_emb, dim=-1)
        neg_scores = torch.sum(users_emb * neg_emb, dim=-1)

        loss = -torch.mean(F.logsigmoid(pos_scores - neg_scores))

        # Regularization term
        if self.reg > 0:
            reg_term = (1/2)*(users_emb.norm(2).pow(2) + 
                              pos_emb.norm(2).pow(2) + 
                              neg_emb.norm(2).pow(2))/float(len(user))
            loss += self.reg * reg_term

        return loss

    def compute_embeddings(self):
        users_emb = self.user_embedding.weight
        items_emb = self.item_embedding.weight
        all_emb = torch.cat([users_emb, items_emb])

        embs = [all_emb]
        for _ in range(self.n_layers):
            all_emb = torch.sparse.mm(self.adj_mtx, all_emb)
            embs.append(all_emb)

        embs = torch.stack(embs, dim=1)
        light_out = torch.mean(embs, dim=1)
        return light_out

In [65]:
# class LightGCN(nn.Module):
#     # dropout 제거
#     # def __init__(self, n_users, n_items, emb_dim, n_layers, reg, node_dropout, adj_mtx):
#     def __init__(self, n_users, n_items, emb_dim, n_layers, reg, adj_mtx, device):
#         super().__init__()
#         # initialize Class attributes
#         self.n_users = n_users
#         self.n_items = n_items
#         self.emb_dim = emb_dim
        
#         self.l = adj_mtx
#         self.graph = self._convert_sp_mat_to_sp_tensor(self.l)

#         self.reg = reg
#         self.n_layers = n_layers
#         self.device = device
#         # --------------------------------
#         # 제거
#         # self.node_dropout = node_dropout

#         # Initialize weights
#         # self.weight_dict = self._init_weights()
#         # print("Weights initialized.")

#     # # initialize weights
#     # def _init_weights(self):
#     #     print("Initializing weights...")
#     #     weight_dict = nn.ParameterDict()

#     #     # initializer = torch.nn.init.xavier_uniform_
#     #     initializer = torch.nn.init.normal_
        
#     #     weight_dict['user_embedding'] = nn.Parameter(initializer(torch.empty(self.n_users, self.emb_dim).to(device)))
#     #     weight_dict['item_embedding'] = nn.Parameter(initializer(torch.empty(self.n_items, self.emb_dim).to(device)))

#     #     return weight_dict
#     # --------------------------------
#         self.user_embedding = torch.nn.Embedding(num_embeddings=n_users, embedding_dim=emb_dim).to(device)
#         self.item_embedding = torch.nn.Embedding(num_embeddings=n_items, embedding_dim=emb_dim).to(device)
        
#         # Initialize embeddings
#         torch.nn.init.normal_(self.user_embedding.weight, std=0.1)
#         torch.nn.init.normal_(self.item_embedding.weight, std=0.1)

#         self.adj_mtx = self._convert_sp_mat_to_sp_tensor(adj_mtx).to(device)


#     # convert sparse matrix into sparse PyTorch tensor
#     def _convert_sp_mat_to_sp_tensor(self, X):
#         """
#         Convert scipy sparse matrix to PyTorch sparse matrix

#         Arguments:
#         ----------
#         X = Adjacency matrix, scipy sparse matrix
#         """
#         coo = X.tocoo().astype(np.float32)
#         i = torch.LongTensor(np.mat([coo.row, coo.col]))
#         v = torch.FloatTensor(coo.data)
#         res = torch.sparse.FloatTensor(i, v, coo.shape).to(device)
#         return res
#     # 드랍아웃을 사용하지 않는 경우 주석처리
#     # apply node_dropout
#     # def _droupout_sparse(self, X):
#     #     """
#     #     Drop individual locations in X
        
#     #     Arguments:
#     #     ---------
#     #     X = adjacency matrix (PyTorch sparse tensor)
#     #     dropout = fraction of nodes to drop
#     #     noise_shape = number of non non-zero entries of X
#     #     """
#     #     node_dropout_mask = ((self.node_dropout) + torch.rand(X._nnz())).floor().bool().to(device)
#     #     i = X.coalesce().indices()
#     #     v = X.coalesce()._values()
#     #     i[:,node_dropout_mask] = 0
#     #     v[node_dropout_mask] = 0
#     #     X_dropout = torch.sparse.FloatTensor(i, v, X.shape).to(X.device)

#     #     return  X_dropout.mul(1/(1-self.node_dropout))

#     def forward(self, user, pos_item, neg_item):
#         """
#         Computes the forward pass
        
#         Arguments:
#         ---------
#         user = user
#         pos_item = positive item (user interacted with item)
#         neg_item = negative item (user did not interact with item)
#         """
        
#         all_embeddings = self.compute_embeddings()
        
#         u_embeddings = all_embeddings[:self.n_users]
#         i_embeddings = all_embeddings[self.n_users:]

#         users_emb = u_embeddings[user]
#         pos_emb = i_embeddings[pos_item]
#         neg_emb = i_embeddings[neg_item]

#         pos_scores = torch.sum(users_emb * pos_emb, dim=-1)
#         neg_scores = torch.sum(users_emb * neg_emb, dim=-1)

#         loss = -torch.mean(F.logsigmoid(pos_scores - neg_scores))

#         # Regularization term
#         if self.reg > 0:
#             reg_term = (1/2)*(users_emb.norm(2).pow(2) + 
#                               pos_emb.norm(2).pow(2) + 
#                               neg_emb.norm(2).pow(2))/float(len(user))
#             loss += self.reg * reg_term

#         return loss

#     def compute_embeddings(self):
#         users_emb = self.user_embedding.weight
#         items_emb = self.item_embedding.weight
#         all_emb = torch.cat([users_emb, items_emb])

#         embs = [all_emb]
#         for _ in range(self.n_layers):
#             all_emb = torch.sparse.mm(self.adj_mtx, all_emb)
#             embs.append(all_emb)

#         embs = torch.stack(embs, dim=1)
#         light_out = torch.mean(embs, dim=1)
#         return light_out
#         # # apply drop-out mask
#         # graph = self._droupout_sparse(self.graph) if self.node_dropout > 0 else self.graph
#         # ego_embeddings = torch.cat([self.weight_dict['user_embedding'], self.weight_dict['item_embedding']], 0)
#         # final_embeddings = [ego_embeddings]

#         # for k in range(self.n_layers):
#         #     ego_embeddings = torch.sparse.mm(graph, final_embeddings[k])
#         #     final_embeddings.append(ego_embeddings)                                       

#         # final_embeddings = torch.stack(final_embeddings, dim=1)
#         # final_embeddings = torch.mean(final_embeddings, dim=1)
        
#         # u_final_embeddings, i_final_embeddings = final_embeddings.split([self.n_users, self.n_items], 0)

#         # self.u_final_embeddings = nn.Parameter(u_final_embeddings)
#         # self.i_final_embeddings = nn.Parameter(i_final_embeddings)
        
#         # # loss 계산
#         # u_emb = u_final_embeddings[u] # user embeddings
#         # p_emb = i_final_embeddings[i] # positive item embeddings
#         # n_emb = i_final_embeddings[j] # negative item embeddings
        
#         # y_ui = torch.sum(torch.mul(u_emb, p_emb), dim = 1)                        
#         # y_uj = torch.sum(torch.mul(u_emb, n_emb), dim = 1)
        
#         # log_prob = torch.mean(torch.log(torch.sigmoid(y_ui - y_uj))) 
#         # bpr_loss = -log_prob        
#         # if self.reg > 0.:
#         #     l2norm = (torch.sum(u_emb**2)/2. + torch.sum(p_emb**2)/2. + torch.sum(n_emb**2)/2.) / u_emb.shape[0]
#         #     l2reg = self.reg * l2norm
#         #     bpr_loss += l2reg

#         # return bpr_loss


In [4]:
def train(model, make_graph_data_set, optimizer, n_batch, device):
    model.train()
    loss_val = 0
    for _ in tqdm(range(n_batch), desc="Training..."):
        user, pos, neg = make_graph_data_set.sampling()
        user = torch.LongTensor(user).to(device)
        pos = torch.LongTensor(pos).to(device)
        neg = torch.LongTensor(neg).to(device)
        optimizer.zero_grad()
        loss = model(user, pos, neg)
        loss.backward()
        optimizer.step()
        loss_val += loss.item()
    return loss_val / n_batch

def split_matrix(X, n_splits=10):
    splits = []
    chunk_size = X.shape[0] // n_splits
    for i in range(n_splits):
        start = i * chunk_size
        end = X.shape[0] if i == n_splits - 1 else (i + 1) * chunk_size
        splits.append(X[start:end])
    return splits


# def compute_ndcg_k(pred_items, test_items, k):
#     # DCG 계산
#     # print(f"test_items : {test_items.shape}")
#     topk_preds = pred_items[:, :k]
#     # print(f"topk_preds shape : {topk_preds.shape}")
#     tp = (test_items * topk_preds).sum(dim=1)
#     log_pos = torch.log2(torch.arange(2, k + 2, device=pred_items.device).float())
#     dcg = (tp / log_pos).sum(dim=1)
#     # IDCG 계산
#     ideal_tp = torch.sort(test_items, dim=1, descending=True)[0][:, :k]
#     idcg = (ideal_tp / log_pos).sum(dim=1)
#     idcg[idcg == 0] = 1  # 0으로 나누는 것을 방지
#     ndcg = dcg / idcg
#     return ndcg.mean().item()


In [5]:
import torch
from sklearn.metrics import roc_auc_score
import numpy as np

def compute_metrics(pred_items, test_items, k, i_embeddings):
    """
    예측된 아이템 점수와 실제 테스트 아이템 간의 Recall, Precision, F1 점수를 계산합니다.

    :param pred_items: 예측된 아이템 점수 (사용자 수 x 아이템 수)
    :param test_items: 실제 테스트 아이템 (사용자 수 x 아이템 수), 1과 0으로 이루어진 행렬
    :param k: 상위 k 개의 아이템을 고려
    :return: recall@k, precision@k, F1@k, ndcg
    : 추가 return : diversity 계산 결과
    """
    _, topk_indices = torch.topk(pred_items, k=k, dim=1)
    topk_preds = torch.zeros_like(pred_items).float()
    topk_preds.scatter_(1, topk_indices, 1)

    # Recall@k: (TP) / (TP + FN)
    tp = (test_items * topk_preds).sum(1)
    recall = (tp / test_items.sum(1)).mean()

    # Precision@k: (TP) / (TP + FP)
    precision = (tp / k).mean()

    # F1 Score: 2 * (precision * recall) / (precision + recall)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    f1 = torch.where(torch.isnan(f1), torch.zeros_like(f1), f1)  # NaN 값 처리
    
    # https://walwalgabu.tistory.com/entry/4-NDCG-Normalized-Discounted-Cumulative-Gain%ED%8F%89%EA%B0%80%EC%A7%80%ED%91%9C
    # NDCG
    
    # topk에 대한 실제 값 가져오기
    relevant_scores = torch.gather(test_items, 1, topk_indices)
    
    log_pos = torch.log2(torch.arange(2, k+2, device=pred_items.device).float())
    dcg = (relevant_scores / log_pos).sum(dim=1)
    
    # ideal DCG 계산
    _, sorted_indices = torch.sort(test_items, dim=1, descending=True)
    ideal_scores = torch.gather(test_items, 1, sorted_indices)[:, :k]
    idcg = (ideal_scores / log_pos).sum(dim=1)
    
    ndcg = dcg / idcg
    ndcg[torch.isnan(ndcg)] = 0  # NaN 처리
    
    # diversity 계산
    diversity = calculate_diversity(i_embeddings, topk_indices)
    
    # f1와 diversity의 조화평균으로 trade off 계산
    # https://aclanthology.org/2022.coling-1.332.pdf
    
    trade_off = 2* (diversity * f1) / (diversity + f1 + 1e-8)

    
    # log_pos = torch.log2(torch.arange(2, k+2, device=pred_items.device).float())
    # # log_pos, tp의 크기 일치를 위해 unsqueeze
    # log_pos_expanded = log_pos.unsqueeze(0).expand(pred_items.size(0), -1)

    # dcg = (tp.unsqueeze(1) / log_pos_expanded).sum(dim=1)
    # ideal = torch.sort(test_items, dim=1, descending=True)[0][:, :k]
    # idcg = (ideal / log_pos).sum(dim=1)
    # idcg[idcg==0] = 1
    # ndcg = dcg / idcg

    # https://velog.io/@rockgoat2/Recommendation-System-%EC%B6%94%EC%B2%9C-%EC%8B%9C%EC%8A%A4%ED%85%9C%EC%9D%98-%EC%97%AC%EB%9F%AC-Measure-Beyond-Accuracy
    ## intra list diversity를 확인하여 비슷한 속성으로 아이템이 구성되는 것을 지양하기 위한 확인 metric으로 활용
    
    return recall.item(), precision.item(), f1.mean().item(), torch.mean(ndcg).item(), diversity, trade_off

def calculate_diversity(item_embeddings, topk_indices) :
    """
    Args:
        item_embeddings : 아이템의 임베딩 tensor (item, embedding)
        topk_indices : 사용자 별 추천된 k개 아이템의 idx tensor (user,k)
        similarity : cosine similarity
        return : divergence 계수
    """
    n_users, k = topk_indices.size()
    similarity = 0

    for i in range(k) :
        for j in range(i+1, k) :
            item_i_emb = item_embeddings[topk_indices[:, i]]
            item_j_emb = item_embeddings[topk_indices[:, j]]
            sim = F.cosine_similarity(item_i_emb, item_j_emb, dim=1)
            similarity += sim

    avg_sim = similarity / (k * (k-1) / 2)
    diversity = 1 - avg_sim

    return diversity.mean()


def evaluate(model, Rtr, Rte, k, device):
    model.eval()
    with torch.no_grad():
        all_embeddings = model.compute_embeddings()
        u_embeddings = all_embeddings[:model.n_users]
        i_embeddings = all_embeddings[model.n_users:]

        scores = torch.matmul(u_embeddings, i_embeddings.T)

        # 실제 테스트 데이터와 비교할 수 있도록 변환
        test_items = torch.FloatTensor(Rte.toarray()).to(device)

        recall, precision, f1, ndcg, diversity, trade_off = compute_metrics(scores, test_items, k, i_embeddings)
    return recall, precision, f1, ndcg, diversity, trade_off


In [6]:
import faiss
from tqdm import tqdm

def user_cosine(u_embeddings) :
    user_embeddings_to_numpy = u_embeddings.cpu().detach().numpy()
    # 유저 임베딩 L2 normalization
    faiss.normalize_L2(user_embeddings_to_numpy)
    
    d = user_embeddings_to_numpy.shape[1]
    index = faiss.IndexFlatIP(d)
    
    index.add(user_embeddings_to_numpy)
    
    return index

def similar(index, u_embeddings, user_id, k=10) : 
    query_embedding = np.expand_dims(u_embeddings[user_id].cpu().detach().numpy(), axis=0)
    faiss.normalize_L2(query_embedding)
    
    # 유사한 유저 검색
    D, I = index.search(query_embedding, k)
    return D, I

def random_cosine(index, u_embeddings, ratio=0.1) : 
    n_users = u_embeddings.shape[0]
    sample_size = int(n_users*ratio)
    
    sampled_indeces = np.random.choice(n_users, size=sample_size, replace=False)
    
    sampled_cosine = []
    
    for user_id in sampled_indeces :
        emb = np.expand_dims(u_embeddings[user_id].cpu().detach().numpy(), axis=0)
        faiss.normalize_L2(emb)
        
        # 코사인 유사도 계산
        D, _ = index.search(emb, k=2)  # 자기 자신을 제외한 가장 가까운 유저와의 유사도
        sampled_cosine.append(D[0][1:])  # 자기 자신을 제외한 결과
    
    # 평균 코사인 유사도를 계산
    average_cosine_similarity = np.mean(sampled_cosine)
    return average_cosine_similarity

In [37]:
# import torch
# from sklearn.metrics import roc_auc_score
# import numpy as np

# def compute_metrics(pred_items, test_items, k, i_embeddings, u_embeddings):
#     """
#     예측된 아이템 점수와 실제 테스트 아이템 간의 Recall, Precision, F1 점수를 계산합니다.

#     :param pred_items: 예측된 아이템 점수 (사용자 수 x 아이템 수)
#     :param test_items: 실제 테스트 아이템 (사용자 수 x 아이템 수), 1과 0으로 이루어진 행렬
#     :param k: 상위 k 개의 아이템을 고려
#     :return: recall@k, precision@k, F1@k, ndcg
#     : 추가 return : diversity 계산 결과
#     """
#     _, topk_indices = torch.topk(pred_items, k=k, dim=1)
#     topk_preds = torch.zeros_like(pred_items).float()
#     topk_preds.scatter_(1, topk_indices, 1)

#     # Recall@k: (TP) / (TP + FN)
#     tp = (test_items * topk_preds).sum(1)
#     recall = (tp / test_items.sum(1)).mean()

#     # Precision@k: (TP) / (TP + FP)
#     precision = (tp / k).mean()

#     # F1 Score: 2 * (precision * recall) / (precision + recall)
#     f1 = 2 * precision * recall / (precision + recall + 1e-8)
#     f1 = torch.where(torch.isnan(f1), torch.zeros_like(f1), f1)  # NaN 값 처리
    
#     # https://walwalgabu.tistory.com/entry/4-NDCG-Normalized-Discounted-Cumulative-Gain%ED%8F%89%EA%B0%80%EC%A7%80%ED%91%9C
#     # NDCG
    
#     # topk에 대한 실제 값 가져오기
#     relevant_scores = torch.gather(test_items, 1, topk_indices)
    
#     log_pos = torch.log2(torch.arange(2, k+2, device=pred_items.device).float())
#     dcg = (relevant_scores / log_pos).sum(dim=1)
    
#     # ideal DCG 계산
#     _, sorted_indices = torch.sort(test_items, dim=1, descending=True)
#     ideal_scores = torch.gather(test_items, 1, sorted_indices)[:, :k]
#     idcg = (ideal_scores / log_pos).sum(dim=1)
    
#     ndcg = dcg / idcg
#     ndcg[torch.isnan(ndcg)] = 0  # NaN 처리
    
#     # diversity 계산
#     cosine, ILD = calculate_diversity(i_embeddings, topk_indices, u_embeddings)
    
#     # f1와 diversity의 조화평균으로 trade off 계산
#     # https://aclanthology.org/2022.coling-1.332.pdf
    
#     trade_off = 2* (ILD * f1) / (ILD + f1 + 1e-8)

    
#     # log_pos = torch.log2(torch.arange(2, k+2, device=pred_items.device).float())
#     # # log_pos, tp의 크기 일치를 위해 unsqueeze
#     # log_pos_expanded = log_pos.unsqueeze(0).expand(pred_items.size(0), -1)

#     # dcg = (tp.unsqueeze(1) / log_pos_expanded).sum(dim=1)
#     # ideal = torch.sort(test_items, dim=1, descending=True)[0][:, :k]
#     # idcg = (ideal / log_pos).sum(dim=1)
#     # idcg[idcg==0] = 1
#     # ndcg = dcg / idcg

#     # https://velog.io/@rockgoat2/Recommendation-System-%EC%B6%94%EC%B2%9C-%EC%8B%9C%EC%8A%A4%ED%85%9C%EC%9D%98-%EC%97%AC%EB%9F%AC-Measure-Beyond-Accuracy
#     ## intra list diversity를 확인하여 비슷한 속성으로 아이템이 구성되는 것을 지양하기 위한 확인 metric으로 활용
    
#     return recall.item(), precision.item(), f1.mean().item(), torch.mean(ndcg).item(), cosine, ILD, trade_off

# def calculate_diversity(item_embeddings, topk_indices, u_embeddings) :
#     """
#     Args:
#         item_embeddings : 아이템의 임베딩 tensor (item, embedding)
#         topk_indices : 사용자 별 추천된 k개 아이템의 idx tensor (user,k)
#         similarity : cosine similarity
#         return : 코사인 유사도 및 ILD 반환
#     """
#     n_users, k = topk_indices.size()
#     pair_count = 0
#     ild_sum = 0

#     for i in range(k) :
#         for j in range(i+1, k) :
#             item_i_emb =item_embeddings[topk_indices[:, i]]
#             item_j_emb =item_embeddings[topk_indices[:, j]]
            
#             distance = torch.norm(item_i_emb - item_j_emb, dim=1)
            
#             ild_sum += distance
#             pair_count += 1
#     ILD = ild_sum / pair_count if pair_count > 0 else 0
    
    
#     user_similarity_sum = 0
#     user_pair_count = 0
#     for i in range(n_users):
#         for j in range(i + 1, n_users):
#             user_i_emb = u_embeddings[i].unsqueeze(0)
#             user_j_emb = u_embeddings[j].unsqueeze(0)
#             sim = F.cosine_similarity(user_i_emb, user_j_emb)
#             user_similarity_sum += sim
#             user_pair_count += 1

#     user_cosine_similarity = user_similarity_sum / user_pair_count if user_pair_count > 0 else 0

#     return user_cosine_similarity.mean().item(), ILD.mean().item()



# def evaluate(model, Rtr, Rte, k, device):
#     model.eval()
#     with torch.no_grad():
#         all_embeddings = model.compute_embeddings()
#         u_embeddings = all_embeddings[:model.n_users]
#         i_embeddings = all_embeddings[model.n_users:]

#         scores = torch.matmul(u_embeddings, i_embeddings.T)

#         # 실제 테스트 데이터와 비교할 수 있도록 변환
#         test_items = torch.FloatTensor(Rte.toarray()).to(device)
        
#         recall, precision, f1, ndcg, cosine, ILD, trade_off = compute_metrics(scores, test_items, k, i_embeddings, u_embeddings)
#     return recall, precision, f1, ndcg, cosine, ILD, trade_off


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [91]:
config = {
    "emb_dim": 64,
    "n_layers": 3,
    "reg": 0.00001,
    "node_dropout": 0.1,
    "lr": 0.001,
    "num_epochs": 50,
    "n_batch": 256,  
    "model_path": "/home/siyun/ephemeral/lightgcn/models",
    "model_name": "rating4_lightgcn_model_filtered.pt",
    'device' : 'device',
    'weight_decay' : 0.0001,
}

In [9]:
# data_path = '/home/siyun/ephemeral/lightgcn/data/'
# data = pd.read_csv(data_path + 'rating_filtered_4.csv', usecols=['reviewerID', 'asin','rating','unixReviewTime'], header=0)
# data['asin'], data['reviewerID'] = data['reviewerID'], data['asin']
# data = data.rename(columns = {'asin' : 'user_id', 'reviewerID' : 'product_id'})
# data.to_csv('/home/siyun/ephemeral/lightgcn/data/rating_filtered_4_col_change.csv', index=False)

In [94]:
from torch.optim import Adam

data_path = '/home/siyun/ephemeral/lightgcn/data/rating_4_and_1_col_change.csv'  

preprocess = Preprocess(data_path=data_path, config=config)

# 인접 행렬 및 사용자, 아이템 수 등 필요한 정보 가져오기
adjacency_matrix = preprocess.adjacency_matrix
num_users = preprocess.num_users
num_items = preprocess.num_items

model = LightGCN(
    n_users=num_users,
    n_items=num_items,
    emb_dim=config['emb_dim'],  # 임베딩 차원
    n_layers=config['n_layers'],  # GCN 레이어
    reg=config['reg'],  # 정규화 
    # node_dropout=0.1,  # 드롭아웃 
    adj_mtx=adjacency_matrix,  # 인접 행렬
    device = device
)

optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay']) 


random sampling과 성능을 비교하기 위해 생성

가장 많이 선택된 아이템의 성능 비교

In [95]:
def compute_metrics_for_random(pred_items, test_items, k, device):
    """
    예측된 아이템 점수와 실제 테스트 아이템 간의 Recall, Precision, F1 점수를 계산

    :param pred_items: 예측된 아이템 점수 (사용자 수 x 아이템 수)
    :param test_items: 실제 테스트 아이템 (사용자 수 x 아이템 수), 1과 0으로 이루어진 행렬
    :param k: 상위 k 개의 아이템을 고려
    :return: recall@k, precision@k, F1@k
    """
    _, topk_indices = torch.topk(pred_items, k=k, dim=1)
    topk_preds = torch.zeros_like(pred_items).float()
    topk_preds.scatter_(1, topk_indices, 1)

    # Recall@k: (TP) / (TP + FN)
    tp = (test_items * topk_preds).sum(1)
    recall = (tp / test_items.sum(1)).mean()

    # Precision@k: (TP) / (TP + FP)
    precision = (tp / k).mean()

    # F1 Score: 2 * (precision * recall) / (precision + recall)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    f1 = torch.where(torch.isnan(f1), torch.zeros_like(f1), f1)  # NaN 값 처리
    
    # https://walwalgabu.tistory.com/entry/4-NDCG-Normalized-Discounted-Cumulative-Gain%ED%8F%89%EA%B0%80%EC%A7%80%ED%91%9C
    # NDCG
    log_pos = torch.log2(torch.arange(2, k+2, device=pred_items.device).float())
    # log_pos, tp의 크기 일치를 위해 unsqueeze
    log_pos_expanded = log_pos.unsqueeze(0).expand(pred_items.size(0), -1)

    dcg = (tp.unsqueeze(1) / log_pos_expanded).sum(dim=1)
    ideal = torch.sort(test_items, dim=1, descending=True)[0][:, :k]
    idcg = (ideal / log_pos).sum(dim=1)
    idcg[idcg==0] = 1
    ndcg = (dcg / idcg).mean()

    return recall.item(), precision.item(), f1.mean().item(), ndcg.item()

In [96]:
# def random_sampling_performance(test_items, num_items, k, device):
#     # 랜덤 점수 생성
#     random_scores = torch.rand(test_items.size(0), num_items).to(device)
#     _, topk_indices = torch.topk(random_scores, k=k, dim=1)
#     return topk_indices

def random_sampling_performance(test_items, num_items, k, device):
    # 랜덤 점수 생성
    random_scores = torch.rand(test_items.size(0), num_items, device=device)
    recall, precision, f1, ndcg = compute_metrics_for_random(random_scores, test_items, k, device)
    return recall, precision, f1, ndcg

num_items = preprocess.num_items  # 아이템의 총 수
recall, precision, f1, ndcg = random_sampling_performance(
    torch.FloatTensor(preprocess.user_item_matrix.toarray()).to(device), 
    num_items, 
    10,  # 상위 10개 아이템
    device
)
print(f"Random Sampling - Recall@10: {recall}, Precision@10: {precision}, F1@10: {f1}, ndcg : {ndcg}")


Random Sampling - Recall@10: 0.0015180769842118025, Precision@10: 0.0009567197412252426, F1@10: 0.001173727447167039, ndcg : 0.012397877871990204


In [64]:
# def random_sampling_performance_with_ids(test_items, item_encoder, num_items, k, device):
#     # 전체 아이템 목록에서 랜덤하게 k개의 아이템 인덱스를 선택
#     random_idx = torch.randint(0, num_items, (k,), device=device)
    
#     # 실제 테스트 아이템에 대해 선택된 랜덤 아이템 인덱스의 존재 여부 확인
#     selected_items_mask = torch.zeros(test_items.size(0), num_items, device=device)
#     selected_items_mask[:, random_idx] = 1  # 선택된 인덱스에 1을 할당

#     # 선택된 랜덤 아이템을 기반으로 성능 메트릭 계산
#     recall, precision, f1, ndcg = compute_metrics_for_random(selected_items_mask, test_items, k, device)

#     return recall, precision, f1, ndcg, random_idx.cpu().numpy()

# # 함수 실행 및 랜덤 아이템 선정 결과 출력
# recall, precision, f1, ndcg, random_product_idxs = random_sampling_performance_with_ids(
#     torch.FloatTensor(preprocess.user_item_matrix.toarray()).to(device), 
#     preprocess.item_encoder,  # 아이템 인코더
#     preprocess.num_items,  # 아이템의 총 수
#     10,  # 상위 10개 아이템
#     device
# )

# print(f"Random Sampling - Recall@10: {recall}, Precision@10: {precision}, F1@10: {f1}, NDCG: {ndcg}")
# # 랜덤으로 선택된 아이템 인덱스 출력
# print(f"Randomly selected item indexes: {random_product_idxs}")
# # 실제 product_id로 변환
# random_product_ids = [preprocess.item_decoder[idx] for idx in random_product_idxs]
# print(f"Randomly selected product IDs: {random_product_ids}")


Random Sampling - Recall@10: 0.0008287232485599816, Precision@10: 0.0004555808554869145, F1@10: 0.000587940972764045, NDCG: 0.006483666133135557
Randomly selected item indexes: [1897 4111 1386 1354 6396 3466 4512 2583  578 3926]
Randomly selected product IDs: ['B00N1SZXSM', 'B015K193WY', 'B00K808HAM', 'B00K2XPFEQ', 'B01H2CZ9Y0', 'B0108TSNDS', 'B017OFB8WC', 'B00SM0FDZ2', 'B00DM05DAW', 'B014I5CIOS']


In [None]:
# data_path = '/home/siyun/ephemeral/lightgcn/data/'
# data = pd.read_csv(data_path + 'rating_4_and_1.csv')
# data

In [130]:
# `product_id` 별로 선택된 횟수 계산
data_path = '/home/siyun/ephemeral/lightgcn/data/'
data = pd.read_csv(data_path + 'rating_4_and_1_col_change.csv')
  
product_counts = data['product_id'].value_counts()
# print(product_counts)

# 가장 많이 선택된 `product_id` 10개
top_10_products = product_counts.index[:10].tolist()
print(top_10_products)
product_id2idx = {id: idx for idx, id in enumerate(top_10_products)}

#기존 1
# def evaluate_top_products_performance(top_products, test_items, k, device):
#     # 인기 제품의 인덱스를 기반으로 실제 테스트 세트에 포함되어 있는지 여부를 확인
#     top_products_mask = torch.zeros_like(test_items).float()
#     hits = 0
#     total = test_items.sum().item()
    
#     for product_id in top_products:
#         product_idx = product_id2idx.get(product_id)
#         if product_idx is not None:
#             top_products_mask += (test_items == product_idx).float()
#     # top_products_mask를 사용하여 성능 평가
#     return compute_metrics_for_random(top_products_mask, test_items, k, device)

# 다시1
# def evaluate_top_products_performance(top_products, test_items, k, device, item_encoder):
#     # top_products 내 각 product_id에 대한 실제 인덱스를 찾기
#     top_product_idxs = [item_encoder[product_id] for product_id in top_products if product_id in item_encoder]
    

#     # 실제 테스트 아이템에 대해 top_product_idxs의 존재 여부를 확인
#     top_products_mask = torch.zeros(test_items.size(0), len(top_product_idxs), device=device)
    
#     for idx, product_idx in enumerate(top_product_idxs):
#         top_products_mask[:, idx] = test_items[:, product_idx]

#     # top_products_mask를 사용하여 실제 테스트 아이템에 포함된 인기 제품 수 계산
#     hits = top_products_mask.sum().item()
#     total_possible_hits = test_items.size(0) * min(k, len(top_product_idxs))
    
#     recall = hits / total_possible_hits
#     precision = hits / (len(top_product_idxs) * test_items.size(0))
#     f1 = 2 * (precision * recall) / (precision + recall + 1e-8) if (precision + recall) > 0 else 0
    
#     # NDCG 계산은 이 경우에는 적용되지 않음 (또는 별도로 계산 필요)
#     ndcg = 0

#     return recall, precision, f1, ndcg


# 다시 2

def evaluate_top_products_performance(top_products, test_items, k, device, item_encoder):
    # top_products 내 각 product_id에 대한 실제 인덱스를 찾기
    top_product_idxs = [item_encoder.get(product_id, None) for product_id in top_products]
    top_product_idxs = [idx for idx in top_product_idxs if idx is not None]

    # top_products의 인덱스에 해당하는 예측 점수를 생성
    pred_items = torch.zeros(test_items.size(0), test_items.size(1), device=device)
    for idx in top_product_idxs:
        pred_items[:, idx] = 1.0  # 인기 제품에 대한 점수를 1로 설정

    _, topk_indices = torch.topk(pred_items, k=k, dim=1)
    topk_preds = torch.zeros_like(pred_items).float()
    topk_preds.scatter_(1, topk_indices, 1)

    # Recall@k: (TP) / (TP + FN)
    tp = (test_items * topk_preds).sum(1)
    recall = (tp / test_items.sum(1)).mean()

    # Precision@k: (TP) / (TP + FP)
    precision = (tp / k).mean()


    # F1 Score: 2 * (precision * recall) / (precision + recall)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    f1 = torch.where(torch.isnan(f1), torch.zeros_like(f1), f1)  # NaN 값 처리


    # NDCG 계산
    log_pos = torch.log2(torch.arange(2, k+2, device=device).float())
    log_pos_expanded = log_pos.unsqueeze(0).expand(test_items.size(0), -1)

    relevant_scores = torch.gather(test_items, 1, topk_indices)
    dcg = (relevant_scores / log_pos).sum(dim=1)

    _, sorted_indices = torch.sort(test_items, dim=1, descending=True)
    ideal_scores = torch.gather(test_items, 1, sorted_indices)[:, :k]
    idcg = (ideal_scores / log_pos).sum(dim=1)
    print(f"idcg : {idcg}")
    idcg[idcg == 0] = 1e-8  # 0으로 나누는 것을 방지
    ndcg = (dcg / idcg).mean()

    return recall.item(), precision.item(), f1.item(), ndcg.item()

item_encoder = preprocess.item_encoder

recall_top, precision_top, f1_top, ndcg = evaluate_top_products_performance(
    top_10_products,
    torch.FloatTensor(preprocess.user_item_matrix.toarray()).to(device),
    10,  # 상위 10개 아이템
    device,
    item_encoder
)

print(f"Top 10 Products - Recall@10: {recall_top}, Precision@10: {precision_top}, F1@10: {f1_top}, ndcg : {ndcg}")

['B009MA34NY', 'B005AGO4LU', 'B010RRWKT4', 'B0092UF54A', 'B014IBJKNO', 'B0014F7B98', 'B0058YEJ5K', 'B001IKJOLW', 'B000YFSR5G', 'B000YFSR4W']
idcg : tensor([3.5616, 4.2619, 3.5616,  ..., 2.5616, 2.9485, 2.5616], device='cuda:0')
Top 10 Products - Recall@10: 0.17086172103881836, Precision@10: 0.12856492400169373, F1@10: 0.14672592282295227, ndcg : 0.15018948912620544


In [60]:
# data_path = '/home/siyun/ephemeral/LightGCN_Recommender/data/'
# data = pd.read_csv(data_path + 'filtered_data.csv')
# data['product_id'].nunique()

8587

In [97]:
model

LightGCN(
  (user_embedding): Embedding(2195, 64)
  (item_embedding): Embedding(6484, 64)
)

In [98]:
# 학습 및 평가
best_f1 = 0
for epoch in range(1, config["num_epochs"] + 1):
    train_loss = train(
        model=model,
        make_graph_data_set=preprocess,
        optimizer=optimizer,
        n_batch=config["n_batch"],
        device=device
    )

    recall, precision, f1, ndcg, ILD, trade_off = evaluate(model, 
                                       preprocess.user_item_matrix, 
                                       preprocess.user_item_matrix, 
                                       k=10, 
                                       device=device)

    u_embeddings = model.user_embedding.weight
    index = user_cosine(u_embeddings)

    cosine = random_cosine(index, u_embeddings, ratio=1.0)

    if best_f1 < f1:
        best_f1 = f1
        model_save_path = config["model_path"]
        if not os.path.exists(model_save_path):
            os.mkdir(model_save_path)

        torch.save(model.state_dict(), os.path.join(model_save_path, config["model_name"]))
        print(f"Epoch {epoch}: New best model saved with F1: {best_f1:.4f}")

    print(f"Recall@10: {recall}, Precision@10: {precision}, f1: {f1}, ndcg@10 : {ndcg}, cosine : {cosine}, ILD : {ILD},  trade_off : {trade_off}")

Training...: 100%|██████████| 256/256 [00:16<00:00, 15.42it/s]


Epoch 1: New best model saved with F1: 0.2274
Recall@10: 0.2861240804195404, Precision@10: 0.1886104792356491, f1: 0.2273523211479187, ndcg@10 : 0.26523298025131226, cosine : 0.41673028469085693, ILD : 0.5774409174919128,  trade_off : 0.3262515962123871


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.65it/s]


Epoch 2: New best model saved with F1: 0.2299
Recall@10: 0.28973251581192017, Precision@10: 0.1905239224433899, f1: 0.22988125681877136, ndcg@10 : 0.27377381920814514, cosine : 0.4173014461994171, ILD : 0.5944643616676331,  trade_off : 0.3315507769584656


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.60it/s]


Epoch 3: New best model saved with F1: 0.2478
Recall@10: 0.3162004053592682, Precision@10: 0.20369020104408264, f1: 0.24777106940746307, ndcg@10 : 0.30062663555145264, cosine : 0.4175242483615875, ILD : 0.6202173233032227,  trade_off : 0.3540874719619751


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.74it/s]


Epoch 4: New best model saved with F1: 0.2554
Recall@10: 0.32646113634109497, Precision@10: 0.20970386266708374, f1: 0.25536975264549255, ndcg@10 : 0.3049313426017761, cosine : 0.41872042417526245, ILD : 0.6004480123519897,  trade_off : 0.3583385646343231


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.67it/s]


Epoch 5: New best model saved with F1: 0.2566
Recall@10: 0.32776743173599243, Precision@10: 0.21088838577270508, f1: 0.256647527217865, ndcg@10 : 0.31593653559684753, cosine : 0.4198525547981262, ILD : 0.5818547010421753,  trade_off : 0.3561864495277405


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.65it/s]


Epoch 6: New best model saved with F1: 0.2703
Recall@10: 0.3479887843132019, Precision@10: 0.2209567129611969, f1: 0.27029111981391907, ndcg@10 : 0.3308470845222473, cosine : 0.4209963083267212, ILD : 0.6477338671684265,  trade_off : 0.3814203441143036


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.50it/s]


Recall@10: 0.34663641452789307, Precision@10: 0.22022779285907745, f1: 0.2693377733230591, ndcg@10 : 0.32992640137672424, cosine : 0.4229480028152466, ILD : 0.6130778193473816,  trade_off : 0.37425678968429565


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.60it/s]


Epoch 8: New best model saved with F1: 0.2820
Recall@10: 0.3647404909133911, Precision@10: 0.22979497909545898, f1: 0.28195300698280334, ndcg@10 : 0.359437495470047, cosine : 0.4247424900531769, ILD : 0.6207695603370667,  trade_off : 0.387777715921402


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.63it/s]


Epoch 9: New best model saved with F1: 0.3033
Recall@10: 0.3936389982700348, Precision@10: 0.24674257636070251, f1: 0.3033425807952881, ndcg@10 : 0.3920848071575165, cosine : 0.4261576235294342, ILD : 0.6326585412025452,  trade_off : 0.41006848216056824


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.64it/s]


Epoch 10: New best model saved with F1: 0.3104
Recall@10: 0.40337836742401123, Precision@10: 0.25220954418182373, f1: 0.31036531925201416, ndcg@10 : 0.40183135867118835, cosine : 0.427913099527359, ILD : 0.6360276937484741,  trade_off : 0.4171648323535919


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.59it/s]


Recall@10: 0.3930870294570923, Precision@10: 0.24601365625858307, f1: 0.30262768268585205, ndcg@10 : 0.3892669975757599, cosine : 0.4292571544647217, ILD : 0.6074292063713074,  trade_off : 0.40398550033569336


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.55it/s]


Epoch 12: New best model saved with F1: 0.3218
Recall@10: 0.41978535056114197, Precision@10: 0.26086562871932983, f1: 0.32177305221557617, ndcg@10 : 0.4191402792930603, cosine : 0.4305473268032074, ILD : 0.633538007736206,  trade_off : 0.42678341269493103


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.62it/s]


Epoch 13: New best model saved with F1: 0.3336
Recall@10: 0.4362933337688446, Precision@10: 0.2699772119522095, f1: 0.3335527777671814, ndcg@10 : 0.43093380331993103, cosine : 0.4324464201927185, ILD : 0.6446740627288818,  trade_off : 0.4396379590034485


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.61it/s]


Epoch 14: New best model saved with F1: 0.3343
Recall@10: 0.43737825751304626, Precision@10: 0.27052393555641174, f1: 0.3342871069908142, ndcg@10 : 0.4292082190513611, cosine : 0.43402713537216187, ILD : 0.6345565915107727,  trade_off : 0.43789127469062805


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.43it/s]


Recall@10: 0.39460188150405884, Precision@10: 0.24555808305740356, f1: 0.30272960662841797, ndcg@10 : 0.3802398443222046, cosine : 0.4369826316833496, ILD : 0.5364323258399963,  trade_off : 0.38703837990760803


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.53it/s]


Recall@10: 0.273507684469223, Precision@10: 0.18118451535701752, f1: 0.21797321736812592, ndcg@10 : 0.2403700053691864, cosine : 0.4458226263523102, ILD : 0.28379446268081665,  trade_off : 0.24656666815280914


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.58it/s]


Recall@10: 0.3018576204776764, Precision@10: 0.19662870466709137, f1: 0.2381364107131958, ndcg@10 : 0.28074806928634644, cosine : 0.4485854208469391, ILD : 0.3398359417915344,  trade_off : 0.2800386846065521


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.20it/s]


Recall@10: 0.30140420794487, Precision@10: 0.19630980491638184, f1: 0.23776143789291382, ndcg@10 : 0.2903873324394226, cosine : 0.4515240788459778, ILD : 0.41046568751335144,  trade_off : 0.30110716819763184


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.59it/s]


Recall@10: 0.3283766806125641, Precision@10: 0.21088837087154388, f1: 0.25683408975601196, ndcg@10 : 0.3264370858669281, cosine : 0.4521487057209015, ILD : 0.44168758392333984,  trade_off : 0.3248014748096466


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.61it/s]


Recall@10: 0.3532783091068268, Precision@10: 0.2236446589231491, f1: 0.273897260427475, ndcg@10 : 0.35273975133895874, cosine : 0.4536949694156647, ILD : 0.4633307456970215,  trade_off : 0.34427616000175476


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.55it/s]


Recall@10: 0.36917516589164734, Precision@10: 0.2321184277534485, f1: 0.2850266993045807, ndcg@10 : 0.3693165183067322, cosine : 0.45511600375175476, ILD : 0.4821133315563202,  trade_off : 0.35825315117836


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.56it/s]


Recall@10: 0.36277472972869873, Precision@10: 0.22856491804122925, f1: 0.2804397642612457, ndcg@10 : 0.35898154973983765, cosine : 0.4566560685634613, ILD : 0.47596275806427,  trade_off : 0.35293084383010864


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.59it/s]


Recall@10: 0.3803793489933014, Precision@10: 0.23817768692970276, f1: 0.2929329574108124, ndcg@10 : 0.382392942905426, cosine : 0.4575607478618622, ILD : 0.4975361227989197,  trade_off : 0.368755042552948


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.63it/s]


Recall@10: 0.39539268612861633, Precision@10: 0.24592256546020508, f1: 0.30323928594589233, ndcg@10 : 0.3936353325843811, cosine : 0.4587456285953522, ILD : 0.5202628970146179,  trade_off : 0.3831541836261749


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.62it/s]


Recall@10: 0.35435330867767334, Precision@10: 0.22387242317199707, f1: 0.2743908762931824, ndcg@10 : 0.35336509346961975, cosine : 0.45939111709594727, ILD : 0.4791412651538849,  trade_off : 0.34894856810569763


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.41it/s]


Recall@10: 0.4121735692024231, Precision@10: 0.2552163898944855, f1: 0.3152383267879486, ndcg@10 : 0.4078519642353058, cosine : 0.46026578545570374, ILD : 0.535204291343689,  trade_off : 0.3967743217945099


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.22it/s]


Recall@10: 0.4233155846595764, Precision@10: 0.26173120737075806, f1: 0.3234666585922241, ndcg@10 : 0.40719443559646606, cosine : 0.46184226870536804, ILD : 0.5463249683380127,  trade_off : 0.406345397233963


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.63it/s]


Epoch 28: New best model saved with F1: 0.3372
Recall@10: 0.44292452931404114, Precision@10: 0.2722095549106598, f1: 0.337190717458725, ndcg@10 : 0.4298350214958191, cosine : 0.4622642993927002, ILD : 0.589012861251831,  trade_off : 0.42886826395988464


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.56it/s]


Epoch 29: New best model saved with F1: 0.3437
Recall@10: 0.4523605704307556, Precision@10: 0.2770842909812927, f1: 0.34366410970687866, ndcg@10 : 0.4473329484462738, cosine : 0.462776243686676, ILD : 0.5667935013771057,  trade_off : 0.4278872013092041


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.48it/s]


Recall@10: 0.4006355106830597, Precision@10: 0.25070616602897644, f1: 0.3084150552749634, ndcg@10 : 0.38684672117233276, cosine : 0.46356815099716187, ILD : 0.4845629632472992,  trade_off : 0.3769247233867645


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.56it/s]


Recall@10: 0.404047429561615, Precision@10: 0.25120729207992554, f1: 0.3098021447658539, ndcg@10 : 0.3975365459918976, cosine : 0.465616911649704, ILD : 0.5030444264411926,  trade_off : 0.38345304131507874


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.21it/s]


Recall@10: 0.441998690366745, Precision@10: 0.2722095549106598, f1: 0.3369220793247223, ndcg@10 : 0.4379187524318695, cosine : 0.46609821915626526, ILD : 0.5317092537879944,  trade_off : 0.4124755263328552


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.51it/s]


Recall@10: 0.40099504590034485, Precision@10: 0.24974943697452545, f1: 0.30779603123664856, ndcg@10 : 0.38928404450416565, cosine : 0.46804147958755493, ILD : 0.4253443479537964,  trade_off : 0.35714665055274963


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.48it/s]


Recall@10: 0.3845730721950531, Precision@10: 0.24159452319145203, f1: 0.29676002264022827, ndcg@10 : 0.3744712769985199, cosine : 0.4730665683746338, ILD : 0.4437335431575775,  trade_off : 0.3556611239910126


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.52it/s]


Recall@10: 0.41282525658607483, Precision@10: 0.25708431005477905, f1: 0.316851407289505, ndcg@10 : 0.4146716892719269, cosine : 0.47370660305023193, ILD : 0.47648724913597107,  trade_off : 0.38060835003852844


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.52it/s]


Recall@10: 0.41175755858421326, Precision@10: 0.2555352747440338, f1: 0.31535953283309937, ndcg@10 : 0.3935696482658386, cosine : 0.47421202063560486, ILD : 0.49099254608154297,  trade_off : 0.38404855132102966


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.47it/s]


Recall@10: 0.41174042224884033, Precision@10: 0.25585421919822693, f1: 0.315597265958786, ndcg@10 : 0.39576655626296997, cosine : 0.4742485582828522, ILD : 0.5125572681427002,  trade_off : 0.3906557559967041


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.37it/s]


Recall@10: 0.42848148941993713, Precision@10: 0.2646469175815582, f1: 0.3272014260292053, ndcg@10 : 0.4155873954296112, cosine : 0.47482430934906006, ILD : 0.5203746557235718,  trade_off : 0.40177473425865173


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.62it/s]


Recall@10: 0.4344456195831299, Precision@10: 0.2678815424442291, f1: 0.3314124047756195, ndcg@10 : 0.4277324080467224, cosine : 0.47610440850257874, ILD : 0.5356171727180481,  trade_off : 0.4094673991203308


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.54it/s]


Recall@10: 0.4378160536289215, Precision@10: 0.27020499110221863, f1: 0.3341710865497589, ndcg@10 : 0.42996934056282043, cosine : 0.47663095593452454, ILD : 0.5444677472114563,  trade_off : 0.4141528308391571


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.62it/s]


Recall@10: 0.4397262930870056, Precision@10: 0.2708428204059601, f1: 0.3352149724960327, ndcg@10 : 0.43668943643569946, cosine : 0.47727689146995544, ILD : 0.5592222809791565,  trade_off : 0.4191678762435913


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.52it/s]


Recall@10: 0.447967529296875, Precision@10: 0.2750341594219208, f1: 0.34081903100013733, ndcg@10 : 0.43863967061042786, cosine : 0.47869324684143066, ILD : 0.5485888719558716,  trade_off : 0.42043596506118774


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.33it/s]


Epoch 43: New best model saved with F1: 0.3452
Recall@10: 0.45362722873687744, Precision@10: 0.278542160987854, f1: 0.3451504707336426, ndcg@10 : 0.4490987956523895, cosine : 0.48042798042297363, ILD : 0.560064971446991,  trade_off : 0.4270954132080078


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.63it/s]


Recall@10: 0.44372519850730896, Precision@10: 0.27230069041252136, f1: 0.3374924659729004, ndcg@10 : 0.44041189551353455, cosine : 0.48074430227279663, ILD : 0.5589113831520081,  trade_off : 0.4208557903766632


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.50it/s]


Epoch 45: New best model saved with F1: 0.3561
Recall@10: 0.46868792176246643, Precision@10: 0.28715264797210693, f1: 0.3561200201511383, ndcg@10 : 0.4666652977466583, cosine : 0.482117235660553, ILD : 0.5636096596717834,  trade_off : 0.43646013736724854


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.45it/s]


Epoch 46: New best model saved with F1: 0.3766
Recall@10: 0.49627357721328735, Precision@10: 0.3034623861312866, f1: 0.3766252100467682, ndcg@10 : 0.4930812418460846, cosine : 0.4832591712474823, ILD : 0.5726152062416077,  trade_off : 0.4543871283531189


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.61it/s]


Recall@10: 0.49590644240379333, Precision@10: 0.3028701841831207, f1: 0.3760632574558258, ndcg@10 : 0.4983411729335785, cosine : 0.484240859746933, ILD : 0.5635664463043213,  trade_off : 0.4511067271232605


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.70it/s]


Recall@10: 0.48035746812820435, Precision@10: 0.29348519444465637, f1: 0.36435776948928833, ndcg@10 : 0.4728507697582245, cosine : 0.48537981510162354, ILD : 0.5521425008773804,  trade_off : 0.43901219964027405


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.56it/s]


Recall@10: 0.4803867042064667, Precision@10: 0.2929840683937073, f1: 0.3639797866344452, ndcg@10 : 0.473136842250824, cosine : 0.4864576458930969, ILD : 0.5603971481323242,  trade_off : 0.4413204491138458


Training...: 100%|██████████| 256/256 [00:16<00:00, 15.79it/s]


Recall@10: 0.390554815530777, Precision@10: 0.2450113743543625, f1: 0.30111852288246155, ndcg@10 : 0.3675791323184967, cosine : 0.4892670512199402, ILD : 0.45717567205429077,  trade_off : 0.36308878660202026


# inference관련
- 0316 해야할 일
- 추천받은 아이템이 실제 유저가 산 히스토리의 아이템과 유사한지 확인(완전 같지는 않아도 괜찮음)
    - ex) 가방 -> 가방, 신발 -> 신발
- 어느정도 나오는 것은 확인함

## 추천과정
- lightgcn -> 유저와 아이템간의 히스토리를 바탕으로 유저의 행동패턴에 대한 학습
- 새로운 유저 -> 기존의 행동패턴이 비슷한 유저들을 반환
- 그 유저들이 산 아이템을 하나로 합친 후 k개 반환
    - 이렇게 한 이유 : Lightgcn의 학습 방식은 item을 반환하는 것이 아닌 유저와 아이템간의 상호작용에 대한 확률을 계산
    - 결과적으로는 유저가 반환됨.
    - 그렇기 때문에 비슷한 행동패턴을 보이는 유저는 비슷한 아이템을 샀을 것이라 가정 -> 그 유저들이 산 아이템들을 추천하는 방식

In [41]:
# https://github.com/tm1897/mlg_cs224w_project/tree/main

def load_model(model_path, model, device):
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    return model

def encode_session_items(preprocess, session_items):
    # 새로운 사용자 세션의 아이템을 인코딩
    encoded_items = [preprocess.item_encoder[item] for item in session_items if item in preprocess.item_encoder]
    return encoded_items

def infer_embeddings(model, encoded_session_items, num_users, num_items, device):
    # 모든 사용자와 아이템의 임베딩을 계산
    with torch.no_grad():
        all_embeddings = model.compute_embeddings()
        user_embeddings = all_embeddings[:num_users]
        item_embeddings = all_embeddings[num_users:]

        # 새로운 사용자 세션의 아이템 임베딩을 평균내어 사용자 임베딩을 대체
        if encoded_session_items:
            print(f"if encoded_session_items : 1")
            session_item_embeddings = item_embeddings[encoded_session_items].mean(dim=0).unsqueeze(0)
        else:
            # 세션에 아이템이 없는 경우 무작위 사용자 임베딩 사용
            print(f"if not encoded_session_items : 2")
            session_item_embeddings = user_embeddings[torch.randint(0, num_users, (1,))]
    return session_item_embeddings, item_embeddings, user_embeddings

def recommend_item(session_user_embedding, item_embeddings, top_k):
    # 사용자 임베딩과 모든 아이템 임베딩 간의 유사도를 계산
    scores = torch.matmul(session_user_embedding, item_embeddings.T)
    top_scores, top_indices = torch.topk(scores, k=top_k, dim=1)
    return top_indices.squeeze().tolist()





def recommend_users(session_item_embeddings, user_embeddings, top_k):
    # 세션 사용자 임베딩과 모든 사용자 임베딩 간의 유사도를 계산
    scores = torch.matmul(session_item_embeddings, user_embeddings.T)
    top_scores, top_indices = torch.topk(scores, k=top_k, dim=1)
    return top_indices.squeeze().tolist()



def random_item(recommended_user_ids, data, k) : 
    # 유사한 사용자의 아이템 추천 받기
    total_items = []
    for user_id in recommended_user_ids:
        # 여기서는 'product_id' 대신 실제 data에서 사용하는 상호작용 아이템 컬럼명을 사용하세요.
        user_items = data[data['user_id'] == user_id]['product_id'].tolist()
        total_items.extend(user_items)


    # 고유 아이템 선택
    unique_items = list(set(total_items))

    recommended_items = random.sample(unique_items, k) if len(unique_items) >= k else unique_items
    return recommended_items[:10]

In [10]:
data_path = '/home/siyun/ephemeral/LightGCN_Recommender/data/'
data = pd.read_csv(data_path + 'filtered_data.csv')


In [25]:
# user_item_counts = data.groupby('user_id')['item_id'].count()
# # item_id 개수가 가장 많은 user_id 찾기
# most_frequent_user_id = user_item_counts.idxmax()
# print("가장 많은 item_id를 구매한 user_id:", most_frequent_user_id)
# list(data[data['user_id'] == 'A3G5KDMFNRUXHB'].item_id)

가장 많은 item_id를 구매한 user_id: A3G5KDMFNRUXHB


In [11]:
# 모델과 데이터 로드
model_path = config["model_path"] + '/' + config["model_name"]
model = LightGCN(num_users, num_items, config['emb_dim'], config['n_layers'], config['reg'], adjacency_matrix, device)
model = load_model(model_path, model, device)


In [88]:
# data[data['user_id'] == 'A0986263H7SX62P1SRDD']['product_id'].tolist()

['B00JO5MTXI',
 'B00LD84PGS',
 'B00OR6KCAQ',
 'B00S60V1RW',
 'B00W6TY12Q',
 'B01CKX5Y4Q']

아이템 추천의 경우

In [44]:
# 새로운 유저 입력
session_items = ['B00JO5MTXI',
 'B00LD84PGS',
 'B00OR6KCAQ',
 'B00S60V1RW',
 'B00W6TY12Q',
 'B01CKX5Y4Q']

# 새로운 유저의 입력을 받아서 인코딩
encoded_session_items = encode_session_items(preprocess, session_items)
print(f"encoded_session_items: {encoded_session_items}")

# 사용자와 아이템의 임베딩 계산
session_user_embedding, item_embeddings, _ = infer_embeddings(model, encoded_session_items, num_users, num_items, device)

# 유사한 아이템 행동패턴을 보이는 유저 반환
top_k = 10  # 행동 확률이 높은 K명의 유저 반환
recommended_user_indices = recommend_item(session_user_embedding, item_embeddings, top_k)


# 모델의 구조 상 유사한 유저의 아이템 목록이 아닌 유사한 행동패턴을 가진 유저가 반환
recommended_user = [preprocess.item_decoder[idx] for idx in recommended_user_indices if idx in preprocess.item_decoder]
print(f"recommended_user : {recommended_user}")


encoded_session_items: [0, 1, 2, 3, 4, 5]
if encoded_session_items : 1
recommended_user : ['B000YFSR5G', 'B000YFSR4W', 'B00KA3VO3O', 'B00S99PCSE', 'B00KW4LCCE', 'B00UDF11O6', 'B00KA3VEG6', 'B00LMU7GHM', 'B00KA3WMJY', 'B000K2PJ4K']


유사한 유저 추천

In [42]:
unique_user_ids = data['user_id'].unique()
id2idx = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
idx2id = {idx: user_id for user_id, idx in id2idx.items()}



session_items = ['B00JO5MTXI',
 'B00LD84PGS',
 'B00OR6KCAQ',
 'B00S60V1RW',
 'B00W6TY12Q',
 'B01CKX5Y4Q']



# 새로운 유저의 입력을 받아서 인코딩
encoded_session_items = encode_session_items(preprocess, session_items)
# 사용자와 아이템의 임베딩 계산
session_user_embedding, item_embeddings, user_embeddings = infer_embeddings(model, encoded_session_items, num_users, num_items, device)

# 유사한 사용자 찾기
top_k_users = 15  # 행동 패턴이 유사한 상위 K명의 사용자
# id2idx가 반환됨
recommended_user_indices = recommend_users(session_user_embedding, user_embeddings, top_k_users)
# idx2id
recommended_user_ids = [idx2id[idx] for idx in recommended_user_indices]

random_item(recommended_user_ids,data,top_k_users)






# # 유사한 사용자의 아이템 추천 받기
# total_items = []
# for user_id in recommended_user_ids:
#     # 여기서는 'product_id' 대신 실제 data에서 사용하는 상호작용 아이템 컬럼명을 사용하세요.
#     user_items = data[data['user_id'] == user_id]['product_id'].tolist()
#     total_items.extend(user_items)


# # 고유 아이템 선택
# unique_items = list(set(total_items))

# recommended_items = random.sample(unique_items, top_k_users) if len(unique_items) >= top_k_users else unique_items



if encoded_session_items : 1


['B005GPKRV6',
 'B00OAZOMMI',
 'B00IZ61MU8',
 'B00KTEO274',
 'B01E5GNWMW',
 'B015GXCM5G',
 'B00FYU9THG',
 'B00P1XXR6A',
 'B000YFSR5G',
 'B00MSC158E']

In [37]:
recommended_items[:10]

['B005GPKRV6',
 'B00OAZOMMI',
 'B00IZ61MU8',
 'B00KTEO274',
 'B01E5GNWMW',
 'B015GXCM5G',
 'B00FYU9THG',
 'B00P1XXR6A',
 'B000YFSR5G',
 'B00MSC158E']

In [None]:
['B00JO5MTXI',
 'B00LD84PGS',
 'B00OR6KCAQ',
 'B00S60V1RW',
 'B00W6TY12Q',
 'B01CKX5Y4Q']

In [99]:
data[data['product_id'] =='B000K2PJ4K']['image_url'].tolist()

['https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/51yCQvuWSnL.jpg',
 'https://images-na.ssl-images-ama