In [None]:
import pandas as pd
import numpy as np
import torch

In [None]:
# InvoiceNo: 주문 번호
# StockCode: 상품 코드
# Description: 상품 설명
# Quantity: 상품 수량
# InvoiceDate: 주문 날짜
# UnitPrice: 상품 단가
# CustomerID: 고객 ID
# Country: 국가

In [None]:
# Load the online retail transactions and drop every row that has a missing
# value in any column. Loading and cleaning are chained so that re-running
# this cell always rebuilds `df` from the file.
df = pd.read_csv("data/online_retail.csv").dropna()
df.shape

In [None]:
# Encode each distinct StockCode as an integer category code.
df['StockCodeNo'] = df['StockCode'].astype("category").cat.codes

# Build the (item, customer, quantity) interaction frame in a single chain:
# select the columns, drop rows containing NaN, then cast both ID columns to
# integers. `dropna()` / `astype()` return new objects, so `new_df` is an
# independent frame rather than a slice of `df` and no
# SettingWithCopyWarning is raised.
new_df = (
    df[['StockCodeNo', 'CustomerID', 'Quantity']]
    .dropna()
    .astype({'CustomerID': int, 'StockCodeNo': int})
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned, integer-typed interactions for validation.
# - `new_df` (not `df`) is split so the integer casts done above are used.
# - `random_state` pins the shuffle so a fresh kernel reproduces the same split.
# - Each part is copied so later cells can add columns without pandas
#   flagging a write to a slice of another frame.
train_df, val_df = (
    part.copy()
    for part in train_test_split(new_df, test_size=0.2, random_state=42)
)

In [None]:
# Preview the first five rows of the training split.
train_df.head(n=5)

In [None]:
import torch
from torch import nn

class MF(nn.Module):
    """Matrix-factorization scorer: dot product of a user and an item embedding.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_factors: Length of each embedding vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.item_factors = nn.Embedding(n_items, n_factors)

    def forward(self, user, item):
        """Score each (user, item) index pair.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, broadcast-compatible with
                `user`.

        Returns:
            Tensor of dot products with the factor axis reduced away, i.e.
            one score per index pair.
        """
        user_embedding = self.user_factors(user)
        item_embedding = self.item_factors(item)
        # Reduce over the trailing factor axis instead of a hard-coded axis 1
        # so that scalar (0-D) and multi-dimensional index tensors work too;
        # for the usual 1-D batch the result is unchanged.
        return (user_embedding * item_embedding).sum(dim=-1)


In [None]:
# Map raw customer IDs and product codes to contiguous embedding indices.
# Only the training split defines the vocabulary.
user_to_idx = {user: i for i, user in enumerate(train_df['CustomerID'].unique())}
item_to_idx = {item: i for i, item in enumerate(train_df['StockCodeNo'].unique())}


def _encode(ids, mapping):
    """Look up each value of `ids` in `mapping`; values not in it become -1."""
    return ids.map(mapping).fillna(-1).astype('int64')


def _make_dataset(frame):
    """Build a (user, item, quantity) TensorDataset from the rows of `frame`."""
    # Convert through NumPy arrays: handing a Series with a shuffled index
    # directly to torch goes through label-based `series[0]` lookups.
    return torch.utils.data.TensorDataset(
        torch.tensor(frame['user'].to_numpy(), dtype=torch.long),
        torch.tensor(frame['item'].to_numpy(), dtype=torch.long),
        torch.tensor(frame['Quantity'].to_numpy(), dtype=torch.float32),
    )


# Add the mapped indices to both splits. `assign` returns new frames, so no
# write ever targets a slice of another DataFrame.
train_df = train_df.assign(
    user=_encode(train_df['CustomerID'], user_to_idx),
    item=_encode(train_df['StockCodeNo'], item_to_idx),
)
val_df = val_df.assign(
    user=_encode(val_df['CustomerID'], user_to_idx),
    item=_encode(val_df['StockCodeNo'], item_to_idx),
)

# Keep only validation rows whose user AND item were both seen in training.
# One shared mask keeps the three tensors aligned and guarantees that no -1
# index reaches the embedding tables.
val_known = val_df[(val_df['user'] != -1) & (val_df['item'] != -1)]

# Create the PyTorch datasets and loaders.
train_data = _make_dataset(train_df)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=256, shuffle=True)
val_data = _make_dataset(val_known)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=256, shuffle=False)


In [None]:
model = MF(len(user_to_idx), len(item_to_idx), n_factors=20)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

# Number of samples that are actually evaluated. This is the size of the
# validation dataset, not len(val_df): rows with unseen users/items are
# filtered out before the dataset is built, so dividing by len(val_df) would
# understate the error.
n_val = len(val_loader.dataset)

for epoch in range(10):
    # One optimisation pass over the training batches.
    model.train()
    for user, item, rating in train_loader:
        optimizer.zero_grad()
        output = model(user, item)
        loss = criterion(output, rating)
        loss.backward()
        optimizer.step()

    # Validation: accumulate the summed squared error, then average.
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        for user, item, rating in val_loader:
            output = model(user, item)
            # criterion returns the batch mean; multiply by the batch size to
            # get a sum so that a smaller final batch is weighted correctly.
            val_loss += criterion(output, rating).item() * len(user)
        # Guard against an empty validation set instead of dividing by zero.
        val_loss = val_loss / n_val if n_val else float('nan')
        print('epoch: {}, validation RMSE loss: {:.4f}'.format(epoch+1, val_loss**0.5))


In [None]:
# 각 제품의 인덱스를 제품 코드로 매핑
idx_to_item = {i: item for item, i in item_to_idx.items()}

# 특정 사용자에게 추천할 상위 10개의 제품 출력
user_idx = 0
user_items = set(train_df[train_df['user'] == user_idx]['item'])
scores = model(torch.LongTensor([user_idx]*len(item_to_idx)), torch.LongTensor(list(item_to_idx.values()))).detach().numpy()
item_indices = list(range(len(item_to_idx)))
item_scores = list(zip(item_indices, scores))
item_scores = sorted(item_scores, key=lambda x: x[1], reverse=True)
recommended_items = []
for item_idx, score in item_scores:
    item_code = idx_to_item[item_idx]
    if item_code not in user_items:
        recommended_items.append(item_code)
    if len(recommended_items) >= 10:
        break
print('Recommended items for user {}:'.format(user_idx))
print(recommended_items)
