In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [23]:
# online_retail.csv 파일 불러오기
df = pd.read_csv(f"data/online_retail.csv")
df = df[df["CustomerID"].notnull()]
df.shape

(406829, 8)

In [26]:
# 상품 코드와 고객 ID로 이루어진 새로운 데이터프레임 생성
new_df = df[['StockCode', 'CustomerID', 'Quantity']]

# NaN 값을 가지는 행 제거
new_df = new_df.dropna()

# 고객 ID와 상품 코드를 정수형으로 변환
new_df['CustomerNo'] = new_df['CustomerID'].astype("category").cat.codes.astype(int)
new_df['StockCodeNo'] = new_df['StockCode'].astype("category").cat.codes.astype(int)
new_df.isnull().sum()

StockCode      0
CustomerID     0
Quantity       0
CustomerNo     0
StockCodeNo    0
dtype: int64

In [27]:
df_ratings = new_df.groupby(["CustomerNo", "StockCodeNo"])["Quantity"].sum().reset_index()
df_ratings.columns = ["user_id", "item_id", "rating"]
df_ratings.head()

Unnamed: 0,user_id,item_id,rating
0,0,2001,0
1,1,25,24
2,1,87,36
3,1,130,6
4,1,167,40


In [28]:
train_ratings, test_ratings = train_test_split(df_ratings, test_size=0.1, random_state=42)
train_ratings, val_ratings = train_test_split(train_ratings, test_size=0.1, random_state=42)

In [29]:
class RecDataset(Dataset):
    def __init__(self, ratings):
        self.user_ids = torch.LongTensor(df_ratings['user_id'].values)
        self.item_ids = torch.LongTensor(df_ratings['item_id'].values)
        self.ratings = torch.FloatTensor(df_ratings['rating'].values)
        
    def __len__(self):
        return len(self.user_ids)
    
    def __getitem__(self, idx):
        return self.user_ids[idx], self.item_ids[idx], self.ratings[idx]
        
train_dataset = RecDataset(train_ratings)
val_dataset = RecDataset(val_ratings)
test_dataset = RecDataset(test_ratings)

In [30]:
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [31]:
class RecommenderNet(nn.Module):
    def __init__(self, num_users, num_items, emb_size=64):
        super().__init__()
        # print("num_users", num_users, "num_items", num_items)
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.fc1 = nn.Linear(emb_size*2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()
        
    def forward(self, user_ids, item_ids):
        user_emb = self.user_emb(user_ids)
        item_emb = self.item_emb(item_ids)
        x = torch.cat([user_emb, item_emb], dim=-1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.relu(x)
        return x.squeeze()

In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RecommenderNet(num_users=df_ratings['user_id'].nunique(), num_items=df_ratings['item_id'].nunique())
model.to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [33]:
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0.
    for user_ids, item_ids, ratings in dataloader:
        user_ids, item_ids, ratings = user_ids.view(-1).to(device), item_ids.view(-1).to(device), ratings.to(device)
        optimizer.zero_grad()
        outputs = model(user_ids, item_ids)
        loss = criterion(outputs, ratings)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        for user_ids, item_ids, ratings in dataloader:
            user_ids, item_ids, ratings = user_ids.view(-1).to(device), item_ids.view(-1).to(device), ratings.to(device)
            outputs = model(user_ids, item_ids)
            loss = criterion(outputs, ratings)
            total_loss += loss.item()
    return total_loss / len(dataloader)


In [34]:
n_epochs = 10
for epoch in range(n_epochs):
    train_loss = train(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model, val_loader, criterion, device)
    print(f'epoch {epoch+1}, train_loss: {train_loss:.4f}, val_loss: {val_loss:.4f}')

test_loss = evaluate(model, test_loader, criterion, device)
print(f'test_loss: {test_loss:.4f}')

epoch 1, train_loss: 8232.3744, val_loss: 7076.8229
epoch 2, train_loss: 7062.9830, val_loss: 5618.3064
epoch 3, train_loss: 6887.2501, val_loss: 5437.2059
epoch 4, train_loss: 6047.6210, val_loss: 4765.5956
epoch 5, train_loss: 5749.1470, val_loss: 4447.1925
epoch 6, train_loss: 5387.7942, val_loss: 3929.3161
epoch 7, train_loss: 4993.9927, val_loss: 4114.3146
epoch 8, train_loss: 4548.3926, val_loss: 3881.2343
epoch 9, train_loss: 4273.7493, val_loss: 3390.0099
epoch 10, train_loss: 4908.2015, val_loss: 3492.9389
test_loss: 3492.9389
