<a href="https://colab.research.google.com/github/cshooon/MovieRecommendation/blob/main/BiasMF2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the CSV paths used
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split
import random
from torch.utils.data import DataLoader, BatchSampler, SequentialSampler

In [None]:
class MovieLens(Dataset):
    """Map-style dataset over three parallel tensors.

    Position ``i`` of ``user_tensor``, ``item_tensor`` and ``target_tensor``
    together describes one sample; the tensors are stored as given, without
    copying or validation.
    """

    def __init__(self, user_tensor, item_tensor, target_tensor):
        self.user_tensor, self.item_tensor, self.target_tensor = (
            user_tensor,
            item_tensor,
            target_tensor,
        )

    def __len__(self):
        # The first dimension of the user tensor defines the sample count.
        sample_count = self.user_tensor.size(0)
        return sample_count

    def __getitem__(self, index):
        # Returns a (user, item, target) triple for the requested position.
        users, items, targets = self.user_tensor, self.item_tensor, self.target_tensor
        return users[index], items[index], targets[index]

def load_and_sample_data(csv_file, frac=0.1):
    """Read ``csv_file`` and return a random fraction of its rows.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        frac: Fraction of rows to keep; defaults to 0.1.

    Returns:
        A DataFrame with the sampled rows. The sampling seed is fixed at 42,
        so repeated calls on the same file return the same rows.
    """
    return pd.read_csv(csv_file).sample(frac=frac, random_state=42)

# Input CSV locations on the mounted Google Drive.
ratings_path = '/content/drive/MyDrive/data/ratings.csv'
movies_path = '/content/drive/MyDrive/data/movies.csv'

# Only the ratings are subsampled (default 10%, fixed seed).
ratings = load_and_sample_data(ratings_path)
# The movie table is metadata for the join below, so it is loaded in full:
# sampling it too would leave the movie columns of every rating whose movie
# was not drawn as NaN after the left merge.
movies = load_and_sample_data(movies_path, frac=1.0)

# Combine the rating rows with their movie information (left join keeps
# every rating row).
full_data = pd.merge(ratings, movies, on='movieId', how='left')

# Split chronologically by timestamp: the oldest 70% of rows form the training
# set, the next 15% the validation set and the newest 15% the test set.
full_data_sorted = full_data.sort_values('timestamp')
train_cutoff = int(0.7 * len(full_data_sorted))
val_cutoff = int(0.85 * len(full_data_sorted))

train_df = full_data_sorted[:train_cutoff].copy()
validation_df = full_data_sorted[train_cutoff:val_cutoff].copy()
test_df = full_data_sorted[val_cutoff:].copy()

# Build contiguous 0-based indices for users and items. The maps are built
# from all splits, so every id seen in validation/test also has an index.
user_to_index = {user_id: index for index, user_id in enumerate(sorted(full_data['userId'].unique()))}
item_to_index = {item_id: index for index, item_id in enumerate(sorted(full_data['movieId'].unique()))}

def create_dataset(df, user_to_index, item_to_index):
    """Convert a ratings frame into a ``MovieLens`` dataset.

    Args:
        df: Frame with ``userId``, ``movieId`` and ``rating`` columns.
        user_to_index: Mapping from raw user id to embedding row index.
        item_to_index: Mapping from raw movie id to embedding row index.

    Returns:
        A ``MovieLens`` dataset holding int64 user/item index tensors and a
        float32 rating tensor, in the row order of ``df``. ``df`` itself is
        not modified.
    """
    # Every id in ``df`` is expected to be present in the mappings.
    user_idx = df['userId'].map(user_to_index)
    item_idx = df['movieId'].map(item_to_index)

    return MovieLens(
        torch.tensor(user_idx.values, dtype=torch.long),
        torch.tensor(item_idx.values, dtype=torch.long),
        torch.tensor(df['rating'].values, dtype=torch.float32),
    )

def filter_users(df, min_ratings=10):
    """Drop the rows of users that have fewer than ``min_ratings`` rows.

    Args:
        df: Frame with a ``userId`` column.
        min_ratings: Minimum number of rows a user needs in ``df`` to be kept.

    Returns:
        The rows of ``df`` whose user has at least ``min_ratings`` rows, in
        their original order and with their original index labels.
    """
    # Number of rows contributed by each user.
    ratings_per_user = df['userId'].value_counts()

    # Ids of the users that reach the threshold.
    enough = ratings_per_user.ge(min_ratings)
    kept_users = ratings_per_user[enough].index

    # Keep only the rows that belong to those users.
    keep_row = df['userId'].isin(kept_users)
    return df[keep_row]

# Evaluation splits: drop users with fewer than 10 ratings inside the split
# (the filter_users default), then order the remaining rows by user id.
validation_df = filter_users(validation_df).sort_values(by='userId')
test_df = filter_users(test_df).sort_values(by='userId')

# Wrap each split as a dataset using the shared id -> index mappings.
train_dataset, validation_dataset, test_dataset = (
    create_dataset(split, user_to_index, item_to_index)
    for split in (train_df, validation_df, test_df)
)

# Training batches are fixed-size and reshuffled every epoch.
batch_size = 256
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

In [None]:
class UserBatchSampler(BatchSampler):
    """Batch sampler that yields one batch of sample positions per user.

    All batches are computed once at construction time by scanning
    ``data_source``; the ``BatchSampler`` constructor is not invoked. Each
    element of ``data_source`` must be a 3-tuple whose first entry exposes
    ``.item()`` returning the user id.
    """

    def __init__(self, data_source):
        self.user_batches = []
        self.data_source = data_source
        self.create_batches()

    def create_batches(self):
        """Group the positions of ``data_source`` by user id."""
        positions_by_user = {}
        for position, (user, _item, _target) in enumerate(self.data_source):
            positions_by_user.setdefault(user.item(), []).append(position)

        # Dict order is insertion order, so batches follow the order in which
        # each user first appears in the data source.
        self.user_batches = [*positions_by_user.values()]

    def __len__(self):
        # One batch per distinct user.
        return len(self.user_batches)

    def __iter__(self):
        yield from self.user_batches

# Use the per-user BatchSampler: each batch produced by these samplers holds
# the dataset positions of a single user's rows.
val_batch_sampler = UserBatchSampler(validation_dataset)
test_batch_sampler = UserBatchSampler(test_dataset)

# With batch_sampler set, the loaders emit one batch per user instead of
# fixed-size batches.
val_loader = DataLoader(validation_dataset, batch_sampler=val_batch_sampler)
test_loader = DataLoader(test_dataset, batch_sampler=test_batch_sampler)

In [None]:
# https://github.com/AmazingDD/MF-pytorch/blob/master/BiasMFRecommender.py

class BiasMF(torch.nn.Module):
    """Matrix factorization with user/item bias terms.

    The predicted rating for a (user, item) pair is

        global_mean + user_bias + item_bias + <user_vector, item_vector>

    Args:
        params: Dict with the keys ``num_users``, ``num_items``,
            ``latent_dim`` (embedding width) and ``global_mean`` (constant
            offset added to every prediction).
    """

    def __init__(self, params):
        super().__init__()
        self.num_users = params['num_users']
        self.num_items = params['num_items']
        self.latent_dim = params['latent_dim']
        self.mu = params['global_mean']

        self.user_embedding = torch.nn.Embedding(self.num_users, self.latent_dim)
        self.item_embedding = torch.nn.Embedding(self.num_items, self.latent_dim)

        # Bias tables start at zero so the initial prediction is just the
        # global mean plus the latent dot product.
        self.user_bias = torch.nn.Embedding(self.num_users, 1)
        self.item_bias = torch.nn.Embedding(self.num_items, 1)
        torch.nn.init.zeros_(self.user_bias.weight)
        torch.nn.init.zeros_(self.item_bias.weight)

    def forward(self, user_indices, item_indices):
        """Predict ratings for aligned 1-D index tensors.

        Args:
            user_indices: Long tensor of user indices.
            item_indices: Long tensor of item indices, same length.

        Returns:
            1-D tensor with one predicted rating per (user, item) pair.
        """
        user_vec = self.user_embedding(user_indices)
        item_vec = self.item_embedding(item_indices)
        dot = (user_vec * item_vec).sum(dim=1)

        user_b = self.user_bias(user_indices).view(-1)
        item_b = self.item_bias(item_indices).view(-1)

        # The global mean is added exactly once (it used to be added twice,
        # which shifted every prediction by an extra ``mu``).
        return dot + self.mu + user_b + item_b

In [None]:
# Model hyperparameters.
params = {
    'num_users': len(user_to_index),
    'num_items': len(item_to_index),
    'latent_dim': 50,
    # NOTE(review): fixed constant rather than the mean of the training
    # ratings — confirm this is intended.
    'global_mean': 3.0
}

model = BiasMF(params)
loss_function = torch.nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Device selection: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BiasMF(
  (user_embedding): Embedding(159453, 50)
  (item_embedding): Embedding(31805, 50)
  (user_bias): Embedding(159453, 1)
  (item_bias): Embedding(31805, 1)
)

In [None]:
# https://github.com/guoyang9/NCF/blob/master/evaluate.py
def hit(gt_item, pred_items):
    """Return 1 if ``gt_item`` is contained in ``pred_items``, otherwise 0."""
    return 1 if gt_item in pred_items else 0


def ndcg(gt_item, pred_items):
    """Return the discounted gain of ``gt_item`` within ``pred_items``.

    The gain is ``1 / log2(rank + 2)`` where ``rank`` is the 0-based position
    of the first occurrence of ``gt_item`` in the list ``pred_items``; it is
    0 when the item is not in the list.
    """
    if gt_item not in pred_items:
        return 0

    rank = pred_items.index(gt_item)
    # Rank 0 yields log2(2) = 1, so the best possible gain is 1.
    return np.reciprocal(np.log2(rank + 2))

In [None]:
def evaluate_model(model, loader, loss_function, top_k, device):
    """Evaluate ``model`` on ``loader`` and report loss, hit ratio and NDCG.

    The ranking metrics are computed per batch, so the loader is expected to
    yield one batch per user (as the loaders built on ``UserBatchSampler``
    do). Within a batch, every item among the top-k by true rating is scored
    against the top-k items by predicted rating.

    Args:
        model: Callable as ``model(user, item)`` returning 1-D predictions.
        loader: Iterable of ``(user, item, label)`` tensor batches with a
            length.
        loss_function: Callable as ``loss_function(predictions, label)``.
        top_k: Size of the recommendation list. Batches with fewer than
            ``top_k`` items are ranked over all of their items.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(avg_loss, mean_hit_ratio, mean_ndcg)``; the batch losses are
        averaged over the number of batches in ``loader``. Each value is NaN
        when there was nothing to average.
    """
    model.eval()
    total_loss = 0
    HR, NDCG = [], []

    with torch.no_grad():
        for user, item, label in tqdm(loader):
            user, item, label = user.to(device), item.to(device), label.to(device)
            predictions = model(user, item)
            total_loss += loss_function(predictions, label).item()

            # torch.topk raises when k exceeds the number of elements, so
            # clamp k for batches smaller than top_k.
            k = min(top_k, predictions.numel())

            _, indices = torch.topk(predictions, k)
            recommends = torch.take(item, indices).cpu().numpy().tolist()

            # Ground truth: the items with the highest true ratings in the batch.
            _, label_indices = torch.topk(label, k)
            gt_items_batch = torch.take(item, label_indices).cpu().numpy()

            for gt_item in gt_items_batch:
                HR.append(hit(gt_item, recommends))
                NDCG.append(ndcg(gt_item, recommends))

    # Average over the loader that was actually evaluated (previously the
    # global test_loader length was used regardless of ``loader``).
    num_batches = len(loader)
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    mean_hr = np.mean(HR) if HR else float('nan')
    mean_ndcg = np.mean(NDCG) if NDCG else float('nan')
    return avg_loss, mean_hr, mean_ndcg

In [None]:
num_epochs = 5

# Training and validation loop: one pass over train_loader per epoch,
# followed by an evaluation on the validation loader with top_k=10.
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    # The batch targets are named batch_ratings so the loop does not rebind
    # the module-level `ratings` DataFrame to a tensor.
    for user_indices, item_indices, batch_ratings in tqdm(train_loader):
        user_indices = user_indices.to(device)
        item_indices = item_indices.to(device)
        batch_ratings = batch_ratings.to(device)

        # Forward pass and loss computation.
        predictions = model(user_indices, item_indices)
        loss = loss_function(predictions, batch_ratings)
        train_loss += loss.item()

        # Backpropagation and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss, HR, NDCG = evaluate_model(model, val_loader, loss_function, 10, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}, HR: {HR:.4f}, NDCG: {NDCG:.4f}")
    torch.cuda.empty_cache()

100%|██████████| 6836/6836 [00:31<00:00, 215.62it/s]
100%|██████████| 10573/10573 [00:11<00:00, 909.30it/s] 


Epoch 1/5, Training Loss: 41.3508
Validation Loss: 36.6970, HR: 0.5360, NDCG: 0.2424


100%|██████████| 6836/6836 [00:31<00:00, 214.92it/s]
100%|██████████| 10573/10573 [00:11<00:00, 891.26it/s] 


Epoch 2/5, Training Loss: 17.0495
Validation Loss: 25.5352, HR: 0.5336, NDCG: 0.2409


100%|██████████| 6836/6836 [00:31<00:00, 215.73it/s]
100%|██████████| 10573/10573 [00:10<00:00, 992.64it/s]


Epoch 3/5, Training Loss: 7.4887
Validation Loss: 20.1674, HR: 0.5361, NDCG: 0.2416


100%|██████████| 6836/6836 [00:31<00:00, 219.99it/s]
100%|██████████| 10573/10573 [00:11<00:00, 913.30it/s] 


Epoch 4/5, Training Loss: 3.7281
Validation Loss: 17.3782, HR: 0.5403, NDCG: 0.2433


100%|██████████| 6836/6836 [00:31<00:00, 216.39it/s]
100%|██████████| 10573/10573 [00:11<00:00, 897.06it/s] 


Epoch 5/5, Training Loss: 2.1090
Validation Loss: 15.7912, HR: 0.5436, NDCG: 0.2446


In [None]:
# torch.save(model.state_dict(), '/content/drive/MyDrive/data/MF_model.pth')
# Final evaluation on the test loader with top_k=10.
# Note: the result is stored in avg_val_loss even though it is the test loss.
avg_val_loss, HR, NDCG = evaluate_model(model, test_loader, loss_function, 10, device)
print(f"Test Loss: {avg_val_loss:.4f}, HR: {HR:.4f}, NDCG: {NDCG:.4f}")

100%|██████████| 10150/10150 [00:11<00:00, 877.64it/s]


Test Loss: 20.5621, HR: 0.5188, NDCG: 0.2338
