In [1]:
import pandas as pd
import numpy as np
import torch
import torch.utils.data as data_utils

# 데이터셋 로드
ratings_df = pd.read_csv('data/ratings.csv')

# 유저, 아이템 인덱스 매핑
user_ids = ratings_df['userId'].unique().tolist()
# user2idx = {o:i for i,o in enumerate(user_ids)}
# ratings_df['user'] = ratings_df['userId'].apply(lambda x: user2idx[x])
ratings_df['user'] = ratings_df['userId'].astype("category").cat.codes

movie_ids = ratings_df['movieId'].unique().tolist()
# movie2idx = {o:i for i,o in enumerate(movie_ids)}
# ratings_df['movie'] = ratings_df['movieId'].apply(lambda x: movie2idx[x])
ratings_df['movie'] = ratings_df['movieId'].astype("category").cat.codes

# 트레이닝셋, 테스트셋 분리
shuffled_ratings = ratings_df.sample(frac=1., random_state=42)
val_size = int(len(shuffled_ratings) * 0.2)
train_ratings = shuffled_ratings[val_size:]
test_ratings = shuffled_ratings[:val_size]

# 파이토치 데이터셋으로 변환
train_data = data_utils.TensorDataset(torch.LongTensor(train_ratings['user']),
                                       torch.LongTensor(train_ratings['movie']),
                                       torch.FloatTensor(train_ratings['rating']))

test_data = data_utils.TensorDataset(torch.LongTensor(test_ratings['user'].values),
                                      torch.LongTensor(test_ratings['movie'].values),
                                      torch.FloatTensor(test_ratings['rating'].values))


In [2]:
import torch.nn as nn

class MF(nn.Module):
    def __init__(self, n_users, n_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(n_users, emb_size)
        self.item_emb = nn.Embedding(n_items, emb_size)
        self.rating_pred = nn.Sequential(
            nn.Linear(emb_size*2, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, user, item):
        user_emb = self.user_emb(user)
        item_emb = self.item_emb(item)
        pred = self.rating_pred(torch.cat([user_emb, item_emb], dim=1))
        return pred.view(-1)

In [3]:
import torch.optim as optim
from torch.utils.data import DataLoader

# 모델 및 옵티마이저 생성
# model = MF(len(user2idx), len(movie2idx), emb_size=100)
model = MF(ratings_df['user'].nunique(), ratings_df['movie'].nunique(), emb_size=100)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# 데이터 로더 생성
train_loader = DataLoader(train_data, batch_size=512, shuffle=True)
test_loader = DataLoader(test_data, batch_size=512, shuffle=False)

# 모델 학습
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    train_loss = 0.
    for user, item, rating in train_loader:
        optimizer.zero_grad()
        pred = model(user, item)
        loss = criterion(pred, rating)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * user.shape[0]
    train_loss /= len(train_ratings)
    
    model.eval()
    with torch.no_grad():
        test_loss = 0.
        for user, item, rating in test_loader:
            pred = model(user, item)
            loss = criterion(pred, rating)
            test_loss += loss.item() * user.shape[0]
        test_loss /= len(test_ratings)

    print('Epoch {} - train loss: {:.4f} - test loss: {:.4f}'.format(epoch+1, train_loss, test_loss))

Epoch 1 - train loss: 7.9998 - test loss: 7.3952
Epoch 2 - train loss: 7.3729 - test loss: 7.3508
Epoch 3 - train loss: 7.3543 - test loss: 7.3433
Epoch 4 - train loss: 7.3501 - test loss: 7.3407
Epoch 5 - train loss: 7.3485 - test loss: 7.3396
Epoch 6 - train loss: 7.3477 - test loss: 7.3390
Epoch 7 - train loss: 7.3473 - test loss: 7.3386
Epoch 8 - train loss: 7.3470 - test loss: 7.3384
Epoch 9 - train loss: 7.3469 - test loss: 7.3382
Epoch 10 - train loss: 7.3468 - test loss: 7.3381


In [4]:
from sklearn.metrics import mean_squared_error

# 예측 평점 계산
model.eval()
with torch.no_grad():
    preds = []
    for user, item, rating in test_loader:
        pred = model(user, item)
        preds.extend(pred.detach().numpy())

# 검증 데이터셋의 실제 평점과 예측 평점 비교
test_ratings = test_ratings.copy()
test_ratings['pred'] = preds
test_ratings['rating'] = test_ratings['rating'].clip(0., 5.)
test_ratings['pred'] = test_ratings['pred'].clip(0., 5.)
rmse = mean_squared_error(test_ratings['rating'], test_ratings['pred'], squared=False)
print('Test RMSE: {:.4f}'.format(rmse))

Test RMSE: 2.7089


In [5]:
train_ratings

Unnamed: 0,userId,movieId,rating,timestamp,user,movie
87803,567,750,3.0,1525287719,566,602
80568,509,7347,3.0,1435994597,508,4893
50582,326,71462,4.0,1322252335,325,7127
8344,57,2115,3.0,965798155,56,1575
99603,610,1127,4.0,1479544102,609,855
...,...,...,...,...,...,...
6265,42,4005,4.0,996259059,41,2986
54886,364,141,4.0,869443367,363,116
76820,480,6867,4.0,1179163171,479,4601
860,6,981,3.0,845556567,5,748


In [9]:
# 특정 사용자에게 추천할 영화 리스트 생성
user_id = 3
num_movies = 10
movies_not_rated_by_user = []
for movie_id in range(num_movies):
    if movie_id not in train_ratings[train_ratings['user'] == user_id]['movieId'].unique():
        movies_not_rated_by_user.append(movie_id)

# 생성한 영화 리스트로부터 예측 평점 계산
model.eval()
ratings = []
with torch.no_grad():
    for movie_id in movies_not_rated_by_user:
        user = torch.LongTensor([user_id-1])
        item = torch.LongTensor([movie_id])
        rating = model(user, item)
        ratings.append(rating.item())
        
movies_not_rated_by_user, ratings

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0.9999598264694214,
  0.9960137605667114,
  0.9990236759185791,
  0.9987804293632507,
  0.9997416138648987,
  0.9998155236244202,
  0.999221920967102,
  0.999830961227417,
  0.9961310625076294,
  0.9989318251609802])

In [11]:
df_recomm = pd.DataFrame([movies_not_rated_by_user, ratings]).T
df_recomm.columns = ["movie", "ratings"]
df_recomm.nlargest(10, "ratings")

Unnamed: 0,movie,ratings
0,0.0,0.99996
7,7.0,0.999831
5,5.0,0.999816
4,4.0,0.999742
6,6.0,0.999222
2,2.0,0.999024
9,9.0,0.998932
3,3.0,0.99878
8,8.0,0.996131
1,1.0,0.996014
