In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from datetime import datetime

In [15]:
ratings = pd.read_csv('/content/sample_data/MyDrive/BigData/Project/3/ratings.csv')
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [17]:
# timestamp를 날짜로 변환해서 최신순으로 가중치 부여
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

# 최근일 기준 경과일수 계산 -> 가중치 계산
most_recent = ratings['timestamp'].max()
ratings['days_since'] = (most_recent - ratings['timestamp']).dt.days

ratings['weight'] = np.exp(-ratings['days_since']/30)

In [27]:
# 모델 학습을 위해 userId, movieId 인덱스 변환
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings['user_idx'] = user_encoder.fit_transform(ratings['userId'])
ratings['movie_idx'] = movie_encoder.fit_transform(ratings['movieId'])

ratings.set_index(['user_idx', 'movie_idx'], inplace=True)

# tmdb의 tmdb_id랑 ratings의 movieId 매핑을 위해 links.csv 병합
links = pd.read_csv('/content/sample_data/MyDrive/BigData/Project/3/links.csv')
ratings_with_tmdb = pd.merge(ratings.reset_index(), links[['movieId', 'tmdbId']], on='movieId', how='left')

ratings_with_tmdb.head(10)

Unnamed: 0,user_idx,movie_idx,userId,movieId,rating,timestamp,days_since,weight,tmdbId
0,0,0,1,1,4.0,2000-07-30 18:45:03,6629,1.084916e-96,862.0
1,0,2,1,3,4.0,2000-07-30 18:20:47,6629,1.084916e-96,15602.0
2,0,5,1,6,4.0,2000-07-30 18:37:04,6629,1.084916e-96,949.0
3,0,43,1,47,5.0,2000-07-30 19:03:35,6629,1.084916e-96,807.0
4,0,46,1,50,5.0,2000-07-30 18:48:51,6629,1.084916e-96,629.0
5,0,62,1,70,3.0,2000-07-30 18:40:00,6629,1.084916e-96,755.0
6,0,89,1,101,5.0,2000-07-30 18:14:28,6629,1.084916e-96,13685.0
7,0,97,1,110,4.0,2000-07-30 18:36:16,6629,1.084916e-96,197.0
8,0,124,1,151,5.0,2000-07-30 19:07:21,6629,1.084916e-96,11780.0
9,0,130,1,157,5.0,2000-07-30 19:08:20,6629,1.084916e-96,1775.0


In [24]:
num_users = ratings_with_tmdb['user_idx'].nunique()
num_movies = ratings_with_tmdb['movie_idx'].nunique()

print(f"Users: {num_users}, Movies: {num_movies}")

Users: 610, Movies: 9724


In [25]:
train_df, val_df = train_test_split(ratings_with_tmdb, test_size=0.2, random_state=42)

In [33]:
class MovieLensDataset(Dataset):
  def __init__(self, df):
    self.users = torch.tensor(df['user_idx'].values, dtype=torch.long)
    self.movies = torch.tensor(df['movie_idx'].values, dtype=torch.long)
    self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)
    self.weights = torch.tensor(df['weight'].values, dtype=torch.float32)

  def __len__(self):
    return len(self.ratings)

  def __getitem__(self, idx):
    return {
        'user': self.users[idx],
        'movie': self.movies[idx],
        'rating': self.ratings[idx],
        'weight': self.weights[idx]
    }

# 학습, 검증 데이터셋
train_dataset = MovieLensDataset(train_df)
val_dataset = MovieLensDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False)

In [37]:
# NCF 모델 정의
class NCF(nn.Module):
  def __init__(self, num_users, num_movies, embedding_dim=32, hidden_dims=[64, 32, 16]):
    super(NCF, self).__init__()

    self.user_embedding = nn.Embedding(num_users, embedding_dim)
    self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

    layers = []
    input_dim = embedding_dim * 2
    for h in hidden_dims:
      layers.append(nn.Linear(input_dim, h))
      layers.append(nn.ReLU())
      input_dim = h
    layers.append(nn.Linear(input_dim, 1))

    self.mlp = nn.Sequential(*layers)

  def forward(self, user_indices, movie_indices):
    user_vec = self.user_embedding(user_indices)
    movie_vec = self.movie_embedding(movie_indices)

    x = torch.cat([user_vec, movie_vec], dim=1)
    out = self.mlp(x).squeeze()
    return out

In [38]:
# 손실 함수 정의
def weighted_mse_loss(pred, target, weight):
  return torch.mean(weight*(pred-target)**2)

In [39]:
device = torch.device('cuba' if torch.cuba.is_available() else 'cpu')
model = NCF(num_users, num_movies).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [40]:
# 학습 루프
epochs = 10

for epoch in range(epochs):
  model.train()
  total_loss = 0

  for batch in train_loader:
    user = batch['user'].to(device)
    movie = batch['movie'].to(device)
    rating = batch['rating'].to(device)
    weight = batch['weight'].to(device)

    pred = model(user, movie)
    loss = weighted_mse_loss(pred, rating, weight)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item() * len(user)

avg_train_loss = total_loss / len(train_loader.dataset)

model.eval()
total_val_loss = 0
with torch.no_grad():
  for batch in val_loader:
    user = batch['user'].to(device)
    movie = batch['movie'].to(device)
    rating = batch['rating'].to(device)
    weight = batch['weight'].to(device)

    pred = model(user, movie)
    loss = weighted_mse_loss(pred, rating, weight)
    total_val_loss += loss.item() * len(user)

avg_val_loss = total_val_loss / len(val_loader.dataset)

print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

Epoch 10/10 | Train Loss: 0.0028 | Val Loss: 0.0068


In [42]:
# 사용자 0이 영화 0에 줄 평점 예측
user_id = 0
movie_id = 0

model.eval()
with torch.no_grad():
  pred = model(
      torch.tensor([user_id], dtype=torch.long).to(device),
      torch.tensor([movie_id], dtype=torch.long).to(device)
  )

print(f"예측 평점: {pred.item():.2f}")

예측 평점: 3.18
