# Wide & Deep Model Implementation

이 노트북에서는 Google의 **Wide & Deep Learning** 모델을 구현합니다.
- **Wide Component**: 암기(Memorization)를 담당. 여기서는 영화의 **장르(Genre)** 정보를 사용합니다.
- **Deep Component**: 일반화(Generalization)를 담당. 유저와 아이템의 **임베딩**(Embedding)을 사용합니다.

### 1. 데이터 로드 및 전처리

In [1]:
import os

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: read the raw ratings and movie-metadata tables.
ratings = pd.read_csv('../data/ratings.csv')
movies = pd.read_csv('../data/movies.csv')

# Steps 2-3: join the movie columns onto every rating row via movieId, then
# keep a reproducible 10% sample so the rest of the notebook runs quickly.
data = ratings.merge(movies, on='movieId').sample(frac=0.1, random_state=42)

print(f"Data size: {len(data)}")
data.head()

Data size: 3200020


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
10685861,66954,781,5.0,850944577,Stealing Beauty (1996),Drama
1552723,9877,574,4.0,945495614,Spanking the Monkey (1994),Comedy|Drama
6145184,38348,1088,2.0,999974867,Dirty Dancing (1987),Drama|Musical|Romance
16268584,101952,2706,1.0,1203077565,American Pie (1999),Comedy|Romance
22418634,140400,275079,3.5,1653782463,Chip 'n Dale: Rescue Rangers (2022),Adventure|Animation|Children|Comedy|Fantasy|My...


### 2. Feature Engineering (Wide & Deep Input)
- **Deep Input**: `userId`, `movieId` (Label Encoding -> Embedding)
- **Wide Input**: `genres` (Multi-hot Encoding)

In [2]:
# --- Deep part: map raw userId / movieId values to contiguous indices ---
# Indices follow the order of first appearance in `data`; they are stored in
# the `user_idx` / `item_idx` columns.
user_ids = data['userId'].unique()
item_ids = data['movieId'].unique()
num_users = len(user_ids)
num_items = len(item_ids)

user2idx = dict(zip(user_ids, range(num_users)))
item2idx = dict(zip(item_ids, range(num_items)))

data['user_idx'] = data['userId'].map(user2idx)
data['item_idx'] = data['movieId'].map(item2idx)

# --- Wide part: multi-hot encode the genres ---
# Split the pipe-delimited genre string into a list of genre names,
# e.g. 'Comedy|Drama' -> ['Comedy', 'Drama'].
data['genres_list'] = data['genres'].map(lambda genre_str: genre_str.split('|'))

mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(data['genres_list'])
num_genres = len(mlb.classes_)

# The multi-hot matrix is deliberately kept as a separate array instead of a
# DataFrame column: storing lists/arrays inside a DataFrame can be slow, and
# a plain array is easier to turn into a tensor later.
print(f"Users: {num_users}, Items: {num_items}, Genres: {num_genres}")
print(f"Genre Classes: {mlb.classes_}")

Users: 197270, Items: 42809, Genres: 20
Genre Classes: ['(no genres listed)' 'Action' 'Adventure' 'Animation' 'Children' 'Comedy'
 'Crime' 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'IMAX'
 'Musical' 'Mystery' 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western']


### 3. Dataset & DataLoader

In [3]:
class WideAndDeepDataset(Dataset):
    """Tensor-backed dataset yielding ``(user, item, genres, rating)`` samples.

    Args:
        df: DataFrame providing the ``user_idx``, ``item_idx`` and ``rating``
            columns.
        genres_encoded: Array-like of multi-hot genre rows whose first axis
            is positionally aligned with the rows of ``df``.

    Raises:
        ValueError: If ``genres_encoded`` and ``df`` have different numbers
            of rows.
    """

    def __init__(self, df, genres_encoded):
        # The genre matrix travels separately from the frame, so guard
        # against the two getting out of step (which would otherwise pair
        # samples with the wrong genre row or fail later while indexing).
        if len(genres_encoded) != len(df):
            raise ValueError(
                f"genres_encoded has {len(genres_encoded)} rows "
                f"but df has {len(df)} rows"
            )

        # Explicit dtypes: integer indices for embedding lookups, float32
        # for the wide input and the regression target.
        self.users = torch.tensor(df['user_idx'].to_numpy(), dtype=torch.long)
        self.items = torch.tensor(df['item_idx'].to_numpy(), dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].to_numpy(), dtype=torch.float32)
        self.genres = torch.tensor(genres_encoded, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(user_idx, item_idx, genre_vector, rating)`` at ``idx``."""
        return self.users[idx], self.items[idx], self.genres[idx], self.ratings[idx]

# Train / validation / test split.
# Positional row indices are split (rather than the frame itself) so that the
# same indices can select matching rows from both `data` and `genres_encoded`.
all_positions = range(len(data))
train_idx, test_idx = train_test_split(all_positions, test_size=0.1, random_state=42)
# 0.111 of the remaining 90% is roughly 10% of the full data set.
train_idx, val_idx = train_test_split(train_idx, test_size=0.111, random_state=42)

splits = (train_idx, val_idx, test_idx)
train_df, val_df, test_df = (data.iloc[pos] for pos in splits)
train_genres, val_genres, test_genres = (genres_encoded[pos] for pos in splits)

train_ds = WideAndDeepDataset(train_df, train_genres)
val_ds = WideAndDeepDataset(val_df, val_genres)
test_ds = WideAndDeepDataset(test_df, test_genres)

batch_size = 4096
# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(ds, batch_size=batch_size, shuffle=False) for ds in (val_ds, test_ds)
)

### 4. Wide & Deep Model Architecture
- **Wide**: Linear(num_genres -> 1)
- **Deep**: Concat[Embedding(User), Embedding(Item)] -> MLP -> Linear(last MLP width -> 1)
- **Output**: Wide + Deep

In [4]:
class WideAndDeep(nn.Module):
    """Wide & Deep rating predictor.

    The wide part is a single linear layer over the multi-hot genre vector.
    The deep part concatenates a user embedding and an item embedding and
    feeds them through an MLP (Linear -> ReLU -> Dropout(0.2) per layer)
    followed by a final linear layer. The prediction is the sum of both parts.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        num_genres: Width of the multi-hot genre input.
        embed_dim: Size of each user / item embedding vector.
        mlp_layers: Output sizes of the successive hidden layers of the MLP.
    """

    # The default for ``mlp_layers`` is a tuple so that it is not a shared
    # mutable object; any iterable of ints (including a list) is accepted.
    def __init__(self, num_users, num_items, num_genres, embed_dim=32, mlp_layers=(64, 32, 16)):
        super().__init__()

        # --- Wide part ---
        # Takes the multi-hot genre vector and contributes directly to the
        # predicted rating.
        self.wide_linear = nn.Linear(num_genres, 1)

        # --- Deep part ---
        self.user_embedding = nn.Embedding(num_users, embed_dim)
        self.item_embedding = nn.Embedding(num_items, embed_dim)

        mlp_modules = []
        input_size = embed_dim * 2  # user and item embeddings are concatenated
        for output_size in mlp_layers:
            mlp_modules.append(nn.Linear(input_size, output_size))
            mlp_modules.append(nn.ReLU())
            mlp_modules.append(nn.Dropout(0.2))
            input_size = output_size
        self.deep_mlp = nn.Sequential(*mlp_modules)
        self.deep_predict = nn.Linear(input_size, 1)

        self._init_weights()

    def _init_weights(self):
        """Initialise embedding and linear weights (biases keep their defaults)."""
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)
        nn.init.xavier_uniform_(self.wide_linear.weight)
        for m in self.deep_mlp:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        # NOTE(review): forward() applies no sigmoid to the output; the
        # 'sigmoid' setting is kept unchanged to preserve the existing
        # initialisation — confirm it is intentional.
        nn.init.kaiming_uniform_(self.deep_predict.weight, a=1, nonlinearity='sigmoid')

    def forward(self, users, items, genres):
        """Predict one rating per sample.

        Args:
            users: 1-D integer tensor of user indices, shape ``(batch,)``.
            items: 1-D integer tensor of item indices, shape ``(batch,)``.
            genres: Float tensor of shape ``(batch, num_genres)``.

        Returns:
            Tensor of shape ``(batch,)`` with the predicted ratings.
        """
        # Wide
        wide_out = self.wide_linear(genres)

        # Deep
        u_emb = self.user_embedding(users)
        i_emb = self.item_embedding(items)
        deep_in = torch.cat([u_emb, i_emb], dim=1)
        deep_features = self.deep_mlp(deep_in)
        deep_out = self.deep_predict(deep_features)

        # Combine. Squeeze only the trailing singleton dimension: a bare
        # squeeze() would also drop the batch dimension when batch == 1 and
        # return a 0-d tensor that no longer matches the target shape.
        return (wide_out + deep_out).squeeze(-1)


### 5. Training Loop

In [5]:
# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

model = WideAndDeep(num_users, num_items, num_genres).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10
best_val_rmse = float('inf')
patience = 3  # stop after this many consecutive epochs without improvement
counter = 0   # epochs since the last validation improvement

# Make sure the checkpoint directory exists before the first save; otherwise
# torch.save would fail at the end of the first epoch.
model_path = '../models/wide_deep_model.pth'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for users, items, genres, ratings in train_loader:
        users, items, genres, ratings = users.to(device), items.to(device), genres.to(device), ratings.to(device)

        optimizer.zero_grad()
        outputs = model(users, items, genres)
        loss = criterion(outputs, ratings)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size so the
        # epoch total is a sum of squared errors over all samples.
        train_loss += loss.item() * users.size(0)

    train_rmse = (train_loss / len(train_ds)) ** 0.5

    # ---- validation pass (dropout disabled, no gradients) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for users, items, genres, ratings in val_loader:
            users, items, genres, ratings = users.to(device), items.to(device), genres.to(device), ratings.to(device)
            outputs = model(users, items, genres)
            val_loss += criterion(outputs, ratings).item() * users.size(0)
    val_rmse = (val_loss / len(val_ds)) ** 0.5

    print(f"Epoch {epoch+1:2d} | Train RMSE: {train_rmse:.4f} | Val RMSE: {val_rmse:.4f}")

    # ---- checkpointing / early stopping on validation RMSE ----
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        torch.save(model.state_dict(), model_path)
        counter = 0
        print("  -> Saved Best Model")
    else:
        counter += 1
        if counter >= patience:
            print("Early Stopping")
            break

Using device: mps
Epoch  1 | Train RMSE: 1.7443 | Val RMSE: 0.9131
  -> Saved Best Model
Epoch  2 | Train RMSE: 1.0625 | Val RMSE: 0.8934
  -> Saved Best Model
Epoch  3 | Train RMSE: 0.9802 | Val RMSE: 0.8871
  -> Saved Best Model
Epoch  4 | Train RMSE: 0.9246 | Val RMSE: 0.8867
  -> Saved Best Model
Epoch  5 | Train RMSE: 0.8791 | Val RMSE: 0.8868
Epoch  6 | Train RMSE: 0.8394 | Val RMSE: 0.8896
Epoch  7 | Train RMSE: 0.8074 | Val RMSE: 0.8938
Early Stopping


In [6]:
# Final evaluation: reload the best checkpoint and report RMSE on the test set.
# map_location remaps the stored tensors onto the active device, so the
# checkpoint still loads when the device it was saved from is not available.
model.load_state_dict(torch.load('../models/wide_deep_model.pth', map_location=device))
model.eval()
test_loss = 0
with torch.no_grad():
    for users, items, genres, ratings in test_loader:
        users, items, genres, ratings = users.to(device), items.to(device), genres.to(device), ratings.to(device)
        outputs = model(users, items, genres)
        # Weight the batch-mean loss by batch size to accumulate a sum over samples.
        test_loss += criterion(outputs, ratings).item() * users.size(0)
print(f"Test RMSE: {(test_loss / len(test_ds))**0.5:.4f}")

Test RMSE: 0.8881
