<a href="https://colab.research.google.com/github/h5ng/4_prography_node_study/blob/master/6-1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import torch
from torch import nn, optim
from torch.utils.data import (Dataset, DataLoader, TensorDataset)
import tqdm
import pandas as pd
# 훈련 데이터와 테스트 데이터를 나누기 위해 사용한다
from sklearn import model_selection

In [18]:
# Only the first 294842 ratings are used. Passing nrows makes pandas stop
# parsing at that point instead of loading the whole file and slicing it.
df = pd.read_csv('./sample_data/ratings.csv', nrows=294842)

In [20]:
# X holds the (userId, movieId) pairs, Y holds the matching ratings
X = df.loc[:, ['userId', 'movieId']].to_numpy()
Y = df.loc[:, ['rating']].to_numpy()

print(X)

[[   1    2]
 [   1   29]
 [   1   32]
 ...
 [2019 3186]
 [2019 3201]
 [2019 3298]]


In [21]:
# Split into training and test data at a 9:1 ratio.
# A fixed random_state keeps the split (and every score derived from it)
# identical across kernel restarts.
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(
    X, Y, test_size=0.1, random_state=0
)

In [22]:
# X contains integer IDs, so it becomes an int64 tensor;
# Y contains real-valued ratings, so it becomes a float32 tensor.
def _as_dataset(pairs, ratings):
    """Wrap ID pairs and their ratings in a TensorDataset."""
    return TensorDataset(
        torch.tensor(pairs, dtype=torch.int64),
        torch.tensor(ratings, dtype=torch.float32),
    )


train_dataset = _as_dataset(train_X, train_Y)
test_dataset = _as_dataset(test_X, test_Y)

# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=1024, num_workers=4)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)

test_loader = DataLoader(test_dataset, **loader_kwargs)

In [23]:
class MatrixFactorization(nn.Module):
  """Matrix-factorization rating predictor.

  Every user and every item gets a k-dimensional embedding. The predicted
  rating is the dot product of the two embeddings, passed through a sigmoid
  and scaled by ``max_rating``.

  Args:
    max_user: number of rows in the user embedding table. IDs are used
      directly as row indices, so this must be larger than the biggest
      user ID.
    max_item: number of rows in the item embedding table (same rule).
    k: dimensionality of the latent factors.
    max_rating: upper end of the rating scale. The default of 5 reproduces
      the previously hard-coded behaviour.
  """

  def __init__(self, max_user, max_item, k=20, max_rating=5.0):
    super().__init__()
    self.max_user = max_user
    self.max_item = max_item
    self.max_rating = max_rating
    # Index 0 is declared as the padding row: its vector is all zeros and
    # receives no gradient updates.
    self.user_emb = nn.Embedding(max_user, k, padding_idx=0)
    self.item_emb = nn.Embedding(max_item, k, padding_idx=0)

  def forward(self, x):
    """Predict ratings for a batch of (user, item) index pairs.

    Args:
      x: 2-D integer tensor; column 0 holds user indices and column 1
        holds item indices.

    Returns:
      1-D tensor with one predicted rating per row of ``x``.
    """
    user_idx = x[:, 0]
    item_idx = x[:, 1]
    user_feature = self.user_emb(user_idx)
    item_feature = self.item_emb(item_idx)

    # user_feature * item_feature has shape (batch_size, k), so summing
    # over k gives the dot product of each sample.
    out = torch.sum(user_feature * item_feature, 1)

    # Squash into the rating range [0, max_rating]. torch.sigmoid is used
    # because nn.functional.sigmoid emits a deprecation warning on some
    # PyTorch versions.
    return torch.sigmoid(out) * self.max_rating

In [24]:
# Column-wise maximum of X: the largest user ID and the largest item ID
id_max = X.max(0)
print(id_max)
# Cast the np.int64 values to standard Python ints
max_user, max_item = (int(v) for v in id_max)

# IDs are used directly as embedding indices, so each table needs max ID + 1 rows
net = MatrixFactorization(max_user+1, max_item+1)

[  2019 130219]


In [24]:
# Evaluation function
def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
  """Score a model's predictions over every batch of a loader.

  Args:
    net: callable model; receives each batch of inputs moved to ``device``.
    loader: iterable yielding ``(x, y)`` batches.
    score_fn: function taking ``(targets, predictions)`` and returning a
      scalar tensor. Defaults to the mean absolute error.
    device: device the inputs are moved to before calling ``net``.

  Returns:
    The score as a Python float.
  """
  ys = []
  ypreds = []
  for x, y in loader:
    x = x.to(device)
    ys.append(y)
    # No gradients are needed for evaluation.
    with torch.no_grad():
      ypred = net(x).to("cpu").view(-1)
    # Append exactly once per batch so predictions stay aligned with targets
    # (a second append made the prediction list twice as long as the targets).
    ypreds.append(ypred)
  # Flatten with reshape(-1) rather than squeeze() so a single-sample loader
  # still yields a 1-D tensor that matches the flattened predictions.
  score = score_fn(torch.cat(ys).reshape(-1), torch.cat(ypreds))
  return score.item()