<a href="https://colab.research.google.com/github/drvoss/Colab-Notebooks/blob/master/MatrixFactorization_RecommendationSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget http://files.grouplens.org/datasets/movielens/ml-20m.zip
!unzip ml-20m.zip

--2019-03-20 12:55:07--  http://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.34.235
Connecting to files.grouplens.org (files.grouplens.org)|128.101.34.235|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2019-03-20 12:55:10 (66.8 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]

Archive:  ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [0]:
import torch
from torch import nn, optim
from torch.utils.data import (Dataset,DataLoader,TensorDataset)
import tqdm

import pandas as pd
# Used to split the data into training and test sets
from sklearn import model_selection

# Fixed seed so the train/test split is identical after every kernel restart
SPLIT_SEED = 0

df = pd.read_csv("/content/ml-20m/ratings.csv")
# X holds the (userId, movieId) pairs, Y the matching rating column (kept 2-D: one column)
X = df[["userId", "movieId"]].values
Y = df[["rating"]].values
# Split into training and test data at a 9:1 ratio
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(
  X, Y, test_size=0.1, random_state=SPLIT_SEED)
# X contains integer IDs, so it becomes an int64 tensor;
# Y contains real-valued ratings, so it becomes a float32 tensor
train_dataset = TensorDataset(
  torch.tensor(train_X, dtype=torch.int64),
  torch.tensor(train_Y, dtype=torch.float32))
test_dataset = TensorDataset(
  torch.tensor(test_X, dtype=torch.int64),
  torch.tensor(test_Y, dtype=torch.float32))
# Only the training loader shuffles; the test loader keeps a fixed order
train_loader = DataLoader(train_dataset, batch_size=1024, num_workers=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, num_workers=4)

class MatrixFactorization(nn.Module):
  """Matrix-factorization rating model.

  Each user and each item gets a k-dimensional embedding; the predicted
  rating is the dot product of the two, squashed into the range [0, 5].
  """

  def __init__(self, max_user, max_item, k=20):
    """Create the user and item embedding tables.

    Args:
      max_user: number of rows in the user embedding table.
      max_item: number of rows in the item embedding table.
      k: dimensionality of each embedding vector.
    """
    super().__init__()
    self.max_user = max_user
    self.max_item = max_item
    # padding_idx=0: row 0 is initialised to zeros and receives no gradient updates
    self.user_emb = nn.Embedding(max_user, k, padding_idx=0)
    self.item_emb = nn.Embedding(max_item, k, padding_idx=0)

  def forward(self, x):
    """Predict ratings for a batch of (user index, item index) pairs.

    Args:
      x: integer tensor of shape (batch_size, 2); column 0 is the user
        index and column 1 is the item index.

    Returns:
      Tensor of shape (batch_size,) with values in [0, 5].
    """
    user_idx = x[:, 0]
    item_idx = x[:, 1]
    user_feature = self.user_emb(user_idx)
    item_feature = self.item_emb(item_idx)
    # user_feature * item_feature has shape (batch_size, k), so summing
    # over the k axis yields the per-sample dot product
    out = torch.sum(user_feature * item_feature, 1)
    # Squash into the [0, 5] range; torch.sigmoid replaces the deprecated
    # nn.functional.sigmoid
    out = torch.sigmoid(out) * 5
    return out
  
# Largest userId and movieId in the data, converted from np.int64 to plain Python ints
max_user, max_item = (int(largest) for largest in X.max(0))
# The IDs index the embedding tables directly, so each table gets (largest ID + 1) rows
net = MatrixFactorization(max_user + 1, max_item + 1)

def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
  """Score a network over every batch of a loader.

  Args:
    net: callable model; it is called as net(x) for each batch.
    loader: iterable yielding (x, y) batches.
    score_fn: called as score_fn(targets, predictions) on the flattened,
      concatenated tensors; defaults to the mean absolute error.
    device: device each input batch is moved to before the forward pass.

  Returns:
    The score as a Python float.
  """
  # Evaluate in eval mode so layers such as dropout / batch norm behave
  # deterministically, then put the net back into the mode it was in.
  was_training = getattr(net, "training", False)
  if was_training:
    net.eval()
  ys = []
  ypreds = []
  try:
    with torch.no_grad():
      for x, y in loader:
        x = x.to(device)
        # reshape(-1) rather than squeeze(): a single-sample set must stay
        # 1-D so it lines up with the flattened predictions
        ys.append(y.reshape(-1))
        ypreds.append(net(x).to("cpu").reshape(-1))
  finally:
    if was_training:
      net.train()
  score = score_fn(torch.cat(ys), torch.cat(ypreds))
  return score.item()

from statistics import mean
# Train on the first GPU when one is available; otherwise fall back to the CPU
# instead of crashing on a runtime without CUDA
device = "cuda:0" if torch.cuda.is_available() else "cpu"
net.to(device)
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()
for epoch in range(5):
  loss_log = []
  for x, y in tqdm.tqdm(train_loader):
    x = x.to(device)
    y = y.to(device)
    o = net(x)
    # Flatten y so its shape matches the 1-D prediction and MSELoss does not broadcast
    loss = loss_f(o, y.view(-1))
    net.zero_grad()
    loss.backward()
    opt.step()
    loss_log.append(loss.item())
  # Score on the held-out split with eval_net's default metric
  test_score = eval_net(net, test_loader, device=device)
  # epoch index, mean training loss of the epoch, test score
  print(epoch, mean(loss_log), test_score, flush=True)
  
# Bring the trained model back to the CPU for inference
net.to("cpu")
# Predict the rating of user 1 for movie 10: a single (userId, movieId) row,
# built directly as an int64 tensor of shape (1, 2) (the leading axis is the batch)
query = torch.tensor([(1, 10)], dtype=torch.int64)
# Feed the query to the network
net(query)

# Score every movieId in 1..max_item for user 1.
# Both columns are created as int64 up front, so no float tensor is stacked
# with an integer one and then cast back.
user_ids = torch.ones(max_item, dtype=torch.int64)
item_ids = torch.arange(1, max_item + 1, dtype=torch.int64)
query = torch.stack([user_ids, item_ids], 1)
# scores: the 5 highest predicted ratings
# indices: their 0-based row positions in query (NOT the movieIds themselves)
scores, indices = torch.topk(net(query), 5)
# Row i of query holds movieId i + 1, so look the real movieIds up in the query
movie_ids = query[indices, 1]