In [1]:
import pickle
import pandas as pd
import numpy as np
import os, sys, gc 
from plotnine import *
import plotnine

from tqdm.notebook import tqdm as tqdm_notebook
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl
from matplotlib import rc
import re
from matplotlib.ticker import PercentFormatter
import datetime
from math import log # IDF 계산을 위해

## Data

In [2]:
# Load the ratings table; later cells read its `userId`, `movieId` and
# `rating` columns.
movie = pd.read_csv(filepath_or_buffer="./ratings.csv")

In [3]:
# Give every distinct raw id a dense 0-based position, following the order in
# which `unique()` returns the ids.
user2idx = {user_id: pos for pos, user_id in enumerate(movie['userId'].unique())}
movie2idx = {movie_id: pos for pos, movie_id in enumerate(movie['movieId'].unique())}

# Reverse lookups: dense position -> raw id.
idx2user = dict(zip(user2idx.values(), user2idx.keys()))
idx2movie = dict(zip(movie2idx.values(), movie2idx.keys()))

In [4]:
# Attach the dense index columns to the ratings table, then expose each field
# as a NumPy array for building the sparse matrix.
movie['useridx'] = movie['userId'].map(user2idx)
movie['movieidx'] = movie['movieId'].map(movie2idx)
useridx = movie['useridx'].values
movieidx = movie['movieidx'].values
rating = movie['rating'].values

In [5]:
# Count of distinct users and distinct movies in the ratings table.
n_users, n_items = (movie[id_col].nunique() for id_col in ('userId', 'movieId'))

In [6]:
# `import scipy` on its own does not guarantee that the `sparse` submodule is
# loaded (older SciPy releases raise AttributeError on `scipy.sparse`), so the
# submodule is imported explicitly. This still binds the name `scipy`.
import scipy.sparse

# User x movie rating matrix in CSR form; pairs without a rating are implicit zeros.
# NOTE(review): csr_matrix sums the values of duplicate (user, movie) pairs —
# assumes each pair appears at most once in the table; TODO confirm.
ratings = scipy.sparse.csr_matrix(
    (rating, (useridx, movieidx)),
    shape=(len(set(useridx)), len(set(movieidx))),
)

## Model

In [51]:
import torch
import torch.nn.functional as F
from torch import nn
import torch.nn.init as weight_init

class MatrixFactorization(nn.Module):
    """Latent-factor model that scores a (user, item) pair by an embedding dot product.

    Args:
        R: The original rating matrix. It is stored as ``self.R`` and is not
            otherwise used by this class.
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_factors: Width of each embedding vector (default 20).
    """

    def __init__(self, R, n_users, n_items, n_factors=20):
        super().__init__()  # run torch.nn.Module's own initialisation first
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.item_factors = nn.Embedding(n_items, n_factors)

        # Overwrite the default embedding weights with Xavier-uniform values.
        for table in (self.user_factors, self.item_factors):
            weight_init.xavier_uniform_(table.weight)

        # Keep a reference to the original matrix.
        self.R = R

    def forward(self, user, item):
        """Return one predicted score per (user, item) index pair.

        Args:
            user: Tensor of user indices.
            item: Tensor of item indices, paired element-wise with ``user``.

        Returns:
            The element-wise product of the two embeddings summed over dim 1.
        """
        user_vecs = self.user_factors(user)
        item_vecs = self.item_factors(item)
        return torch.sum(user_vecs * item_vecs, dim=1)

    def complete_matrix(self):
        """Return the product of the user weights and the transposed item weights."""
        return self.user_factors.weight @ self.item_factors.weight.T

In [47]:
# Seed PyTorch's RNG so the random embedding initialisation is the same on
# every Restart & Run All; without a seed each run starts from different weights.
torch.manual_seed(42)
model = MatrixFactorization(ratings, n_users, n_items, n_factors=20)

Batch를 사용하지 않은 Matrix Factorization

In [None]:
# Select the GPU when one is available, otherwise fall back to the CPU.
# NOTE(review): `dev` is not referenced by any cell visible here — the model is
# never moved to it in the cells shown; confirm whether that is intended.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_func = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=5e-3)  # learning rate

In [None]:
# Coordinates of the observed (non-zero) ratings and their values. The values
# are read out of the sparse matrix once here rather than once per training
# step, because a scalar lookup in a sparse matrix is slow and loop-invariant.
rows, cols = ratings.nonzero()
observed_values = np.asarray(ratings[rows, cols]).ravel()

nb_epochs = 10
for epoch in tqdm_notebook(range(nb_epochs)):
    train_loss = 0
    # One SGD step per observed rating (no mini-batching). The loop uses its own
    # variable names so the global `rating` array from the data cells is not
    # overwritten by a tensor.
    for user_i, item_i, value in zip(rows, cols, observed_values):
        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Wrap the single sample as 1-element tensors.
        target = torch.tensor([float(value)], dtype=torch.float32)
        user_t = torch.tensor([int(user_i)], dtype=torch.long)
        item_t = torch.tensor([int(item_i)], dtype=torch.long)

        # Predict and measure the squared error.
        prediction = model(user_t, item_t)
        loss = loss_func(prediction, target)
        train_loss += loss.item()

        # Back-propagate.
        loss.backward()

        # Update the parameters.
        optimizer.step()

    # The former `model.cost()` call was removed: MatrixFactorization defines no
    # such method, and the result was never used.
    # max(..., 1) avoids ZeroDivisionError when the matrix has no observed ratings.
    print('Epoch {:4d}/{} Loss: {:.6f}'.format(epoch+1, nb_epochs, train_loss/max(len(rows), 1)))

## Recommend 

In [None]:
# Build up to 100 recommendations per user, keyed by the ORIGINAL userId.
idx2rec = {}

# Movies each user has already rated (dense movie indices), keyed by dense user
# index. Built once so the filter below is a set lookup, not a DataFrame scan.
seen_by_user = movie.groupby('useridx')['movieidx'].apply(set).to_dict()

# Inference only: no gradients are needed, so the scores can go straight to NumPy.
with torch.no_grad():
    user_weights = model.user_factors.weight
    item_weights = model.item_factors.weight
    # Iterate over (dense index, original id) pairs so the two id spaces are
    # never mixed up: dense indices address the embeddings and the
    # `useridx`/`movieidx` columns, original ids are used only for the output.
    for user_idx, user_id in idx2user.items():
        scores = torch.matmul(user_weights[user_idx], item_weights.T).cpu().numpy()
        # Top 200 candidates by descending predicted score.
        candidates = np.argsort(-scores)[0:200]
        # Drop movies the user has already rated, then keep the best 100.
        already_seen = seen_by_user.get(user_idx, set())
        item_rec = [idx2movie[int(m)] for m in candidates if int(m) not in already_seen][0:100]
        idx2rec[user_id] = item_rec

In [None]:
# `idx2rec` is keyed by the original userId, so dense index 0 is translated
# back to its userId before the lookup.
idx2rec[idx2user[0]]