In [1]:
import pickle
import pandas as pd
import numpy as np
import os, sys, gc 
from plotnine import *
import plotnine

from tqdm.notebook import tqdm as tqdm_notebook
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl
from matplotlib import rc
import re
from matplotlib.ticker import PercentFormatter
import datetime
from math import log # IDF 계산을 위해

In [2]:
# Load the ratings table; later cells read its 'userId', 'movieId' and 'rating' columns.
movie = pd.read_csv("./ratings.csv")

In [3]:
# Forward lookups: raw id -> contiguous 0-based index, numbered by the
# position of the id in the column's `.unique()` output.
user2idx = {raw_id: pos for pos, raw_id in enumerate(movie['userId'].unique())}
movie2idx = {raw_id: pos for pos, raw_id in enumerate(movie['movieId'].unique())}

# Reverse lookups: contiguous index -> raw id.
idx2user = dict(zip(user2idx.values(), user2idx.keys()))
idx2movie = dict(zip(movie2idx.values(), movie2idx.keys()))

In [4]:
# Translate raw ids into their contiguous indices. Each index array is kept
# both as a standalone NumPy array and as a new column on the frame.
useridx = movie['userId'].map(user2idx).to_numpy()
movie['useridx'] = useridx

movieidx = movie['movieId'].map(movie2idx).to_numpy()
movie['movieidx'] = movieidx

# Rating values, row-aligned with the two index arrays above.
rating = movie['rating'].to_numpy()

In [5]:
import torch
import torch.nn.functional as F
from torch import nn

class MatrixFactorization(nn.Module):
    """Matrix-factorization model scoring a (user, item) pair by a dot product.

    Args:
        R: Observed rating matrix. It must support ``R.nonzero()`` and
            ``R[rows, cols]`` indexing (e.g. a SciPy sparse matrix). It is
            only read by :meth:`cost`.
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_factors: Length of each latent vector.
    """

    def __init__(self, R, n_users, n_items, n_factors=20):
        super().__init__()  # run torch.nn.Module's own initialisation first
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.item_factors = nn.Embedding(n_items, n_factors)
        self.R = R

    def forward(self, user, item):
        """Return one predicted rating per (user, item) index pair.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same length as ``user``.

        Returns:
            Tensor holding the dot product of the paired latent vectors.
        """
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

    def complete_matrix(self):
        """Return the dense ``(n_users, n_items)`` matrix of predicted ratings."""
        # Read this instance's own embeddings; relying on a module-level
        # `model` variable would break any instance bound to another name.
        return torch.matmul(self.user_factors.weight, self.item_factors.weight.T)

    def cost(self):
        """Return the square root of the summed squared error on observed ratings.

        Only the nonzero entries of ``self.R`` are compared with the model's
        predictions. Note the sum is not divided by the number of entries.

        Returns:
            A 0-d float64 NumPy array (``0.0`` when ``R`` has no nonzero entry).
        """
        rows, cols = self.R.nonzero()
        if len(rows) == 0:
            return np.asarray(0.0)

        # Fancy indexing yields a (1, n) matrix for SciPy sparse matrices and
        # a 1-D array for dense input; flatten both to a plain 1-D array.
        observed = np.asarray(self.R[rows, cols], dtype=np.float64).ravel()

        device = self.user_factors.weight.device
        with torch.no_grad():  # evaluation only, no autograd graph needed
            users = torch.as_tensor(rows, dtype=torch.long, device=device)
            items = torch.as_tensor(cols, dtype=torch.long, device=device)
            # Score just the observed pairs instead of materialising the
            # full dense prediction matrix.
            predicted = self.forward(users, items).cpu().double()

        residual = torch.as_tensor(observed, dtype=torch.float64) - predicted
        return torch.sqrt((residual ** 2).sum()).numpy()

In [6]:
# Distinct user and movie counts; the model cell passes these as the sizes of
# the two embedding tables.
n_users, n_items = (movie[column].nunique() for column in ('userId', 'movieId'))

In [7]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.sparse` is loaded on every SciPy version.
import scipy.sparse

# Sparse user x movie rating matrix.
# - Values are read straight from the frame, so this cell does not depend on
#   the module-level `rating` name still holding a NumPy array when re-run.
# - The shape comes from the id -> index maps, whose sizes equal the number of
#   distinct indices, avoiding Python sets built over the full index arrays.
ratings = scipy.sparse.csr_matrix(
    (movie['rating'].to_numpy(), (useridx, movieidx)),
    shape=(len(user2idx), len(movie2idx)),
)

In [8]:
# Build the factorization model over the sparse rating matrix, with 20 latent factors per user and per item.
model = MatrixFactorization(ratings, n_users, n_items, n_factors=20)

In [9]:
# Pick a CUDA device when PyTorch can see one, otherwise fall back to the CPU.
# NOTE(review): `dev` is not referenced by any other visible cell - confirm
# whether the model and batches were meant to be moved onto it.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Mean-squared-error objective, minimised with plain SGD over all parameters.
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=5e-2)  # lr = learning rate

In [None]:
# Coordinates and values of every observed rating, extracted once up front so
# the inner loop does no per-step sparse-matrix lookup.
rows, cols = ratings.nonzero()
observed = np.asarray(ratings[rows, cols]).ravel()

# Build the tensors on whatever device the model's parameters live on.
train_device = next(model.parameters()).device
user_index = torch.as_tensor(rows, dtype=torch.long, device=train_device)
item_index = torch.as_tensor(cols, dtype=torch.long, device=train_device)
targets = torch.as_tensor(observed, dtype=torch.float32, device=train_device)

nb_epochs = 10
for epoch in tqdm_notebook(range(nb_epochs)):
    # One SGD step per observed rating, visited in the order `nonzero()` returned.
    for k in range(len(targets)):
        # Set gradients to zero
        optimizer.zero_grad()

        # Length-1 slices keep the batch dimension the model and loss expect.
        # Nothing here rebinds the module-level `rating` array, so earlier
        # cells that read it can still be re-run.
        prediction = model(user_index[k:k + 1], item_index[k:k + 1])
        loss = loss_func(prediction, targets[k:k + 1])

        # Backpropagate
        loss.backward()

        # Update the parameters
        optimizer.step()
    cost_ = model.cost()
    print('Epoch {:4d}/{} Cost: {:.6f}'.format(epoch+1, nb_epochs, cost_))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))