In [1]:
import pickle
import pandas as pd
import numpy as np
import os, sys, gc 
from plotnine import *
import plotnine

from tqdm.notebook import tqdm as tqdm_notebook
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl
from matplotlib import rc
import re
from matplotlib.ticker import PercentFormatter
import datetime
from math import log # IDF 계산을 위해

## Data

In [2]:
# Raw ratings table; later cells read its userId, movieId and rating columns.
movie = pd.read_csv(
    "./ratings.csv",
)

In [3]:
# Raw id -> dense 0-based index, numbered in order of first appearance in `movie`.
user2idx = {user_id: pos for pos, user_id in enumerate(movie['userId'].unique())}
movie2idx = {movie_id: pos for pos, movie_id in enumerate(movie['movieId'].unique())}

# Inverse lookups: dense index -> raw id.
idx2user = dict(zip(user2idx.values(), user2idx.keys()))
idx2movie = dict(zip(movie2idx.values(), movie2idx.keys()))

In [4]:
# Translate raw ids to dense indices with a vectorized dictionary lookup
# (Series.map) instead of a per-row Python lambda. Every id is a key of the
# mapping because both dicts were built from these same columns.
# The index columns are also stored on `movie`; later cells select them by name.
movie['useridx'] = movie['userId'].map(user2idx)
movie['movieidx'] = movie['movieId'].map(movie2idx)

# Plain numpy views used to build the sparse rating matrix.
useridx = movie['useridx'].values
movieidx = movie['movieidx'].values
rating = movie['rating'].values

In [5]:
# Number of distinct users / movies; these size the embedding tables.
n_users, n_items = (movie[column].nunique() for column in ('userId', 'movieId'))

In [6]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.sparse` is loaded on older SciPy releases.
import scipy.sparse

# User x item rating matrix in CSR form.
# NOTE: csr_matrix sums the values of duplicate (row, col) pairs, so a user who
# rated the same movie more than once ends up with the sum of those ratings.
ratings = scipy.sparse.csr_matrix(
    (rating, (useridx, movieidx)),
    # np.unique counts the distinct indices without building a Python set.
    shape=(np.unique(useridx).size, np.unique(movieidx).size),
)

## Model
- Batch (o)
- Batch (x)

Batch 를 활용한 Matrix Factorization

In [7]:
import torch  # needed by __getitem__; imported here so this cell is self-contained
from torch.utils.data import Dataset, DataLoader


class MovieLenseDataset(Dataset):
    """Map-style dataset pairing feature rows with their labels.

    Args:
        train: Indexable collection of feature rows (the notebook passes an
            array of ``[useridx, movieidx]`` pairs).
        label: Indexable collection of targets, one per feature row.

    Raises:
        ValueError: If ``train`` and ``label`` differ in length.
    """

    def __init__(self, train, label):
        # Fail fast rather than silently dropping labels or raising an
        # IndexError in the middle of an epoch.
        if len(train) != len(label):
            raise ValueError(
                "train and label must have the same length, got {} and {}".format(
                    len(train), len(label)
                )
            )
        self.x = train
        self.y = label

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` tensors for sample ``idx``."""
        return torch.tensor(self.x[idx]), torch.tensor(self.y[idx])

In [51]:
import torch
import torch.nn.functional as F
from torch import nn
import torch.nn.init as weight_init

class MatrixFactorization(nn.Module):
    """Biased matrix factorization: ``r_hat[u, i] = p_u . q_i + b_u + b_i``.

    Args:
        R: Original rating matrix. It is only stored on the instance as
            ``self.R``; no method of this class reads it.
        n_users: Number of rows in the user embedding tables.
        n_items: Number of rows in the item embedding tables.
        n_factors: Dimension of the latent factor vectors.
    """

    def __init__(self, R, n_users, n_items, n_factors=20):
        super().__init__()  # initialise the parent torch.nn.Module
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.item_factors = nn.Embedding(n_items, n_factors)
        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)

        # Weight initialisation: Xavier-uniform factors, zero biases.
        weight_init.xavier_uniform_(self.user_factors.weight)
        weight_init.xavier_uniform_(self.item_factors.weight)
        weight_init.zeros_(self.user_bias.weight)
        weight_init.zeros_(self.item_bias.weight)

        # Original rating matrix.
        self.R = R

    def forward(self, user, item):
        """Predict ratings for paired user / item indices.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same shape as ``user``.

        Returns:
            Tensor with the same shape as ``user`` holding the predictions.
        """
        # Shapes are derived from the arguments only; the module must not
        # depend on notebook globals such as the current training batch.
        interaction = (self.user_factors(user) * self.item_factors(item)).sum(-1)
        # Bias embeddings carry a trailing dimension of size 1; drop it so the
        # bias lines up with the per-pair dot products.
        bias = (self.user_bias(user) + self.item_bias(item)).squeeze(-1)
        return interaction + bias

    def complete_matrix(self):
        """Return the dense ``(n_users, n_items)`` matrix of all predictions."""
        scores = torch.matmul(self.user_factors.weight, self.item_factors.weight.T)
        # (n_users, 1) and (1, n_items) biases broadcast over the score matrix.
        return scores + self.user_bias.weight + self.item_bias.weight.T

In [47]:
# Each sample is an integer (useridx, movieidx) pair.
train_features = movie[['useridx', 'movieidx']].to_numpy()
# Cast labels to float32 so they match the dtype of the model's predictions;
# float64 targets trigger a dtype mismatch in MSELoss on current PyTorch.
train_labels = movie['rating'].to_numpy(dtype=np.float32)

train_dataloader = DataLoader(
    MovieLenseDataset(train_features, train_labels),
    batch_size=32,
    shuffle=True,
)
model = MatrixFactorization(ratings, n_users, n_items, n_factors=20)

In [121]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the parameters to the target device before the optimizer is built, so
# the model, its inputs and the labels all live on the same device.
model.to(dev)
# NOTE: weight_decay already applies a (small) L2 penalty to every parameter;
# the explicit per-batch term below is added on top of it.
optimizer = torch.optim.SGD(model.parameters(), lr=5e-2, weight_decay=1e-5)  # learning rate
loss_func = torch.nn.MSELoss()

nb_epochs = 10
reg_alpha = torch.tensor(0.05, device=dev)
for epoch in tqdm_notebook(range(0, nb_epochs)):
    model.train()
    train_loss = 0.0
    for train_batch, label_batch in train_dataloader:
        # Column 0 holds user indices, column 1 item indices.
        train_batch = train_batch.to(dev)
        users, items = train_batch[:, 0], train_batch[:, 1]
        # float32 targets match the dtype of the predictions.
        targets = label_batch.to(dev, dtype=torch.float32)

        optimizer.zero_grad()

        prediction = model(users, items)
        loss = loss_func(prediction, targets)

        # Regularization term: squared L2 norm of the embeddings (factors and
        # biases) looked up for this batch, reduced with tensor ops rather
        # than Python's builtin sum over tensor rows.
        l2_reg = torch.zeros((), device=dev)
        for embedding in (model.user_factors, model.user_bias):
            l2_reg = l2_reg + embedding(users).pow(2).sum()
        for embedding in (model.item_factors, model.item_bias):
            l2_reg = l2_reg + embedding(items).pow(2).sum()
        loss = loss + reg_alpha * l2_reg

        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Reported value: summed per-batch losses divided by the number of samples
    # (same normalisation as the previously recorded output).
    print('Epoch {:4d}/{} Loss: {:.6f}'.format(epoch+1, nb_epochs, train_loss/len(train_dataloader.dataset)))

Epoch    1/10 Loss: 0.272459
Epoch    2/10 Loss: 0.225241
Epoch    3/10 Loss: 0.218957
Epoch    4/10 Loss: 0.216221
Epoch    5/10 Loss: 0.214616


KeyboardInterrupt: 

## Recommend 

In [None]:
N_CANDIDATES = 200  # top-scoring items considered per user before filtering
N_RECOMMEND = 100   # maximum number of recommendations kept per user

# Items each user has already rated (dense indices), computed once up front
# instead of re-filtering the whole ratings table for every candidate item.
seen_by_user = movie.groupby('useridx')['movieidx'].apply(set).to_dict()

idx2rec = {}  # raw userId -> list of recommended raw movieIds, best first
with torch.no_grad():
    item_factors = model.item_factors.weight
    item_bias = model.item_bias.weight.squeeze(-1)
    for user_id, u in user2idx.items():
        # Rank by the model's prediction. The user bias is constant for a given
        # user and cannot change the order, but the item bias can, so include it.
        scores = torch.matmul(item_factors, model.user_factors.weight[u]) + item_bias
        # .cpu() keeps this working when the model was trained on a GPU.
        candidates = np.argsort(-scores.cpu().numpy())[:N_CANDIDATES]
        # Items to exclude from the recommendation: those the user already rated.
        seen = seen_by_user.get(u, set())
        idx2rec[user_id] = [
            idx2movie[int(x)] for x in candidates if int(x) not in seen
        ][:N_RECOMMEND]

In [None]:
# idx2rec is keyed by raw userId, so translate dense index 0 (the first user
# seen in the data) to its raw id before looking it up.
idx2rec[idx2user[0]]