In [1]:
import pickle
import pandas as pd
import numpy as np
import os, sys, gc 
from plotnine import *
import plotnine

from tqdm.notebook import tqdm as tqdm_notebook
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl
from matplotlib import rc
import re
from matplotlib.ticker import PercentFormatter
import datetime
from math import log # IDF 계산을 위해

In [7]:
from torch.utils.data import Dataset, DataLoader

class MovieLenseDataset(Dataset):
    """ MovieLense dataset."""
    # Initialize your data, download, etc.
    def __init__(self):
        self.movie = pd.read_csv("ratings.csv")
        self.user2idx = {}
        for i, l in enumerate(self.movie['userId'].unique()):
            self.user2idx[l] = i

        self.movie2idx = {}
        for i, l in enumerate(movie['movieId'].unique()):
            self.movie2idx[l] = i
        
        useridx = movie['useridx'] = movie['userId'].apply(lambda x: self.user2idx[x]).values
        movieidx = movie['movieidx'] = movie['movieId'].apply(lambda x: self.movie2idx[x]).values
        rating = movie['rating'].values
        
        self.idx2user = {i: user for user, i in self.user2idx.items()}
        self.idx2movie = {i: item for item, i in self.movie2idx.items()}
        
        i = torch.LongTensor([useridx, movieidx])
        v = torch.FloatTensor(rating)
        self.x = torch.sparse.FloatTensor(i, v, torch.Size([len(self.user2idx),len(self.movie2idx)])).to_dense()
        
    def __getitem__(self, index):
        return self.x[index]

    def __len__(self):
        return len(self.x)

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as weight_init
from torch.autograd import Variable

def MSEloss(inputs, targets, size_average=False):
    mask = targets != 0
    num_ratings = torch.sum(mask.float())
    criterion = nn.MSELoss(reduction='sum' if not size_average else 'mean')
    return criterion(inputs * mask.float(), targets), Variable(torch.Tensor([1.0])) if size_average else num_ratings

# 참고 코드 : https://github.com/NVIDIA/DeepRecommender/blob/master/reco_encoder/model/model.py
class AutoEncoder(nn.Module):
    """
    Describes an AutoEncoder model
    :param self.layer_sizes: Encoder network description. Should start with feature size (e.g. dimensionality of x).
    For example: [10000, 1024, 512] will result in:
      - encoder 2 layers: 10000x1024 and 1024x512. Representation layer (z) will be 512
      - decoder 2 layers: 512x1024 and 1024x10000.
    :param dp_drop_prob: (default: 0.0) Dropout drop probability
    """
    def __init__(self, layer_sizes, drop_prop=0):
        super().__init__() # 부모 클래스(torch.nn.Module)의 init을 불러옴 
        print("AutoEncoder")
        self.drop_prop = nn.Dropout(drop_prop)
        self.layer_sizes = layer_sizes
        
        # encode_weight : ex) [10000, 1024], [1024, 512]
        self.encode_w = nn.ParameterList([nn.Parameter(torch.rand(self.layer_sizes[i+1], self.layer_sizes[i])) for i in range(len(self.layer_sizes)-1)])
        # weight initializer 
        for ind, w in enumerate(self.encode_w): 
            weight_init.xavier_uniform_(w)
        
        # decoder_layers
        reversed_enc_layers = list(reversed(self.layer_sizes))
        self.decode_w = nn.ParameterList([nn.Parameter(torch.rand(reversed_enc_layers[i+1], reversed_enc_layers[i])) for i in range(len(reversed_enc_layers)-1)])
         # weight initializer 
        for ind, w in enumerate(self.decode_w): 
            weight_init.xavier_uniform_(w)       
        
        # encoder_bias 
        self.encode_b = nn.ParameterList([nn.Parameter(torch.zeros(self.layer_sizes[i+1])) for i in range(len(self.layer_sizes)-1)])
        self.decode_b = nn.ParameterList([nn.Parameter(torch.zeros(reversed_enc_layers[i+1])) for i in range(len(reversed_enc_layers) - 1)])
        
    def encode(self, x):
        for ind, w in enumerate(self.encode_w):
            # activation function 
            # 논문에 따르면 Encoder의 모든 계층에 activation function을 적용했음 
            x = F.selu(input=F.linear(input=x, weight=w, bias=self.encode_b[ind]))
        if self.drop_prop.p > 0: x = self.drop_prop(x)
        return x
    
    def decode(self, x):
        for ind, w in enumerate(self.decode_w):
            # activation function
            # 논문에 따르면 Decoder의 마지막 게층은 activation function을 적용하지 않았음 
            if ind != (len(self.layer_sizes)-1):
                x = F.selu(input=F.linear(input=x, weight=w, bias=self.decode_b[ind]))
            else:
                x = F.linear(input=x, weight=w, bias=self.decode_b[ind])
        return x
    
    def forward(self, x):
        return self.decode(self.encode(x))

In [11]:
train_dataloader = DataLoader(MovieLenseDataset(), batch_size=128, shuffle=True)

NameError: name 'movie' is not defined

In [None]:
model = AutoEncoder(layer_sizes=[9066, 64, 32], drop_prop=0.3)
model

In [298]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-5)  # learning rate

In [305]:
nb_epochs = 100
for epoch in tqdm_notebook(range(0, nb_epochs)):
    train_loss = 0
    for train_batch in train_dataloader:
        optimizer.zero_grad()
        
        prediction = model(train_batch)
        loss, num_ratings = MSEloss(train_batch, prediction)
        loss = loss / num_ratings
        
        loss.backward()
        train_loss += loss.item() 
        optimizer.step()
    if epoch % 10 == 0: 
        print('Epoch {:4d}/{} Loss: {:.6f}'.format(epoch+1, nb_epochs, train_loss/len(train_dataloader.dataset)))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Epoch    1/100 Loss: 0.001100
Epoch   11/100 Loss: 0.001130
Epoch   21/100 Loss: 0.001066
Epoch   31/100 Loss: 0.001077
Epoch   41/100 Loss: 0.001143
Epoch   51/100 Loss: 0.001064
Epoch   61/100 Loss: 0.001087
Epoch   71/100 Loss: 0.001064
Epoch   81/100 Loss: 0.001085
Epoch   91/100 Loss: 0.001108



In [315]:
model.decode(model.encode(train_dataloader.dataset[:]))

tensor([[ 0.0940,  0.0376,  0.0511,  ...,  0.0011,  0.0039,  0.0061],
        [ 0.8432,  0.0726,  0.1038,  ..., -0.0033, -0.0042,  0.0053],
        [ 0.0682,  0.0573,  0.0142,  ...,  0.0125,  0.0062,  0.0078],
        ...,
        [ 0.1156, -0.0770,  0.0377,  ...,  0.0060,  0.0028,  0.0054],
        [-0.0059, -0.1544,  0.1159,  ..., -0.0086, -0.0073,  0.0086],
        [ 0.0678,  0.6561, -0.1429,  ...,  0.0305,  0.0072,  0.0028]],
       grad_fn=<EluBackward>)

In [318]:
model.encode().weight

TypeError: encode() missing 1 required positional argument: 'x'

In [311]:
train_dataloader.dataset[:]

tensor([[2.5000, 3.0000, 3.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [304]:
train_dataloader.dataset[:]

tensor([[2.5000, 3.0000, 3.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])