In [1]:
import pickle
import pandas as pd
import numpy as np
import os, sys, gc 

from tqdm.notebook import tqdm as tqdm_notebook
import warnings
import re

In [2]:
from torch.utils.data import Dataset, DataLoader

class MovieLenseDataset(Dataset):
    """ MovieLense dataset."""
    # Initialize your data, download, etc.
    def __init__(self):
        # self.movie = pd.read_csv("ratings.csv")
        
        input_movie_path = 'C:/Users/User/Documents/GitHub/Daejeon-Learning-Day/input/nf_prize_dataset.tar/download/training_set/training_set/'
        # os.listdir : 해당 경로에 있는 모든 파일들을 불러오는 명령어 
        file_list = os.listdir(input_movie_path)        
        
        movie_df_list = []
        exclude_file_lst = ['training_set']
        for file in tqdm_notebook(file_list):
            # 예외처리 
            if file in exclude_file_lst:
                continue 
            else:
                file_path = input_movie_path + file
                df_temp = pd.read_csv(file_path, header=None, names=['movieId', 'rating', 'date'])[1:]
                df_temp['userId'] = int(file.split('_')[1].split('.')[0])
                movie_df_list.append(df_temp)

        self.movie = pd.concat(movie_df_list)
        del self.movie['date']
        
        self.user2idx = {}
        for i, l in enumerate(self.movie['userId'].unique()):
            self.user2idx[l] = i

        self.movie2idx = {}
        for i, l in enumerate(self.movie['movieId'].unique()):
            self.movie2idx[l] = i
        
        useridx = self.movie['useridx'] = self.movie['userId'].apply(lambda x: self.user2idx[x]).values
        movieidx = self.movie['movieidx'] = self.movie['movieId'].apply(lambda x: self.movie2idx[x]).values
        rating = self.movie['rating'].values
        
        self.idx2user = {i: user for user, i in self.user2idx.items()}
        self.idx2movie = {i: item for item, i in self.movie2idx.items()}
        
        i = torch.LongTensor([useridx, movieidx])
        v = torch.FloatTensor(rating)
        self.x = torch.sparse.FloatTensor(i, v, torch.Size([len(self.user2idx),len(self.movie2idx)])).to_dense()
        
    def __getitem__(self, index):
        return self.x[index]

    def __len__(self):
        return len(self.x)

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as weight_init
from torch.autograd import Variable
import torchvision

def MSEloss(inputs, targets, size_average=False):
    mask = targets != 0
    num_ratings = torch.sum(mask.float())
    criterion = nn.MSELoss(reduction='sum' if not size_average else 'mean')
    return criterion(inputs * mask.float(), targets), Variable(torch.Tensor([1.0])) if size_average else num_ratings

# 참고 코드 : https://github.com/NVIDIA/DeepRecommender/blob/master/reco_encoder/model/model.py
class AutoEncoder(nn.Module):
    """
    Describes an AutoEncoder model
    :param self.layer_sizes: Encoder network description. Should start with feature size (e.g. dimensionality of x).
    For example: [10000, 1024, 512] will result in:
      - encoder 2 layers: 10000x1024 and 1024x512. Representation layer (z) will be 512
      - decoder 2 layers: 512x1024 and 1024x10000.
    :param dp_drop_prob: (default: 0.0) Dropout drop probability
    """
    def __init__(self, layer_sizes, drop_prop=0):
        super().__init__() # 부모 클래스(torch.nn.Module)의 init을 불러옴 
        self.drop_prop = nn.Dropout(drop_prop)
        self.layer_sizes = layer_sizes
        
        # encode_weight : ex) [10000, 1024], [1024, 512]
        self.encode_w = nn.ParameterList([nn.Parameter(torch.rand(self.layer_sizes[i+1], self.layer_sizes[i])) for i in range(len(self.layer_sizes)-1)])
        # weight initializer 
        for ind, w in enumerate(self.encode_w): 
            weight_init.xavier_uniform_(w)
        
        # decoder_layers
        reversed_enc_layers = list(reversed(self.layer_sizes))
        self.decode_w = nn.ParameterList([nn.Parameter(torch.rand(reversed_enc_layers[i+1], reversed_enc_layers[i])) for i in range(len(reversed_enc_layers)-1)])
         # weight initializer 
        for ind, w in enumerate(self.decode_w): 
            weight_init.xavier_uniform_(w)       
        
        # encoder_bias 
        self.encode_b = nn.ParameterList([nn.Parameter(torch.zeros(self.layer_sizes[i+1])) for i in range(len(self.layer_sizes)-1)])
        self.decode_b = nn.ParameterList([nn.Parameter(torch.zeros(reversed_enc_layers[i+1])) for i in range(len(reversed_enc_layers) - 1)])
        
    def encode(self, x):
        for ind, w in enumerate(self.encode_w):
            # activation function 
            # 논문에 따르면 Encoder의 모든 계층에 activation function을 적용했음 
            x = F.selu(input=F.linear(input=x, weight=w, bias=self.encode_b[ind]))
        if self.drop_prop.p > 0: x = self.drop_prop(x)
        return x
    
    def decode(self, x):
        for ind, w in enumerate(self.decode_w):
            # activation function
            # 논문에 따르면 Decoder의 마지막 게층은 activation function을 적용하지 않았음 
            if ind != (len(self.layer_sizes)-1):
                x = F.selu(input=F.linear(input=x, weight=w, bias=self.decode_b[ind]))
            else:
                x = F.linear(input=x, weight=w, bias=self.decode_b[ind])
        return x
    
    def forward(self, x):
        return self.decode(self.encode(x))

In [8]:
train_dataloader = DataLoader(MovieLenseDataset(), batch_size=512, shuffle=True)

HBox(children=(FloatProgress(value=0.0, max=17770.0), HTML(value='')))




In [9]:
layer_size = len(train_dataloader.dataset[0])
dev = torch.cuda.set_device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

model = AutoEncoder(layer_sizes=[layer_size, 512, 256, 128], drop_prop=0.3).to(dev)
model

AutoEncoder(
  (drop_prop): Dropout(p=0.3, inplace=False)
  (encode_w): ParameterList(
      (0): Parameter containing: [torch.FloatTensor of size 512x480189]
      (1): Parameter containing: [torch.FloatTensor of size 256x512]
      (2): Parameter containing: [torch.FloatTensor of size 128x256]
  )
  (decode_w): ParameterList(
      (0): Parameter containing: [torch.FloatTensor of size 256x128]
      (1): Parameter containing: [torch.FloatTensor of size 512x256]
      (2): Parameter containing: [torch.FloatTensor of size 480189x512]
  )
  (encode_b): ParameterList(
      (0): Parameter containing: [torch.FloatTensor of size 512]
      (1): Parameter containing: [torch.FloatTensor of size 256]
      (2): Parameter containing: [torch.FloatTensor of size 128]
  )
  (decode_b): ParameterList(
      (0): Parameter containing: [torch.FloatTensor of size 256]
      (1): Parameter containing: [torch.FloatTensor of size 512]
      (2): Parameter containing: [torch.FloatTensor of size 480189]
 

In [10]:
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)  # learning rate

In [11]:
nb_epochs = 10
for epoch in tqdm_notebook(range(0, nb_epochs)):
    train_loss = 0
    for train_batch in train_dataloader:
        train_batch = train_batch.to(dev)
        optimizer.zero_grad()
        
        prediction = model(train_batch)
        loss, num_ratings = MSEloss(train_batch, prediction)
        loss = loss / num_ratings
        
        loss.backward()
        train_loss += loss.item() 
        optimizer.step()
    if epoch % 1 == 0: 
        print('Epoch {:4d}/{} Loss: {:.6f}'.format(epoch+1, nb_epochs, train_loss/len(train_dataloader.dataset)))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch    1/10 Loss: 0.000332
Epoch    2/10 Loss: 0.000333
Epoch    3/10 Loss: 0.000332
Epoch    4/10 Loss: 0.000333
Epoch    5/10 Loss: 0.000333
Epoch    6/10 Loss: 0.000333
Epoch    7/10 Loss: 0.000332
Epoch    8/10 Loss: 0.000333
Epoch    9/10 Loss: 0.000333
Epoch   10/10 Loss: 0.000333

