In [11]:
import pickle
import pandas as pd
import numpy as np
import os, sys, gc 
from plotnine import *
import plotnine

from tqdm.notebook import tqdm as tqdm_notebook
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl
from matplotlib import rc
import re
from matplotlib.ticker import PercentFormatter
import datetime
from math import log # IDF 계산을 위해

In [None]:
from torch.utils.data import Dataset, DataLoader

class MovieLenseDataset(Dataset):
    """ MovieLense dataset."""
    # Initialize your data, download, etc.
    def __init__(self):
        movie = pd.read_csv("./ratings.csv")
        self.user2idx = {}
        for i, l in enumerate(movie['userId'].unique()):
            self.user2idx[l] = i

        self.movie2idx = {}
        for i, l in enumerate(movie['movieId'].unique()):
            self.movie2idx[l] = i

        self.idx2user = {i: user for user, i in user2idx.items()}
        self.idx2movie = {i: item for item, i in movie2idx.items()}
        
    def __getitem__(self, index):
        
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return len(self.movie)

In [43]:
def convert(data):
    new_data = []
    for id_users in range(1, n_users + 1):
        # 5 : useridx, 4 : movieidx
        id_movies = data[:,5][data[:,4] == id_users]
        id_ratings = data[:,2][data[:,4] == id_users]
        ratings = np.zeros(n_items)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data
training_set = convert(np.array(movie, dtype = 'int'))

In [42]:
np.array(movie, dtype = 'int')[:, 'useridx']

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as weight_init
from torch.autograd import Variable

def MSEloss(inputs, targets, size_average=False):
    mask = targets != 0
    num_ratings = torch.sum(mask.float())
    criterion = nn.MSELoss(reduction='sum' if not size_average else 'mean')
    return criterion(inputs * mask.float(), targets), Variable(torch.Tensor([1.0])) if size_average else num_ratings

# 참고 코드 : https://github.com/NVIDIA/DeepRecommender/blob/master/reco_encoder/model/model.py
class AutoEncoder(nn.Module):
    """
    Describes an AutoEncoder model
    :param layer_sizes: Encoder network description. Should start with feature size (e.g. dimensionality of x).
    For example: [10000, 1024, 512] will result in:
      - encoder 2 layers: 10000x1024 and 1024x512. Representation layer (z) will be 512
      - decoder 2 layers: 512x1024 and 1024x10000.
    :param dp_drop_prob: (default: 0.0) Dropout drop probability
    """
    def __init__(self, layer_sizes, drop_prop=0):
        super(AutoEnocoder, self).__init__() # 부모 클래스(torch.nn.Module)의 init을 불러옴 
        self.drop_prop = drop_prop
        # encode_weight : ex) [10000, 1024], [1024, 512]
        self.encode_w = nn.ParameterList([nn.Parameter(torch.rand(layer_sizes[i], layer_sizes[i+1])) for i in range(len(layer_sizes)-1)])
        # weight initializer 
        for ind, w in enumerate(self.encode_w): weight_init.xavier_uniform_(w)
        
        # decoder_layers
        reversed_enc_layers = list(reversed(layer_sizes))

        # encoder_bias 
        self.encode_b = nn.ParameterList([nn.Parameter(torch.zeros(layer_sizes[i])) for i in range(len(layer_sizes)-1)])
        self.decode_b = nn.ParameterList([nn.Parameter(torch.zeros(reversed_enc_layers[i])) for i in range(len(reversed_enc_layers) - 1)])
        
    def encode(self, x):
        for ind, w in enumerate(self.encode_w):
            # activation function 
            # 논문에 따르면 Encoder의 모든 계층에 activation function을 적용했음 
            x = F.selu(input=F.linear(input=x, weight=w, bias=self.encode_b[ind]))
        if self.drop_prob > 0: x = nn.Dropout(x, self.drop_prop)
        return x
    
    def decode(self, x):
        for ind, w in enumerate(self.decode_w):
            # activation function
            # 논문에 따르면 Decoder의 마지막 게층은 activation function을 적용하지 않았음 
            if ind != (len(self.layer_sizes)-1):
                x = F.selu(input=F.linear(input=x, weight=w, bias=self.decode_b[ind]))
            else:
                x = F.linear(input=x, weight=w, bias=self.decode_b[ind])
        return x
    
    def forward(self, x):
        return self.decode(self.encode(x))

In [34]:
import scipy
ratings = scipy.sparse.csr_matrix((rating, (useridx, movieidx)), shape=(len(set(useridx)), len(set(movieidx))))

In [35]:
model = AutoEncoder(layer_sizes=[10000, 1024, 512], drop_prop=0.2)

In [36]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
optimizer = torch.optim.SGD(model.parameters(), lr=5e-2)  # learning rate
loss_func = torch.nn.MSELoss()

In [10]:
rows, cols = ratings.nonzero()

nb_epochs = 10
for epoch in tqdm_notebook(range(nb_epochs)):
    train_loss = 0
    
    for batch_idx, 
    # Set gradients to zero
    optimizer.zero_grad()
    
    # Predict and calculate loss
    prediction = model()
    loss = MSEloss(prediction, rating)
    train_loss += loss.item()

    # Backpropagate
    loss.backward()

    # Update the parameters
    optimizer.step()
print('Epoch {:4d}/{} Loss: {:.6f}'.format(epoch+1, nb_epochs, train_loss))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




KeyboardInterrupt: 