# AutoEncoders Meet Collaborative Filtering

- Collaborative Filtering을 위해 user-item matrix 만들기
- AutoEncoder 모델 구조 정의하기
- Training Deep AutoEncoder 논문은 [저자 코드](https://github.com/NVIDIA/DeepRecommender) 참고

## 논문 종류
- AutoRec
- Training Deep AutoEncoder
- Variational AutoEncoder

## 1. Data Loader 

In [1]:
# Mount Google Drive inside the Colab runtime so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')
# Directory that holds the dataset; read_data() looks for 'rates.csv' inside it.
data_path = '/content/drive/My Drive/data/kmrd/kmr_dataset/datafile/kmrd-small'

Mounted at /content/drive


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
def read_data(data_path, max_rows=10000, test_size=0.2, random_state=1234):
    """Load the ratings file, split it, and build contiguous index maps.

    Args:
        data_path: Directory containing ``rates.csv``. The file must provide
            ``user`` and ``movie`` columns.
        max_rows: Maximum number of data rows to read from the top of the
            file. ``None`` reads the whole file.
        test_size: Fraction (or absolute count) of rows put in the validation
            split; forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.

    Returns:
        A tuple ``(train_df, val_df, user_to_index, movie_to_index)`` where the
        two dicts map each original user / movie id to a 0-based index in
        order of first appearance.
    """
    rates_file = os.path.join(data_path, 'rates.csv')
    # Let the parser stop after `max_rows` rows instead of parsing the whole
    # file and discarding everything past the cap.
    df = pd.read_csv(rates_file, nrows=max_rows)
    train_df, val_df = train_test_split(
        df, test_size=test_size, random_state=random_state, shuffle=True
    )

    # The maps are built from the unsplit frame on purpose: both splits then
    # share one index space, and every id in either split has an entry.
    user_to_index = {original: idx for idx, original in enumerate(df.user.unique())}
    movie_to_index = {original: idx for idx, original in enumerate(df.movie.unique())}

    return train_df, val_df, user_to_index, movie_to_index

In [4]:
class KMRDdataset(Dataset):
    """Dense rating matrix built from a ratings frame, served one row at a time.

    With ``item_based=True`` the matrix is ``(movie_count, user_count)`` and
    each sample is one movie's ratings across all users; otherwise it is
    ``(user_count, movie_count)`` and each sample is one user's ratings across
    all movies. Cells without a rating are 0.

    Args:
        df: Frame with ``user``, ``movie`` and ``rate`` columns.
        user_to_index: Maps every user id occurring in ``df`` to a row/column
            index. A missing id raises ``KeyError``.
        movie_to_index: Maps every movie id occurring in ``df`` to a
            row/column index. A missing id raises ``KeyError``.
        item_based: Selects the matrix orientation described above.
    """

    def __init__(self, df, user_to_index, movie_to_index, item_based=True):
        super().__init__()
        self.min_rating = min(df.rate)
        self.max_rating = max(df.rate)

        # Translate the original user / movie ids into their remapped indices.
        self.user = [user_to_index[u] for u in df.user.values]
        self.movie = [movie_to_index[m] for m in df.movie.values]
        self.rating = df.rate.values

        # Only the orientation differs between the two modes; the matrix is
        # built the same way afterwards.
        if item_based:
            row_index, col_index = self.movie, self.user
            shape = (len(movie_to_index), len(user_to_index))
        else:
            row_index, col_index = self.user, self.movie
            shape = (len(user_to_index), len(movie_to_index))

        # COO layout: indices[0] holds row positions, indices[1] column
        # positions, and values[k] is written at (indices[0][k], indices[1][k]).
        #   e.g. rows = [0, 15, 466], cols = [0, 50, 532], values = [0, 3, 5]
        #        -> the value 3 is placed at row 15, column 50.
        # torch.sparse_coo_tensor replaces the deprecated
        # torch.sparse.FloatTensor constructor used previously.
        # NOTE: when the same (row, col) pair occurs more than once, to_dense()
        # adds the values together.
        indices = torch.tensor([row_index, col_index], dtype=torch.long)
        values = torch.as_tensor(self.rating, dtype=torch.float32)
        self.data = torch.sparse_coo_tensor(indices, values, shape).to_dense()

    def __len__(self):
        """Return the number of rows in the dense matrix."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` of the dense matrix as a 1-D float tensor."""
        return self.data[idx]

In [5]:
# Load the ratings, split them into train/validation frames, and get the
# id -> index maps that both splits share.
train_df, val_df, user_to_index, movie_to_index = read_data(data_path=data_path)

In [6]:
# item_based defaults to True, so both matrices are (movie_count, user_count).
# Both use the same index maps, so the two matrices have identical shapes.
train_dataset = KMRDdataset(train_df, user_to_index, movie_to_index)
val_dataset = KMRDdataset(val_df, user_to_index, movie_to_index)

In [9]:
# Sanity check: frame sizes, the length of one matrix row (one entry per user
# in the item-based layout), and the number of rows in the training matrix.
print(train_df.shape)
print(train_dataset.data[0].size())
print(len(train_dataset))
print(val_df.shape)
print(val_dataset.data[0].size())

(8000, 4)
torch.Size([466])
532
(2000, 4)
torch.Size([466])


In [None]:
# Number of distinct movies in the index map; len() on the dict counts its
# keys directly, so no intermediate list is needed.
print(len(movie_to_index))

In [None]:
# Display the first row of the training matrix (cells without a rating are 0).
train_dataset.data[0]

In [None]:
# Wrap both datasets in DataLoaders: 64 matrix rows per training batch and
# 16 per validation batch, with the row order reshuffled on every pass.
# NOTE(review): the two loaders shuffle independently, so a training batch and
# a validation batch will not contain the same rows; shuffling is also usually
# unnecessary for validation. Confirm the downstream loop does not rely on
# row alignment between them.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)