In [1]:
# Colab-only setup: attach Google Drive to the VM filesystem so files stored
# in Drive can be read through ordinary paths below the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/data'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/data


In [2]:
import numpy as np 
import pandas as pd 

In [3]:
import easydict
import json

# Notebook-wide settings, kept in an attribute-accessible dict:
#   default_path - directory (on the mounted Drive) that holds the dataset
#   ratings      - full path of the ratings CSV inside that directory
args = easydict.EasyDict({
    'default_path': '/content/data/MyDrive/data engineering/Deep Learning/추천 시스템/data/Movie/ml-1m/',
})
args.ratings = args.default_path + 'ratings.csv'

In [4]:
# Read the ratings CSV named in the config, then print its (rows, columns)
# shape and display the first five rows as a quick sanity check.
ratings = pd.read_csv(args.ratings)

print(f'{ratings.shape}')
ratings.head(n=5)

(100004, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [None]:
import random

class LoadData(object):
  """Load a ratings CSV and prepare it for implicit-feedback experiments.

  Construction runs the whole pipeline:
    1. read the CSV and map raw ``userId`` / ``movieId`` values to contiguous
       zero-based ``uid`` / ``iid`` integers,
    2. binarize every positive rating to 1.0,
    3. sample, per user, items the user never interacted with,
    4. split leave-one-out: each user's most recent interaction becomes the
       test row, everything else is training data.

  Attributes:
    ratings: DataFrame with columns uid, iid, rating, timestamp (plus the
      helper column rank_latest added by the split).
    user_pool: set of all uid values.
    item_pool: set of all iid values.
    negatives: DataFrame with columns uid, negative_iid, negative_samples.
    train_ratings, test_ratings: DataFrames with columns uid, iid, rating.
  """

  def __init__(self, args):
    """Build all derived tables from the CSV at ``args.ratings``.

    Args:
      args: object exposing ``ratings`` (path to the CSV) as an attribute.
        If it also exposes ``seed`` (not None), negative sampling uses a
        private ``random.Random(seed)`` and is reproducible; otherwise the
        global ``random`` module state is used.
    """
    self.ratings = self.__load_ratings(args.ratings)
    self.__binarize()

    self.user_pool = set(self.ratings['uid'].unique())
    self.item_pool = set(self.ratings['iid'].unique())

    seed = getattr(args, 'seed', None)
    rng = random.Random(seed) if seed is not None else None
    self.negatives = self.__sample_negative(rng=rng)
    self.train_ratings, self.test_ratings = self.__split_loop()

  def __load_ratings(self, ratings_path):
    """Read the ratings CSV and replace raw ids with contiguous indices.

    Args:
      ratings_path: path of a CSV with at least the columns
        userId, movieId, rating, timestamp.

    Returns:
      A new DataFrame with columns uid, iid, rating, timestamp, where uid
      and iid are numbered 0..n-1 in order of first appearance.
    """
    df_ratings = pd.read_csv(ratings_path).drop_duplicates()  # drop exact duplicate rows

    # Build raw-id -> contiguous-index lookup tables. The raw ids are not
    # guaranteed to be dense, so they are renumbered before being used as
    # indices.
    df_users = df_ratings[['userId']].drop_duplicates().reset_index(drop=True)
    df_users['uid'] = np.arange(len(df_users))
    df_ratings = pd.merge(df_ratings, df_users, on=['userId'], how='left')

    df_items = df_ratings[['movieId']].drop_duplicates().reset_index(drop=True)
    df_items['iid'] = np.arange(len(df_items))
    df_ratings = pd.merge(df_ratings, df_items, on=['movieId'], how='left')

    # Explicit copy: this frame is mutated later (binarize, rank_latest), so
    # it must not be a view/slice of the merged frame.
    return df_ratings[['uid', 'iid', 'rating', 'timestamp']].copy()

  def __binarize(self):
    """
    binarize into 0 or 1, implicit feedback

    Every rating greater than 0 is overwritten with 1.0 in ``self.ratings``.
    """
    # Single .loc assignment instead of chained indexing, which may write to
    # a temporary copy and leave self.ratings unchanged.
    self.ratings.loc[self.ratings['rating'] > 0, 'rating'] = 1.0

  def __sample_negative(self, num_ng=99, rng=None):
    """
    return all negative items & sampled negative items

    Args:
      num_ng: number of negative items to draw per user.
      rng: optional object with a ``sample(population, k)`` method (e.g. a
        ``random.Random`` instance). Defaults to the global ``random`` module.

    Returns:
      DataFrame with columns uid, negative_iid (set of every item the user
      has not interacted with) and negative_samples (list of ``num_ng``
      items drawn without replacement from negative_iid).

    Raises:
      ValueError: if a user has fewer than ``num_ng`` negative items.
    """
    sampler = random if rng is None else rng
    interact_status = self.ratings.groupby(['uid'])['iid'].apply(set).reset_index().rename(
        columns={'iid':'interacted_iid'}
    )
    interact_status['negative_iid'] = interact_status['interacted_iid'].map(lambda x: self.item_pool - x)
    # random.sample() needs a sequence (sets are rejected on Python >= 3.11);
    # sorting also makes the draw independent of set iteration order.
    interact_status['negative_samples'] = interact_status['negative_iid'].map(
        lambda x: sampler.sample(sorted(x), num_ng)
    )
    return interact_status[['uid', 'negative_iid', 'negative_samples']]

  def __split_loop(self):
    """Leave-one-out split by recency.

    Adds a ``rank_latest`` column to ``self.ratings`` (1 = the user's most
    recent interaction; ties are broken by row order) and uses it to split.

    Returns:
      (train, test) DataFrames with columns uid, iid, rating. ``test`` holds
      the rank-1 row of each user, ``train`` holds all other rows.

    Raises:
      AssertionError: if the set sizes of users in train and test differ,
        i.e. some user has only a single interaction.
    """
    self.ratings['rank_latest'] = self.ratings.groupby(['uid'])['timestamp'].rank(method='first', ascending=False)
    test = self.ratings[self.ratings['rank_latest'] == 1]
    train = self.ratings[self.ratings['rank_latest'] > 1]

    assert train['uid'].nunique() == test['uid'].nunique(), \
        'every user needs at least two interactions for a leave-one-out split'
    return train[['uid', 'iid', 'rating']], test[['uid', 'iid', 'rating']]