In [2]:
import time
import datetime
# 시간 표시 함수
def format_time(elapsed):
    # 반올림
    elapsed_rounded = int(round((elapsed)))
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

start_time = time.time()
print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

  Training epoch took: 0:00:00


In [3]:
"""Rotten Tomato dataset"""

import os
import scipy.sparse as sp

import numpy as np
import pandas as pd
import torch as th

import dgl 
from dgl.data.utils import download, extract_archive, get_download_dir
from refex import extract_refex_feature
import utils

Using backend: pytorch


In [4]:
class RottenTomato(object):
    def __init__(self, path, testing=False, 
                 test_ratio=0.1, valid_ratio=0.2):
      
        print("Using RottenTomato dataset split train/test with 20% validation set size...")
        (
            num_user, num_movie, adj_train, train_labels, train_u_indices, train_v_indices,
            val_labels, val_u_indices, val_v_indices, test_labels, test_u_indices, 
            test_v_indices, class_values
        ) = load_official_trainvaltest_split(testing, None, None, 1.0)
            
        self._num_user = num_user
        self._num_movie = num_movie

        # reindex u and v, v nodes start after u
        train_v_indices += self.num_user
        val_v_indices += self.num_user
        test_v_indices += self.num_user

        self.train_rating_pairs = (th.LongTensor(train_u_indices), th.LongTensor(train_v_indices))
        self.valid_rating_pairs = (th.LongTensor(val_u_indices), th.LongTensor(val_v_indices))
        self.test_rating_pairs = (th.LongTensor(test_u_indices), th.LongTensor(test_v_indices))
        self.train_rating_values = th.FloatTensor(train_labels)
        self.valid_rating_values = th.FloatTensor(val_labels)
        self.test_rating_values = th.FloatTensor(test_labels)

        print("\tTrain rating pairs : {}".format(len(train_labels)))
        print("\tValid rating pairs : {}".format(len(val_labels)))
        print("\tTest rating pairs  : {}".format(len(test_labels)))

        # build dgl graph object, which is homogeneous and bidirectional and contains only training edges
        self.train_graph = dgl.graph((th.cat([self.train_rating_pairs[0], self.train_rating_pairs[1]]), 
                                      th.cat([self.train_rating_pairs[1], self.train_rating_pairs[0]])))
        self.train_graph.edata['etype'] = th.cat([self.train_rating_values, self.train_rating_values]).to(th.long)

    @property
    def num_rating(self):
        return self._rating.size

    @property
    def num_user(self):
        return self._num_user

    @property
    def num_movie(self):
        return self._num_movie

In [5]:
import os
import random
import pickle as pkl
import pandas as pd
import numpy as np
import scipy.sparse as sp

# For automatic dataset downloading
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO

In [6]:
def map_data(data):
    """
    Map data to proper indices in case they are not in a continues [0, N) range
    Parameters
    ----------
    data : np.int32 arrays
    Returns
    -------
    mapped_data : np.int32 arrays
    n : length of mapped_data
    """
    uniq = list(set(data))

    id_dict = {old: new for new, old in enumerate(sorted(uniq))}
    data = np.array([id_dict[x] for x in data])
    n = len(uniq)

    return data, id_dict, n

def load_official_trainvaltest_split(testing=False, rating_map=None, post_rating_map=None, ratio=1.0):
    dtypes = {'u_nodes': np.int16, 'v_nodes': np.int16, 'ratings': np.float16}
    
    data_train = pd.read_csv(path + 's_trainset.csv', dtype=dtypes)
    data_test  = pd.read_csv(path + 's_testset.csv', dtype=dtypes)

    data_train.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'rating_0.5':'ratings'}, inplace=True)
    data_test.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'rating_0.5':'ratings'}, inplace=True)
    
    columns = ['u_nodes','v_nodes','ratings','review_score','sentiment','emotion','review_date','origin_rating_0.5','review_content']
#     data_train = pd.read_csv(path + 'trainset_filtered.csv', dtype=dtypes)
#     data_test  = pd.read_csv(path + 'testset_filtered.csv', dtype=dtypes)
        
#     data_train.rename(columns={'user_id':'u_nodes', 'movie_id':'v_nodes', 'rating':'ratings'}, inplace=True)
#     data_test.rename(columns={'user_id':'u_nodes', 'movie_id':'v_nodes', 'rating':'ratings'}, inplace=True)
    
#     columns = ['u_nodes','v_nodes','ratings','review_score','sentiment','emotion','review_date','origin_rating','review_content']
    data_train = data_train[columns]
    data_test  = data_test[columns]
    
    data_array_train = data_train.values.tolist()
    data_array_train = np.array(data_array_train)
    data_array_test = data_test.values.tolist()
    data_array_test = np.array(data_array_test)

    data_array = np.concatenate([data_array_train, data_array_test], axis=0)

    u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
    v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
    ratings = data_array[:, 2].astype(dtypes['ratings'])
    if rating_map is not None:
        for i, x in enumerate(ratings):
            ratings[i] = rating_map[x]

    u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
    v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

    u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int64), v_nodes_ratings.astype(np.int32)
    ratings = ratings.astype(np.float64)

    u_nodes = u_nodes_ratings
    v_nodes = v_nodes_ratings

    neutral_rating = -1  # int(np.ceil(np.float(num_classes)/2.)) - 1

    # assumes that ratings_train contains at least one example of every rating type
    rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

    labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
    labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])

    # 경고무시함
#     for i in range(len(u_nodes)):
#         assert(labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]])

    labels = labels.reshape([-1])

    # number of test and validation edges, see cf-nade code

    num_train = data_array_train.shape[0]
    num_test = data_array_test.shape[0]
    num_val = int(np.ceil(num_train * 0.2))
    num_train = num_train - num_val

    pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])
    idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero])

    # 경고 무시함
#     for i in range(len(ratings)):
#         assert(labels[idx_nonzero[i]] == rating_dict[ratings[i]])

    idx_nonzero_train = idx_nonzero[0:num_train+num_val]
    idx_nonzero_test = idx_nonzero[num_train+num_val:]

    pairs_nonzero_train = pairs_nonzero[0:num_train+num_val]
    pairs_nonzero_test = pairs_nonzero[num_train+num_val:]

    # Internally shuffle training set (before splitting off validation set)
    rand_idx = list(range(len(idx_nonzero_train)))
    np.random.seed(1234)
    np.random.shuffle(rand_idx)
    idx_nonzero_train = idx_nonzero_train[rand_idx]
    pairs_nonzero_train = pairs_nonzero_train[rand_idx]

    idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
    pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test], axis=0)

    val_idx = idx_nonzero[0:num_val]
    train_idx = idx_nonzero[num_val:num_train + num_val]
    test_idx = idx_nonzero[num_train + num_val:]

    assert(len(test_idx) == num_test)

    val_pairs_idx = pairs_nonzero[0:num_val]
    train_pairs_idx = pairs_nonzero[num_val:num_train + num_val]
    test_pairs_idx = pairs_nonzero[num_train + num_val:]

    u_test_idx, v_test_idx = test_pairs_idx.transpose()
    u_val_idx, v_val_idx = val_pairs_idx.transpose()
    u_train_idx, v_train_idx = train_pairs_idx.transpose()

    # create labels
    train_labels = labels[train_idx]
    val_labels = labels[val_idx]
    test_labels = labels[test_idx]

    if testing:
        u_train_idx = np.hstack([u_train_idx, u_val_idx])
        v_train_idx = np.hstack([v_train_idx, v_val_idx])
        train_labels = np.hstack([train_labels, val_labels])
        # for adjacency matrix construction
        train_idx = np.hstack([train_idx, val_idx])
    
    class_values = np.sort(np.unique(ratings))

    # make training adjacency matrix
    rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
    if post_rating_map is None:
        rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
    else:
        rating_mx_train[train_idx] = np.array([post_rating_map[r] for r in class_values[labels[train_idx]]]) + 1.
    rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))
    
    
    return num_users, num_items, rating_mx_train, train_labels, u_train_idx, v_train_idx, \
        val_labels, u_val_idx, v_val_idx, test_labels, u_test_idx, v_test_idx, class_values

# 1. Main()

In [7]:
if __name__ == '__main__':
#     dataset = MovieLens("ml-100k", testing=True)
    path = './raw_data/rotten_tomato/'
    dataset = RottenTomato(path, testing=True)
#     dataset = RottenTomato(path, testing=False, test_ratio=0.1, valid_ratio=0.2)

Using RottenTomato dataset split train/test with 20% validation set size...
	Train rating pairs : 90174
	Valid rating pairs : 18035
	Test rating pairs  : 21435


# 2. load_official_trainvaltest_split 함수 분석

- 매개변수

In [8]:
testing = False
rating_map = None
post_rating_map = None
ratio = 1.0

- Load the data 

In [9]:
path = './raw_data/rotten_tomato/'
os.listdir(path)

['final_rating(sentiment,emotion).csv',
 'movie_info.csv',
 's_testset.csv',
 's_testset_removed.csv',
 's_trainset.csv',
 'testset_filtered.csv',
 'testset_removed.csv',
 'trainset_filtered.csv',
 'user_info.csv']

In [67]:
dtypes = {'u_nodes': np.int64, 'v_nodes': np.int64, 'ratings': np.float64}
data_train = pd.read_csv(path + 'trainset_filtered.csv', dtype=dtypes)
data_test  = pd.read_csv(path + 'testset_filtered.csv', dtype=dtypes)

In [68]:
data_train.rename(columns={'user_id':'u_nodes', 'movie_id':'v_nodes', 'rating':'ratings'}, inplace=True)
data_test.rename(columns={'user_id':'u_nodes', 'movie_id':'v_nodes', 'rating':'ratings'}, inplace=True)

In [69]:
columns = ['u_nodes','v_nodes','ratings','review_score','sentiment','emotion','review_date','review_content']
data_train = data_train[columns]
data_test  = data_test[columns]

In [70]:
data_train.head()

Unnamed: 0,u_nodes,v_nodes,ratings,review_score,sentiment,emotion,review_date,review_content
0,7403,2494,5.0,1.0,3,4,1800-01-01,"A grimly seductive end-of-the-world thriller, ..."
1,6220,970,1.0,0.2,1,2,1800-01-01,"All in all, AIrborne is not bad for what it is..."
2,9296,7594,3.0,0.6,4,2,1800-01-01,It's exciting to see a British horror film wit...
3,7403,8800,3.5,0.7,4,2,1800-01-01,"Coming out from behind Spike Lee's camera, Ern..."
4,2343,11205,1.5,0.25,0,5,1800-01-01,It's the sort of film that can only be watched...


In [71]:
data_array_train = data_train.values.tolist()
data_array_train = np.array(data_array_train)
data_array_test = data_test.values.tolist()
data_array_test = np.array(data_array_test)

print(data_array_train.shape)
print(data_array_test.shape)

(216328, 8)
(28766, 8)


In [72]:
data_array = np.concatenate([data_array_train, data_array_test], axis=0)

In [73]:
data_array.shape

(245094, 8)

In [74]:
u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
ratings = data_array[:, 2].astype(dtypes['ratings'])
if rating_map is not None:
    for i, x in enumerate(ratings):
        ratings[i] = rating_map[x]

In [75]:
# 인덱스를 0번부터 시작하게끔 모든 인덱스를 당기기  ex) 1번 -> 0번 / 456번 -> 455번
u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

In [76]:
u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int32), v_nodes_ratings.astype(np.int32)
ratings = ratings.astype(np.float64)

u_nodes = u_nodes_ratings
v_nodes = v_nodes_ratings

- Sparse matrix (희소행렬 형태의 인접행렬 생성)

In [51]:
neutral_rating = -1  # int(np.ceil(np.float(num_classes)/2.)) - 1

# assumes that ratings_train contains at least one example of every rating type
rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
print(labels.shape)

(1112, 8521)


In [19]:
labels # user,item의 rating matrix를 -1로 초기화

array([[-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1]])

In [20]:
# 메모리가 터지는 부분!!!!!
labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])

In [21]:
# 평점이 제대로 들어갔는지 점검
error_index = list()
for i in range(len(u_nodes)):
    if labels[u_nodes[i], v_nodes[i]] != rating_dict[ratings[i]]:
            error_index.append(i)
    assert(labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]])

In [22]:
error_index

[]

In [23]:
labels = labels.reshape([-1])

# number of test and validation edges, see cf-nade code

num_train = data_array_train.shape[0]
num_test = data_array_test.shape[0]
num_val = int(np.ceil(num_train * 0.2))
num_train = num_train - num_val

In [24]:
print(num_train)
print(num_val)
print(num_test)

106124
26531
21115


In [25]:
len(v_nodes)

153770

In [26]:
pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])
idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero]) # rating index 번호

- trainset/testset 분리

In [27]:
for i in range(len(ratings)):
    assert(labels[idx_nonzero[i]] == rating_dict[ratings[i]])

idx_nonzero_train = idx_nonzero[0:num_train+num_val]
idx_nonzero_test = idx_nonzero[num_train+num_val:]

pairs_nonzero_train = pairs_nonzero[0:num_train+num_val]
pairs_nonzero_test = pairs_nonzero[num_train+num_val:]

In [28]:
# validset 생성 전에 trainset을 섞기
rand_idx = list(range(len(idx_nonzero_train)))
np.random.seed(1234)
np.random.shuffle(rand_idx)
idx_nonzero_train = idx_nonzero_train[rand_idx]
pairs_nonzero_train = pairs_nonzero_train[rand_idx]

idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test], axis=0)

val_idx = idx_nonzero[0:num_val]
train_idx = idx_nonzero[num_val:num_train + num_val]
test_idx = idx_nonzero[num_train + num_val:]

assert(len(test_idx) == num_test)

val_pairs_idx = pairs_nonzero[0:num_val]
train_pairs_idx = pairs_nonzero[num_val:num_train + num_val]
test_pairs_idx = pairs_nonzero[num_train + num_val:]

In [29]:
u_test_idx, v_test_idx = test_pairs_idx.transpose() # 2 x 20000으로 변경
u_val_idx, v_val_idx = val_pairs_idx.transpose()
u_train_idx, v_train_idx = train_pairs_idx.transpose()

# create labels
train_labels = labels[train_idx]
val_labels = labels[val_idx]
test_labels = labels[test_idx]

In [30]:
if testing:
    u_train_idx = np.hstack([u_train_idx, u_val_idx])
    v_train_idx = np.hstack([v_train_idx, v_val_idx])
    train_labels = np.hstack([train_labels, val_labels])
    # for adjacency matrix construction
    train_idx = np.hstack([train_idx, val_idx])

class_values = np.sort(np.unique(ratings))

In [31]:
# trainset의 인접행렬 생성
rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
if post_rating_map is None:
    rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
else:
    rating_mx_train[train_idx] = np.array([post_rating_map[r] for r in class_values[labels[train_idx]]]) + 1.
    
rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))

- Movie Features

In [32]:
# # movie features (genres)
# sep = r'|'
# movie_file = 'raw_data/' + dataset + '/u.item'
# movie_headers = ['movie id', 'movie title', 'release date', 'video release date',
#                  'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
#                  'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
#                  'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
#                  'Thriller', 'War', 'Western']
# movie_df = pd.read_csv(movie_file, sep=sep, header=None, 
#                        names=movie_headers, encoding='ISO-8859-1')

In [33]:
# movie_df.head(1)

In [34]:
# genre_headers = movie_df.columns.values[6:]
# num_genres = genre_headers.shape[0]

# v_features = np.zeros((num_items, num_genres), dtype=np.float32)

In [36]:
# v_features.shape

In [37]:
# # g_vec : movie_id별 장르 벡터
# for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(), movie_df[genre_headers].values.tolist()):
#     # check if movie_id was listed in ratings file and therefore in mapping dictionary
#     if movie_id in v_dict.keys():
#         v_features[v_dict[movie_id], :] = g_vec

- User features

In [38]:
# # user features
# sep = r'|'
# users_file = 'raw_data/' + dataset + '/u.user'
# users_headers = ['user id', 'age', 'gender', 'occupation', 'zip code']
# users_df = pd.read_csv(users_file, sep=sep, header=None,
#                        names=users_headers, engine='python')

# occupation = set(users_df['occupation'].values.tolist())

In [39]:
# age = users_df['age'].values
# age_max = age.max()

# gender_dict = {'M': 0., 'F': 1.}
# occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}

# num_feats = 2 + len(occupation_dict)

In [40]:
# # user feature 유형: age(1), gender(1), occupation(21)   총 23개
# num_feats

In [41]:
# users_df.head()

In [42]:
# u_features = np.zeros((num_users, num_feats), dtype=np.float32)
# for _, row in users_df.iterrows():
#     u_id = row['user id']
#     if u_id in u_dict.keys():
#         # age
#         u_features[u_dict[u_id], 0] = row['age'] / np.float(age_max)
#         # gender
#         u_features[u_dict[u_id], 1] = gender_dict[row['gender']]
#         # occupation
#         u_features[u_dict[u_id], occupation_dict[row['occupation']]] = 1.

In [43]:
# u_features = sp.csr_matrix(u_features)
# v_features = sp.csr_matrix(v_features)

# print("User features shape: "+str(u_features.shape))
# print("Item features shape: "+str(v_features.shape))

# 3. RottenTomato 클래스 분석

In [44]:
# MovieLens("ml-100k", testing=True)

testing = False
test_ratio = 0.1
valid_ratio = 0.2
    

print("Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...")
(
    num_user, num_item, adj_train, 
    train_labels, train_u_indices, train_v_indices,
    val_labels, val_u_indices, val_v_indices, 
    test_labels, test_u_indices, test_v_indices, 
    class_values
) = load_dataset(testing, None, None, 1.0)

_num_user = num_user
_num_movie = num_item

Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...


In [45]:
print(_num_user)
print(_num_movie)

821
6603


In [46]:
# property 할당(동일한 이름으로 사용)
# num_rating = _rating.size  #GCMC에서 사용
num_user = _num_user
num_movie = _num_movie

In [47]:
print(train_v_indices)
print(val_v_indices)
print(test_v_indices)
print(num_user)

[1215 2031 1625 ... 3429 3275 3238]
[ 995 2081 5081 ... 5461 2558 1791]
[5490 4821 6393 ... 6204 4364   58]
821


In [48]:
train_v_indices + num_user

array([2036, 2852, 2446, ..., 4250, 4096, 4059], dtype=int64)

In [49]:
# reindex u and v, v nodes start after u (v노드는 u노드 다음으로 인덱스를 부여함)
train_v_indices += num_user
val_v_indices += num_user
test_v_indices += num_user

train_rating_pairs  = (th.LongTensor(train_u_indices), th.LongTensor(train_v_indices))
valid_rating_pairs  = (th.LongTensor(val_u_indices), th.LongTensor(val_v_indices))
test_rating_pairs   = (th.LongTensor(test_u_indices), th.LongTensor(test_v_indices))
train_rating_values = th.FloatTensor(train_labels)
valid_rating_values = th.FloatTensor(val_labels)
test_rating_values  = th.FloatTensor(test_labels)

print("\tTrain rating pairs : {}".format(len(train_labels)))
print("\tValid rating pairs : {}".format(len(val_labels)))
print("\tTest rating pairs  : {}".format(len(test_labels)))

	Train rating pairs : 89192
	Valid rating pairs : 22298
	Test rating pairs  : 19507


In [50]:
train_rating_pairs

(tensor([280, 354, 675,  ..., 354, 112, 394]),
 tensor([2036, 2852, 2446,  ..., 4250, 4096, 4059]))

In [51]:
# build dgl graph object, which is homogeneous and bidirectional and contains only training edges
train_graph = dgl.graph((th.cat([train_rating_pairs[0], train_rating_pairs[1]]), 
                         th.cat([train_rating_pairs[1], train_rating_pairs[0]])))
train_graph.edata['etype'] = th.cat([train_rating_values, train_rating_values]).to(th.long)

In [52]:
train_graph

Graph(num_nodes=7424, num_edges=178384,
      ndata_schemes={}
      edata_schemes={'etype': Scheme(shape=(), dtype=torch.int64)})