In [1]:
import time
import datetime
# Elapsed-time formatting helper
def format_time(elapsed):
    """Render a duration in seconds as the string form of a ``timedelta``.

    Parameters
    ----------
    elapsed : float or int
        Duration in seconds.

    Returns
    -------
    str
        ``str(datetime.timedelta(...))`` of the duration rounded to whole
        seconds, e.g. ``'1:01:01'``.
    """
    # Round to the nearest whole second first so no fractional part is printed.
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# Demo of format_time: print the time elapsed since start_time was taken.
start_time = time.time()
print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

  Training epoch took: 0:00:00


In [4]:
"""Rotten Tomato dataset"""

import os
import scipy.sparse as sp

import numpy as np
import pandas as pd
import torch as th

import dgl 
from dgl.data.utils import download, extract_archive, get_download_dir
import utils

In [61]:
class RottenTomato(object):
    """Rating edges of a user-item review dataset plus a DGL training graph.

    The split is produced by ``load_official_trainvaltest_split``. Users keep
    their 0-based indices; item ("movie") indices are shifted by ``num_user``
    so that both node kinds share one index space in the graph.

    Parameters
    ----------
    data_type : str
        Dataset selector forwarded to the loader.
    label_type : str
        Label-column selector forwarded to the loader.
    path : str
        Path prefix forwarded to the loader.
    testing : bool
        Forwarded to the loader.
    test_ratio, valid_ratio : float
        Accepted for interface compatibility; not used by this class.

    Attributes
    ----------
    train_rating_pairs, valid_rating_pairs, test_rating_pairs : tuple of LongTensor
        ``(user_indices, shifted_item_indices)`` for each split.
    train_rating_values, valid_rating_values, test_rating_values : FloatTensor
        Label of every edge in the corresponding split.
    train_graph : dgl graph
        Contains every training edge in both directions; ``edata['etype']``
        holds the edge label as an integer.
    """

    def __init__(self, data_type, label_type, path, testing=False,
                 test_ratio=0.1, valid_ratio=0.2):

        print(f"Data_type: {data_type}")
        print(f"Label_type: {label_type}")
        (
            num_user, num_movie, adj_train,
            train_labels, train_u_indices, train_v_indices,
            val_labels, val_u_indices, val_v_indices,
            test_labels, test_u_indices, test_v_indices,
            class_values
        ) = load_official_trainvaltest_split(data_type, label_type, path, testing, None, None, 1.0)

        self._num_user = num_user
        self._num_movie = num_movie
        # Distinct rating values; this is what the num_rating property reads.
        # (Previously never assigned, so num_rating raised AttributeError.)
        self._rating = class_values

        # reindex u and v, v nodes start after u
        # Done out of place so the arrays returned by the loader are not mutated.
        train_v_indices = train_v_indices + self.num_user
        val_v_indices = val_v_indices + self.num_user
        test_v_indices = test_v_indices + self.num_user

        self.train_rating_pairs = (th.LongTensor(train_u_indices), th.LongTensor(train_v_indices))
        self.valid_rating_pairs = (th.LongTensor(val_u_indices), th.LongTensor(val_v_indices))
        self.test_rating_pairs = (th.LongTensor(test_u_indices), th.LongTensor(test_v_indices))
        self.train_rating_values = th.FloatTensor(train_labels)
        self.valid_rating_values = th.FloatTensor(val_labels)
        self.test_rating_values = th.FloatTensor(test_labels)

        print("\tTrain rating pairs : {}".format(len(train_labels)))
        print("\tValid rating pairs : {}".format(len(val_labels)))
        print("\tTest rating pairs  : {}".format(len(test_labels)))

        # build dgl graph object, which is homogeneous and bidirectional and contains only training edges
        # Sources are [users, items] and destinations [items, users], i.e. each
        # training edge appears once per direction.
        self.train_graph = dgl.graph((th.cat([self.train_rating_pairs[0], self.train_rating_pairs[1]]),
                                      th.cat([self.train_rating_pairs[1], self.train_rating_pairs[0]])))
        # Labels are repeated in the same order so both directions carry the same etype.
        self.train_graph.edata['etype'] = th.cat([self.train_rating_values, self.train_rating_values]).to(th.long)

    @property
    def num_rating(self):
        """Number of distinct rating values (size of the stored class_values array)."""
        return self._rating.size

    @property
    def num_user(self):
        """Number of user nodes reported by the loader."""
        return self._num_user

    @property
    def num_movie(self):
        """Number of item ("movie") nodes reported by the loader."""
        return self._num_movie

In [7]:
import os
import random
import pickle as pkl
import pandas as pd
import numpy as np
import scipy.sparse as sp

# For automatic dataset downloading
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO

In [63]:
def map_data(data):
    """
    Re-index arbitrary ids to a contiguous [0, N) range.

    Each id is replaced by its rank among the sorted distinct ids.

    Parameters
    ----------
    data : iterable of hashable, sortable ids (e.g. a 1-D integer array)

    Returns
    -------
    mapped_data : np.ndarray with the re-indexed ids, in input order
    id_dict : dict mapping every original id to its new index
    n : number of distinct ids
    """
    distinct_ids = sorted(set(data))
    id_dict = dict(zip(distinct_ids, range(len(distinct_ids))))
    remapped = np.array([id_dict[value] for value in data])

    return remapped, id_dict, len(distinct_ids)

def load_official_trainvaltest_split(data_type, label_type, path, testing=False, rating_map=None, post_rating_map=None, ratio=1.0):
    """
    Load the pre-split train/test CSVs of a dataset and derive train/val/test edges.

    The rows of the training CSV are shuffled with a fixed seed (1234) and the
    first 20% (rounded up) become the validation set; the test CSV is used
    as-is. User and item ids are re-indexed to contiguous 0-based ranges over
    train and test together, and every rating is replaced by its index among
    the sorted distinct rating values.

    Labels are kept per edge. If the same (user, item) pair occurs more than
    once, each occurrence keeps its own label, so a test row can no longer
    overwrite the label of a training/validation row for that pair.

    Parameters
    ----------
    data_type : str
        'rotten' or 'amazon'; selects ``<data_type>_trainset.csv`` and
        ``<data_type>_testset.csv``.
    label_type : str
        'rating', 'sentiment' or 'emotion'; selects the CSV column used as label.
    path : str
        Prefix prepended verbatim to the CSV file names (no separator is added).
    testing : bool
        If True, the validation edges are appended to the training edges,
        training labels and training adjacency matrix. The validation outputs
        are still returned unchanged.
    rating_map : dict or None
        Optional mapping applied to every raw rating before class indices are built.
    post_rating_map : dict or None
        Optional mapping from rating value to the value stored (plus one) in
        the training adjacency matrix.
    ratio : float
        Unused; kept for signature compatibility.

    Returns
    -------
    tuple
        ``(num_users, num_items, rating_mx_train, train_labels, u_train_idx,
        v_train_idx, val_labels, u_val_idx, v_val_idx, test_labels, u_test_idx,
        v_test_idx, class_values)`` where ``rating_mx_train`` is a
        ``scipy.sparse.csr_matrix`` of shape ``(num_users, num_items)`` and
        ``class_values`` holds the sorted distinct rating values.

    Raises
    ------
    ValueError
        If ``data_type`` or ``label_type`` is not one of the supported values.

    Notes
    -----
    Calls ``np.random.seed(1234)``, i.e. it re-seeds NumPy's global RNG.
    """
    # CSV column that holds the label, per dataset and label type.
    label_columns = {
        'rotten': {'rating': 'rating_0.5', 'sentiment': 'sentiment', 'emotion': 'emotion'},
        'amazon': {'rating': 'rating', 'sentiment': 'sentiment', 'emotion': 'emotion'},
    }
    # dtype the rating column is first cast to, per dataset.
    rating_dtypes = {'rotten': np.float64, 'amazon': np.int64}

    if data_type not in label_columns:
        raise ValueError("Unknown data_type {!r}; expected one of {}".format(
            data_type, sorted(label_columns)))
    if label_type not in label_columns[data_type]:
        raise ValueError("Unknown label_type {!r}; expected one of {}".format(
            label_type, sorted(label_columns[data_type])))

    dtypes = {'u_nodes': np.int64, 'v_nodes': np.int64, 'ratings': rating_dtypes[data_type]}

    # Load the data
    data_train = pd.read_csv(path + data_type + '_trainset.csv', dtype=dtypes)
    data_test = pd.read_csv(path + data_type + '_testset.csv', dtype=dtypes)

    # Bring both frames to the common column names and keep only those columns.
    renames = {'user_id': 'u_nodes', 'movie_id': 'v_nodes',
               label_columns[data_type][label_type]: 'ratings'}
    columns = ['u_nodes', 'v_nodes', 'ratings']
    data_array_train = data_train.rename(columns=renames)[columns].to_numpy()
    data_array_test = data_test.rename(columns=renames)[columns].to_numpy()

    # Training rows first, test rows last; the split below relies on this order.
    data_array = np.concatenate([data_array_train, data_array_test], axis=0)

    u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
    v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
    ratings = data_array[:, 2].astype(dtypes['ratings'])
    if rating_map is not None:
        for i, x in enumerate(ratings):
            ratings[i] = rating_map[x]

    # Re-index users and items to contiguous 0-based ranges.
    u_nodes, _, num_users = map_data(u_nodes_ratings)
    v_nodes, _, num_items = map_data(v_nodes_ratings)
    u_nodes = u_nodes.astype(np.int64)
    v_nodes = v_nodes.astype(np.int64)
    ratings = ratings.astype(np.float64)

    # class_values: sorted distinct ratings; edge_labels: class index of every edge.
    # assumes that ratings_train contains at least one example of every rating type
    class_values, edge_labels = np.unique(ratings, return_inverse=True)
    edge_labels = edge_labels.astype(np.int32)

    # number of test and validation edges, see cf-nade code
    num_trainval = data_array_train.shape[0]
    num_test = data_array_test.shape[0]
    num_val = int(np.ceil(num_trainval * 0.2))

    pairs_nonzero = np.stack([u_nodes, v_nodes], axis=1)
    # Position of every (u, v) pair in a row-major (num_users, num_items) matrix.
    idx_nonzero = u_nodes * num_items + v_nodes

    # Internally shuffle training set (before splitting off validation set).
    # Test rows keep their original order after the shuffled training rows.
    rand_idx = list(range(num_trainval))
    np.random.seed(1234)
    np.random.shuffle(rand_idx)
    order = np.concatenate([
        np.asarray(rand_idx, dtype=np.int64),
        np.arange(num_trainval, num_trainval + num_test, dtype=np.int64),
    ])
    pairs_nonzero = pairs_nonzero[order]
    idx_nonzero = idx_nonzero[order]
    edge_labels = edge_labels[order]

    val_idx = idx_nonzero[:num_val]
    train_idx = idx_nonzero[num_val:num_trainval]
    test_idx = idx_nonzero[num_trainval:]

    assert len(test_idx) == num_test

    # Copies, so callers may modify one split's indices without touching another.
    u_val_idx, v_val_idx = pairs_nonzero[:num_val].T.copy()
    u_train_idx, v_train_idx = pairs_nonzero[num_val:num_trainval].T.copy()
    u_test_idx, v_test_idx = pairs_nonzero[num_trainval:].T.copy()

    # create labels
    val_labels = edge_labels[:num_val]
    train_labels = edge_labels[num_val:num_trainval]
    test_labels = edge_labels[num_trainval:]

    if testing:
        u_train_idx = np.hstack([u_train_idx, u_val_idx])
        v_train_idx = np.hstack([v_train_idx, v_val_idx])
        train_labels = np.hstack([train_labels, val_labels])
        # for adjacency matrix construction
        train_idx = np.hstack([train_idx, val_idx])

    # make training adjacency matrix
    # Stored value is label + 1, so the initial 0 is never a stored label value.
    rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
    if post_rating_map is None:
        rating_mx_train[train_idx] = train_labels.astype(np.float32) + 1.
    else:
        rating_mx_train[train_idx] = np.array([post_rating_map[r] for r in class_values[train_labels]]) + 1.
    rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))

    return num_users, num_items, rating_mx_train, train_labels, u_train_idx, v_train_idx, \
        val_labels, u_val_idx, v_val_idx, test_labels, u_test_idx, v_test_idx, class_values

# 1. Main()

In [85]:
# Path prefix and dataset selector passed to RottenTomato in the cells below.
path = './raw_data/rotten_tomato/'
data_type = 'amazon'

In [86]:
# Build the dataset using the 'rating' label column.
# testing=True makes the loader merge the validation edges into the training set.
label_type = 'rating'
dataset = RottenTomato(data_type, label_type, path, testing=True)

Label_type: rating
	Train rating pairs : 216328
	Valid rating pairs : 43266
	Test rating pairs  : 28766


In [87]:
# Build the dataset using the 'sentiment' label column.
# testing=True makes the loader merge the validation edges into the training set.
label_type = 'sentiment'
dataset = RottenTomato(data_type, label_type, path, testing=True)

Label_type: sentiment
	Train rating pairs : 216328
	Valid rating pairs : 43266
	Test rating pairs  : 28766


In [88]:
# Build the dataset using the 'emotion' label column.
# testing=True makes the loader merge the validation edges into the training set.
label_type = 'emotion'
dataset = RottenTomato(data_type, label_type, path, testing=True)

Label_type: emotion
	Train rating pairs : 216328
	Valid rating pairs : 43266
	Test rating pairs  : 28766


# 2. load_official_trainvaltest_split 함수 분석

- 매개변수

In [35]:
# Arguments for the step-by-step walkthrough of load_official_trainvaltest_split.
# The commented-out lines are the alternative values.
data_type = 'amazon'
# data_type = 'rotten'

label_type = 'emotion'
# label_type = 'sentiment'

testing = False
rating_map = None
post_rating_map = None
ratio = 1.0

- Load the data 

In [36]:
# Path prefix used when reading the CSV files below; list the directory to see what it contains.
path = './raw_data/'
os.listdir(path)

['amazon',
 'amazon_testset.csv',
 'amazon_trainset.csv',
 'ml-100k',
 'model',
 'rotten_testset.csv',
 'rotten_tomato',
 'rotten_trainset.csv']

In [38]:
# Read the train/test CSVs of the selected dataset.
# NOTE(review): the dtype keys are the post-rename column names; presumably they
# only take effect if the CSVs already use those names — confirm.
if data_type=='rotten':
    dtypes = {'u_nodes': np.int64, 'v_nodes': np.int64, 'ratings': np.float64}
    data_train = pd.read_csv(path + 'rotten_trainset.csv', dtype=dtypes)
    data_test  = pd.read_csv(path + 'rotten_testset.csv', dtype=dtypes)
elif data_type=='amazon':
    dtypes = {'u_nodes': np.int64, 'v_nodes': np.int64, 'ratings': np.int64}
    data_train = pd.read_csv(path + 'amazon_trainset.csv', dtype=dtypes)
    data_test  = pd.read_csv(path + 'amazon_testset.csv', dtype=dtypes)

In [39]:
# Rename the id columns and the selected label column to the common names
# u_nodes / v_nodes / ratings. The rename is in place, so both frames are modified.
# Only the label column name differs between branches ('rating_0.5' for rotten
# ratings, 'rating' for amazon ratings, otherwise the label_type itself).
if data_type=='rotten':
    if label_type=='rating':
        data_train.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'rating_0.5':'ratings'}, inplace=True)
        data_test.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'rating_0.5':'ratings'}, inplace=True)
    elif label_type=='sentiment':
        data_train.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'sentiment':'ratings'}, inplace=True)
        data_test.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'sentiment':'ratings'}, inplace=True)
    elif label_type=='emotion':
        data_train.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'emotion':'ratings'}, inplace=True)
        data_test.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'emotion':'ratings'}, inplace=True)

elif data_type=='amazon':
    if label_type=='rating':
        data_train.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'rating':'ratings'}, inplace=True)
        data_test.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'rating':'ratings'}, inplace=True)
    elif label_type=='sentiment':
        data_train.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'sentiment':'ratings'}, inplace=True)
        data_test.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'sentiment':'ratings'}, inplace=True)
    elif label_type=='emotion':
        data_train.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'emotion':'ratings'}, inplace=True)
        data_test.rename(columns={f'user_id':'u_nodes', 'movie_id':'v_nodes', 'emotion':'ratings'}, inplace=True)

In [40]:
# Keep only the three columns used by the rest of the walkthrough.
columns = ['u_nodes','v_nodes','ratings']
data_train = data_train[columns]
data_test  = data_test[columns]

In [41]:
data_train.head()

Unnamed: 0,u_nodes,v_nodes,ratings
0,104555,30037,0
1,80035,13314,2
2,73255,47748,2
3,65251,3302,2
4,39753,40862,2


In [42]:
# Convert both frames to NumPy arrays (via nested Python lists) and show their shapes.
data_array_train = data_train.values.tolist()
data_array_train = np.array(data_array_train)
data_array_test = data_test.values.tolist()
data_array_test = np.array(data_array_test)

print(data_array_train.shape)
print(data_array_test.shape)

(160000, 3)
(35947, 3)


In [43]:
# Training rows first, test rows last; later cells split them again by position.
data_array = np.concatenate([data_array_train, data_array_test], axis=0)

In [44]:
data_array.shape

(195947, 3)

In [45]:
# Split the combined array into user ids, item ids and ratings, cast to the configured dtypes.
u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
ratings = data_array[:, 2].astype(dtypes['ratings'])
# Optionally replace every raw rating by its mapped value.
if rating_map is not None:
    for i, x in enumerate(ratings):
        ratings[i] = rating_map[x]

In [46]:
# Re-index ids to a contiguous 0-based range: each id is replaced by its rank among
# the sorted distinct ids (e.g. for ids that run contiguously from 1: 1 -> 0, 456 -> 455).
u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

In [47]:
# Cast the re-indexed ids to int32 and the ratings to float64.
# NOTE(review): load_official_trainvaltest_split casts the user ids to int64 at this
# step, so this walkthrough cell is not an exact copy of the function.
u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int32), v_nodes_ratings.astype(np.int32)
ratings = ratings.astype(np.float64)

# Shorter aliases used by the following cells.
u_nodes = u_nodes_ratings
v_nodes = v_nodes_ratings

- Sparse matrix (희소행렬 형태의 인접행렬 생성)

In [48]:
# Fill value for (user, item) cells that have no rating.
neutral_rating = -1  # int(np.ceil(np.float(num_classes)/2.)) - 1

# Map every distinct rating value to its index in sorted order.
# assumes that ratings_train contains at least one example of every rating type
rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

# Dense (num_users, num_items) label matrix, initialised to the fill value.
labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
print(labels.shape)

(3659, 33898)


In [49]:
rating_dict

{0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, 4.0: 4, 5.0: 5}

In [50]:
# Write every edge's class index into the dense matrix.
# NOTE: if a (user, item) pair occurs more than once, only one value survives
# (NumPy keeps the last one assigned for a repeated index), so earlier occurrences
# of that pair lose their own rating.
labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])

In [51]:
# Flatten the label matrix; cell (u, v) now sits at position u * num_items + v.
labels = labels.reshape([-1])

# number of test and validation edges, see cf-nade code

# 20% of the rows of the training CSV (rounded up) are held out for validation.
num_train = data_array_train.shape[0]
num_test = data_array_test.shape[0]
num_val = int(np.ceil(num_train * 0.2))
num_train = num_train - num_val

In [52]:
print(num_train)
print(num_val)
print(num_test)

128000
32000
35947


In [53]:
len(v_nodes)

195947

In [54]:
# (user, item) index pair of every edge, in the same order as `ratings`.
pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])
# Position of every pair in the flattened `labels` array.
idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero]) # flat index of each rating

- trainset/testset 분리

In [55]:
# Disabled consistency check between the label matrix and the per-edge ratings.
# for i in range(len(ratings)):
#     assert(labels[idx_nonzero[i]] == rating_dict[ratings[i]])

# The first num_train + num_val entries come from the training CSV, the rest from the test CSV.
idx_nonzero_train = idx_nonzero[0:num_train+num_val]
idx_nonzero_test = idx_nonzero[num_train+num_val:]

pairs_nonzero_train = pairs_nonzero[0:num_train+num_val]
pairs_nonzero_test = pairs_nonzero[num_train+num_val:]

In [56]:
# Shuffle the training set before carving out the validation set.
# The fixed seed makes the permutation reproducible; it also re-seeds NumPy's global RNG.
rand_idx = list(range(len(idx_nonzero_train)))
np.random.seed(1234)
np.random.shuffle(rand_idx)
idx_nonzero_train = idx_nonzero_train[rand_idx]
pairs_nonzero_train = pairs_nonzero_train[rand_idx]

# Concatenate again, then split into train / val / test by position.
idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test], axis=0)

# Validation takes the first num_val shuffled entries, training the next num_train.
val_idx = idx_nonzero[0:num_val]
train_idx = idx_nonzero[num_val:num_train + num_val]
test_idx = idx_nonzero[num_train + num_val:]

val_pairs_idx = pairs_nonzero[0:num_val]
train_pairs_idx = pairs_nonzero[num_val:num_train + num_val]
test_pairs_idx = pairs_nonzero[num_train + num_val:]

In [57]:
# Unpack each array of (user, item) pairs into separate user and item index vectors.
u_test_idx, v_test_idx = test_pairs_idx.transpose() 
u_val_idx, v_val_idx = val_pairs_idx.transpose()
u_train_idx, v_train_idx = train_pairs_idx.transpose()

# create labels
# Looked up in the flattened label matrix by each edge's flat index.
train_labels = labels[train_idx]
val_labels = labels[val_idx]
test_labels = labels[test_idx]

In [58]:
# In testing mode the validation edges are appended to the training edges.
if testing:
    u_train_idx = np.hstack([u_train_idx, u_val_idx])
    v_train_idx = np.hstack([v_train_idx, v_val_idx])
    train_labels = np.hstack([train_labels, val_labels])
    # for adjacency matrix construction
    train_idx = np.hstack([train_idx, val_idx])

# Sorted distinct rating values.
class_values = np.sort(np.unique(ratings))

In [59]:
# Build the adjacency (rating) matrix of the training set.
# Entries start at 0 and training edges store label + 1
# (presumably so that 0 stays reserved for unobserved pairs — confirm).
rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
if post_rating_map is None:
    rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
else:
    # Map class index -> rating value -> post_rating_map value before adding 1.
    rating_mx_train[train_idx] = np.array([post_rating_map[r] for r in class_values[labels[train_idx]]]) + 1.
    
# Reshape to (num_users, num_items) and store as a sparse CSR matrix.
rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))

# 3. RottenTomato 클래스 분석

In [68]:
# Step-by-step version of RottenTomato.__init__, using plain variables instead of attributes.
# MovieLens("ml-100k", testing=True)
data_type, label_type = 'amazon', 'rating'
testing = False
# test_ratio and valid_ratio are not used by any statement in this cell.
test_ratio = 0.1
valid_ratio = 0.2
    
# NOTE(review): the message below mentions MovieLens u1.base/u1.test, but the call
# loads the dataset selected by data_type; the text looks left over — confirm.
# NOTE(review): `path` is whatever an earlier cell last assigned to it.
print("Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...")
(
    num_user, num_item, adj_train, 
    train_labels, train_u_indices, train_v_indices,
    val_labels, val_u_indices, val_v_indices, 
    test_labels, test_u_indices, test_v_indices, 
    class_values
) = load_official_trainvaltest_split(data_type, label_type, path, testing, None, None, 1.0)

_num_user = num_user
_num_movie = num_item

Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...


In [69]:
print(_num_user)
print(_num_movie)

3659
33898


In [70]:
# Stand-ins for the class properties (same names as on the class).
# num_rating = _rating.size  # used in GCMC
num_user = _num_user
num_movie = _num_movie

In [71]:
print(train_v_indices)
print(val_v_indices)
print(test_v_indices)
print(num_user)

[ 4017 25711 16489 ... 17970 15017 27717]
[ 7269  3705 30667 ...  8334 29362 16697]
[14635 26642  3973 ... 33773 14558 29084]
3659


In [72]:
train_v_indices + num_user

array([ 7676, 29370, 20148, ..., 21629, 18676, 31376], dtype=int64)

In [73]:
# reindex u and v, v nodes start after u (v nodes are indexed after the u nodes)
# NOTE: the shift is in place, so re-running this cell shifts the item indices again.
train_v_indices += num_user
val_v_indices += num_user
test_v_indices += num_user

# (user_indices, shifted_item_indices) per split, plus the label of every edge.
train_rating_pairs  = (th.LongTensor(train_u_indices), th.LongTensor(train_v_indices))
valid_rating_pairs  = (th.LongTensor(val_u_indices), th.LongTensor(val_v_indices))
test_rating_pairs   = (th.LongTensor(test_u_indices), th.LongTensor(test_v_indices))
train_rating_values = th.FloatTensor(train_labels)
valid_rating_values = th.FloatTensor(val_labels)
test_rating_values  = th.FloatTensor(test_labels)

print("\tTrain rating pairs : {}".format(len(train_labels)))
print("\tValid rating pairs : {}".format(len(val_labels)))
print("\tTest rating pairs  : {}".format(len(test_labels)))

	Train rating pairs : 128000
	Valid rating pairs : 32000
	Test rating pairs  : 35947


In [74]:
train_rating_pairs

(tensor([2235,  345, 3522,  ..., 1470, 2738, 2813]),
 tensor([ 7676, 29370, 20148,  ..., 21629, 18676, 31376]))

In [75]:
# build dgl graph object, which is homogeneous and bidirectional and contains only training edges
# Sources are [users, items] and destinations [items, users]: each edge appears once per direction.
train_graph = dgl.graph((th.cat([train_rating_pairs[0], train_rating_pairs[1]]), 
                         th.cat([train_rating_pairs[1], train_rating_pairs[0]])))
# Labels are repeated in the same order so both directions of an edge carry the same etype.
train_graph.edata['etype'] = th.cat([train_rating_values, train_rating_values]).to(th.long)

In [76]:
train_graph

Graph(num_nodes=37557, num_edges=256000,
      ndata_schemes={}
      edata_schemes={'etype': Scheme(shape=(), dtype=torch.int64)})

In [77]:
train_rating_values.shape

torch.Size([128000])