# データ整形：TransRec

In [1]:
import pandas as pd
import scipy.sparse as sp
import numpy as np
from collections import defaultdict
import copy
import os

In [2]:
# Configuration: dataset to load and the location of its ratings file.
dataset_name = 'MovieLens100k'
filename = '../../data/' + dataset_name + '/ratings.csv'

# header=None: the first line of the file is read as data, and the four
# column names below are assigned explicitly.
column_names = ['user_id', 'item_id', 'rating', 'time']
df = pd.read_csv(
    filename,
    sep=',',
    header=None,
    names=column_names,
    index_col=False,
)

In [3]:
# Show the loaded ratings frame as this cell's output.
df

Unnamed: 0,user_id,item_id,rating,time
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [4]:
# Report how many rating rows were loaded.
print('Dataset size: {}'.format(df.shape[0]))

Dataset size: 100004


以下のデータ数でサンプルを作成する（以下はそれぞれの数字／実際はこれの倍の大きさ）   

- MovieLens20M
    - Train: 14,000,000    
    - Eval: 4,000,000    
    - Test: 2,000,263
- MovieLens100k
    - Train: 70,000
    - Eval: 20,000
    - Test: 10,004

In [5]:
# Positional split of the frame in its loaded row order:
# rows [0, 70000) -> train, [70000, 90000) -> validation, [90000, end) -> test.
# NOTE(review): these slices are taken before the filtering, time
# normalisation and sorting cells further down, and those cells rebind `df`
# to new frames, so the three splits never see those steps - confirm this
# is intended.
train_df, varidation_df, test_df = (
    df.iloc[:70000],
    df.iloc[70000:90000],
    df.iloc[90000:],
)

In [6]:
# Peek at the first rows of the training split.
train_df.head()

Unnamed: 0,user_id,item_id,rating,time
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [7]:
# Peek at the first rows of the validation split.
varidation_df.head()

Unnamed: 0,user_id,item_id,rating,time
70000,483,1580,3.0,1465387435
70001,483,1676,4.0,1465387586
70002,483,1682,3.0,1465387384
70003,483,1732,5.0,1465387406
70004,483,1884,4.0,1465387578


In [8]:
# Peek at the first rows of the test split.
test_df.head()

Unnamed: 0,user_id,item_id,rating,time
90000,597,2668,3.0,940707473
90001,597,2699,4.0,940707412
90002,597,2716,5.0,940708124
90003,597,2720,2.0,940706200
90004,597,2761,5.0,940705799


In [9]:
# Minimum number of ratings a user / an item needs in order to be kept by
# the count-based filtering.
user_min, item_min = 5, 5

In [10]:
# Summary of the frame at this point: distinct users, distinct items, shape.
for label, value in (
    ('\tnum_users = ', len(df['user_id'].unique())),
    ('\tnum_items = ', len(df['item_id'].unique())),
    ('\tdf_shape  = ', df.shape),
):
    print(label + str(value))

	num_users = 671
	num_items = 9066
	df_shape  = (100004, 4)


In [11]:
# Number of rating rows per user id and per item id in the current `df`;
# both results are Series indexed by the id.
user_counts = df.loc[:, 'user_id'].value_counts()
print('Collected user counts...')
item_counts = df.loc[:, 'item_id'].value_counts()
print('Collected item counts...')

Collected user counts...
Collected item counts...


In [12]:
# Filter based on user and item counts.
# A vectorised membership test against the ids whose count reaches the
# threshold replaces a Python lambda that was evaluated once per row.
frequent_users = user_counts.index[user_counts >= user_min]
df = df[df['user_id'].isin(frequent_users)]
print('User filtering done...')

# NOTE(review): item_counts was collected before the user filter ran, so an
# item can be kept here even if fewer than item_min of its ratings remain
# (and users are not re-checked after items are dropped) - confirm that this
# single-pass behaviour is intended.
frequent_items = item_counts.index[item_counts >= item_min]
df = df[df['item_id'].isin(frequent_items)]
print('Item filtering done...')

print('Second pass')
print('\tnum_users = ' + str(len(df['user_id'].unique())))
print('\tnum_items = ' + str(len(df['item_id'].unique())))
print('\tdf_shape  = ' + str(df.shape))

User filtering done...
Item filtering done...
Second pass
	num_users = 671
	num_items = 3496
	df_shape  = (90072, 4)


In [13]:
# Normalize temporal values: subtract the mean timestamp and divide by the
# standard deviation.
print('Normalizing temporal values...')
mean, std = df['time'].mean(), df['time'].std()

# NOTE(review): the two duration constants below divide a number of seconds
# by the *mean* timestamp, whereas 'time' itself is rescaled by the standard
# deviation - confirm which scale the consumers of ONE_YEAR / ONE_DAY expect.
SECONDS_PER_DAY = 60 * 60 * 24
ONE_YEAR = (SECONDS_PER_DAY * 365) / mean
ONE_DAY = SECONDS_PER_DAY / mean

df['time'] = df['time'].sub(mean).div(std)

Normalizing temporal values...


In [14]:
# Order the rows chronologically, then renumber the index from 0.
df = df.sort_values('time').reset_index(drop=True)

## dataを整形

In [15]:
def shaping_data_set(df):
    """Group a ratings frame into per-user item sequences ordered by time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user_id', 'item_id' and 'time'.
        The frame passed in is not modified.

    Returns
    -------
    dict
        Maps every user id that occurs in ``df`` to the list of that user's
        item ids, in ascending 'time' order. Users appear in the dict in the
        order of their earliest row. An empty frame yields an empty dict.
    """
    # Sort chronologically. A stable sort keeps rows with identical
    # timestamps in their incoming order, so the result is deterministic.
    ordered = df.sort_values('time', kind='mergesort').reset_index(drop=True)

    # One pass over the groups. Every user gets an entry: the previous
    # implementation stored the item list outside the user loop, so only the
    # last user was kept. sort=False keeps users in order of first appearance.
    target_data_set = {}
    for user, items in ordered.groupby('user_id', sort=False)['item_id']:
        target_data_set[user] = items.tolist()

    return target_data_set

In [16]:
# Per-user item sequences for the training split.
user_train = shaping_data_set(df=train_df)

In [17]:
# Per-user item sequences for the validation split.
user_validation = shaping_data_set(df=varidation_df)

In [18]:
# Per-user item sequences for the test split.
user_test = shaping_data_set(df=test_df)

In [22]:
# Write the shaped dicts to disk, one file per split.
# NOTE(review): np.save stores a dict as a pickled object array, so readers
# presumably need np.load(..., allow_pickle=True).item() - confirm against
# the consuming code.
for out_name, user_items in (
    ('user_train', user_train),
    ('user_validation', user_validation),
    ('user_test', user_test),
):
    np.save(out_name, user_items)