In [None]:
import os
import time
import json
import zipfile
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [63]:
DATASET = 'Yelp2018'
# ORIGIN_PATH = '/work/cywang/workspace/Data/Yelp/'
ORIGIN_PATH = '/data2/fanlu/ReChorus/data/Yelp/'
RAW_PATH = os.path.join('./', DATASET)

RANDOM_SEED = 0
NEG_ITEMS = 1000

# Load Data

1. Load interaction data
2. Retain records since 2018
3. Filter out items with less than 5 interactions
4. Calculate basic statistics

In [64]:
data_path = os.path.join(ORIGIN_PATH, 'yelp_academic_dataset_review.json')
reader = pd.read_json(data_path, lines=True, chunksize=1000000,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int})

In [65]:
useful_df_lst = list()
for data in reader:
    useful_df_lst.append(data[['user_id', 'business_id', 'date']].copy())
    del data
data_df = pd.concat(useful_df_lst)
data_df.head()

Unnamed: 0,user_id,business_id,date
0,OwjRMXRC0KyPrIlcjaXeFQ,-MhfebM0QIsKt87iDN-FNw,2015-04-15 05:21:16
1,nIJD_7ZXHq-FX8byPMOkMQ,lbrU8StCq3yDfr-QMnGrmQ,2013-12-07 03:16:52
2,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,2015-12-05 03:18:11
3,ofKDkJKXSKZXu5xJNGiiBQ,5JxlZaqCnk1MnbgRirs40Q,2011-05-27 05:30:52
4,UgMW8bLE0QMJDCkQ1Ax5Mg,IS4cv902ykd8wj1TR0N3-A,2017-01-14 21:56:57


In [66]:
# Only retain records since 2018

print('Filter before:', len(data_df))
data_df['time'] = data_df['date'].apply(lambda x: time.mktime(time.strptime(x, '%Y-%m-%d %H:%M:%S')))
begin_time = time.mktime(time.strptime('2018-01-01 00:00:00', '%Y-%m-%d %H:%M:%S'))
data_df = data_df[data_df['time'] > begin_time]
data_df = data_df.rename(columns={'business_id': 'item_id'})
print('Filter after:', len(data_df))

Filter before: 8021122
Filter after: 2533890


In [67]:
# Only retain users and items with at least 5 associated interactions

print('Filter before:', len(data_df))
filter_before = -1
while filter_before != len(data_df):
    filter_before = len(data_df)
    for stage in ['user_id', 'item_id']:
        val_cnt = data_df[stage].value_counts()
        cnt_df = pd.DataFrame({stage: val_cnt.index, 'cnt': val_cnt.values})
        data_df = pd.merge(data_df, cnt_df, on=stage, how='left')
        data_df = data_df[data_df['cnt'] >= 5].drop(columns=['cnt'])
print('Filter after:', len(data_df))

Filter before: 2533890
Filter after: 1057181


### Statistics

In [68]:
n_users = data_df['user_id'].value_counts().size
n_items = data_df['item_id'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['time'].min()
max_time = data_df['time'].max()

In [69]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

# Users: 88437
# Items: 45939
# Interactions: 1057181
Time Span: 2017-12-31/2019-12-13


# Build Dataset

### Interaction data

In [70]:
np.random.seed(RANDOM_SEED)

In [71]:
out_df = data_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df.sort_values(by=['time', 'user_id'], kind='mergesort', inplace=True)
out_df = out_df.reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,Z3S7Y6ywAOrWUS-jevfu6Q,zr42_UsWfaIF-rcp37OpwA,1514736000.0
1,Lnvh5QonYyH-zTyCwAovJg,ptWNY_h088kmKhsL-gaOEg,1514736000.0
2,e8gCRa-g0VkblC-LQx0ajg,EvE23d1PSbfGWe7EA5HRBQ,1514736000.0
3,reEismUZQ2DV9EP_XGDQ7w,lmxA0dJM0XsPCIHPXhEQ-g,1514736000.0
4,sCelgwFoaNLMC_A7Y8usCw,2Xix4Iv6gdj3F6fbJX3zgA,1514736000.0


In [72]:
# reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df.head()

Unnamed: 0,user_id,item_id,time
0,49926,45836,1514736000.0
1,31552,38682,1514736000.0
2,58301,11468,1514736000.0
3,77058,35775,1514736000.0
4,77824,2597,1514736000.0


In [73]:
# leave one out spliting

clicked_item_set = dict()
for user_id, seq_df in out_df.groupby('user_id'):
    clicked_item_set[user_id] = set(seq_df['item_id'].values.tolist())
    
def generate_dev_test(data_df):
    result_dfs = []
    n_items = data_df['item_id'].value_counts().size
    for idx in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), NEG_ITEMS))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_item_set[uid]
            for j in range(len(neg_items[i])):
                while neg_items[i][j] in user_clicked:
                    neg_items[i][j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [74]:
leave_df = out_df.groupby('user_id').head(1)
data_df = out_df.drop(leave_df.index)

[test_df, dev_df], data_df = generate_dev_test(data_df)
train_df = pd.concat([leave_df, data_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

(880307, 88437, 88437)

In [75]:
train_df.head()

Unnamed: 0,user_id,item_id,time
0,49926,45836,1514736000.0
1,31552,38682,1514736000.0
2,58301,11468,1514736000.0
3,77058,35775,1514736000.0
4,77824,2597,1514736000.0


In [76]:
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
364,56411,44123,1514760000.0,"[2733, 43568, 42614, 45892, 21244, 30404, 3210..."
840,83096,15884,1514812000.0,"[31070, 39527, 13196, 7763, 36077, 28676, 3293..."
1172,13208,12927,1514827000.0,"[27252, 27582, 35841, 13219, 1011, 9445, 33980..."
2131,14903,4817,1514900000.0,"[14769, 17957, 38366, 41508, 3528, 17345, 1414..."
2415,38501,45769,1514913000.0,"[21274, 12396, 44963, 34174, 6001, 35372, 2726..."


In [77]:
# save results

if not os.path.exists(RAW_PATH):
    subprocess.call('mkdir ' + RAW_PATH, shell=True)
train_df.to_csv(os.path.join(RAW_PATH, 'train.csv'), sep='\t', index=False)
dev_df.to_csv(os.path.join(RAW_PATH, 'dev.csv'), sep='\t', index=False)
test_df.to_csv(os.path.join(RAW_PATH, 'test.csv'), sep='\t', index=False)