In [1]:
import os
import time
import gzip
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Dataset name; also used as the name of the output directory (see RAW_PATH).
DATASET = 'Gowalla'
# Directory holding the raw check-in file. This is a machine-specific absolute
# path: point it at the local copy of the raw data before running.
# Alternative location (disabled):
# ORIGIN_PATH = '/work/cywang/workspace/Data/Gowalla/'
ORIGIN_PATH = '/data2/fanlu/ReChorus/data/Gowalla/'

# Output directory for train/dev/test CSVs, relative to the working directory.
RAW_PATH = os.path.join('./', DATASET)

# Seed for NumPy's global random state (used by the negative sampling).
RANDOM_SEED = 0
# Number of negative item ids sampled for each held-out (dev/test) interaction.
NEG_ITEMS = 1000

# Load Data

1. Load interaction data
2. Filter out users and items with fewer than 5 interactions
3. Calculate basic statistics

In [3]:
# Read the raw tab-separated check-in log (no header row in the file).
data_path = os.path.join(ORIGIN_PATH, 'loc-gowalla_totalCheckins.txt')
data_df = pd.read_csv(data_path, sep='\t', names=['user_id', 'time', 'latitude', 'longitude', 'item_id'])

# The trailing 'Z' marks the timestamps as UTC. Parse them as UTC and convert
# to float epoch seconds; time.mktime would instead interpret them in the
# machine's local timezone, making the values depend on where this is run.
checkin_time = pd.to_datetime(data_df['time'], format='%Y-%m-%dT%H:%M:%SZ', utc=True)
data_df['time'] = (checkin_time - pd.Timestamp('1970-01-01', tz='UTC')) / pd.Timedelta(seconds=1)

# Only the (user, item, time) triple is needed downstream.
data_df = data_df[['user_id', 'item_id', 'time']]
data_df.head()

Unnamed: 0,user_id,item_id,time
0,0,22847,1287504000.0
1,0,420315,1287411000.0
2,0,316637,1287330000.0
3,0,16516,1287315000.0
4,0,5535878,1287226000.0


In [4]:
# Only retain users and items with at least 5 associated interactions.
# Dropping rows for one key can push the other key below the threshold, so
# full user/item passes are repeated until a pass removes nothing.

print('Filter before:', len(data_df))
filter_before = -1
while True:
    filter_before = len(data_df)
    for key in ('user_id', 'item_id'):
        # Two-column frame: the key value and how many rows currently carry it.
        key_counts = data_df[key].value_counts().rename_axis(key).reset_index(name='cnt')
        with_counts = pd.merge(data_df, key_counts, on=key, how='left')
        data_df = with_counts.loc[with_counts['cnt'].ge(5)].drop(columns='cnt')
    if len(data_df) == filter_before:
        break
print('Filter after:', len(data_df))

Filter before: 6442892
Filter after: 4616686


### Statistics

In [5]:
# Basic statistics of the filtered interaction data.
n_users = data_df['user_id'].nunique()
n_items = data_df['item_id'].nunique()
n_clicks = data_df.shape[0]
min_time, max_time = data_df['time'].min(), data_df['time'].max()

In [6]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
# time.gmtime renders the epoch seconds in UTC, exactly like the deprecated
# datetime.utcfromtimestamp it replaces.
print('Time Span: {}/{}'.format(
    time.strftime(time_format, time.gmtime(min_time)),
    time.strftime(time_format, time.gmtime(max_time)))
)

# Users: 76894
# Items: 304443
# Interactions: 4616686
Time Span: 2009-02-03/2010-10-22


# Build Dataset

### Interaction data

In [7]:
# Seed NumPy's global random state so the np.random-based negative sampling
# below is repeatable across runs.
np.random.seed(RANDOM_SEED)

In [8]:
# Drop exact duplicate (user, item, time) rows, then order the interactions
# chronologically (ties broken by user id) and renumber the rows 0..n-1.
out_df = (
    data_df[['user_id', 'item_id', 'time']]
    .drop_duplicates(['user_id', 'item_id', 'time'])
    .sort_values(by=['time', 'user_id'], kind='mergesort')
    .reset_index(drop=True)
)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,138,9056,1233696000.0
1,24,8904,1233786000.0
2,138,8957,1234870000.0
3,138,8956,1234870000.0
4,138,9208,1234954000.0


In [9]:
# reindex (start from 1)
# Raw ids are replaced by their 1-based rank among the sorted unique raw ids.

uids = sorted(out_df['user_id'].unique())
iids = sorted(out_df['item_id'].unique())
user2id = {raw_id: new_id for new_id, raw_id in enumerate(uids, start=1)}
item2id = {raw_id: new_id for new_id, raw_id in enumerate(iids, start=1)}

out_df['user_id'] = out_df['user_id'].map(user2id)
out_df['item_id'] = out_df['item_id'].map(item2id)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,124,42,1233696000.0
1,21,1,1233786000.0
2,124,8,1234870000.0
3,124,7,1234870000.0
4,124,140,1234954000.0


In [10]:
# leave-one-out splitting

# For every user, the set of all item ids they interacted with anywhere in
# out_df; negative sampling must avoid these.
clicked_item_set = {
    uid: set(items.tolist())
    for uid, items in out_df.groupby('user_id')['item_id']
}
    
def generate_dev_test(data_df):
    """Hold out each user's last two rows and attach sampled negative items.

    Two passes are made over ``data_df``. Each pass takes the last row of every
    user (in the frame's current row order), removes those rows from the frame,
    and samples ``NEG_ITEMS`` item ids per held-out row, re-drawing any id that
    is in ``clicked_item_set[user_id]``.

    Relies on the notebook-level names ``NEG_ITEMS`` and ``clicked_item_set``
    and on NumPy's global random state.

    Args:
        data_df: interaction frame with ``user_id`` and ``item_id`` columns.

    Returns:
        ``(result_dfs, remaining_df)``. ``result_dfs[0]`` holds each user's
        last row and ``result_dfs[1]`` the last row of what remained after the
        first pass; both carry an extra ``neg_items`` column (a list of item
        ids per row). ``remaining_df`` is ``data_df`` without held-out rows.

    Raises:
        ValueError: if a user has already interacted with every candidate id,
            in which case rejection sampling could never terminate.
    """
    result_dfs = []
    # Candidates are the ids 1..n_items. NOTE: n_items is the number of
    # distinct items in the frame passed in, so ids above that count are
    # never drawn as negatives.
    n_items = data_df['item_id'].value_counts().size
    for _ in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), NEG_ITEMS))
        # Each `row` is a writable view into neg_items, so fixes land in place.
        for row, uid in zip(neg_items, result_df['user_id'].values):
            user_clicked = clicked_item_set[uid]
            # Without this guard the re-draw loop below would spin forever.
            if (row.size and len(user_clicked) >= n_items
                    and sum(1 for item in user_clicked if 1 <= item <= n_items) >= n_items):
                raise ValueError(
                    'user {} has interacted with all {} candidate items; '
                    'cannot sample negatives'.format(uid, n_items))
            for j in range(row.size):
                # Re-draw until the candidate is an item the user never clicked.
                while row[j] in user_clicked:
                    row[j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [11]:
# Each user's first interaction is set aside and always goes to training,
# so it can never be picked as a dev/test row.
leave_df = out_df.groupby('user_id').head(1)
data_df = out_df.drop(index=leave_df.index)

# The first held-out frame (each user's last row) is the test split, the
# second one is the dev split; whatever remains is training data.
(test_df, dev_df), data_df = generate_dev_test(data_df)
train_df = pd.concat([leave_df, data_df]).sort_index()

tuple(map(len, (train_df, dev_df, test_df)))

(4462389, 76894, 76894)

In [12]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,user_id,item_id,time
0,124,42,1233696000.0
1,21,1,1233786000.0
2,124,8,1234870000.0
3,124,7,1234870000.0
4,124,140,1234954000.0


In [13]:
# Preview the test split; neg_items holds the sampled negative item ids.
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
1040,36986,403,1240076000.0,"[117953, 152316, 304138, 122580, 86294, 211544..."
1188,73487,456,1240477000.0,"[249707, 206335, 204061, 232157, 259138, 11591..."
1443,3940,2198,1241258000.0,"[218367, 44753, 268660, 126573, 297232, 177434..."
2853,14818,4209,1246451000.0,"[290288, 169907, 128978, 138412, 30160, 68381,..."
3449,39529,152,1248288000.0,"[196259, 197062, 265991, 212737, 115437, 20552..."


### Filter dev/test users

In [14]:
# Rebuild the per-user item sets from the training split only.
clicked_item_set = {
    uid: set(items.tolist())
    for uid, items in train_df.groupby('user_id')['item_id']
}

In [21]:
# One flag per test row: True when the target item does not appear among the
# user's items in clicked_item_set.
idx_select = [
    item not in clicked_item_set[user]
    for user, item in zip(test_df['user_id'].values, test_df['item_id'].values)
]

In [22]:
# (number of test rows, number of rows flagged True in idx_select)
len(idx_select), np.sum(idx_select)

(76894, 41733)

In [23]:
# Keep only the test rows flagged True in idx_select (boolean row mask).
filter_test_df = test_df[np.array(idx_select)]

In [24]:
# One flag per dev row: True when the target item does not appear among the
# user's items in clicked_item_set.
idx_select = [
    item not in clicked_item_set[user]
    for user, item in zip(dev_df['user_id'].values, dev_df['item_id'].values)
]

In [25]:
# (number of dev rows, number of rows flagged True in idx_select)
len(idx_select), np.sum(idx_select)

(76894, 42271)

In [26]:
# Keep only the dev rows flagged True in idx_select (boolean row mask).
filter_dev_df = dev_df[np.array(idx_select)]

In [28]:
# save results

# Create the output directory (and any missing parents) without going through
# a shell; exist_ok makes re-running this cell a no-op for the directory.
os.makedirs(RAW_PATH, exist_ok=True)

# Tab-separated files without the index column; dev/test use the filtered frames.
train_df.to_csv(os.path.join(RAW_PATH, 'train.csv'), sep='\t', index=False)
filter_dev_df.to_csv(os.path.join(RAW_PATH, 'dev.csv'), sep='\t', index=False)
filter_test_df.to_csv(os.path.join(RAW_PATH, 'test.csv'), sep='\t', index=False)