In [21]:
import pandas as pd
import os
from datetime import datetime

In [22]:
DATASET = '4Square_NYC'  # only support "ml-100k" and "ml-1m" now
RAW_PATH = os.path.join('./', DATASET)

RANDOM_SEED = 0
NEG_ITEMS = 99

# Load Data

In [23]:
inters = []
with open('4Square_NYC/dataset_TSMC2014_NYC.txt') as f:
    for line in f.readlines():
        inters.append(line.strip('\n').split('\t'))

In [24]:
df = pd.DataFrame(inters)

In [25]:
data_df = df[[0, 1, 7]]

In [26]:
data_df.columns = ['user_id', 'item_id', 'time']

In [27]:
data_df['time'] = (pd.to_datetime(data_df['time']).dt.tz_localize(None) - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


## Statistics

In [28]:
n_users = data_df['user_id'].value_counts().size
n_items = data_df['item_id'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['time'].min()
max_time = data_df['time'].max()

In [29]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

# Users: 1083
# Items: 38333
# Interactions: 227428
Time Span: 2012-04-03/2013-02-16


# Build Dataset

## Interaction data

In [30]:
np.random.seed(RANDOM_SEED)

In [31]:
out_df = data_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df.sort_values(by=['time', 'user_id'], kind='mergesort', inplace=True)
out_df = out_df.reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,470,49bbd6c0f964a520f4531fe3,1333476009
1,979,4a43c0aef964a520c6a61fe3,1333476025
2,69,4c5cc7b485a1e21e00d35711,1333476144
3,395,4bc7086715a7ef3bef9878da,1333476161
4,87,4cf2c5321d18a143951b5cec,1333476180


In [32]:
# reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df.head()

Unnamed: 0,user_id,item_id,time
0,497,2389,1333476009
1,1061,3922,1333476025
2,740,20329,1333476144
3,413,15115,1333476161
4,940,23551,1333476180


In [33]:
out_df['time_shift'] = out_df.groupby('user_id').time.shift(-1)

In [59]:
count_items = {}
for cat, val in df.groupby(3)[3].count().sort_values(ascending=False).iteritems():
    count_items[cat] = val

In [40]:
items = {}
for cat in df[3].unique():
    items[cat] = item2id[df[df[3]==cat][1].values[0]]

In [34]:
# leave one out spliting

clicked_item_set = dict()
for user_id, seq_df in out_df.groupby('user_id'):
    clicked_item_set[user_id] = set(seq_df['item_id'].values.tolist())
    
def generate_dev_test(data_df):
    result_dfs = []
    n_items = data_df['item_id'].value_counts().size
    for idx in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), NEG_ITEMS))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_item_set[uid]
            for j in range(len(neg_items[i])):
                while neg_items[i][j] in user_clicked:
                    neg_items[i][j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [35]:
leave_df = out_df.groupby('user_id').head(1)
data_df = out_df.drop(leave_df.index)

[test_df, dev_df], data_df = generate_dev_test(data_df)
train_df = pd.concat([leave_df, data_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

(225012, 1083, 1083)

In [36]:
train_df.head()

Unnamed: 0,user_id,item_id,time,time_shift
0,497,2389,1333476009,1333496000.0
1,1061,3922,1333476025,1333498000.0
2,740,20329,1333476144,1333482000.0
3,413,15115,1333476161,1334024000.0
4,940,23551,1333476180,1333476000.0


In [37]:
test_df.head()

Unnamed: 0,user_id,item_id,time,time_shift,neg_items
44223,561,4759,1335908139,,"[2733, 21244, 30404, 32104, 20758, 14936, 1543..."
60855,618,5817,1336765966,,"[23307, 36951, 13161, 26636, 33076, 13393, 143..."
69116,497,2389,1337104813,,"[8390, 895, 30143, 32905, 26814, 25969, 13689,..."
70622,795,23116,1337173212,,"[18492, 14673, 26267, 30333, 20644, 11835, 173..."
74949,1023,23816,1337378381,,"[18505, 15836, 7789, 24405, 26952, 25172, 3811..."


In [38]:
data = pd.concat((train_df, dev_df[['user_id', 'item_id', 'time']], test_df[['user_id', 'item_id', 'time']]))
data.to_csv('4Square_NYC/data.csv', index=None)

In [39]:
# save results

train_df.to_csv(os.path.join(RAW_PATH, 'train.csv'), sep='\t', index=False)
dev_df.to_csv(os.path.join(RAW_PATH, 'dev.csv'), sep='\t', index=False)
test_df.to_csv(os.path.join(RAW_PATH, 'test.csv'), sep='\t', index=False)