In [1]:
import sys

# Make the parent directory importable so that the `src.preprocess` imports
# used by this notebook resolve (presumably '../' is the repository root).
# The membership guard keeps the cell idempotent: re-running it no longer
# appends a duplicate '../' entry to sys.path each time.
if '../' not in sys.path:
    sys.path.append('../')

In [2]:
import os

import pandas as pd

from src.preprocess import filter_items, filter_users
from src.preprocess import add_time_idx

In [3]:
# Preprocessing thresholds used by this notebook.
#   item_min_count -> passed to filter_items
#   user_min_count -> passed to filter_users
#   last_n_items   -> not read by any cell visible in this notebook
CONFIG = dict(
    item_min_count=5,
    user_min_count=20,
    last_n_items=10,
)

In [4]:
# Folder the raw file is read from (it must contain DATASET_NAME).
INPUT_DATA_PATH = '../datasets/'

# Folder the preprocessed CSV is written to.
OUTPUT_DATA_PATH = '../data/'

# File name shared by the raw input and the preprocessed output.
DATASET_NAME = 'ml-20m.csv'

In [5]:
# Read the raw interactions table from INPUT_DATA_PATH.
dataset = pd.read_csv(f'{INPUT_DATA_PATH}{DATASET_NAME}')

# Report (rows, columns), then preview the first five rows.
print(dataset.shape)
dataset.head(n=5)

(19983694, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,0,908,3.5,2004-09-10 03:06:38
1,0,903,3.5,2004-09-10 03:07:01
2,0,2598,3.5,2004-09-10 03:07:30
3,0,1533,3.5,2004-09-10 03:07:36
4,0,1058,4.0,2004-09-10 03:07:45


In [6]:
# Count the distinct values in each column of the raw frame.
dataset.nunique()

user_id        138476
item_id         18342
rating             10
timestamp    15335139
dtype: int64

In [7]:
# Display the raw frame (the rendering is truncated to leading and trailing rows).
dataset

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,908,3.5,2004-09-10 03:06:38
1,0,903,3.5,2004-09-10 03:07:01
2,0,2598,3.5,2004-09-10 03:07:30
3,0,1533,3.5,2004-09-10 03:07:36
4,0,1058,4.0,2004-09-10 03:07:45
...,...,...,...,...
19983689,138475,6395,3.0,2009-12-07 18:18:28
19983690,138475,11365,4.0,2009-12-07 18:18:40
19983691,138475,1248,3.0,2010-01-01 20:42:32
19983692,138475,6848,3.0,2010-01-01 20:42:35


## Preprocess

In [8]:
# Filter items first, then users, using the minimum-count thresholds from CONFIG.
# Both helpers print before/after statistics while they run.
dataset_ = filter_users(
    filter_items(dataset, CONFIG['item_min_count']),
    CONFIG['user_min_count'],
)

Filtering items..
Number of items before 18342
Number of items after 18342
Interactions length before: 19983694
Interactions length after: 19983694
Filtering users..
Number of users before 138476
Number of users after 138476
Interactions length before: 19983694
Interactions length after: 19983694


In [9]:
# Display the frame after the item and user filtering step.
dataset_

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,908,3.5,2004-09-10 03:06:38
1,0,903,3.5,2004-09-10 03:07:01
2,0,2598,3.5,2004-09-10 03:07:30
3,0,1533,3.5,2004-09-10 03:07:36
4,0,1058,4.0,2004-09-10 03:07:45
...,...,...,...,...
19983689,138475,6395,3.0,2009-12-07 18:18:28
19983690,138475,11365,4.0,2009-12-07 18:18:40
19983691,138475,1248,3.0,2010-01-01 20:42:32
19983692,138475,6848,3.0,2010-01-01 20:42:35


In [10]:
%%time
# Run add_time_idx on the filtered frame and print the resulting shape; the
# recorded output shows 6 columns, versus 4 in the frame that was loaded.
# NOTE(review): presumably the two new columns are 'time_idx' and
# 'time_idx_reversed', which a later cell drops again before saving — confirm
# whether this (slow) step is still needed, e.g. for a sort it may perform.
dataset_ = add_time_idx(dataset_)
print(dataset_.shape)

(19983694, 6)
CPU times: user 52.4 s, sys: 3.92 s, total: 56.3 s
Wall time: 56.3 s


In [11]:
# Re-encode user ids as contiguous category codes starting at 0.
dataset_['user_id'] = dataset_['user_id'].astype('category').cat.codes

# Item codes are shifted by one so that 0 stays free: it is used later as padding_idx.
dataset_['item_id'] = dataset_['item_id'].astype('category').cat.codes + 1

In [12]:
# Remove the time-index helper columns so they are not part of the saved frame.
dataset_ = dataset_.drop(columns=['time_idx', 'time_idx_reversed'])

In [13]:
# Display the frame after id re-encoding and removal of the time-index columns.
dataset_

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,908,3.5,2004-09-10 03:06:38
1,0,903,3.5,2004-09-10 03:07:01
2,0,2598,3.5,2004-09-10 03:07:30
3,0,1533,3.5,2004-09-10 03:07:36
4,0,1058,4.0,2004-09-10 03:07:45
...,...,...,...,...
19983689,138475,6395,3.0,2009-12-07 18:18:28
19983690,138475,11365,4.0,2009-12-07 18:18:40
19983691,138475,1248,3.0,2010-01-01 20:42:32
19983692,138475,6848,3.0,2010-01-01 20:42:35


In [14]:
# Count the distinct values in each column of the preprocessed frame.
dataset_.nunique()

user_id        138476
item_id         18342
rating             10
timestamp    15335139
dtype: int64

In [15]:
# Average number of rows (non-null timestamps) per user_id.
(
    dataset_
    .groupby('user_id')['timestamp']
    .count()
    .mean()
)

144.31160634333747

In [16]:
# Create the output folder if it is missing: to_csv does not create parent
# directories, so without this the write fails only after all of the
# preprocessing above has already run.
os.makedirs(OUTPUT_DATA_PATH, exist_ok=True)

# Write the preprocessed interactions without the index column.
dataset_.to_csv(OUTPUT_DATA_PATH + DATASET_NAME, index=False)