In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
# Single source of truth for the seed so both RNGs stay in sync when it is changed.
SEED = 2020
random.seed(SEED)     # Python's `random` module (used for random.choice below)
np.random.seed(SEED)  # NumPy's legacy global RNG (used for np.random.choice below)
# from pandarallel import pandarallel
# pandarallel.initialize(nb_workers=12, shm_size_mb=10000, progress_bar=True)
%matplotlib inline
# Locations of the two input CSV files, relative to the notebook's working directory:
# the rating file and the movie file, in that order.
data_path, movie_path = '../datasets/rating.csv', '../datasets/movie.csv'

In [3]:
# Load both CSV files as-is.
# NOTE: `parse_dates=True` was dropped on purpose. It only asks pandas to parse the
# *index* as dates, and no index column is set here, so it never touched the
# `timestamp` column. That column is converted explicitly in a later cell.
df_data = pd.read_csv(data_path)
df_movie = pd.read_csv(movie_path)

Randomly assign each movie a single genre, chosen from its `|`-separated genre list:

In [4]:
# Replace each movie's '|'-separated genre string with one entry picked at random
# (uses the seeded `random` module, so the choice is repeatable on a fresh run).
df_movie['genres'] = [
    random.choice(genre_string.split('|')) for genre_string in df_movie['genres']
]
# Attach the chosen genre to every row of df_data; the left join keeps all of its rows.
df_data = pd.merge(df_data, df_movie[['movieId', 'genres']], how='left', on='movieId')

Parse the timestamps and sort the interactions chronologically:

In [5]:
# Convert the whole column in one vectorised call instead of building a
# pd.Timestamp per row in Python, which is very slow on a large table.
df_data['timestamp'] = pd.to_datetime(df_data['timestamp'])
# Oldest interaction first; the per-user histories built later rely on this order.
# Rebinding instead of `inplace=True` keeps the cell free of hidden in-place mutation.
df_data = df_data.sort_values('timestamp')

Filter samples:
- drop samples with a rating below 4
- drop users with fewer than 11 interactions

In [6]:
# Filtering thresholds, named so they are easy to find and tune.
MIN_RATING = 4          # keep ratings >= 4
MIN_INTERACTIONS = 11   # keep users with more than 10 remaining interactions

df_data = df_data[df_data.rating >= MIN_RATING]
# Per-row count of the user's remaining interactions (non-null movieId values).
interactions_per_user = df_data.groupby('userId')['movieId'].transform('count')
# `.copy()` makes the result an independent frame: later cells overwrite its columns
# in place, which would otherwise trigger SettingWithCopyWarning on a filtered slice.
df_data = df_data[interactions_per_user >= MIN_INTERACTIONS].copy()

In [7]:
def encode_category(df, col_name):
    """Replace the values of ``df[col_name]`` in place with integer codes 1..K.

    Codes are assigned in order of first appearance in the column, starting at 1,
    which leaves 0 unused (the windowing code in this notebook pads with 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    col_name : str
        Name of the column to encode.

    Returns
    -------
    None
    """
    categories = df[col_name].unique()
    code_of = {value: code for code, value in enumerate(categories, start=1)}
    # Assign the column directly rather than through `df.loc[:, col_name] = ...`:
    # since pandas 2.0 the `.loc` form writes into the existing column and keeps its
    # dtype, so a string column would stay `object` instead of becoming integer.
    df[col_name] = df[col_name].map(code_of)

Encode the categorical features:

In [8]:
# Integer-encode the identifier and genre columns of df_data, one after the other.
for column in ('userId', 'movieId', 'genres'):
    encode_category(df_data, column)

In [None]:
# history = df_data.groupby(['userId', 'timestamp'])[['movieId']].agg(lambda x: list(x))
# # max_len = df_data.groupby(['user_id', 'time_id'])['item_id'].agg(lambda x: len(x)).max()
# history = history.reset_index().groupby(['userId']).agg({
#     'timestamp': lambda x: list(x),
#     'movieId': lambda x: list(x)
# })
# # valid_user = history.time_id.apply(lambda x: (np.max(x) == 8 and len(x) > 2))
# # history = history[valid_user]
# history = history.reset_index()
# index = np.random.choice(len(history), 2000, replace=False)
# val_index = index[:1000]
# test_index = index[1000:]
# history.loc[val_index, 'type'] = 'val'
# history.loc[test_index, 'type'] = 'test'
# history['type'].fillna('train', inplace=True)
# history.to_pickle('../datasets/movielensHistory.pkl')

Generate the initial item sequence:

In [9]:
# One row per movie, then the list of movie ids belonging to each genre.
item = df_data[['movieId', 'genres']].drop_duplicates('movieId')
item = item.groupby(['genres'])['movieId'].agg(list)

# Concatenate the per-genre lists in the order of the grouped index, so movies of
# the same genre end up next to each other. Iterating the Series directly replaces
# `Series.iteritems()`, which was removed in pandas 2.0.
item_sorted = [movie_id for movie_ids in item for movie_id in movie_ids]

np.save('../datasets/movieSorted.npy', item_sorted)

Aggregate historical behaviour for each user:

In [10]:
# One row per user, with the user's movie ids and timestamps collected into lists
# (rows keep the order they have in df_data within each user).
history = (
    df_data.groupby(['userId'], as_index=False)[['movieId', 'timestamp']]
    .agg(list)
    .rename(columns={'movieId': 'history'})
)

# Draw 2000 distinct row positions: the first 1000 become validation users, the
# remaining 1000 test users, and everyone else stays in the training split.
# `as_index=False` gives `history` a 0..n-1 index, so positions are valid `.loc` labels.
index = np.random.choice(len(history), 2000, replace=False)
val_index = index[:1000]
test_index = index[1000:]

# Start from 'train' and overwrite the sampled rows. This replaces the chained
# `history['type'].fillna(..., inplace=True)`, which warns on pandas 2.x and has
# no effect once copy-on-write is enabled.
history['type'] = 'train'
history.loc[val_index, 'type'] = 'val'
history.loc[test_index, 'type'] = 'test'

history.to_pickle('../datasets/movielensHistory.pkl')

In [1]:
def build_time_window(t, hist, max_window_size=8, gap='30 day'):
    """Split a history into windows of closely spaced events, newest first.

    The history is walked from its last entry backwards. An entry joins the
    current window while the time difference to the entry after it is strictly
    less than ``gap``; a difference of ``gap`` or more starts a new window.
    At most ``max_window_size`` windows are built, so entries older than that
    are dropped.

    Parameters
    ----------
    t : sequence of timestamps
        Event times, one per entry of ``hist``. Assumed to be in ascending
        order and the same length as ``hist`` (neither is checked here).
    hist : sequence
        Items aligned with ``t``.
    max_window_size : int, default 8
        Number of windows in the result.
    gap : str or timedelta-like, default '30 day'
        Threshold passed to ``pd.Timedelta``.

    Returns
    -------
    history_window : list of list
        ``max_window_size`` windows, most recent window first, with the items
        inside each window also ordered newest first. Shorter windows are
        right-padded with 0 up to ``max_len``; missing windows are all zeros.
    max_len : int
        Length of the longest real window. It is 0 when no real window was
        built (empty history), in which case every window is ``[0]``.
    """
    times = np.asarray(t)
    items = np.asarray(hist)

    # Empty history: previously an IndexError; now return padding windows only.
    if len(times) == 0:
        return [[0] for _ in range(max_window_size)], 0

    # same_window[k] is True when entry k and entry k + 1 are less than `gap` apart,
    # i.e. entry k belongs to the same window as the entry that follows it.
    same_window = (times[1:] - times[:-1]) < pd.Timedelta(gap)

    history_window = []
    pos = len(times) - 1  # start at the most recent entry
    while len(history_window) < max_window_size and pos >= 0:
        window = [items[pos]]
        pos -= 1
        # Keep absorbing older entries while they are close to their successor.
        while pos >= 0 and same_window[pos]:
            window.append(items[pos])
            pos -= 1
        history_window.append(window)

    max_len = max((len(window) for window in history_window), default=0)

    # Add all-padding windows until there are max_window_size of them.
    history_window.extend([0] for _ in range(max_window_size - len(history_window)))

    # Right-pad every window with zeros to a common length.
    for window in history_window:
        window.extend([0] * (max_len - len(window)))
    return history_window, max_len

In [46]:
# def generate_train_data(fname, data, seq_len=70, min_len=8):
# #     with open(fname, 'w') as f:
#     for _, row in data.iterrows():
#         timestamp = row.timestamp
#         items = row.history
# #         arr_items = [0 for i in range(seq_len - min_len)] + list(items)
# #         arr_timestamp = [0 for i in range(seq_len - min_len)] + list(timestamp)
        
#         for i in range(min_len, len(items)):
#             sample = items[:i]
#             label = sample[-1]
#             sample = sample[:-1]
#             hist = sample
#             t = timestamp[:i][:-1]
# #             break
#         break
#     return hist, t
            