In [1]:
# basic packages
import pandas as pd
import numpy as np
import math
import time
import itertools
from collections import defaultdict
from scipy import sparse
import pickle as pkl
import warnings
warnings.filterwarnings("ignore")

# evaluation
from sklearn.model_selection import train_test_split

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# self-build modules
import sys
sys.path.append('../model')
from mf_nn import *
from helper import *

Using TensorFlow backend.


# 1. data loading and sampling

In [2]:
# Raw inputs: the ratings table and the tag genome scores table.
ratings_csv = '../ml-latest/ratings.csv'
genome_scores_csv = '../ml-latest/genome-scores.csv'

df = pd.read_csv(ratings_csv)
df_tag = pd.read_csv(genome_scores_csv)

In [3]:
# Reduce the full ratings table to a smaller working sample.
# NOTE(review): user_thresh / item_thresh are presumably minimum rating counts
# per user / per movie -- confirm against helper.sample_df.
df_sample = sample_df(
    df,
    user_thresh=20,
    item_thresh=500,
)

number of users: 20000
number of items: 1000
number of ratings: 1179969


In [4]:
# Keep only the tag rows whose movie survived the sampling step.
sampled_movie_ids = df_sample['movieId'].unique()
is_sampled_movie = df_tag['movieId'].isin(sampled_movie_ids)
sample_tag = df_tag.loc[is_sampled_movie]

In [5]:
# Hold out a test set from the sampled ratings.
# NOTE(review): presumably a chronological split on `timestamp` -- confirm in
# helper.train_test_split_by_time.
train_df, test_df = train_test_split_by_time(df_sample)
for split_df in (train_df, test_df):
    print(len(split_df))

# Movie x user rating matrices (rows: movieId, columns: userId) for each split.
pivot_spec = dict(index='movieId', columns='userId', values='rating')
rating_train = train_df.pivot(**pivot_spec)
rating_test = test_df.pivot(**pivot_spec)
for rating_matrix in (rating_train, rating_test):
    print(rating_matrix.shape)

952005
227964
(1000, 20000)
(1000, 20000)


# 2. parameter tuning

In [6]:
# Carve a validation set out of train_df with the same time-based splitter
# that produced the train/test split, so the test set stays untouched
# during tuning.
inner_split = train_test_split_by_time(train_df)
cf_nn_train_df, cf_nn_val_df = inner_split

In [8]:
# Candidate values for the grid search. Each list feeds one mf_nn keyword:
# max_iter, regparam, k and rank respectively.
maxIter_list, regParam_list = [5, 15, 25], [0.1, 0.01]
k_list, rank_list = [20, 30, 50], [10, 15, 20]

In [None]:
# Full Cartesian grid of (max_iter, regparam, k, rank) combinations. The order
# matters: score lists are indexed by position in this list.
param_grid = itertools.product(maxIter_list, regParam_list, k_list, rank_list)
params = list(param_grid)

# Validation target, taken positionally as the second-to-last column.
# NOTE(review): presumably the 'rating' column -- confirm the column order
# returned by train_test_split_by_time.
target_column = cf_nn_val_df.iloc[:, -2]
val_y = target_column.values

## 2.1 hybrid model without movie tag embedding

In [1]:
# Try to load the finished grid-search results from the local checkpoint file.
# The checkpoint holds [mse_list, mae_list, break_point], where break_point is
# the index into `params` at which the tuning loop should resume.
# NOTE: only unpickle files you created yourself; pickle can execute arbitrary
# code when loading untrusted data.
try:
    with open('./mse_mae_list.pkl', 'rb') as f:
        mse_list, mae_list, break_point = pkl.load(f)

# Initialize the checkpoint only when it is missing or empty. Any other failure
# (corrupt pickle, unexpected layout, interrupt) is surfaced instead of being
# answered by silently overwriting previously saved results.
except (FileNotFoundError, EOFError):
    mse_list, mae_list, break_point = [], [], 0
    with open('./mse_mae_list.pkl', 'wb') as f:
        pkl.dump([mse_list, mae_list, break_point], f)

In [12]:
# parameter tuning without tag embedded
#
# Walks the `params` grid from `break_point` onwards (earlier combinations were
# scored in a previous session), fits one model per combination and appends its
# validation MSE / MAE to mse_list / mae_list. Progress is checkpointed after
# every successful combination, so an interrupted run loses at most the
# combination that was in flight.

# Loop-invariant: the (user, movie) pairs of the validation ratings.
val_user_ids = cf_nn_val_df['userId'].values
val_movie_ids = cf_nn_val_df['movieId'].values

# Index of the first combination that still has to be evaluated. Stays at
# break_point if nothing succeeds; this also keeps the final save valid when
# `params` is empty.
next_idx = break_point

for idx, (max_iter, regparam, k, rank) in enumerate(params):
    if idx < break_point:
        continue
    try:
        print(f'params:{max_iter},{regparam},{k},{rank}')
        model_cfnn = mf_nn(max_iter=max_iter, regparam=regparam, k=k, rank=rank, movie_tag_embed=False)
        model_cfnn.fit(cf_nn_train_df, sample_tag)
        # One feature row per validation rating: user, item and tag features
        # combined with `+`, exactly as the fitted model stores them.
        val_x = np.array([
            model_cfnn.uf_dict[user_id] + model_cfnn.if_dict[movie_id] + model_cfnn.tag_dict[movie_id]
            for user_id, movie_id in zip(val_user_ids, val_movie_ids)
        ])
        mse, mae = model_cfnn.nn.evaluate(val_x, val_y, verbose=0)
    except Exception as err:
        # Report what went wrong and stop. The failed combination stays
        # unrecorded, so the next run retries it. KeyboardInterrupt is no
        # longer swallowed here.
        print(f'Broke at index: {idx}, params: {max_iter}, {regparam}, {k}, {rank}')
        print(f'{type(err).__name__}: {err}')
        break

    mse_list.append(mse)
    mae_list.append(mae)
    # Resume *after* this combination. Saving `idx` here would make a later
    # run score it twice and shift every following score against `params`.
    next_idx = idx + 1
    with open('./mse_mae_list.pkl', 'wb') as f:
        pkl.dump([mse_list, mae_list, next_idx], f)

# Final save keeps the file in sync with the in-memory state even when no
# combination was evaluated in this run.
with open('./mse_mae_list.pkl', 'wb') as f:
    pkl.dump([mse_list, mae_list, next_idx], f)

In [11]:
# Report the grid combination with the lowest validation MAE. Element 1 of the
# saved checkpoint is the MAE list, indexed in the same order as `params`.
with open('./mse_mae_list.pkl', 'rb') as f:
    saved_results = pkl.load(f)
mae_scores = saved_results[1]
best_idx = np.argmin(mae_scores)
print(params[best_idx])

(15, 0.1, 30, 10)


## 2.2 hybrid model with movie tag embedding

In [9]:
# Try to load the finished grid-search results for the tag-embedding variant
# from its local checkpoint file. The checkpoint holds
# [mse_list, mae_list, break_point], where break_point is the index into
# `params` at which the tuning loop should resume.
# NOTE: this rebinds mse_list / mae_list / break_point, the same names the
# section without tag embedding uses, so run the cells of one section together.
# NOTE: only unpickle files you created yourself; pickle can execute arbitrary
# code when loading untrusted data.
try:
    with open('./mse_mae_list_tag_embed.pkl', 'rb') as f:
        mse_list, mae_list, break_point = pkl.load(f)

# Initialize the checkpoint only when it is missing or empty. Any other failure
# (corrupt pickle, unexpected layout, interrupt) is surfaced instead of being
# answered by silently overwriting previously saved results.
except (FileNotFoundError, EOFError):
    mse_list, mae_list, break_point = [], [], 0
    with open('./mse_mae_list_tag_embed.pkl', 'wb') as f:
        pkl.dump([mse_list, mae_list, break_point], f)

In [10]:
# parameter tuning with tag embedded
#
# Walks the `params` grid from `break_point` onwards (earlier combinations were
# scored in a previous session), fits one model per combination and appends its
# validation MSE / MAE to mse_list / mae_list. Progress is checkpointed after
# every successful combination, so an interrupted run loses at most the
# combination that was in flight.

# Loop-invariant: the (user, movie) pairs of the validation ratings.
val_user_ids = cf_nn_val_df['userId'].values
val_movie_ids = cf_nn_val_df['movieId'].values

# Index of the first combination that still has to be evaluated. Stays at
# break_point if nothing succeeds; this also keeps the final save valid when
# `params` is empty.
next_idx = break_point

for idx, (max_iter, regparam, k, rank) in enumerate(params):
    if idx < break_point:
        continue
    try:
        print(f'params:{max_iter},{regparam},{k},{rank}')
        model_cfnn = mf_nn(max_iter=max_iter, regparam=regparam, k=k, rank=rank, movie_tag_embed=True)
        model_cfnn.fit(cf_nn_train_df, sample_tag)
        # One feature row per validation rating: user, item and tag features
        # combined with `+`, exactly as the fitted model stores them.
        val_x = np.array([
            model_cfnn.uf_dict[user_id] + model_cfnn.if_dict[movie_id] + model_cfnn.tag_dict[movie_id]
            for user_id, movie_id in zip(val_user_ids, val_movie_ids)
        ])
        mse, mae = model_cfnn.nn.evaluate(val_x, val_y, verbose=0)
    except Exception as err:
        # Report what went wrong and stop. The failed combination stays
        # unrecorded, so the next run retries it. KeyboardInterrupt is no
        # longer swallowed here.
        print(f'Broke at index: {idx}, params: {max_iter}, {regparam}, {k}, {rank}')
        print(f'{type(err).__name__}: {err}')
        break

    mse_list.append(mse)
    mae_list.append(mae)
    # Resume *after* this combination. Saving `idx` here would make a later
    # run score it twice and shift every following score against `params`.
    next_idx = idx + 1
    with open('./mse_mae_list_tag_embed.pkl', 'wb') as f:
        pkl.dump([mse_list, mae_list, next_idx], f)

# Final save keeps the file in sync with the in-memory state even when no
# combination was evaluated in this run.
with open('./mse_mae_list_tag_embed.pkl', 'wb') as f:
    pkl.dump([mse_list, mae_list, next_idx], f)

params:20,0.1,20,10
Training begins.......
training set created
start training neural network......
model training finished
params:20,0.1,20,15
Training begins.......
training set created
start training neural network......
model training finished
params:20,0.1,20,20
Training begins.......
training set created
start training neural network......
model training finished
params:20,0.1,30,10
Training begins.......
training set created
start training neural network......
model training finished
params:20,0.1,30,15
Training begins.......
training set created
start training neural network......
model training finished
params:20,0.1,30,20
Training begins.......
training set created
start training neural network......
model training finished
params:20,0.1,50,10
Training begins.......
training set created
start training neural network......
model training finished
params:20,0.1,50,15
Training begins.......
training set created
start training neural network......
model training finished
params:2

In [11]:
# Report the grid combination with the lowest validation MAE for the
# tag-embedding variant. Element 1 of the saved checkpoint is the MAE list,
# indexed in the same order as `params`.
with open('./mse_mae_list_tag_embed.pkl', 'rb') as f:
    saved_results = pkl.load(f)
mae_scores = saved_results[1]
best_idx = np.argmin(mae_scores)
print(params[best_idx])

(15, 0.1, 30, 10)
