In [3]:
import mxnet as mx
from movielens_data import get_data_iter, max_id
from matrix_fact import train, RMSE

In [4]:
# Device list handed to mx.module.Module as `context=`; a single entry, GPU device id 0.
# NOTE(review): `ctx` is assigned again with the same value in a later cell (In [6]).
ctx = [mx.gpu(0)]

In [5]:
# Build the data iterator(s) through the project helper with mini-batches of 100.
# NOTE(review): the structure returned by get_data_iter is not visible from this
# notebook - presumably a train/test iterator pair; confirm in movielens_data.
train_test_data = get_data_iter(batch_size=100)
# Presumably the largest user id and item id found in the ML-100k ratings file -
# verify against movielens_data.max_id.
# NOTE(review): none of these three names is referenced by the later cells shown
# in this notebook, which work on the ml-20m ratings instead.
max_user, max_item = max_id('./ml-100k/u.data')

In [2]:
# user = mx.sym.Variable("user")
# movie = mx.sym.Variable("movie")

# y = mx.sym.Variable("softmax_label")

# # embedding
# user = mx.sym.Embedding(user, input_dim=, output_dim=)
# movie = mx.sym.Embedding(movie, input_dim=, output_dim=)

# # network
# nn = mx.sym.concat(user, movie)
# nn = mx.sym.flatten(nn)

# nn = mx.sym.FullyConnected(nn, num_hidden=)
# nn = mx.sym.Activation(nn, act_type='relu')

# nn = mx.sym.FullyConnected(nn, num_hidden=1)

# yhat = mx.sym.LinearRegressionOutput(nn, y)

# model = mx.module.Module(
#             symbol = yhat,
#             context=ctx, 
#             data_names=["user", "movie"],
#             label_names=["softmax_label"]
#         )

# model.fit(
#     X_train,
#     eval_data=X_eval,
#     eval_metric='rmse',
#     num_epoch=10, 
#     optimizer='adam', 
#     optimizer_params={'learning_rate': 1e-3}
# )

In [6]:
import os
import urllib
import zipfile
import pandas as pd
import mxnet as mx

# Device list handed to mx.module.Module as `context=`; a single entry, GPU device id 0.
ctx = [mx.gpu(0)]

# One-time download and extraction of the MovieLens 20M archive, left disabled.
# NOTE(review): `urllib.urlretrieve` is the Python 2 spelling; on Python 3 the
# function is `urllib.request.urlretrieve`, so this snippet needs adjusting
# before it is re-enabled there.
# url, name = 'http://files.grouplens.org/datasets/movielens/ml-20m.zip', 'ml-20m.zip'
# if not os.path.exists(name):
#     urllib.urlretrieve(url, name)

# with zipfile.ZipFile(name, 'r') as f:
#     f.extractall('./')

# Load only the first three CSV columns (selected by position); the preview
# below shows them as userId, movieId and rating.
data = pd.read_csv('./ml-20m/ratings.csv', sep=',', usecols=(0, 1, 2))
# Show the first rows as a quick sanity check of the load.
data.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [7]:
# Embedding table sizes: the largest raw id in each column. The split cell
# shifts ids down by one before use, so (assuming ids start at 1) every index
# falls inside [0, size - 1].
n_users = data['userId'].max()
n_movies = data['movieId'].max()
# Mini-batch size shared by both data iterators and the Speedometer callback.
batch_size = 8 * 4096

In [8]:
# Train / validation split.
# The shuffle is seeded so that the split - and therefore every RMSE reported
# below - is reproducible under Restart Kernel -> Run All.
# NOTE: `data` is overwritten with its shuffled version (as before), so
# re-running this cell alone re-shuffles the already shuffled frame.
split_seed = 42
ntrain = 19000000
# Guard against a dataset smaller than expected, which would otherwise yield
# an empty validation set without any warning.
assert 0 < ntrain < len(data), "ntrain must leave at least one validation row"
data = data.sample(frac=1, random_state=split_seed).reset_index(drop=True)

# Raw ids are shifted by -1 to obtain the indices fed to the embedding layers.
user_index = data['userId'].values - 1
movie_index = data['movieId'].values - 1
ratings = data['rating'].values

train_users = user_index[:ntrain]
train_movies = movie_index[:ntrain]
train_ratings = ratings[:ntrain]

valid_users = user_index[ntrain:]
valid_movies = movie_index[ntrain:]
valid_ratings = ratings[ntrain:]

# Input dictionaries keyed by the names the models use for their input
# variables ("user" and "movie"). Despite the `_iter` suffix these are plain
# dicts; the actual iterators are built from them in the next cell.
train_iter = {
    'user': train_users,
    'movie': train_movies
}

valid_iter = {
    'user': valid_users,
    'movie': valid_movies
}

In [9]:
# Wrap the NumPy arrays in MXNet iterators. The dict keys ('user', 'movie')
# become the input names that the models bind to via `data_names`. The label is
# passed as a bare array, so it takes NDArrayIter's default label name -
# presumably 'softmax_label', which is the name the models below declare.
X_train = mx.io.NDArrayIter(
    data=train_iter,
    label=train_ratings,
    batch_size=batch_size,
)
X_eval = mx.io.NDArrayIter(
    data=valid_iter,
    label=valid_ratings,
    batch_size=batch_size,
)

In [10]:
# matrix factorization
def simpleMF(embed_dim=25, num_epoch=10, learning_rate=1e-3):
    """Train a plain matrix-factorization rating model and return it.

    Every user and every movie gets an ``embed_dim``-sized embedding vector.
    The predicted rating is the dot product of the two vectors (element-wise
    product summed over the embedding axis), fed to ``LinearRegressionOutput``.

    Relies on notebook globals defined in earlier cells: ``n_users``,
    ``n_movies``, ``ctx``, ``X_train``, ``X_eval`` and ``batch_size``.

    Parameters
    ----------
    embed_dim : int
        Width of both embedding tables (default 25, the original value).
    num_epoch : int
        Number of passes over ``X_train`` (default 10).
    learning_rate : float
        Learning rate handed to the Adam optimizer (default 1e-3).

    Returns
    -------
    mx.module.Module
        The fitted module, so it can be scored or used for prediction
        afterwards instead of being thrown away when the function exits.
    """
    user = mx.sym.Variable("user")
    movie = mx.sym.Variable("movie")

    # The label variable must carry the same name as `label_names` below and
    # as the label provided by the data iterators.
    y = mx.sym.Variable("softmax_label")

    # embedding
    user_vec = mx.sym.Embedding(user, input_dim=n_users, output_dim=embed_dim)
    movie_vec = mx.sym.Embedding(movie, input_dim=n_movies, output_dim=embed_dim)

    # network: dot product of the two embedding vectors
    nn = mx.sym.sum_axis(user_vec * movie_vec, axis=1)
    nn = mx.sym.flatten(nn)
    yhat = mx.sym.LinearRegressionOutput(nn, y)

    model = mx.module.Module(
        symbol=yhat,
        context=ctx,
        data_names=["user", "movie"],
        label_names=["softmax_label"]
    )

    model.fit(
        X_train,
        eval_data=X_eval,
        eval_metric='rmse',
        num_epoch=num_epoch,
        optimizer='adam',
        optimizer_params={'learning_rate': learning_rate},
        # Log throughput and the running metric every 250 batches.
        batch_end_callback=mx.callback.Speedometer(batch_size, 250)
    )
    return model


# Keep a handle on the trained model; binding the result also stops the cell
# from echoing the Module repr.
simple_mf_model = simpleMF()

INFO:root:Epoch[0] Batch [250]	Speed: 12608976.50 samples/sec	rmse=3.308167
INFO:root:Epoch[0] Batch [500]	Speed: 12648048.21 samples/sec	rmse=1.673588
INFO:root:Epoch[0] Train-rmse=1.192819
INFO:root:Epoch[0] Time cost=1.549
INFO:root:Epoch[0] Validation-rmse=1.142205
INFO:root:Epoch[1] Batch [250]	Speed: 12696326.52 samples/sec	rmse=1.045299
INFO:root:Epoch[1] Batch [500]	Speed: 12637664.93 samples/sec	rmse=0.946934
INFO:root:Epoch[1] Train-rmse=0.915932
INFO:root:Epoch[1] Time cost=1.489
INFO:root:Epoch[1] Validation-rmse=0.912990
INFO:root:Epoch[2] Batch [250]	Speed: 13775595.75 samples/sec	rmse=0.898491
INFO:root:Epoch[2] Batch [500]	Speed: 12486717.63 samples/sec	rmse=0.885531
INFO:root:Epoch[2] Train-rmse=0.880092
INFO:root:Epoch[2] Time cost=1.448
INFO:root:Epoch[2] Validation-rmse=0.881017
INFO:root:Epoch[3] Batch [250]	Speed: 13485994.58 samples/sec	rmse=0.875747
INFO:root:Epoch[3] Batch [500]	Speed: 13772558.80 samples/sec	rmse=0.873186
INFO:root:Epoch[3] Train-rmse=0.871655

In [11]:
# deep matrix factorization
def deepMF(user_dim=50, movie_dim=25, hidden_units=(64, 64),
           num_epoch=10, learning_rate=1e-3):
    """Train a neural rating model on concatenated embeddings and return it.

    User and movie ids are embedded separately, the two vectors are
    concatenated, and the result goes through a stack of
    FullyConnected -> BatchNorm -> ReLU layers followed by a single-unit
    FullyConnected layer whose output is fed to ``LinearRegressionOutput``.

    Relies on notebook globals defined in earlier cells: ``n_users``,
    ``n_movies``, ``ctx``, ``X_train``, ``X_eval`` and ``batch_size``.

    Parameters
    ----------
    user_dim : int
        Width of the user embedding (default 50, the original value).
    movie_dim : int
        Width of the movie embedding (default 25, the original value).
    hidden_units : sequence of int
        Width of each hidden layer, in order (default ``(64, 64)``).
    num_epoch : int
        Number of passes over ``X_train`` (default 10).
    learning_rate : float
        Learning rate handed to the Adam optimizer (default 1e-3).

    Returns
    -------
    mx.module.Module
        The fitted module, so it can be scored or used for prediction
        afterwards instead of being thrown away when the function exits.
    """
    user = mx.sym.Variable("user")
    movie = mx.sym.Variable("movie")

    # The label variable must carry the same name as `label_names` below and
    # as the label provided by the data iterators.
    y = mx.sym.Variable("softmax_label")

    # embedding
    user_vec = mx.sym.Embedding(user, input_dim=n_users, output_dim=user_dim)
    movie_vec = mx.sym.Embedding(movie, input_dim=n_movies, output_dim=movie_dim)

    nn = mx.sym.concat(user_vec, movie_vec)
    nn = mx.sym.flatten(nn)

    # network: one FC -> BatchNorm -> ReLU group per requested hidden width
    for width in hidden_units:
        nn = mx.sym.FullyConnected(data=nn, num_hidden=width)
        nn = mx.sym.BatchNorm(nn)
        nn = mx.sym.Activation(data=nn, act_type='relu')
    # single output unit: the predicted rating
    nn = mx.sym.FullyConnected(data=nn, num_hidden=1)

    yhat = mx.sym.LinearRegressionOutput(nn, y)

    model = mx.module.Module(
        symbol=yhat,
        context=ctx,
        data_names=["user", "movie"],
        label_names=["softmax_label"]
    )

    model.fit(
        X_train,
        eval_data=X_eval,
        eval_metric='rmse',
        num_epoch=num_epoch,
        optimizer='adam',
        optimizer_params={'learning_rate': learning_rate},
        # Log throughput and the running metric every 250 batches.
        batch_end_callback=mx.callback.Speedometer(batch_size, 250)
    )
    return model


# Keep a handle on the trained model; binding the result also stops the cell
# from echoing the Module repr.
deep_mf_model = deepMF()

INFO:root:Epoch[0] Batch [250]	Speed: 2493377.07 samples/sec	rmse=1.392114
INFO:root:Epoch[0] Batch [500]	Speed: 2463581.65 samples/sec	rmse=0.862773
INFO:root:Epoch[0] Train-rmse=0.850301
INFO:root:Epoch[0] Time cost=7.772
INFO:root:Epoch[0] Validation-rmse=0.846704
INFO:root:Epoch[1] Batch [250]	Speed: 2445668.23 samples/sec	rmse=0.839253
INFO:root:Epoch[1] Batch [500]	Speed: 2434540.47 samples/sec	rmse=0.830354
INFO:root:Epoch[1] Train-rmse=0.825419
INFO:root:Epoch[1] Time cost=7.752
INFO:root:Epoch[1] Validation-rmse=0.832387
INFO:root:Epoch[2] Batch [250]	Speed: 2521759.02 samples/sec	rmse=0.813261
INFO:root:Epoch[2] Batch [500]	Speed: 2525345.03 samples/sec	rmse=0.812137
INFO:root:Epoch[2] Train-rmse=0.809615
INFO:root:Epoch[2] Time cost=7.527
INFO:root:Epoch[2] Validation-rmse=0.833341
INFO:root:Epoch[3] Batch [250]	Speed: 2487557.84 samples/sec	rmse=0.799655
INFO:root:Epoch[3] Batch [500]	Speed: 2517537.79 samples/sec	rmse=0.800960
INFO:root:Epoch[3] Train-rmse=0.798268
INFO:ro