In [1]:
import numpy as np, pandas as pd

# Complete ratings file: "::"-separated with no header row, parsed with the
# Python engine. The train/test splits are ordinary CSV files with headers.
df_full = pd.read_csv("ratings.dat", delimiter="::", engine="python", header=None)
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_full.columns = ["UserId", "ItemId", "Rating", "Timestamp"]
df_full = df_full.drop("Timestamp", axis=1)

# Re-encode the raw IDs as integer codes. Only the codes are kept for the full
# data; for the training split the uniques are kept as well so that the test
# split can be encoded with exactly the same mapping.
df_full["UserId"] = pd.factorize(df_full["UserId"])[0]
df_full["ItemId"] = pd.factorize(df_full["ItemId"])[0]
df_train["UserId"], users_train = pd.factorize(df_train["UserId"])
df_train["ItemId"], items_train = pd.factorize(df_train["ItemId"])
df_test["UserId"] = pd.Categorical(df_test["UserId"], users_train).codes
df_test["ItemId"] = pd.Categorical(df_test["ItemId"], items_train).codes

# pd.Categorical assigns the code -1 to every value that is missing from the
# given categories, i.e. to users/items that never occur in train.csv. A code
# of -1 is not a valid ID for a model fitted on the training split, so such
# rows are removed (with a notice) instead of being passed on to prediction.
known_in_train = (df_test["UserId"] >= 0) & (df_test["ItemId"] >= 0)
n_unknown = int((~known_in_train).sum())
if n_unknown > 0:
    print("Dropping %d test rows with users/items not present in train.csv" % n_unknown)
    df_test = df_test.loc[known_in_train].reset_index(drop=True)

### Spotlight does not perform mean centering, so it has to be done manually
df_full["Rating"] -= df_full["Rating"].mean()
# The test ratings are centered with the *training* mean so that no statistic
# computed from the test split is used.
train_mean = df_train["Rating"].mean()
df_train["Rating"] -= train_mean
df_test["Rating"] -= train_mean

In [2]:
from spotlight.interactions import Interactions
from spotlight.factorization.explicit import ExplicitFactorizationModel

def _to_interactions(ratings):
    """Build an Interactions object from the UserId, ItemId and Rating columns of `ratings`.

    The ID columns are passed as int32 arrays and the ratings as a float32
    array, in that positional order.
    """
    user_ids = ratings["UserId"].to_numpy().astype(np.int32)
    item_ids = ratings["ItemId"].to_numpy().astype(np.int32)
    values = ratings["Rating"].to_numpy().astype(np.float32)
    return Interactions(user_ids, item_ids, values)


dt_full = _to_interactions(df_full)
dt_train = _to_interactions(df_train)

In [3]:
%%time
# Fit on the complete ratings data; the enclosing cell is timed with %%time.
model = ExplicitFactorizationModel(
    loss='regression',
    embedding_dim=50,
    n_iter=15,
    l2=0.05,
    use_cuda=False,
    random_state=np.random.RandomState(seed=1),  # fixed seed
)
model.fit(dt_full)

  allow_unreachable=True)  # allow_unreachable flag


CPU times: user 1d 2h 31min 41s, sys: 26min 5s, total: 1d 2h 57min 47s
Wall time: 3h 22min 21s


In [4]:
# Fresh model with the same settings as the full-data fit, this time fitted
# on the training split only. Rebinding `model` discards the previous fit.
seeded_rng = np.random.RandomState(seed=1)
model = ExplicitFactorizationModel(
    loss='regression',
    embedding_dim=50,
    n_iter=15,
    l2=0.05,
    use_cuda=False,
    random_state=seeded_rng,
)
model.fit(dt_train)

In [5]:
def print_rmse(pred, real):
    """Print the root-mean-squared error between predictions and true values.

    Parameters
    ----------
    pred : array-like
        Predicted values.
    real : array-like
        True values, paired with `pred` by position.

    Both arguments are converted to NumPy arrays before subtracting. Without
    that, two pandas objects would be aligned on their index labels, and
    differing indexes would produce NaN or wrongly paired errors.
    """
    errors = np.asarray(pred) - np.asarray(real)
    rmse = np.sqrt(np.mean(errors ** 2))
    print("RMSE: %f" % rmse)

# Predict for every (user, item) pair of the test split and report the RMSE
# against the test ratings.
test_user_ids = df_test["UserId"].to_numpy().astype(np.int32)
test_item_ids = df_test["ItemId"].to_numpy().astype(np.int32)
pred = model.predict(test_user_ids, test_item_ids)
print_rmse(pred, df_test["Rating"])

RMSE: 1.054698
