In [1]:
import numpy as np, pandas as pd

df_full = pd.read_csv("ratings.dat", delimiter="::", engine="python", header=None)
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_full.columns = ["UserId", "ItemId", "Rating", "Timestamp"]
df_full = df_full.drop("Timestamp", axis=1)
df_full["UserId"], _ = pd.factorize(df_full["UserId"])
df_full["ItemId"], _ = pd.factorize(df_full["ItemId"])
df_train["UserId"], users_train = pd.factorize(df_train["UserId"])
df_train["ItemId"], items_train = pd.factorize(df_train["ItemId"])
df_test["UserId"] = pd.Categorical(df_test["UserId"], users_train).codes
df_test["ItemId"] = pd.Categorical(df_test["ItemId"], items_train).codes

In [2]:
from surprise.dataset import Dataset
from surprise import Reader
from surprise.prediction_algorithms.matrix_factorization import SVD

reader = Reader(rating_scale=(1, 5))
dt_full = Dataset.load_from_df(df_full, reader).build_full_trainset()
dt_train = Dataset.load_from_df(df_train, reader).build_full_trainset()

In [3]:
%%time
model = SVD(n_factors=50, n_epochs=15, biased=True, reg_all=0.05,
            random_state=1, verbose=False)
model.fit(dt_full)

CPU times: user 2min 58s, sys: 99.6 ms, total: 2min 58s
Wall time: 2min 58s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f323de48850>

In [4]:
model = SVD(n_factors=50, n_epochs=15, biased=True, reg_all=0.05,
            random_state=1, verbose=False)
model.fit(dt_train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f32a63db790>

In [5]:
def predict_from_model(model, df):
    pred = np.empty(df.shape[0])
    uid = df.UserId.astype(str)
    iid = df.ItemId.astype(str)
    for ix in range(df.shape[0]):
        pred[ix] = model.predict(uid.iloc[ix], iid.iloc[ix]).est
    return pred

def print_rmse(pred, real):
    errors = pred - real
    rmse = np.sqrt(np.mean(np.array(errors) ** 2))
    print("RMSE: %f" % rmse)

pred = predict_from_model(model, df_test)
print_rmse(pred, df_test.Rating)

RMSE: 1.060049
