In [1]:
import numpy as np, pandas as pd

df_full = pd.read_csv("ratings.dat", delimiter="::", engine="python", header=None)
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_full.columns = ["UserId", "ItemId", "Rating", "Timestamp"]
df_full = df_full.drop("Timestamp", axis=1)
df_full["UserId"], _ = pd.factorize(df_full["UserId"])
df_full["ItemId"], _ = pd.factorize(df_full["ItemId"])
df_train["UserId"], users_train = pd.factorize(df_train["UserId"])
df_train["ItemId"], items_train = pd.factorize(df_train["ItemId"])
df_test["UserId"] = pd.Categorical(df_test["UserId"], users_train).codes
df_test["ItemId"] = pd.Categorical(df_test["ItemId"], items_train).codes

print(df_full.shape)
print(df_train.shape)
print(df_test.shape)

(10000054, 3)
(8000043, 3)
(1999975, 3)


In [2]:
### Now converting to Vowpal Wabbit's format
def save_in_wb_format(df, fname, is_test=False):
    df = df.assign(
        UserId = lambda x: "u " + x.UserId.astype(str),
        ItemId = lambda x: "i " + x.ItemId.astype(str),
        Rating = lambda x: x.Rating.astype(int).astype(str) + " "
    )
    df = df[["Rating", "UserId", "ItemId"]]
    if is_test:
        df = df.drop("Rating", axis=1)
    df.to_csv(fname, index=False, sep="|", header=None)
save_in_wb_format(df_full, "df_full_vw.txt")
save_in_wb_format(df_train, "df_train_vw.txt")
save_in_wb_format(df_test, "df_test_vw.txt", is_test=True)

In [3]:
### Will use the Command Line Interface for VW
import os

In [4]:
%%time
%%bash
./vw df_full_vw.txt -q ui --rank 50 --l2 0.05 --passes 15 --holdout_off \
--learning_rate 0.1 --random_seed 1 --cache_file temp.cache

creating quadratic features for pairs: ui 
using l2 regularization = 0.05
Num weight bits = 18
learning rate = 0.1
initial_t = 1
power_t = 0.5
decay_learning_rate = 1
creating cache_file = temp.cache
Reading datafile = df_full_vw.txt
num sources = 1
Enabled reductions: rank, scorer
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
22.211706 22.211706            1            1.0   5.0000   0.2871      103
20.379812 18.547918            2            2.0   5.0000   0.6933      103
17.292235 14.204659            4            4.0   5.0000   1.3913      103
13.700988 10.109741            8            8.0   5.0000   2.0814      103
9.731537 5.762087           16           16.0   5.0000   2.9223      103
6.710877 3.690217           32           32.0   3.0000   2.0444      103
4.319266 1.927655           64           64.0   4.0000   2.4860      103
3.441697 2.564129          128          128.0   3.00

CPU times: user 16.3 ms, sys: 8.76 ms, total: 25 ms
Wall time: 4min 53s


In [5]:
os.remove("temp.cache")

In [6]:
%%bash
./vw df_train_vw.txt -q ui --rank 50 --l2 0.05 --passes 15 --holdout_off \
--learning_rate 0.1 --random_seed 1 -f vw_model.reg --cache_file temp.cache

creating quadratic features for pairs: ui 
using l2 regularization = 0.05
final_regressor = vw_model.reg
Num weight bits = 18
learning rate = 0.1
initial_t = 1
power_t = 0.5
decay_learning_rate = 1
creating cache_file = temp.cache
Reading datafile = df_train_vw.txt
num sources = 1
Enabled reductions: rank, scorer
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
13.785847 13.785847            1            1.0   4.0000   0.2871      103
13.604061 13.422276            2            2.0   4.0000   0.3364      103
7.817205 2.030349            4            4.0   2.0000   0.5830      103
8.083151 8.349096            8            8.0   2.0000   0.9170      103
7.294627 6.506103           16           16.0   5.0000   1.1541      103
6.323093 5.351558           32           32.0   4.0000   1.6615      103
4.768966 3.214839           64           64.0   3.0000   2.0815      103
3.372009 1.975051       

In [7]:
%%bash
./vw df_test_vw.txt -i vw_model.reg -t --predictions vw_pred.txt

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

creating quadratic features for pairs: ui 
only testing
predictions = vw_pred.txt
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0.5
using no cache
Reading datafile = df_test_vw.txt
num sources = 1
Enabled reductions: rank, scorer
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
9.437434 9.437434            1            1.0   0.0000   3.0720       52
9.323956 9.210479            2            2.0   0.0000   3.0349       52
10.249872 11.175787            4            4.0   0.0000   3.4561       52
9.779625 9.309378            8            8.0   0.0000   

In [8]:
pred = pd.read_csv("vw_pred.txt", header=None, sep=" ")
pred = pred[0]
err = pred - df_test.Rating
rmse = np.sqrt(np.mean( (err**2).to_numpy() ))
print("RMSE is: %f" % rmse)

RMSE is: 1.054546


In [9]:
os.remove("vw_pred.txt")
os.remove("temp.cache")
os.remove("vw_model.reg")
os.remove("df_full_vw.txt")
os.remove("df_train_vw.txt")
os.remove("df_test_vw.txt")