In [1]:
import polars as pl
from polars import col

from easey import EASE

## MovieLens example notebook

This notebook is a minimal example purely for getting started. No ML evaluation practices are applied here (e.g. train/test split, NDCG measurement); the only purpose is to show how to train and predict on a simple dataset.

In [2]:
# Prerequisite: download the ML-20M dataset from https://grouplens.org/datasets/movielens/
# and unzip it so that 'ml-20m/ratings.csv' resolves from the working directory.
# Keep only the id columns, renamed to the (user, item, rating) layout, and set every
# rating to a constant 1.0, i.e. implicit feedback (1 = watched, 0 = not watched).
ratings = pl.read_csv('ml-20m/ratings.csv').select(
    col('userId').alias('user'),
    col('movieId').alias('item'),
    pl.lit(1.0).alias('rating'),
)

In [3]:
# Keep only the 10,000 most popular items (movies), ranked by number of ratings.
# Trade-off: this removes only 1% of the training data while cutting training time
# massively; the downside, of course, is that long tail items are missing.
popular_items = (
    ratings.get_column('item')
    .value_counts(sort=True)   # one row per item, most frequent first
    .get_column('item')
    .head(10_000)
    .implode()                 # wrap into a single list value for is_in()
)
ratings = ratings.filter(col('item').is_in(popular_items))

### Training

In [4]:
%%time
ease = EASE(lambda_=100).fit(ratings)

CPU times: user 6min 44s, sys: 899 ms, total: 6min 45s
Wall time: 35.7 s


### Inference

In [8]:
%%time
# Predict for in-sample users - would be tested against holdout set on the same users
scores = ease.predict(ratings['user'].unique().sample(15000))
scores

CPU times: user 28.1 s, sys: 241 ms, total: 28.3 s
Wall time: 2.95 s


user,item,score
i64,i64,f64
2,3927,0.599152
2,480,0.608414
2,1972,0.61899
2,589,0.620065
2,3917,0.64228
…,…,…
138486,2762,1.075553
138486,6502,1.09714
138486,4306,1.102761
138486,593,1.15839
