In [55]:
from tqdm.auto import tqdm
import polars as pl
from src.polars_collaborative_filtering.collaborative_filtering import CollaborativeFilter
from sklearn.model_selection import KFold

# Book metadata, scanned lazily; only the columns listed here are kept.
books = (
    pl.scan_csv("books.csv")
    .select("book_id", "title", "authors")
)

# Ratings, scanned lazily; "book_id" is renamed to "item_id" so the frame
# exposes the columns user_id / item_id / rating.
ratings = (
    pl.scan_csv("ratings.csv")
    .select("user_id", "book_id", "rating")
    .rename({"book_id": "item_id"})
)

In [56]:
# Configure the collaborative filter used for every prediction below.
# NOTE(review): the exact meaning of each option is defined by
# CollaborativeFilter, which is not shown here; going by the names, both
# minimum-count filters are disabled (0) and neighbours are presumably chosen
# by a Pearson-similarity threshold of 0.7 — confirm against the class.
cf = CollaborativeFilter(
    correlation_method="pearson",
    similarity_threshold=0.7,
    minimal_number_of_ratings=0,
    minimum_number_of_books_rated_in_common=0,
    neighborhood_method="threshold",
)

In [57]:
# Distinct user ids in ascending order, collected into an eager one-column
# frame so they can be indexed by the positional fold indices from KFold.
user_ids = (
    ratings
    .select("user_id")
    .unique()
    .sort("user_id")
    .collect()
)

# One fold per user: every split holds out exactly one user
# (leave-one-user-out). The fixed seed keeps the shuffled order reproducible.
user_kf = KFold(
    n_splits=len(user_ids),
    shuffle=True,
    random_state=42,
)

In [58]:
# Materialize both lazy queries in a single call and rebind the same names to
# the results, so `books` and `ratings` now refer to collected frames rather
# than the lazy scans built with `pl.scan_csv`.
# NOTE(review): because the names are reused, this cell is not idempotent and
# earlier cells that call `.collect()` on `ratings` may behave differently if
# re-run after it — confirm the notebook survives Restart & Run All.
books, ratings = pl.collect_all([books, ratings])

In [66]:
# Nested leave-one-out evaluation of the collaborative filter.
#
# Outer loop: each fold of `user_kf` holds out one user; every other user's
#             ratings form `train_ratings`.
# Inner loop: for the held-out user, each fold holds out one rated item; the
#             user's remaining ratings are passed to the filter as
#             `input_ratings` and the held-out rating is the target.
#
# Per inner fold we record
#   * coverage - fraction of held-out ratings that received a prediction
#   * MAE      - mean absolute error of those predictions (only when any exist)
#
# The loops iterate over the `split()` generators directly. Calling
# `next(kf.split(...))` inside the loop body creates a fresh generator on every
# iteration and, with a fixed `random_state`, always returns the same first
# fold, so the same user/item would be evaluated over and over.

# The complete leave-one-user-out run is very slow, so this many user folds
# are left out of the run; only the remaining folds are evaluated.
SKIPPED_USER_FOLDS = 53000
n_eval_users = max(user_kf.get_n_splits() - SKIPPED_USER_FOLDS, 0)

coverages = []
maes = []

# `is_in` expects a Series/collection of values, so index the id column rather
# than the one-column DataFrame.
user_id_values = user_ids.get_column("user_id")

# zip() against range() stops after `n_eval_users` folds without needing an
# extra import; `total` lets tqdm show a proper progress bar for the generator.
user_folds = zip(range(n_eval_users), user_kf.split(user_ids))
for _, (user_train_index, user_test_index) in tqdm(user_folds, total=n_eval_users):
    train_ratings = ratings.filter(
        pl.col("user_id").is_in(user_id_values[user_train_index])
    )
    user_ratings = ratings.filter(
        pl.col("user_id").is_in(user_id_values[user_test_index])
    )

    ratings_user_itemids = user_ratings.select("item_id").unique().sort("item_id")
    if len(ratings_user_itemids) < 2:
        # KFold needs at least two splits; with a single rated item there is
        # nothing left to feed the filter once that item is held out.
        continue
    item_id_values = ratings_user_itemids.get_column("item_id")

    ratings_kf = KFold(n_splits=len(ratings_user_itemids), shuffle=True, random_state=42)
    for train_index, test_index in ratings_kf.split(ratings_user_itemids):
        input_ratings = user_ratings.filter(
            pl.col("item_id").is_in(item_id_values[train_index])
        )
        test_ratings = user_ratings.filter(
            pl.col("item_id").is_in(item_id_values[test_index])
        )

        # NOTE(review): the third argument to `predict` is passed as None, as
        # in the original call; its meaning is defined by CollaborativeFilter.
        predictions = cf.predict(train_ratings, input_ratings, None)

        # Keep only predictions for held-out items and attach the absolute error.
        predictions = predictions.join(
            test_ratings, on=["item_id"], how="inner"
        ).with_columns(
            (pl.col("prediction") - pl.col("rating")).abs().alias("error")
        )

        if len(predictions) > 0:
            maes.append(predictions.select("error").mean().item())
            coverages.append(len(predictions) / len(test_ratings))
        else:
            # No prediction was produced for the held-out item(s).
            coverages.append(0)
        



  4%|▍         | 18/424 [05:12<1:57:20, 17.34s/it]


KeyboardInterrupt: 