In [2]:
import polars as pl

books = pl.scan_csv("books.csv").select("book_id", "title", "authors")
ratings = pl.scan_csv("ratings.csv").select("user_id", "book_id", "rating")

In [3]:
ratings.limit().collect()

user_id,book_id,rating
i64,i64,i64
1,258,5
2,4081,4
2,260,5
2,9296,5
2,2318,3


In [4]:
# Create dataframe with my ratings and a custom user id
my_user_id = 0
my_ratings = [
    ["The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy, #1)", 5],
    ["The Martian", 5],
    ["Surely You're Joking, Mr. Feynman!: Adventures of a Curious Character", 5],
    ['Going Solo', 5],
    ["Flatland: A Romance of Many Dimensions", 5],
    ["Gödel, Escher, Bach: An Eternal Golden Braid", 5],
    ["The Hundred-Year-Old Man Who Climbed Out of the Window and Disappeared", 5],
    ["Gut: The Inside Story of Our Body's Most Underrated Organ", 5],
    ["Brave New World", 4],
    ["The Three-Body Problem (Remembrance of Earth's Past, #1)", 2],
    ["The Dark Forest (Remembrance of Earth's Past, #2)", 2],
    ["The Remains of the Day", 5],
    ["The Pursuit of Happyness", 5],
    ["Animal Farm", 5],
    ["1984", 5],
    ["Norwegian Wood", 5],
    ["Three Men in a Boat (Three Men, #1)", 5],
    ["Lord of the Flies", 3],
    ["Buddenbrooks: The Decline of a Family", 1],
    ["To Kill a Mockingbird", 5],
    ["Harry Potter and the Sorcerer's Stone (Harry Potter, #1)", 3],
    ["Harry Potter and the Chamber of Secrets (Harry Potter, #2)", 3],
    ["Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)", 3],
    ["Harry Potter and the Goblet of Fire (Harry Potter, #4)", 3],
    ["Harry Potter and the Order of the Phoenix (Harry Potter, #5)", 3],
    ["Harry Potter and the Half-Blood Prince (Harry Potter, #6)", 3],
    ["Harry Potter and the Deathly Hallows (Harry Potter, #7)", 3],
    ["Perfume: The Story of a Murderer", 1],
    ["Sapiens: A Brief History of Humankind", 5],
    ["The Circle", 1],
    ["The Reader", 2],
    ["Cloud Atlas", 5],
    ["A Briefer History of Time", 5],
    ["The Grand Design", 4],
    ["The Universe in a Nutshell", 4],
    ["All Quiet on the Western Front", 5],
    ["Inferno (Robert Langdon, #4)", 3],
    ["The Da Vinci Code (Robert Langdon, #2)", 4],
    ["I Am Legend", 5],
    ["Catch Me If You Can: The True Story of a Real Fake", 4],
    ["Memoirs of a Geisha", 5],
    ["A Fine Balance", 5],
    ["Man's Search for Meaning", 5],
    ["Dune (Dune Chronicles #1)", 4],
    ["The Kite Runner", 5],
    ["Kon-Tiki: Across The Pacific In A Raft", 5],
    ["Seven Years in Tibet", 5],
    ["The Diary of a Young Girl", 5],
    ["The Alchemist", 5],
    ["Siddhartha", 5],
    ["The Glass Bead Game", 4],
    ["Demian. Die Geschichte von Emil Sinclairs Jugend", 5],
    ["Steppenwolf", 4],
    ["Quo Vadis", 5],
    ["P.S. I Love You", 4],
    ["The Pillars of the Earth (The Kingsbridge Series, #1)", 4],
    ["Eye of the Needle", 4],
    ["Eragon (The Inheritance Cycle, #1)", 4],
    ["Wild Swans: Three Daughters of China", 5],
    ["I Am Malala: The Story of the Girl Who Stood Up for Education and Was Shot by the Taliban", 5]
] 
my_ratings = pl.LazyFrame(my_ratings, schema=[('title', pl.String), ('my_rating', pl.Int64)])
my_ratings = my_ratings.join(books.select("book_id", "title"), on='title', how='left')

In [5]:
ratings = ratings.join(my_ratings.select("book_id", "my_rating"), how="left", on="book_id")
ratings.limit().collect()

user_id,book_id,rating,my_rating
i64,i64,i64,i64
1,258,5,
2,4081,4,
2,260,5,
2,9296,5,
2,2318,3,


In [6]:
articles_rated_in_common = ratings.group_by("user_id").agg((pl.col("my_rating").is_not_null()).sum().alias("articles_rated_in_common"))
articles_rated_in_common.limit().collect()

user_id,articles_rated_in_common
i64,u32
40131,0
9206,12
9608,3
1837,4
22836,6


In [7]:
minimum_number_of_books_rated_in_common = 10
ratings = ratings.join(articles_rated_in_common, how="left", on="user_id").filter(
    pl.col("articles_rated_in_common")>=minimum_number_of_books_rated_in_common)

ratings.limit().collect()

user_id,book_id,rating,my_rating,articles_rated_in_common
i64,i64,i64,i64,u32
2,4081,4,,11
2,260,5,,11
2,9296,5,,11
2,2318,3,,11
2,26,4,4.0,11


In [8]:
similarities = ratings.group_by("user_id", maintain_order=True).agg(pl.corr("rating", "my_rating").alias("corr"))
similarities.limit().collect()

user_id,corr
i64,f64
2,-0.461906
4,-0.526262
9,-0.086335
15,0.123524
24,0.505381


In [9]:
minimal_similarity = 0.7
ratings = ratings.join(similarities, on="user_id", how="left")
ratings = ratings.filter(pl.col("corr")>minimal_similarity)
ratings = ratings.filter(pl.col("corr").is_not_nan())

In [10]:
minimal_number_of_ratings = 6
ratings_per_article = ratings.group_by("book_id").agg((pl.col("rating").is_not_null()).sum().alias("ratings_per_article"))
ratings = ratings.join(ratings_per_article, on="book_id", how="left")
ratings = ratings.filter(pl.col("ratings_per_article") >= minimal_number_of_ratings)
ratings.limit().collect()

user_id,book_id,rating,my_rating,articles_rated_in_common,corr,ratings_per_article
i64,i64,i64,i64,u32,f64,u32
55,212,5,,12,0.770761,35
55,2051,1,,12,0.770761,8
55,264,5,,12,0.770761,32
55,397,2,1.0,12,0.770761,15
55,653,5,,12,0.770761,11


In [11]:
def predict_func():
    return ((pl.col("rating")*pl.col("corr")).sum())/(pl.col("corr").sum())
predictions = ratings.group_by("book_id", maintain_order=True).agg(predict_func().alias("prediction")).select("book_id", "prediction")
predictions.limit(20).collect()

book_id,prediction
i64,f64
212,4.167762
2051,2.991773
264,3.787874
397,2.692973
653,4.115781
…,…
934,4.390389
614,3.698881
2024,3.337885
1624,3.536167


In [12]:
predictions = predictions.join(books, how="left", on="book_id")
predictions = predictions.collect().sort(by="prediction", descending=True).limit(20)
predictions

book_id,prediction,title,authors
i64,f64,str,str
757,4.884893,"""Lonesome Dove""","""Larry McMurtry…"
2757,4.852697,"""Ahab's Wife, o…","""Sena Jeter Nas…"
1909,4.844823,"""Someone Knows …","""Lawrence Hill"""
2247,4.82878,"""The Diving Bel…","""Jean-Dominique…"
267,4.821673,"""The Nightingal…","""Kristin Hannah…"
…,…,…,…
743,4.712646,"""Lamb: The Gosp…","""Christopher Mo…"
779,4.709997,"""A Fine Balance…","""Rohinton Mistr…"
2353,4.701241,"""Season of Mist…","""Neil Gaiman, M…"
3358,4.70059,"""The Hitchhiker…","""Douglas Adams"""


In [22]:
def polars_collaborative_filtering(ratings, my_ratings, minimal_similarity=0.7, minimal_number_of_ratings=6, minimum_number_of_books_rated_in_common=10):
    """
    Performs collaborative filtering on a dataset of book ratings.

    Args:
        ratings (DataFrame): The dataset of book ratings.
        my_ratings (DataFrame): The user's own ratings.
        minimal_similarity (float, optional): The minimum similarity threshold between users. Defaults to 0.7.
        minimal_number_of_ratings (int, optional): The minimum number of ratings required for a book to be considered. Defaults to 6.
        minimum_number_of_books_rated_in_common (int, optional): The minimum number of books rated in common between users. Defaults to 10.

    Returns:
        DataFrame: The predicted ratings for books.

    """
    # Join the ratings dataset with the user's own ratings
    ratings = ratings.join(my_ratings.select("book_id", "my_rating"), how="left", on="book_id")
    
    # Calculate the number of articles rated in common for each user
    articles_rated_in_common = ratings.group_by("user_id").agg((pl.col("my_rating").is_not_null()).sum().alias("articles_rated_in_common"))
    
    # Filter out users who have rated less than the minimum number of books in common
    ratings = ratings.join(articles_rated_in_common, how="left", on="user_id").filter(
        pl.col("articles_rated_in_common") >= minimum_number_of_books_rated_in_common)
    
    # Calculate the similarity between users based on their ratings
    similarities = ratings.group_by("user_id", maintain_order=True).agg(pl.corr("rating", "my_rating").alias("corr"))
    
    # Filter out users whose similarity is below the minimum similarity threshold
    ratings = ratings.join(similarities, on="user_id", how="left")
    ratings = ratings.filter(pl.col("corr") > minimal_similarity)
    ratings = ratings.filter(pl.col("corr").is_not_nan())
    
    # Calculate the number of ratings per article
    ratings_per_article = ratings.group_by("book_id").agg((pl.col("rating").is_not_null()).sum().alias("ratings_per_article"))
    
    # Filter out articles that have less than the minimum number of ratings
    ratings = ratings.join(ratings_per_article, on="book_id", how="left")
    ratings = ratings.filter(pl.col("ratings_per_article") >= minimal_number_of_ratings)
    
    # Define the prediction function
    def predict_func():
        return ((pl.col("rating") * pl.col("corr")).sum()) / (pl.col("corr").sum())
    
    # Calculate the predicted ratings for each book
    predictions = ratings.group_by("book_id", maintain_order=True).agg(predict_func().alias("prediction")).select("book_id", "prediction")
    
    return predictions


ratings = pl.scan_csv("ratings.csv").select("user_id", "book_id", "rating")
predictions = polars_collaborative_filtering(ratings, my_ratings)
predictions = predictions.join(books, how="left", on="book_id")
predictions = predictions.collect().sort(by="prediction", descending=True).limit(20)
predictions

book_id,prediction,title,authors
i64,f64,str,str
757,4.884893,"""Lonesome Dove""","""Larry McMurtry…"
2757,4.852697,"""Ahab's Wife, o…","""Sena Jeter Nas…"
1909,4.844823,"""Someone Knows …","""Lawrence Hill"""
2247,4.82878,"""The Diving Bel…","""Jean-Dominique…"
267,4.821673,"""The Nightingal…","""Kristin Hannah…"
…,…,…,…
743,4.712646,"""Lamb: The Gosp…","""Christopher Mo…"
779,4.709997,"""A Fine Balance…","""Rohinton Mistr…"
2353,4.701241,"""Season of Mist…","""Neil Gaiman, M…"
3358,4.70059,"""The Hitchhiker…","""Douglas Adams"""


In [23]:
import pandas as pd

def pandas_collaborative_filtering(ratings, my_ratings, minimal_similarity=0.7, minimal_number_of_ratings=6, minimum_number_of_books_rated_in_common=10):
    """
    Performs collaborative filtering on a dataset of book ratings.

    Args:
        ratings (DataFrame): The dataset of book ratings.
        my_ratings (DataFrame): The user's own ratings.
        minimal_similarity (float, optional): The minimum similarity threshold between users. Defaults to 0.7.
        minimal_number_of_ratings (int, optional): The minimum number of ratings required for a book to be considered. Defaults to 6.
        minimum_number_of_books_rated_in_common (int, optional): The minimum number of books rated in common between users. Defaults to 10.

    Returns:
        DataFrame: The predicted ratings for books.

    """
    # Join the ratings dataset with the user's own ratings
    ratings = pd.merge(ratings, my_ratings[["book_id", "my_rating"]], how="left", on="book_id")

    # Calculate the number of articles rated in common for each user
    articles_rated_in_common = ratings.groupby("user_id").agg({'my_rating': lambda x: x.notnull().sum()}).rename(columns={"my_rating": "articles_rated_in_common"})
    
    # Filter out users who have rated less than the minimum number of books in common
    ratings = pd.merge(ratings, articles_rated_in_common, how="left", on="user_id")
    ratings = ratings[ratings["articles_rated_in_common"]>=minimum_number_of_books_rated_in_common]
    
    # Calculate the similarity between users based on their ratings
    similarities = ratings.groupby("user_id")[["rating", "my_rating"]].corr().unstack().iloc[:, 1].rename("corr")

    # Filter out users whose similarity is below the minimum similarity threshold
    ratings = ratings.join(similarities, on="user_id", how="left")
    ratings = ratings[ratings["corr"]>minimal_similarity]
    ratings = ratings[ratings["corr"].notna()]
    
    # Calculate the number of ratings per article
    ratings_per_article = ratings.groupby("book_id").agg({'rating': lambda x: x.notnull().sum()}).rename(columns={"rating": "ratings_per_article"})
    
    # Filter out articles that have less than the minimum number of ratings
    ratings = pd.merge(ratings, ratings_per_article, on="book_id", how="left")
    ratings = ratings[ratings["ratings_per_article"] >= minimal_number_of_ratings]
    
    # Define the prediction function
    def predict_func(x):
        return ((x["rating"]*x["corr"]).sum())/(x["corr"].sum())
    
    # Calculate the predicted ratings for each book
    predictions = ratings.groupby("book_id").apply(predict_func).to_frame("prediction")
    
    return predictions

In [24]:
ratings_pd, my_ratings_pd = ratings.collect().to_pandas(), my_ratings.collect().to_pandas()
predictions = pandas_collaborative_filtering(ratings_pd, my_ratings_pd)
predictions = pd.merge(predictions, books.collect().to_pandas(), how="left", on="book_id")
predictions = predictions.sort_values(by="prediction", ascending=False)
predictions.head(20)

Unnamed: 0,book_id,prediction,title,authors
597,757,4.884893,Lonesome Dove,Larry McMurtry
1087,2757,4.852697,"Ahab's Wife, or The Star-Gazer",Sena Jeter Naslund
978,1909,4.844823,Someone Knows My Name,Lawrence Hill
1033,2247,4.82878,The Diving Bell and the Butterfly,"Jean-Dominique Bauby, Jeremy Leggatt"
251,267,4.821673,The Nightingale,Kristin Hannah
885,1496,4.810564,A Storm of Swords: Blood and Gold (A Song of I...,George R.R. Martin
917,1597,4.778453,"The Power of One (The Power of One, #1)",Bryce Courtenay
842,1338,4.777642,The Book of Mormon: Another Testament of Jesus...,"Anonymous, Joseph Smith Jr."
860,1380,4.772905,"The Complete Maus (Maus, #1-2)",Art Spiegelman
140,141,4.755184,The Martian,Andy Weir


In [25]:
from timeit import timeit

n = 10
polars_time_lazy = timeit(lambda: polars_collaborative_filtering(ratings, my_ratings).collect(), number=n)

ratings_df, my_ratings_df = ratings.collect(), my_ratings.collect()
polars_time_eager = timeit(lambda: polars_collaborative_filtering(ratings_df, my_ratings_df), number=n)

pandas_time = timeit(lambda: pandas_collaborative_filtering(ratings_pd, my_ratings_pd), number=n)

In [27]:
print(f"Polars (lazy): {polars_time_lazy/n:.2f} seconds")
print(f"Polars (eager): {polars_time_eager/n:.2f} seconds")
print(f"Pandas: {pandas_time/n:.2f} seconds")

Polars (lazy): 0.64 seconds
Polars (eager): 0.13 seconds
Pandas: 3.32 seconds
