# Recommendation Systems

- **`Content Based Recommendation`**
  - Recommends items based on their characteristics and a user's preferences for those characteristics.
  - For example, a movie recommender might suggest films with similar genres or actors to ones a user has liked in the past.

- **`Collaborative Filtering`**
  - Recommends items based on the preferences of similar users.
  - It doesn't require knowledge of the items themselves, just information about user interactions.
  - For example, a music streaming service might recommend songs that other users with similar tastes have enjoyed.

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,mlxtend,omegaconf --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
mlxtend  : 0.23.1
omegaconf: 2.3.0

conda environment: torch_p11



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")


# Built-in library (NOTE: the lines below repeat the import/settings block above)
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
from tqdm import tqdm
from typing import Generator


def process_text(text: list[str]) -> list[str]:
    """
    Pair category lines with the most recently seen product ID.

    Each input line is stripped and classified:

    * a line starting with an uppercase letter followed by at least one
      lowercase letter is treated as a category line and emitted as
      ``"<product id>: <line>"``;
    * any other line that contains no comma becomes the current product ID;
    * everything else is ignored.

    Parameters
    ----------
    text : list[str]
        The raw lines to scan, in file order.

    Returns
    -------
    list[str]
        One ``"<product id>: <category line>"`` string per category line.
        The product ID is empty for category lines seen before any ID line.
    """
    # Uppercase letter followed by one or more lowercase letters, at line start
    category_re: re.Pattern[str] = re.compile(r"^[A-Z][a-z]+")
    labelled: list[str] = []
    current_id: str = ""

    for raw in tqdm(text, desc="Processing Text", unit="line", ncols=100):
        stripped: str = raw.strip()
        if category_re.match(stripped):
            labelled.append(f"{current_id}: {stripped}")
        elif "," not in stripped:
            # Not a category line and comma-free: remember it as the product ID
            current_id = stripped

    return labelled


def parse_keys_values(filename: str) -> Generator[dict[str, str], None, None]:
    """
    Parse blocks of ``key: value`` lines from a file.

    The file is read in binary mode line by line. Consecutive lines that
    contain a colon are collected into one dictionary; a line without a colon
    ends the current block, which is then yielded. Whatever has been collected
    when the file ends is yielded as a final entry (this may be an empty
    dictionary, e.g. when the file ends with a separator line).

    Parameters
    ----------
    filename : str
        The path to the file to be parsed.

    Yields
    ------
    dict[str, str]
        A dictionary containing the key-value pairs of one block, decoded
        as latin-1.

    """
    entry: dict[str, str] = {}

    # The context manager guarantees the handle is closed when the generator
    # is exhausted, closed early, or garbage-collected.
    with open(filename, "rb") as handle:
        for raw_line in handle:
            raw_line = raw_line.strip()
            # Only the first colon separates key from value.
            colon_pos: int = raw_line.find(b":")
            if colon_pos == -1:
                # A colon-free line marks the end of the current block.
                yield entry
                entry = {}
                continue
            key: str = raw_line[:colon_pos].decode("latin-1")
            # Skip the colon and the single character that follows it.
            value: str = raw_line[colon_pos + 2 :].decode("latin-1")
            entry[key] = value

    yield entry


def read_reviews(path: str, num: int = -1) -> pl.DataFrame:
    """
    Load parsed review entries into a Polars DataFrame.

    Parameters
    ----------
    path : str
        The path to the file containing the reviews.
    num : int, optional
        Stop after this many entries have been collected. With the default
        of -1 the stop condition is never met, so every entry is read.

    Returns
    -------
    pl.DataFrame
        A Polars DataFrame built from the collected entries, one row each.

    """
    records: list[dict[str, str]] = []
    # Count from 1 so `count` is the number of entries collected so far.
    for count, record in enumerate(parse_keys_values(path), start=1):
        records.append(record)
        if count == num:
            break
    return pl.DataFrame(records)

In [4]:
fp: str = "../../data/prod_categories.parquet"

prod_categories: pl.DataFrame = pl.read_parquet(fp)
print(f"{prod_categories.shape = }")
prod_categories.head(3)

prod_categories.shape = (2437878, 2)


product_id,product_category
str,str
"""B00005AL88""","""Cookware, Cookware Sets, Kitchen & Dining, Home & Kitchen"""
"""B000002ERS""","""Music, World Music Music, Latin Music"""
"""B000PR126O""","""Jewelry, Charms"""


In [5]:
# Cap the number of characters displayed for string values at 100
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [6]:
fp: str = "../../data/Music_small.parquet"
df_music: pl.DataFrame = pl.read_parquet(fp)
print(f"{df_music.shape = }")
df_music.head(2)

df_music.shape = (6396351, 10)


productId,title,price,userId,profileName,helpfulness,score,time,summary,text
str,str,str,str,str,str,f32,str,str,str
"""B00002066I""","""ah""","""15.99""","""unknown""","""unknown""","""3/4""",5.0,"""939772800""","""Inspiring""","""I hope a lot of people hear this cd. We need more strong and positive vibes like this. Great vocals…"
"""B00002066I""","""ah""","""15.99""","""A2KLYVAS0MIBMQ""","""Stephen McClaning""","""0/0""",5.0,"""1332288000""","""Great CD""","""My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS a…"


### Data Cleaning

In [7]:
df_music.head()

productId,title,price,userId,profileName,helpfulness,score,time,summary,text
str,str,str,str,str,str,f32,str,str,str
"""B00002066I""","""ah""","""15.99""","""unknown""","""unknown""","""3/4""",5.0,"""939772800""","""Inspiring""","""I hope a lot of people hear this cd. We need more strong and positive vibes like this. Great vocals…"
"""B00002066I""","""ah""","""15.99""","""A2KLYVAS0MIBMQ""","""Stephen McClaning""","""0/0""",5.0,"""1332288000""","""Great CD""","""My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS a…"
"""B000058A81""","""Chrono Cross: Original Soundtrack""","""unknown""","""A18C9SNLZWVBIE""","""A reader""","""1/1""",5.0,"""1096934400""","""First album I've bought since Napster""","""We've come a long way since the days of Ninetendo synthesized music! I say without exaggeration tha…"
"""B000058A81""","""Chrono Cross: Original Soundtrack""","""unknown""","""A38QSOKE2DD8JD""","""Christopher Walden""","""1/1""",5.0,"""1088121600""","""Pleasant to the ear, musical masterpiece""","""Final fantasy fans may be at first skeptical of Chrono Cross's composition and production under unk…"
"""B000058A81""","""Chrono Cross: Original Soundtrack""","""unknown""","""AKZLIIH3AP4RU""","""IcemanJ""","""1/1""",5.0,"""1075939200""","""Much more than a game Soundtrack.""","""This has got to be one of the best video game soundtracks ever. I've actually never really played t…"


In [8]:
# Check for unique product and userId entries
p_id: str = "B000058A81"
user_id: str = "A18C9SNLZWVBIE"
df_music.filter((pl.col("productId").eq(p_id) & pl.col("userId").eq(user_id)))

productId,title,price,userId,profileName,helpfulness,score,time,summary,text
str,str,str,str,str,str,f32,str,str,str
"""B000058A81""","""Chrono Cross: Original Soundtrack""","""unknown""","""A18C9SNLZWVBIE""","""A reader""","""1/1""",5.0,"""1096934400""","""First album I've bought since Napster""","""We've come a long way since the days of Ninetendo synthesized music! I say without exaggeration tha…"


In [9]:
# We want a:
# unique user-product pair.
# unique user-title pair.
# unique product-title pair.

print(f"[Before data cleaning]: {df_music.shape}")

# Remove duplicates: three successive passes, each keeping one row per
# distinct combination of the listed columns.
# NOTE(review): no `keep`/`maintain_order` is passed, so which duplicate
# survives is presumably not guaranteed — confirm against the polars docs.
df_music_cleaned: pl.DataFrame = df_music.unique(subset=["userId", "productId"])
df_music_cleaned = df_music_cleaned.unique(subset=["userId", "title"])
# NOTE(review): this pass leaves a single row per (productId, title), i.e. at
# most one review per product/title pair. Confirm this is intended, since it
# discards every other user's rating of the same product.
df_music_cleaned = df_music_cleaned.unique(subset=["productId", "title"])

print(f"[After data cleaning]: {df_music_cleaned.shape}")

[Before data cleaning]: (6396351, 10)
[After data cleaning]: (522317, 10)


In [10]:
df_music_cleaned.head()

productId,title,price,userId,profileName,helpfulness,score,time,summary,text
str,str,str,str,str,str,f32,str,str,str
"""B00008FF0U""","""Second Album""","""unknown""","""A3GTI1Z7YE49ZS""","""Tnahpellee ""Brendan""""","""0/0""",4.0,"""1307923200""","""Classic 60's stuff!""","""The Four Tops second album is formulaic; you know, vibraphone laced motown pop, with a strident bea…"
"""B0007N2G2U""","""Plavniki""","""unknown""","""A3H63KQJJIX264""","""10catz""","""0/0""",5.0,"""1137974400""","""Another great one by Dolphin.""","""I am somewhat repeating my review of another of Dophin's CD's ""Zvezda"":Dolphin, also know as Delfin…"
"""B0000007EV""","""Romeo & Juliet / Cinderella""","""unknown""","""A2LM3PDDT7EZKC""","""Tosh Ogawo""","""4/4""",5.0,"""1052352000""","""MUSIC IS FIRST, AND SO IS THE PIANO. WONDERFUL !!!""","""Mr. Chiu offered this listener arguably the most gratifying recorded music experience. He approache…"
"""B00000EL3M""","""Fire of Freedom""","""unknown""","""A3TD3W8KTCKELN""","""""devilzeye""""","""3/3""",5.0,"""945820800""","""Brilliant Lyrics Matched with Superb Melodies!A Sure Winner!""","""I never tire of playing this CD: it is my favorite , from my favorite band: Black 47! Its rare to f…"
"""B0000667PK""","""Brazilian Love Affair 3""","""unknown""","""A3CN9CCJUNIPKT""","""DJ Joe Sixpack""","""0/0""",4.0,"""1041984000""","""Surprisingly nice for a mellow, acid jazz set...""","""Lounge-y Brazilian electronica and soft funk-fusion. I was actually surprised by how much I enjoyed…"


In [11]:
# Check the unique userIDs
df_music_cleaned["userId"].value_counts(sort=True).head()

userId,count
str,u32
"""unknown""",49285
"""A9Q28YTLYREO7""",1270
"""A2WQY1B8ZS7QRZ""",1259
"""A2AIMXT9PLAM12""",1182
"""A3QS1EPDZTLPWS""",875


In [12]:
# Drop the unwanted userIDs
rem_str: str = "unknown"
print(f"[Before data cleaning]: {df_music_cleaned.shape}")
df_music_cleaned = df_music_cleaned.filter(pl.col("userId").ne(rem_str))
print(f"[After data cleaning]: {df_music_cleaned.shape}")

[Before data cleaning]: (522317, 10)
[After data cleaning]: (473031, 10)


In [13]:
df_music_cleaned.head()

productId,title,price,userId,profileName,helpfulness,score,time,summary,text
str,str,str,str,str,str,f32,str,str,str
"""B00008FF0U""","""Second Album""","""unknown""","""A3GTI1Z7YE49ZS""","""Tnahpellee ""Brendan""""","""0/0""",4.0,"""1307923200""","""Classic 60's stuff!""","""The Four Tops second album is formulaic; you know, vibraphone laced motown pop, with a strident bea…"
"""B0007N2G2U""","""Plavniki""","""unknown""","""A3H63KQJJIX264""","""10catz""","""0/0""",5.0,"""1137974400""","""Another great one by Dolphin.""","""I am somewhat repeating my review of another of Dophin's CD's ""Zvezda"":Dolphin, also know as Delfin…"
"""B0000007EV""","""Romeo & Juliet / Cinderella""","""unknown""","""A2LM3PDDT7EZKC""","""Tosh Ogawo""","""4/4""",5.0,"""1052352000""","""MUSIC IS FIRST, AND SO IS THE PIANO. WONDERFUL !!!""","""Mr. Chiu offered this listener arguably the most gratifying recorded music experience. He approache…"
"""B00000EL3M""","""Fire of Freedom""","""unknown""","""A3TD3W8KTCKELN""","""""devilzeye""""","""3/3""",5.0,"""945820800""","""Brilliant Lyrics Matched with Superb Melodies!A Sure Winner!""","""I never tire of playing this CD: it is my favorite , from my favorite band: Black 47! Its rare to f…"
"""B0000667PK""","""Brazilian Love Affair 3""","""unknown""","""A3CN9CCJUNIPKT""","""DJ Joe Sixpack""","""0/0""",4.0,"""1041984000""","""Surprisingly nice for a mellow, acid jazz set...""","""Lounge-y Brazilian electronica and soft funk-fusion. I was actually surprised by how much I enjoyed…"


In [14]:
# Check the unique titles
# df_music_cleaned["title"].value_counts(sort=True).slice(20, 40)
titles: list[dict[str, int]] = (
    df_music_cleaned["title"].value_counts(sort=True).to_dicts()
)
titles = sorted(titles, key=lambda x: x["title"], reverse=True)
titles[980:1000]

[{'title': "Zoom - Best of the 70's [VHS] (2000)", 'count': 1},
 {'title': 'Zoom', 'count': 6},
 {'title': 'Zoolook', 'count': 2},
 {'title': 'Zoologico Tropical - Fita Olivares y su grupo La Pura Sabrosura',
  'count': 1},
 {'title': 'Zoolander', 'count': 1},
 {'title': "Zookeeper's Boy [Vinyl]", 'count': 1},
 {'title': "Zookeeper's Boy", 'count': 1},
 {'title': 'Zoocoustic', 'count': 1},
 {'title': 'Zoo of Tranquility', 'count': 1},
 {'title': 'Zoo Story', 'count': 1},
 {'title': 'Zoo Rave II', 'count': 1},
 {'title': 'Zoo Rave 2', 'count': 1},
 {'title': 'Zoo Rave', 'count': 1},
 {'title': 'Zoo Label: Uncaged', 'count': 1},
 {'title': 'Zoo Hypothesis', 'count': 1},
 {'title': 'Zoo Be Zoo Be Zoo: The Remixes [Vinyl]', 'count': 1},
 {'title': 'Zoo Bar Collection, Vol. 4: Spider in My Stew', 'count': 1},
 {'title': 'Zoo Bar Collection 3', 'count': 1},
 {'title': 'Zoo Bar Collection 2', 'count': 1},
 {'title': 'Zoo Bar Collection 1', 'count': 1}]

In [15]:
text: str = "zoo rave"
df_music_cleaned.filter(pl.col("title").str.to_lowercase().str.contains(text))

productId,title,price,userId,profileName,helpfulness,score,time,summary,text
str,str,str,str,str,str,f32,str,str,str
"""B00000E9GN""","""Zoo Rave""","""unknown""","""A1SFF3DSLJVIEW""","""Ross H. Goodwin ""Digitally Natured Audio""""","""5/5""",5.0,"""1136678400""","""Nostalgic Frequencies....""","""I originally bought this as a cassette in 93. Back then I loved every moment of the cd, in retrospe…"
"""B00000E9GW""","""Zoo Rave 2""","""2.99""","""A3DF6EAZ020LUI""","""E. Oxenberg ""Just another stay-at-home comput...""","""1/1""",3.0,"""1121904000""","""Basic bloops and beeps, but a lot of fun""","""It is full of simple looped techno beats but I always find one really interesting track that stands…"
"""B00000099A""","""Zoo Rave II""","""unknown""","""A1DTAWBG9UTVWR""","""""mn2nmixr""""","""2/2""",5.0,"""1015200000""","""Perfecto""","""First of all, this album is from 1993, so late 90's is impossible. A great album packed with a vari…"


### Apply Levenshtein Distance

- Levenshtein Distance is a string metric used to measure the difference between two sequences.
- It calculates the `minimum number of single-character edits` (`insertions`, `deletions`, or `substitutions`) required to change one word into another.
- It provides a quantitative measure of how dissimilar two strings are, which can be very useful in recommendation systems, especially for handling `typos` or `slight variations` in text input.
- It's an effective way to find close matches even when there are small differences between strings.

In [16]:
def levenshtein_distance(s1: str, s2: str) -> int:
    """
    Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions, or substitutions needed to turn one string into the other.

    Parameters
    ----------
    s1 : str
        The first string.
    s2 : str
        The second string.

    Returns
    -------
    int
        The Levenshtein distance between s1 and s2.

    Example
    -------
    >>> levenshtein_distance("kitten", "sitting")
    3
    """
    # Iterate rows over the longer string so each row is as short as possible.
    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)

    if not shorter:
        return len(longer)

    # prev[k] holds the distance between the processed prefix of `longer`
    # and the first k characters of `shorter`.
    prev: list[int] = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        curr: list[int] = [row_no]
        for col, ch_short in enumerate(shorter):
            cost: int = 0 if ch_long == ch_short else 1
            curr.append(
                min(
                    prev[col + 1] + 1,  # cell above
                    curr[col] + 1,  # cell to the left
                    prev[col] + cost,  # diagonal: match or substitution
                )
            )
        prev = curr

    return prev[-1]


def compute_string_similarity(s1: str, s2: str) -> float:
    """
    Compute the similarity between two strings based on Levenshtein distance.

    The score is ``1 - distance / max(len(s1), len(s2))``, rounded to four
    decimal places.

    Parameters
    ----------
    s1 : str
        The first string.
    s2 : str
        The second string.

    Returns
    -------
    float
        A similarity score between 0 and 1, where 1 indicates identical
        strings. Two empty strings are identical and therefore score 1.0.
    """
    # Identical strings always score 1.0. Returning early also covers the
    # case of two empty strings, where the normalising length would be 0
    # and the division below would raise ZeroDivisionError.
    if s1 == s2:
        return 1.0

    distance: int = levenshtein_distance(s1, s2)
    # The strings differ here, so at least one is non-empty and max_length >= 1.
    max_length: int = max(len(s1), len(s2))
    similarity: float = 1 - (distance / max_length)
    return round(similarity, 4)

In [17]:
s1, s2 = "friend", "fiend"
distance: int = levenshtein_distance(s1=s1, s2=s2)
print(f"{distance = }")

similarity: float = compute_string_similarity(s1=s1, s2=s2)
print(f"{similarity = }")

distance = 1
similarity = 0.8333


In [38]:
temp_df: pl.DataFrame = pl.DataFrame(
    data={
        "title": [
            "spider man",
            "Spider-Man",
            "Spiderman",
            "Anikulapo",
            "Aniku la po",
        ],
        "ratings": [5, 5, 4, 3, 3],
    }
)
temp_df

title,ratings
str,i64
"""spider man""",5
"""Spider-Man""",5
"""Spiderman""",4
"""Anikulapo""",3
"""Aniku la po""",3


In [39]:
temp_df = temp_df.with_columns(title=pl.col("title").str.to_lowercase())
uniq_titles: list[str] = temp_df["title"].unique().to_list()
schema: dict[str, Any] = {"title_1": str, "title_2": str, "similarity": pl.Float32}
similarity_df: pl.DataFrame = pl.DataFrame(schema=schema)
similarity_df

title_1,title_2,similarity
str,str,f32


In [40]:
# Calculate similarity
# for idx, title_ in enumerate(uniq_titles):
#     for n_title_ in lem()

# Rows of (title_1, title_2, similarity) for every pair that passes the threshold
result: list[dict[str, Any]] = []
# Minimum similarity score for a pair to be kept
threshold: float = 0.8

# Compare each title only with the titles after it, so every unordered pair
# is scored exactly once and a title is never compared with itself.
for idx, title_ in enumerate(uniq_titles):
    for n_title_ in uniq_titles[idx + 1 :]:
        score: float = compute_string_similarity(title_, n_title_)
        if score >= threshold:
            result.append({"title_1": title_, "title_2": n_title_, "similarity": score})
# Build a frame from the collected pairs using the shared schema
d: pl.DataFrame = pl.DataFrame(result, schema=schema)
# Append to the existing similarity frame and order the rows by title_1
similarity_df = pl.concat([similarity_df, d]).sort(by="title_1", descending=False)
similarity_df

title_1,title_2,similarity
str,str,f32
"""aniku la po""","""anikulapo""",0.8182
"""spider man""","""spider-man""",0.9
"""spider man""","""spiderman""",0.9
"""spider-man""","""spiderman""",0.9


In [55]:
def compute_similarity_matrix(
    unique_titles: list[str], threshold: float = 0.8
) -> pl.DataFrame:
    """
    Compute pairwise similarity scores for unique titles.

    Every unordered pair of titles is scored once with
    ``compute_string_similarity``; pairs scoring at least ``threshold`` are
    kept. The number of comparisons grows quadratically with the number of
    titles.

    Parameters
    ----------
    unique_titles : list[str]
        List of unique titles to compare.
    threshold : float, optional
        Similarity threshold for including pairs, by default 0.8.

    Returns
    -------
    pl.DataFrame
        DataFrame with columns ``title_1``, ``title_2`` and ``similarity``
        holding the pairs at or above the threshold, sorted by ``title_1``.
        ``title_1`` is always the title that appears earlier in
        ``unique_titles``.
    """
    # Local import keeps this block self-contained.
    from itertools import combinations

    schema: dict[str, Any] = {"title_1": str, "title_2": str, "similarity": pl.Float32}
    result: list[dict[str, Any]] = []

    # combinations() yields each (earlier, later) pair exactly once, in the
    # same order as a nested index loop, without copying the list per row.
    for title_, n_title_ in combinations(unique_titles, 2):
        score: float = compute_string_similarity(title_, n_title_)
        if score >= threshold:
            result.append(
                {"title_1": title_, "title_2": n_title_, "similarity": score}
            )

    # Passing the schema keeps the column dtypes fixed even when no pair
    # passes the threshold and `result` is empty.
    similarity_df: pl.DataFrame = pl.DataFrame(result, schema=schema).sort(
        by="title_1", descending=False
    )
    return similarity_df

In [51]:
def create_title_mapping(similarity_list: list[dict[str, str]]) -> dict[str, str]:
    """
    Create a mapping of variant titles to their canonical forms.

    Parameters
    ----------
    similarity_list : list[dict[str, str]]
        A list of dictionaries, where each dictionary contains
        'title_1' and 'title_2' keys with string values.

    Returns
    -------
    dict[str, str]
        A dictionary mapping variant titles to their canonical forms.

    Notes
    -----
    For each pair the canonical form is the shorter of the two titles
    (``title_1`` on a tie). When a variant appears in several pairs, the
    last pair wins.

    Chains are followed to their end: if ``a`` maps to ``b`` and ``b`` maps
    to ``c``, both ``a`` and ``b`` map to ``c``, so a single lookup always
    yields a title that is not itself a variant. If the pairs form a cycle
    (possible only among equal-length titles listed in both orders), every
    title leading into the cycle maps to the cycle's shortest member, with
    ties broken alphabetically.
    """
    # Pass 1: direct variant -> canonical links, one per pair.
    direct: dict[str, str] = {}
    for item in similarity_list:
        title_1: str = item["title_1"]
        title_2: str = item["title_2"]

        # Choose the shorter title as the canonical form
        if len(title_1) <= len(title_2):
            canonical, variant = title_1, title_2
        else:
            canonical, variant = title_2, title_1

        direct[variant] = canonical

    # Pass 2: follow each link until reaching a title that is not a variant.
    mapping: dict[str, str] = {}
    for variant in direct:
        path: list[str] = [variant]
        current: str = direct[variant]
        while current in direct and current not in path:
            path.append(current)
            current = direct[current]

        if current in direct:
            # The walk re-entered its own path: pick one deterministic
            # representative for the whole cycle.
            cycle: list[str] = path[path.index(current) :]
            current = min(cycle, key=lambda t: (len(t), t))

        mapping[variant] = current

    return mapping


def replace_similar(title: str, mapping: dict[str, str]) -> str:
    """
    Look up the replacement for a title.

    Parameters
    ----------
    title : str
        The title to look up.
    mapping : dict[str, str]
        Mapping from titles to their replacements.

    Returns
    -------
    str
        The mapped value if ``title`` is a key of ``mapping``, otherwise
        ``title`` unchanged.
    """
    if title in mapping:
        return mapping[title]
    return title

In [56]:
similarity_df: pl.DataFrame = compute_similarity_matrix(
    unique_titles=uniq_titles, threshold=0.75
)
similarity_df

title_1,title_2,similarity
str,str,f32
"""aniku la po""","""anikulapo""",0.8182
"""spider man""","""spider-man""",0.9
"""spider man""","""spiderman""",0.9
"""spider-man""","""spiderman""",0.9


In [57]:
mapping: dict[str, str] = create_title_mapping(similarity_list=similarity_df.to_dicts())
mapping

{'aniku la po': 'anikulapo',
 'spider-man': 'spiderman',
 'spider man': 'spiderman'}

In [58]:
# Apply the replacement to the DataFrame
temp_df = temp_df.with_columns(
    title=pl.col("title").map_elements(
        lambda x: replace_similar(x, mapping), return_dtype=pl.Utf8
    )
)
temp_df

title,ratings
str,i64
"""spiderman""",5
"""spiderman""",5
"""spiderman""",4
"""anikulapo""",3
"""anikulapo""",3


#### Apply To the Data

In [53]:
df_music_cleaned.head(3)

productId,title,price,userId,profileName,helpfulness,score,time,summary,text
str,str,str,str,str,str,f32,str,str,str
"""B00008FF0U""","""Second Album""","""unknown""","""A3GTI1Z7YE49ZS""","""Tnahpellee ""Brendan""""","""0/0""",4.0,"""1307923200""","""Classic 60's stuff!""","""The Four Tops second album is formulaic; you know, vibraphone laced motown pop, with a strident bea…"
"""B0007N2G2U""","""Plavniki""","""unknown""","""A3H63KQJJIX264""","""10catz""","""0/0""",5.0,"""1137974400""","""Another great one by Dolphin.""","""I am somewhat repeating my review of another of Dophin's CD's ""Zvezda"":Dolphin, also know as Delfin…"
"""B0000007EV""","""Romeo & Juliet / Cinderella""","""unknown""","""A2LM3PDDT7EZKC""","""Tosh Ogawo""","""4/4""",5.0,"""1052352000""","""MUSIC IS FIRST, AND SO IS THE PIANO. WONDERFUL !!!""","""Mr. Chiu offered this listener arguably the most gratifying recorded music experience. He approache…"


In [59]:
# Lower-case the titles so the similarity comparison is case-insensitive
df_music_cleaned = df_music_cleaned.with_columns(
    title=pl.col("title").str.to_lowercase()
)
uniq_titles: list[str] = df_music_cleaned["title"].unique().to_list()
# NOTE(review): compute_similarity_matrix scores every pair of titles in a
# pure-Python nested loop, so the cost grows quadratically with the number
# of unique titles — confirm this is feasible on the full dataset.
similarity_df: pl.DataFrame = compute_similarity_matrix(
    unique_titles=uniq_titles, threshold=0.75
)
# Turn the similar pairs into a variant -> replacement title lookup
mapping: dict[str, str] = create_title_mapping(similarity_list=similarity_df.to_dicts())
# Apply the replacement to the DataFrame
df_music_cleaned = df_music_cleaned.with_columns(
    title=pl.col("title").map_elements(
        lambda x: replace_similar(x, mapping), return_dtype=pl.Utf8
    )
)
df_music_cleaned.head()


In [None]:
text: str = "zoo rave"
df_music_cleaned.filter(pl.col("title").str.contains(text))