# Recommendation Systems

- **`Content-Based Recommendation`**
  - Recommends items based on their characteristics and a user's preferences for those characteristics.
  - For example, a movie recommender might suggest films with similar genres or actors to ones a user has liked in the past.

- **`Collaborative Filtering`**
  - Recommends items based on the preferences of similar users.
  - It doesn't require knowledge of the items themselves, just information about user interactions.
  - For example, a music streaming service might recommend songs that other users with similar tastes have enjoyed.

In [1]:
# Print the interpreter version, the listed package versions and the conda environment name.
%load_ext watermark
%watermark -v -p numpy,pandas,polars,mlxtend,omegaconf --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
mlxtend  : 0.23.1
omegaconf: 2.3.0

conda environment: torch_p11



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")


# auto reload imports# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
from tqdm import tqdm
from typing import Generator


def process_text(text: list[str]) -> list[str]:
    """
    Pair every category line with the most recently seen product ID.

    Parameters
    ----------
    text : list[str]
        Raw lines to scan. Each line is stripped before it is inspected.

    Returns
    -------
    list[str]
        One ``"<product id>: <category line>"`` string per category line,
        in the order the category lines were encountered.

    Notes
    -----
    A *category line* is any line that begins with an uppercase letter
    followed by at least one lowercase letter. Any other line that contains
    no comma is taken as the current product ID. Remaining lines are ignored.
    """
    # Starts with a capital letter and then lowercase letters
    category_start: re.Pattern[str] = re.compile(r"^[A-Z][a-z]+")
    labelled: list[str] = []
    current_id: str = ""

    for raw_line in tqdm(text, desc="Processing Text", unit="line", ncols=100):
        stripped: str = raw_line.strip()

        if category_start.match(stripped) is not None:
            labelled.append(f"{current_id}: {stripped}")
            continue

        # Not a category line: a comma-free line becomes the active product ID.
        if "," not in stripped:
            current_id = stripped

    return labelled


def parse_keys_values(filename: str) -> Generator[dict[str, str], None, None]:
    """
    Parse key-value pairs from a file.

    The file is read in binary mode, one line at a time. Every line that
    contains a colon contributes one ``key: value`` pair to the current entry;
    every line without a colon ends the current entry.

    Parameters
    ----------
    filename : str
        The path to the file to be parsed.

    Yields
    ------
    dict[str, str]
        A dictionary containing the key-value pairs of one entry. Keys and
        values are decoded as latin-1. The entry accumulated when the file
        ends is always yielded, so the last dictionary may be empty.

    Notes
    -----
    The file is opened inside a ``with`` block, so the handle is released once
    the generator is exhausted, closed, or garbage-collected.
    """
    entry: dict[str, str] = {}

    with open(filename, "rb") as handle:
        # Iterate over all lines in the file.
        for raw_line in handle:
            raw_line = raw_line.strip()
            # The key/value pairs are separated by the first colon on the line.
            colon_pos: int = raw_line.find(b":")
            if colon_pos == -1:
                # No colon: this line terminates the current entry.
                yield entry
                entry = {}
                continue
            key: str = raw_line[:colon_pos].decode("latin-1")
            # Skip the colon and the single character that follows it.
            value: str = raw_line[colon_pos + 2 :].decode("latin-1")
            entry[key] = value

    yield entry


def read_reviews(path: str, num: int = -1) -> pl.DataFrame:
    """
    Load parsed review entries from a file into a Polars DataFrame.

    Parameters
    ----------
    path : str
        Location of the review file; it is handed to ``parse_keys_values``.
    num : int, optional
        Stop after this many entries have been collected. With the default
        of -1 the running count never matches, so every entry is read.

    Returns
    -------
    pl.DataFrame
        A DataFrame built from the collected entry dictionaries, one per row.

    """
    collected: list[dict[str, str]] = []

    # Count from 1 so the comparison happens after the entry has been stored.
    for count, record in enumerate(parse_keys_values(path), start=1):
        collected.append(record)
        if count == num:
            break

    return pl.DataFrame(collected)

In [4]:
# Load the product-to-category table from parquet and preview its first rows.
fp: str = "../../data/prod_categories.parquet"

prod_categories: pl.DataFrame = pl.read_parquet(fp)
print(f"{prod_categories.shape = }")
prod_categories.head(3)

prod_categories.shape = (2437878, 2)


product_id,product_category
str,str
"""B00005AL88""","""Cookware, Cookware Sets, Kitchen & Dining, Home & Kitchen"""
"""B000002ERS""","""Music, World Music Music, Latin Music"""
"""B000PR126O""","""Jewelry, Charms"""


In [5]:
# Limit displayed string values to 100 characters (replaces the 1_000 set during setup)
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [6]:
# Load the music reviews from parquet and preview the first rows.
fp: str = "../../data/Music_small.parquet"
df_music: pl.DataFrame = pl.read_parquet(fp)
print(f"{df_music.shape = }")
df_music.head(2)

df_music.shape = (6396351, 10)


productId,title,price,userId,profileName,helpfulness,score,time,summary,text
str,str,str,str,str,str,f32,str,str,str
"""B00002066I""","""ah""","""15.99""","""unknown""","""unknown""","""3/4""",5.0,"""939772800""","""Inspiring""","""I hope a lot of people hear this cd. We need more strong and positive vibes like this. Great vocals…"
"""B00002066I""","""ah""","""15.99""","""A2KLYVAS0MIBMQ""","""Stephen McClaning""","""0/0""",5.0,"""1332288000""","""Great CD""","""My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS a…"


In [7]:
def levenshtein_distance(s1: str, s2: str) -> int:
    """
    Return the edit distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.

    Parameters
    ----------
    s1 : str
        The first string.
    s2 : str
        The second string.

    Returns
    -------
    int
        The Levenshtein distance between s1 and s2.

    Example
    -------
    >>> levenshtein_distance("kitten", "sitting")
    3
    """
    # Order the inputs so the rows below are sized by the shorter string.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if not shorter:
        return len(longer)

    # prev[k] holds the distance between the processed prefix of `longer`
    # and the first k characters of `shorter`.
    prev: list[int] = list(range(len(shorter) + 1))

    for row, ch_long in enumerate(longer, start=1):
        curr: list[int] = [row]
        for col, ch_short in enumerate(shorter, start=1):
            best: int = min(
                prev[col] + 1,
                curr[col - 1] + 1,
                prev[col - 1] + (ch_long != ch_short),
            )
            curr.append(best)
        prev = curr

    return prev[-1]


def compute_string_similarity(s1: str, s2: str) -> float:
    """
    Compute the similarity between two strings based on Levenshtein distance.

    The score is ``1 - distance / max(len(s1), len(s2))``, rounded to four
    decimal places.

    Parameters
    ----------
    s1 : str
        The first string.
    s2 : str
        The second string.

    Returns
    -------
    float
        A similarity score between 0 and 1, where 1 indicates identical strings.
        Two empty strings are identical, so they score 1.0.
    """
    max_length: int = max(len(s1), len(s2))
    # Both strings empty: return early to avoid dividing by zero below.
    if max_length == 0:
        return 1.0

    distance: int = levenshtein_distance(s1, s2)
    similarity: float = 1 - (distance / max_length)
    return round(similarity, 4)

In [None]:
# Sanity check: "friend" -> "fiend" is a single deletion, so the distance
# should be 1 and the similarity 1 - 1/6 = 0.8333.
s1, s2 = "friend", "fiend"
distance: int = levenshtein_distance(s1=s1, s2=s2)
print(distance)

similarity: float = compute_string_similarity(s1=s1, s2=s2)
print(similarity)

### Data Cleaning

In [8]:
# Preview the raw reviews before any cleaning.
df_music.head()

productId,title,price,userId,profileName,helpfulness,score,time,summary,text
str,str,str,str,str,str,f32,str,str,str
"""B00002066I""","""ah""","""15.99""","""unknown""","""unknown""","""3/4""",5.0,"""939772800""","""Inspiring""","""I hope a lot of people hear this cd. We need more strong and positive vibes like this. Great vocals…"
"""B00002066I""","""ah""","""15.99""","""A2KLYVAS0MIBMQ""","""Stephen McClaning""","""0/0""",5.0,"""1332288000""","""Great CD""","""My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS a…"
"""B000058A81""","""Chrono Cross: Original Soundtrack""","""unknown""","""A18C9SNLZWVBIE""","""A reader""","""1/1""",5.0,"""1096934400""","""First album I've bought since Napster""","""We've come a long way since the days of Ninetendo synthesized music! I say without exaggeration tha…"
"""B000058A81""","""Chrono Cross: Original Soundtrack""","""unknown""","""A38QSOKE2DD8JD""","""Christopher Walden""","""1/1""",5.0,"""1088121600""","""Pleasant to the ear, musical masterpiece""","""Final fantasy fans may be at first skeptical of Chrono Cross's composition and production under unk…"
"""B000058A81""","""Chrono Cross: Original Soundtrack""","""unknown""","""AKZLIIH3AP4RU""","""IcemanJ""","""1/1""",5.0,"""1075939200""","""Much more than a game Soundtrack.""","""This has got to be one of the best video game soundtracks ever. I've actually never really played t…"


In [11]:
# Check for unique product and userId entries
p_id: str = "B000058A81"
user_id: str = "A18C9SNLZWVBIE"
# A single returned row means this (productId, userId) pair occurs only once.
df_music.filter(pl.col("productId").eq(p_id) & pl.col("userId").eq(user_id))

productId,title,price,userId,profileName,helpfulness,score,time,summary,text
str,str,str,str,str,str,f32,str,str,str
"""B000058A81""","""Chrono Cross: Original Soundtrack""","""unknown""","""A18C9SNLZWVBIE""","""A reader""","""1/1""",5.0,"""1096934400""","""First album I've bought since Napster""","""We've come a long way since the days of Ninetendo synthesized music! I say without exaggeration tha…"


In [12]:
# We want a:
# unique user-product pair.
# unique user-title pair.
# unique product-title pair.
# NOTE(review): `unique` is called without `keep` / `maintain_order`, so which of
# the duplicate rows survives is presumably unspecified — confirm against the
# polars docs if the cleaned row set needs to be reproducible.
# NOTE(review): the last pass leaves one row per (productId, title) pair, i.e. at
# most one review for each product/title — confirm this is the intended outcome.

print(f"[Before data cleaning]: {df_music.shape}")

# Remove duplicates
df_music_cleaned: pl.DataFrame = df_music.unique(subset=["userId", "productId"])
df_music_cleaned = df_music_cleaned.unique(subset=["userId", "title"])
df_music_cleaned = df_music_cleaned.unique(subset=["productId", "title"])

print(f"[After data cleaning]: {df_music_cleaned.shape}")

[Before data cleaning]: (6396351, 10)
[After data cleaning]: (522432, 10)


In [13]:
# Preview the reviews that remain after de-duplication.
df_music_cleaned.head()

productId,title,price,userId,profileName,helpfulness,score,time,summary,text
str,str,str,str,str,str,f32,str,str,str
"""B000N6WMV6""","""Kiss Me""","""unknown""","""A1XP7P3X9MRE5H""","""Morten Vindberg""","""0/0""",5.0,"""1223424000""","""A Pop-Classic""","""""Kiss Me"" has by now more or less become a pop-classic. Featured in more than one film-score the so…"
"""B0000C6IGU""","""Sympathy for the Devil""","""unknown""","""A34KLP6T24KKAZ""","""Jerry Dunham""","""0/0""",5.0,"""1347321600""","""Oh! Yeah! Please let me introduce myself!""","""Stones reign, Fat Boys reign. Absolutely mischievious. I grinned ear to ear through the whole thing…"
"""B00008RH2S""","""Play to Kill""","""15.73""","""unknown""","""unknown""","""0/0""",5.0,"""1065744000""","""Great rock from the Northwest""","""The northwest has had some amazing bands come out of it in the last few years or so, and the Jet Ci…"
"""B00000EMNK""","""Who Do You Love""","""unknown""","""ANEDLXQSCJT3O""","""JC""","""0/0""",5.0,"""1208736000""","""Rare 1987 Studio Concept Album""","""1) Who Do You Love?2) Dare to love3) All the World Should Know (Duet w/ Kathy Troccoli)4) It Doesn'…"
"""B00005NV29""","""Junto a Ti""","""unknown""","""AG5ND5SXXL0AV""","""Blue Mendolence ""Blue""""","""0/0""",5.0,"""1358640000""","""A rich, musically complex voice""","""I first heard Alessandro Safina in his duet with Sarah Brightman on her DVD ""Symphony"", where she p…"


In [17]:
# Count rows per userId, most frequent first, and show the top entries
df_music_cleaned["userId"].value_counts(sort=True).head()

userId,count
str,u32
"""unknown""",49291
"""A9Q28YTLYREO7""",1316
"""A2WQY1B8ZS7QRZ""",1229
"""A2AIMXT9PLAM12""",1196
"""A3CN9CCJUNIPKT""",854


In [18]:
# Drop the unwanted userIDs: keep only rows whose userId differs from `rem_str`
rem_str: str = "unknown"
print(f"[Before data cleaning]: {df_music_cleaned.shape}")
df_music_cleaned = df_music_cleaned.filter(pl.col("userId").ne(rem_str))
print(f"[After data cleaning]: {df_music_cleaned.shape}")

[Before data cleaning]: (522432, 10)
[After data cleaning]: (473140, 10)
