In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,mlxtend,omegaconf --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
mlxtend  : 0.23.1
omegaconf: 2.3.0

conda environment: torch_p11



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")


# auto reload imports# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(500)

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from typing import Literal

In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader


# Sample data
user_item_interactions = pd.DataFrame(
    {
        "user_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3],
        "item_id": [1, 2, 4, 5, 1, 2, 3, 5, 2, 3, 4],
        "rating": [5, 4, 3, 5, 4, 2, 5, 4, 3, 5, 5],
    }
)

item_attributes = pd.DataFrame(
    {
        "item_id": [1, 2, 3, 4, 5],
        "description": [
            "Action movie",
            "Comedy movie",
            "Drama movie",
            "Romance movie",
            "Sci-Fi | Action movie",
        ],
    }
)

# user_item_interactions

In [5]:
import numpy as np
import pandas as pd


def generate_mock_data(num_users=10, num_items=30):
    """Generates mock user-item interaction and item attribute data.

    Args:
        num_users: Number of users to generate.
        num_items: Number of items to generate.

    Returns:
        A tuple of two pandas DataFrames: user-item interactions and item attributes.
    """

    # Calculate the total number of interactions
    num_interactions = num_users * num_items // 2

    # Generate user-item interactions
    np.random.seed(42)  # For reproducibility
    users = np.random.randint(1, num_users + 1, size=num_interactions)
    items = np.random.randint(1, num_items + 1, size=num_interactions)
    ratings = np.random.randint(1, 6, size=num_interactions)

    user_item_interactions = pd.DataFrame(
        {"user_id": users, "item_id": items, "rating": ratings}
    )

    # Generate item attributes
    item_ids = np.arange(1, num_items + 1)

    # Expanded genres
    genres = ["Action", "Comedy", "Drama", "Romance", "Sci-Fi"]
    combined_genres = ["Action-Comedy", "Action-Sci-Fi", "Comedy-Drama", "Sci-Fi-Drama"]

    # Calculate how many items can be evenly distributed across all genres
    num_basic_genres = len(genres)
    num_combined_genres = len(combined_genres)
    total_genres = num_basic_genres + num_combined_genres

    # Ensure the description list matches the number of items
    descriptions = []

    # First add a fixed number of each genre
    for i in range(num_items):
        if i < num_basic_genres * (num_items // total_genres):
            genre = genres[i % num_basic_genres]
        else:
            genre = combined_genres[i % num_combined_genres]
        descriptions.append(f"{genre} movie")

    # Adjust to the exact number of items, if there's a remainder
    while len(descriptions) < num_items:
        descriptions.append(f"{np.random.choice(genres + combined_genres)} movie")

    # Shuffle descriptions to randomize genre assignment
    np.random.shuffle(descriptions)

    item_attributes = pd.DataFrame({"item_id": item_ids, "description": descriptions})

    return user_item_interactions, item_attributes


# Example usage
user_item_interactions, item_attributes = generate_mock_data(10, 30)
print(f"{user_item_interactions.shape = }")
user_item_interactions.head()

user_item_interactions.shape = (150, 3)


Unnamed: 0,user_id,item_id,rating
0,7,3,2
1,4,7,2
2,8,6,2
3,5,8,3
4,7,27,5


In [6]:
user_item_interactions["user_id"].nunique()

10

In [7]:
print(f"{item_attributes.shape=}")
item_attributes.head(10)

item_attributes.shape=(30, 2)


Unnamed: 0,item_id,description
0,1,Action movie
1,2,Comedy-Drama movie
2,3,Action-Comedy movie
3,4,Action movie
4,5,Action-Comedy movie
5,6,Sci-Fi movie
6,7,Romance movie
7,8,Action-Sci-Fi movie
8,9,Drama movie
9,10,Sci-Fi-Drama movie


In [8]:
# Content-Based Filtering
tfidf: TfidfVectorizer = TfidfVectorizer(stop_words="english")
item_tfidf_matrix: csr_matrix = tfidf.fit_transform(item_attributes["description"])

In [9]:
# Collaborative Filtering
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    user_item_interactions[["user_id", "item_id", "rating"]], reader
)
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1493c3ed0>

In [10]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Trainset, AlgoBase


def content_based_recommendations(
    user_profile: str, item_tfidf_matrix: np.ndarray, top_n: int = 5
) -> np.ndarray:
    """
    Generate content-based recommendations for a user.

    Parameters
    ----------
    user_profile : str
        The user's profile text.
    item_tfidf_matrix : np.ndarray, shape (n_items, n_features)
        TF-IDF matrix of all items.
    top_n : int, optional
        Number of recommendations to return (default is 5).

    Returns
    -------
    np.ndarray
        Indices of top N recommended items.
    """
    user_profile_vector = tfidf.transform([user_profile])
    cosine_sim = cosine_similarity(user_profile_vector, item_tfidf_matrix)
    # print(f"{cosine_sim = }")
    return cosine_sim.argsort()[0][::-1][:top_n]


def collaborative_recommendations(
    user_id: int, algo: AlgoBase, top_n: int = 5
) -> list[int]:
    """
    Generate collaborative filtering recommendations for a user.

    Parameters
    ----------
    user_id : int
        The ID of the user.
    algo : AlgoBase
        The collaborative filtering algorithm.
    top_n : int, optional
        Number of recommendations to return (default is 5).

    Returns
    -------
    list[int]
        List of top N recommended item IDs.
    """
    user_rated_items: set[int] = {
        item_id for item_id, _ in trainset.ur[trainset.to_inner_uid(user_id)]
    }
    all_items: set[int] = set(trainset.all_items())
    items_to_predict: set[int] = all_items - user_rated_items

    predictions: list[float] = [
        algo.predict(user_id, trainset.to_raw_iid(item)).est
        for item in items_to_predict
    ]
    item_predictions: list[tuple[int, float]] = list(zip(items_to_predict, predictions))

    top_predictions: list[tuple[int, float]] = sorted(
        item_predictions, key=lambda x: x[1], reverse=True
    )[:top_n]
    return [trainset.to_raw_iid(item[0]) for item in top_predictions]


def hybrid_recommendations(
    user_id: int,
    user_profile: str,
    item_tfidf_matrix: np.ndarray,
    algo: AlgoBase,
    w1: float = 0.5,
    w2: float = 0.5,
    top_n: int = 5,
) -> list[int]:
    """
    Generate hybrid recommendations combining content-based and collaborative filtering.

    Parameters
    ----------
    user_id : int
        The ID of the user.
    user_profile : str
        The user's profile text.
    item_tfidf_matrix : np.ndarray, shape (n_items, n_features)
        TF-IDF matrix of all items.
    algo : AlgoBase
        The collaborative filtering algorithm.
    w1 : float, optional
        Weight for content-based recommendations (default is 0.5).
    w2 : float, optional
        Weight for collaborative recommendations (default is 0.5).
    top_n : int, optional
        Number of recommendations to return (default is 5).

    Returns
    -------
    list[int]
        List of top N recommended item IDs.
    """
    user_profile_vector = tfidf.transform([user_profile])
    content_recs: np.ndarray = content_based_recommendations(
        user_profile, item_tfidf_matrix, top_n
    )
    collaborative_recs: list[int] = collaborative_recommendations(user_id, algo, top_n)

    combined_scores: dict[int, float] = {}
    for item in content_recs:
        combined_scores[item] = (
            combined_scores.get(item, 0)
            + w1 * cosine_similarity(user_profile_vector, item_tfidf_matrix[item])[0][0]
        )
    for item in collaborative_recs:
        combined_scores[item] = (
            combined_scores.get(item, 0) + w2 * algo.predict(user_id, item).est
        )

    return sorted(combined_scores, key=combined_scores.get, reverse=True)[:top_n]

In [11]:
# Example usage
items_dict: dict[str, Any] = item_attributes.to_dict(orient="records")
user_profile: str = "Action movie"

idx: list[int] = content_based_recommendations(
    user_profile, item_tfidf_matrix, top_n=10
).tolist()
sorted_items: list[dict[str, Any]] = [items_dict[i] for i in idx]
sorted_items

[{'item_id': 1, 'description': 'Action movie'},
 {'item_id': 4, 'description': 'Action movie'},
 {'item_id': 21, 'description': 'Action movie'},
 {'item_id': 24, 'description': 'Action-Comedy movie'},
 {'item_id': 3, 'description': 'Action-Comedy movie'},
 {'item_id': 5, 'description': 'Action-Comedy movie'},
 {'item_id': 15, 'description': 'Action-Comedy movie'},
 {'item_id': 27, 'description': 'Action-Sci-Fi movie'},
 {'item_id': 20, 'description': 'Action-Sci-Fi movie'},
 {'item_id': 8, 'description': 'Action-Sci-Fi movie'}]

In [12]:
df: pd.DataFrame = pd.merge(user_item_interactions, item_attributes, on="item_id")

In [13]:
# Example usage
items_dict: dict[str, Any] = item_attributes.to_dict(orient="records")
user_profile = "Action movie"
user_id: int = 3

idx: list[int] = collaborative_recommendations(user_id, algo=algo, top_n=5)
print(idx)
sorted_items: list[dict[str, Any]] = [items_dict[i - 1] for i in idx]
sorted_items

[23, 3, 10, 18, 4]


[{'item_id': 23, 'description': 'Comedy movie'},
 {'item_id': 3, 'description': 'Action-Comedy movie'},
 {'item_id': 10, 'description': 'Sci-Fi-Drama movie'},
 {'item_id': 18, 'description': 'Sci-Fi-Drama movie'},
 {'item_id': 4, 'description': 'Action movie'}]

In [14]:
df.loc[df["item_id"].isin([23, 3, 10, 18, 4])].drop_duplicates(subset=["item_id"])

Unnamed: 0,user_id,item_id,rating,description
0,7,3,2,Action-Comedy movie
10,4,10,5,Sci-Fi-Drama movie
40,9,23,4,Comedy movie
52,4,4,1,Action movie
91,1,18,5,Sci-Fi-Drama movie


In [15]:
df.loc[df["user_id"] == 3]["description"].value_counts()

description
Action-Sci-Fi movie    6
Sci-Fi movie           2
Comedy-Drama movie     2
Action movie           2
Action-Comedy movie    1
Romance movie          1
Drama movie            1
Name: count, dtype: int64

In [16]:
# Example usage
user_profile: str = "Action movie"
user_id: int = 3
recommendations = hybrid_recommendations(user_id, user_profile, item_tfidf_matrix, algo)
sorted_items: list[dict[str, Any]] = [items_dict[i - 1] for i in recommendations]
pprint(sorted_items)

[{'description': 'Action-Comedy movie', 'item_id': 3},
 {'description': 'Comedy movie', 'item_id': 23},
 {'description': 'Sci-Fi-Drama movie', 'item_id': 10},
 {'description': 'Sci-Fi-Drama movie', 'item_id': 18},
 {'description': 'Action movie', 'item_id': 4}]
