In [2]:
# Use this cell when you modify the recsys package and need to reload it
import importlib
import sys

# Reload the recsys modules
modules_to_reload = []
for module_name in sys.modules.keys():
    if module_name.startswith('recsys'):
        modules_to_reload.append(module_name)

if modules_to_reload:
    # Reload in reverse order to handle dependencies
    for module_name in reversed(sorted(modules_to_reload)):
        importlib.reload(sys.modules[module_name])
    print(f"Reloaded modules: {modules_to_reload}")
else:
    print("No recsys modules found to reload")

No recsys modules found to reload


# Collaborative Filtering using Cosine similarity


# Importing required packages
We start off by defining our root project entry point

In [3]:
import sys
from pathlib import Path
project_root = Path("..").resolve()
if str(project_root / "src") not in sys.path:
    sys.path.append(str(project_root / "src"))

This next code snippet unzips the .zip file in "data/raw" and saves it in "data/processed" folders respectively

In [4]:
from recsys.utils import unzip_file
zip_path = project_root / "data" / "raw" / "ml-latest.zip"
destination = project_root / "data" / "processed"

unzip_file(zip_path, destination, overwrite=True)

PosixPath('/Users/tonyli/Documents/Projects/craftyverse/craftyverse-recsys/data/processed')

In [5]:

from recsys.utils import read_csv
csv_path = project_root / "data" /"processed" / "ml-latest" / "ratings.csv"

ratings_df = read_csv(csv_path)
print(ratings_df.head())



   userId  movieId  rating   timestamp
0       1        1     4.0  1225734739
1       1      110     4.0  1225865086
2       1      158     4.0  1225733503
3       1      260     4.5  1225735204
4       1      356     5.0  1225735119


In [6]:
# Let's examine the data size first
print(f"Dataset shape: {ratings_df.shape}")
print(f"Number of unique users: {ratings_df['userId'].nunique()}")
print(f"Number of unique movies: {ratings_df['movieId'].nunique()}")
print(f"Potential matrix size: {ratings_df['userId'].nunique() * ratings_df['movieId'].nunique():,} cells")
print(f"Data sparsity: {len(ratings_df) / (ratings_df['userId'].nunique() * ratings_df['movieId'].nunique()) * 100:.2f}%")

Dataset shape: (33832162, 4)
Number of unique users: 330975
Number of unique movies: 83239
Potential matrix size: 27,550,028,025 cells
Data sparsity: 0.12%


In [7]:
# Create a smaller subset for testing - top N users and movies by rating count
top_n = 22000  # Use the first 12000 users and movies as training subset

# Get top users by number of ratings
top_users = ratings_df['userId'].value_counts().head(top_n).index
# Get top movies by number of ratings  
top_movies = ratings_df['movieId'].value_counts().head(top_n).index

# Filter the dataset to only include top users and movies
ratings_subset = ratings_df[
    (ratings_df['userId'].isin(top_users)) & 
    (ratings_df['movieId'].isin(top_movies))
].copy()

print(f"Original dataset: {ratings_df.shape[0]:,} ratings")
print(f"Subset dataset: {ratings_subset.shape[0]:,} ratings")
print(f"Subset users: {ratings_subset['userId'].nunique()}")
print(f"Subset movies: {ratings_subset['movieId'].nunique()}")
print(f"Subset matrix size: {ratings_subset['userId'].nunique() * ratings_subset['movieId'].nunique():,} cells")

Original dataset: 33,832,162 ratings
Subset dataset: 15,600,006 ratings
Subset users: 22000
Subset movies: 22000
Subset matrix size: 484,000,000 cells


## Create user-item ratings matrix
Pivot ratings so rows are movieId and columns are userId.

In [8]:
# Create user-item ratings matrix using the subset
ratings_matrix = ratings_subset.pivot_table(
    index='userId',
    columns='movieId',
    values='rating',
    fill_value=0
)
ratings_matrix = ratings_matrix.sort_index().sort_index(axis=1)
print(f"Ratings matrix shape: {ratings_matrix.shape}")
print(f"Memory usage: {ratings_matrix.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print("\nFirst 5x5 sample:")
print(ratings_matrix.iloc[:5, :5])

Ratings matrix shape: (22000, 22000)
Memory usage: 3692.79 MB

First 5x5 sample:
movieId    1    2    3    4    5
userId                          
22       0.0  0.0  0.0  0.0  0.0
24       4.5  0.0  0.0  0.0  0.0
30       0.0  0.0  0.0  0.0  0.0
44       3.5  0.0  0.0  0.0  0.0
53       3.5  0.0  0.0  0.0  0.0


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

user_similarity = cosine_similarity(ratings_matrix)
userIds = ratings_matrix.index.tolist()

user_similarity_df = pd.DataFrame(user_similarity,
                                  index=userIds,
                                  columns=userIds)

print(user_similarity_df)

In [23]:
from recsys.recommendation import CF_cosine_recommender
recommended_items = CF_cosine_recommender(149, user_similarity_df, ratings_matrix)
print(recommended_items)

[2571, 296, 318, 260, 1198]
