# MovieLens Recommendation System

Build a collaborative filtering recommendation system for movies.

**Dataset:** [https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset](https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset)  
**Columns:** `userId`, `movieId`, `rating` (the ratings file also contains a `timestamp` column, which this notebook does not use)  
**Type:** Collaborative Filtering

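> **TODO:** Download the dataset, place the ratings CSV in `../../data/raw/`, then update `DATA_PATH` below so it points at that file (the Kaggle archive names it `rating.csv`, while the default path below expects `ratings.csv`).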

In [None]:
# Data handling and numerics
import pandas as pd
import numpy as np
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Similarity computation and train/test splitting
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
# Use seaborn's 'whitegrid' theme for the plots below
sns.set_theme(style='whitegrid')

## 1. Load Data

In [None]:
# Location of the ratings file and the column names every later cell relies on
DATA_PATH = "../../data/raw/ratings.csv"
USER_COL, ITEM_COL, RATING_COL = "userId", "movieId", "rating"

df = pd.read_csv(DATA_PATH)

# Keep at most 100k randomly chosen ratings (fixed seed) so the rest of the
# notebook runs quickly; the name `df` is rebound so the full table is released.
sample_size = min(100_000, len(df))
df = df.sample(sample_size, random_state=42).reset_index(drop=True)

print(f'Shape: {df.shape}')
print(f'Users: {df[USER_COL].nunique()}, Items: {df[ITEM_COL].nunique()}')
df.head()

## 2. EDA

In [None]:
# How often each rating value occurs, ordered by rating value
rating_counts = df[RATING_COL].value_counts().sort_index()
rating_counts.plot(kind='bar', figsize=(8, 4))
plt.title('Rating Distribution')
plt.xlabel('Rating')
plt.tight_layout()
plt.show()

# Number of ratings each user gave and each item received
user_groups = df.groupby(USER_COL)
item_groups = df.groupby(ITEM_COL)
ratings_per_user = user_groups.size()
ratings_per_item = item_groups.size()
print(f'Ratings per user: mean={ratings_per_user.mean():.1f}, median={ratings_per_user.median():.1f}')
print(f'Ratings per item: mean={ratings_per_item.mean():.1f}, median={ratings_per_item.median():.1f}')

## 3. Train / Test Split

In [None]:
# Random 80/20 hold-out over individual ratings, seeded for reproducibility
split_params = dict(test_size=0.2, random_state=42)
train_df, test_df = train_test_split(df, **split_params)
print(f'Train: {len(train_df)}, Test: {len(test_df)}')

## 4. Popularity Baseline

In [None]:
# Popularity baseline: recommend the same top-K items to everyone.
# "Popular" means most often rated in the training split. Ranking by mean
# rating alone would put items with a single 5-star rating at the top, so the
# mean is only used to break ties between items with the same rating count.
K = 10
item_stats = train_df.groupby(ITEM_COL)[RATING_COL].agg(['count', 'mean'])
popular_items = (
    item_stats.sort_values(['count', 'mean'], ascending=False)
    .head(K)
    .index.tolist()
)
print(f'Top-{K} popular items: {popular_items[:5]}...')

## 5. User-Item Matrix

In [None]:
# Build the user-item rating matrix from the training split only.
# The matrix may be large, so it is restricted to users and items with a
# minimum number of ratings.
min_ratings_user = 5
min_ratings_item = 5

# Count activity on train_df itself: counts taken from the full data would let
# held-out test ratings decide which users/items enter the training matrix.
train_ratings_per_user = train_df.groupby(USER_COL).size()
train_ratings_per_item = train_df.groupby(ITEM_COL).size()
active_users = train_ratings_per_user[train_ratings_per_user >= min_ratings_user].index
popular_items_all = train_ratings_per_item[train_ratings_per_item >= min_ratings_item].index
filtered = train_df[
    train_df[USER_COL].isin(active_users) & train_df[ITEM_COL].isin(popular_items_all)
]

# Rows are users, columns are items; missing ratings are stored as 0.
user_item_matrix = filtered.pivot_table(
    index=USER_COL, columns=ITEM_COL, values=RATING_COL, fill_value=0
)

# Sparsity = share of (user, item) cells without a rating. Duplicate pairs are
# collapsed into one cell by pivot_table, so they are counted once here too,
# and an empty matrix yields NaN instead of a ZeroDivisionError.
n_cells = user_item_matrix.shape[0] * user_item_matrix.shape[1]
n_observed = len(filtered[[USER_COL, ITEM_COL]].drop_duplicates())
sparsity = 1 - n_observed / n_cells if n_cells else float('nan')
print(f'Matrix shape: {user_item_matrix.shape}')
print(f'Sparsity: {sparsity:.1%}')

## 6. User-Based Collaborative Filtering

In [None]:
# Pairwise cosine similarity between the rows (users) of the rating matrix,
# wrapped in a DataFrame so it can be looked up by user id on both axes
user_ids = user_item_matrix.index
user_sim = cosine_similarity(user_item_matrix)
user_sim_df = pd.DataFrame(data=user_sim, index=user_ids, columns=user_ids)

def recommend_user_based(user_id, n=10, n_neighbors=20):
    """Recommend up to ``n`` unseen items using the most similar users.

    Reads the module-level ``user_sim_df``, ``user_item_matrix`` and
    ``popular_items``.

    Args:
        user_id: Label of the user in ``user_sim_df`` / ``user_item_matrix``.
        n: Maximum number of items to return.
        n_neighbors: Number of most similar users whose ratings are averaged.

    Returns:
        A list of item labels ordered by descending mean neighbour rating.
        For a user missing from ``user_sim_df`` the first ``n`` entries of
        ``popular_items`` are returned instead.
    """
    if user_id not in user_sim_df.index:
        return popular_items[:n]  # cold-start fallback

    # Remove the user by label. Skipping position 0 of the sorted similarities
    # is not reliable: a tie or floating-point rounding can rank another user
    # first, which would keep the user in their own neighbourhood.
    similarities = user_sim_df[user_id].drop(index=user_id)
    similar_users = similarities.nlargest(n_neighbors).index

    # Items the user already rated (stored as values > 0) are not recommended.
    user_ratings = user_item_matrix.loc[user_id]
    seen = user_ratings[user_ratings > 0].index

    # Score each item by the neighbours' mean rating (unrated cells count as 0).
    scores = user_item_matrix.loc[similar_users].mean().drop(index=seen, errors='ignore')
    return scores.nlargest(n).index.tolist()

# Smoke test: recommendations for the first user in the matrix
sample_user = user_item_matrix.index[0]
recs = recommend_user_based(user_id=sample_user)
print(f'Recommendations for user {sample_user}: {recs}')

## 7. Item-Based Collaborative Filtering

In [None]:
# Transposing puts items on the rows, so the cosine similarities computed here
# are item-to-item; the DataFrame is labelled by item id on both axes
item_ids = user_item_matrix.columns
item_sim = cosine_similarity(user_item_matrix.T)
item_sim_df = pd.DataFrame(data=item_sim, index=item_ids, columns=item_ids)

def recommend_item_based(user_id, n=10):
    """Recommend up to ``n`` unseen items similar to those the user rated.

    Reads the module-level ``user_item_matrix``, ``item_sim_df`` and
    ``popular_items``. Each candidate item is scored by its mean similarity to
    the items the user has rated (matrix value > 0); already-rated items are
    excluded. A user missing from ``user_item_matrix`` gets the first ``n``
    entries of ``popular_items``.
    """
    is_known_user = user_id in user_item_matrix.index
    if not is_known_user:
        return popular_items[:n]

    user_row = user_item_matrix.loc[user_id]
    liked = user_row.index[(user_row > 0).to_numpy()]

    # Average each item's similarity to every item the user has rated
    mean_similarity = item_sim_df.loc[:, liked].mean(axis=1)
    candidates = mean_similarity.drop(index=liked, errors='ignore')

    top = candidates.nlargest(n)
    return top.index.tolist()

# Item-based recommendations for the sample user chosen earlier in the notebook
recs_item = recommend_item_based(user_id=sample_user)
print(f'Item-based recs for user {sample_user}: {recs_item}')

## 8. Offline Evaluation (Precision@K)

In [None]:
def precision_at_k(recommend_fn, test_df, k=10, n_users=200):
    """Mean Precision@k of a recommender over the first users in ``test_df``.

    Every item a user has in ``test_df`` counts as relevant, whatever its
    rating. Reads the module-level ``USER_COL`` and ``ITEM_COL``.

    Args:
        recommend_fn: Callable invoked as ``recommend_fn(user_id, n=k)`` that
            returns an iterable of item labels.
        test_df: Held-out ratings with ``USER_COL`` and ``ITEM_COL`` columns.
        k: Number of recommendations requested per user; also the denominator.
        n_users: Evaluate only the first ``n_users`` distinct users, in order
            of first appearance in ``test_df``.

    Returns:
        The average over evaluated users of ``hits / k`` as a float, or 0.0
        when no user could be evaluated.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f'k must be a positive integer, got {k}')

    test_users = test_df[USER_COL].unique()[:n_users]

    # Group the held-out items once instead of rescanning test_df per user.
    evaluated = test_df[test_df[USER_COL].isin(test_users)]
    relevant_by_user = {
        uid: set(items)
        for uid, items in evaluated.groupby(USER_COL, sort=False)[ITEM_COL]
    }

    precisions = []
    for uid in test_users:
        relevant = relevant_by_user.get(uid)
        if not relevant:
            continue
        recs = set(recommend_fn(uid, n=k))
        precisions.append(len(recs & relevant) / k)
    return float(np.mean(precisions)) if precisions else 0.0

# Score both recommenders on the held-out ratings with the default settings
p_user, p_item = (
    precision_at_k(recommender, test_df)
    for recommender in (recommend_user_based, recommend_item_based)
)
print(f'User-based Precision@10: {p_user:.4f}')
print(f'Item-based Precision@10: {p_item:.4f}')

## 9. Conclusion

| Method | Precision@10 |
|---|---|
| Popularity Baseline | — |
| User-Based CF | *(fill)* |
| Item-Based CF | *(fill)* |

**Observations:**
- 

**Next steps:**
- Try matrix factorization (SVD via `surprise` library)
- Add content-based features (genres, tags)
- Address cold-start with hybrid approach