In [2]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -q ml-100k.zip -d ./data


--2025-09-20 18:09:33--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.96.204
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.grouplens.org/datasets/movielens/ml-100k.zip [following]
--2025-09-20 18:09:33--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2025-09-20 18:09:33 (21.9 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [5]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -q ml-100k.zip -d ./data
!ls ./data/ml-100k | head


--2025-09-20 18:10:47--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.96.204
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.grouplens.org/datasets/movielens/ml-100k.zip [following]
--2025-09-20 18:10:47--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip.1’


2025-09-20 18:10:47 (20.5 MB/s) - ‘ml-100k.zip.1’ saved [4924029/4924029]

replace ./data/ml-100k/allbut.pl? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace ./data/ml-100k/mku.sh? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace ./data/ml-100k/README? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace ./data/ml-100k/u.data? [y

In [6]:
DATA_DIR = "./data/ml-100k"


In [8]:
!wget -nc https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -qo ml-100k.zip -d ./data


File ‘ml-100k.zip’ already there; not retrieving.



In [9]:
!unzip -qo ml-100k.zip -d ./data


In [10]:
!ls ./data/ml-100k | head


allbut.pl
mku.sh
README
u1.base
u1.test
u2.base
u2.test
u3.base
u3.test
u4.base


In [11]:
"""
Movie Recommendation System (User-based CF) — Precision@K

This script implements the assignment requirements:
- Build a recommender based on **user similarity** using a **user–item matrix**.
- Recommend **top-rated unseen** movies for a given user.
- **Evaluate** performance using **precision@K**.

Extras included (optional):
- Item-based CF recommender
- Matrix factorization (Truncated SVD) baseline

How to run (example):
1) Download MovieLens 100K (either GroupLens `ml-100k` or a Kaggle CSV version) and unzip into a folder, e.g. `./data/ml-100k/`.
2) Set DATA_DIR below to that folder.
3) Run this script. It will:
   - Autodetect the file format (Kaggle-style CSVs *or* original u.data/u.item)
   - Train a user-based CF model on a per-user split
   - Print **Precision@K**
   - Print top-N recommendations for a sample user

Tested with Python 3.10+, pandas, numpy, scikit-learn, scipy.
"""
from __future__ import annotations

import os
import random
from dataclasses import dataclass
from typing import Tuple, Dict, List

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

# ----------------------- CONFIG -----------------------
# Tunable settings for the whole script; the functions below use them as default arguments.
DATA_DIR = "./data/ml-100k"  # folder searched for ratings.csv/movies.csv or u.data/u.item
K = 10                       # cutoff for Precision@K and length of each recommendation list
NEIGHBORS = 30               # number of most-similar users used when scoring items
RATING_THRESHOLD = 4.0       # a held-out rating >= this value counts as relevant in evaluation
TEST_SIZE_PER_USER = 0.2     # fraction of each user's ratings held out as test data
MIN_RATINGS_PER_USER = 5     # users with fewer ratings stay entirely in train (never evaluated)
RNG_SEED = 42                # seed for the global RNGs, the sample-user pick and TruncatedSVD

# Seed both global generators up front so repeated runs start from the same state.
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)

# ----------------------- DATA LOADER -----------------------

def _load_kaggle_csv_style(path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load Kaggle-style CSVs: ratings.csv (userId,movieId,rating,timestamp), movies.csv (movieId,title,genres)."""
    ratings_path = os.path.join(path, "ratings.csv")
    movies_path = os.path.join(path, "movies.csv")
    if not (os.path.exists(ratings_path) and os.path.exists(movies_path)):
        raise FileNotFoundError
    ratings = pd.read_csv(ratings_path)
    movies = pd.read_csv(movies_path)
    # Standardize column names
    ratings = ratings.rename(columns={"user_id": "userId", "movie_id": "movieId"})
    movies = movies.rename(columns={"movie_id": "movieId", "title_x": "title"})
    return ratings[["userId", "movieId", "rating", "timestamp"]], movies[["movieId", "title"]]


def _load_original_u_files(path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load original GroupLens `ml-100k` files: u.data (\t-separated) and u.item (| separated)."""
    udata = os.path.join(path, "u.data")
    uitem = os.path.join(path, "u.item")
    if not (os.path.exists(udata) and os.path.exists(uitem)):
        raise FileNotFoundError
    ratings = pd.read_csv(
        udata,
        sep="\t",
        names=["userId", "movieId", "rating", "timestamp"],
        engine="python",
    )
    movies = pd.read_csv(
        uitem,
        sep="|",
        names=[
            "movieId", "title", "release_date", "video_release_date", "IMDb_URL",
            "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
            "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
            "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
        ],
        encoding_errors="ignore",
        engine="python",
    )
    movies = movies[["movieId", "title"]]
    return ratings, movies


def load_movielens_100k(path: str = DATA_DIR) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load (ratings, movies) from `path`, accepting either supported layout.

    The Kaggle CSV layout is attempted first; if its files are not found, the
    GroupLens `u.data`/`u.item` layout is read instead. A FileNotFoundError
    from that second attempt propagates to the caller.
    """
    try:
        frames = _load_kaggle_csv_style(path)
    except FileNotFoundError:
        frames = _load_original_u_files(path)
    return frames

# ----------------------- UTILITIES -----------------------

@dataclass
class Encoders:
    """Lookup tables between external user/movie ids and matrix indices.

    Produced by `build_encoders`; `df_to_csr` uses the `*2idx` dicts to place
    ratings in the user-item matrix, and the recommenders use `idx2item` to turn
    column indices back into movie ids.
    """
    # external user id -> row index of the user-item matrix
    user2idx: Dict[int, int]
    # row index -> external user id (inverse of user2idx)
    idx2user: List[int]
    # external movie id -> column index of the user-item matrix
    item2idx: Dict[int, int]
    # column index -> external movie id (inverse of item2idx)
    idx2item: List[int]


def build_encoders(ratings: pd.DataFrame) -> Encoders:
    """Assign consecutive indices to the user and movie ids found in `ratings`.

    Ids are ordered ascending, so the smallest userId/movieId gets index 0.
    """
    sorted_users = np.sort(ratings["userId"].unique())
    sorted_items = np.sort(ratings["movieId"].unique())
    return Encoders(
        user2idx=dict(zip(sorted_users, range(len(sorted_users)))),
        idx2user=sorted_users.tolist(),
        item2idx=dict(zip(sorted_items, range(len(sorted_items)))),
        idx2item=sorted_items.tolist(),
    )


def df_to_csr(ratings: pd.DataFrame, enc: Encoders) -> csr_matrix:
    """Build the sparse user-item rating matrix for `ratings`.

    Rows follow `enc.idx2user`, columns follow `enc.idx2item`, and the stored
    values are the ratings as float32. Cells without a rating stay empty (zero).
    """
    n_users, n_items = len(enc.idx2user), len(enc.idx2item)
    row_idx = ratings["userId"].map(enc.user2idx).values
    col_idx = ratings["movieId"].map(enc.item2idx).values
    values = ratings["rating"].values.astype(np.float32)
    return csr_matrix((values, (row_idx, col_idx)), shape=(n_users, n_items))

# ----------------------- TRAIN/TEST SPLIT -----------------------

def per_user_train_test_split(ratings: pd.DataFrame,
                              test_size: float = TEST_SIZE_PER_USER,
                              min_items: int = MIN_RATINGS_PER_USER) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Hold out the tail of every user's ratings as that user's test set.

    With a `timestamp` column, each user's most recent ratings are held out.
    Without one, each user's ratings are shuffled (seeded with RNG_SEED) before
    the tail is taken, so the split is random but reproducible.

    Args:
        ratings: frame with at least `userId`; `timestamp` is optional.
        test_size: fraction of each user's ratings to hold out.
        min_items: users with fewer than max(2, min_items) ratings are kept
            entirely in train and never appear in the test set.

    Returns:
        (train_df, test_df), both re-indexed from 0 and carrying the same
        columns as `ratings`. Either frame may be empty.
    """
    has_timestamp = "timestamp" in ratings.columns
    train_parts = []
    test_parts = []
    for _, grp in ratings.groupby("userId"):
        n_ratings = len(grp)
        if n_ratings < max(2, min_items):
            # keep very small-user data entirely in train
            train_parts.append(grp)
            continue
        if has_timestamp:
            grp = grp.sort_values("timestamp")
        else:
            grp = grp.sample(frac=1.0, random_state=RNG_SEED)
        n_test = max(1, int(round(n_ratings * test_size)))
        # Always leave at least one training rating for an evaluated user.
        n_test = min(n_test, n_ratings - 1)
        # Positional slicing avoids duplicating rows when the index is not unique.
        train_parts.append(grp.iloc[:-n_test])
        test_parts.append(grp.iloc[-n_test:])
    # pd.concat raises on an empty list, so fall back to an empty frame.
    train_df = (pd.concat(train_parts).reset_index(drop=True)
                if train_parts else pd.DataFrame(columns=ratings.columns))
    test_df = (pd.concat(test_parts).reset_index(drop=True)
               if test_parts else pd.DataFrame(columns=ratings.columns))
    return train_df, test_df

# ----------------------- USER-BASED CF -----------------------

def user_similarity_matrix(R: csr_matrix) -> np.ndarray:
    """Pairwise cosine similarity between the rows (users) of `R`.

    Returns a dense [n_users, n_users] array whose diagonal is zeroed so that a
    user is never selected as their own neighbor.
    """
    user_sims = cosine_similarity(R)
    user_sims[np.diag_indices_from(user_sims)] = 0.0
    return user_sims


def score_items_for_user(user_idx: int, R: csr_matrix, sims: np.ndarray,
                         neighbors: int = NEIGHBORS) -> np.ndarray:
    """Score every item for one user from the ratings of similar users.

    Each score is the similarity-weighted sum of the neighbors' raw ratings
    divided by the total absolute similarity of the selected neighbors. Items
    the target user has already rated are set to -inf.

    Args:
        user_idx: row index of the target user in `R` / `sims`.
        R: user-item rating matrix.
        sims: user-user similarity matrix.
        neighbors: how many of the most similar users to use; None or a value
            <= 0 means every user contributes.

    Returns:
        1-D array with one score per item (column of `R`).
    """
    weights = sims[user_idx]
    restrict_to_top = neighbors is not None and neighbors > 0
    if restrict_to_top:
        # Partition on the negated similarities so the largest ones come first.
        kth = min(neighbors, len(weights) - 1)
        chosen = np.argpartition(-weights, kth)[:neighbors]
        weights = weights[chosen]
        neighbor_ratings = R[chosen]
    else:
        neighbor_ratings = R

    weighted_sum = weights @ neighbor_ratings.toarray()
    # The small constant keeps the division defined when all weights are zero.
    scores = weighted_sum / (np.abs(weights).sum() + 1e-8)

    already_rated = R.getrow(user_idx).toarray().ravel() > 0
    scores[already_rated] = -np.inf
    return scores


def recommend_for_user(user_external_id: int, enc: Encoders, R: csr_matrix, sims: np.ndarray,
                        movies: pd.DataFrame, topk: int = K, neighbors: int = NEIGHBORS) -> pd.DataFrame:
    """Recommend up to `topk` unseen movies for a user with user-based CF.

    Args:
        user_external_id: user id as it appears in the ratings data.
        enc: id <-> index lookup tables matching `R`.
        R: user-item rating matrix.
        sims: user-user similarity matrix.
        movies: frame with `movieId` and `title` columns.
        topk: maximum number of recommendations.
        neighbors: number of similar users used for scoring.

    Returns:
        Frame with columns movieId, title, score, ordered by descending score.
        It has fewer than `topk` rows when the user has fewer unseen items.

    Raises:
        KeyError: if the user id is unknown to `enc`, or a recommended movie id
            is missing from `movies`.
    """
    uidx = enc.user2idx[user_external_id]
    scores = score_items_for_user(uidx, R, sims, neighbors)

    n_items = scores.shape[0]
    k = max(0, min(topk, n_items))
    if k < n_items:
        top_items = np.argpartition(-scores, k)[:k]
    else:
        # argpartition rejects kth == n_items; every item is a candidate here.
        top_items = np.arange(n_items)
    top_items = top_items[np.argsort(-scores[top_items])]
    # Already-seen items carry a score of -inf and must never be recommended.
    top_items = top_items[np.isfinite(scores[top_items])]

    movie_ids = [enc.idx2item[i] for i in top_items]
    titles = movies.set_index("movieId").loc[movie_ids]["title"].values
    return pd.DataFrame({"movieId": movie_ids, "title": titles, "score": scores[top_items]})

# ----------------------- EVALUATION -----------------------

def precision_at_k(recommended: List[int], relevant: set[int], k: int) -> float:
    """Fraction of the first `k` recommendations that are in `relevant`.

    The denominator is always `k`, even when fewer than `k` items were
    recommended. Returns 0.0 when `k` is 0.
    """
    if k == 0:
        return 0.0
    top_k = recommended[:k]
    n_hits = len([item for item in top_k if item in relevant])
    return n_hits / k


def evaluate_precision_at_k(train: pd.DataFrame, test: pd.DataFrame, enc: Encoders, movies: pd.DataFrame,
                             topk: int = K, neighbors: int = NEIGHBORS,
                             threshold: float = RATING_THRESHOLD) -> float:
    """Mean Precision@K of the user-based recommender over the test users.

    The model is fitted on `train` only. For each user in `test`, the relevant
    items are the held-out movies rated >= `threshold`; users with no relevant
    item, or unknown to `enc`, are left out of the average.

    Returns:
        The mean precision, or NaN when `test` is empty or no user qualifies.
    """
    if test.empty:
        return float("nan")

    train_matrix = df_to_csr(train, enc)
    user_sims = user_similarity_matrix(train_matrix)

    per_user_precision = []
    for uid, held_out in test.groupby("userId"):
        if uid not in enc.user2idx:
            continue
        liked_mask = held_out.rating >= threshold
        liked = set(held_out.loc[liked_mask, "movieId"].tolist())
        if len(liked) == 0:
            continue
        recs_df = recommend_for_user(uid, enc, train_matrix, user_sims, movies,
                                     topk=topk, neighbors=neighbors)
        per_user_precision.append(precision_at_k(recs_df.movieId.tolist(), liked, topk))

    return float(np.mean(per_user_precision)) if per_user_precision else float("nan")

# ----------------------- ITEM-BASED CF (Bonus) -----------------------

def item_similarity_matrix(R: csr_matrix) -> np.ndarray:
    """Pairwise cosine similarity between the columns (items) of `R`.

    Returns an [n_items, n_items] array with the diagonal zeroed so that an
    item does not count as similar to itself.
    """
    item_sims = cosine_similarity(R.transpose())  # items become rows
    np.fill_diagonal(item_sims, 0.0)
    return item_sims


def recommend_item_based(user_external_id: int, enc: Encoders, R: csr_matrix, item_sims: np.ndarray,
                         movies: pd.DataFrame, topk: int = K) -> pd.DataFrame:
    """Recommend up to `topk` unseen movies for a user with item-based CF.

    Args:
        user_external_id: user id as it appears in the ratings data.
        enc: id <-> index lookup tables matching `R`.
        R: user-item rating matrix.
        item_sims: item-item similarity matrix.
        movies: frame with `movieId` and `title` columns.
        topk: maximum number of recommendations.

    Returns:
        Frame with columns movieId, title, score, ordered by descending score.
        It has fewer than `topk` rows when the user has fewer unseen items.

    Raises:
        KeyError: if the user id is unknown to `enc`, or a recommended movie id
            is missing from `movies`.
    """
    uidx = enc.user2idx[user_external_id]
    r_u = R.getrow(uidx).toarray().ravel()
    # Rating-weighted sum of each item's similarity to the items the user rated.
    scores = r_u @ item_sims
    scores[r_u > 0] = -np.inf

    n_items = scores.shape[0]
    k = max(0, min(topk, n_items))
    if k < n_items:
        top_items = np.argpartition(-scores, k)[:k]
    else:
        # argpartition rejects kth == n_items; every item is a candidate here.
        top_items = np.arange(n_items)
    top_items = top_items[np.argsort(-scores[top_items])]
    # Already-seen items carry a score of -inf and must never be recommended.
    top_items = top_items[np.isfinite(scores[top_items])]

    movie_ids = [enc.idx2item[i] for i in top_items]
    titles = movies.set_index("movieId").loc[movie_ids]["title"].values
    return pd.DataFrame({"movieId": movie_ids, "title": titles, "score": scores[top_items]})

# ----------------------- SVD BASELINE (Bonus) -----------------------

def svd_recommend(user_external_id: int, enc: Encoders, R: csr_matrix, n_components: int = 40, topk: int = K,
                   movies: pd.DataFrame | None = None) -> pd.DataFrame:
    """Recommend up to `topk` unseen movies from a truncated-SVD reconstruction of `R`.

    A fresh TruncatedSVD (seeded with RNG_SEED) is fitted on every call.

    Args:
        user_external_id: user id as it appears in the ratings data.
        enc: id <-> index lookup tables matching `R`.
        R: user-item rating matrix.
        n_components: number of latent components.
        topk: maximum number of recommendations.
        movies: optional frame with `movieId` and `title` columns.

    Returns:
        With `movies`: frame with columns movieId, title, score. Without it:
        frame with columns item_idx (column index in `R`) and score. Rows are
        ordered by descending score and never include already-rated items.

    Raises:
        KeyError: if the user id is unknown to `enc`, or a recommended movie id
            is missing from `movies`.
    """
    # Resolve the user before the expensive fit so an unknown id fails fast.
    uidx = enc.user2idx[user_external_id]
    r_u = R.getrow(uidx).toarray().ravel()

    svd = TruncatedSVD(n_components=n_components, random_state=RNG_SEED)
    user_factors = svd.fit_transform(R)    # users x comp
    item_factors = svd.components_         # comp x items
    # Only the target user's row of the reconstruction is needed.
    scores = user_factors[uidx] @ item_factors
    scores[r_u > 0] = -np.inf

    n_items = scores.shape[0]
    k = max(0, min(topk, n_items))
    if k < n_items:
        top_items = np.argpartition(-scores, k)[:k]
    else:
        # argpartition rejects kth == n_items; every item is a candidate here.
        top_items = np.arange(n_items)
    top_items = top_items[np.argsort(-scores[top_items])]
    # Already-seen items carry a score of -inf and must never be recommended.
    top_items = top_items[np.isfinite(scores[top_items])]

    if movies is None:
        return pd.DataFrame({"item_idx": top_items, "score": scores[top_items]})
    movie_ids = [enc.idx2item[i] for i in top_items]
    titles = movies.set_index("movieId").loc[movie_ids]["title"].values
    return pd.DataFrame({"movieId": movie_ids, "title": titles, "score": scores[top_items]})

# ----------------------- MAIN -----------------------

def main():
    """Run the full demo: load data, split, evaluate Precision@K, print sample recommendations."""
    print("Loading data from:", DATA_DIR)
    ratings, movies = load_movielens_100k(DATA_DIR)
    print("Ratings:", ratings.shape, "Movies:", movies.shape)

    # Encoders are built from ALL ratings so train and test share one index space.
    encoders = build_encoders(ratings)

    split_columns = ["userId", "movieId", "rating", "timestamp"]
    train_df, test_df = per_user_train_test_split(
        ratings[split_columns],
        test_size=TEST_SIZE_PER_USER,
        min_items=MIN_RATINGS_PER_USER,
    )
    print(f"Train ratings: {len(train_df):,} | Test ratings: {len(test_df):,}")

    # Train-only matrix and similarities, reused by the demos below.
    train_matrix = df_to_csr(train_df, encoders)
    similarities = user_similarity_matrix(train_matrix)

    p_at_k = evaluate_precision_at_k(
        train_df, test_df, encoders, movies,
        topk=K, neighbors=NEIGHBORS, threshold=RATING_THRESHOLD,
    )
    print(f"\nPrecision@{K}: {p_at_k:.4f}")

    # Pick one user (seeded) from the training ratings for the demos.
    sampled_row = train_df.userId.sample(1, random_state=RNG_SEED)
    sample_user = int(sampled_row.iloc[0])

    print(f"\nSample recommendations for user {sample_user} (user-based CF):")
    user_based = recommend_for_user(sample_user, encoders, train_matrix, similarities, movies,
                                    topk=K, neighbors=NEIGHBORS)
    print(user_based)

    # --- Bonus demos (optional) ---
    print(f"\n[Bonus] Item-based CF recommendations for user {sample_user}:")
    item_similarities = item_similarity_matrix(train_matrix)
    item_based = recommend_item_based(sample_user, encoders, train_matrix, item_similarities, movies, topk=K)
    print(item_based)

    print(f"\n[Bonus] SVD baseline recommendations for user {sample_user}:")
    svd_based = svd_recommend(sample_user, encoders, train_matrix, n_components=40, topk=K, movies=movies)
    print(svd_based)


# Run the demo when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Loading data from: ./data/ml-100k
Ratings: (100000, 4) Movies: (1682, 2)
Train ratings: 80,000 | Test ratings: 20,000

Precision@10: 0.1294

Sample recommendations for user 524 (user-based CF):
   movieId                                   title     score
0      427            To Kill a Mockingbird (1962)  3.961439
1      357  One Flew Over the Cuckoo's Nest (1975)  3.892331
2      479                          Vertigo (1958)  3.641742
3       28                        Apollo 13 (1995)  3.500951
4      496            It's a Wonderful Life (1946)  3.406243
5      603                      Rear Window (1954)  3.300198
6      176                           Aliens (1986)  3.193579
7      183                            Alien (1979)  3.177843
8      197                    Graduate, The (1967)  3.153037
9        9                 Dead Man Walking (1995)  3.146633

[Bonus] Item-based CF recommendations for user 524:
   movieId                                   title       score
0      183         