In [17]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from pathlib import Path

# Load ratings & movies (use your df from earlier for titles)
RAW_PATH = Path('../data/raw/')
ratings = pd.read_csv(RAW_PATH / 'ratings.csv')
movies = pd.read_csv(RAW_PATH / 'movies.csv')  # or your enriched df

print("Ratings shape:", ratings.shape)
display(ratings.head())

# Map to consecutive IDs (implicit requires 0-based indices)
user_codes = ratings['userId'].astype('category').cat.codes
item_codes = ratings['movieId'].astype('category').cat.codes

# Sparse user-item matrix (rows=users, columns=items, values=rating as confidence)
sparse_matrix = coo_matrix((ratings['rating'], (user_codes, item_codes)))

Ratings shape: (100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [18]:
# Initialize model
model = AlternatingLeastSquares(
    factors=100,           # latent dimensions (like n_factors in SVD)
    iterations=50,
    regularization=0.02,
    random_state=42,
    use_gpu=False,        # CPU is fine for MovieLens small
    calculate_training_loss=True
)

# Fit (implicit expects confidence matrix; ratings work as-is)
model.fit(sparse_matrix.astype('float32'))

# Save
import joblib
joblib.dump(model, '../models/collab_als.joblib')



  0%|          | 0/50 [00:00<?, ?it/s]

['../models/collab_als.joblib']

In [19]:
import pandas as pd
from scipy.sparse import csr_matrix   # ← add this import

# Convert once after creating sparse_matrix (do this right after coo_matrix)
sparse_matrix = csr_matrix(sparse_matrix)   # convert to CSR format

# Now the function
def get_top_n_implicit(user_id: int, n: int = 10, model=model):
    """
    Get top N recommended movies for a user using implicit ALS.
    """
    # Get internal user ID (0-based category code)
    user_mask = ratings['userId'] == user_id
    if not user_mask.any():
        raise ValueError(f"User ID {user_id} not found")
    
    user_internal = ratings[user_mask]['userId'].astype('category').cat.codes.iloc[0]

    # Get the user's row (now works with CSR)
    user_row = sparse_matrix[user_internal]

    # Recommend: returns internal item IDs + scores
    recommended_items, scores = model.recommend(
        userid=user_internal,
        user_items=user_row,
        N=n + 20,                     # extra to allow filtering seen items
        filter_already_liked_items= False
    )

    # Map internal item IDs back to original movieId
    # We need the category mapping from earlier
    item_codes = ratings['movieId'].astype('category').cat.categories
    item_codes_map = dict(enumerate(item_codes))

    recs = []
    for internal_id, score in zip(recommended_items, scores):
        movie_id = item_codes_map[internal_id]
        title = movies[movies['movieId'] == movie_id]['title'].values[0]
        recs.append({
            'movieId': movie_id,
            'title': title,
            'score': round(score, 4)
        })

    return pd.DataFrame(recs).head(n)


# Test it
print("Top 10 for user 1:")
display(get_top_n_implicit(1, n=10))

Top 10 for user 1:


Unnamed: 0,movieId,title,score
0,2617,"Mummy, The (1999)",1.261
1,2174,Beetlejuice (1988),1.1856
2,367,"Mask, The (1994)",1.1608
3,1222,Full Metal Jacket (1987),1.1587
4,2028,Saving Private Ryan (1998),1.1481
5,733,"Rock, The (1996)",1.1348
6,50,"Usual Suspects, The (1995)",1.1324
7,2571,"Matrix, The (1999)",1.1319
8,1196,Star Wars: Episode V - The Empire Strikes Back...,1.1281
9,2947,Goldfinger (1964),1.1242


In [20]:
def qualitative_check(user_id: int, n: int = 10):
    recs = get_top_n_implicit(user_id, n=n)
    actual_high_rated = ratings[(ratings['userId'] == user_id) & (ratings['rating'] >= 4.0)]
    
    hits = actual_high_rated['movieId'].isin(recs['movieId']).sum()
    total_high = len(actual_high_rated)
    
    print(f"User {user_id}:")
    print(f"  Recommended {n} movies → {hits} were highly rated by the user ({hits / n:.1%} hit rate)")
    print(f"  User has {total_high} movies rated ≥4.0 in total")
    
    # Show overlap
    print("\nRecommended movies that user rated highly:")
    overlap = recs[recs['movieId'].isin(actual_high_rated['movieId'])]
    display(overlap[['movieId', 'title', 'score']])

# Test on a few users
for uid in [1, 42, 100, 200]:
    qualitative_check(uid, n=10)
    print("-" * 50)

User 1:
  Recommended 10 movies → 9 were highly rated by the user (90.0% hit rate)
  User has 200 movies rated ≥4.0 in total

Recommended movies that user rated highly:


Unnamed: 0,movieId,title,score
1,2174,Beetlejuice (1988),1.1856
2,367,"Mask, The (1994)",1.1608
3,1222,Full Metal Jacket (1987),1.1587
4,2028,Saving Private Ryan (1998),1.1481
5,733,"Rock, The (1996)",1.1348
6,50,"Usual Suspects, The (1995)",1.1324
7,2571,"Matrix, The (1999)",1.1319
8,1196,Star Wars: Episode V - The Empire Strikes Back...,1.1281
9,2947,Goldfinger (1964),1.1242


--------------------------------------------------
User 42:
  Recommended 10 movies → 4 were highly rated by the user (40.0% hit rate)
  User has 259 movies rated ≥4.0 in total

Recommended movies that user rated highly:


Unnamed: 0,movieId,title,score
3,1222,Full Metal Jacket (1987),1.1587
4,2028,Saving Private Ryan (1998),1.1481
6,50,"Usual Suspects, The (1995)",1.1324
7,2571,"Matrix, The (1999)",1.1319


--------------------------------------------------
User 100:
  Recommended 10 movies → 1 were highly rated by the user (10.0% hit rate)
  User has 107 movies rated ≥4.0 in total

Recommended movies that user rated highly:


Unnamed: 0,movieId,title,score
4,2028,Saving Private Ryan (1998),1.1481


--------------------------------------------------
User 200:
  Recommended 10 movies → 4 were highly rated by the user (40.0% hit rate)
  User has 202 movies rated ≥4.0 in total

Recommended movies that user rated highly:


Unnamed: 0,movieId,title,score
0,2617,"Mummy, The (1999)",1.261
7,2571,"Matrix, The (1999)",1.1319
8,1196,Star Wars: Episode V - The Empire Strikes Back...,1.1281
9,2947,Goldfinger (1964),1.1242


--------------------------------------------------


In [21]:
def manual_map_precision_at_k(model, train_matrix, test_matrix, K=10, sample_users=50):
    test_users = np.unique(test_matrix.nonzero()[0])
    if len(test_users) > sample_users:
        test_users = np.random.choice(test_users, sample_users, replace=False)
    
    map_sum = 0
    precision_sum = 0
    num_users = 0
    
    for uid in test_users:
        # Get top-K recs
        user_row = train_matrix[uid]
        rec_items, _ = model.recommend(uid, user_row, N=K, filter_already_liked_items=True)
        
        # Get true positives in test
        true_pos = test_matrix[uid].nonzero()[1]  # item indices
        
        if len(true_pos) == 0:
            continue
        
        hits = 0
        rank_sum = 0
        for rank, item in enumerate(rec_items, 1):
            if item in true_pos:
                hits += 1
                rank_sum += 1 / rank  # for AP
        
        if hits > 0:
            ap = rank_sum / len(true_pos)
            map_sum += ap
            precision_sum += hits / K
            num_users += 1
    
    if num_users == 0:
        return 0.0, 0.0
    
    return map_sum / num_users, precision_sum / num_users

# Run
map_manual, prec_manual = manual_map_precision_at_k(model, train_matrix, test_matrix, K=10, sample_users=100)

print(f"Manual MAP@10 (sample 100 users): {map_manual:.4f}")
print(f"Manual Precision@10 (sample 100 users): {prec_manual:.4f}")

Manual MAP@10 (sample 100 users): 0.0000
Manual Precision@10 (sample 100 users): 0.0000


## Day 5 Evaluation Summary

- Qualitative: Hit rate on high-rated movies for sample users: XX%
- Quantitative: MAP@10 = {map_k:.4f}, NDCG@10 = {ndcg_k:.4f} on 100 test users
- Observations: The model prefers popular items, but personalizes well for active users.
- Next: Day 6 – blend with content-based scores from Day 4.