# Collaborative Filtering Recommendation System

This notebook implements and evaluates collaborative filtering algorithms for the recommendation system.

## Overview
- **User-to-User Collaborative Filtering**: Find similar users and recommend items they liked
- **Item-to-Item Collaborative Filtering**: Find similar items based on user interactions
- **Evaluation**: Compare performance using recall@k and mean rank metrics

## 1. Setup and Data Loading

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import gc
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OrdinalEncoder

# Narrow numeric dtypes shared by the notebook to keep memory usage down
DT_INT = np.int32
DT_FLOAT = np.float32

# Seed NumPy's global RNG so repeated runs are reproducible
np.random.seed(42)

In [None]:
# Load datasets
# The sample files are CSV, which pd.read_parquet cannot parse, so use the
# CSV reader instead.
# NOTE(review): all three splits point at the same file, so validation/test
# metrics are computed on the training interactions -- confirm the intended
# per-split paths.
train_df = pd.read_csv("../data/sample/sample_interactions.csv")
valid_df = pd.read_csv("../data/sample/sample_interactions.csv")
test_df = pd.read_csv("../data/sample/sample_interactions.csv")

print(f"Train: {train_df.shape}, Valid: {valid_df.shape}, Test: {test_df.shape}")

## 2. Data Preprocessing

In [None]:
# Map raw user / article ids to integer codes; ids not seen during fit become -1
enc = OrdinalEncoder(
    dtype=DT_INT,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

id_cols = ["user_id", "click_article_id"]

# The mapping is learned from the training split only, then applied everywhere
train_df[id_cols] = enc.fit_transform(train_df[id_cols])
valid_df[id_cols] = enc.transform(valid_df[id_cols])
test_df[id_cols] = enc.transform(test_df[id_cols])

# Largest training code plus one gives the matrix dimensions
n_users = 1 + int(train_df["user_id"].max())
n_items = 1 + int(train_df["click_article_id"].max())

print(f"Number of users: {n_users}, Number of items: {n_items}")

In [None]:
# Create sparse interaction matrices
def make_interaction_matrix(df, shape=None):
    """Create a sparse user-item interaction matrix from an interaction table.

    Args:
        df: DataFrame with integer ``user_id`` and ``click_article_id``
            columns. Rows where either value is negative (the encoder's
            "unknown" marker) are dropped.
        shape: ``(n_rows, n_cols)`` of the result. Defaults to the
            module-level ``(n_users, n_items)`` so existing calls keep
            working.

    Returns:
        A float32 ``csr_matrix`` holding 1.0 per kept interaction. Repeated
        (user, item) pairs accumulate, so an entry can exceed 1.
    """
    if shape is None:
        shape = (n_users, n_items)

    # Filter out unknown users/items (-1)
    mask = (df.user_id >= 0) & (df.click_article_id >= 0)
    df_clean = df[mask]
    print(f"Filtered out {len(df) - len(df_clean)} unknown interactions")

    # Hand scipy plain arrays rather than index-carrying pandas Series
    rows = df_clean.user_id.to_numpy()
    cols = df_clean.click_article_id.to_numpy()

    return csr_matrix(
        (np.ones(len(df_clean), dtype=np.float32), (rows, cols)),
        shape=shape
    )

# Build one user x item matrix per split, in train / valid / test order
train_mat, valid_mat, test_mat = (
    make_interaction_matrix(split) for split in (train_df, valid_df, test_df)
)

print(f"Training matrix shape: {train_mat.shape}")
print(f"Training matrix density: {train_mat.nnz / (train_mat.shape[0] * train_mat.shape[1]):.6f}")

## 3. Helper Functions

In [None]:
def build_ground_truth(mat: csr_matrix):
    """Return, for each row of ``mat``, the set of column indices stored in it."""
    bounds = mat.indptr
    cols = mat.indices

    truth = []
    # Consecutive indptr entries delimit one row's slice of ``indices``
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        truth.append(set(cols[lo:hi]))
    return truth

def evaluate_recommendations(ranks, ground_truth, k_list=(5, 10, 100)):
    """Evaluate recommendation performance against one held-out item per user.

    Args:
        ranks: 2-D array indexed as ``ranks[user, item]``. A hit at cut-off
            ``k`` is counted when the value is ``<= k``, so smaller values
            are treated as better.
        ground_truth: iterable of sets of item indices, one per row of
            ``ranks`` and in the same order. Only a single element of each
            set is scored.
        k_list: cut-offs for which ``recall@k`` is reported.

    Returns:
        Dict with one ``"recall@{k}"`` entry per cut-off plus ``"mean_rank"``.
        Users whose set is empty or whose item index is negative (unknown)
        are left out of the averages; if no user remains, every value is NaN.
    """
    n_users, _ = ranks.shape
    gt_items = np.fromiter(
        (next(iter(s)) if s else -1 for s in ground_truth),
        dtype=DT_INT, count=n_users
    )

    # A negative index would silently wrap around to the last item column,
    # so users without a usable ground-truth item are excluded instead.
    usable = gt_items >= 0
    if not usable.any():
        results = {f"recall@{k}": DT_FLOAT(np.nan) for k in k_list}
        results["mean_rank"] = DT_FLOAT(np.nan)
        return results

    # Get ranks for ground truth items
    gt_ranks = ranks[np.flatnonzero(usable), gt_items[usable]]

    results = {}
    for k in k_list:
        results[f"recall@{k}"] = (gt_ranks <= k).mean(dtype=DT_FLOAT)

    results["mean_rank"] = gt_ranks.mean(dtype=DT_FLOAT)
    return results

# Build ground truth for validation: one set of item indices per row of
# valid_mat (empty for rows with no stored interactions)
gt_valid = build_ground_truth(valid_mat)

## 4. User-to-User Collaborative Filtering

In [None]:
print("Computing user-to-user collaborative filtering...")

TOP_SIM = 50  # number of most-similar users retained per user

# Cosine similarity is the dot product of unit-length rows, so start with
# each user's L2 norm
train_mat_csr = train_mat.tocsr()
row_norms = np.array(train_mat_csr.multiply(train_mat_csr).sum(axis=1)).flatten()
row_norms = np.sqrt(row_norms)
row_norms[row_norms == 0] = 1.0  # guard against dividing by zero

# Scale every stored value by the norm of the row it sits in
train_mat_norm = train_mat_csr.copy().astype(DT_FLOAT)
train_mat_norm.data /= row_norms[train_mat_norm.nonzero()[0]]

# Dense user x user similarity matrix
sim = (train_mat_norm @ train_mat_norm.T).toarray().astype(DT_FLOAT)

# Unordered indices of the TOP_SIM + 1 largest similarities in each row
top_idx = np.argpartition(
    -sim, min(TOP_SIM + 1, sim.shape[1] - 1), axis=1
)[:, :TOP_SIM + 1]
top_sim = np.take_along_axis(sim, top_idx, axis=1)

# Put each row's candidates in descending-similarity order
order = np.argsort(-top_sim, axis=1)
u_neighbors = np.take_along_axis(top_idx, order, axis=1)

print(f"Similarity matrix computed: {sim.shape}")

# Drop the large intermediates before the next stage
del sim, top_idx, top_sim, train_mat_norm
gc.collect()

In [None]:
def build_user_score_matrix(batch=2000, n_peers=20):
    """Build recommendation scores based on similar users.

    Each user's score for an item is the number of their ``n_peers`` nearest
    neighbours (rows of the module-level ``u_neighbors``) that interacted
    with it in ``train_mat_csr``; items the user already interacted with are
    reset to 0.

    Args:
        batch: accepted for backward compatibility only. Users are processed
            one at a time, so the value does not affect the result.
        n_peers: how many neighbours vote for each user.

    Returns:
        ``(n_users, n_items)`` uint8 array of vote counts.
    """
    scores = np.zeros((n_users, n_items), dtype=np.uint8)
    indptr, indices = train_mat_csr.indptr, train_mat_csr.indices
    max_votes = np.iinfo(scores.dtype).max

    for u in range(n_users):
        # Drop the user explicitly: with tied similarities (or an all-zero
        # row) self is not guaranteed to sit in the first neighbour slot.
        nbrs = u_neighbors[u]
        peers = nbrs[nbrs != u][:n_peers]
        if peers.size == 0:
            continue

        srow = scores[u]
        for p in peers:
            # Read the peer's items straight from the CSR arrays instead of
            # building a one-row sparse matrix per lookup
            liked = indices[indptr[p]:indptr[p + 1]]
            # Saturate at the dtype maximum rather than wrapping to 0
            srow[liked] += srow[liked] < max_votes

        # Mask already seen items
        srow[indices[indptr[u]:indptr[u + 1]]] = 0

    return scores

print("Building user-based recommendation scores...")
scores = build_user_score_matrix()

# Convert scores to ranks. Evaluation counts a hit when rank <= k, so rank 1
# has to be the HIGHEST-scored item. Sorting (dtype max - score) ascending
# yields descending-score order without negating an unsigned array; the
# stable sort breaks ties by item index.
ranks = (np.iinfo(scores.dtype).max - scores).argsort(axis=1, kind="stable")
# Inverting the permutation gives each item's position; +1 makes it 1-based
ranks = ranks.argsort(axis=1).astype(DT_INT) + 1

print("User-to-user collaborative filtering completed.")

In [None]:
# Evaluate user-to-user CF
# `ranks` has one row per encoded user, so the ground truth must be per user
# as well (gt_valid); a list built from valid_df rows is ordered by
# interaction and does not line up with the rank rows. Users with no
# validation interaction are left out so they cannot be scored against an
# arbitrary item.
eval_users = np.flatnonzero([len(items) > 0 for items in gt_valid])
u2u_metrics = evaluate_recommendations(
    ranks[eval_users],
    [gt_valid[u] for u in eval_users],
    k_list=(5, 10, 100)
)

print("User-to-User Collaborative Filtering Results:")
for metric, value in u2u_metrics.items():
    print(f"  {metric}: {value:.6f}")

# Clean up memory
del scores, ranks
gc.collect()

## 5. Item-to-Item Collaborative Filtering

In [None]:
print("Computing item-to-item collaborative filtering...")

TOP_SIM_ITEM = 50  # Keep top 50 similar items

# Transposing a CSR matrix yields CSC; convert back so the item x user
# matrix really is row-oriented CSR, as its name says.
item_mat_csr = train_mat_csr.T.tocsr()

# L2 normalize item rows (same formulation as the user-to-user cell)
row_norms_i = np.sqrt(
    np.array(item_mat_csr.multiply(item_mat_csr).sum(axis=1)).flatten()
)
row_norms_i[row_norms_i == 0] = 1.0  # Avoid division by zero

item_mat_norm = item_mat_csr.copy().astype(DT_FLOAT)
item_mat_norm.data /= row_norms_i[item_mat_norm.nonzero()[0]]

# Compute item similarity matrix (dense item x item)
isim = (item_mat_norm @ item_mat_norm.T).toarray().astype(DT_FLOAT)

# Keep the TOP_SIM_ITEM + 1 most similar items per row (unordered at this point)
top_idx_i = np.argpartition(
    -isim, min(TOP_SIM_ITEM+1, isim.shape[1]-1), axis=1
)[:, :TOP_SIM_ITEM+1]
top_sim_i = np.take_along_axis(isim, top_idx_i, axis=1)

# Sort the kept candidates by similarity, descending
order_i = np.argsort(-top_sim_i, axis=1)
item_neighbors = np.take_along_axis(top_idx_i, order_i, axis=1)

print(f"Item similarity matrix computed: {isim.shape}")

# Clean up memory
del isim, top_idx_i, top_sim_i, item_mat_norm
gc.collect()

In [None]:
def build_item_score_matrix(batch=2000, n_peers=20):
    """Build recommendation scores based on similar items.

    For every item a user interacted with in ``train_mat_csr``, each of that
    item's ``n_peers`` nearest neighbours (rows of the module-level
    ``item_neighbors``) receives one vote; items the user already interacted
    with are reset to 0.

    Args:
        batch: accepted for backward compatibility only. Users are processed
            one at a time, so the value does not affect the result.
        n_peers: how many neighbours of each seen item receive a vote.

    Returns:
        ``(n_users, n_items)`` uint8 array of vote counts, saturating at 255.
    """
    scores_i = np.zeros((n_users, n_items), dtype=np.uint8)
    indptr, idx = train_mat_csr.indptr, train_mat_csr.indices
    max_votes = np.iinfo(scores_i.dtype).max

    for u in range(n_users):
        seen = idx[indptr[u]:indptr[u + 1]]
        if seen.size == 0:
            continue

        row = scores_i[u]
        for itm in seen:
            # Drop the item itself explicitly: with tied similarities (or an
            # all-zero row) it is not guaranteed to be the first neighbour.
            nbrs = item_neighbors[itm]
            peers = nbrs[nbrs != itm][:n_peers]
            # A user with many seen items can push one candidate past 255
            # votes; saturate instead of letting uint8 wrap around to 0.
            row[peers] += row[peers] < max_votes

        # Mask already seen items
        row[seen] = 0

    return scores_i

print("Building item-based recommendation scores...")
scores_i = build_item_score_matrix()

# Convert to ranks. Evaluation counts a hit when rank <= k, so rank 1 has to
# be the HIGHEST-scored item. Sorting (dtype max - score) ascending yields
# descending-score order without negating an unsigned array; the stable sort
# breaks ties by item index.
order_i = (np.iinfo(scores_i.dtype).max - scores_i).argsort(axis=1, kind="stable")
# Inverting the permutation gives each item's position; +1 makes it 1-based
ranks_i = order_i.argsort(axis=1).astype(DT_INT) + 1

print("Item-to-item collaborative filtering completed.")

# Clean up memory
del scores_i, order_i
gc.collect()

In [None]:
# Evaluate item-to-item CF
# `ranks_i` has one row per encoded user, so the ground truth must be per
# user as well (gt_valid); a list built from valid_df rows is ordered by
# interaction and does not line up with the rank rows. Users with no
# validation interaction are left out so they cannot be scored against an
# arbitrary item.
eval_users = np.flatnonzero([len(items) > 0 for items in gt_valid])
i2i_metrics = evaluate_recommendations(
    ranks_i[eval_users],
    [gt_valid[u] for u in eval_users],
    k_list=(5, 10, 100)
)

print("Item-to-Item Collaborative Filtering Results:")
for metric, value in i2i_metrics.items():
    print(f"  {metric}: {value:.6f}")

## 6. Results Comparison

In [None]:
# Compare results
import pandas as pd

# One column per method, one row per metric
comparison_df = pd.DataFrame({
    'User-to-User CF': u2u_metrics,
    'Item-to-Item CF': i2i_metrics
})

print("\nCollaborative Filtering Comparison:")
print(comparison_df.round(6))

# Determine better performing method. A tie (or a NaN metric, for which both
# comparisons are False) is reported as such instead of being credited to
# user-to-user CF by default.
i2i_recall = i2i_metrics['recall@10']
u2u_recall = u2u_metrics['recall@10']
if i2i_recall > u2u_recall:
    print("\nItem-to-Item CF performs better on recall@10")
elif u2u_recall > i2i_recall:
    print("\nUser-to-User CF performs better on recall@10")
else:
    print("\nNeither method is strictly better on recall@10")

## 7. Conclusions

This notebook demonstrates:

1. **User-to-User CF**: Finds users with similar interaction patterns and recommends items they liked
2. **Item-to-Item CF**: Identifies items that are frequently interacted with together
3. **Performance Comparison**: Evaluates both approaches using standard recommendation metrics

### Key Insights:
- Item-to-Item CF typically performs better for implicit feedback datasets
- Both methods suffer from sparsity issues in large datasets
- These methods form the foundation for more advanced ensemble approaches

### Next Steps:
- Experiment with different similarity metrics
- Implement matrix factorization approaches (ALS)
- Combine CF with content-based features