# Item-to-Item Collaborative Filtering

This notebook implements a clear, step-by-step approach to item-to-item collaborative filtering for recommendation systems. We'll focus on:

1. Understanding the algorithm fundamentals
2. Proper handling of training and validation data
3. Building an item similarity matrix
4. Generating and evaluating recommendations

In [2]:
"""
Item-to-Item Collaborative Filtering Implementation using Pandas
---------------------------------------------------------------

This script implements item-to-item collaborative filtering without requiring Spark.
It's designed to work with the same data format as the notebook but uses pure Pandas.
"""

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import os

In [21]:
# Configuration
TRAIN_FP = "../datasets/train_clicks.parquet"
VAL_FP = "../datasets/valid_clicks.parquet"
NEIGHBORS_FP = "../datasets/item_neighbors_pandas.parquet"
TOP_K_NEIGH = 50  # number of neighbors to keep per item


def _density(n_interactions, n_users, n_items):
    """Return n_interactions / (n_users * n_items), or NaN when either dimension is 0."""
    cells = n_users * n_items
    return n_interactions / cells if cells else float("nan")


print("1. Loading data...")
# Raise instead of calling exit(1): an exception stops the run with a clear
# traceback, whereas exit() terminates the interpreter/kernel and discards state.
if not os.path.exists(TRAIN_FP):
    raise FileNotFoundError(f"Training file {TRAIN_FP} not found.")

# Training data: one row per distinct (user_id, item_id) pair
interactions = pd.read_parquet(TRAIN_FP)
interactions = interactions.rename(columns={"click_article_id": "item_id"})
interactions = interactions[["user_id", "item_id"]].drop_duplicates()

# Print dataset statistics
num_users = interactions['user_id'].nunique()
num_items = interactions['item_id'].nunique()
num_interactions = len(interactions)

print(f"Dataset statistics:")
print(f"  Users: {num_users}")
print(f"  Items: {num_items}")
print(f"  Interactions: {num_interactions}")
print(f"  Density: {_density(num_interactions, num_users, num_items):.6f}")

# Validation data is optional: without it, later evaluation sees an empty frame
if os.path.exists(VAL_FP):
    val = pd.read_parquet(VAL_FP)
    val = val.rename(columns={"click_article_id": "true_item"})
    val = val[["user_id", "true_item"]].drop_duplicates()

    val_users = val['user_id'].nunique()
    val_items = val['true_item'].nunique()
    val_interactions = len(val)

    print(f"\nValidation users: {val_users}")
    print(f"Validation Items: {val_items}")
    print(f"Validation Interactions: {val_interactions}")
    print(f"Validation Density: {_density(val_interactions, val_users, val_items):.6f}")
else:
    print(f"Warning: Validation file {VAL_FP} not found.")
    val = pd.DataFrame(columns=["user_id", "true_item"])


1. Loading data...
Dataset statistics:
  Users: 65536
  Items: 33320
  Interactions: 2738802
  Density: 0.001254

Validation users: 65536
Validation Items: 4653
Validation Interactions: 65536
Validation Density: 0.000215


In [22]:
# (rows, columns) of the deduplicated training interactions frame.
interactions.shape

(2738802, 2)

In [23]:
# First rows of the training interactions (columns: user_id, item_id).
interactions.head()

Unnamed: 0,user_id,item_id
0,0,26469
1,0,3330
2,0,29902
3,0,59758
4,0,8796


In [24]:
# Last rows. The index labels are those of the frame before drop_duplicates();
# the index was never reset, so labels can exceed the current row count.
interactions.tail()

Unnamed: 0,user_id,item_id
2857104,65535,3436
2857105,65535,63489
2857106,65535,64075
2857107,65535,55297
2857108,65535,59929


In [25]:
# Summary statistics of the two ID columns.
# NOTE(review): both columns look like identifiers, so mean/std are probably not
# meaningful; count/min/max give the row count and ID range — confirm.
interactions.describe()

Unnamed: 0,user_id,item_id
count,2738802.0,2738802.0
mean,30456.81,30019.8
std,19157.42,17370.93
min,0.0,1.0
25%,13486.0,15347.0
50%,28843.0,29060.0
75%,47396.0,39820.0
max,65535.0,65535.0


In [26]:
# Number of distinct values per column (distinct users and distinct items).
interactions.nunique()

user_id    65536
item_id    33320
dtype: int64

In [27]:
# First rows of the validation frame (columns: user_id, true_item).
val.head()

Unnamed: 0,user_id,true_item
0,0,26859
1,1,36162
2,2,30760
3,3,3436
4,4,30800


# 2. Computing item popularity

In [28]:
print("\n2. Computing item popularity...")
# Popularity of an item = number of rows it has in the deduplicated
# `interactions` frame; the result has one row per item_id with the count in `n_i`.
clicks_per_item = interactions.groupby('item_id').size()
item_counts = clicks_per_item.reset_index(name='n_i')


2. Computing item popularity...


In [29]:
# NOTE(review): this cell repeats the previous cell's computation verbatim
# (same print, same groupby); running it only rebuilds `item_counts`.
print("\n2. Computing item popularity...")
# Compute item counts (popularity): one row per item_id, with the number of
# rows that item has in `interactions` stored in column `n_i`.
item_counts = interactions.groupby('item_id').size().reset_index(name='n_i')


2. Computing item popularity...


In [30]:
# Distribution of per-item interaction counts (`n_i`); the `item_id` column of
# the summary describes the identifiers themselves.
print(f"Item counts statistics:")
item_counts.describe()

Item counts statistics:


Unnamed: 0,item_id,n_i
count,33320.0,33320.0
mean,31645.998109,82.196939
std,18518.740181,663.029782
min,1.0,1.0
25%,15419.75,1.0
50%,31256.5,2.0
75%,46567.5,12.0
max,65535.0,31925.0


In [33]:
# First rows of the per-item counts (groupby output, ordered by item_id).
item_counts.head()

Unnamed: 0,item_id,n_i
0,1,1
1,3,1
2,8,1
3,9,1
4,11,1


In [34]:
# Last rows of the per-item counts (largest item_id values).
item_counts.tail()

Unnamed: 0,item_id,n_i
33315,65520,1
33316,65523,1
33317,65527,1
33318,65534,14
33319,65535,30


# 3. Creating user-item matrix

In [35]:
print("\n3. Creating user-item matrix...")
# Distinct user and item IDs in order of first appearance; their positions in
# these arrays are what the index mappings built later enumerate.
user_ids, item_ids = (
    interactions[column].unique() for column in ('user_id', 'item_id')
)


3. Creating user-item matrix...


In [11]:
# Number of distinct users.
user_ids.shape

(65536,)

In [18]:
# Last five user IDs, in the order unique() returned them.
user_ids[-5:]

array([65531, 65532, 65533, 65534, 65535], dtype=uint16)

In [13]:
# Number of distinct items.
item_ids.shape

(33320,)

In [14]:
# First five item IDs, in the order unique() returned them (not sorted).
item_ids[:5]

array([26469,  3330, 29902, 59758,  8796], dtype=uint16)

In [36]:
# Lookup tables between raw IDs and dense 0-based matrix positions.
# A position is simply the ID's offset inside user_ids / item_ids.
user_to_idx = dict(zip(user_ids, range(len(user_ids))))
item_to_idx = dict(zip(item_ids, range(len(item_ids))))
# Reverse lookup: matrix column -> raw item ID
idx_to_item = dict(enumerate(item_ids))

In [37]:
# Translate every interaction row into (row, column) coordinates of the matrix
rows = list(map(user_to_idx.__getitem__, interactions['user_id']))
cols = list(map(item_to_idx.__getitem__, interactions['item_id']))
# Every observed (user, item) pair gets the value 1.0
data = np.ones(len(interactions))

# Users are rows, items are columns
matrix_shape = (len(user_ids), len(item_ids))
user_item_matrix = csr_matrix((data, (rows, cols)), shape=matrix_shape)

In [38]:
# (number of users, number of items) — matches len(user_ids) x len(item_ids).
user_item_matrix.shape

(65536, 33320)

# 4. Computing item similarity matrix

In [41]:
print("\n4. Computing item similarity matrix...")
# Note: this step can be memory-intensive for large datasets
print("   Computing cosine similarity (this may take a while)...")
# Transpose so that each row is an item described by the users who interacted
# with it; dense_output=False keeps the item x item result sparse.
item_user_matrix = user_item_matrix.T
item_similarity = cosine_similarity(item_user_matrix, dense_output=False)



4. Computing item similarity matrix...
   Computing cosine similarity (this may take a while)...


In [43]:
# Square item x item matrix: one row and one column per distinct item.
item_similarity.shape

(33320, 33320)

# 5. Extracting top neighbors for each item

In [45]:
print("\n5. Extracting top neighbors for each item...")
# Extract top-K neighbors for each item
from tqdm import tqdm

# Fixed schema so the table has its columns even when no neighbor is found
NEIGHBOR_COLUMNS = ['item_id', 'neighbor_id', 'sim']
neighbors_list = []

for i in tqdm(range(len(item_ids))):
    item = idx_to_item[i]
    # Dense copy of this item's similarity row
    sim_scores = item_similarity[i].toarray().flatten()

    # An item must never be its own neighbor
    sim_scores[i] = 0

    # Highest scores first, then keep the first K. Slicing after the reversal
    # (rather than [-K:] before it) also yields nothing for K == 0, whereas
    # [-0:] would return every item.
    top_indices = np.argsort(sim_scores)[::-1][:TOP_K_NEIGH]
    top_scores = sim_scores[top_indices]

    # Drop candidates that share no user with this item (similarity 0)
    nonzero_mask = top_scores > 0
    top_indices = top_indices[nonzero_mask]
    top_scores = top_scores[nonzero_mask]

    # One record per (item, neighbor) pair
    for idx, score in zip(top_indices, top_scores):
        neighbors_list.append({
            'item_id': item,
            'neighbor_id': idx_to_item[idx],
            'sim': score
        })

# Convert to DataFrame; explicit columns keep the lookups below (and the later
# save / recommendation cells) working when neighbors_list is empty.
item_neighbors = pd.DataFrame(neighbors_list, columns=NEIGHBOR_COLUMNS)

print(f"\nSample of item neighbors:")
print(item_neighbors.head(10))

# Count how many items have neighbors
items_with_neighbors = item_neighbors['item_id'].nunique()
print(f"Items with at least one neighbor: {items_with_neighbors} out of {num_items}")



5. Extracting top neighbors for each item...


100%|███████████████| 33320/33320 [00:10<00:00, 3300.82it/s]



Sample of item neighbors:
   item_id  neighbor_id       sim
0    26469        27117  0.173968
1    26469        31297  0.091930
2    26469        26060  0.073596
3    26469        10514  0.068871
4    26469         4861  0.067791
5    26469        29902  0.065823
6    26469        27107  0.065420
7    26469        27010  0.058642
8    26469        38622  0.058205
9    26469        38090  0.058058
Items with at least one neighbor: 33320 out of 33320


# 6. Saving item neighbors

In [46]:
# Write the neighbor table (item_id, neighbor_id, sim) to NEIGHBORS_FP as Parquet.
# index=False: the DataFrame index is not stored in the file.
print(f"\n6. Saving item neighbors to {NEIGHBORS_FP}...")
item_neighbors.to_parquet(NEIGHBORS_FP, index=False)



6. Saving item neighbors to ../datasets/item_neighbors_pandas.parquet...


# 7. Generating and evaluating recommendations

In [48]:
print("\n7. Generating recommendations...")

EVAL_KS = (5, 10, 20, 50)  # cut-offs reported as Recall@K
USER_CHUNK_SIZE = 2000     # users scored per vectorized batch; bounds the size of the merge


def _recommend_for_history(history, neighbors, top_n):
    """Score and rank candidate items for every user present in `history`.

    Parameters
    ----------
    history : DataFrame with columns ``user_id`` and ``item_id``.
    neighbors : DataFrame with columns ``item_id``, ``neighbor_id`` and ``sim``.
    top_n : int, number of best-ranked candidates to keep per user.

    Returns
    -------
    DataFrame with columns ``neighbor_id``, ``sim`` (summed similarity),
    ``user_id`` and ``rank`` (1 = best), limited to ``rank <= top_n``.
    """
    history = history[['user_id', 'item_id']]

    # Expand each (user, item) into one row per neighbor of that item
    candidates = history.merge(neighbors, on='item_id', how='inner')
    candidates = candidates[['user_id', 'neighbor_id', 'sim']]

    # Anti-join: discard candidates the user has already interacted with
    seen = history.rename(columns={'item_id': 'neighbor_id'}).drop_duplicates()
    candidates = candidates.merge(
        seen, on=['user_id', 'neighbor_id'], how='left', indicator=True
    )
    candidates = candidates[candidates['_merge'] == 'left_only']

    # A candidate reached through several history items accumulates their similarities
    scores = candidates.groupby(['user_id', 'neighbor_id'], as_index=False)['sim'].sum()

    # Best score first; neighbor_id breaks ties so the ranking is deterministic
    scores = scores.sort_values(
        ['user_id', 'sim', 'neighbor_id'], ascending=[True, False, True]
    )
    scores['rank'] = scores.groupby('user_id').cumcount() + 1

    return scores.loc[scores['rank'] <= top_n, ['neighbor_id', 'sim', 'user_id', 'rank']]


# Generate recommendations for users in validation set
if len(val) > 0:
    print("  Computing recommendations...")

    # Only the training history of validation users is needed. Users without
    # any history get no recommendations.
    val_user_ids = val['user_id'].unique()
    history = interactions[interactions['user_id'].isin(val_user_ids)]
    users_with_history = history['user_id'].unique()

    # Score users in batches with vectorized merges instead of filtering the
    # full interaction and neighbor tables once per user. Only ranks up to
    # max(EVAL_KS) are kept, because no larger rank is ever evaluated.
    all_recs = []
    for start in range(0, len(users_with_history), USER_CHUNK_SIZE):
        chunk_users = users_with_history[start:start + USER_CHUNK_SIZE]
        chunk_history = history[history['user_id'].isin(chunk_users)]
        chunk_recs = _recommend_for_history(chunk_history, item_neighbors, max(EVAL_KS))
        if not chunk_recs.empty:
            all_recs.append(chunk_recs)

    if all_recs:
        recs = pd.concat(all_recs, ignore_index=True)
        recs = recs.rename(columns={'neighbor_id': 'rec_item_id'})

        print("\n8. Evaluating recommendations...")
        # Attach each user's validation item(s) to every recommendation row
        joined = pd.merge(recs, val, on='user_id', how='left')

        # A hit is a recommended item equal to the user's validation item
        joined['hit'] = joined['rec_item_id'] == joined['true_item']

        # Recall@K = users with a hit in their top K / all validation users
        total_val_users = val['user_id'].nunique()
        print(f"Total validation users: {total_val_users}")

        hit_rows = joined[joined['hit']]
        for k in EVAL_KS:
            users_with_hits = hit_rows.loc[hit_rows['rank'] <= k, 'user_id'].nunique()

            recall = users_with_hits / total_val_users
            print(f"Recall@{k}: {recall:.4f} ({users_with_hits} users had their validation item in top {k} recommendations)")
    else:
        print("No recommendations could be generated.")
else:
    print("Skipping recommendation evaluation (no validation data).")

print("\nItem-to-item collaborative filtering completed!")



7. Generating recommendations...
  Computing recommendations...


KeyboardInterrupt: 

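## Data Loading and Preparation (PySpark variant)

The remaining sections repeat the same pipeline with PySpark instead of pandas. Their cells reference `spark`, `F`, and `Window`, which must be defined before they are run. We'll load the training and validation data, keeping only the essential columns for collaborative filtering.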

In [None]:
# ─── 3) Load train clicks DataFrame ─────────────────────────────────────────────
# NOTE(review): `spark` and `F` are not defined in any cell visible in this
# notebook — presumably a SparkSession and `pyspark.sql.functions`; confirm they
# are created before this cell runs.
# NOTE(review): this cell rebinds `interactions`, `num_users`, `num_items` and
# `num_interactions`, which the pandas cells above also define.
clicks = spark.read.parquet(TRAIN_FP)

# Keep only the fields we need for CF and deduplicate
interactions = (
    clicks
    .select("user_id", F.col("click_article_id").alias("item_id"))
    .distinct()  # one interaction per user‐item
)

# Show a sample of the interactions
print("Sample of interactions:")
interactions.show(5)

# Count unique users and items (each count() below triggers a Spark job)
num_users = interactions.select("user_id").distinct().count()
num_items = interactions.select("item_id").distinct().count()
num_interactions = interactions.count()

print(f"Dataset statistics:")
print(f"  Users: {num_users}")
print(f"  Items: {num_items}")
print(f"  Interactions: {num_interactions}")
# Density = observed pairs / all possible (user, item) pairs
print(f"  Density: {num_interactions / (num_users * num_items):.6f}")

## Load Validation Data

We'll load the validation data to use for evaluation later.

In [None]:
# Validation clicks, reduced to one row per distinct (user_id, true_item) pair
val_clicks = spark.read.parquet(VAL_FP)
true_item_col = F.col("click_article_id").alias("true_item")
val = val_clicks.select("user_id", true_item_col).distinct()

print("Sample of validation data:")
val.show(5)

# Number of distinct users present in the validation frame
distinct_val_users = val.select("user_id").distinct()
val_users = distinct_val_users.count()
print(f"Validation users: {val_users}")

## Building Item Similarity Matrix

The core of item-to-item collaborative filtering is building a similarity matrix between items based on user interaction patterns.

In [None]:
# ─── 4) Compute total clicks per item for normalization ─────────────────────────
# n_i = number of interaction rows recorded for each item
rows_per_item = interactions.groupBy("item_id").count()
item_counts = rows_per_item.withColumnRenamed("count", "n_i")

print("Sample of item counts:")
item_counts.show(5)

# Summary statistics of the popularity distribution
popularity_summary = item_counts.describe()
popularity_summary.show()

In [None]:
# ─── 5) Generate co‐click counts via self‐join on user_id ────────────────────────
print("Computing co-occurrence matrix (this may take a while)...")
pairs = (
    interactions.alias("a")
    .join(interactions.alias("b"), on="user_id")
    # Drop self-pairs only. Both orderings (i, j) and (j, i) are kept on purpose:
    # neighbors are later ranked per "a.item_id", so an item must appear on the
    # 'a' side of every pair it belongs to. Filtering with `<` would give each
    # item only higher-id neighbors, and none at all to the largest id.
    .where(F.col("a.item_id") != F.col("b.item_id"))
    # co_count = number of users who interacted with both items
    .groupBy("a.item_id", "b.item_id")
    .agg(F.count("*").alias("co_count"))
)

# Show a sample of the co-occurrence matrix
print("Sample of co-occurrence matrix:")
pairs.show(5)

In [None]:
# ─── 6) Join item counts to compute cosine similarity ───────────────────────────
# Attaches the per-item counts of both sides of every pair, then derives `sim`.
# The count table's key is renamed to a temporary column ("i" / "j") for each
# join and dropped right after, so it does not clash with the pair columns.
print("Computing similarity scores...")
pairs = (
    pairs
    # join on 'a' counts (the second rename maps "n_i" to itself, i.e. is a no-op)
    .join(
        item_counts.withColumnRenamed("item_id", "i").withColumnRenamed("n_i", "n_i"),
        pairs["a.item_id"] == F.col("i")
    )
    .drop("i")
    # join on 'b' counts, exposed as "n_j"
    .join(
        item_counts.withColumnRenamed("item_id", "j").withColumnRenamed("n_i", "n_j"),
        pairs["b.item_id"] == F.col("j")
    )
    .drop("j")
    # cosine similarity = co_count / sqrt(n_i * n_j)
    .withColumn("sim", F.col("co_count") / F.sqrt(F.col("n_i") * F.col("n_j")))
)

# Show a sample with similarity scores
print("Sample with similarity scores:")
pairs.show(5)

In [None]:
# ─── 7) For each item, keep top‐TOP_K_NEIGH neighbors ────────────────────────────
print(f"Selecting top {TOP_K_NEIGH} neighbors per item...")
# Within each "a.item_id" partition, rows are ordered by descending similarity,
# so row_number() == 1 is the most similar neighbor.
# NOTE(review): `Window` is not defined in any visible cell — presumably
# `pyspark.sql.Window`; confirm it is imported.
# NOTE(review): neighbors are ranked per "a.item_id" only, so an item gets a
# neighbor list only from the rows of `pairs` where it is on the 'a' side —
# verify that `pairs` contains both orderings of each pair.
window_spec = Window.partitionBy("a.item_id").orderBy(F.col("sim").desc())

item_neighbors = (
    pairs
    .withColumn("rank", F.row_number().over(window_spec))
    .filter(F.col("rank") <= TOP_K_NEIGH)
    # Output schema: item_id, neighbor_id, sim
    .select(
        F.col("a.item_id").alias("item_id"),
        F.col("b.item_id").alias("neighbor_id"),
        "sim"
    )
)

# Show a sample of the neighbor table
print("Sample of item neighbors:")
item_neighbors.show(10)

# Count how many items have neighbors
items_with_neighbors = item_neighbors.select("item_id").distinct().count()
print(f"Items with at least one neighbor: {items_with_neighbors} out of {num_items}")

In [None]:
# ─── 8) Persist the neighbor table ──────────────────────────────────────────────
# mode("overwrite") replaces whatever already exists at NEIGHBORS_FP.
# NOTE(review): NEIGHBORS_FP is the same path the pandas cell writes with
# to_parquet(); confirm that overwriting that output here is intended.
print(f"Saving item neighbors to {NEIGHBORS_FP}")
item_neighbors.write.mode("overwrite").parquet(NEIGHBORS_FP)

## Generating Recommendations

Now we'll use the item similarity matrix to generate recommendations for users in the validation set.

In [None]:
# Get user history from training data (an alias of `interactions`, not a copy)
user_history = interactions

# Generate recommendations by joining user history with item neighbors:
# every (user, item) row fans out into one row per neighbor of that item.
print("Generating recommendations...")
recs = (
    user_history
    .join(item_neighbors, user_history.item_id == item_neighbors.item_id)
    .select(
        user_history.user_id,
        item_neighbors.neighbor_id.alias("rec_item_id"),
        item_neighbors.sim
    )
    # Remove items the user has already interacted with: the left_anti join keeps
    # only candidate rows with no matching (user_id, rec_item_id) in the history
    .join(
        user_history.select(
            "user_id", 
            F.col("item_id").alias("rec_item_id")
        ),
        on=["user_id", "rec_item_id"],
        how="left_anti"
    )
    # Aggregate to get the best recommendations per user: a candidate reached
    # through several history items gets the sum of those similarities as score
    .groupBy("user_id", "rec_item_id")
    .agg(F.sum("sim").alias("score"))
    # Rank recommendations: rank 1 is the highest score within each user
    .withColumn("rank", F.row_number().over(
        Window.partitionBy("user_id").orderBy(F.col("score").desc())
    ))
)

# Show sample recommendations (up to 10 rows among each user's top 5)
print("Sample recommendations:")
recs.filter(F.col("rank") <= 5).show(10)

## Evaluating Recommendations

Finally, we'll evaluate our recommendations against the validation set.

In [None]:
# Join recommendations with validation data
print("Evaluating recommendations...")
# Rename the validation key first: joining on an expression keeps the key columns
# of both sides, and two columns named "user_id" would make the later
# groupBy("user_id") ambiguous.
val_keys = val.select(F.col("user_id").alias("val_user_id"), "true_item")
joined = (
    recs
    .join(
        val_keys,
        (F.col("user_id") == F.col("val_user_id")) & (F.col("rec_item_id") == F.col("true_item")),
        "left"
    )
    .drop("val_user_id")
)

# Calculate Recall@K
total_val_users = val.select("user_id").distinct().count()
print(f"Total validation users: {total_val_users}")

# true_item is non-null only on rows where the recommended item is the user's
# validation item. Keeping each user's best (smallest) hit rank and caching it
# lets every K reuse one result instead of recomputing the whole pipeline.
best_hit_rank = (
    joined
    .filter(F.col("true_item").isNotNull())
    .groupBy("user_id")
    .agg(F.min("rank").alias("best_rank"))
    .cache()
)

for K in (5, 10, 20, 50):
    # Users whose validation item appears within their top K recommendations
    hits = best_hit_rank.filter(F.col("best_rank") <= K).count()

    recall = hits / total_val_users
    print(f"Recall@{K}: {recall:.4f} ({hits} users had their validation item in top {K} recommendations)")

best_hit_rank.unpersist()

In [None]:
# ─── 9) Stop Spark ──────────────────────────────────────────────────────────────
spark.stop()
print("Spark session stopped.")