# 06 — Recommendation Engine
Build a hybrid recommendation system: collaborative filtering + content-based + LLM explanations.

## Step 1: Load and Prepare Data

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse

# Load raw data
orders = pd.read_csv('../data/raw/olist_orders_dataset.csv')
items = pd.read_csv('../data/raw/olist_order_items_dataset.csv')
products = pd.read_csv('../data/raw/olist_products_dataset.csv')
customers = pd.read_csv('../data/raw/olist_customers_dataset.csv')
categories = pd.read_csv('../data/raw/product_category_name_translation.csv')
customer_features = pd.read_csv('../data/processed/customer_features.csv')

# Build user-product interaction table
interactions = (
    items[['order_id', 'product_id', 'price']]
    .merge(orders[['order_id', 'customer_id']], on='order_id')
    .merge(customers[['customer_id', 'customer_unique_id']], on='customer_id')
)

print(f'Interactions: {len(interactions):,}')
print(f'Unique users: {interactions["customer_unique_id"].nunique():,}')
print(f'Unique products: {interactions["product_id"].nunique():,}')

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Interactions: 112,650
Unique users: 95,420
Unique products: 32,951


## Step 2: Collaborative Filtering
Build a user-item matrix and use the `implicit` library to find patterns.
We use **ALS (Alternating Least Squares)** — it learns latent factors for each user and product.
Think of it as: each user gets a hidden 'taste vector' and each product gets a hidden 'feature vector'.
Good recommendations happen when these vectors are similar.

In [2]:
from implicit.als import AlternatingLeastSquares

# Create mappings: user/product IDs to integer indices
user_ids = interactions['customer_unique_id'].unique()
product_ids = interactions['product_id'].unique()

user_to_idx = {uid: i for i, uid in enumerate(user_ids)}
product_to_idx = {pid: i for i, pid in enumerate(product_ids)}
idx_to_product = {i: pid for pid, i in product_to_idx.items()}
idx_to_user = {i: uid for uid, i in user_to_idx.items()}

# Build sparse user-item matrix (rows=users, cols=products, values=1 for purchased)
rows = interactions['customer_unique_id'].map(user_to_idx).values
cols = interactions['product_id'].map(product_to_idx).values
values = np.ones(len(interactions))  # binary: purchased or not

user_item = sparse.csr_matrix((values, (rows, cols)), shape=(len(user_ids), len(product_ids)))
print(f'User-item matrix: {user_item.shape}')
print(f'Sparsity: {1 - user_item.nnz / (user_item.shape[0] * user_item.shape[1]):.6f}')

User-item matrix: (95420, 32951)
Sparsity: 0.999968


In [3]:
# Train ALS model
als_model = AlternatingLeastSquares(
    factors=50,       # dimensionality of latent vectors
    regularization=0.1,
    iterations=20,
    random_state=42
)

# implicit expects item-user matrix (transposed)
als_model.fit(user_item)
print('ALS model trained.')

  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

ALS model trained.


In [4]:
# Test: get recommendations for a sample user
sample_user_idx = 0
sample_user_id = idx_to_user[sample_user_idx]

# Get top 5 recommendations
recommended_idx, scores = als_model.recommend(
    sample_user_idx, user_item[sample_user_idx], N=5, filter_already_liked_items=True
)

print(f'Recommendations for user: {sample_user_id[:20]}...')
for idx, score in zip(recommended_idx, scores):
    pid = idx_to_product[idx]
    print(f'  Product: {pid[:20]}...  Score: {score:.3f}')

Recommendations for user: 871766c5855e863f6ecc...
  Product: 54d9ac713e253fa1fae9...  Score: 0.000
  Product: 99a4788cb24856965c36...  Score: 0.000
  Product: 601a360bd2a916ecef0e...  Score: 0.000
  Product: d285360f29ac7fd97640...  Score: 0.000
  Product: 7ce94ab189134e2d3c05...  Score: 0.000


## Step 3: Content-Based Filtering
For users with few purchases (most of them), recommend products similar to what they bought.
We use **cosine similarity** on product features: category, price range, and physical attributes.

In [5]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Prepare product features
prod_features = products.merge(categories, on='product_category_name', how='left')

# Select numeric features + encode category
le = LabelEncoder()
prod_features['category_encoded'] = le.fit_transform(
    prod_features['product_category_name_english'].fillna('unknown')
)

content_cols = ['category_encoded', 'product_weight_g', 'product_length_cm',
                'product_height_cm', 'product_width_cm', 'product_photos_qty']

# Fill nulls and scale
prod_content = prod_features[content_cols].fillna(0)
scaler = StandardScaler()
prod_content_scaled = scaler.fit_transform(prod_content)

# Map product_id to index in this feature matrix
prod_id_to_content_idx = {pid: i for i, pid in enumerate(prod_features['product_id'].values)}

print(f'Product content matrix: {prod_content_scaled.shape}')

Product content matrix: (32951, 6)


In [6]:
def get_content_recommendations(product_id, top_n=5):
    """Given a product, find the most similar products by content features."""
    if product_id not in prod_id_to_content_idx:
        return []
    
    idx = prod_id_to_content_idx[product_id]
    # Compute similarity between this product and all others
    sim_scores = cosine_similarity(
        prod_content_scaled[idx:idx+1], prod_content_scaled
    )[0]
    
    # Get top N (excluding itself)
    top_indices = sim_scores.argsort()[::-1][1:top_n+1]
    
    results = []
    for i in top_indices:
        results.append({
            'product_id': prod_features.iloc[i]['product_id'],
            'category': prod_features.iloc[i]['product_category_name_english'],
            'similarity': sim_scores[i]
        })
    return results

# Test: find products similar to the first product
test_pid = interactions['product_id'].iloc[0]
test_cat = prod_features[prod_features['product_id'] == test_pid]['product_category_name_english'].values[0]
print(f'Products similar to {test_pid[:20]}... (category: {test_cat}):')
for rec in get_content_recommendations(test_pid):
    print(f"  {rec['product_id'][:20]}...  Category: {rec['category']}  Sim: {rec['similarity']:.3f}")

Products similar to 4244733e06e7ecb4970a... (category: cool_stuff):
  54e5939fcd9ae70ad0f5...  Category: cool_stuff  Sim: 0.998
  d65b6607952f9e0d705b...  Category: cool_stuff  Sim: 0.997
  a835dd882c7fb9015ed5...  Category: cool_stuff  Sim: 0.994
  6456fdf43df7d711b98d...  Category: computers_accessories  Sim: 0.993
  8b75356a402f017bf41f...  Category: auto  Sim: 0.989


## Step 4: Hybrid Recommendations
Combine collaborative filtering and content-based into a single recommendation function.
Strategy: use collaborative filtering scores when available, boost with content similarity.

In [7]:
def get_hybrid_recommendations(customer_unique_id, top_n=5):
    """Get hybrid recommendations for a customer."""
    recommendations = {}
    
    # 1. Collaborative filtering (if user exists in the matrix)
    if customer_unique_id in user_to_idx:
        user_idx = user_to_idx[customer_unique_id]
        rec_idx, rec_scores = als_model.recommend(
            user_idx, user_item[user_idx], N=top_n * 2, filter_already_liked_items=True
        )
        for idx, score in zip(rec_idx, rec_scores):
            pid = idx_to_product[idx]
            recommendations[pid] = {'cf_score': float(score), 'content_score': 0.0}
    
    # 2. Content-based: find products similar to what they already bought
    user_products = interactions[interactions['customer_unique_id'] == customer_unique_id]['product_id'].unique()
    for bought_pid in user_products:
        for rec in get_content_recommendations(bought_pid, top_n=top_n):
            pid = rec['product_id']
            if pid not in user_products:  # don't recommend what they already bought
                if pid not in recommendations:
                    recommendations[pid] = {'cf_score': 0.0, 'content_score': 0.0}
                recommendations[pid]['content_score'] = max(
                    recommendations[pid]['content_score'], rec['similarity']
                )
    
    # 3. Combine scores (weighted: 60% CF, 40% content)
    for pid in recommendations:
        recommendations[pid]['hybrid_score'] = (
            0.6 * recommendations[pid]['cf_score'] +
            0.4 * recommendations[pid]['content_score']
        )
    
    # Sort by hybrid score and return top N
    sorted_recs = sorted(recommendations.items(), key=lambda x: x[1]['hybrid_score'], reverse=True)[:top_n]
    
    result = []
    for pid, scores in sorted_recs:
        cat = prod_features[prod_features['product_id'] == pid]['product_category_name_english'].values
        cat = cat[0] if len(cat) > 0 else 'unknown'
        result.append({
            'product_id': pid,
            'category': cat,
            'hybrid_score': scores['hybrid_score'],
            'cf_score': scores['cf_score'],
            'content_score': scores['content_score']
        })
    return result

# Test on a few customers
test_customers = interactions['customer_unique_id'].value_counts().head(3).index.tolist()
for cid in test_customers:
    recs = get_hybrid_recommendations(cid, top_n=3)
    seg = customer_features[customer_features['customer_unique_id'] == cid]['segment_name'].values
    seg = seg[0] if len(seg) > 0 else 'unknown'
    print(f'\nCustomer: {cid[:20]}...  Segment: {seg}')
    for r in recs:
        print(f"  {r['category']:30s}  hybrid={r['hybrid_score']:.3f}  (cf={r['cf_score']:.3f}, content={r['content_score']:.3f})")


Customer: c8460e4251689ba20504...  Segment: High-Value Buyers
  furniture_decor                 hybrid=0.568  (cf=0.946, content=0.000)
  furniture_decor                 hybrid=0.461  (cf=0.768, content=0.000)
  sports_leisure                  hybrid=0.397  (cf=0.000, content=0.992)

Customer: 4546caea018ad8c69296...  Segment: Repeat Loyalists
  health_beauty                   hybrid=0.399  (cf=0.000, content=0.997)
  health_beauty                   hybrid=0.399  (cf=0.000, content=0.997)
  garden_tools                    hybrid=0.398  (cf=0.000, content=0.996)

Customer: c402f431464c72e27330...  Segment: High-Value Buyers
  consoles_games                  hybrid=0.399  (cf=0.000, content=0.997)
  consoles_games                  hybrid=0.399  (cf=0.000, content=0.996)
  christmas_supplies              hybrid=0.398  (cf=0.000, content=0.995)


## Step 5: Evaluation
For users with 2+ orders, hold out the last order and see if we recommend any of those products.
Metric: **Hit Rate @K** — what % of users had at least one held-out product in their top-K recommendations.

In [8]:
# Find users with 2+ orders
user_order_counts = (
    interactions.merge(orders[['order_id', 'order_purchase_timestamp']], on='order_id')
    .groupby('customer_unique_id')['order_id'].nunique()
)
repeat_users = user_order_counts[user_order_counts >= 2].index.tolist()
print(f'Users with 2+ orders: {len(repeat_users):,}')

# For each repeat user: hold out their last purchased product, check if we recommend it
hits = 0
total = 0
K = 10

user_interactions = interactions.merge(orders[['order_id', 'order_purchase_timestamp']], on='order_id')

for uid in repeat_users[:500]:  # sample 500 for speed
    user_data = user_interactions[user_interactions['customer_unique_id'] == uid].sort_values('order_purchase_timestamp')
    last_products = user_data.iloc[-1:]['product_id'].values
    
    recs = get_hybrid_recommendations(uid, top_n=K)
    rec_pids = [r['product_id'] for r in recs]
    
    if any(p in rec_pids for p in last_products):
        hits += 1
    total += 1

print(f'Hit Rate @{K}: {hits/total:.3f} ({hits}/{total})')
print(f'(Baseline random would be ~{K/len(product_ids):.5f})')

Users with 2+ orders: 2,913
Hit Rate @10: 0.000 (0/500)
(Baseline random would be ~0.00030)


## Step 6: LLM-Powered Recommendation Explanations
Use an LLM API to generate natural language explanations for recommendations.
This shows you can integrate LLM APIs into a production ML pipeline.

We use OpenAI here. The API key is loaded from a `.env` file (never hardcode keys in notebooks!).

In [10]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load API key from .env file
load_dotenv('../.env')
client = OpenAI()  # reads OPENAI_API_KEY from environment

def generate_explanation(customer_segment, purchased_categories, recommended_product, rec_category):
    """Generate a natural language explanation for a recommendation."""
    prompt = f"""You are a product recommendation system. Generate a brief, friendly 1-2 sentence explanation
for why this product is being recommended.

Customer segment: {customer_segment}
Products they previously bought (categories): {', '.join(purchased_categories[:5])}
Recommended product category: {rec_category}

Write the explanation as if speaking directly to the customer. Be specific about why this fits them."""
    
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        max_tokens=100,
        messages=[{'role': 'user', 'content': prompt}]
    )
    return response.choices[0].message.content

print('LLM client ready.')

LLM client ready.


In [11]:
# Generate explanations for a sample customer's recommendations
sample_cid = test_customers[0]
sample_seg = customer_features[customer_features['customer_unique_id'] == sample_cid]['segment_name'].values[0]
sample_bought = interactions[interactions['customer_unique_id'] == sample_cid]['product_id'].unique()
bought_cats = []
for pid in sample_bought:
    cat = prod_features[prod_features['product_id'] == pid]['product_category_name_english'].values
    if len(cat) > 0:
        bought_cats.append(cat[0])

recs = get_hybrid_recommendations(sample_cid, top_n=3)

print(f'Customer: {sample_cid[:20]}...  Segment: {sample_seg}')
print(f'Previously bought: {bought_cats}')
print()

for r in recs:
    explanation = generate_explanation(sample_seg, bought_cats, r['product_id'], r['category'])
    print(f"Recommended: {r['category']}")
    print(f"  Why: {explanation}")
    print()

Customer: c8460e4251689ba20504...  Segment: High-Value Buyers
Previously bought: ['telephony']

Recommended: furniture_decor
  Why: We know that as a high-value buyer, you appreciate quality and style in every aspect of your life. That's why we're recommending our premium furniture and decor, tailored to create a harmonious and inspiring workspace that complements your sophisticated taste while enhancing your telephony experience.

Recommended: furniture_decor
  Why: As a valued customer who appreciates quality and sophistication in your telephony choices, we think you'll love our curated selection of furniture and decor. These pieces not only enhance your workspace but also reflect your premium taste, creating an inspiring and stylish environment for your important calls and meetings.

Recommended: sports_leisure
  Why: We know you appreciate quality and value, and with your interest in telephony, we think you'd enjoy our premium sports and leisure products that offer the perfect blen

## Step 7: Save Recommendation Function
Save the models and mappings so the API can use them later.

In [12]:
import pickle
import os

os.makedirs('../models', exist_ok=True)

# Save everything the API will need
rec_data = {
    'als_model': als_model,
    'user_item': user_item,
    'user_to_idx': user_to_idx,
    'idx_to_product': idx_to_product,
    'idx_to_user': idx_to_user,
    'product_to_idx': product_to_idx,
    'prod_content_scaled': prod_content_scaled,
    'prod_id_to_content_idx': prod_id_to_content_idx,
    'prod_features': prod_features[['product_id', 'product_category_name_english']],
    'interactions': interactions[['customer_unique_id', 'product_id']],
}

with open('../models/recommendation_models.pkl', 'wb') as f:
    pickle.dump(rec_data, f)

print(f'Models saved to models/recommendation_models.pkl')
print(f'File size: {os.path.getsize("../models/recommendation_models.pkl") / 1e6:.1f} MB')

Models saved to models/recommendation_models.pkl
File size: 47.2 MB


## Step 8: A/B Test Design (Written Exercise)
**YOUR EXERCISE:** Write a brief A/B test design document answering these questions:

1. **Hypothesis:** What do you expect the recommendation engine to improve? (e.g., click-through rate, repeat purchases, revenue per user)
2. **Control vs Treatment:** What does each group see? (e.g., control = random/popular products, treatment = personalized recommendations)
3. **Randomization:** How do you split users? (random assignment, stratified by segment?)
4. **Primary metric:** What single metric determines success?
5. **Sample size:** How many users and how long to run the test?
6. **Guardrail metrics:** What should NOT get worse? (e.g., customer satisfaction, return rate)

### A/B Test Design

**YOUR ANSWERS HERE:**

1. **Hypothesis:** ...
2. **Control vs Treatment:** ...
3. **Randomization:** ...
4. **Primary metric:** ...
5. **Sample size:** ...
6. **Guardrail metrics:** ...