# 0. Load Library

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import os
import warnings
import sys

# Machine Learning Libraries
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import sklearn

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# FPM
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
#CF SVD
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
from collections import Counter
from itertools import combinations

# Configure settings
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None
pd.options.mode.copy_on_write = True
sklearn.set_config(transform_output="pandas")

# Configure matplotlib
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
sns.set()

# Suppress additional warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

print("All libraries imported and configured successfully!")


All libraries imported and configured successfully!


# 1. Data Loading and Preparation

This section handles loading the dataset files and preparing them for analysis.

In [2]:
# Load all CSV files from the dataset directory
files = [file for file in os.listdir('./dataset_looker')]
collection = []
for file in files:
    collection.append(file.rsplit('.', 1)[0])

# Create dictionary to store all dataframes
df = {}
for name in collection:
    df_single = pd.read_csv("./dataset_looker/" + name + ".csv")
    df[name] = df_single
    print(f"Loaded {name}.csv with {len(df_single)} rows")

print(f"\nTotal datasets loaded: {len(df)}")

Loaded distribution_centers.csv with 10 rows
Loaded orders.csv with 125226 rows
Loaded order_items.csv with 181759 rows
Loaded products.csv with 29120 rows
Loaded users.csv with 100000 rows

Total datasets loaded: 5


In [3]:
# --- Create a function to convert data for dataset ---
def convert_dates_robustly(date_series):
    """
    Converts a pandas Series of date strings to datetime objects,
    handling multiple formats (with and without microseconds).

    Args:
        date_series (pd.Series): The column containing date strings.

    Returns:
        pd.Series: The converted column with datetime objects.
    """
    # Define the two different formats we've seen in the data
    format_with_micros = '%Y-%m-%d %H:%M:%S.%f%z'
    format_without_micros = '%Y-%m-%d %H:%M:%S%z'

    # Pass 1: Try converting everything using automatic detection, which is fast.
    converted_series = pd.to_datetime(date_series, errors='coerce', utc=True)
    
    # Pass 2: For any rows that failed, try the specific formats.
    failed_mask = converted_series.isna()
    if failed_mask.any():
        # On the failed rows ONLY, try the format with microseconds
        pass2_micros = pd.to_datetime(date_series[failed_mask], format=format_with_micros, errors='coerce')
        converted_series.update(pass2_micros)

        # Update the failed mask and try the format without microseconds
        failed_mask = converted_series.isna()
        if failed_mask.any():
            pass3_no_micros = pd.to_datetime(date_series[failed_mask], format=format_without_micros, errors='coerce')
            converted_series.update(pass3_no_micros)

    return converted_series

## 1.1 Data Merging and Feature Engineering

Merge multiple datasets and create relevant features for customer analysis.

In [4]:
# Add prefixes to column names for clarity
order_items = df['order_items'].add_prefix('order_items_')
products = df['products'].add_prefix('products_')
users = df['users'].add_prefix('users_')

# Merge order_items with products on product_id
merged = pd.merge(
    order_items,
    products,
    left_on='order_items_product_id',
    right_on='products_id',
    how='left'
)

# Merge the result with users on user_id
combine_df = pd.merge(
    merged,
    users,
    left_on='order_items_user_id',
    right_on='users_id',
    how='left'
)

print(f"Merged dataset created with {len(combine_df)} rows and {len(combine_df.columns)} columns")

# Convert date columns to datetime objects
date_cols = ['order_items_created_at', 'order_items_shipped_at',
             'order_items_delivered_at', 'order_items_returned_at', 'users_created_at']

for col in date_cols:
    if col in combine_df.columns:
        print(f"Converting '{col}' to datetime...")
        combine_df[col] = convert_dates_robustly(combine_df[col])
        # Handle any NaT values that still exist after conversion
        if combine_df[col].isnull().any():
            print(f"Filling {combine_df[col].isnull().sum()} missing/invalid dates in '{col}' with median.")
            median_date = combine_df[col].median()
            combine_df[col].fillna(median_date, inplace=True)
    else:
        print(f"Warning: Date column '{col}' not found in the dataframe.")

# Clean up unused columns
columns_to_drop = [
    'order_items_id', 'order_items_user_id', 'order_items_inventory_item_id',
    'order_items_product_id', 'order_items_shipped_at', 'order_items_delivered_at',
    'order_items_returned_at', 'users_email', 'users_street_address',
    'users_first_name', 'users_last_name', 'users_postal_code',
    'products_sku', 'products_retail_price',
    'products_distribution_center_id'
]

combine_df = combine_df.drop(columns=columns_to_drop)
print(f"Removed {len(columns_to_drop)} unnecessary columns")

Merged dataset created with 181759 rows and 35 columns
Converting 'order_items_created_at' to datetime...
Converting 'order_items_shipped_at' to datetime...
Filling 63478 missing/invalid dates in 'order_items_shipped_at' with median.
Converting 'order_items_delivered_at' to datetime...
Filling 117918 missing/invalid dates in 'order_items_delivered_at' with median.
Converting 'order_items_returned_at' to datetime...
Filling 163527 missing/invalid dates in 'order_items_returned_at' with median.
Converting 'users_created_at' to datetime...
Removed 15 unnecessary columns


In [5]:
# Feature Engineering: Create profit column
combine_df['profit'] = combine_df['order_items_sale_price'] - combine_df['products_cost']

# Analyze order status distribution
print("Order Status Distribution:")
print(combine_df['order_items_status'].value_counts())
print("\n" + "="*50 + "\n")

# Analyze customer countries
print("Customer Country Distribution:")
print(combine_df['users_country'].value_counts())



Order Status Distribution:
order_items_status
Shipped       54440
Complete      45609
Processing    36388
Cancelled     27090
Returned      18232
Name: count, dtype: int64


Customer Country Distribution:
users_country
China             62708
United States     40571
Brasil            26428
South Korea        9600
France             8593
United Kingdom     8281
Germany            7724
Spain              7200
Japan              4303
Australia          3783
Belgium            2066
Poland              473
Colombia             22
España                4
Austria               2
Deutschland           1
Name: count, dtype: int64


### Filter Data & Aggregate Data

In [6]:
combine_df = combine_df[~combine_df['order_items_status'].isin(['Cancelled', 'Returned'])]
agg_df = combine_df.groupby('order_items_order_id').agg({
    'order_items_status': 'first',
    'order_items_created_at': 'first',
    'order_items_sale_price': 'sum',
    'products_cost': 'sum',
    'products_category': lambda x: sorted((x)),
    'products_brand': lambda x: sorted((x.dropna())),
    'products_department': lambda x: sorted((x)),
    'products_name': lambda x: sorted((x.dropna())),
    'products_id': lambda x: sorted((x.dropna())),
    'users_id': 'first',
    'users_age': 'first',
    'users_gender': 'first',
    'users_state': 'first',
    'users_city': 'first',
    'users_country': 'first',
    'users_latitude': 'first',
    'users_longitude': 'first',
    'users_traffic_source': 'first',
    'users_created_at': 'first',
    'profit': 'sum'
}).reset_index()
agg_df = agg_df[agg_df['users_country'].isin(['China', 'United States'])]
agg_df

Unnamed: 0,order_items_order_id,order_items_status,order_items_created_at,order_items_sale_price,products_cost,products_category,products_brand,products_department,products_name,products_id,...,users_age,users_gender,users_state,users_city,users_country,users_latitude,users_longitude,users_traffic_source,users_created_at,profit
2,3,Processing,2023-04-23 09:04:57+00:00,59.990002,30.174971,[Pants],[Dockers],[Men],[Dockers Men's Limited Offer D2 Stretch Khaki ...,[22308],...,16,M,Florida,Hallandale Beach,United States,25.985238,-80.146620,Organic,2023-03-09 09:13:00+00:00,29.815031
3,4,Processing,2023-08-08 06:13:20+00:00,108.750000,47.306250,[Sleep & Lounge],[Tommy Bahama],[Men],[Tommy Bahama Terry Loop Robe],[26696],...,16,M,Florida,Hallandale Beach,United States,25.985238,-80.146620,Organic,2023-03-09 09:13:00+00:00,61.443750
4,5,Shipped,2023-03-10 07:14:45+00:00,109.989998,49.825469,[Active],[SmartWool],[Men],[Smartwool Men's Midweight Bottom],[18177],...,16,M,Florida,Hallandale Beach,United States,25.985238,-80.146620,Organic,2023-03-09 09:13:00+00:00,60.164529
5,6,Complete,2023-04-03 07:50:36+00:00,123.470001,69.546891,"[Jeans, Swim]","[Marc Ecko Cut & Sew, Volcom]","[Men, Men]",[Marc Ecko Cut & Sew Men's Baked Alaska Bootcu...,"[21364, 28050]",...,16,M,Florida,Hallandale Beach,United States,25.985238,-80.146620,Organic,2023-03-09 09:13:00+00:00,53.923110
6,7,Complete,2022-10-18 06:17:54+00:00,148.000000,61.568000,[Sweaters],[Lilly Pulitzer],[Women],[Lilly Pulitzer Women's Charter Sweater],[1035],...,12,F,Texas,Mission Bend,United States,29.693098,-95.648935,Search,2019-09-14 09:31:00+00:00,86.432000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94081,125220,Processing,2023-03-14 23:56:15+00:00,62.870001,30.750200,"[Active, Socks]","[Gildan, Pearl iZUMi]","[Men, Men]",[Gildan 7.75 oz Sweatpant (18200) Available in...,"[18161, 25132]",...,51,M,Guangdong,Nanping,China,23.514312,116.411514,Search,2019-12-30 02:00:00+00:00,32.119801
94082,125221,Processing,2022-12-16 06:35:21+00:00,40.000000,20.760000,[Sweaters],[Original Penguin],[Men],[Original Penguin Men's Kris Sweater],[19164],...,51,M,North Carolina,Candler,United States,35.512934,-82.716829,Search,2022-10-31 07:27:00+00:00,19.240000
94083,125222,Processing,2023-01-28 06:25:04+00:00,55.000000,28.930000,[Tops & Tees],[Woolrich],[Men],[Woolrich Men's Elite Discreet Carry Short Sle...,[16938],...,51,M,North Carolina,Candler,United States,35.512934,-82.716829,Search,2022-10-31 07:27:00+00:00,26.070000
94084,125223,Processing,2023-07-22 05:16:01+00:00,21.990000,10.027440,[Pants],[Allegra K],[Men],[Allegra K Men Belt Loop Pleats Slant Pockets ...,[21923],...,51,M,North Carolina,Candler,United States,35.512934,-82.716829,Search,2022-10-31 07:27:00+00:00,11.962560


In [7]:
#Prepare datetime features
from datetime import datetime,date
agg_df['week'] = agg_df['order_items_created_at'].dt.isocalendar().week
agg_df['hour'] = agg_df['order_items_created_at'].dt.hour
agg_df['is_weekend'] = agg_df['order_items_created_at'].dt.dayofweek >= 5

# Add 'seasonal' and 'non-seasonal' flags based on month
agg_df['month'] = agg_df['order_items_created_at'].dt.month
# Example: Assume months 6-8 (Jun-Aug) are 'seasonal' (summer), rest are 'non-seasonal'
agg_df['is_seasonal'] = agg_df['month'].isin([11,12,1])


## 1.2 Spliting Data

In [8]:
cutoff = agg_df.order_items_created_at.quantile(0.80)
train = agg_df[agg_df.order_items_created_at <  cutoff]
test  = agg_df[agg_df.order_items_created_at >= cutoff]

## 1.3 Data for advance model

In [9]:
# === 3. Build user-item interaction matrix ===
# Map users and items to integer indices
user_ids = train['users_id'].unique()
# Flatten all product ids, convert to string, and get unique values
cat_ids = pd.unique([pid for sublist in train['products_category'] for pid in (sublist if isinstance(sublist, list) else [sublist])])
user_map = {u: i for i, u in enumerate(user_ids)}
cat_map = {i: j for j, i in enumerate(cat_ids)}
inv_cat_map = {j: i for i, j in cat_map.items()}

# Prepare sparse interaction: 1 if user bought item (train only)
rows = train.explode('products_category')['users_id'].map(user_map)
cols = train.explode('products_category')['products_category'].map(cat_map)
data = np.ones(len(rows), dtype=int)

n_users = len(user_map)
n_cat = len(cat_map)
interactions = csr_matrix((data, (rows, cols)), shape=(n_users, n_cat))



## 1.4 Build Popularity Table and helper table

In [10]:
# Build popularity tables on TRAIN only
# Global popularity
item_pop = (
    train.explode('products_id')
         .groupby('products_id')
         .size()
         .reset_index(name='count')
         .sort_values('count', ascending=False)
)

item_to_cat = (
    combine_df[['products_id','products_category']]
    .drop_duplicates()
    .set_index('products_id')['products_category']
    .to_dict()
)

cat_pop = (
    combine_df[combine_df.order_items_created_at <  cutoff]
         .groupby(['products_category','products_id'])
         .size()
         .reset_index(name='count')
         .sort_values(['products_category','count'], ascending=[True,False])
)


cat_only = (
    combine_df[combine_df.order_items_created_at <  cutoff]
         .groupby(['products_category'])
         .size()
         .reset_index(name='count')
         .sort_values(['count'], ascending=False)
)

# Explode categories per order
train_month_long = train.explode('products_category').rename(
    columns={'products_category': 'category'}
)

# Count occurrences per (month, category)
month_cat_counts = (
    train_month_long
    .groupby(['month', 'category'])
    .size()
    .reset_index(name='count')
)

# Build a dict: month -> list of categories sorted by descending count
from collections import defaultdict

month_to_top_categories = defaultdict(list)
for month, group in month_cat_counts.groupby('month'):
    sorted_cats = group.sort_values('count', ascending=False)['category'].tolist()
    month_to_top_categories[month] = sorted_cats
    
    
catalog = set(
    agg_df
    .explode('products_category')['products_category']
    .unique()
)

# 2. Prepare Evaluation Metric


In [11]:
#Evaluation helpers
def precision_at_k(recs, actual, K):
    """Fraction of top-K recommendations that are relevant."""
    if K == 0:
        return 0.0
    return len(set(recs[:K]) & set(actual)) / K

def recall_at_k(recs, actual, K):
    """Fraction of actual relevant items covered in the top-K recommendations."""
    if not actual:
        return 0.0
    return len(set(recs[:K]) & set(actual)) / len(set(actual))

def f1_at_k(recs, actual, K):
    """Harmonic mean of precision@K and recall@K."""
    p = precision_at_k(recs, actual, K)
    r = recall_at_k(recs, actual, K)
    if p + r == 0:
        return 0.0
    return 2 * (p * r) / (p + r)

def hit_rate_at_k(recs, actual, K):
    """1 if any of the top-K recommendations is relevant, else 0."""
    return int(bool(set(recs[:K]) & set(actual)))

def coverage(recs_all, catalog):
    a =[]
    for recs in recs_all:
        for rec in recs:
            a.append(rec)
    recommended = set(a)
    if not catalog:
        return 0.0
    return max(len(recommended) / len(set(catalog)),1)

# 3. Modelling

## 3.1 Baseline


In [13]:
K = 5

#Get the most popular category
# Baseline recommenders
def recommend_baseline(u, K=K):
    return cat_only['products_category'].iloc[:K].tolist()

# === Recommendation function using month-specific baseline ===
def recommend_baseline_by_month(u, K=K):
    # Finds the user's test order month and returns that month's top-K
    user_test = test[test['users_id']==u]
    if user_test.empty:
        return cat_only[:K]
    month = user_test['order_items_created_at'].dt.month.iloc[0]
    top = month_to_top_categories.get(month, cat_only)
    return top[:K]

# 3. Evaluate on test
# Prepare lists to collect recommendations
global_recs = []
month_recs = []

# Collect per-user metrics
results = []
for _, row in test.iterrows():
    u = row['users_id']
    actual = row['products_category']  # list of true categories
    
    # Generate and store recommendations
    rec_global = recommend_baseline(u, K)
    rec_month  = recommend_baseline_by_month(u, K)
    
    global_recs.append(rec_global)
    month_recs.append(rec_month)
    
    # Evaluate for each model
    for name, recs in [
        ('Global Baseline', rec_global),
        ('Month Baseline',  rec_month)
    ]:
        results.append({
            'model':     name,
            'precision': precision_at_k(recs, actual, K),
            'recall':    recall_at_k(recs, actual, K),
            'f1':        f1_at_k(recs, actual, K),
            'hit_rate':  hit_rate_at_k(recs, actual, K)
        })

# Build DataFrame and compute average metrics
eval_df = pd.DataFrame(results)
summary = eval_df.groupby('model').mean()[['precision','recall','f1','hit_rate']]

# Compute global coverage separately
coverage_global = coverage(global_recs, catalog)
coverage_month  = coverage(month_recs,  catalog)

# Add coverage to the summary
summary['coverage'] = [coverage_global, coverage_month]

print(summary)

                 precision    recall        f1  hit_rate  coverage
model                                                             
Global Baseline   0.095926  0.342988  0.144180  0.432238         1
Month Baseline    0.094483  0.335722  0.141787  0.422684         1


## 3.2 FPM Model

In [14]:
# # # One-hot encode
te = TransactionEncoder()
onehot = te.fit(train['products_category']).transform(train['products_category'], sparse=False)
fpm_df = pd.DataFrame(onehot, columns=te.columns_)



# Mine frequent itemsets & association rules
freq_itemsets = fpgrowth(fpm_df, min_support=0.001, use_colnames=True)
rules = association_rules(freq_itemsets, metric="confidence", min_threshold=0.001)

def recommend_fpm_categories(user_cats, K=5):
    """
    Recommend up to K categories based on association rules.
    - If user_cats is empty, fall back to the top-K global rules by confidence.
    - Otherwise, take the highest-confidence applicable rules.
    Never repeats the same category twice.
    
    Returns:
      List of (category, confidence) tuples, length <= K.
    """
    user_cats = set(user_cats or [])
    seen = set()
    recs = []

    # Helper to add consequents from a rule, up to K unique categories
    def add_from_rule(rule):
        nonlocal recs, seen
        for cat in rule['consequents']:
            if cat not in seen:
                seen.add(cat)
                recs.append((cat, rule['confidence']))
                if len(recs) >= K:
                    return True   # done
        return False

    if not user_cats:
        # Global fallback: top-K rules by confidence
        for _, rule in rules.sort_values('confidence', ascending=False).iterrows():
            if add_from_rule(rule):
                break
        return recs

    # Otherwise, find and sort only the applicable rules by confidence
    applicable = [
        rule for _, rule in rules.iterrows()
        if rule['antecedents'].issubset(user_cats)
    ]
    applicable.sort(key=lambda r: r['confidence'], reverse=True)

    for rule in applicable:
        if add_from_rule(rule):
            break

    return [cat for cat, _ in recs]



In [15]:
# === 1. Configuration ===
K = 5

# === 3. Per-user evaluation ===
results = []
all_recs = []

for _, row in test.iterrows():
    actual = row['products_category']   # list of true categories
    recs = recommend_fpm_categories(actual, K)
    all_recs.append(recs)
    
    results.append({
        'precision': precision_at_k(recs, actual, K),
        'recall':    recall_at_k(recs, actual, K),
        'f1':        f1_at_k(recs, actual, K),
        'hit_rate':  hit_rate_at_k(recs, actual, K)
    })

# === 4. Summarize metrics ===
eval_df = pd.DataFrame(results)
summary_fpm = eval_df.mean().to_frame(name='FPM').T

# === 5. Coverage calculation ===
cov = coverage(all_recs, catalog)
summary_fpm['coverage'] = cov

# === 6. Output ===
print("FPM Evaluation Summary:")
print(summary_fpm)

FPM Evaluation Summary:
     precision    recall        f1  hit_rate  coverage
FPM   0.040423  0.082248  0.053356  0.156036         1


## 3.3 CF model

In [16]:
# === User‐Based CF ===
def recommend_cf_user(user_id, K=5):
    if user_id not in user_map:
        return []
    idx = user_map[user_id]
    log_interactions = interactions.copy().astype(float)
    log_interactions.data = np.log1p(log_interactions.data)
    knn = NearestNeighbors(n_neighbors=10, metric='cosine').fit(log_interactions)
    _, neighs = knn.kneighbors(interactions[idx], n_neighbors=K+1)
    similar = [i for i in neighs.flatten() if i != idx][:K]
    agg = np.array(interactions[similar].sum(axis=0)).flatten()
    top_idx = agg.argsort()[::-1][:K]
    return [inv_cat_map[i] for i in top_idx if agg[i] > 0]


# === Per-user evaluation ===
results = []
all_recs = []

for _, row in test.iterrows():
    u = row['users_id']
    actual = row['products_category']  # list of true categories
    recs = recommend_cf_user(u, K=K)
    all_recs.append(recs)
    
    results.append({
        'precision': precision_at_k(recs, actual, K),
        'recall':    recall_at_k(recs, actual, K),
        'f1':        f1_at_k(recs, actual, K),
        'hit_rate':  hit_rate_at_k(recs, actual, K)
    })

# === 4. Summarize metrics ===
eval_df = pd.DataFrame(results)
summary_cf = eval_df.mean().to_frame(name='CF').T

# === 5. Coverage calculation ===
cov = coverage(all_recs, catalog)
summary_cf['coverage'] = cov

# === 6. Output ===
print("CF Evaluation Summary:")
print(summary_cf)


CF Evaluation Summary:
    precision    recall        f1  hit_rate  coverage
CF   0.010059  0.035146  0.015023  0.047954       1.0


## 3.4 SVD

In [17]:
K = 5

# === 4b. SVD + KNN on categories ===
k = 50
svd = TruncatedSVD(n_components=k, random_state=42,n_iter=10)
cat_factors = svd.fit_transform(interactions.T)  # shape: [n_cats × k]

knn_cat = NearestNeighbors(n_neighbors=10, metric='cosine').fit(cat_factors)

def recommend_svd_categories(user_id, K=5):
    """Project user into SVD latent space and find nearest category embeddings."""
    if user_id not in user_map:
        return []
    uidx = user_map[user_id]
    # get the user's log‐scaled vector as a NumPy array
    log_interactions = interactions.copy().astype(float)
    log_interactions.data = np.log1p(log_interactions.data)
    user_vec = log_interactions[uidx].toarray().flatten()   # shape [n_cats]
    # project into latent space
    latent = user_vec @ cat_factors                          # shape [50]
    latent = np.asarray(latent)                              # ensure ndarray
    dists, neighs = knn_cat.kneighbors(latent.reshape(1,-1), n_neighbors=10)
    recs = [inv_cat_map[i] for i in neighs.flatten() if i != uidx]
    return recs[:K]

# === 5. Evaluate on the test set ===
# === Per-user evaluation ===
results = []
all_recs = []

for _, row in test.iterrows():
    u = row['users_id']
    actual = row['products_category']  # list of true categories
    recs = recommend_svd_categories(u, K=K)
    all_recs.append(recs)
    
    results.append({
        'precision': precision_at_k(recs, actual, K),
        'recall':    recall_at_k(recs, actual, K),
        'f1':        f1_at_k(recs, actual, K),
        'hit_rate':  hit_rate_at_k(recs, actual, K)
    })

# === 4. Summarize metrics ===
eval_df = pd.DataFrame(results)
summary_cf = eval_df.mean().to_frame(name='SVD').T

# === 5. Coverage calculation ===
cov = coverage(all_recs, catalog)
summary_cf['coverage'] = cov

# === 6. Output ===
print("SVD Evaluation Summary:")
print(summary_cf)


SVD Evaluation Summary:
     precision    recall        f1  hit_rate  coverage
SVD   0.026637  0.094291  0.039973  0.118479       1.0


## 3.5 Blend SVD + FPM

In [18]:
def recommend_hybrid(user_id, K=5):
    """
    Hybrid SVD‐CF + FPM recommender with user‐specific blending weights:
      - existing users: alpha_CF = 0.6
      - new users:      alpha_CF = 0.2
    """
    # 1. Fetch this user’s past orders
    user_orders = train[train['users_id'] == user_id]
    if user_orders.empty:
        alpha_cf = 0.2
    else:
        alpha_cf = 0.6

    # 2. Seed categories from last order (if any)
    if user_orders.empty:
        seed_cats = []   # will lead to an empty FPM set
    else:
        last = user_orders.sort_values('order_items_created_at').iloc[-1]
        seed_cats = last['products_category']

    # 3. CF candidates (via SVD+KNN)
    svd_recs = recommend_svd_categories(user_id, K)
    cf_scores = {c: 1.0 for c in svd_recs}

    # 4. FPM candidates (via association rules)
    fpm_recs = recommend_fpm_categories(seed_cats, K)
    fpm_scores = {c: 1.0 for c in fpm_recs}

    # 5. Blend with user‐specific weight
    all_cats = set(cf_scores) | set(fpm_scores)
    blended = {
        c: alpha_cf * cf_scores.get(c, 0.0)
           + (1 - alpha_cf) * fpm_scores.get(c, 0.0)
        for c in all_cats
    }

    # 6. Return top‐K by blended score
    return sorted(blended, key=blended.get, reverse=True)[:K]

# === Hybrid model evaluation ===
results_hybrid = []
hyb_recs_all = []

for _, row in test.iterrows():
    u = row['users_id']
    actual = row['products_category']  # list of true categories
    recs = recommend_hybrid(u, K=K)
    hyb_recs_all.append(recs)
    
    results_hybrid.append({
        'precision': precision_at_k(recs, actual, K),
        'recall':    recall_at_k(recs, actual, K),
        'f1':        f1_at_k(recs, actual, K),
        'hit_rate':  hit_rate_at_k(recs, actual, K)
    })

# === Summarize metrics ===
eval_hybrid_df = pd.DataFrame(results_hybrid)
summary_hybrid = eval_hybrid_df.mean().to_frame(name='Hybrid').T

# === Coverage calculation ===
# Assume `catalog` is the set of all categories from the training set
cov_hybrid = coverage(hyb_recs_all, catalog)
summary_hybrid['coverage'] = cov_hybrid

# === Output ===
print("Hybrid Evaluation Summary:")
print(summary_hybrid)


Hybrid Evaluation Summary:
        precision    recall        f1  hit_rate  coverage
Hybrid   0.026637  0.094291  0.039973  0.118479  1.192308
