In [None]:

import numpy as np
import pandas as pd
import csv

In [None]:
data = pd.read_csv("faire-ml-rank-small.csv")

In [None]:
data.shape

(20000, 50)

In [None]:
data.columns.tolist()

['Unnamed: 0.1',
 'Unnamed: 0',
 'product.product_brand_page_click_to_cart_rate',
 'product.product_brand_page_impression_to_click_rate',
 'product.product_click_to_cart_rate_4w',
 'product.product_impression_to_click_rate_4w',
 'product.product_is_high_sell_through',
 'product.product_num_cart_adds_4w',
 'product.product_num_clicks_4w',
 'product.product_num_impressions_4w',
 'product.product_num_search_excess_cart_adds_4w',
 'product.product_num_search_excess_clicks_4w',
 'product.product_num_search_impressions_4w',
 'retailerbrand.retailer_brand_days_since_last_cart_item_added',
 'retailerbrand.retailer_brand_days_since_last_favorite',
 'retailerbrand.retailer_brand_days_since_last_order',
 'retailerbrand.retailer_brand_days_since_last_visit',
 'retailerbrand.retailer_brand_empirical_visit_rate',
 'retailerbrand.retailer_brand_filter_first_order_minimum_dot_product',
 'retailerbrand.retailer_brand_filter_maker_values_cosine_similarity',
 'retailerbrand.retailer_brand_filter_maker_va

In [None]:
# Get a summary of the DataFrame including data types, non-null values, and memory usage
#data.info()


--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 50 columns):
 #   Column                                                                      Non-Null Count  Dtype  
---  ------                                                                      --------------  -----  
 0   Unnamed: 0.1                                                                20000 non-null  int64  
 1   Unnamed: 0                                                                  20000 non-null  int64  
 2   product.product_brand_page_click_to_cart_rate                               19813 non-null  float64
 3   product.product_brand_page_impression_to_click_rate                         19815 non-null  float64
 4   product.product_click_to_cart_rate_4w                                       301 non-null    float64
 5   product.product_impression_to_click_rate_4w                                 4555 non-null   float64
 6   product.product_is_high_sell

In [None]:
# check the unique number of request_id

data['request_id_anon'].nunique()

1447

In [None]:
print("--- Target Variable Distribution ('has_product_click') ---")
click_counts = data['has_product_click'].value_counts()
print(click_counts)
print(f"Click rate: {click_counts[1] / len(data) * 100:.2f}%")

--- Target Variable Distribution ('has_product_click') ---
has_product_click
0    19161
1      839
Name: count, dtype: int64
Click rate: 4.20%


In [None]:
### list all the categorical features， notnull count and unique values

print("--- Object Type Categorical Features ---")
object_cols = data.select_dtypes(include='object')


for col in object_cols.columns:
    # data[col].count() 返回非空值的数量
    not_null_count = data[col].count()
    unique_count = data[col].nunique()

    print(f"'{col}':{not_null_count:<14} | {unique_count:<13} unique values")

--- Object Type Categorical Features ---
'title':20000          | 18091         unique values
'description':19677          | 15450         unique values
'created_at_a':20000          | 1447          unique values
'query_text':20000          | 937           unique values
'filter_string':1719           | 47            unique values


In [None]:
# one hot encoding for "filter_string"

# 生成新的列 'fillter_string_id'
# factorize() 返回一个 tuple：第一个元素是整数编码的数组，第二个元素是唯一的类别列表
data['fillter_string_id'] = data['filter_string'].factorize()[0]

print("已成功创建 'fillter_string_id' 列。")
print(f"新列包含 {data['filter_string'].nunique()} 个唯一的整数 ID。")


data[['filter_string','fillter_string_id']].sample(10)

# target encoding for "query_text"

已成功创建 'fillter_string_id' 列。
新列包含 47 个唯一的整数 ID。


Unnamed: 0,filter_string,fillter_string_id
9303,,-1
14346,,-1
700,"[""wholesale_price:under_10_dollars""]",4
7160,,-1
838,,-1
4862,,-1
19084,,-1
11474,,-1
8622,,-1
19306,,-1


In [None]:
print("--- Missing Values (Percentage & Type) ---")

# 1. Calculate the missing info DataFrame (加入 'Dtype')
missing_info = pd.DataFrame({
    'Missing Count': data.isnull().sum(),
    'Missing Percentage': (data.isnull().sum() / len(data)) * 100,
    'Dtype': data.dtypes  # 获取每一列的数据类型
})

# 2. Filter, sort by Missing Count
missing_percentage_only = missing_info[missing_info['Missing Count'] > 0]
missing_percentage_only = missing_percentage_only.sort_values(by='Missing Count', ascending=False)

# 3. Print both 'Missing Percentage' and 'Dtype'
print(missing_percentage_only[['Missing Percentage', 'Dtype']])

--- Missing Values (Percentage & Type) ---
                                                    Missing Percentage  \
product.product_num_clicks_4w                                   99.895   
product.product_num_cart_adds_4w                                98.495   
product.product_click_to_cart_rate_4w                           98.495   
retailerbrand.retailer_brand_num_brand_orders_1w                94.175   
retailerbrand.retailer_brand_days_since_last_order              94.175   
retailerbrand.retailer_brand_num_brand_orders_12w               94.175   
retailerbrand.retailer_brand_num_brand_orders_4w                94.175   
retailerbrand.retailer_brand_empirical_visit_rate               93.340   
retailerbrand.retailer_brand_days_since_last_fa...              92.575   
filter_string                                                   91.405   
retailerbrand.retailer_brand_num_brand_impressi...              91.095   
retailerbrand.retailer_brand_num_products_added...              88.76

In [None]:
# process with numerical features first

In [None]:
# 1. Identify numerical columns with missing values
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
num_cols_with_missing = [col for col in numerical_cols if data[col].isnull().sum() > 0]

# 2. Set a threshold for "High Missingness" (e.g., 75%)
high_missing_threshold = 0.75

print("--- Processing Numerical Missing Values ---")

for col in num_cols_with_missing:
    # Calculate missing rate
    missing_rate = data[col].isnull().mean()

    if missing_rate > high_missing_threshold:
        # === Strategy: Binary Indicator + Fill with 0 ===
        # Useful for features like 'num_clicks' where missing often means '0 events'

        # Create a new binary feature (1 if we have data, 0 if missing)
        data[f'{col}_is_known'] = data[col].notnull().astype(int)

        # Fill the original missing values with 0
        data[col] = data[col].fillna(0)

        print(f"High Missing ({missing_rate:.1%}): {col} \n   -> Created '{col}_is_known' & filled NaNs with 0")

    else:
        # === Strategy: Median Imputation ===
        # Useful for features like 'impressions' where data is generally available

        median_val = data[col].median()
        data[col] = data[col].fillna(median_val)

        print(f"Low Missing  ({missing_rate:.1%}): {col} \n   -> Filled NaNs with median ({median_val})")

print("\nDone! Check data.info() to verify no numerical nulls remain.")

--- Processing Numerical Missing Values ---
Low Missing  (0.9%): product.product_brand_page_click_to_cart_rate 
   -> Filled NaNs with median (0.3371)
Low Missing  (0.9%): product.product_brand_page_impression_to_click_rate 
   -> Filled NaNs with median (0.0142)
High Missing (98.5%): product.product_click_to_cart_rate_4w 
   -> Created 'product.product_click_to_cart_rate_4w_is_known' & filled NaNs with 0
High Missing (77.2%): product.product_impression_to_click_rate_4w 
   -> Created 'product.product_impression_to_click_rate_4w_is_known' & filled NaNs with 0
Low Missing  (0.4%): product.product_is_high_sell_through 
   -> Filled NaNs with median (0.0)
High Missing (98.5%): product.product_num_cart_adds_4w 
   -> Created 'product.product_num_cart_adds_4w_is_known' & filled NaNs with 0
High Missing (99.9%): product.product_num_clicks_4w 
   -> Created 'product.product_num_clicks_4w_is_known' & filled NaNs with 0
Low Missing  (0.5%): product.product_num_impressions_4w 
   -> Filled NaNs 

In [None]:
data.shape

(20000, 72)

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report

In [None]:
import pandas as pd
import numpy as np

# === 1. Build Training Data (Feature Selection) - 修正版 ===

# --- A. 定义目标变量 (y) ---
# 必须先定义 y，因为 Target Encoding 需要用到它
y = data['has_product_click']

# --- B. 准备数值特征 ---
# 筛选所有数值类型的列 (int64, float64)
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()

# 剔除 Label 和 ID 列
id_cols_to_drop = ['has_product_click', 'request_id_anon', 'retailer_token_anon','fillter_string_id']
numerical_features = [col for col in numeric_cols if col not in id_cols_to_drop]

X_numeric = data[numerical_features]

# --- B. 编码类别特征 ---
# 对 filter_string 进行 One-Hot Encoding (OHE)
filter_dummies = pd.get_dummies(data['fillter_string_id'], prefix='filter_string')



# --- E. 合并特征集 ---
# 使用 pd.concat 将数值特征、OHE 特征和 TE 特征按列合并
X = pd.concat([X_numeric, filter_dummies], axis=1)

print(f"最终特征集 X 的维度: {X.shape[1]} features")
print(f"X 中包含 {len(numerical_features)} 个数值特征, {len(filter_dummies.columns)} 个 OHE 特征")

最终特征集 X 的维度: 114 features
X 中包含 66 个数值特征, 48 个 OHE 特征


In [None]:
# === 2. Split Data (Train / Validation) ===

# 使用 stratify=y 非常重要，因为点击数据是不平衡的 (Imbalanced)，这能保证训练集和验证集里的点击率一致
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain shape: {X_train.shape}, Val shape: {X_val.shape}")



# ### Optional
# # === 2. Non-Leaky Target Encoding (TE) for "query_text" ===

# # 1. 计算全局平均点击率 (仅使用训练集 y_train)
# global_mean = y_train.mean()

# # 2. 计算 TE Map (仅使用训练集 X_train 和 y_train)
# # 将 X_train 和 y_train 合并，计算每个 query_text 的平均点击率
# te_mapping = pd.Series(y_train.values, index=X_train['query_text']).groupby(level=0).mean()


# # 3. 转换数据集 (Transform)

# # A. 转换训练集
# # 使用训练集的映射，填充缺失值 (fillna) 为全局平均值
# X_train['query_text_te'] = X_train['query_text'].map(te_mapping).fillna(global_mean)

# # B. 转换验证集
# # 必须使用**训练集**计算出的 te_mapping 应用于验证集
# X_val['query_text_te'] = X_val['query_text'].map(te_mapping).fillna(global_mean)

# print("--- Target Encoding Complete (Non-Leaky) ---")


Train shape: (16000, 114), Val shape: (4000, 114)


In [None]:
#X_train.columns.tolist()

['Unnamed: 0.1',
 'Unnamed: 0',
 'product.product_brand_page_click_to_cart_rate',
 'product.product_brand_page_impression_to_click_rate',
 'product.product_click_to_cart_rate_4w',
 'product.product_impression_to_click_rate_4w',
 'product.product_is_high_sell_through',
 'product.product_num_cart_adds_4w',
 'product.product_num_clicks_4w',
 'product.product_num_impressions_4w',
 'product.product_num_search_excess_cart_adds_4w',
 'product.product_num_search_excess_clicks_4w',
 'product.product_num_search_impressions_4w',
 'retailerbrand.retailer_brand_days_since_last_cart_item_added',
 'retailerbrand.retailer_brand_days_since_last_favorite',
 'retailerbrand.retailer_brand_days_since_last_order',
 'retailerbrand.retailer_brand_days_since_last_visit',
 'retailerbrand.retailer_brand_empirical_visit_rate',
 'retailerbrand.retailer_brand_filter_first_order_minimum_dot_product',
 'retailerbrand.retailer_brand_filter_maker_values_cosine_similarity',
 'retailerbrand.retailer_brand_filter_maker_va

In [None]:
# === 3. Train the Model (XGBoost) ===

# 建立模型
# scale_pos_weight: 如果数据非常不平衡（比如点击率<5%），可以设置这个参数来增加正样本的权重
# 或者直接用默认参数跑 Baseline

import xgboost as xgb

# --- 1. 准备 DMatrix ---
# 原生 XGBoost 训练需要 DMatrix 格式的数据
# 假设 X_train, y_train, X_val, y_val 已经定义
dtrain = xgb.DMatrix(X_train, y_train)
dval = xgb.DMatrix(X_val, y_val)

# --- 2. 定义参数 (使用 Baseline 参数) ---
params = {
    'objective': 'binary:logistic',  # 目标函数：预测点击概率
    'eval_metric': 'logloss',        # 评估指标
    'eta': 0.1,                      # 学习率 (对应 learning_rate)
    'max_depth': 5,                  # 树的深度
    'seed': 42                       # 随机种子 (对应 random_state)
    # 不平衡数据可以在这里设置 scale_pos_weight
}

# --- 3. 训练模型 (使用 xgb.train) ---
print("\nTraining model with Native API...")

# 使用 watchlist 监控训练和验证集
watchlist = [(dtrain, 'train'), (dval, 'eval')]

model= xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,           # 迭代次数 (对应 n_estimators)
    evals=watchlist,               # 传入监控集
    verbose_eval=10                # 每 10 轮打印一次结果
)


Training model with Native API...
[0]	train-logloss:0.16860	eval-logloss:0.17021
[10]	train-logloss:0.14725	eval-logloss:0.15587
[20]	train-logloss:0.13742	eval-logloss:0.15366
[30]	train-logloss:0.13085	eval-logloss:0.15284
[40]	train-logloss:0.12639	eval-logloss:0.15282
[50]	train-logloss:0.12192	eval-logloss:0.15311
[60]	train-logloss:0.11932	eval-logloss:0.15341
[70]	train-logloss:0.11542	eval-logloss:0.15391
[80]	train-logloss:0.11301	eval-logloss:0.15395
[90]	train-logloss:0.10947	eval-logloss:0.15430
[99]	train-logloss:0.10707	eval-logloss:0.15476


In [None]:


import xgboost as xgb

# 1. 准备 DMatrix
dtrain = xgb.DMatrix(X_train, y_train)
dval = xgb.DMatrix(X_val, y_val)

# 2. 定义参数 (使用你修正后的分类参数)
# reg_alpha: L1, reg_lambda: L2
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.05,
    'max_depth': 4,
    'reg_alpha': 0.05,
    'reg_lambda': 5,
    'tree_method': 'hist',
    'seed': 42
}

# 3. 训练 (原生 API 的 Early Stopping)
watchlist = [(dtrain, 'train'), (dval, 'eval')]

model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    evals=watchlist,
    early_stopping_rounds=10,   # 原生 API 直接支持
    verbose_eval=10
)

[0]	train-logloss:0.17228	eval-logloss:0.17224
[10]	train-logloss:0.16137	eval-logloss:0.16158
[20]	train-logloss:0.15619	eval-logloss:0.15716
[30]	train-logloss:0.15265	eval-logloss:0.15492
[40]	train-logloss:0.14983	eval-logloss:0.15395
[50]	train-logloss:0.14743	eval-logloss:0.15348
[60]	train-logloss:0.14543	eval-logloss:0.15320
[67]	train-logloss:0.14411	eval-logloss:0.15316


In [None]:
from sklearn.metrics import roc_auc_score
import xgboost as xgb


# 假设 model (原生 XGBoost 模型), X_train, y_train, X_val, y_val 都已在前面的代码块中定义和准备好

# --- 1. 准备 DMatrix 格式数据用于预测 ---
# 训练集 DMatrix
dtrain_eval = xgb.DMatrix(X_train)
# 验证集 DMatrix
dval_eval = xgb.DMatrix(X_val)


# --- 2. 获取预测分数 ---
# 原生 API 中，model.predict() 直接返回概率分数
y_pred_proba_val = model.predict(dval_eval)
y_pred_proba_train = model.predict(dtrain_eval)

# --- 3. 计算 ROC AUC ---
auc_val = roc_auc_score(y_val, y_pred_proba_val)
auc_train = roc_auc_score(y_train, y_pred_proba_train)

print(f"\n--- Classification Model Results (Native API) ---")
print(f"Training ROC AUC: {auc_train:.4f}")
print(f"Validation ROC AUC: {auc_val:.4f}")

# 简易过拟合检查
if auc_train > auc_val * 1.1:
    print("\n⚠️ 警示：Training AUC 远高于 Validation AUC，可能存在过拟合。")



--- Classification Model Results (Native API) ---
Training ROC AUC: 0.8297
Validation ROC AUC: 0.7774


In [None]:
# 看看特征重要性 (Feature Importance) - 加分项
# 使用 model.get_score() 方法获取特征重要性
# importance_type='gain' (增益) 是衡量特征贡献最常用的指标
importance = model.get_score(importance_type='gain')

# 将结果转换为 Pandas Series 进行排序和打印
# 注意：使用 X_train.columns 来确保特征名称匹配正确
feature_importances = pd.Series(importance).sort_values(ascending=False)

print("\n--- Top 5 Important Features (by Gain) ---")
print(feature_importances.head(5))


--- Top 5 Important Features (by Gain) ---
page_size                                                       32.676353
position                                                        17.865368
product.product_num_search_excess_clicks_4w                     13.143281
retailerbrand.retailer_brand_num_brand_visits_12w                9.162340
retailerbrand.retailer_brand_days_since_last_cart_item_added     8.541301
dtype: float64


In [None]:
data[['page_size','position']].sample(10)

Unnamed: 0,page_size,position
14981,50,10.0
12754,48,94.0
16771,48,841.0
15142,48,44.0
11132,48,5.0
11081,24,98.0
13982,48,37.0
1867,50,28.0
2094,48,88.0
13444,24,113.0


In [None]:
object_cols = data.select_dtypes(include=['object']).columns.tolist()
print(object_cols)

['title', 'description', 'created_at_a', 'query_text', 'filter_string']


In [None]:
data[object_cols].sample(10)

Unnamed: 0,title,description,created_at_a,query_text,filter_string
12264,Floral Washable Face Masks for Kids,Masks are in high demand at the moment and wit...,2020-05-31 20:45:12.601,kids masks,
683,Bracelet vegan leather blacks with shell,Ethically made high quality vegan Leather doub...,2020-05-31 02:24:46.998,jewelry,"[""maker_minimum:less_or_equal_to_200_dollars""]"
16148,Hand sanitizer w/90% alcohol,Plant based hand sanitizer with 90% alcohol & ...,2020-05-31 06:00:18.042,hand sanitizer with alcohol,
14425,Mesquite Bracelet,Waxed canvas.\nSmall carabiner clasp.,2020-05-31 01:05:44.575,caribiner,
9377,Medium Sparkalicious - Turquoise Duffle,Item Dimensions - 13.5 x 9 x 9\n• Easy grip co...,2020-05-31 00:10:26.119,kids bags,
2997,Classic Seersucker Jon Jon - Blue Seersucker,Boys Jon Jon\n\n- Double buttons on the straps...,2020-05-31 02:02:14.451,seersucker,
971,Shell Pave Inlay Huggie Earrings,One touch gold filled huggie earrings\n15mm ho...,2020-05-31 23:51:57.963,blue mesa,"[""category:jewelry|earrings""]"
19136,Get Well Soon Dachshund (H) Card,,2020-05-31 18:17:16.791,get well,
6974,"Coyote and La Luna Southwestern Decor, Tassels","Coyote and La Luna Pillow, Decorative Pillow, ...",2020-05-31 09:35:38.646,southwestern,
14189,Candied Walnut - Maple Glaze,"Light English walnuts, monksweet sweetener (er...",2020-05-31 22:24:07.203,keto foods,


In [None]:
### Code to Implement These 2 Features

In [None]:
# === Feature 1: Query-Title Match (Relevance) ===

#data['title_has_query'] = data.apply(lambda x: 1 if x['query_text'].lower() in x['title'].lower() else 0, axis=1)

#
def check_query_in_title(row):
    """
    检查搜索词 (query_text) 是否包含在商品标题 (title) 中，不区分大小写。
    返回 1 (包含) 或 0 (不包含)。
    """
    query = row['query_text'].lower()
    title = row['title'].lower()
    if query in title:
        return 1
    else:
        return 0

# 应用函数到 DataFrame 的每一行 (axis=1)
data['title_has_query'] = data.apply(check_query_in_title, axis=1)

In [None]:


# === Feature 2: Days Since Creation (Recency) ===
# FIX: 添加 format='mixed' 来同时处理带毫秒和不带毫秒的时间格式
data['created_at_dt'] = pd.to_datetime(data['created_at_a'], format='mixed')

# Define a reference date (using the max date in the dataset)
current_date = data['created_at_dt'].max()

# Calculate the difference in days
data['days_since_created'] = (current_date - data['created_at_dt']).dt.days

print("--- New Features Created ---")
print(data[['title', 'query_text', 'title_has_query', 'days_since_created']].head())

--- New Features Created ---
                                               title          query_text  \
0                              Leopard Visor Sun Hat             sun hat   
1                                 Beach Ball (Blank)               beach   
2                            Selene’s Love Necklace   gold love necklace   
3  Black Plastic Letter Set for Changeable Felt L...        letter board   
4                             Love Cross Dakota Sign         porch signs   

   title_has_query  days_since_created  
0                1                   0  
1                1                   0  
2                0                   0  
3                1                   0  
4                0                   0  


##List of 10 Additional Features (For the Exam Report)
### For the "List up to 10 more features" part of your exam, here is a strong list that covers different aspects of a recommendation system (Content, Context, User, Product).



* Product Price: (Crucial!) High vs. Low price point relative to the user's spending habits.

* Historical CTR (Smoothed): clicks / impressions with Bayesian smoothing to handle products with few impressions.

* Query-Description Similarity: Using TF-IDF or simple word overlap count between the search query and the product description (richer than just Title).

* Brand Popularity Rank: Where does this brand rank globally on Faire? (e.g., Top 1% Brand).

* Retailer Average Order Value (AOV): Does this retailer usually buy cheap items or luxury items?

* Seasonality Score: Is this product relevant for the current season? (e.g., "Scarf" in Winter vs. Summer).

* Geographic Distance: Distance between the Retailer and the Brand (proxy for shipping cost/time).

* Image Quality/Style: Embedding features from the product main image (e.g., "Boho", "Minimalist").

* Category Match: Does the product category match the dominant category of the search query?

* Reorder Rate: How often do retailers who buy this product come back to buy it again? (Signal of product quality).


### Improvment in 1 week


1. Short Term (1 Week): Feature Engineering & Robustness
Goal: Squeeze the maximum performance out of the current Gradient Boosting framework.

Advanced Text Processing: Currently, we only use basic text matching. I would implement TF-IDF or pre-trained SentenceBERT embeddings on query_text, title, and description to capture semantic similarity (e.g., matching "sofa" to "couch") rather than just exact keyword matching.

Hyperparameter Optimization: Move beyond manual tuning to automated Bayesian Optimization (using Optuna or Hyperopt) to find the optimal max_depth, learning_rate, and subsample parameters.

Error Analysis: Manually inspect the "False Positives" (high rank, no click) and "False Negatives" (low rank, click). Are we failing on new products (Cold Start)? Are we failing on specific categories? This qualitative feedback loop is critical.

Target Encoding: Implement smoothed Target Encoding for high-cardinality categorical features like retailer_token or brand_id to capture historical performance without overfitting.

### Medium Term (1 Month): Learning to Rank (LTR) & Advanced Evaluation
Goal: Shift from "Classifying Clicks" to "Ranking Lists".

Switch to Learning-to-Rank Objective: The current model predicts "Will this be clicked?" (Classification). I would switch the objective function in XGBoost/LightGBM to rank:pairwise (LambdaMART) or rank:ndcg. This optimizes the relative order of items within a request_id rather than their absolute probability, which is the true goal of a search engine.

New Evaluation Metrics: AUC is good for classification, but for ranking, I would implement NDCG@10 (Normalized Discounted Cumulative Gain) and MRR (Mean Reciprocal Rank). These metrics heavily penalize putting a relevant item at position 10 instead of position 1, which matches the user experience.

Position Bias Modeling: Train a separate model to estimate the "propensity to click" based solely on position. During inference, we can debias our predictions to rank items based on relevance, not just screen real estate.

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
# 我们假设 X_train 和 X_val 已经是包含所有特征和 'request_id_anon' 的 DataFrame


print("--- 重新准备 L2R 数据 ---")

# 1. 筛选数值特征，并确保保留 L2R 分组所需的 'request_id_anon'
numeric_df = data.select_dtypes(include=[np.number])

# 目标标签
y = data['has_product_click']

# 应该被移除的列 (Target 和不需要的分组列)
# 注意：我们特意保留了 'request_id_anon'
drop_cols = ['has_product_click', 'retailer_token_anon']

# X 现在包含所有的数值特征，以及最重要的 'request_id_anon'
X = numeric_df.drop(columns=[c for c in drop_cols if c in numeric_df.columns], errors='ignore')

# 2. 重新划分数据
# X_train, X_val 现在都包含 'request_id_anon' 列
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


--- 重新准备 L2R 数据 ---


In [None]:

# --- L2R 步骤 1: 准备分组信息 (与之前相同) ---
# 确保 X_train 中仍然包含 'request_id_anon'
group_train = X_train.groupby('request_id_anon').size().tolist()

# 移除分组列
X_train_rank = X_train.drop(columns=['request_id_anon'])
X_val_rank = X_val.drop(columns=['request_id_anon'])

# --- L2R 步骤 2: 转换数据为 DMatrix 格式 ---
# L2R 训练必须使用 DMatrix，它支持传入分组信息
dtrain = xgb.DMatrix(
    data=X_train_rank,
    label=y_train,
    group=group_train
)

# --- L2R 步骤 3: 定义参数并训练模型 (使用原生 API) ---
params = {
    'objective': 'rank:ndcg', # 核心改变：优化排序指标
    'eval_metric': 'ndcg@5',  # 评估指标
    'eta': 0.1,               # learning_rate (eta)
    'max_depth': 5,
    'tree_method': 'hist'
}

# 训练模型
model_rank = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100  # 相当于 n_estimators
)

print("L2R Model Training Complete using Native XGBoost API.")

# --- L2R 步骤 4: 准备验证数据用于预测 ---
dval = xgb.DMatrix(data=X_val_rank)

L2R Model Training Complete using Native XGBoost API.


In [None]:
from sklearn.metrics import ndcg_score

def calculate_average_ndcg(X_with_qid, y_true, y_score, k=5):
    """
    计算分组数据的平均 NDCG@K

    Args:
        X_with_qid (pd.DataFrame): 包含 'request_id_anon' 的特征集 (e.g., X_val)
        y_true (pd.Series): 真实标签 (e.g., y_val)
        y_score (np.array): 模型预测的分数 (e.g., model.predict_proba(X_val_rank)[:, 1])
        k (int): NDCG 的截止位置
    """
    # 1. 将数据、标签、分数组合成一个 DataFrame
    data_eval = X_with_qid.copy()
    data_eval['y_true'] = y_true.values
    data_eval['y_score'] = y_score

    ndcg_scores = []

    # 2. 按 request_id (query ID) 进行分组循环
    for qid, group in data_eval.groupby('request_id_anon'):

        # 排序：根据模型分数降序排列，确保 y_true 和 y_score 是对应的
        group = group.sort_values(by='y_score', ascending=False)
        y_true_qid = group['y_true'].values
        y_score_qid = group['y_score'].values

        # 确保列表有足够的项目来计算 NDCG@K
        if len(y_true_qid) >= k:
            # ndcg_score 要求输入是列表 of 列表
            ndcg = ndcg_score([y_true_qid], [y_score_qid], k=k)
            ndcg_scores.append(ndcg)

    # 3. 返回平均值
    return np.mean(ndcg_scores)

# --- 验证 NDCG@5 ---
# 1. 获取验证集的分数 (使用 X_val_rank)

y_score_train = model_rank.predict(dtrain)
y_score_val = model_rank.predict(dval)

# 2. 计算平均 NDCG

# 计算训练集 NDCG
avg_ndcg_train = calculate_average_ndcg(
    X_train,           # 传入包含 'request_id_anon' 的 X_train
    y_train,
    y_score_train,
    k=5
)

# 核心修正：第一个参数必须是 X_val（包含分组列 request_id_anon）
avg_ndcg_val = calculate_average_ndcg(
    X_val,           # <-- 传入包含 'request_id_anon' 的 X_val
    y_val,
    y_score_val,
    k=5
)


print("\n--- NDCG@5 结果 ---")
print(f"Training NDCG@{5}: {avg_ndcg_train:.4f}")
print(f"Validation NDCG@{5}: {avg_ndcg_val:.4f}")


--- NDCG@5 结果 ---
Training NDCG@5: 0.3003
Validation NDCG@5: 0.1069


### 3. Long Term (3 Months): Deep Learning & Multi-Objective

OptimizationGoal: Build a state-of-the-art, personalized recommendation engine.Two-Tower Neural Network (Deep Learning): Move to a Deep Learning architecture (e.g., TensorFlow Recommenders). Use a Two-Tower model where one tower learns a "User Embedding" (Retailer history, demographics) and the other learns an "Item Embedding" (Image, Text, Metadata). The dot product of these embeddings gives the relevance score. This captures complex, non-linear interactions that Trees miss.Graph Neural Networks (GNN): Faire is a B2B marketplace with a rich graph of Retailers, Brands, and Products. Implementing a GNN (like PinSage or GraphSAGE) would allow us to propagate information through the graph. If a retailer likes Brand A, and Brand A is similar to Brand B, the GNN can recommend Brand B effectively.Multi-Objective Optimization (MOO): "Clicks" are a proxy, but "GMV" (Sales) is the business goal. I would train a multi-head model that predicts:
1. p(Click)
2. p(AddToCart | Click)
3. p(Purchase | AddToCart)

The final ranking score would be a weighted combination: $Score = \alpha \cdot p(Click) + \beta \cdot p(Conversion) \cdot Price$. This ensures we don't just recommend "click-bait" but actual high-value products.