12.20 2:25 by zyf 
网站测试分数0.6790183

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [None]:
# Load the four raw CSV tables from the data_format1 directory.
# The column lists below are the original author's notes, not checked here.

# Behaviour log: user_id, item_id, cat_id, merchant_id, brand_id, time_stamp, action_type
user_log = pd.read_csv('../data/data_format1/user_log_format1.csv')

# User profile: user_id, age_range, gender
user_info = pd.read_csv('../data/data_format1/user_info_format1.csv')

# Labelled pairs: user_id, merchant_id, label
train_df = pd.read_csv('../data/data_format1/train_format1.csv')

# Pairs to score: user_id, merchant_id, prob (empty, to be filled in)
test_df = pd.read_csv('../data/data_format1/test_format1.csv')

In [None]:
# Attach the user profile columns to both the labelled and the unlabelled pairs.
train_df = train_df.merge(user_info, how='left', on='user_id')
test_df = test_df.merge(user_info, how='left', on='user_id')

# Gender code 2 and missing gender are both mapped to -1;
# a missing age_range is filled with 0.
gender_recode = {2: -1, np.nan: -1}
for frame in (train_df, test_df):
    frame['gender'] = frame['gender'].replace(gender_recode)
    frame['age_range'] = frame['age_range'].fillna(0)

In [None]:
def get_time_period(x):
    """Map a numeric time stamp to a period label.

    The window 1101..1111 (inclusive) is treated as the campaign period,
    as assumed by the original author.

    Args:
        x: Numeric time stamp, compared directly against 1101 and 1111.

    Returns:
        'early' when x < 1101, 'campaign' when 1101 <= x <= 1111,
        and 'after' in every other case.
    """
    if x < 1101:
        return 'early'
    if x <= 1111:
        return 'campaign'
    return 'after'

# Label every log row with its time period.
# This is a vectorised form of get_time_period: np.select takes the first
# matching condition and falls back to 'after', so it yields the same labels
# (including for NaN, where both comparisons are False) without calling a
# Python function once per row. Keep the thresholds in sync with
# get_time_period.
_time_stamp = user_log['time_stamp']
user_log['time_period'] = np.select(
    [(_time_stamp < 1101).to_numpy(), (_time_stamp <= 1111).to_numpy()],
    ['early', 'campaign'],
    default='after',
)

In [None]:
# Interaction features for every (user_id, merchant_id) pair in the log.
pair_keys = ['user_id', 'merchant_id']
group = user_log.groupby(pair_keys)

# Per-action-type counts. Each count is the grouped sum of a boolean
# indicator column, which gives the same integers as counting matches with a
# lambda inside agg() but runs as one vectorised aggregation instead of one
# Python call per group per lambda.
_action_type = user_log['action_type']
_action_flags = user_log[pair_keys].assign(
    action_count=_action_type.notna(),   # same as 'count': non-null rows only
    click_count=_action_type.eq(0),
    cart_count=_action_type.eq(1),
    purchase_count=_action_type.eq(2),
    fav_count=_action_type.eq(3),
)
agg_feat = _action_flags.groupby(pair_keys).sum().reset_index()

# Number of distinct items / categories / brands touched within each pair.
item_count = group['item_id'].nunique().reset_index(name='unique_item_count')
cat_count = group['cat_id'].nunique().reset_index(name='unique_cat_count')
brand_count = group['brand_id'].nunique().reset_index(name='unique_brand_count')

# Combine everything into a single feature table keyed by the pair.
feat = agg_feat.merge(item_count, on=['user_id','merchant_id'], how='left')
feat = feat.merge(cat_count, on=['user_id','merchant_id'], how='left')
feat = feat.merge(brand_count, on=['user_id','merchant_id'], how='left')


In [None]:
# Action counts per (user_id, merchant_id) pair, split by time period.
# unstack() only creates a column for periods that actually occur in the log,
# so the columns are pinned with reindex(): a period with no rows becomes an
# all-zero column instead of causing a KeyError when the '<period>_actions'
# columns are read later. The order is the alphabetical order unstack()
# produces, so the feature column order is unchanged when all periods exist.
period_order = ['after', 'campaign', 'early']
time_group = (
    user_log.groupby(['user_id','merchant_id','time_period'])['action_type']
    .count()
    .unstack(fill_value=0)
    .reindex(columns=period_order, fill_value=0)
)
time_group.columns = [f'{col}_actions' for col in time_group.columns]  # after_actions, campaign_actions, early_actions
time_group = time_group.reset_index()

feat = feat.merge(time_group, on=['user_id','merchant_id'], how='left')

# Any value left missing by the left merges is treated as zero.
feat.fillna(0, inplace=True)

# Ratio features: share of purchases and clicks among all actions, categories
# per distinct item, and the share of actions falling in each time period.
# The 1e-9 added to every denominator avoids division by zero.
total_actions = feat['action_count'] + 1e-9

feat['purchase_ratio'] = feat['purchase_count'].div(total_actions)
feat['click_ratio'] = feat['click_count'].div(total_actions)
feat['cat_diversity'] = feat['unique_cat_count'].div(feat['unique_item_count'] + 1e-9)

for ratio_col, count_col in (
    ('early_ratio', 'early_actions'),
    ('campaign_ratio', 'campaign_actions'),
    ('after_ratio', 'after_actions'),
):
    feat[ratio_col] = feat[count_col].div(total_actions)

# Outlier handling: cap each count-type feature at its own 99th percentile.
count_cols = [
    'action_count', 'click_count', 'cart_count', 'purchase_count', 'fav_count',
    'unique_item_count', 'unique_cat_count', 'unique_brand_count',
    'early_actions', 'campaign_actions', 'after_actions',
]

for col_name in count_cols:
    upper_bound = feat[col_name].quantile(0.99)
    # Element-wise minimum against the threshold: values above it are
    # replaced by the threshold, everything else is kept.
    feat[col_name] = np.minimum(feat[col_name].to_numpy(), upper_bound)



In [None]:
# Join the pair-level features onto the train and test pairs; pairs with no
# matching feature row end up with zeros.
train_df = train_df.merge(feat, how='left', on=['user_id','merchant_id']).fillna(0)
test_df = test_df.merge(feat, how='left', on=['user_id','merchant_id']).fillna(0)

# Every column except the identifiers and the target/prediction columns is
# used as a model feature.
excluded_cols = ['user_id','merchant_id','label','prob']
feature_cols = [name for name in train_df.columns if name not in excluded_cols]

X = train_df[feature_cols].to_numpy()
y = train_df['label'].to_numpy()
X_test = test_df[feature_cols].to_numpy()

# Hold out 20% of the labelled pairs for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2024,
)

# Train a LightGBM binary classifier, evaluated by AUC on the held-out split.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

params = {
    # Task definition
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    # Tree size and step size
    'learning_rate': 0.01,
    'num_leaves': 64,
    # Column and row subsampling
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    # Reproducibility
    'seed': 2024,
}

# Stop once the validation metric has not improved for 50 rounds, and log the
# evaluation result every 50 rounds.
callbacks = [lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=50)]

print("Training LightGBM with time-based features and outlier handling...")
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_val],
    callbacks=callbacks,
)

# Score the validation split with the best iteration found by early stopping.
best_iter = gbm.best_iteration
val_pred = gbm.predict(X_val, num_iteration=best_iter)
val_auc = roc_auc_score(y_val, val_pred)
print(f"Validation AUC: {val_auc:.4f}")

# Score the test pairs, keep four decimals, and write the submission file.
test_pred = gbm.predict(X_test, num_iteration=best_iter)
test_df['prob'] = np.round(test_pred, 4)
result = test_df.loc[:, ['user_id','merchant_id','prob']]
result.to_csv('prediction_time_outlier.csv', index=False, float_format='%.4f')
print("prediction_time_outlier.csv 已生成")