In [1]:
# pandas读写csv文件
import pandas as pd
import numpy as np

from datetime import datetime


def log_print(*arg, log=True):
    if log:
        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
              *arg,
              flush=True)


path = '/Users/mingjue/downloads/'

df_train_1 = pd.read_csv(path + 'data_format1/train_format1.csv', dtype={'user_id': 'str', 'merchant_id': 'str', 'label': 'float'})
df_test_1 = pd.read_csv(path + 'data_format1/test_format1.csv', dtype={'user_id': 'str', 'merchant_id': 'str', 'label': 'float'})
df_user_info = pd.read_csv(path + 'data_format1/user_info_format1.csv', dtype='str')
df_user_log = pd.read_csv(path + 'data_format1/user_log_format1.csv', dtype='str')
df_train_2 = pd.read_csv(path + 'data_format2/train_format2.csv', dtype='str')
df_test_2 = pd.read_csv(path + 'data_format2/test_format2.csv', dtype='str')
df_infer = pd.read_csv(path + 'sample_submission.csv', dtype='str')

# df_user_info
# age_range 和 gender 有缺失值
# age_range 的取值为0-8，0表示缺失值（age_range一般对应年龄大小顺序，根据分布判断0表示未知）
# gender 的取值为0-2，2表示缺失值(根据官网描述，2和NULL表示未知)
# 将df_user_info的 age_range 列  字段转换为float， 取值0都替换为空值,
df_user_info['age_range'] = df_user_info['age_range'].astype(float)
df_user_info['age_range'] = df_user_info['age_range'].replace(0, np.nan)
# 将df_user_info的 gender 列 中 取值为2替换为空值
df_user_info['gender'] = df_user_info['gender'].replace('2', np.nan)

# df_user_log
df_user_log.rename(columns={'seller_id': 'merchant_id'}, inplace=True)
# time_stamp 的取值为0511到1112共186天，表示mmdd，没有年份，请转换为标准日期格式
df_user_log['time_stamp'] = pd.to_datetime('2025' + df_user_log['time_stamp'], format='%Y%m%d')
df_user_log['recency'] = (pd.to_datetime('2025-11-12') - df_user_log['time_stamp']).dt.days + 1

# 计算用户rfm, 数据没有monetary，所以不能计算rfm，只能计算rf
# recency：距离最后一次消费的天数
# frequency：消费的次数



In [2]:
df_user_log['recency'].describe()

count    5.492533e+07
mean     5.732774e+01
std      5.795207e+01
min      1.000000e+00
25%      4.000000e+00
50%      3.400000e+01
75%      1.060000e+02
max      1.860000e+02
Name: recency, dtype: float64

In [3]:
# 计算用户特征
def compute_rf(d, groupby_cols):
    if ['user_id'] == groupby_cols:
        d_rf =  d.groupby('user_id').agg(
            r=('recency', 'min'),
            f=('user_id', 'count'), 
            f_item=('item_id', 'nunique'),
            f_cat=('cat_id', 'nunique'), 
            f_merchant=('merchant_id', 'nunique'),
            f_brand=('brand_id', 'nunique'),
        ).reset_index()
    elif ['user_id', 'merchant_id'] == groupby_cols:
        d_rf =  d.groupby(['user_id', 'merchant_id']).agg(
            r=('recency', 'min'),
            f=('user_id', 'count'), 
            f_item=('item_id', 'nunique'),
            f_cat=('cat_id', 'nunique'), 
            f_brand=('brand_id', 'nunique'),    
        ).reset_index()
    elif ['merchant_id'] == groupby_cols:
        d_rf =  d.groupby('merchant_id').agg(
            r=('recency', 'min'),
            f=('user_id', 'count'), 
            f_user=('user_id', 'nunique'),
            f_item=('item_id', 'nunique'),
            f_cat=('cat_id', 'nunique'), 
            f_brand=('brand_id', 'nunique'),    
        ).reset_index()
    else:
        raise ValueError('groupby_cols must be ["user_id"], ["user_id", "merchant_id"] or ["merchant_id"]')
    return d_rf


# user_id 维度
df_user_rf = compute_rf(df_user_log, ['user_id'])
log_print('df_user_rf all shape:', df_user_rf.shape)
for action_type in ['0', '1', '2', '3']:
    df_user_rf_t = compute_rf(df_user_log.query(f'("{action_type}"==action_type)'), ['user_id'])
    df_user_rf = df_user_rf.merge(df_user_rf_t, on='user_id', how='left', suffixes=('', f'_t{action_type}'))
    log_print(f'df_user_rf {action_type} shape:', df_user_rf.shape)
for recency in [7, 14, 30, 60, 90]:
    df_user_rf_r = compute_rf(df_user_log.query(f'(recency<={recency})'), ['user_id'])
    df_user_rf = df_user_rf.merge(df_user_rf_r, on='user_id', how='left', suffixes=('', f'_r{recency}'))
    log_print(f'df_user_rf {recency} shape:', df_user_rf.shape)
for recency in [7, 14, 30, 60, 90]:
    for action_type in ['0', '1', '2', '3']:
        df_user_rf_r_t = compute_rf(df_user_log.query(f'(recency<={recency}) & ("{action_type}"==action_type)'), ['user_id'])
        df_user_rf = df_user_rf.merge(df_user_rf_r_t, on='user_id', how='left', suffixes=('', f'_r{recency}_t{action_type}'))
        log_print(f'df_user_rf {action_type} {recency} shape:', df_user_rf.shape)
        

# user_id, merchant_id 维度
df_user_merchant_rf = compute_rf(df_user_log, ['user_id', 'merchant_id'])
log_print(f'df_user_merchant_rf all shape:', df_user_merchant_rf.shape)
for action_type in ['0', '1', '2', '3']:
    df_user_merchant_rf_t = compute_rf(df_user_log.query(f'("{action_type}"==action_type)'), ['user_id', 'merchant_id'])
    df_user_merchant_rf = df_user_merchant_rf.merge(df_user_merchant_rf_t, on=['user_id', 'merchant_id'], how='left', suffixes=('', f'_t{action_type}'))
    log_print(f'df_user_merchant_rf {action_type} shape:', df_user_merchant_rf.shape)
for recency in [7, 14, 30, 60, 90]:
    df_user_merchant_rf_r = compute_rf(df_user_log.query(f'(recency<={recency})'), ['user_id', 'merchant_id'])
    df_user_merchant_rf = df_user_merchant_rf.merge(df_user_merchant_rf_r, on=['user_id', 'merchant_id'], how='left', suffixes=('', f'_r{recency}'))
    log_print(f'df_user_merchant_rf {recency} shape:', df_user_merchant_rf.shape)
for recency in [7, 14, 30, 60, 90]:
    for action_type in ['0', '1', '2', '3']:
        df_user_merchant_rf_r_t = compute_rf(df_user_log.query(f'(recency<={recency}) & ("{action_type}"==action_type)'), ['user_id', 'merchant_id'])
        df_user_merchant_rf = df_user_merchant_rf.merge(df_user_merchant_rf_r_t, on=['user_id', 'merchant_id'], how='left', suffixes=('', f'_r{recency}_t{action_type}'))
        log_print(f'df_user_merchant_rf {action_type} {recency} shape:', df_user_merchant_rf.shape)


# merchant_id 维度
df_merchant_rf = compute_rf(df_user_log, ['merchant_id'])
log_print('df_merchant_rf all shape:', df_merchant_rf.shape)
for action_type in ['0', '1', '2', '3']:
    df_merchant_rf_t = compute_rf(df_user_log.query(f'("{action_type}"==action_type)'), ['merchant_id'])
    df_merchant_rf = df_merchant_rf.merge(df_merchant_rf_t, on='merchant_id', how='left', suffixes=('', f'_t{action_type}'))
    log_print(f'df_merchant_rf {action_type} shape:', df_merchant_rf.shape)
for recency in [7, 14, 30, 60, 90]:
    df_merchant_rf_r = compute_rf(df_user_log.query(f'(recency<={recency})'), ['merchant_id'])
    df_merchant_rf = df_merchant_rf.merge(df_merchant_rf_r, on='merchant_id', how='left', suffixes=('', f'_r{recency}')) 
    log_print(f'df_merchant_rf {recency} shape:', df_merchant_rf.shape)
for recency in [7, 14, 30, 60, 90]:
    for action_type in ['0', '1', '2', '3']:
        df_merchant_rf_r_t = compute_rf(df_user_log.query(f'(recency<={recency}) & ("{action_type}"==action_type)'), ['merchant_id'])
        df_merchant_rf = df_merchant_rf.merge(df_merchant_rf_r_t, on='merchant_id', how='left', suffixes=('', f'_r{recency}_t{action_type}'))
        log_print(f'df_merchant_rf {action_type} {recency} shape:', df_merchant_rf.shape)


2025-10-16 14:44:35.852940 df_user_rf all shape: (424170, 7)
2025-10-16 14:47:13.795232 df_user_rf 0 shape: (424170, 13)
2025-10-16 14:47:15.426043 df_user_rf 1 shape: (424170, 19)
2025-10-16 14:47:24.375547 df_user_rf 2 shape: (424170, 25)
2025-10-16 14:47:32.106942 df_user_rf 3 shape: (424170, 31)
2025-10-16 14:48:16.717488 df_user_rf 7 shape: (424170, 37)
2025-10-16 14:49:13.212967 df_user_rf 14 shape: (424170, 43)
2025-10-16 14:50:30.195110 df_user_rf 30 shape: (424170, 49)
2025-10-16 14:52:07.385135 df_user_rf 60 shape: (424170, 55)
2025-10-16 14:54:10.593735 df_user_rf 90 shape: (424170, 61)
2025-10-16 14:54:48.936048 df_user_rf 0 7 shape: (424170, 67)
2025-10-16 14:54:50.770726 df_user_rf 1 7 shape: (424170, 73)
2025-10-16 14:54:54.740068 df_user_rf 2 7 shape: (424170, 79)
2025-10-16 14:54:57.211324 df_user_rf 3 7 shape: (424170, 85)
2025-10-16 14:55:47.086622 df_user_rf 0 14 shape: (424170, 91)
2025-10-16 14:55:49.058997 df_user_rf 1 14 shape: (424170, 97)
2025-10-16 14:55:53.0

In [7]:
df_user_rf.to_parquet(path +'df_user_rf.parquet')
df_user_merchant_rf.to_parquet(path +'df_user_merchant_rf.parquet')
df_merchant_rf.to_parquet(path +'df_merchant_rf.parquet')

In [11]:
df_merchant_rf.columns.to_list()

['merchant_id',
 'r',
 'f',
 'f_user',
 'f_item',
 'f_cat',
 'f_brand',
 'r_t0',
 'f_t0',
 'f_user_t0',
 'f_item_t0',
 'f_cat_t0',
 'f_brand_t0',
 'r_t1',
 'f_t1',
 'f_user_t1',
 'f_item_t1',
 'f_cat_t1',
 'f_brand_t1',
 'r_t2',
 'f_t2',
 'f_user_t2',
 'f_item_t2',
 'f_cat_t2',
 'f_brand_t2',
 'r_t3',
 'f_t3',
 'f_user_t3',
 'f_item_t3',
 'f_cat_t3',
 'f_brand_t3',
 'r_r7',
 'f_r7',
 'f_user_r7',
 'f_item_r7',
 'f_cat_r7',
 'f_brand_r7',
 'r_r14',
 'f_r14',
 'f_user_r14',
 'f_item_r14',
 'f_cat_r14',
 'f_brand_r14',
 'r_r30',
 'f_r30',
 'f_user_r30',
 'f_item_r30',
 'f_cat_r30',
 'f_brand_r30',
 'r_r60',
 'f_r60',
 'f_user_r60',
 'f_item_r60',
 'f_cat_r60',
 'f_brand_r60',
 'r_r90',
 'f_r90',
 'f_user_r90',
 'f_item_r90',
 'f_cat_r90',
 'f_brand_r90',
 'r_r7_t0',
 'f_r7_t0',
 'f_user_r7_t0',
 'f_item_r7_t0',
 'f_cat_r7_t0',
 'f_brand_r7_t0',
 'r_r7_t1',
 'f_r7_t1',
 'f_user_r7_t1',
 'f_item_r7_t1',
 'f_cat_r7_t1',
 'f_brand_r7_t1',
 'r_r7_t2',
 'f_r7_t2',
 'f_user_r7_t2',
 'f_item_r7_t

In [5]:
# df_infer = pd.read_csv(path + 'sample_submission.csv')
# # 简单的预测，随机大于0小于1的小数，比如0.5、0.8，均值0.2
# df_infer['prob'] = np.random.rand(len(df_infer))
# # 保存结果到prediction.csv文件中
# df_infer.to_csv(path + 'prediction.csv', index=False)

In [15]:
# df_user_rf = pd.read_parquet(path +'df_user_rf.parquet')
# df_user_merchant_rf = pd.read_parquet(path +'df_user_merchant_rf.parquet')
# df_merchant_rf = pd.read_parquet(path +'df_merchant_rf.parquet')

df_train = df_train_1.merge(
    df_user_info, on='user_id', how='left',
).merge(
    df_user_rf, on='user_id', how='left'
).merge(
    df_user_merchant_rf, on=['user_id', 'merchant_id'], how='left', suffixes=('', '_um')
).merge(
    df_merchant_rf, on='merchant_id', how='left', suffixes=('', '_m')
)
df_train['gender'] = df_train['gender'].astype(str)

df_test = df_test_1.merge(
    df_user_info, on='user_id', how='left',
).merge(
    df_user_rf, on='user_id', how='left', 
).merge(
    df_user_merchant_rf, on=['user_id', 'merchant_id'], how='left', suffixes=('', '_um')
).merge(
    df_merchant_rf, on='merchant_id', how='left', suffixes=('', '_m')
)
df_test['gender'] = df_test['gender'].astype(str)
# 将所有的f开头字段缺失补零 
cols_f = [col for col in df_train.columns if col.startswith('f')]
df_train[cols_f].fillna(0, inplace=True)
df_test[cols_f].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[cols_f].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[cols_f].fillna(0, inplace=True)


In [16]:
for r in ['', '_r7', '_r14', '_r30', '_r60', '_r90']:
    for t in ['_t0', '_t1', '_t2', '_t3']:
        for col in ['f', 'f_item', 'f_cat', 'f_merchant', 'f_brand']:
            df_train[f'p{col}{r}{t}'] = df_train[f'{col}{r}{t}'] / (df_train[f'{col}{r}'] + 1e-5)
            df_test[f'p{col}{r}{t}'] = df_test[f'{col}{r}{t}'] / (df_test[f'{col}{r}'] + 1e-5)
        for col in ['f', 'f_item', 'f_cat', 'f_brand']:
            df_train[f'p{col}{r}{t}_um'] = df_train[f'{col}{r}{t}_um'] / (df_train[f'{col}{r}_um'] + 1e-5)
            df_test[f'p{col}{r}{t}_um'] = df_test[f'{col}{r}{t}_um'] / (df_test[f'{col}{r}_um'] + 1e-5)
        for col in ['f', 'f_item', 'f_cat', 'f_brand']:
            df_train[f'p{col}{r}{t}_m'] = df_train[f'{col}{r}{t}_m'] / (df_train[f'{col}{r}_m'] + 1e-5)
            df_test[f'p{col}{r}{t}_m'] = df_test[f'{col}{r}{t}_m'] / (df_test[f'{col}{r}_m'] + 1e-5)
df_train.to_parquet(path +'df_train.parquet')
df_test.to_parquet(path +'df_test.parquet')

  df_train[f'p{col}{r}{t}'] = df_train[f'{col}{r}{t}'] / (df_train[f'{col}{r}'] + 1e-5)
  df_test[f'p{col}{r}{t}'] = df_test[f'{col}{r}{t}'] / (df_test[f'{col}{r}'] + 1e-5)
  df_train[f'p{col}{r}{t}'] = df_train[f'{col}{r}{t}'] / (df_train[f'{col}{r}'] + 1e-5)
  df_test[f'p{col}{r}{t}'] = df_test[f'{col}{r}{t}'] / (df_test[f'{col}{r}'] + 1e-5)
  df_train[f'p{col}{r}{t}_um'] = df_train[f'{col}{r}{t}_um'] / (df_train[f'{col}{r}_um'] + 1e-5)
  df_test[f'p{col}{r}{t}_um'] = df_test[f'{col}{r}{t}_um'] / (df_test[f'{col}{r}_um'] + 1e-5)
  df_train[f'p{col}{r}{t}_um'] = df_train[f'{col}{r}{t}_um'] / (df_train[f'{col}{r}_um'] + 1e-5)
  df_test[f'p{col}{r}{t}_um'] = df_test[f'{col}{r}{t}_um'] / (df_test[f'{col}{r}_um'] + 1e-5)
  df_train[f'p{col}{r}{t}_um'] = df_train[f'{col}{r}{t}_um'] / (df_train[f'{col}{r}_um'] + 1e-5)
  df_test[f'p{col}{r}{t}_um'] = df_test[f'{col}{r}{t}_um'] / (df_test[f'{col}{r}_um'] + 1e-5)
  df_train[f'p{col}{r}{t}_um'] = df_train[f'{col}{r}{t}_um'] / (df_train[f'{col}{

In [17]:
df_train.columns

Index(['user_id', 'merchant_id', 'label', 'age_range', 'gender', 'r', 'f',
       'f_item', 'f_cat', 'f_merchant',
       ...
       'pf_merchant_r90_t3', 'pf_brand_r90_t3', 'pf_r90_t3_um',
       'pf_item_r90_t3_um', 'pf_cat_r90_t3_um', 'pf_brand_r90_t3_um',
       'pf_r90_t3_m', 'pf_item_r90_t3_m', 'pf_cat_r90_t3_m',
       'pf_brand_r90_t3_m'],
      dtype='object', length=827)

In [None]:
# df_train catboost 二分类 训练推理
# 固定随机种子，保证结果可复现
np.random.seed(42)
# 训练代码 
from catboost import CatBoostClassifier, Pool  
from sklearn.model_selection import train_test_split

# 'user_id', 'merchant_id', 'label', 'age_range', 'gender', 
features = [col for col in df_train.columns if col not in ['user_id', 'merchant_id', 'label']]
target = 'label'
X = df_train[features]
y = df_train[target].astype(float)
X_test = df_test[features]        
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
train_pool = Pool(X_train, y_train, cat_features=['gender'])
val_pool = Pool(X_val, y_val, cat_features=['gender'])  
model = CatBoostClassifier(iterations=10000, learning_rate=0.5, depth=2, eval_metric='AUC', random_seed=42, logging_level='Verbose', use_best_model=True)
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=1000, verbose=100)
y_pred = model.predict_proba(X_test)[:, 1]
df_test['prob'] = y_pred
df_test[['user_id', 'merchant_id', 'prob']].to_csv(path + 'prediction.csv', index=False)  
print(path + 'prediction.csv saved!')


0:	test: 0.5528450	best: 0.5528450 (0)	total: 312ms	remaining: 52m 1s
100:	test: 0.6715798	best: 0.6715798 (100)	total: 17.8s	remaining: 29m 4s
200:	test: 0.6821439	best: 0.6821439 (200)	total: 33.5s	remaining: 27m 15s
300:	test: 0.6874260	best: 0.6874260 (300)	total: 48.5s	remaining: 26m 3s
400:	test: 0.6900139	best: 0.6900139 (400)	total: 1m 4s	remaining: 25m 34s
500:	test: 0.6908130	best: 0.6909056 (491)	total: 1m 19s	remaining: 25m 5s
600:	test: 0.6915640	best: 0.6916394 (591)	total: 1m 33s	remaining: 24m 26s
700:	test: 0.6918758	best: 0.6918758 (700)	total: 1m 48s	remaining: 23m 54s
800:	test: 0.6919133	best: 0.6919976 (716)	total: 2m 4s	remaining: 23m 50s
900:	test: 0.6923620	best: 0.6924584 (880)	total: 2m 20s	remaining: 23m 41s
1000:	test: 0.6922991	best: 0.6926582 (935)	total: 2m 35s	remaining: 23m 15s
1100:	test: 0.6919947	best: 0.6926582 (935)	total: 2m 50s	remaining: 22m 55s
1200:	test: 0.6917013	best: 0.6926582 (935)	total: 3m 5s	remaining: 22m 38s
1300:	test: 0.6921308	be