In [None]:
# XGBoost regressor (XGBRegressor) baseline.
# Out-of-fold MAE recorded from a previous run: 528.190
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Read the training split and the test-B split; both files are space-delimited.
train, test = (
    pd.read_csv(csv_path, sep=' ')
    for csv_path in (
        '../data/used_car_train_20200313.csv',
        '../data/used_car_testB_20200421.csv',
    )
)

# Stack train on top of test (fresh 0..n-1 index) so that every later
# transformation is applied to both splits at once.
all_data = pd.concat([train, test], ignore_index=True)

# Work with the target on a log scale: price -> log(1 + price).
# Rows whose price is NaN stay NaN under log1p.
all_data = all_data.assign(price=np.log1p(all_data['price']))

In [None]:
# Handle outliers: cap power readings above 600 at 600.
# NaN values pass through unchanged, exactly as with the previous row-wise lambda.
all_data['power'] = all_data['power'].clip(upper=600)

# Split the date fields into integer year / month / day columns.
# The slicing assumes an 8-character YYYYMMDD rendering of each value.
# NOTE(review): confirm against the raw CSVs; a NaN or float-typed date will
# raise here (the previous int(str(x)[...]) version raised on such values too).
for prefix, date_col in (('reg', 'regDate'), ('creat', 'creatDate')):
    date_text = all_data[date_col].astype(str)
    all_data[prefix + '_year'] = date_text.str[:4].astype('int64')
    all_data[prefix + '_month'] = date_text.str[4:6].astype('int64')
    all_data[prefix + '_day'] = date_text.str[6:].astype('int64')

# Convert the damage flag to numeric while keeping its actual value.
# The previous mapping (`0 if x == '-' else 1`) sent every value other than
# '-' to 1, so the distinct known states became indistinguishable, and
# re-running the cell on the already-converted column turned every row into 1.
# Coercion keeps numeric values as they are, turns the '-' placeholder into
# NaN (which XGBoost treats as a missing value), and is idempotent on re-run.
# NOTE(review): assumes the non-'-' entries are numeric strings such as
# '0.0' / '1.0' -- confirm against the raw data.
all_data['notRepairedDamage'] = pd.to_numeric(
    all_data['notRepairedDamage'], errors='coerce'
)

# Bucket the continuous `power` feature into 10 equal-width bins; with
# labels=False each row gets the integer index of its bin.
all_data['power_bucket'] = pd.cut(all_data['power'], 10, labels=False)

# Build pairwise interaction features (sum and difference) between the power
# bucket and a handful of the anonymous v_* features.
# Each unordered pair is visited exactly once. The previous nested loop over
# all ordered pairs produced `b_a_sum` as an exact duplicate of `a_b_sum` and
# `b_a_diff` as the sign-flipped copy of `a_b_diff`, which doubles the number
# of interaction columns without adding information.
new_cols = ['power_bucket', 'v_0', 'v_3', 'v_8', 'v_12']
for i, col1 in enumerate(new_cols):
    for col2 in new_cols[i + 1:]:
        all_data['{}_{}_sum'.format(col1, col2)] = all_data[col1] + all_data[col2]
        all_data['{}_{}_diff'.format(col1, col2)] = all_data[col1] - all_data[col2]

# Handle missing values: replace NaN with 0 in each of these four columns.
for sparse_col in ('fuelType', 'gearbox', 'bodyType', 'model'):
    all_data[sparse_col] = all_data[sparse_col].fillna(0)

# Separate the combined frame again: rows that have a price form the labelled
# training set, rows whose price is NaN form the test set.
has_price = all_data['price'].notna()
train_data = all_data[has_price]
test_data = all_data[~has_price]

# Columns removed from the feature matrices: the row id, `name`, the two raw
# date fields and the target itself.
non_feature_cols = ['SaleID', 'name', 'regDate', 'creatDate', 'price']
X_train = train_data.drop(columns=non_feature_cols)
X_test = test_data.drop(columns=non_feature_cols)

# Target vector (already log1p-transformed).
y_train = train_data['price']

In [None]:
# Model hyper-parameters.
# `early_stopping_rounds` is configured on the estimator, next to
# `eval_metric`. Passing it to `fit()` was deprecated in xgboost 1.6 and the
# keyword was removed in 2.0, where the old call raises TypeError.
# NOTE: `eval_metric='mae'` is evaluated on the log1p-transformed target that
# the model is trained on, not on raw prices.
xgb_model = XGBRegressor(
    max_depth=10,
    learning_rate=0.05,
    n_estimators=1000,
    gamma=0.005,
    subsample=0.9,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=2021,
    eval_metric='mae',
    early_stopping_rounds=100,
)

# 5-fold cross-validation: every fold refits the model, stores its predictions
# for the held-out rows in `oof`, and contributes 1/n_splits of its test-set
# prediction to `test_predict` (i.e. the fold predictions are averaged).
skf = KFold(n_splits=5, shuffle=True, random_state=2021)
oof = np.zeros(len(X_train))
test_predict = np.zeros(len(X_test))
for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    print("Training on Fold {}".format(i+1))
    tr_x, tr_y = X_train.iloc[train_index], y_train.iloc[train_index]
    vl_x, vl_y = X_train.iloc[val_index], y_train.iloc[val_index]

    # The held-out fold is the early-stopping monitor; the evaluation metric
    # is logged every 200 boosting rounds.
    xgb_model.fit(
        tr_x, tr_y,
        eval_set=[(vl_x, vl_y)],
        verbose=200
    )

    oof[val_index] = xgb_model.predict(vl_x)
    test_predict += xgb_model.predict(X_test) / skf.n_splits

# Report the out-of-fold MAE on the original price scale by undoing log1p.
mae = mean_absolute_error(np.expm1(y_train), np.expm1(oof))
print("MAE: {:.3f}".format(mae))

# Save the predictions: one row per test SaleID with the averaged fold
# prediction mapped back from log1p space to a price.
submission = test_data[['SaleID']].assign(price=np.expm1(test_predict))
submission.to_csv('xgb_submission.csv', index=False)