In [1]:
import numpy as np # linear algebra
import pandas as pd
from datetime import datetime
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import sklearn.linear_model as linear_model

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training and test tables from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [3]:
# Remove the 'ID' and 'Record Status ' columns from both frames, in place.
# The trailing space in 'Record Status ' is part of the column name.
for frame in (train, test):
    frame.drop(columns=['ID', 'Record Status '], inplace=True)

In [4]:
# Remove the 'Secondary Address' and 'Locality' columns from both frames, in place.
for column in ('Secondary Address', 'Locality'):
    for frame in (train, test):
        frame.drop(columns=[column], inplace=True)

In [5]:
# Fill missing 'Postcode', 'Street' and 'District' values with the most frequent
# value of the same column in the same frame.
#
# The filled column is assigned back to the frame instead of calling
# `frame[col].fillna(..., inplace=True)`: the latter is chained in-place
# assignment, which recent pandas versions warn about and which does not
# update the parent frame once Copy-on-Write is enabled.
for frame in (train, test):
    for column in ('Postcode', 'Street', 'District'):
        frame[column] = frame[column].fillna(frame[column].mode()[0])

In [6]:
# Replace the target with log(1 + price). The column is overwritten in place,
# so re-running this cell applies the transform a second time.
train['Price'] = train['Price'].pipe(np.log1p)

In [7]:
# Row counts of each split; `train_counts` is used later to cut the stacked
# frame back into its train and test parts.
train_counts, test_counts = len(train), len(test)

# Separate the target from the predictors, then stack train on top of test so
# that feature engineering is applied to both splits at once.
train_Price = train['Price'].iloc[:train_counts]
train_features = train.drop(columns=['Price'])
test_features = test
features = pd.concat([train_features, test_features], ignore_index=True)

In [8]:
# Parse the 'Date ' column (the trailing space is part of its name) into datetimes.
features = features.assign(
    **{'Date ': lambda frame: pd.to_datetime(frame['Date '])}
)

# Derive month and day-of-month as new features.
features = features.assign(
    Month=lambda frame: frame['Date '].dt.month,
    Day=lambda frame: frame['Date '].dt.day,
)

In [9]:
# Bucket the month of the date into four "seasons". Each tuple is
# (first month, label); with right=False the bins are [1, 4), [4, 7), [7, 10)
# and [10, 13), i.e. calendar quarters.
seasons = [(1, 'Winter'), (4, 'Spring'), (7, 'Summer'), (10, 'Fall')]
bins = [first_month for first_month, _ in seasons] + [13]
labels = [season_name for _, season_name in seasons]
features['Season'] = pd.cut(features['Date '].dt.month, bins=bins, labels=labels, right=False)

# Day of the week taken from the date (Monday=0 ... Sunday=6), plus a 0/1 flag
# marking Saturday and Sunday.
features['Weekday'] = features['Date '].dt.dayofweek
features['Weekend'] = (features['Weekday'] >= 5).astype(int)

In [10]:
# Remove the original 'Date ' column now that the calendar features are extracted.
del features['Date ']

In [11]:
from mean_encoder import MeanEncoder

# Categorical columns handed to the project-local MeanEncoder
# (presumably the high-cardinality ones, judging by the name — TODO confirm).
Large_Num_Features = ['Postcode', 'Street', 'First address', 'Town/City', 'District', 'County']

mean_encoder = MeanEncoder(categorical_features=Large_Num_Features)
# Fit the encoder on the training frame against the log-transformed price.
train = mean_encoder.fit_transform(train, train['Price'])
# Apply the fitted encoder to the combined train + test feature frame.
# NOTE(review): the training rows inside `features` are encoded with transform()
# rather than taken from the fit_transform() result above. If MeanEncoder
# produces out-of-fold encodings in fit_transform(), this bypasses them and may
# leak the target into the training features — confirm against mean_encoder.py.
features = mean_encoder.transform(features)

# Drop the raw categorical columns
# (presumably the encoder added encoded columns alongside them — TODO confirm).
features.drop(Large_Num_Features, axis=1, inplace=True)

            Price           Date   Postcode Property Type Old/New  \
0       13.415034  2019/1/11 0:00   CM3 4BS             D       N   
1       12.206078   2019/1/4 0:00   SS3 9RJ             T       N   
2       13.208543  2019/1/18 0:00   CM3 4UR             D       N   
3       12.506181  2019/1/11 0:00   SS0 9TY             T       N   
4       12.814481  2019/1/16 0:00  SS11 7BB             S       N   
...           ...             ...       ...           ...     ...   
691861  13.304687  2019/1/15 0:00  SW15 2TR             O       N   
691862  14.603968   2019/1/3 0:00  EC2A 4HB             O       N   
691863  13.473022   2019/2/1 0:00   SW9 9JB             T       N   
691864  14.058458  2019/1/25 0:00    E8 4AU             O       N   
691865  14.077876  2019/2/22 0:00  SW12 0BH             T       N   

       Property Rights       First address               Street  \
0                    F    FIR TREE COTTAGE           NORTH HILL   
1                    F               

(553493,) (138373,)
Town/City
ABBOTS LANGLEY     1.979260e-32
ABERAERON          2.543666e-13
ABERDARE          1.746409e-177
ABERDOVEY          1.026188e-10
ABERGAVENNY       3.961430e-107
                      ...      
YATELEY            6.813557e-46
YELVERTON          7.667648e-53
YEOVIL            6.308650e-287
YORK               0.000000e+00
YSTRAD MEURIG      1.670142e-05
Name: beta, Length: 1143, dtype: float64
(553493,) (138373,)
Town/City
ABBOTS LANGLEY     7.281290e-33
ABERAERON          4.658886e-15
ABERDARE          3.507755e-176
ABERDOVEY          2.543666e-13
ABERGAVENNY       1.798486e-111
                      ...      
YATELEY            5.166421e-55
YELVERTON          2.138866e-62
YEOVIL             0.000000e+00
YORK               0.000000e+00
YSTRAD MEURIG      8.315280e-07
Name: beta, Length: 1143, dtype: float64
(553493,) (138373,)
Town/City
ABBOTS LANGLEY     9.602680e-24
ABERAERON          6.305117e-16
ABERDARE          2.659777e-185
ABERDOVEY          1.026188e

In [12]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the remaining categorical columns. One encoder object is
# refitted for each column in turn, so after the loop `le` only holds the
# mapping of the last column.
le = LabelEncoder()
for column in ('Property Type', 'Old/New', 'Property Rights'):
    features[column] = le.fit_transform(features[column])

In [13]:
# One-hot encode 'Category Type' and store its A/B indicator columns as int8.
features = pd.get_dummies(features, columns=['Category Type']).astype(
    {'Category Type_A': 'int8', 'Category Type_B': 'int8'}
)

# One-hot encode 'Season' and store the four indicator columns as int8.
features = pd.get_dummies(features, columns=['Season']).astype(
    {
        'Season_Spring': 'int8',
        'Season_Summer': 'int8',
        'Season_Fall': 'int8',
        'Season_Winter': 'int8',
    }
)

In [14]:
# Cut the stacked feature frame back into its training and test parts:
# the first `train_counts` rows are the training rows, the rest are test rows.
train = features.iloc[:train_counts]
test = features.iloc[train_counts:]

In [15]:
# Model inputs: engineered training features and the log-transformed price.
X_train, y_train = train, train_Price

In [16]:
from sklearn.model_selection import cross_val_predict

# Use 10-fold cross-validation.
n_folds = 10


def rmse_cv(model):
    """Return the RMSE of `model`'s out-of-fold predictions on the training data.

    The notebook-level `X_train` / `y_train` are split into `n_folds` shuffled
    folds (fixed seed 20). Every row is predicted by a model that was fitted on
    the other folds, and a single RMSE is computed over all those predictions.
    """
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=20)
    oof_predictions = cross_val_predict(model, X_train, y_train, cv=splitter)
    return np.sqrt(mean_squared_error(y_train, oof_predictions))

In [18]:
# LightGBM
# NOTE(review): LightGBM documents that row subsampling is only applied when
# `subsample_freq` (bagging_freq) is non-zero; it is not set here, so
# `subsample` may have no effect — confirm whether bagging is intended.
lgbr_params = dict(
    learning_rate=0.01,
    n_estimators=10000,
    max_depth=8,
    num_leaves=256,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=0.001,
    min_child_samples=20,
    reg_alpha=0.01,
    reg_lambda=0.01,
    histogram_pool_size=2048,
    random_state=42,
    n_jobs=-1,
)
lgbr = LGBMRegressor(**lgbr_params)

# XGBoost
# NOTE(review): tree_method='gpu_hist' needs a GPU build and is deprecated in
# XGBoost 2.x in favour of tree_method='hist' with device='cuda' — confirm
# against the installed version.
xgbr_params = dict(
    learning_rate=0.01,
    n_estimators=600,
    tree_method='gpu_hist',
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    seed=42,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=0.1,
)
xgbr = XGBRegressor(**xgbr_params)

# Histogram-based gradient boosting (scikit-learn)
from sklearn.ensemble import HistGradientBoostingRegressor
gbr_params = dict(
    loss='squared_error',
    max_iter=600,
    learning_rate=0.01,
    max_depth=5,
    max_leaf_nodes=500,
    random_state=42,
)
gbr = HistGradientBoostingRegressor(**gbr_params)

# Random forest
# NOTE(review): warm_start only matters when the same fitted estimator is
# refitted with more trees — confirm it is actually needed here.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=5,
    random_state=42,
    warm_start=True,
    oob_score=True,
)
rf = RandomForestRegressor(**rf_params)

In [19]:
# Stack the four base regressors. The meta-regressor uses the random-forest
# configuration and, because use_features_in_secondary=True, is trained on the
# original features in addition to the base models' predictions.
# random_state pins the shuffling of the internal cross-validation folds so
# that repeated runs produce the same stacked model.
stack_model = StackingCVRegressor(regressors=(gbr, lgbr, xgbr, rf),
                                  meta_regressor=rf,
                                  use_features_in_secondary=True,
                                  random_state=42)

In [None]:
# Manual 10-fold evaluation of the stacked model: refit on nine folds and
# score on the held-out fold. The target is log1p(price), so the reported
# RMSE is on the log scale.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

fold_scores = []
for train_index, val_index in kf.split(X_train):
    stack_model_trained = stack_model.fit(X_train.iloc[train_index], y_train.iloc[train_index])
    y_pred = stack_model_trained.predict(X_train.iloc[val_index])
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and removed in newer scikit-learn
    # releases. This also matches how rmse_cv computes its score.
    score = np.sqrt(mean_squared_error(y_train.iloc[val_index], y_pred))
    fold_scores.append(score)
    print("RMSE: {:.4f}".format(score))

# Summarise the folds so the runs can be compared with a single number.
print("Mean RMSE: {:.4f} (std {:.4f})".format(np.mean(fold_scores), np.std(fold_scores)))

RMSE: 0.2780
RMSE: 0.2782
RMSE: 0.2787
RMSE: 0.2774
RMSE: 0.2793
RMSE: 0.2811
RMSE: 0.2816
RMSE: 0.2808
RMSE: 0.2789
