In [129]:
import pandas as pd
import numpy as np
# Load the public train/test splits of the hotel price competition.
# The target is replaced by log1p(target), so everything downstream that
# reads train_df['target'] works on the log scale.
train_df = pd.read_csv('./酒店住宿价格预测挑战赛公开数据/train.csv')
train_df = train_df.assign(target=np.log1p(train_df['target']))
test_df = pd.read_csv('./酒店住宿价格预测挑战赛公开数据/test.csv')

| 特征字段                       | 字段描述                      |
| ------------------------------ | ----------------------------- |
| id                             | 样本标识id                    |
| host_id                        | 酒店id                        |
| neighbourhood_group            | 街区分组                      |
| neighbourhood                  | 街区                          |
| room_type                      | 房间类型                      |
| minimum_nights                 | 最低夜晚数                    |
| number_of_reviews              | 评论数量                      |
| last_review                    | 最新评论时间                  |
| reviews_per_month              | 每月评论数量                  |
| calculated_host_listings_count | 酒店的订单数量                |
| availability                   | 未来365天内房间可以预订的天数 |
| region_1_id                    | 酒店区域ID1                   |
| region_2_id                    | 酒店区域ID2                   |
| region_3_id                    | 酒店区域ID3                   |
| target                         | 酒店住宿价格（已脱敏处理）    |

In [108]:
# Show the first two training rows transposed, so every column is visible as a row.
train_df.head(2).T

Unnamed: 0,0,1
id,0,1.0
host_id,7609,35608.0
neighbourhood_group,0,2.0
neighbourhood,20,183.0
room_type,1,0.0
minimum_nights,4,1.0
number_of_reviews,4,0.0
last_review,2019-03-24,
reviews_per_month,0.09,
calculated_host_listings_count,1,1.0


In [109]:
# Number of training rows per region_1_id, listed in id order.
# sort=False skips the by-frequency sort, since sort_index() reorders anyway.
train_df['region_1_id'].value_counts(sort=False).sort_index()

0     1254
1     1577
2      119
3     1334
4     1497
5     1022
6      324
7      192
8      749
9      593
10     968
11     301
12     907
13    1287
14    1599
15     123
16     265
17    1156
18     265
19    2339
20    1719
21    1202
22     324
23     631
24      44
25    2311
26    1095
27     385
28     897
29    1809
30    1364
31     348
Name: region_1_id, dtype: int64

In [110]:
# Number of test rows per region_2_id, listed in id order.
# sort=False skips the by-frequency sort, since sort_index() reorders anyway.
test_df['region_2_id'].value_counts(sort=False).sort_index()

0       55
1      160
2       50
3      115
4      167
      ... 
123     93
124      7
125    120
126     68
127     95
Name: region_2_id, Length: 128, dtype: int64

In [111]:
# Pearson correlation of every numeric column with the (log1p) target.
# train_df still contains the string column `last_review`; recent pandas versions
# no longer drop non-numeric columns inside corr() and raise instead, so the
# numeric columns are selected explicitly. This works on old and new pandas alike.
train_df.select_dtypes(include='number').corr()['target']

id                               -0.004852
host_id                          -0.009706
neighbourhood_group              -0.020181
neighbourhood                    -0.157885
room_type                         0.468608
minimum_nights                    0.033968
number_of_reviews                -0.046053
reviews_per_month                -0.045008
calculated_host_listings_count    0.140425
availability                      0.100097
region_1_id                       0.122547
region_2_id                      -0.030460
region_3_id                      -0.019899
target                            1.000000
Name: target, dtype: float64

In [130]:
from sklearn.model_selection import cross_val_predict, cross_validate
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

In [131]:
# Feature engineering on the training frame (new columns are added to train_df).
# NOTE(review): the *_mean columns are target means computed over all of train_df,
# so each row's own target contributes to its feature; cross-validated scores
# obtained on these features are likely optimistic.

# Review recency: is a last-review date present, and which year was it.
last_review_train = train_df['last_review']
train_df['last_review_isnull'] = last_review_train.isna()
train_df['last_review_year'] = pd.to_datetime(last_review_train).dt.year

# Per-category target mean and row count.
# Each pair is (source column, prefix used for the "<prefix>_counts" column).
for cat_col, count_prefix in [
    ('neighbourhood_group', 'neighbourhood_group'),
    ('room_type', 'room_type'),
    ('region_1_id', 'region_1'),
    ('region_2_id', 'region_2'),
    ('region_3_id', 'region_3'),
]:
    target_mean_by_cat = train_df.groupby(cat_col)['target'].mean()
    rows_by_cat = train_df[cat_col].value_counts()
    train_df[f'{cat_col}_mean'] = train_df[cat_col].map(target_mean_by_cat)
    train_df[f'{count_prefix}_counts'] = train_df[cat_col].map(rows_by_cat)

# Availability bucketed into 30-day and 7-day bins.
train_df['availability_month'] = train_df['availability'].floordiv(30)
train_df['availability_week'] = train_df['availability'].floordiv(7)

# Interaction of review rate with listing count, and the listing-count total per room type.
listings_sum_by_room = train_df.groupby('room_type')['calculated_host_listings_count'].sum()
train_df['reviews_per_month_count'] = train_df['reviews_per_month'].mul(train_df['calculated_host_listings_count'])
train_df['room_type_calculated_host_listings_count'] = train_df['room_type'].map(listings_sum_by_room)

In [138]:
# Build the same feature columns on the test frame (new columns are added to test_df).
# All category statistics are looked up from train_df, never from test_df;
# a category value that does not occur in train_df maps to NaN.

# Review recency: is a last-review date present, and which year was it.
last_review_test = test_df['last_review']
test_df['last_review_isnull'] = last_review_test.isna()
test_df['last_review_year'] = pd.to_datetime(last_review_test).dt.year

# Per-category target mean and row count, both taken from the training data.
# Each pair is (source column, prefix used for the "<prefix>_counts" column).
for cat_col, count_prefix in [
    ('neighbourhood_group', 'neighbourhood_group'),
    ('room_type', 'room_type'),
    ('region_1_id', 'region_1'),
    ('region_2_id', 'region_2'),
    ('region_3_id', 'region_3'),
]:
    train_target_mean_by_cat = train_df.groupby(cat_col)['target'].mean()
    train_rows_by_cat = train_df[cat_col].value_counts()
    test_df[f'{cat_col}_mean'] = test_df[cat_col].map(train_target_mean_by_cat)
    test_df[f'{count_prefix}_counts'] = test_df[cat_col].map(train_rows_by_cat)

# Availability bucketed into 30-day and 7-day bins.
test_df['availability_month'] = test_df['availability'].floordiv(30)
test_df['availability_week'] = test_df['availability'].floordiv(7)

# Interaction of review rate with listing count, and the training-set
# listing-count total per room type.
train_listings_sum_by_room = train_df.groupby('room_type')['calculated_host_listings_count'].sum()
test_df['reviews_per_month_count'] = test_df['reviews_per_month'].mul(test_df['calculated_host_listings_count'])
test_df['room_type_calculated_host_listings_count'] = test_df['room_type'].map(train_listings_sum_by_room)

In [135]:
# Out-of-fold CatBoost predictions for every training row, on the log1p target.
# No `cv` is passed, so scikit-learn's default splitter is used.
# To compare model families, swap the estimator for
# LGBMRegressor(verbose=0, force_row_wise=True) or XGBRegressor().
val_pred = cross_val_predict(
    CatBoostRegressor(verbose=0, n_estimators=1000),
    train_df.drop(['id', 'target', 'last_review'], axis=1),
    train_df['target'],
)

# Report the error in the original price scale. expm1 is the exact inverse of the
# log1p applied to the target, and scikit-learn metrics take (y_true, y_pred).
mean_absolute_error(np.expm1(train_df['target']), np.expm1(val_pred))

110.4459987795426

In [136]:
# Cross-validate the three model families on the same features and log1p target.
# return_estimator=True keeps the fitted per-fold models under the 'estimator' key
# so they can be reused for test-set prediction.
cv_features = train_df.drop(['id', 'target', 'last_review'], axis=1)
cv_target = train_df['target']

cat_val = cross_validate(
    CatBoostRegressor(verbose=0, n_estimators=1000),
    cv_features, cv_target, return_estimator=True,
)
lgb_val = cross_validate(
    LGBMRegressor(verbose=0, force_row_wise=True),
    cv_features, cv_target, return_estimator=True,
)
xgb_val = cross_validate(
    XGBRegressor(),
    cv_features, cv_target, return_estimator=True,
)


In [139]:
# Test-set prediction: average the fold models' log-scale predictions, then map
# the average back to the price scale.
#
# Everything is done in a single cell on purpose. The previous layout divided and
# exponentiated `pred` in separate self-referential cells, so re-running either
# one silently corrupted the result. The mean is taken over however many
# estimators are listed, so blending in more models, e.g.
# lgb_val['estimator'] + xgb_val['estimator'], needs no hand-edited divisor.
test_features = test_df.drop(['id', 'last_review'], axis=1)
fold_models = list(cat_val['estimator'])

log_pred = np.mean([model.predict(test_features) for model in fold_models], axis=0)

# expm1 is the exact inverse of the log1p applied to the training target.
pred = np.expm1(log_pred)

In [142]:
# Write the submission file. The ids are taken from the test rows themselves
# rather than from a hard-coded range, so each prediction stays aligned with its
# row even if the test set size or id numbering differs from what was assumed.
submission = pd.DataFrame({'id': test_df['id'].to_numpy(), 'target': pred})
submission.to_csv('a.csv', index=False)