In [1]:
'''
数据分析
'''
import pandas as pd


# Load the test and training splits from their CSV files.
test_data = pd.read_csv('../Datasets/ames/test.csv')
train_data = pd.read_csv('../Datasets/ames/train.csv')

In [2]:
# Summarise the training frame: per-column non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [3]:
# Summarise the test frame: per-column non-null counts and dtypes.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [4]:
'''
Data preprocessing
'''
# Take the regression target from the training frame before that frame is
# cleaned and stripped of non-feature columns further down.
y_train = train_data['SalePrice']

def data_preprocess(df, max_missing_ratio=0.2):
    """Drop mostly-missing columns and impute the remaining gaps.

    Each column of ``df`` is handled according to its share of missing values:

    * more than ``max_missing_ratio`` missing -> the column is dropped;
    * otherwise, if it has any missing value, the gaps are filled with the
      most frequent value (``object`` columns) or the mean (``float64``
      columns). Columns of any other dtype are left untouched.

    All statistics are computed from ``df`` itself, and ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    max_missing_ratio : float, default 0.2
        Largest fraction of missing values a column may have and still be
        kept (the comparison is inclusive).

    Returns
    -------
    pandas.DataFrame
        A new frame with the sparse columns removed and the gaps filled.
    """
    missing_counts = df.isna().sum()
    max_missing = len(df) * max_missing_ratio

    fill_values = {}
    sparse_columns = []
    for column, n_missing in missing_counts.items():
        if n_missing > max_missing:
            sparse_columns.append(column)
            continue
        if n_missing == 0:
            # Nothing to impute. Skipping here also keeps zero-row frames from
            # reaching idxmax() on an empty value_counts(), which raises.
            # int64 columns always end up here because they cannot hold NaN.
            continue

        series = df[column]
        if series.dtype == 'object':
            counts = series.value_counts()
            # A column with no observed value at all has no mode to fill with.
            if not counts.empty:
                fill_values[column] = counts.idxmax()
        elif series.dtype == 'float64':
            fill_values[column] = series.mean()

    # Apply every decision in one pass instead of copying the whole frame
    # once per column.
    cleaned = df.drop(columns=sparse_columns)
    if fill_values:
        cleaned = cleaned.fillna(fill_values)
    return cleaned

# Clean each split on its own: every call derives its drop decisions and fill
# values from the frame it is given.
# NOTE(review): because of that, the two splits are not guaranteed to keep the
# same columns, and gaps in the test set are filled with test-set statistics.
# Consider reusing the training-set decisions for the test set.
train_data = data_preprocess(train_data)
test_data = data_preprocess(test_data)

In [5]:
# Build the feature frames: remove the row identifier from both splits and
# the target column from the training split.
X_test = test_data.drop(columns=['Id'])
X_train = train_data.drop(columns=['Id', 'SalePrice'])

In [6]:
# Partition the training features by dtype: object columns are treated as
# categorical, int64/float64 columns as numeric. Any other dtype is ignored.
cate_columns = [
    name for name, dtype in X_train.dtypes.items()
    if dtype == 'object'
]
num_columns = [
    name for name, dtype in X_train.dtypes.items()
    if dtype != 'object' and (dtype == 'int64' or dtype == 'float64')
]

In [7]:
# Extract the numeric features of both splits as plain arrays, using the
# column list derived from the training frame.
num_X_test = X_test.loc[:, num_columns].values
num_X_train = X_train.loc[:, num_columns].values

In [8]:
from sklearn.preprocessing import OneHotEncoder


# handle_unknown='ignore' encodes a category that only appears in the test
# split as an all-zero row instead of aborting the prediction run.
ohe = OneHotEncoder(handle_unknown='ignore')

# One-hot encode the categorical features. The encoder is fitted on the
# training split only and then reused unchanged for the test split.
# .toarray() returns a regular ndarray; .todense() would return numpy.matrix,
# which scikit-learn estimators no longer accept as input.
cate_X_train = ohe.fit_transform(X_train[cate_columns]).toarray()
cate_X_test = ohe.transform(X_test[cate_columns]).toarray()

In [9]:
import numpy as np


# Join the numeric features and the one-hot encoded categorical features
# column-wise. np.asarray normalises the result to a plain ndarray: if either
# input is a numpy.matrix, np.concatenate would otherwise return a matrix too,
# which scikit-learn estimators no longer accept.
X_train = np.asarray(np.concatenate([num_X_train, cate_X_train], axis=1))
X_test = np.asarray(np.concatenate([num_X_test, cate_X_test], axis=1))

In [10]:
'''
采用梯度提升树回归器，并且交叉验证、超参数寻优。
'''
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Candidate ensemble sizes explored by the grid search.
parameters = {'n_estimators': [50, 100, 200, 500, 1000]}

# Seed the estimator so that the cross-validation scores and the selected
# model are reproducible after a kernel restart.
gbr = GradientBoostingRegressor(random_state=42)

# Scores are negated RMSE values (higher is better), hence the sign flip when
# the best score is reported below.
reg = GridSearchCV(gbr, parameters, n_jobs=4, scoring='neg_root_mean_squared_error')

reg.fit(X_train, y_train)

print('最优超参数设定为：%s' %reg.best_params_)

print('交叉验证得到的最佳RMSE为：%f' %-reg.best_score_)

最优超参数设定为：{'n_estimators': 1000}
交叉验证得到的最佳RMSE为：25422.454407


In [11]:
# Predict the sale price for every test row with the fitted grid-search
# model, then write the predictions next to their ids as the submission file.
y_predict = reg.predict(X_test)

submission_columns = {'Id': test_data['Id'], 'SalePrice': y_predict}
submission = pd.DataFrame(submission_columns)

submission.to_csv('../Kaggle_submissions/ames_submission.csv', index=False)