# 概述

[House Prices - Advanced Regression Techniques](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview)

這份房價資料集是由 Dean De Cock 編制，79 個特徵（幾乎）描述愛荷華州艾姆斯(Ames, Iowa)住宅資訊。<br>
我們的目標是**根據房屋特徵來預測房屋的最終價格**。

# 載入套件

In [1]:
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.svm import SVR

import warnings
warnings.filterwarnings('ignore')

# 載入資料集

In [2]:
# 讀取訓練資料集和測試資料集
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

print('訓練資料集和測試資料集成功!\n')
print(f'訓練資料集: {train_data.shape[0]} rows, {train_data.shape[1]} columns')
print(f'測試資料集: {test_data.shape[0]} rows, {test_data.shape[1]} columns')

訓練資料集和測試資料集成功!

訓練資料集: 1460 rows, 81 columns
測試資料集: 1459 rows, 80 columns


In [3]:
all_data = pd.concat([train_data, test_data]).reset_index(drop=True)

print(f'合併資料集: {all_data.shape[0]} rows, {all_data.shape[1]} columns')
print(f"SalePrice NaN 數量(測試資料集數量): {all_data['SalePrice'].isnull().sum()}")

合併資料集: 2919 rows, 81 columns
SalePrice NaN 數量(測試資料集數量): 1459


# 資料處理

## 處理缺失值

In [4]:
# LotFrontage（街道長度）在不同的街區（Neighborhood）可能有不同的房屋特徵，故根據各自街區計算街道長度的中位數填補
all_data['LotFrontage'].fillna(all_data.groupby('Neighborhood')['LotFrontage'].transform('median'), inplace=True)

# Alley（巷道類型）由於大多數房屋沒有巷道，可以填補為"None"
all_data['Alley'].fillna('None', inplace=True)

# MasVnrType（磚石飾面類型）可以填補為"None"，表示沒有磚石飾面
all_data['MasVnrType'].fillna('None', inplace=True)
# MasVnrArea（磚石飾面面積）可以填補為0，因為當MasVnrType是"None"時，MasVnrArea 應該為0
all_data['MasVnrArea'].fillna(0, inplace=True)

# BsmtQual、BsmtCond、BsmtExposure、BsmtFinType1、BsmtFinType2
# 地下室相關的特徵可能表示沒有地下室，可以填補為 "None"
bsmt_features = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
all_data[bsmt_features] = all_data[bsmt_features].fillna('None')

# Electrical（電氣系統）只有一個缺失值，建議填補為最常見的值"SBrkr"
all_data['Electrical'].fillna(all_data['Electrical'].mode()[0], inplace=True)

# FireplaceQu（壁爐質量評分）可以填補為 "None"，表示沒有壁爐。
all_data['FireplaceQu'].fillna('None', inplace=True)

# GarageType、GarageYrBlt、GarageFinish、GarageQual、GarageCond
# 車庫相關的特徵，可能表示沒有車庫，因此可以填補為 "None" 或 0
all_data['GarageType'].fillna('None', inplace=True)
all_data['GarageYrBlt'].fillna(0, inplace=True)
all_data['GarageFinish'].fillna('None', inplace=True)
all_data['GarageQual'].fillna('None', inplace=True)
all_data['GarageCond'].fillna('None', inplace=True)

# PoolQC（泳池質量評分）由於很少有房屋有泳池，因此可以填補為"None"
all_data['PoolQC'].fillna('None', inplace=True)

# Fence（圍欄質量）可以填補為"None"，表示沒有圍欄
all_data['Fence'].fillna('None', inplace=True)

# MiscFeature（其他特殊設施）可以填補為"None"，表示沒有特殊設施
all_data['MiscFeature'].fillna('None', inplace=True)

In [5]:
# MSZoning（區域分類）
all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0], inplace=True)
# Utilities（公共設施）
all_data['Utilities'].fillna(all_data['Utilities'].mode()[0], inplace=True)
# Exterior1st、Exterior2nd（外部覆蓋材料）
all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0], inplace=True)
all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0], inplace=True)
# KitchenQual（廚房質量）
all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0], inplace=True)
# Functional（功能性）
all_data['Functional'].fillna(all_data['Functional'].mode()[0], inplace=True)
# SaleType（銷售類型）
all_data['SaleType'].fillna(all_data['SaleType'].mode()[0], inplace=True)

# 地下室相關的特徵可能表示沒有地下室，可以填補為 "None"
all_data['BsmtFinSF1'].fillna(all_data['BsmtFinSF1'].median(), inplace=True)
all_data['BsmtFinSF2'].fillna(all_data['BsmtFinSF2'].median(), inplace=True)
all_data['BsmtUnfSF'].fillna(all_data['BsmtUnfSF'].median(), inplace=True)
all_data['TotalBsmtSF'].fillna(all_data['TotalBsmtSF'].median(), inplace=True)
all_data['BsmtFullBath'].fillna(all_data['BsmtFullBath'].median(), inplace=True)
all_data['BsmtHalfBath'].fillna(all_data['BsmtHalfBath'].median(), inplace=True)
# GarageCars、GarageArea（車庫車位數與面積）
all_data['GarageCars'].fillna(all_data['GarageCars'].median(), inplace=True)
all_data['GarageArea'].fillna(all_data['GarageArea'].median(), inplace=True)

In [6]:
# 檢查缺失值數量
missing_values = all_data.isnull().sum()
missing_features = missing_values[missing_values > 0]

print(f'有 {len(missing_features)} 個特徵有缺失值')
# 列出所有缺失值的特徵
missing_features

有 1 個特徵有缺失值


SalePrice    1459
dtype: int64

## Label Encoding

In [7]:
# 質量等級轉換成連續數值
condition_dict = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

for column in [
        'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
        'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
]:

    all_data[column] = all_data[column].replace(condition_dict)

# 土地形狀等級
lot_shape_dict = {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3}
all_data['LotShape'] = all_data['LotShape'].replace(lot_shape_dict)

# 磚石飾面等級
all_data['MasVnrType'] = all_data['MasVnrType'].replace({
    'None': 0,
    'BrkCmn': 0,
    'BrkFace': 1,
    'Stone': 2
})

# 車庫內部裝修等級
all_data['GarageFinish'] = all_data['GarageFinish'].replace({'None': 0,
                                                             'Unf': 1,
                                                             'RFn': 2,
                                                             'Fin': 3})
# 地下室窗戶等級
all_data['BsmtExposure'] = all_data['BsmtExposure'].replace({'None': 0,
                                                             'No': 1,
                                                             'Mn': 2,
                                                             'Av': 3,
                                                             'Gd': 4})
# 地下室完工類型等級
for col in ['BsmtFinType1', 'BsmtFinType2']:
    all_data[col] = all_data[col].replace({'None': 0,
                                           'Unf': 1,
                                           'LwQ': 2,
                                           'Rec': 3,
                                           'BLQ': 4,
                                           'ALQ': 5,
                                           'GLQ': 6})

# 功能性等級
all_data['Functional'] = all_data['Functional'].replace({'Sal': 0,
                                                         'Sev': 1,
                                                         'Maj2': 2,
                                                         'Maj1': 3,
                                                         'Mod': 4,
                                                         'Min2': 5,
                                                         'Min1': 6,'Typ': 7})

# 地勢坡度
all_data['LandSlope'] = all_data['LandSlope'].replace({'Sev': 0,
                                                       'Mod': 1,
                                                       'Gtl': 2})

# 車道鋪設等級
all_data['PavedDrive'] = all_data['PavedDrive'].replace({'N': 0,
                                                         'P': 1,
                                                         'Y': 2})
# 中央空調
all_data['CentralAir'] = all_data['CentralAir'].map({'N': 0, 'Y': 1})
# 街道鋪設類型等級
all_data['Street'] = all_data['Street'].map({'Grvl': 0, 'Pave': 1})

## 處理異常值

In [8]:
def handle_outliers(df, columns, factor=1.5):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - factor * IQR
        upper_bound = Q3 + factor * IQR
        # 限制異常值到合理範圍內
        df[col] = df[col].clip(lower_bound, upper_bound)
    return df

# 處理異常值
numeric_columns = all_data.select_dtypes(include=[np.number]).columns
all_data = handle_outliers(all_data, numeric_columns)

## 特徵工程

In [9]:
# 計算總浴室數量
all_data['TotalBath'] = all_data['FullBath'] + all_data['BsmtFullBath'] + 0.5 * all_data['HalfBath'] + 0.5 * all_data['BsmtHalfBath']

# 計算不同街區的房價分級
def label_neighborhoods(row):
    if row in ['StoneBr', 'NridgHt', 'NoRidge']:
        return 3
    if row in [
            'Timber', 'Somerst', 'Veenker', 'Crawfor', 'CollgCr', 'ClearCr',
            'Blmngtn', 'NWAmes', 'Gilbert', 'SawyerW'
    ]:
        return 2
    elif row in ['MeadowV', 'IDOTRR', 'BrDale']:
        return 0
    else:
        return 1
all_data['Neigh_Rich'] = all_data['Neighborhood'].apply(lambda row: label_neighborhoods(row))

# 計算總面積
all_data['TotalSqFeet'] = all_data['GrLivArea'] + all_data['TotalBsmtSF']

## OneHot Encoding

In [10]:
all_data_before = all_data.copy()

categorical = all_data.select_dtypes(include=['object']).columns
all_data = pd.get_dummies(all_data, columns=categorical, drop_first=True)

print('OneHot encoding completed!\n')
print('Shape before: ', all_data_before.shape)
print('Shape after: ', all_data.shape)

OneHot encoding completed!

Shape before:  (2919, 84)
Shape after:  (2919, 208)


## 資料拆分

合併資料集拆分回原訓練集和原測試集

根據SalePrice是否為空來拆分數據：
- 訓練集(train_df)包含有已知SalePrice的房屋樣本。
- 測試集(test_df)包含SalePrice缺失的樣本，用已完成訓練模型來預測。

In [11]:
idx = all_data['SalePrice'].notnull().sum()

# Train Set
train_df = all_data[:idx]
X_train_data = train_df.drop(columns=['SalePrice'])
y_train_data = np.log1p(train_df['SalePrice'])

# Test Set
test_df = all_data[idx:]
X_test_data = test_df.drop('SalePrice', axis=1)

## Standardization

In [12]:
# 資料標準化
scaler = StandardScaler()
X_train_data = scaler.fit_transform(X_train_data)
X_test_data = scaler.transform(X_test_data)

## 資料拆分
訓練集拆分訓練集和測試集以進行模型訓練

In [13]:
# 將訓練集拆分為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_data, test_size=0.3, random_state=42)

# 輸出拆分後的數據集大小
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print('='*33)
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

X_train: (1022, 207)
y_train: (1022,)
X_test: (438, 207)
y_test: (438,)


# 訓練模型

In [14]:
cv_tuned = {}

def Model_Performance(model_name, model):

    print('\n',model_name)
    print('='*33)
    print('Val RMSE: ', str(np.round(np.sqrt(-model.best_score_), 3)))
    print('Best Parameters: ', str(model.best_params_))

    arg_min = np.argmin(model.cv_results_['rank_test_score'])
    scores_list = []
    for i in [
            'split0_test_score', 'split1_test_score', 'split2_test_score',
            'split3_test_score', 'split4_test_score'
    ]:
        scores_list.append(np.sqrt(-model.cv_results_[i][arg_min]))

    cv_tuned[model_name] = scores_list

In [15]:
svr = SVR()

param_grid = {
    'kernel': ['rbf'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'],
    'coef0': [0, 1, 10],
    'C': [0.001, 0.1, 1, 10],
    'epsilon': [0.001, 0.01, 0.1, 1]
}

svr_grid = GridSearchCV(svr,
                        param_grid=param_grid,
                        cv=5,
                        scoring='neg_mean_squared_error',
                        verbose=True,
                        n_jobs=-1)

best_grid_svr = svr_grid.fit(X_train, y_train)

Model_Performance('SVR', best_grid_svr)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits

 SVR
Val RMSE:  0.168
Best Parameters:  {'C': 1, 'coef0': 0, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}


# 評估模型

均方根誤差(Root Mean Squared Error, RMSE)是一個常用的評估回歸模型準確度的指標。

$$
RMSE = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2}
$$

In [16]:
best_estimators = [
    best_grid_svr.best_estimator_,
]

names = ['SVR']

cv_tuned_rmse = [np.sqrt(-best_grid_svr.best_score_)] 

test_scores = []
for i, est in enumerate(best_estimators):
    est.fit(X_train, y_train)
    y_pred = est.predict(X_test)

    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    test_scores.append(rmse_test)

pd.DataFrame({
    'Algorithm': names,
    'Train CV RMSE': cv_tuned_rmse,
    'Test RMSE': test_scores
})

Unnamed: 0,Algorithm,Train CV RMSE,Test RMSE
0,SVR,0.167537,0.171406


In [17]:
test_scores = []
for i, est in enumerate(best_estimators):
    est.fit(X_train, y_train)
    y_pred = est.predict(X_test)
    
    # 計算RMSE，將對數變換後的預測值轉換回原始房價，然後計算誤差
    rmse_test = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred)))
    test_scores.append(np.round(rmse_test, 1))

pd.DataFrame({'Algorithm': names, 'Test RMSE': test_scores})

Unnamed: 0,Algorithm,Test RMSE
0,SVR,27016.6


# 產生提交檔

![kaggle-submission-SVR-20240818](./images/kaggle-submission-SVR-20240818.png)

In [18]:
for i in range(len(best_estimators)):
    est = best_estimators[i]
    
    # 使用完整的訓練集進行模型訓練
    est.fit(X_train_data, y_train_data)

    # 進行最終測試集上的預測
    y_pred = est.predict(X_test_data)
    
    # SalePrice對數反轉
    y_pred_trans = np.expm1(y_pred)
    
    # 創建提交數據
    submission = pd.DataFrame({
        'Id': test_data['Id'],
        'SalePrice': y_pred_trans
    })
    
    model_name = names[i]
    date = datetime.datetime.today().strftime('%Y-%m-%d')
    submission_path = f'./data/{model_name}-submission-{model_name}-{date}.csv'
    submission.to_csv(submission_path, index=False)
    print(f"Submission saved successfully as: {submission_path}")

Submission saved successfully as: ./data/SVR-submission-SVR-2024-08-19.csv
