 # [教學目標]
 - 用房價預測資料, 觀察填補缺值以及 標準化 / 最小最大化 對數值的影響

 # [範例重點]
 - 知道如何查詢各欄位空缺值數量 (In[327], Out[327])
 - 觀察替換不同補缺方式, 對於特徵的影響 (In[330]~In[332], Out[330]~Out[332])
 - 觀察替換不同特徵縮放方式, 對於特徵的影響 (In[333]~In[334], Out[333]~Out[334])

In [326]:
# 載入套件
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Load the training and test sets from the data directory
data_path = 'data/'
df_train = pd.read_csv(data_path + 'house_train.csv.gz')
df_test = pd.read_csv(data_path + 'house_test.csv.gz')

# Smoothing the target with log1p() / expm1()
# - log1p(x) computes log(1 + x). Applied to a strongly skewed value it
#   compresses the range and usually brings the distribution closer to a
#   Gaussian, which may help the model that is fitted afterwards.
# - log1p is defined at x = 0 (result 0) and remains accurate for small x,
#   where evaluating log(1 + x) directly loses precision.
# - expm1() is the inverse of log1p(). Because the target is compressed with
#   log1p here, predictions made on that scale have to be restored with
#   expm1() before they are reported as prices.

# Rearrange the data into the layout used for training / prediction
ids = df_test['Id']
train_Y = np.log1p(df_train['SalePrice'])
df_train = df_train.drop(columns=['Id', 'SalePrice'])
df_test = df_test.drop(columns=['Id'])
# Stack training rows first, then test rows; each frame keeps its own row labels
df = pd.concat([df_train, df_test], axis=0)
df.head()


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [327]:
# Count the missing values in every column, largest count first
# (drop the trailing .head() to list all columns)
null_counts = df.isnull().sum()
null_counts.sort_values(ascending=False).head()



PoolQC         2909
MiscFeature    2814
Alley          2721
Fence          2348
FireplaceQu    1420
dtype: int64

In [328]:

# zip(*iterables)
#   Takes any number of iterables and pairs up the elements found at the same
#   position, yielding one tuple per position (an iterator in Python 3; wrap it
#   in list() to get a list).
#   When the inputs differ in length, iteration stops at the shortest one.
#   Combined with dict(), two sequences can be merged into a dictionary.

# Keep only the int64 / float64 columns; their names are collected in num_features
num_features = []
for col_pos, (dtype, feature) in enumerate(zip(df.dtypes, df.columns)):
    # Trace output: column position, dtype and name, one per line
    print(col_pos)
    print(dtype)
    print(feature)
    if dtype == 'float64' or dtype == 'int64':
        num_features.append(feature)

print(f'{len(num_features)} Numeric Features : {num_features}\n')



0
int64
MSSubClass
1
object
MSZoning
2
float64
LotFrontage
3
int64
LotArea
4
object
Street
5
object
Alley
6
object
LotShape
7
object
LandContour
8
object
Utilities
9
object
LotConfig
10
object
LandSlope
11
object
Neighborhood
12
object
Condition1
13
object
Condition2
14
object
BldgType
15
object
HouseStyle
16
int64
OverallQual
17
int64
OverallCond
18
int64
YearBuilt
19
int64
YearRemodAdd
20
object
RoofStyle
21
object
RoofMatl
22
object
Exterior1st
23
object
Exterior2nd
24
object
MasVnrType
25
float64
MasVnrArea
26
object
ExterQual
27
object
ExterCond
28
object
Foundation
29
object
BsmtQual
30
object
BsmtCond
31
object
BsmtExposure
32
object
BsmtFinType1
33
float64
BsmtFinSF1
34
object
BsmtFinType2
35
float64
BsmtFinSF2
36
float64
BsmtUnfSF
37
float64
TotalBsmtSF
38
object
Heating
39
object
HeatingQC
40
object
CentralAir
41
object
Electrical
42
int64
1stFlrSF
43
int64
2ndFlrSF
44
int64
LowQualFinSF
45
int64
GrLivArea
46
float64
BsmtFullBath
47
float64
BsmtHalfBath
48
int64
FullBath
49
i

In [329]:
# Number of training rows: train_Y holds one target value per training row
train_num = len(train_Y)
# Drop the text columns so that only the numeric columns remain
df = df.loc[:, num_features]
df.head()



Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,548.0,0,61,0,0,0,0,0,2,2008
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,460.0,298,0,0,0,0,0,0,5,2007
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,608.0,0,42,0,0,0,0,0,9,2008
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,642.0,0,35,272,0,0,0,0,2,2006
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,836.0,192,84,0,0,0,0,0,12,2008


In [330]:
# Fill missing values with -1, then score a linear regression
df_m1 = df.fillna(value=-1)
# The first train_num rows (by position) are the training rows
train_X = df_m1.iloc[:train_num]
estimator = LinearRegression()
# Mean score over 5-fold cross-validation
fold_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
fold_scores.mean()



0.8466400643386465

In [331]:
# Fill missing values with 0, then score a linear regression
df_0 = df.fillna(value=0)
# The first train_num rows (by position) are the training rows
train_X = df_0.iloc[:train_num]
estimator = LinearRegression()
# Mean score over 5-fold cross-validation
fold_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
fold_scores.mean()



0.8466118155868767

In [332]:
# Fill missing values with each column's mean, then score a linear regression
column_means = df.mean()
df_mn = df.fillna(column_means)
# The first train_num rows (by position) are the training rows
train_X = df_mn.iloc[:train_num]
estimator = LinearRegression()
# Mean score over 5-fold cross-validation
fold_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
fold_scores.mean()



0.8442642432201264

In [333]:
# Fill missing values with -1, combined with min-max scaling
# NOTE: df itself is overwritten with the filled frame from here on
df = df.fillna(value=-1)
# The scaler is fitted on every row of df (training and test rows together)
min_max_scaler = MinMaxScaler()
df_temp = min_max_scaler.fit_transform(df)
# fit_transform returns an array; its first train_num rows are the training rows
train_X = df_temp[:train_num]
estimator = LinearRegression()
# Mean score over 5-fold cross-validation
fold_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
fold_scores.mean()



0.8458634640853118

In [334]:
# Combined with standardization
# (df is used as it stands when this cell runs; the script above has already
# replaced its missing values with -1)
standard_scaler = StandardScaler()
df_temp = standard_scaler.fit_transform(df)
# fit_transform returns an array; its first train_num rows are the training rows
train_X = df_temp[:train_num]
estimator = LinearRegression()
# Mean score over 5-fold cross-validation
fold_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
fold_scores.mean()


0.8478460221697844

 # 作業1
 * 試著在補空值區塊, 替換並執行兩種以上的缺值填補方式, 看看何者比較好?

 # 作業2
 * 使用不同的標準化方式 ( 原值 / 最小最大化 / 標準化 )，搭配羅吉斯迴歸模型，何者效果最好?