In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.externals import joblib   # 함수는 dump 시켜도 안됨
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [4]:
from my_transformer import change_to_str, divide_columns, feature_selection, simple_imputer, one_hot_encoding, concat, rf_imputer, fill_columns
from my_transformer import rmsle_scorer, neg_rmsle_scorer, rmsle

In [5]:
def preparation(data) :
    astype_str = change_to_str('MSSubClass')
    data = astype_str.fit_transform(data)
    
    num_columns, cat_columns = divide_columns(data)
    
    num_null_0 = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                  'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']
    num_null_mean = 'LotFrontage'
    num_null_rf = 'GarageYrBlt'
    replace_null = ['Alley', 'Fence', 'FireplaceQu', 'MiscFeature', 'PoolQC',
                   'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageFinish', 'GarageCond']
    
    for column in replace_null :
        data.loc[data[column].isnull(), column] = 'NA'
    
    pipeline_cat = make_pipeline(
        feature_selection(cat_columns),
        simple_imputer('most_frequent'),
        one_hot_encoding(cat_columns)
    )
    
    X_cat = pipeline_cat.fit_transform(data)
    X_num = data[num_columns]
    X = concat(X_num, X_cat)
    
    isnull_sum = X.isnull().sum()
    not_null = list(isnull_sum[isnull_sum == 0].index)
    null_columns = list(isnull_sum[isnull_sum > 0])
    
    for column in num_columns :
        if column == num_null_rf :
            X = rf_imputer(X, column, not_null)

        elif column == num_null_mean :
            X = fill_columns(X, column, 'mean')

        else :
            X = fill_columns(X, column, 'constant', 0)
        
    isnull_sum = X.isnull().sum()
    print(isnull_sum[isnull_sum > 0].sort_values(ascending=False))
            
    return X

In [6]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [7]:
data = train.drop(['SalePrice', 'Id'], axis=1)
y = train['SalePrice']

In [8]:
X = preparation(data)

Series([], dtype: int64)


In [9]:
X.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
39,65.0,6040,4,5,1955,1955,0.0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
48,33.0,4456,4,5,1920,2008,0.0,0,0,736,...,0,1,0,0,0,0,0,0,0,1
78,72.0,10778,4,5,1968,1968,0.0,0,0,1768,...,0,0,0,1,0,0,0,0,1,0
88,105.0,8470,3,2,1915,1982,0.0,0,0,1013,...,0,0,0,0,1,0,0,0,0,0
89,60.0,8070,4,5,1994,1995,0.0,588,0,402,...,0,0,0,1,0,0,0,0,1,0


#### 피처 데이터 세트의 데이터 분포도 확인하기
- 타겟 데이터 세트의 데이터 분포도의 왜곡은 확인함
- 피처 데이터 세트의 경우도 지나치게 왜곡된 피처가 존재할 경우 회귀 예측 성능을 저하시킬 수 있음
- 모든 숫자형 피처의 데이터 분포도를 확인해 분포도가 얼마나 왜곡됐는지 알아보겠음
- scipy.stats 모듈의 skew()함수 이용
- 컬럼의 데이터 세트의 왜곡된 정도를 쉽게 추출 가능
- 일반적으로 skew()함수의 리턴값이 1 이상인 경우, 왜곡 정도가 높다고 판단하지만, 상황에 따라 편차가 있을 수 있음
- 여기에서는 1 이상의 값을 반환하는 피처만 추출해 왜곡 정도를 완화하기 위해 로그 변환을 적용
- 주의할 점 : skew() 적용 시 one-hot encoding된 피처는 제외해야 함

In [10]:
from scipy.stats import skew

num_columns, cat_columns = divide_columns(data)
skew_features = X[num_columns].apply(lambda x : skew(x))
# skew 정도가 1 이상인 컬럼들만 추출. 
skew_features_top = skew_features[skew_features > 1]
print(skew_features_top.sort_values(ascending=False))

MiscVal          24.451640
PoolArea         14.813135
LotArea          12.195142
3SsnPorch        10.293752
LowQualFinSF      9.002080
KitchenAbvGr      4.483784
BsmtFinSF2        4.250888
ScreenPorch       4.117977
BsmtHalfBath      4.099186
EnclosedPorch     3.086696
MasVnrArea        2.674865
LotFrontage       2.382499
OpenPorchSF       2.361912
BsmtFinSF1        1.683771
WoodDeckSF        1.539792
TotalBsmtSF       1.522688
1stFlrSF          1.375342
GrLivArea         1.365156
dtype: float64


In [11]:
X[skew_features_top.index] = np.log1p(X[skew_features_top.index])

In [12]:
skew_features = X[num_columns].apply(lambda x : skew(x))
# skew 정도가 1 이상인 컬럼들만 추출. 
skew_features_top = skew_features[skew_features > 1]
print(skew_features_top.sort_values(ascending=False))

PoolArea         14.348342
3SsnPorch         7.727026
LowQualFinSF      7.452650
MiscVal           5.165390
BsmtHalfBath      3.929022
KitchenAbvGr      3.865437
ScreenPorch       3.147171
BsmtFinSF2        2.521100
EnclosedPorch     2.110104
dtype: float64


- 여전히 값이 높은 피처는 존재

In [13]:
knn = KNeighborsRegressor(n_neighbors=5)
linear = LinearRegression()
sgd = SGDRegressor(max_iter=None, eta0=0.01, penalty='l2', random_state=30)
ridge = Ridge(alpha=1, random_state=30)
lasso = Lasso(alpha=1, random_state=30)
elastic = ElasticNet(alpha=1, l1_ratio=0.5, random_state=30)
dt = DecisionTreeRegressor(max_depth=None, random_state=30)
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=30, n_jobs=-1)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=30)
svm = SVR(C=1, kernel='rbf', gamma='auto_deprecated')
mlp = MLPRegressor(hidden_layer_sizes=[100], solver='adam', activation='relu', alpha=0.0001, max_iter=200, random_state=30)
xgb = XGBRegressor(random_state=30)

In [14]:
models = {
    'knn' : knn,
    'linear' : linear,
    'sgd' : sgd,
    'ridge' : ridge,
    'lasso' : lasso,
    'elastic' : elastic,
    'dt' : dt,
    'rf' : rf,
    'gb' : gb,
    'xgb' : xgb,
    'svm' : svm,
    'mlp' : mlp
}

In [15]:
scores = dict()

for key, model in models.items() : 
    score = cross_val_score(model, X, y, cv=5, scoring=rmsle_scorer).mean()   
    scores[key] = round(score, 3)
    
scores



{'knn': 0.449,
 'linear': nan,
 'sgd': nan,
 'ridge': 0.455,
 'lasso': nan,
 'elastic': 0.411,
 'dt': 0.584,
 'rf': 0.432,
 'gb': 0.425,
 'xgb': 0.424,
 'svm': 0.4,
 'mlp': 0.411}

#### target 값도 log 변환

In [16]:
y = np.log1p(y)

In [17]:
scores = dict()

for key, model in models.items() : 
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()   
    scores[key] = round(np.sqrt(-score), 3)
    
scores



{'knn': 0.438,
 'linear': 0.492,
 'sgd': 1.1086813594975178e+16,
 'ridge': 0.44,
 'lasso': 0.4,
 'elastic': 0.4,
 'dt': 0.591,
 'rf': 0.415,
 'gb': 0.42,
 'xgb': 0.415,
 'svm': 0.402,
 'mlp': 1.431}

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [19]:
for key, model in models.items() : 
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    score_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    score_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    
    print('{}, train_score : {}, test_score : {}' . format(key, round(score_train, 3), round(score_test, 3)))

knn, train_score : 0.357, test_score : 0.461
linear, train_score : 0.351, test_score : 0.496
sgd, train_score : 8197151497102261.0, test_score : 8082188156673609.0
ridge, train_score : 0.354, test_score : 0.47
lasso, train_score : 0.396, test_score : 0.413
elastic, train_score : 0.396, test_score : 0.413
dt, train_score : 0.0, test_score : 0.602
rf, train_score : 0.158, test_score : 0.425
gb, train_score : 0.323, test_score : 0.429
xgb, train_score : 0.334, test_score : 0.428
svm, train_score : 0.108, test_score : 0.416
mlp, train_score : 1.052, test_score : 1.14
