In [1]:
import numpy as np
from scipy import stats
from scipy.stats import skew
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
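from sklearn.externals import joblib   # NOTE: plain functions cannot be persisted with joblib.dump; `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23 (use `import joblib` there)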
from sklearn.feature_selection import SelectPercentile, f_classif, f_regression, SelectFromModel, RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [4]:
from my_transformer import change_to_str, divide_columns, feature_selection, simple_imputer, one_hot_encoding, concat, rf_imputer, fill_columns
from my_transformer import rmsle_scorer, neg_rmsle_scorer, rmsle

In [5]:
def preparation(data) :
    """Turn the combined raw housing frame into a model-ready feature matrix.

    Steps, in order:
      1. drop the target ('SalePrice') and the identifier ('Id');
      2. run 'MSSubClass' through ``change_to_str`` (presumably casts it to
         string so it is handled as a categorical -- confirm in my_transformer);
      3. drop 'Alley', 'Fence', 'FireplaceQu', 'MiscFeature' and 'PoolQC'
         (presumably because they are mostly missing -- TODO confirm);
      4. split the columns with ``divide_columns`` and push the categorical
         ones through ``feature_selection`` + ``one_hot_encoding``;
      5. fill every numeric column with ``fill_columns(X, column, 'mean')``;
      6. apply ``np.log1p`` to numeric columns whose skewness is above 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows; must contain every column named in steps 1-3.
        The caller's frame is NOT modified.

    Returns
    -------
    The frame produced by ``concat(X_num, X_cat)`` after imputation and the
    log transform (row order/count are assumed to match ``data`` -- the
    caller slices it positionally).

    Side effects
    ------------
    Prints the per-column null counts that remain after imputation.

    NOTE(review): imputation statistics and skewness are computed over all
    rows passed in; ``data_set`` passes train and test rows together.
    """
    # Non in-place drops: the original code mutated the caller's DataFrame.
    data = data.drop(['SalePrice', 'Id'], axis=1)

    astype_str = change_to_str('MSSubClass')
    data = astype_str.fit_transform(data)

    data = data.drop(['Alley', 'Fence', 'FireplaceQu', 'MiscFeature', 'PoolQC'], axis=1)

    num_columns, cat_columns = divide_columns(data)

    # NOTE: categorical columns are not imputed here; a most-frequent
    # imputer step used to sit between these two stages but was disabled.
    pipeline_cat = make_pipeline(
        feature_selection(cat_columns),
        one_hot_encoding(cat_columns)
    )

    X_cat = pipeline_cat.fit_transform(data)
    X_num = data[num_columns]
    X = concat(X_num, X_cat)

    for column in num_columns :
        X = fill_columns(X, column, 'mean')

    # Report whatever is still missing after the numeric imputation.
    isnull_sum = X.isnull().sum()
    print(isnull_sum[isnull_sum > 0].sort_values(ascending=False))

    # Reduce right skew: log1p on numeric columns with skewness > 1.
    skew_features = X[num_columns].apply(skew)
    skew_features_top = skew_features[skew_features > 1]
    X[skew_features_top.index] = np.log1p(X[skew_features_top.index])

    return X

In [6]:
def data_set(train_path='train.csv', test_path='test.csv') :
    """Load the train/test CSVs and return model-ready matrices.

    Train and test rows are stacked and sent through ``preparation`` together
    so both get the same columns, then split back by position.

    Parameters
    ----------
    train_path, test_path : str
        CSV locations; the defaults are the file names used originally.

    Returns
    -------
    X_train, X_test : feature frames for the train and test rows.
    y_train : ``log1p`` of the training 'SalePrice' column.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Split point derived from the data instead of the hard-coded 1460.
    n_train = len(train)

    data = pd.concat([train, test], axis=0)

    # Assumes preparation() keeps row order and row count.
    X = preparation(data)

    # Copies, because later cells add/drop columns and rows in place and
    # should not be operating on slices of X.
    X_train = X.iloc[:n_train, :].copy()
    X_test = X.iloc[n_train:, :].copy()

    y_train = np.log1p(train['SalePrice'])

    return X_train, X_test, y_train

In [7]:
def remove_outlier(column, n_std=3) :
    """Drop training rows whose ``column`` value is an outlier.

    A row is an outlier when its value is more than ``n_std`` standard
    deviations away from the column mean.

    Operates on the notebook-level ``X_train`` and ``y_train`` and mutates
    both IN PLACE (the calling cells rely on that); the same index labels
    are removed from each so they stay aligned.

    Parameters
    ----------
    column : label of the ``X_train`` column to test.
    n_std : number of standard deviations used as the cut-off (default 3,
        the value that was previously hard-coded).

    Prints the number of rows removed and the resulting ``X_train.shape``.
    """
    values = X_train[column]
    # Mean and std are taken once, before any row is removed.
    is_outlier = (values - values.mean()).abs() > (n_std * values.std())
    index_outlier = values[is_outlier].index
    print(len(index_outlier))
    X_train.drop(index=index_outlier, inplace=True)
    y_train.drop(index=index_outlier, inplace=True)
    print(X_train.shape)

In [8]:
# Build the train/test feature matrices and the log1p-transformed target.
X_train, X_test, y_train = data_set()

Series([], dtype: int64)


In [9]:
# Temporarily attach the (log) target so it takes part in the correlation
# matrix below; the column is dropped again right after.
X_train['SalePrice'] = y_train

In [10]:
# Pairwise correlations over all columns, then rank every column by its
# correlation with the target (highest first).
corr_matrix = X_train.corr()
corr_matrix['SalePrice'].sort_values(ascending=False)

SalePrice               1.000000
OverallQual             0.817185
GrLivArea               0.730254
GarageCars              0.680625
GarageArea              0.650888
1stFlrSF                0.608955
FullBath                0.594771
YearBuilt               0.586570
YearRemodAdd            0.565608
TotRmsAbvGrd            0.534422
Foundation_PConc        0.530840
ExterQual_Gd            0.509675
GarageYrBlt             0.501656
Fireplaces              0.489450
HeatingQC_Ex            0.466706
BsmtQual_Ex             0.463991
OpenPorchSF             0.459948
BsmtFinType1_GLQ        0.440764
MasVnrArea              0.420319
GarageFinish_Fin        0.418881
GarageType_Attchd       0.415787
KitchenQual_Ex          0.413937
KitchenQual_Gd          0.406601
MSSubClass_60           0.401501
LotArea                 0.399923
TotalBsmtSF             0.372838
GarageCond_TA           0.365783
ExterQual_Ex            0.356134
Neighborhood_NridgHt    0.351803
CentralAir_Y            0.351599
          

In [11]:
# Remove the temporary target column so it is not used as a feature.
X_train.drop('SalePrice', axis=1, inplace=True)

In [12]:
# Candidate regressors. Every estimator that accepts a seed uses
# random_state=30 so the comparison is repeatable.

# Linear family: unregularised, L2 (Ridge), L1 (Lasso), L1/L2 mix (ElasticNet).
linear = LinearRegression()
ridge = Ridge(alpha=10, random_state=30)
lasso = Lasso(alpha=0.001, random_state=30)
elastic = ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=30)
# Tree ensembles: random forest and two gradient-boosting implementations.
rf = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=30, n_jobs=-1)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=30)
xgb = XGBRegressor(random_state=30)

In [13]:
# Name -> estimator registry iterated by the evaluation cells.
models = dict(
    linear=linear,
    ridge=ridge,
    lasso=lasso,
    elastic=elastic,
    rf=rf,
    gb=gb,
    xgb=xgb,
)

In [14]:
# Baseline: 5-fold cross-validated RMSE per model on the unfiltered training set.
# The scorer yields negative MSE, so the fold mean is negated before the
# square root. The target is log1p(SalePrice), so this is an error in log space.
scores = {
    name: round(
        np.sqrt(-cross_val_score(estimator, X_train, y_train, cv=5,
                                 scoring='neg_mean_squared_error').mean()),
        3,
    )
    for name, estimator in models.items()
}

scores



{'linear': 0.144,
 'ridge': 0.127,
 'lasso': 0.125,
 'elastic': 0.124,
 'rf': 0.143,
 'gb': 0.127,
 'xgb': 0.13}

### Outlier 데이터 처리
- 특히 회귀계수가 높은 피처, 즉 예측에 많은 영향을 미치는 중요 피처의 이상치 데이터 처리가 중요

In [15]:
# Index labels of training rows whose (log) target is more than 3 standard
# deviations away from the target mean.
index_outlier = y_train[abs(y_train - y_train.mean()) > (3 * y_train.std())].index

In [16]:
# Number of target outliers found.
len(index_outlier)

12

In [17]:
# Remove the target outliers from features and target alike so the two stay
# aligned. Rebinding (rather than inplace=True) avoids mutating a frame that
# was obtained by slicing the combined feature matrix.
X_train = X_train.drop(index=index_outlier)
y_train = y_train.drop(index=index_outlier)
X_train.shape

(1448, 285)

In [18]:
# 5-fold cross-validated RMSE per model after removing the target outliers.
# The scorer yields negative MSE, so the fold mean is negated before the
# square root. The target is log1p(SalePrice), so this is an error in log space.
scores = {
    name: round(
        np.sqrt(-cross_val_score(estimator, X_train, y_train, cv=5,
                                 scoring='neg_mean_squared_error').mean()),
        3,
    )
    for name, estimator in models.items()
}

scores



{'linear': 0.139,
 'ridge': 0.122,
 'lasso': 0.121,
 'elastic': 0.12,
 'rf': 0.136,
 'gb': 0.121,
 'xgb': 0.123}

In [19]:
# Reload from the CSVs so the next outlier experiment starts from the full
# training set instead of the rows left over from the previous one.
X_train, X_test, y_train = data_set()

Series([], dtype: int64)


In [20]:
# Experiment: drop rows whose GrLivArea is more than 3 std from its mean.
remove_outlier('GrLivArea')

8
(1452, 285)


In [21]:
# 5-fold cross-validated RMSE per model after the GrLivArea outlier removal.
# The scorer yields negative MSE, so the fold mean is negated before the
# square root. The target is log1p(SalePrice), so this is an error in log space.
scores = {
    name: round(
        np.sqrt(-cross_val_score(estimator, X_train, y_train, cv=5,
                                 scoring='neg_mean_squared_error').mean()),
        3,
    )
    for name, estimator in models.items()
}

scores



{'linear': 0.127,
 'ridge': 0.113,
 'lasso': 0.112,
 'elastic': 0.11,
 'rf': 0.137,
 'gb': 0.121,
 'xgb': 0.124}

In [22]:
# Reload from the CSVs so the next outlier experiment starts from the full
# training set instead of the rows left over from the previous one.
X_train, X_test, y_train = data_set()

Series([], dtype: int64)


In [23]:
# Experiment: drop rows whose OverallQual is more than 3 std from its mean.
remove_outlier('OverallQual')

2
(1458, 285)


In [24]:
# 5-fold cross-validated RMSE per model after the OverallQual outlier removal.
# The scorer yields negative MSE, so the fold mean is negated before the
# square root. The target is log1p(SalePrice), so this is an error in log space.
scores = {
    name: round(
        np.sqrt(-cross_val_score(estimator, X_train, y_train, cv=5,
                                 scoring='neg_mean_squared_error').mean()),
        3,
    )
    for name, estimator in models.items()
}

scores



{'linear': 0.144,
 'ridge': 0.127,
 'lasso': 0.125,
 'elastic': 0.124,
 'rf': 0.142,
 'gb': 0.127,
 'xgb': 0.129}

In [25]:
# Reload from the CSVs so the next outlier experiment starts from the full
# training set instead of the rows left over from the previous one.
X_train, X_test, y_train = data_set()

Series([], dtype: int64)


In [26]:
# Experiment: drop rows whose GarageCars is more than 3 std from its mean.
remove_outlier('GarageCars')

0
(1460, 285)


In [27]:
# Reload from the CSVs so the next outlier experiment starts from the full
# training set instead of the rows left over from the previous one.
X_train, X_test, y_train = data_set()

Series([], dtype: int64)


In [28]:
# Experiment: drop rows whose 1stFlrSF is more than 3 std from its mean.
remove_outlier('1stFlrSF')

6
(1454, 285)


In [29]:
# 5-fold cross-validated RMSE per model after the 1stFlrSF outlier removal.
# The scorer yields negative MSE, so the fold mean is negated before the
# square root. The target is log1p(SalePrice), so this is an error in log space.
scores = {
    name: round(
        np.sqrt(-cross_val_score(estimator, X_train, y_train, cv=5,
                                 scoring='neg_mean_squared_error').mean()),
        3,
    )
    for name, estimator in models.items()
}

scores



{'linear': 0.128,
 'ridge': 0.113,
 'lasso': 0.112,
 'elastic': 0.11,
 'rf': 0.137,
 'gb': 0.122,
 'xgb': 0.124}

In [30]:
# Applied on top of the 1stFlrSF filtering above (no reload in between), so
# the removals accumulate; mean/std are taken over the rows that remain.
remove_outlier('GrLivArea')

5
(1449, 285)


In [31]:
# 5-fold cross-validated RMSE per model after removing both the 1stFlrSF and
# the GrLivArea outliers.
# The scorer yields negative MSE, so the fold mean is negated before the
# square root. The target is log1p(SalePrice), so this is an error in log space.
scores = {
    name: round(
        np.sqrt(-cross_val_score(estimator, X_train, y_train, cv=5,
                                 scoring='neg_mean_squared_error').mean()),
        3,
    )
    for name, estimator in models.items()
}

scores



{'linear': 0.128,
 'ridge': 0.113,
 'lasso': 0.112,
 'elastic': 0.11,
 'rf': 0.137,
 'gb': 0.122,
 'xgb': 0.124}