In [1]:
import numpy as np
from scipy import stats
from scipy.stats import skew
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.externals import joblib   # 함수는 dump 시켜도 안됨
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [4]:
from my_transformer import change_to_str, divide_columns, feature_selection, simple_imputer, one_hot_encoding, concat, rf_imputer, fill_columns
from my_transformer import rmsle_scorer, neg_rmsle_scorer, rmsle

In [5]:
def preparation(data) :
    
    data.drop(['SalePrice', 'Id'], axis=1, inplace=True)
#     data.drop(['Id'], axis=1, inplace=True)
    
    astype_str = change_to_str('MSSubClass')
    data = astype_str.fit_transform(data)
    
    num_columns, cat_columns = divide_columns(data)
    
    num_null_0 = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                  'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']
    num_null_mean = 'LotFrontage'
    num_null_rf = 'GarageYrBlt'
    replace_null = ['Alley', 'Fence', 'FireplaceQu', 'MiscFeature', 'PoolQC',
                   'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageFinish', 'GarageCond']
    
    for column in replace_null :
        data.loc[data[column].isnull(), column] = 'NA'
    
    pipeline_cat = make_pipeline(
        feature_selection(cat_columns),
        simple_imputer('most_frequent'),
        one_hot_encoding(cat_columns)
    )
    
    X_cat = pipeline_cat.fit_transform(data)
    X_num = data[num_columns]
    X = concat(X_num, X_cat)
    
    isnull_sum = X.isnull().sum()
    not_null = list(isnull_sum[isnull_sum == 0].index)
    null_columns = list(isnull_sum[isnull_sum > 0])
    
    for column in num_columns :
        if column == num_null_rf :
            X = rf_imputer(X, column, not_null)

        elif column == num_null_mean :
            X = fill_columns(X, column, 'mean')

        else :
            X = fill_columns(X, column, 'constant', 0)
        
    isnull_sum = X.isnull().sum()
    print(isnull_sum[isnull_sum > 0].sort_values(ascending=False))
    
    skew_features = X[num_columns].apply(lambda x : skew(x))
    skew_features_top = skew_features[skew_features > 1]
    X[skew_features_top.index] = np.log1p(X[skew_features_top.index])
            
    return X

In [6]:
def data_set() :
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    
    data = pd.concat([train, test], axis=0)
    
    X = preparation(data)
    X_test = X.iloc[1460:, :]
    X = X.iloc[:1460, :]
    
    y = train['SalePrice']
    y = np.log1p(y)
    
    return X, X_test, y

In [7]:
X, X_test, y = data_set()

Series([], dtype: int64)


In [8]:
corr_matrix = X.corr()

In [9]:
drop_columns = {}
n = len(X.columns)

for i in range(n):
    
    key = 0
    value = []
    
    for j in range(1, n - i):
        k = j + i        
        
        if abs(corr_matrix.iloc[i, k]) >= 0.7 :
            key = corr_matrix.columns[i]
            value.append(corr_matrix.columns[k])
            
    if key :
        drop_columns[key] = value
        
drop_columns

{'2ndFlrSF': ['HouseStyle_2Story'],
 'BsmtFinSF1': ['BsmtFinType1_Unf'],
 'BsmtFinSF2': ['BsmtFinType2_Unf'],
 'Fireplaces': ['FireplaceQu_NA'],
 'GarageArea': ['GarageCars'],
 'GarageYrBlt': ['YearBuilt'],
 'GrLivArea': ['TotRmsAbvGrd'],
 'MasVnrArea': ['MasVnrType_None'],
 'MiscVal': ['MiscFeature_NA', 'MiscFeature_Shed'],
 'PoolArea': ['PoolQC_NA'],
 'TotalBsmtSF': ['BsmtCond_NA',
  'BsmtExposure_NA',
  'BsmtFinType1_NA',
  'BsmtFinType2_NA',
  'BsmtQual_NA'],
 'BldgType_2fmCon': ['MSSubClass_190'],
 'BldgType_Duplex': ['MSSubClass_90'],
 'BsmtCond_NA': ['BsmtExposure_NA',
  'BsmtFinType1_NA',
  'BsmtFinType2_NA',
  'BsmtQual_NA',
  'Foundation_Slab'],
 'BsmtExposure_NA': ['BsmtFinType1_NA', 'BsmtFinType2_NA', 'BsmtQual_NA'],
 'BsmtFinType1_NA': ['BsmtFinType2_NA', 'BsmtQual_NA', 'Foundation_Slab'],
 'BsmtFinType2_NA': ['BsmtQual_NA'],
 'BsmtQual_NA': ['Foundation_Slab'],
 'CentralAir_N': ['CentralAir_Y'],
 'Electrical_FuseA': ['Electrical_SBrkr'],
 'ExterCond_Gd': ['ExterCond_TA'],

In [10]:
drop_columns = []
n = len(X.columns)

for i in range(n):
    
    key = 0
    value = []
    
    for j in range(1, n - i):
        k = j + i        
        
        if abs(corr_matrix.iloc[i, k]) >= 0.7 :
            drop_columns.append(corr_matrix.columns[k])
            
drop_columns

['HouseStyle_1Story',
 'HouseStyle_2Story',
 'BsmtFinType1_Unf',
 'BsmtFinType2_Unf',
 'FireplaceQu_NA',
 'GarageCars',
 'YearBuilt',
 'TotRmsAbvGrd',
 'MasVnrType_BrkFace',
 'MasVnrType_None',
 'MiscFeature_NA',
 'MiscFeature_Shed',
 'PoolQC_NA',
 'BsmtCond_NA',
 'BsmtExposure_NA',
 'BsmtFinType1_NA',
 'BsmtFinType2_NA',
 'BsmtQual_NA',
 'Foundation_Slab',
 'Alley_NA',
 'MSSubClass_190',
 'MSSubClass_90',
 'MSSubClass_120',
 'BsmtExposure_NA',
 'BsmtFinType1_NA',
 'BsmtFinType2_NA',
 'BsmtQual_NA',
 'Foundation_Slab',
 'BsmtFinType1_NA',
 'BsmtFinType2_NA',
 'BsmtQual_NA',
 'Foundation_Slab',
 'BsmtFinType2_NA',
 'BsmtQual_NA',
 'Foundation_Slab',
 'BsmtQual_NA',
 'Foundation_Slab',
 'BsmtQual_TA',
 'Foundation_Slab',
 'CentralAir_Y',
 'MiscFeature_Gar2',
 'RoofStyle_Shed',
 'Electrical_SBrkr',
 'ExterCond_TA',
 'ExterQual_TA',
 'Exterior2nd_AsbShng',
 'Exterior2nd_AsphShn',
 'Exterior2nd_CBlock',
 'Exterior2nd_CmentBd',
 'Exterior2nd_HdBoard',
 'Exterior2nd_MetalSd',
 'Exterior2nd_Pl

In [11]:
X.drop(drop_columns, axis=1, inplace=True)
X_test.drop(drop_columns, axis=1, inplace=True)

In [12]:
print(X.shape)
print(X_test.shape)

(1460, 246)
(1459, 246)


In [13]:
knn = KNeighborsRegressor(n_neighbors=5)
linear = LinearRegression()
sgd = SGDRegressor(max_iter=None, eta0=0.01, penalty='l2', random_state=30)
ridge = Ridge(alpha=1, random_state=30)
lasso = Lasso(alpha=1, random_state=30)
elastic = ElasticNet(alpha=1, l1_ratio=0.5, random_state=30)
dt = DecisionTreeRegressor(max_depth=None, random_state=30)
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=30, n_jobs=-1)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=30)
svm = SVR(C=1, kernel='rbf', gamma='auto_deprecated')
mlp = MLPRegressor(hidden_layer_sizes=[100], solver='adam', activation='relu', alpha=0.0001, max_iter=200, random_state=30)
xgb = XGBRegressor(random_state=30)

In [14]:
models = {
    'knn' : knn,
    'linear' : linear,
    'sgd' : sgd,
    'ridge' : ridge,
    'lasso' : lasso,
    'elastic' : elastic,
    'dt' : dt,
    'rf' : rf,
    'gb' : gb,
    'xgb' : xgb,
    'svm' : svm,
    'mlp' : mlp
}

In [15]:
scores = dict()

for key, model in models.items() : 
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()   
    scores[key] = round(np.sqrt(-score), 3)
    
scores



{'knn': 0.433,
 'linear': 1.943,
 'sgd': 1.1017807257895782e+16,
 'ridge': 0.438,
 'lasso': 0.4,
 'elastic': 0.401,
 'dt': 0.591,
 'rf': 0.408,
 'gb': 0.412,
 'xgb': 0.408,
 'svm': 0.4,
 'mlp': 2.556}