In [1]:
import numpy as np
from scipy import stats
from scipy.stats import skew
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package and only fall back to the
# vendored copy on old scikit-learn installs.
# Original author's note: dumping (pickling) functions does not work.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.feature_selection import SelectPercentile, f_classif, f_regression, SelectFromModel, RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [4]:
from my_transformer import change_to_str, divide_columns, feature_selection, simple_imputer, one_hot_encoding, concat, rf_imputer, fill_columns
from my_transformer import rmsle_scorer, neg_rmsle_scorer, rmsle

In [5]:
def preparation(data) :
    """Turn the raw frame into the feature matrix used by the models below.

    Steps, in order:
      1. drop the ``SalePrice`` and ``Id`` columns;
      2. run the ``change_to_str('MSSubClass')`` transformer;
      3. drop ``Alley``, ``Fence``, ``FireplaceQu``, ``MiscFeature``, ``PoolQC``;
      4. split the columns into numeric / categorical with ``divide_columns``;
      5. push the categorical columns through the ``feature_selection`` ->
         ``one_hot_encoding`` pipeline and join them to the numeric columns;
      6. fill every numeric column with the ``'mean'`` strategy;
      7. print the columns that still contain nulls (most nulls first);
      8. apply ``log1p`` to the numeric columns whose skewness is above 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows. Must contain every column dropped in steps 1 and 3
        (``drop`` raises ``KeyError`` otherwise). The frame passed in is not
        modified: the drops return new frames instead of using ``inplace``.

    Returns
    -------
    X
        The object produced by the ``concat`` helper after filling and the
        log transform (used as a DataFrame here; helper is defined in
        ``my_transformer`` -- TODO confirm its exact return type).

    Notes
    -----
    NOTE(review): the fill values and the skewness selection are computed over
    all rows passed in; ``data_set`` passes train and test concatenated.
    """
    # Non-inplace drop: the caller keeps its original columns.
    data = data.drop(['SalePrice', 'Id'], axis=1)

    astype_str = change_to_str('MSSubClass')
    data = astype_str.fit_transform(data)

    data = data.drop(['Alley', 'Fence', 'FireplaceQu', 'MiscFeature', 'PoolQC'], axis=1)

    num_columns, cat_columns = divide_columns(data)

    pipeline_cat = make_pipeline(
        feature_selection(cat_columns),
#         simple_imputer('most_frequent'),
        one_hot_encoding(cat_columns)
    )

    X_cat = pipeline_cat.fit_transform(data)
    X_num = data[num_columns]
    X = concat(X_num, X_cat)

    for column in num_columns :
        X = fill_columns(X, column, 'mean')

    # Report whatever is still missing after the numeric fill.
    isnull_sum = X.isnull().sum()
    print(isnull_sum[isnull_sum > 0].sort_values(ascending=False))

    # Log-transform only the strongly right-skewed numeric columns.
    skew_features = X[num_columns].apply(skew)
    skew_features_top = skew_features[skew_features > 1]
    X[skew_features_top.index] = np.log1p(X[skew_features_top.index])

    return X

In [6]:
def data_set(train_path='train.csv', test_path='test.csv') :
    """Load the train/test CSVs and return model-ready features and target.

    The two files are stacked row-wise, processed together by
    ``preparation`` and then split back apart by position.

    Parameters
    ----------
    train_path : str, optional
        CSV with the training rows; must contain a ``SalePrice`` column.
    test_path : str, optional
        CSV with the test rows.

    Returns
    -------
    X_train, X_test
        The first ``len(train)`` rows and the remaining rows of the processed
        feature matrix.
    y_train
        ``log1p`` of the training ``SalePrice`` column.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    data = pd.concat([train, test], axis=0)

    X = preparation(data)

    # Split at the actual number of training rows rather than a hard-coded
    # 1460. Assumes preparation() keeps row count and order -- it only drops
    # columns in the visible code.
    n_train = len(train)
    X_train = X.iloc[:n_train, :]
    X_test = X.iloc[n_train:, :]

    y = train['SalePrice']
    y_train = np.log1p(y)

    return X_train, X_test, y_train

In [7]:
# Build the processed train/test feature frames and the log1p-transformed
# training target; preparation() also prints any columns still holding nulls.
X_train, X_test, y_train = data_set()

Series([], dtype: int64)


In [8]:
# Ridge regression: 5-fold grid search over the penalty strength `alpha`,
# scored with negated mean squared error on the log1p target.
ridge = Ridge(random_state=30)
param_grid = [
    dict(alpha=[0.001, 0.01, 1, 10, 100, 1000]),
]
search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train, y_train)
search.best_params_

{'alpha': 10}

In [9]:
# One line per candidate: its parameters and mean CV error. The scorer
# returns negated MSE, hence the abs().
result = search.cv_results_
row_format = '{} : {:.4f}'
for params, mean_score in zip(result["params"], result["mean_test_score"]):
    print(row_format.format(params, abs(mean_score)))

{'alpha': 0.001} : 0.0208
{'alpha': 0.01} : 0.0206
{'alpha': 1} : 0.0174
{'alpha': 10} : 0.0162
{'alpha': 100} : 0.0179
{'alpha': 1000} : 0.0252


In [10]:
# Rank the features by the magnitude of the best model's coefficients and
# show the 20 largest as (|coef|, column) pairs.
feature_importances = np.abs(search.best_estimator_.coef_)
ranked = sorted(zip(feature_importances, X_train.columns), reverse=True)
ranked[:20]

[(0.15926765052314665, 'GrLivArea'),
 (0.14843082303300137, '1stFlrSF'),
 (0.1457789488066732, 'MSZoning_C (all)'),
 (0.10636770805661314, 'RoofMatl_ClyTile'),
 (0.08921033435812374, 'Neighborhood_StoneBr'),
 (0.0830521471462738, 'Condition2_PosN'),
 (0.07768736107021743, 'Neighborhood_Edwards'),
 (0.07694847889968091, 'Neighborhood_Crawfor'),
 (0.07190808145394349, 'LotArea'),
 (0.07094553232541652, 'Neighborhood_NridgHt'),
 (0.06460594395796118, 'Functional_Typ'),
 (0.06300936336485549, 'Exterior1st_BrkFace'),
 (0.06275721771031284, 'Neighborhood_NoRidge'),
 (0.06013644442957515, 'RoofMatl_WdShngl'),
 (0.05891109549697487, 'Functional_Maj2'),
 (0.05822401881909911, 'MSZoning_FV'),
 (0.050782146535058525, 'OverallQual'),
 (0.04995566229835544, 'Neighborhood_Mitchel'),
 (0.04781893234749694, 'Condition2_Norm'),
 (0.04762807501927813, 'GarageCars')]

In [11]:
# Lasso regression: 5-fold grid search over the penalty strength `alpha`,
# scored with negated mean squared error on the log1p target.
lasso = Lasso(random_state=30)
param_grid = [
    dict(alpha=[0.0001, 0.001, 0.01, 1, 10, 100]),
]
search = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train, y_train)
search.best_params_

{'alpha': 0.001}

In [12]:
# One line per candidate: its parameters and mean CV error. The scorer
# returns negated MSE, hence the abs().
result = search.cv_results_
row_format = '{} : {:.4f}'
for params, mean_score in zip(result["params"], result["mean_test_score"]):
    print(row_format.format(params, abs(mean_score)))

{'alpha': 0.0001} : 0.0168
{'alpha': 0.001} : 0.0157
{'alpha': 0.01} : 0.0231
{'alpha': 1} : 0.0633
{'alpha': 10} : 0.0846
{'alpha': 100} : 0.1596


In [13]:
# Rank the features by the magnitude of the best model's coefficients and
# show the 20 largest as (|coef|, column) pairs.
feature_importances = np.abs(search.best_estimator_.coef_)
ranked = sorted(zip(feature_importances, X_train.columns), reverse=True)
ranked[:20]

[(0.19984484378949127, 'MSZoning_C (all)'),
 (0.18649185773506927, 'GrLivArea'),
 (0.17811908172535973, '1stFlrSF'),
 (0.0891937624569626, 'Neighborhood_Crawfor'),
 (0.07261423602619697, 'Neighborhood_StoneBr'),
 (0.07097185177441688, 'LotArea'),
 (0.0623106889767113, 'Neighborhood_NridgHt'),
 (0.06229906781275155, 'Exterior1st_BrkFace'),
 (0.05981248847572987, 'OverallQual'),
 (0.05906352284546794, 'SaleCondition_Abnorml'),
 (0.05234041414124047, 'Functional_Typ'),
 (0.05062404122613029, 'Condition1_Norm'),
 (0.05044110355444801, 'Neighborhood_Edwards'),
 (0.04809322669252832, 'GarageCars'),
 (0.046585863530736335, 'KitchenQual_Ex'),
 (0.044959087413498515, 'BsmtQual_Ex'),
 (0.042939089396113746, 'OverallCond'),
 (0.04042296153936279, 'Neighborhood_NoRidge'),
 (0.04021724411463401, 'Neighborhood_Somerst'),
 (0.03625700059599559, 'BsmtExposure_Gd')]

In [14]:
# Elastic net: 5-fold grid search over every (alpha, l1_ratio) combination,
# scored with negated mean squared error on the log1p target. The alpha and
# l1_ratio given to the constructor are overridden by the grid during search.
elastic = ElasticNet(alpha=1, l1_ratio=0.5, random_state=30)
param_grid = [
    dict(
        alpha=[0.0001, 0.001, 0.01, 1, 10, 100],
        l1_ratio=[0.2, 0.35, 0.5, 0.65, 0.8],
    ),
]
search = GridSearchCV(
    estimator=elastic,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train, y_train)
search.best_params_

{'alpha': 0.001, 'l1_ratio': 0.5}

In [15]:
# One line per candidate: its parameters and mean CV error. The scorer
# returns negated MSE, hence the abs().
result = search.cv_results_
row_format = '{} : {:.4f}'
for params, mean_score in zip(result["params"], result["mean_test_score"]):
    print(row_format.format(params, abs(mean_score)))

{'alpha': 0.0001, 'l1_ratio': 0.2} : 0.0184
{'alpha': 0.0001, 'l1_ratio': 0.35} : 0.0179
{'alpha': 0.0001, 'l1_ratio': 0.5} : 0.0176
{'alpha': 0.0001, 'l1_ratio': 0.65} : 0.0173
{'alpha': 0.0001, 'l1_ratio': 0.8} : 0.0170
{'alpha': 0.001, 'l1_ratio': 0.2} : 0.0156
{'alpha': 0.001, 'l1_ratio': 0.35} : 0.0153
{'alpha': 0.001, 'l1_ratio': 0.5} : 0.0153
{'alpha': 0.001, 'l1_ratio': 0.65} : 0.0153
{'alpha': 0.001, 'l1_ratio': 0.8} : 0.0154
{'alpha': 0.01, 'l1_ratio': 0.2} : 0.0180
{'alpha': 0.01, 'l1_ratio': 0.35} : 0.0196
{'alpha': 0.01, 'l1_ratio': 0.5} : 0.0208
{'alpha': 0.01, 'l1_ratio': 0.65} : 0.0216
{'alpha': 0.01, 'l1_ratio': 0.8} : 0.0222
{'alpha': 1, 'l1_ratio': 0.2} : 0.0575
{'alpha': 1, 'l1_ratio': 0.35} : 0.0609
{'alpha': 1, 'l1_ratio': 0.5} : 0.0613
{'alpha': 1, 'l1_ratio': 0.65} : 0.0618
{'alpha': 1, 'l1_ratio': 0.8} : 0.0623
{'alpha': 10, 'l1_ratio': 0.2} : 0.0706
{'alpha': 10, 'l1_ratio': 0.35} : 0.0812
{'alpha': 10, 'l1_ratio': 0.5} : 0.0826
{'alpha': 10, 'l1_ratio': 0.65}

In [16]:
# Rank the features by the magnitude of the best model's coefficients and
# show the 20 largest as (|coef|, column) pairs.
feature_importances = np.abs(search.best_estimator_.coef_)
ranked = sorted(zip(feature_importances, X_train.columns), reverse=True)
ranked[:20]

[(0.3714736953274218, 'RoofMatl_ClyTile'),
 (0.2676057426421319, 'MSZoning_C (all)'),
 (0.2002554372412639, 'GrLivArea'),
 (0.17503214991634314, '1stFlrSF'),
 (0.1437958272159716, 'Condition2_PosN'),
 (0.10659761703292929, 'Neighborhood_StoneBr'),
 (0.09761669508819149, 'Neighborhood_Crawfor'),
 (0.07870323515313092, 'Neighborhood_NridgHt'),
 (0.07354797728616067, 'Exterior1st_BrkFace'),
 (0.06856841939527598, 'LotArea'),
 (0.06699638482104832, 'Neighborhood_NoRidge'),
 (0.06501758696333337, 'Functional_Typ'),
 (0.05926827812547618, 'KitchenQual_Ex'),
 (0.056575871342854946, 'Neighborhood_Edwards'),
 (0.05336336518525095, 'OverallQual'),
 (0.051191181523374846, 'SaleCondition_Abnorml'),
 (0.05075067600372821, 'Condition1_Norm'),
 (0.04953701965285945, 'Neighborhood_Somerst'),
 (0.04740308695961816, 'BsmtQual_Ex'),
 (0.0443989882666394, 'GarageCars')]

In [17]:
# Random forest (100 trees): 5-fold grid search over the maximum tree depth,
# scored with negated mean squared error on the log1p target.
rf = RandomForestRegressor(n_estimators=100, random_state=30)
param_grid = [
    dict(max_depth=[10, 15, 20]),
]
search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train, y_train)
search.best_params_

{'max_depth': 15}

In [18]:
# One line per candidate: its parameters and mean CV error. The scorer
# returns negated MSE, hence the abs().
result = search.cv_results_
row_format = '{} : {:.4f}'
for params, mean_score in zip(result["params"], result["mean_test_score"]):
    print(row_format.format(params, abs(mean_score)))

{'max_depth': 10} : 0.0208
{'max_depth': 15} : 0.0205
{'max_depth': 20} : 0.0206


In [19]:
# Rank the features by the best estimator's `feature_importances_` and show
# the 20 largest as (importance, column) pairs.
best_model = search.best_estimator_
feature_importances = best_model.feature_importances_
ranked = sorted(zip(feature_importances, X_train.columns), reverse=True)
ranked[:20]

[(0.551749742028365, 'OverallQual'),
 (0.11016392359084276, 'GrLivArea'),
 (0.04698381577659507, 'TotalBsmtSF'),
 (0.04257130592758489, 'GarageCars'),
 (0.024141001353117395, 'GarageArea'),
 (0.021609338122830102, 'BsmtFinSF1'),
 (0.02102948614149855, '1stFlrSF'),
 (0.015704198128319396, 'YearBuilt'),
 (0.01417882061339648, 'LotArea'),
 (0.011352032200789196, 'OverallCond'),
 (0.008268112742924206, 'YearRemodAdd'),
 (0.00664548043318209, '2ndFlrSF'),
 (0.0060577450206641984, 'LotFrontage'),
 (0.006007097988422408, 'Fireplaces'),
 (0.0058516614246438, 'CentralAir_N'),
 (0.00486998925424434, 'BsmtUnfSF'),
 (0.0047530804258337205, 'CentralAir_Y'),
 (0.003927898878914061, 'MSZoning_RM'),
 (0.0036303522440060236, 'GarageYrBlt'),
 (0.0035844344985736217, 'OpenPorchSF')]

In [20]:
# Gradient boosting (100 depth-3 trees): 5-fold grid search over the learning
# rate, scored with negated mean squared error on the log1p target.
# NOTE(review): in the recorded run learning_rate=10 diverged and produced an
# astronomically large CV error; it is kept here only to preserve the grid.
gb = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=30)
param_grid = [
    dict(learning_rate=[0.001, 0.01, 0.1, 1, 10]),
]
search = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train, y_train)
search.best_params_

{'learning_rate': 0.1}

In [21]:
# One line per candidate: its parameters and mean CV error. The scorer
# returns negated MSE, hence the abs().
result = search.cv_results_
row_format = '{} : {:.4f}'
for params, mean_score in zip(result["params"], result["mean_test_score"]):
    print(row_format.format(params, abs(mean_score)))

{'learning_rate': 0.001} : 0.1396
{'learning_rate': 0.01} : 0.0540
{'learning_rate': 0.1} : 0.0160
{'learning_rate': 1} : 0.0368
{'learning_rate': 10} : 8147641805968464756791155152136435449338150090755760719076423854811255106304928577882630898395853605476089854502278862522105323797412952715236947622985725293010389033655356986122635006967808.0000


In [22]:
# Rank the features by the best estimator's `feature_importances_` and show
# the 20 largest as (importance, column) pairs.
best_model = search.best_estimator_
feature_importances = best_model.feature_importances_
ranked = sorted(zip(feature_importances, X_train.columns), reverse=True)
ranked[:20]

[(0.460535386048635, 'OverallQual'),
 (0.1698967593530751, 'GrLivArea'),
 (0.050807739243889045, 'TotalBsmtSF'),
 (0.046856291404455436, 'GarageCars'),
 (0.036203720802304766, 'YearBuilt'),
 (0.02803850828091807, 'BsmtFinSF1'),
 (0.027076258549046404, 'YearRemodAdd'),
 (0.01865109072996971, 'GarageArea'),
 (0.016330325516379404, 'Fireplaces'),
 (0.01582227881659197, 'OverallCond'),
 (0.015603504458916248, 'LotArea'),
 (0.013712718804095258, '1stFlrSF'),
 (0.01047801905879691, 'CentralAir_N'),
 (0.006347924194558466, 'ExterQual_TA'),
 (0.005490754251189732, 'BsmtQual_Ex'),
 (0.00529570534530531, 'MSZoning_C (all)'),
 (0.005203835524598234, 'FullBath'),
 (0.004444437403811901, '2ndFlrSF'),
 (0.004401733992914414, 'MSZoning_RM'),
 (0.003977546049029851, 'MSZoning_RL')]

In [23]:
# XGBoost regressor: 5-fold grid search over the learning rate, scored with
# negated mean squared error on the log1p target.
# NOTE(review): in the recorded run the two smallest learning rates gave very
# large errors -- presumably too few boosting rounds at the default settings.
xgb = XGBRegressor(random_state=30)
param_grid = [
    dict(learning_rate=[0.001, 0.01, 0.1, 1]),
]
search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train, y_train)
search.best_params_



{'learning_rate': 0.1}

In [24]:
# One line per candidate: its parameters and mean CV error. The scorer
# returns negated MSE, hence the abs().
result = search.cv_results_
row_format = '{} : {:.4f}'
for params, mean_score in zip(result["params"], result["mean_test_score"]):
    print(row_format.format(params, abs(mean_score)))

{'learning_rate': 0.001} : 108.8978
{'learning_rate': 0.01} : 17.9421
{'learning_rate': 0.1} : 0.0169
{'learning_rate': 1} : 0.0317


In [25]:
# Rank the features by the best estimator's `feature_importances_` and show
# the 20 largest as (importance, column) pairs.
best_model = search.best_estimator_
feature_importances = best_model.feature_importances_
ranked = sorted(zip(feature_importances, X_train.columns), reverse=True)
ranked[:20]

[(0.12694754, 'GarageCars'),
 (0.064219296, 'OverallQual'),
 (0.059413463, 'GarageType_Attchd'),
 (0.056744818, 'Fireplaces'),
 (0.05671513, 'TotalBsmtSF'),
 (0.05515654, 'GarageCond_TA'),
 (0.053765666, 'YearRemodAdd'),
 (0.037046615, 'GrLivArea'),
 (0.035829023, 'CentralAir_N'),
 (0.032158524, 'GarageQual_TA'),
 (0.029776314, 'MSZoning_RL'),
 (0.02784636, 'BsmtQual_Ex'),
 (0.024210434, 'YearBuilt'),
 (0.015903784, 'MSZoning_RM'),
 (0.015557663, 'KitchenQual_Ex'),
 (0.013620804, 'KitchenQual_TA'),
 (0.012927861, 'BsmtFinType1_GLQ'),
 (0.011069388, 'LotArea'),
 (0.01078211, 'BsmtExposure_Gd'),
 (0.009873064, 'BsmtFinSF1')]