In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [4]:
class change_to_str(BaseEstimator, TransformerMixin):
    def __init__(self, column_name=None):
        self.column_name = column_name
        
    def fit(self, df_X, y=None):
        return self
    
    def transform(self, df_X):
        df_X[self.column_name] = df_X[self.column_name].astype('str')
        return df_X

In [5]:
class simple_imputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='mean'):
        self.strategy = strategy
        
    def fit(self, df_X, y=None):
        return self
    
    def transform(self, df_X):
        imputer = SimpleImputer(strategy=self.strategy)
        np_X = imputer.fit_transform(df_X)
        df_X = pd.DataFrame(np_X, columns=df_X.columns, index=df_X.index)
        
        return df_X

In [6]:
def fill_columns(df, columns, strategy='constant', value=0) :
    
    if strategy == 'constant' :
        df[columns].fillna(value, inplace=True)
        
    elif strategy == 'mean' :
        value = df[columns].mean()
        df[columns].fillna(value, inplace=True)
        
    elif strategy == 'median' :
        value = df[columns].median()
        df[columns].fillna(value, inplace=True)
        
    return df

In [7]:
def rf_imputer(df, column_imp, columns_rf) :
    
    df_impute = df[df[column_imp].isnull()]
    df_rf = df[df[column_imp].notnull()]
    
#     df_rf[column_imp] = df_rf[column_imp].astype('str')
    
    rf_imp = RandomForestClassifier()
    rf_imp.fit(df_rf[columns_rf], df_rf[column_imp])
    impute_values = rf_imp.predict(df_impute[columns_rf])
    df_impute[column_imp] = impute_values
    
    df_new = pd.concat([df_impute, df_rf], axis=0)
    
#     df_new[column_imp] = df_new[column_imp].astype('float')
    
    return df_new

In [8]:
def divide_columns(df_X) :
    columns = df_X.columns

    num_columns = []
    cat_columns = []

    for column in columns :
        if df_X[column].dtypes == 'object' :
            cat_columns.append(column)

        else :
            num_columns.append(column)

    return num_columns, cat_columns

In [9]:
class feature_selection(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        
    def fit(self, df_X, y=None):
        return self
    
    def transform(self, df_X):
        return df_X[self.columns]

In [10]:
class one_hot_encoding(BaseEstimator, TransformerMixin):
    def __init__(self, column_name=None, prefix=None):
        self.column_name = column_name
        self.prefix = prefix
        
    def fit(self, df_X, y=None):
        return self
    
    def transform(self, df_X):
        onehotencoding = pd.get_dummies(df_X[self.column_name], prefix = self.prefix)
        df_X.drop(self.column_name, axis=1, inplace=True)
        return pd.concat([df_X, onehotencoding], axis=1)

In [11]:
class standard_scaler(BaseEstimator, TransformerMixin):      
    def fit(self, df_X, y=None):
        return self
    
    def transform(self, df_X):
        scaler = StandardScaler()
        scaler.fit(df_X)
        X = scaler.transform(df_X)
        df_X = pd.DataFrame(X, columns=df_X.columns, index=df_X.index)
        return df_X

In [12]:
def concat(df_A, df_B) : 
    return pd.concat([df_A, df_B], axis=1)

In [13]:
def concat_list(df_list) : 
    return pd.concat(df_list, axis=1)

In [14]:
from sklearn.metrics import make_scorer

def rmsle(predicted_values, actual_values):
    # 넘파이로 배열 형태로 바꿔준다.
    predicted_values = np.array(predicted_values)
    actual_values = np.array(actual_values)
    
    # 예측값과 실제 값에 1을 더하고 로그를 씌워준다.
    log_predict = np.log(predicted_values + 1)
    log_actual = np.log(actual_values + 1)
    
    # 위에서 계산한 예측값에서 실제값을 빼주고 제곱을 해준다.
    difference = log_predict - log_actual
    # difference = (log_predict - log_actual) ** 2
    difference = np.square(difference)
    
    # 평균을 낸다.
    mean_difference = difference.mean()
    
    # 다시 루트를 씌운다.
    score = np.sqrt(mean_difference)
    
    return score

rmsle_scorer = make_scorer(rmsle)

In [15]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [16]:
data = train.drop(['SalePrice', 'Id'], axis=1)
y = train['SalePrice']
data.shape

(1460, 79)

In [17]:
astype_str = change_to_str('MSSubClass')
data = astype_str.fit_transform(data)

In [18]:
num_columns, cat_columns = divide_columns(data)
print('num_columns : {}, cat_columns : {}' .format(len(num_columns), len(cat_columns)))

num_columns : 35, cat_columns : 44


In [19]:
num_null_0 = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
              'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']
num_null_mean = 'LotFrontage'
num_null_rf = 'GarageYrBlt'
replace_null = ['Alley', 'Fence', 'FireplaceQu', 'MiscFeature', 'PoolQC']

In [20]:
for column in replace_null :
    data.loc[data[column].isnull(), column] = 'NA'

In [21]:
pipeline_cat = make_pipeline(
    feature_selection(cat_columns),
    simple_imputer('most_frequent'),
    one_hot_encoding(cat_columns)
)

In [22]:
X_cat = pipeline_cat.fit_transform(data)
X_cat.shape

(1460, 272)

In [23]:
X_num = data[num_columns]
X_num.shape

(1460, 35)

In [24]:
X = concat(X_num, X_cat)

In [25]:
isnull_sum = X.isnull().sum()
not_null = list(isnull_sum[isnull_sum == 0].index)
null_columns = list(isnull_sum[isnull_sum > 0])
isnull_sum[isnull_sum > 0].sort_values(ascending=False)

LotFrontage    259
GarageYrBlt     81
MasVnrArea       8
dtype: int64

In [26]:
for column in num_columns :
    if column == num_null_rf :
        X = rf_imputer(X, column, not_null)
        
    elif column == num_null_mean :
        X = fill_columns(X, column, 'mean')
        
    else :
        X = fill_columns(X, column, 'constant', 0)

In [27]:
isnull_sum = X.isnull().sum()
isnull_sum[isnull_sum > 0].sort_values(ascending=False)

Series([], dtype: int64)

In [28]:
def preparation(data) :
    astype_str = change_to_str('MSSubClass')
    data = astype_str.fit_transform(data)
    
    num_columns, cat_columns = divide_columns(data)
    
    num_null_0 = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                  'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']
    num_null_mean = 'LotFrontage'
    num_null_rf = 'GarageYrBlt'
    replace_null = ['Alley', 'Fence', 'FireplaceQu', 'MiscFeature', 'PoolQC']
    
    for column in replace_null :
        data.loc[data[column].isnull(), column] = 'NA'
    
    pipeline_cat = make_pipeline(
        feature_selection(cat_columns),
        simple_imputer('most_frequent'),
        one_hot_encoding(cat_columns)
    )
    
    X_cat = pipeline_cat.fit_transform(data)
    X_num = data[num_columns]
    X = concat(X_num, X_cat)
    
    isnull_sum = X.isnull().sum()
    not_null = list(isnull_sum[isnull_sum == 0].index)
    null_columns = list(isnull_sum[isnull_sum > 0])
    
    for column in num_columns :
        if column == num_null_rf :
            X = rf_imputer(X, column, not_null)

        elif column == num_null_mean :
            X = fill_columns(X, column, 'mean')

        else :
            X = fill_columns(X, column, 'constant', 0)
        
    isnull_sum = X.isnull().sum()
    print(isnull_sum[isnull_sum > 0].sort_values(ascending=False))
            
    return X

In [29]:
X = preparation(data)

Series([], dtype: int64)


In [30]:
X.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
39,65.0,6040,4,5,1955,1955,0.0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
48,33.0,4456,4,5,1920,2008,0.0,0,0,736,...,0,1,0,0,0,0,0,0,0,1
78,72.0,10778,4,5,1968,1968,0.0,0,0,1768,...,0,0,0,1,0,0,0,0,1,0
88,105.0,8470,3,2,1915,1982,0.0,0,0,1013,...,0,0,0,0,1,0,0,0,0,0
89,60.0,8070,4,5,1994,1995,0.0,588,0,402,...,0,0,0,1,0,0,0,0,1,0


In [31]:
knn = KNeighborsRegressor(n_neighbors=5)
linear = LinearRegression()
sgd = SGDRegressor(max_iter=None, eta0=0.01, penalty='l2', random_state=30)
ridge = Ridge(alpha=1, random_state=30)
lasso = Lasso(alpha=1, random_state=30)
elastic = ElasticNet(alpha=1, l1_ratio=0.5, random_state=30)
dt = DecisionTreeRegressor(max_depth=None, random_state=30)
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=30, n_jobs=-1)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=30)
svm = SVR(C=1, kernel='rbf', gamma='auto_deprecated')
mlp = MLPRegressor(hidden_layer_sizes=[100], solver='adam', activation='relu', alpha=0.0001, max_iter=200, random_state=30)
xgb = XGBRegressor(random_state=30)

In [32]:
models = {
    'knn' : knn,
    'linear' : linear,
    'sgd' : sgd,
    'ridge' : ridge,
    'lasso' : lasso,
    'elastic' : elastic,
    'dt' : dt,
    'rf' : rf,
    'gb' : gb,
    'xgb' : xgb,
    'svm' : svm,
    'mlp' : mlp
}

In [33]:
scores = dict()

for key, model in models.items() : 
    score = cross_val_score(model, X, y, cv=5, scoring=rmsle_scorer).mean()   
    scores[key] = round(score, 3)



In [34]:
scores

{'knn': 0.444,
 'linear': nan,
 'sgd': nan,
 'ridge': 0.459,
 'lasso': nan,
 'elastic': 0.413,
 'dt': 0.595,
 'rf': 0.43,
 'gb': 0.426,
 'xgb': 0.425,
 'svm': 0.4,
 'mlp': 0.413}

In [35]:
y = np.log1p(y)

In [37]:
scores = dict()

for key, model in models.items() : 
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()   
    scores[key] = round(np.sqrt(-score), 3)
    
scores



{'knn': 0.433,
 'linear': 0.463,
 'sgd': 8.979190772338786e+16,
 'ridge': 0.443,
 'lasso': 0.401,
 'elastic': 0.401,
 'dt': 0.583,
 'rf': 0.416,
 'gb': 0.423,
 'xgb': 0.416,
 'svm': 0.4,
 'mlp': 7.91}