In [1]:
import numpy as np
from scipy import stats
from scipy.stats import skew
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.externals import joblib   # 함수는 dump 시켜도 안됨
from sklearn.feature_selection import SelectPercentile, f_classif, f_regression, SelectFromModel, RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [4]:
from my_transformer import change_to_str, divide_columns, feature_selection, simple_imputer, one_hot_encoding, concat, rf_imputer, fill_columns
from my_transformer import rmsle_scorer, neg_rmsle_scorer, rmsle

In [5]:
def preparation(data) :
    
    data.drop(['SalePrice', 'Id'], axis=1, inplace=True)
    
    astype_str = change_to_str('MSSubClass')
    data = astype_str.fit_transform(data)
    
    isnull_sum = data.isnull().sum()
    data = data[isnull_sum[isnull_sum == 0].index]
    
    num_columns, cat_columns = divide_columns(data)
    
    pipeline_cat = make_pipeline(
        feature_selection(cat_columns),
        simple_imputer('most_frequent'),
        one_hot_encoding(cat_columns)
    )
    
    X_cat = pipeline_cat.fit_transform(data)
    X_num = data[num_columns]
    X = concat(X_num, X_cat)
        
    isnull_sum = X.isnull().sum()
    print(isnull_sum[isnull_sum > 0].sort_values(ascending=False))
    
    skew_features = X[num_columns].apply(lambda x : skew(x))
    skew_features_top = skew_features[skew_features > 1]
    X[skew_features_top.index] = np.log1p(X[skew_features_top.index])
            
    return X

In [6]:
def data_set() :
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    
    data = pd.concat([train, test], axis=0)
    
    X = preparation(data)
    X_train = X.iloc[:1460, :]
    X_test = X.iloc[1460:, :]
    
    y = train['SalePrice']
    y_train = np.log1p(y)
    
    return X_train, X_test, y_train

In [7]:
def feature_auto_selection(X_train, y_train) :
    select_models = [SelectPercentile(score_func=f_regression, percentile=10),
            SelectFromModel(rf, threshold='0.5*mean'),
            RFE(lasso, n_features_to_select=30)]
    
    scores_list = []
    
    for model in select_models :
        select = model
        select.fit(X_train, y_train)
        X_train = select.transform(X_train)
        print(X_train.shape)
        
        scores = dict()

        for key, model in models.items() : 
            score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()   
            scores[key] = round(np.sqrt(-score), 3)
        
        scores_list.append(scores)
        
    scores_df = {model.__class__.__name__ : pd.Series(scores) for model, scores in zip(select_models, scores_list)}
    scores_df = pd.DataFrame(scores_df)
    
    return scores_df

In [9]:
X_train, X_test, y_train = data_set()

Series([], dtype: int64)


In [10]:
X_train.shape

(1460, 164)

In [11]:
knn = KNeighborsRegressor(n_neighbors=5)
linear = LinearRegression()
sgd = SGDRegressor(max_iter=None, eta0=0.01, penalty='l2', random_state=30)
ridge = Ridge(alpha=1, random_state=30)
lasso = Lasso(alpha=1, random_state=30)
elastic = ElasticNet(alpha=1, l1_ratio=0.5, random_state=30)
dt = DecisionTreeRegressor(max_depth=None, random_state=30)
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=30, n_jobs=-1)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=30)
svm = SVR(C=1, kernel='rbf', gamma='auto_deprecated')
mlp = MLPRegressor(hidden_layer_sizes=[100], solver='adam', activation='relu', alpha=0.0001, max_iter=200, random_state=30)
xgb = XGBRegressor(random_state=30)

In [12]:
models = {
    'knn' : knn,
    'linear' : linear,
    'sgd' : sgd,
    'ridge' : ridge,
    'lasso' : lasso,
    'elastic' : elastic,
    'dt' : dt,
    'rf' : rf,
    'gb' : gb,
    'xgb' : xgb,
    'svm' : svm,
    'mlp' : mlp
}

In [13]:
scores = dict()

for key, model in models.items() : 
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()   
    scores[key] = round(np.sqrt(-score), 3)
    
scores



{'knn': 0.243,
 'linear': 0.145,
 'sgd': 6917347452608911.0,
 'ridge': 0.14,
 'lasso': 0.29,
 'elastic': 0.287,
 'dt': 0.208,
 'rf': 0.151,
 'gb': 0.137,
 'xgb': 0.14,
 'svm': 0.246,
 'mlp': 0.98}

In [14]:
population_dict = {
    'California' : 38,
    'Texas' : 26,
    'New York' : 19,
    'Florida' : 19,
    'Illinois' : 12
}
population = pd.Series(population_dict)
print(population)

California    38
Texas         26
New York      19
Florida       19
Illinois      12
dtype: int64


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=30)

In [16]:
for key, model in models.items() : 
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    score_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    score_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    
    print('{}, train_score : {}, test_score : {}' . format(key, round(score_train, 3), round(score_test, 3)))

knn, train_score : 0.192, test_score : 0.255
linear, train_score : 0.111, test_score : 0.141
sgd, train_score : 9895785475320566.0, test_score : 9833027633622028.0
ridge, train_score : 0.113, test_score : 0.141
lasso, train_score : 0.285, test_score : 0.305
elastic, train_score : 0.282, test_score : 0.302
dt, train_score : 0.001, test_score : 0.236
rf, train_score : 0.056, test_score : 0.167
gb, train_score : 0.09, test_score : 0.161
xgb, train_score : 0.098, test_score : 0.153
svm, train_score : 0.102, test_score : 0.278
mlp, train_score : 0.849, test_score : 0.907


In [17]:
X_train, X_test, y_train = data_set()

Series([], dtype: int64)


In [18]:
scores_df = feature_auto_selection(X_train, y_train)

(1460, 17)
(1460, 6)
(1460, 6)


In [19]:
scores_df

Unnamed: 0,SelectPercentile,SelectFromModel,RFE
knn,0.226,0.233,0.233
linear,0.157,0.161,0.161
sgd,4783475000000000.0,4604651000000000.0,4604651000000000.0
ridge,0.157,0.161,0.161
lasso,0.31,0.31,0.31
elastic,0.307,0.307,0.307
dt,0.222,0.216,0.216
rf,0.159,0.162,0.162
gb,0.152,0.152,0.152
xgb,0.154,0.155,0.155
