# Reasonable Model (resonable.csv)
> Public Score: 25787.83648

In [1]:
import sys, os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Show up to 40 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 40)

In [2]:
# Data directory and input files.
# NOTE(review): this is a machine-specific absolute path; point it at your own data folder before running elsewhere.
file_path = os.path.join("c:\\Users\\prohe\\OneDrive\\바탕 화면\\알고리즘PBL2\\", 'data')
# Build the file paths with os.path.join instead of appending '\dataset.csv' / '\problem.csv':
# '\d' and '\p' are invalid escape sequences that only work by accident and trigger a warning on recent Python.
train_origin = pd.read_csv(os.path.join(file_path, 'dataset.csv'))
test_origin = pd.read_csv(os.path.join(file_path, 'problem.csv'))

In [3]:
# (rows, columns) of the raw train and test frames
train_origin.shape, test_origin.shape

((1340, 24), (130, 23))

In [4]:
# Preprocess copies so the frames read from disk stay untouched
train, test = train_origin.copy(), test_origin.copy()

In [5]:
# Shrink numeric columns to the smallest dtype that can hold their values, to save memory
def downcast(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place and return ``df``.

    - ``bool`` columns become ``int8``.
    - Integer columns, and float columns whose values are all whole numbers,
      are downcast to the smallest signed integer dtype.
    - Remaining numeric columns are downcast to the smallest float dtype.
    - Non-numeric columns (object, category, datetime, ...) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    verbose : bool, default True
        When True, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2   # memory before, in MiB
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif not pd.api.types.is_numeric_dtype(df[col]):
            # Rounding / to_numeric is only meaningful for numeric data; skipping here
            # keeps category and datetime columns from crashing or being turned into integers.
            continue
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2     # memory after, in MiB

    if verbose:
        print(f'{(100*(start_mem - end_mem) / start_mem):.1f}% 압축됨')

    return df

In [6]:
# Downcast both frames in place; downcast() returns its argument, so the cell displays the test frame
downcast(train)
downcast(test)

67.2% 압축됨
68.7% 압축됨


Unnamed: 0,Id,LotArea,Street,LotConfig,OverallQual,OverallCond,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageType,GarageYrBlt,GarageCars,GarageArea,YrSold
0,1341,11200,Pave,Inside,5,5,1965,1965,1040,1040,0,1040,1,0,3,1,5,0,Detchd,1965,1,384,2008
1,1342,7200,Pave,Corner,5,7,1951,2000,900,900,0,900,1,0,3,1,5,0,Detchd,2005,2,576,2010
2,1343,16905,Pave,Inside,5,6,1959,1959,1350,1328,0,1328,1,1,2,1,5,2,Attchd,1959,1,308,2007
3,1344,9180,Pave,CulDSac,5,7,1983,1983,840,884,0,884,1,0,2,1,5,0,Attchd,1983,2,504,2007
4,1345,7200,Pave,Inside,5,7,1920,1996,530,581,530,1111,1,0,3,1,6,0,Detchd,1935,1,288,2007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,1466,11478,Pave,Inside,8,5,2007,2008,1704,1704,0,1704,2,0,3,1,7,1,Attchd,2008,3,772,2010
126,1467,16321,Pave,CulDSac,5,6,1957,1997,1484,1600,0,1600,1,0,2,1,6,2,Attchd,1957,1,319,2006
127,1468,6324,Pave,Inside,4,6,1927,1950,520,520,0,520,1,0,1,1,4,0,Detchd,1920,1,240,2008
128,1469,8500,Pave,Inside,4,4,1920,1950,649,649,668,1317,1,0,3,1,6,0,Detchd,1920,1,250,2008


# 전처리

In [7]:
# Per the author's check, the missing values all belong to houses without a garage,
# so fill them with a placeholder category and year 0.
# Assign the filled column back rather than calling fillna(inplace=True) on train[col]:
# that form is a chained in-place update, which warns on recent pandas and is a no-op under Copy-on-Write.
train['GarageType'] = train['GarageType'].fillna('Nan')
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(0)

# Total remaining missing values in train and test
train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

In [8]:
# Drop a few outliers: keep only rows with LotArea < 100000 and GrLivArea < 4000
train = train[
    (train['LotArea'] < 100000) & (train['GrLivArea'] < 4000)
].reset_index(drop=True)

In [9]:
# Separate the target column from the feature columns
y_train = train['SalePrice']
X_train = train.drop(columns='SalePrice')

In [10]:
# Stack the train features on top of the test rows so both get the same preprocessing
all_data = pd.concat([X_train, test], axis=0, ignore_index=True)

all_data.shape

(1464, 23)

In [11]:
# Derived features
# Full baths plus half baths counted as 0.5 each
all_data['Total_Bath'] = all_data['FullBath'] + (0.5 * all_data['HalfBath'])
# Years between the remodel year and 2010
# NOTE(review): 2010 is hard-coded; presumably the latest sale year in the data — confirm
all_data['YrBuiltOrRemod'] = 2010 - all_data['YearRemodAdd']
# Basement + first floor + second floor area
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# 0/1 presence flags, computed with a vectorized comparison instead of a per-row lambda;
# int64 matches the dtype the original apply() produced.
all_data['Has2ndfloor'] = (all_data['2ndFlrSF'] > 0).astype('int64')
all_data['Hasgarage'] = (all_data['GarageArea'] > 0).astype('int64')
all_data['Hasbsmt'] = (all_data['TotalBsmtSF'] > 0).astype('int64')
all_data['Hasfireplace'] = (all_data['Fireplaces'] > 0).astype('int64')

In [12]:
# Inspect the combined frame with the derived columns appended
all_data

Unnamed: 0,Id,LotArea,Street,LotConfig,OverallQual,OverallCond,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageType,GarageYrBlt,GarageCars,GarageArea,YrSold,Total_Bath,YrBuiltOrRemod,TotalSF,Has2ndfloor,Hasgarage,Hasbsmt,Hasfireplace
0,1,8450,Pave,Inside,7,5,2003,2003,856,856,854,1710,2,1,3,1,8,0,Attchd,2003.0,2,548,2008,2.5,7,2566,1,1,1,0
1,2,9600,Pave,FR2,6,8,1976,1976,1262,1262,0,1262,2,0,3,1,6,1,Attchd,1976.0,2,460,2007,2.0,34,2524,0,1,1,1
2,3,11250,Pave,Inside,7,5,2001,2002,920,920,866,1786,2,1,3,1,6,1,Attchd,2001.0,2,608,2008,2.5,8,2706,1,1,1,1
3,4,9550,Pave,Corner,7,5,1915,1970,756,961,756,1717,1,0,3,1,7,1,Detchd,1998.0,3,642,2006,1.0,40,2473,1,1,1,1
4,5,14260,Pave,FR2,8,5,2000,2000,1145,1145,1053,2198,2,1,4,1,9,1,Attchd,2000.0,3,836,2008,2.5,10,3343,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,1466,11478,Pave,Inside,8,5,2007,2008,1704,1704,0,1704,2,0,3,1,7,1,Attchd,2008.0,3,772,2010,2.0,2,3408,0,1,1,1
1460,1467,16321,Pave,CulDSac,5,6,1957,1997,1484,1600,0,1600,1,0,2,1,6,2,Attchd,1957.0,1,319,2006,1.0,13,3084,0,1,1,1
1461,1468,6324,Pave,Inside,4,6,1927,1950,520,520,0,520,1,0,1,1,4,0,Detchd,1920.0,1,240,2008,1.0,60,1040,0,1,1,0
1462,1469,8500,Pave,Inside,4,4,1920,1950,649,649,668,1317,1,0,3,1,6,0,Detchd,1920.0,1,250,2008,1.0,60,1966,1,1,1,0


In [13]:
# Columns removed before modelling
no_use_features = [
    'Id', 'YrSold', 'Street',
    'YearBuilt', 'YearRemodAdd',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'FullBath', 'HalfBath',
    'GarageArea', 'GrLivArea', 'KitchenAbvGr',
]

all_data.drop(columns=no_use_features, inplace=True)

In [14]:
# Inspect the frame after the unused columns were dropped
all_data

Unnamed: 0,LotArea,LotConfig,OverallQual,OverallCond,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageType,GarageYrBlt,GarageCars,Total_Bath,YrBuiltOrRemod,TotalSF,Has2ndfloor,Hasgarage,Hasbsmt,Hasfireplace
0,8450,Inside,7,5,3,8,0,Attchd,2003.0,2,2.5,7,2566,1,1,1,0
1,9600,FR2,6,8,3,6,1,Attchd,1976.0,2,2.0,34,2524,0,1,1,1
2,11250,Inside,7,5,3,6,1,Attchd,2001.0,2,2.5,8,2706,1,1,1,1
3,9550,Corner,7,5,3,7,1,Detchd,1998.0,3,1.0,40,2473,1,1,1,1
4,14260,FR2,8,5,4,9,1,Attchd,2000.0,3,2.5,10,3343,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,11478,Inside,8,5,3,7,1,Attchd,2008.0,3,2.0,2,3408,0,1,1,1
1460,16321,CulDSac,5,6,2,6,2,Attchd,1957.0,1,1.0,13,3084,0,1,1,1
1461,6324,Inside,4,6,1,4,0,Detchd,1920.0,1,1.0,60,1040,0,1,1,0
1462,8500,Inside,4,4,3,6,0,Detchd,1920.0,1,1.0,60,1966,1,1,1,0


In [15]:
# One-hot encode the categorical columns.
# dtype is pinned to uint8 so the dummy columns are numeric on every pandas version:
# pandas >= 2.0 defaults to bool, and a frame mixing bool with int/float columns
# converts to an object array under np.array(...), which the models are later fed.
final_data = pd.get_dummies(all_data, dtype='uint8').reset_index(drop=True)
final_data.shape

(1464, 27)

In [17]:
# Split the encoded frame back into train rows (the first len(y_train)) and test rows (the rest)
X_train, X_test = final_data.iloc[:len(y_train)], final_data.iloc[len(y_train):]
X_train.shape, y_train.shape, X_test.shape

((1334, 27), (1334,), (130, 27))

In [18]:
# Check the skewness of a couple of features
cols = ['LotArea', 'TotalSF']
X_train[cols].skew()

LotArea    4.187933
TotalSF    0.671507
dtype: float64

In [19]:
# Apply log1p to the highly skewed feature
from scipy.stats import skew

# X_train / X_test are iloc slices of final_data and LotArea holds integers, so writing the
# float result through .loc is an in-place dtype change on a slice (SettingWithCopy /
# incompatible-dtype warnings). .assign builds new frames with the column replaced in position.
X_train = X_train.assign(LotArea=np.log1p(X_train['LotArea']))
X_test = X_test.assign(LotArea=np.log1p(X_test['LotArea']))

In [20]:
# Skewness of the same features after the log1p transform of LotArea
X_train[cols].skew()

LotArea   -0.711377
TotalSF    0.671507
dtype: float64

In [22]:
# Check the skewness of the target
y_train.skew()

1.6133418376671402

In [23]:
# Log-transform the target.
# Recompute from train['SalePrice'] (the source of y_train) instead of from y_train itself,
# so re-running this cell cannot apply log1p a second time.
y_train = np.log1p(train['SalePrice'])
y_train.skew()

0.10701619999746427

# 모델링

In [24]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

# Shared 10-fold splitter (shuffled, fixed seed) reused by cv_rmse and the *CV estimators
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``.

    No logarithm is taken here, so the result is an RMSLE only when both
    arguments are already log-transformed; on raw values it is a plain RMSE.
    """
    squared_error = mean_squared_error(y, y_pred)
    return np.sqrt(squared_error)

def cv_rmse(model):
    """Cross-validate ``model`` on the notebook-level ``X_train`` / ``y_train``.

    Uses the shared ``kfolds`` splitter and returns an array with one RMSE per fold.
    """
    neg_mse_per_fold = cross_val_score(
        model, X_train, y_train,
        scoring="neg_mean_squared_error",
        cv=kfolds,
    )
    # The scorer returns negated MSE, so flip the sign before taking the root
    return np.sqrt(-neg_mse_per_fold)

In [25]:
# Candidate regularisation strengths for RidgeCV
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
# Candidate regularisation strengths for LassoCV
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
# Candidate alphas and L1 ratios for ElasticNetCV
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

In [26]:
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler


# Linear models and SVR, each preceded by a RobustScaler
ridge = make_pipeline(
    RobustScaler(),
    RidgeCV(alphas=alphas_alt, cv=kfolds),
)
lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=10000000, alphas=alphas2, random_state=42, cv=kfolds),
)
elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=10000000, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio),
)
svr = make_pipeline(
    RobustScaler(),
    SVR(C=20, epsilon=0.008, gamma=0.0003),
)

In [27]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting with Huber loss and a fixed seed
gbr = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)

In [28]:
from xgboost import XGBRegressor

# XGBoost regressor: many shallow trees with a small learning rate
xgboost = XGBRegressor(
    learning_rate=0.01,
    n_estimators=3460,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006,
)

In [29]:
from lightgbm import LGBMRegressor

# LightGBM regressor: small trees, row bagging and feature sub-sampling with fixed seeds
lightgbm = LGBMRegressor(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)

In [30]:
from mlxtend.regressor import StackingCVRegressor

# Stacked ensemble: six base regressors, with XGBoost as the meta-regressor.
# use_features_in_secondary=True feeds the original features to the meta-regressor
# alongside the base models' predictions.
# random_state is fixed because StackingCVRegressor shuffles its internal folds by default;
# without a seed the stacked predictions change from run to run.
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True,
                                random_state=42)

In [31]:
# Cross-validated RMSE on the log target for every base model.
# One loop replaces seven copy-pasted score/print pairs; printed text is unchanged.
cv_candidates = [
    ("Ridge", ridge),
    ("LASSO", lasso),
    ("Elastic net", elasticnet),
    ("SVR", svr),
    ("gbr", gbr),
    ("xgboost", xgboost),
    ("lightgbm", lightgbm),
]

cv_scores = []
for cv_label, cv_model in cv_candidates:
    fold_rmse = cv_rmse(cv_model)
    cv_scores.append(fold_rmse)
    print("{}: {:.4f} ({:.4f})\n".format(cv_label, fold_rmse.mean(), fold_rmse.std()))

# Keep the per-model names: the summary table cell reads score1 ... score7
score1, score2, score3, score4, score5, score6, score7 = cv_scores

Ridge: 0.1402 (0.0123)

LASSO: 0.1384 (0.0131)

Elastic net: 0.1388 (0.0129)

SVR: 0.1388 (0.0129)

gbr: 0.1331 (0.0130)

xgboost: 0.1315 (0.0111)

lightgbm: 0.1325 (0.0115)



In [214]:
# Summary table: mean and standard deviation of the per-fold CV scores for each model
_fold_scores = (score1, score2, score3, score4, score5, score6, score7)
model_performances = pd.DataFrame({
    'Model': ['Ridge', 'Lasso', 'ElasticNet', 'SVR', 'GradientBoosting', 'XGBoost', 'LightGBM'],
    'Score': [fold.mean() for fold in _fold_scores],
    'Std': [fold.std() for fold in _fold_scores]})

model_performances

Unnamed: 0,Model,Score,Std
0,Ridge,0.146652,0.003962
1,Lasso,0.146034,0.003977
2,ElasticNet,0.14638,0.003958
3,SVR,0.143069,0.005951
4,GradientBoosting,0.04408,0.005097
5,XGBoost,0.072236,0.003563
6,LightGBM,0.100767,0.003736


In [32]:
# Fit every model on the full training set; the prints only trace progress.
print('START Fit')

print('stack_gen')
# The stacked model is fitted on plain NumPy arrays rather than the DataFrame / Series
stack_gen_model = stack_gen.fit(np.array(X_train), np.array(y_train))

print('elasticnet')
elastic_model_full_data = elasticnet.fit(X_train, y_train)

print('Lasso')
lasso_model_full_data = lasso.fit(X_train, y_train)

print('Ridge')
ridge_model_full_data = ridge.fit(X_train, y_train)

print('Svr')
svr_model_full_data = svr.fit(X_train, y_train)

print('GradientBoosting')
gbr_model_full_data = gbr.fit(X_train, y_train)

print('xgboost')
xgb_model_full_data = xgboost.fit(X_train, y_train)

print('lightgbm')
lgb_model_full_data = lightgbm.fit(X_train, y_train)

START Fit
stack_gen
elasticnet
Lasso
Ridge
Svr
GradientBoosting
xgboost
lightgbm


In [33]:
# Training-set error of every fitted model, measured on the original price scale.
# Both the target and the predictions go through expm1 first, so the value returned by
# rmsle() is a plain RMSE in price units; the labels say RMSE instead of RMSLE for that reason.
train_price = np.expm1(y_train)
fitted_models = [
    ('Ridge', ridge_model_full_data, X_train),
    ('Lasso', lasso_model_full_data, X_train),
    ('ElasticNet', elastic_model_full_data, X_train),
    ('SVR', svr_model_full_data, X_train),
    ('GradientBoosting', gbr_model_full_data, X_train),
    ('Xgboost', xgb_model_full_data, X_train),
    ('lightgbm', lgb_model_full_data, X_train),
    # the stacked model was fitted on NumPy arrays, so it is given an array here as well
    ('stack_gen', stack_gen_model, np.array(X_train)),
]

for fitted_label, fitted_model, fitted_input in fitted_models:
    print(f'{fitted_label} RMSE score on train data:')
    print(rmsle(train_price, np.expm1(fitted_model.predict(fitted_input))))

Ridge RMSLE score on train data:
25493.19746675185
Lasso RMSLE score on train data:
25243.861965555152
ElasticNet RMSLE score on train data:
25254.44228563902
SVR RMSLE score on train data:
25514.05979166108
GradientBoosting RMSLE score on train data:
10520.42405359543
Xgboost RMSLE score on train data:
14054.634697501904
lightgbm RMSLE score on train data:
19806.11194304361
stack_gen RMSLE score on train data:
14338.491531689933


In [54]:
def blend_models_predict(X):
    """Return the weighted sum of the fitted models' predictions for ``X``.

    Weights: ElasticNet 0.5, GradientBoosting 0.1, XGBoost 0.1, LightGBM 0.1,
    stacked model 0.2. The stacked model receives ``np.array(X)``; the other
    models receive ``X`` unchanged.
    """
    blended = 0.5 * elastic_model_full_data.predict(X)
    blended = blended + 0.1 * gbr_model_full_data.predict(X)
    blended = blended + 0.1 * xgb_model_full_data.predict(X)
    blended = blended + 0.1 * lgb_model_full_data.predict(X)
    blended = blended + 0.2 * stack_gen_model.predict(np.array(X))
    return blended

In [55]:
# Score the blend on the training data, on both scales.
blend_train_pred = blend_models_predict(X_train)   # predict once, reuse for both metrics
# y_train is log1p-transformed, so this is the RMSLE
print('RMSLE score on train data:')
print(rmsle(y_train, blend_train_pred))
# After expm1 both sides are on the price scale, so the same function yields a plain RMSE
print('RMSE score on train data:')
print(rmsle(np.expm1(y_train), np.expm1(blend_train_pred)))

RMSLE score on train data:
0.10335614142223833
RMSLE score on train data:
18666.913631833075


In [56]:
print('Predict submission')
# os.path.join replaces the original '\submission.csv' literal, whose '\s' is an invalid escape sequence
submission_origin = pd.read_csv(os.path.join(file_path, 'submission.csv'))
submission = submission_origin.copy()
# Replace the second column wholesale with the blended predictions, converted back from the
# log scale. Assigning by column name avoids writing floats in place into the template's
# existing column — NOTE(review): matters if that column is integer-typed; confirm.
prediction_column = submission.columns[1]
submission[prediction_column] = np.expm1(blend_models_predict(X_test))

submission.to_csv('resonable.csv', index=False)

Predict submission
