<a href="https://colab.research.google.com/github/dddonghwa/dacon-struggle/blob/main/housing_0127_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only); later cells read the
# dataset from a path under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import os
import os.path as osp
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Dataset folder on the mounted Google Drive (relative to the working directory).
data_dir = './drive/MyDrive/Colab Notebooks/DACrew/housing/dataset'

# Read both splits and discard the 'id' column right away.
train = pd.read_csv(osp.join(data_dir, 'train.csv')).drop(columns='id')
test = pd.read_csv(osp.join(data_dir, 'test.csv')).drop(columns='id')

print(train.shape, test.shape)

train.head()

(1350, 14) (1350, 13)


Unnamed: 0,Overall Qual,Gr Liv Area,Exter Qual,Garage Cars,Garage Area,Kitchen Qual,Total Bsmt SF,1st Flr SF,Bsmt Qual,Full Bath,Year Built,Year Remod/Add,Garage Yr Blt,target
0,10,2392,Ex,3,968,Ex,2392,2392,Ex,2,2003,2003,2003,386250
1,7,1352,Gd,2,466,Gd,1352,1352,Ex,2,2006,2007,2006,194000
2,5,900,TA,1,288,TA,864,900,TA,1,1967,1967,1967,123000
3,5,1174,TA,2,576,Gd,680,680,TA,1,1900,2006,2000,135000
4,7,1958,Gd,3,936,Gd,1026,1026,Gd,2,2005,2005,2005,250000


In [None]:
# Remove duplicate rows and report the shape before and after.
# The index is not reset here, so the original row labels are kept.
shape_before = train.shape
train = train.drop_duplicates()
print("제거 전 :", shape_before)
print("제거 후 :", train.shape)

제거 전 : (1350, 14)
제거 후 : (1349, 14)


In [None]:
# Fix the 'Garage Yr Blt' outlier: the year 2207 should read 2007.
# The row is selected by value instead of the hard-coded index label 254, so the
# correction still hits the right record if row order or the index changes.
garage_year_typo = train['Garage Yr Blt'] == 2207
train.loc[garage_year_typo, 'Garage Yr Blt'] = 2007

In [None]:
# Quality-related columns (stored as strings) -> to be mapped to numbers.
# `np.object` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# `object` is the same dtype and works on every NumPy version.
qual_cols = train.dtypes[train.dtypes == object].index
def label_encoder(df_, qual_cols):
  """Return a copy of ``df_`` with quality grades replaced by ordinal scores.

  Grades map as Ex=5, Gd=4, TA=3, Fa=2, Po=1. Any value that is not one of
  these keys becomes NaN (``Series.map`` with a dict).

  Args:
    df_: input DataFrame; it is copied, not modified.
    qual_cols: iterable of column names to encode.

  Returns:
    A new DataFrame in which the listed columns hold the mapped scores.
  """
  grade_to_score = dict(zip(('Ex', 'Gd', 'TA', 'Fa', 'Po'), (5, 4, 3, 2, 1)))
  encoded = df_.copy()
  for name in qual_cols:
    encoded[name] = encoded[name].map(grade_to_score)
  return encoded

# Encode the quality columns of both splits with the same mapping.
# NOTE: not idempotent - re-running this once the columns are numeric maps
# every value to NaN, because numbers are not keys of the grade table.
train, test = (label_encoder(frame, qual_cols) for frame in (train, test))
train.head()

Unnamed: 0,Overall Qual,Gr Liv Area,Exter Qual,Garage Cars,Garage Area,Kitchen Qual,Total Bsmt SF,1st Flr SF,Bsmt Qual,Full Bath,Year Built,Year Remod/Add,Garage Yr Blt,target
0,10,2392,5,3,968,5,2392,2392,5,2,2003,2003,2003,386250
1,7,1352,4,2,466,4,1352,1352,5,2,2006,2007,2006,194000
2,5,900,3,1,288,3,864,900,3,1,1967,1967,1967,123000
3,5,1174,3,2,576,4,680,680,3,1,1900,2006,2000,135000
4,7,1958,4,3,936,4,1026,1026,4,2,2005,2005,2005,250000


In [None]:
# Reorder the columns so that related features sit next to each other.
cols = [
    # years
    'Year Built', 'Year Remod/Add', 'Garage Yr Blt',
    # quality
    'Overall Qual', 'Exter Qual', 'Kitchen Qual', 'Bsmt Qual',
    # area
    'Gr Liv Area', 'Total Bsmt SF', '1st Flr SF', 'Garage Area',
    # counts
    'Garage Cars', 'Full Bath',
]
train = train.loc[:, cols + ['target']]
test = test.loc[:, cols]

train.head()

Unnamed: 0,Year Built,Year Remod/Add,Garage Yr Blt,Overall Qual,Exter Qual,Kitchen Qual,Bsmt Qual,Gr Liv Area,Total Bsmt SF,1st Flr SF,Garage Area,Garage Cars,Full Bath,target
0,2003,2003,2003,10,5,5,5,2392,2392,2392,968,3,2,386250
1,2006,2007,2006,7,4,4,5,1352,1352,1352,466,2,2,194000
2,1967,1967,1967,5,3,3,3,900,864,900,288,1,1,123000
3,1900,2006,2000,5,3,4,3,1174,680,680,576,2,1,135000
4,2005,2005,2005,7,4,4,4,1958,1026,1026,936,3,2,250000


id : 데이터 고유 id  
#### 품질 피처
OverallQual : 전반적 재료와 마감 품질  
ExterQual : 외관 재료 품질  
KitchenQual : 부엌 품질  
BsmtQual : 지하실 높이(품질) 

#### 년도 피처
YearBuilt : 완공 연도  
YearRemodAdd : 리모델링 연도  
GarageYrBlt : 차고 완공 연도  

#### 면적 피처
TotalBsmtSF : 지하실 면적   

1stFlrSF : 1층 면적   
GrLivArea : 지상층 생활 면적  
FullBath : 지상층 화장실 개수  

GarageCars: 차고 자리 개수  
GarageArea: 차고 면적   

target : 집값(달러 단위)  

## 피처 엔지니어링 변수 아이디어

면적 관련 피처 중요

- 리모델링 연도 차 `Year Gap Remod`  = 리모델링 연도 - 완공 연도
- 차고 자리당 면적 `Car Area`= 차고 면적/차고 자리 개수
- 2층 면적 `2nd flr SF`= 지상층 생활 면적 - 1층 면적
- 2층 여부 `2nd flr`= 1(지상층 생활 면적 - 1층 면적 > 0), 0(지상층 생활 면적 - 1층 면적 <= 0)
- 전체 면적 `Total SF` = 지상층 생활 면적 + 지하실 면적 + 차고 면적
- 품질 합 `Sum Qual` = (전반적 + 부엌 + 재료 + 지하실) 품질 (단, 현재 `feature_eng` 구현은 지하실 품질을 제외한 세 항목만 합산)
- 차고 밖/안 `Garage In/Out` = 1(지상층 생활 면적 != 1층 면적), 0(지상층 생활 면적 == 1층 면적) 

In [None]:
# Column groups by feature type. This rebinds `qual_cols`, which an earlier
# cell derived from the object-typed columns.
area_cols = [
    "Gr Liv Area",
    "Garage Area",
    "Total Bsmt SF",
    "1st Flr SF",
]
year_cols = [
    "Year Built",
    "Year Remod/Add",
    "Garage Yr Blt",
]
qual_cols = [
    "Exter Qual",
    "Kitchen Qual",
    "Overall Qual",
]
# NOTE(review): 'Bsmt Qual' is grouped with the count columns here rather than
# with the quality columns - confirm this is intentional.
cnt_cols = [
    'Full Bath',
    'Garage Cars',
    'Bsmt Qual',
]

In [None]:
def feature_eng(data_):
  """Return a copy of ``data_`` with the engineered features appended.

  Added columns:
    Year Gap Remod : 'Year Remod/Add' - 'Year Built'
    Car Area       : 'Garage Area' / 'Garage Cars' (0 where 'Garage Cars' is 0)
    2nd flr SF     : 'Gr Liv Area' - '1st Flr SF'
    2nd flr        : 1 if '2nd flr SF' > 0 else 0
    Total SF       : 'Gr Liv Area' + 'Garage Area' + 'Total Bsmt SF'
    Sum Qual       : 'Exter Qual' + 'Kitchen Qual' + 'Overall Qual'
    Garage InOut   : 1 if 'Gr Liv Area' != '1st Flr SF' else 0

  Args:
    data_: DataFrame holding the raw (already label-encoded) columns above.

  Returns:
    A new DataFrame; ``data_`` itself is not modified.
  """
  data = data_.copy()
  data['Year Gap Remod'] = data['Year Remod/Add'] - data['Year Built']

  # With 0 garage spaces the plain ratio is inf (or NaN for 0/0), which
  # scikit-learn estimators reject; define the per-space area as 0 there.
  cars = data['Garage Cars']
  data['Car Area'] = (data['Garage Area'] / cars).mask(cars == 0, 0.0)

  data['2nd flr SF'] = data['Gr Liv Area'] - data['1st Flr SF']
  # Vectorised flags replace the per-row `apply` calls (same values, also
  # well-defined on an empty frame).
  data['2nd flr'] = (data['2nd flr SF'] > 0).astype(int)
  data['Total SF'] = data[['Gr Liv Area', "Garage Area", "Total Bsmt SF"]].sum(axis=1)
  data['Sum Qual'] = data[["Exter Qual", "Kitchen Qual", "Overall Qual"]].sum(axis=1)
  data['Garage InOut'] = (data['Gr Liv Area'] != data['1st Flr SF']).astype(int)
  return data

# Apply the same feature engineering to both splits.
train, test = feature_eng(train), feature_eng(test)

In [None]:
# ! pip install catboost
# ! pip install ngboost

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from catboost import CatBoostRegressor, Pool
from ngboost import NGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

In [None]:
# 평가 기준 정의

def NMAE(true, pred):
    """Normalized mean absolute error: mean(|true - pred|) / mean(|true|).

    Both inputs are converted to plain float arrays first, so values are
    compared by position. Without this, two pandas Series with different
    indexes would be aligned by label on subtraction and yield NaN.

    Args:
        true: ground-truth values (array-like).
        pred: predicted values (array-like, same length as ``true``).

    Returns:
        The NMAE as a float; lower is better.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score

In [None]:
# Shared shuffled 10-fold splitter, seeded for reproducibility, plus a
# scikit-learn scorer wrapping NMAE (a loss, hence greater_is_better=False).
kf = KFold(n_splits=10, shuffle=True, random_state=42)
nmae_score = make_scorer(NMAE, greater_is_better=False)

In [None]:
# Training features and log1p-transformed label.
y = np.log1p(train['target'])
X = train.drop(columns='target')

# NOTE: despite its name, `target` holds the TEST feature matrix (same column
# order as X), i.e. the rows to predict for the submission.
target = test[X.columns]


In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# K-fold CV for ordinary least squares on the log1p target.
# Validation NMAE is computed on the original price scale; test predictions are
# accumulated in log space as the mean over folds.
n_splits = kf.get_n_splits()  # keep the fold average in sync with `kf`
lr_pred = np.zeros(target.shape[0])
lr_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])

    # `LinearRegression(normalize=True)` was deprecated in scikit-learn 1.0 and
    # removed in 1.2; this pipeline is the replacement scikit-learn suggests.
    lr = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
    lr.fit(tr_x, tr_y)

    val_pred = np.expm1(lr.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    lr_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')

    # (The unused CatBoost `Pool` built here in every fold was dropped.)
    lr_pred += lr.predict(target) / n_splits
print(f'{n_splits}FOLD Mean of NMAE = {np.mean(lr_val)} & std = {np.std(lr_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.0866672720329789

2 FOLD Training.....
2 FOLD NMAE = 0.10347670424802673

3 FOLD Training.....
3 FOLD NMAE = 0.09229735498844713

4 FOLD Training.....
4 FOLD NMAE = 0.11799934402235537

5 FOLD Training.....
5 FOLD NMAE = 0.08209432282120803

6 FOLD Training.....
6 FOLD NMAE = 0.10781844398088267

7 FOLD Training.....
7 FOLD NMAE = 0.09766946218424478

8 FOLD Training.....
8 FOLD NMAE = 0.08972063592538121

9 FOLD Training.....
9 FOLD NMAE = 0.10024113938374264

10 FOLD Training.....
10 FOLD NMAE = 0.0985674342870046

10FOLD Mean of NMAE = 0.09765521138742721 & std = 0.01006540790544728


In [None]:
# K-fold CV for Ridge regression (default alpha) on the log1p target.
# Validation NMAE is computed on the original price scale; test predictions are
# accumulated in log space as the mean over folds.
n_splits = kf.get_n_splits()  # keep the fold average in sync with `kf`
rg_pred = np.zeros(target.shape[0])
rg_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])

    rg = Ridge()
    rg.fit(tr_x, tr_y)

    val_pred = np.expm1(rg.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    rg_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')

    # (The unused CatBoost `Pool` built here in every fold was dropped.)
    rg_pred += rg.predict(target) / n_splits
print(f'{n_splits}FOLD Mean of NMAE = {np.mean(rg_val)} & std = {np.std(rg_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.08581885877289748

2 FOLD Training.....
2 FOLD NMAE = 0.10351488017384594

3 FOLD Training.....
3 FOLD NMAE = 0.09230885944304251

4 FOLD Training.....
4 FOLD NMAE = 0.11810498652284357

5 FOLD Training.....
5 FOLD NMAE = 0.08203104688746903

6 FOLD Training.....
6 FOLD NMAE = 0.10861654257511381

7 FOLD Training.....
7 FOLD NMAE = 0.09759836761926945

8 FOLD Training.....
8 FOLD NMAE = 0.08978354982132336

9 FOLD Training.....
9 FOLD NMAE = 0.10026834042526891

10 FOLD Training.....
10 FOLD NMAE = 0.09852718029848899

10FOLD Mean of NMAE = 0.0976572612539563 & std = 0.010271475354535178


In [None]:
# K-fold CV for Lasso regression (default alpha) on the log1p target.
# Validation NMAE is computed on the original price scale; test predictions are
# accumulated in log space as the mean over folds.
n_splits = kf.get_n_splits()  # keep the fold average in sync with `kf`
ls_pred = np.zeros(target.shape[0])
ls_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])

    ls = Lasso()
    ls.fit(tr_x, tr_y)

    val_pred = np.expm1(ls.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    ls_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')

    # (The unused CatBoost `Pool` built here in every fold was dropped.)
    ls_pred += ls.predict(target) / n_splits
print(f'{n_splits}FOLD Mean of NMAE = {np.mean(ls_val)} & std = {np.std(ls_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.10720786095196369

2 FOLD Training.....
2 FOLD NMAE = 0.12959806353079442

3 FOLD Training.....
3 FOLD NMAE = 0.11666290875261083

4 FOLD Training.....
4 FOLD NMAE = 0.14581946036923357

5 FOLD Training.....
5 FOLD NMAE = 0.11666524082039152

6 FOLD Training.....
6 FOLD NMAE = 0.11807772623432769

7 FOLD Training.....
7 FOLD NMAE = 0.11705468605632682

8 FOLD Training.....
8 FOLD NMAE = 0.09876735680786661

9 FOLD Training.....
9 FOLD NMAE = 0.11797371906825896

10 FOLD Training.....
10 FOLD NMAE = 0.1377503786996134

10FOLD Mean of NMAE = 0.12055774012913875 & std = 0.013129848212992463


In [None]:
# K-fold CV for ElasticNet (default hyper-parameters) on the log1p target.
# Validation NMAE is computed on the original price scale; test predictions are
# accumulated in log space as the mean over folds.
n_splits = kf.get_n_splits()  # keep the fold average in sync with `kf`
el_pred = np.zeros(target.shape[0])
el_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])

    el = ElasticNet()
    el.fit(tr_x, tr_y)

    val_pred = np.expm1(el.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    el_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')

    # (The unused CatBoost `Pool` built here in every fold was dropped.)
    el_pred += el.predict(target) / n_splits
print(f'{n_splits}FOLD Mean of NMAE = {np.mean(el_val)} & std = {np.std(el_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.10309285537353392

2 FOLD Training.....
2 FOLD NMAE = 0.12555720075106525

3 FOLD Training.....
3 FOLD NMAE = 0.11322048468074489

4 FOLD Training.....
4 FOLD NMAE = 0.14388975255771236

5 FOLD Training.....
5 FOLD NMAE = 0.10826116862317958

6 FOLD Training.....
6 FOLD NMAE = 0.11504663125560351

7 FOLD Training.....
7 FOLD NMAE = 0.10863178101700444

8 FOLD Training.....
8 FOLD NMAE = 0.0960756914411123

9 FOLD Training.....
9 FOLD NMAE = 0.11661173009519116

10 FOLD Training.....
10 FOLD NMAE = 0.13061056853065028

10FOLD Mean of NMAE = 0.11609978643257976 & std = 0.013301537394206678


In [None]:
# K-fold CV for GradientBoostingRegressor on the log1p target.
# Validation NMAE is computed on the original price scale; test predictions are
# accumulated in log space as the mean over folds.
n_splits = kf.get_n_splits()  # keep the fold average in sync with `kf`
gbr_pred = np.zeros(target.shape[0])
gbr_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])

    gbr = GradientBoostingRegressor(random_state = 42, max_depth = 4, learning_rate = 0.05, n_estimators = 1000)
    gbr.fit(tr_x, tr_y)

    val_pred = np.expm1(gbr.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    gbr_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')

    gbr_pred += gbr.predict(target) / n_splits
print(f'{n_splits}FOLD Mean of NMAE = {np.mean(gbr_val)} & std = {np.std(gbr_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.08521383081817246

2 FOLD Training.....
2 FOLD NMAE = 0.09800613110447504

3 FOLD Training.....
3 FOLD NMAE = 0.09714814000098175

4 FOLD Training.....
4 FOLD NMAE = 0.12206356107634417

5 FOLD Training.....
5 FOLD NMAE = 0.09987111486203049

6 FOLD Training.....
6 FOLD NMAE = 0.10006096469643243

7 FOLD Training.....
7 FOLD NMAE = 0.0954594273831974

8 FOLD Training.....
8 FOLD NMAE = 0.09480453873785635

9 FOLD Training.....
9 FOLD NMAE = 0.0927454588538402

10 FOLD Training.....
10 FOLD NMAE = 0.113177808744915

10FOLD Mean of NMAE = 0.09985509762782455 & std = 0.009956119395336189


In [None]:
# K-fold CV for RandomForestRegressor on the log1p target.
# Validation NMAE is computed on the original price scale; test predictions are
# accumulated in log space as the mean over folds.
n_splits = kf.get_n_splits()  # keep the fold average in sync with `kf`
rf_pred = np.zeros(target.shape[0])
rf_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])

    # criterion='mae' was renamed to 'absolute_error' in scikit-learn 1.0 and
    # the old spelling was removed in 1.2.
    rf = RandomForestRegressor(random_state = 42, criterion = 'absolute_error')
    rf.fit(tr_x, tr_y)

    val_pred = np.expm1(rf.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    rf_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')

    rf_pred += rf.predict(target) / n_splits
print(f'{n_splits}FOLD Mean of NMAE = {np.mean(rf_val)} & std = {np.std(rf_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.08902677559447959

2 FOLD Training.....
2 FOLD NMAE = 0.09542060467137374

3 FOLD Training.....
3 FOLD NMAE = 0.09776047464684795

4 FOLD Training.....
4 FOLD NMAE = 0.11800981805882746

5 FOLD Training.....
5 FOLD NMAE = 0.09040174119290646

6 FOLD Training.....
6 FOLD NMAE = 0.10072422281336606

7 FOLD Training.....
7 FOLD NMAE = 0.08821811278271963

8 FOLD Training.....
8 FOLD NMAE = 0.08470976054885425

9 FOLD Training.....
9 FOLD NMAE = 0.09631626999642359

10 FOLD Training.....
10 FOLD NMAE = 0.10779432519781668

10FOLD Mean of NMAE = 0.09683821055036154 & std = 0.00954127424521896


In [None]:
# K-fold CV for NGBRegressor on the log1p target, with early stopping on the
# validation fold. Validation NMAE is computed on the original price scale; test
# predictions are accumulated in log space as the mean over folds.
n_splits = kf.get_n_splits()  # keep the fold average in sync with `kf`
ngb_pred = np.zeros(target.shape[0])
ngb_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y_log = X.iloc[val_idx], y.iloc[val_idx]
    val_y = np.expm1(val_y_log)  # price scale, used only for the NMAE report

    ngb = NGBRegressor(random_state = 42, n_estimators = 1000, verbose = 0, learning_rate = 0.03)
    # Early stopping must see labels on the same (log) scale as the training
    # labels; the price-scale `val_y` was passed here before, so the validation
    # loss was measured against labels on a different scale than the model's output.
    ngb.fit(tr_x, tr_y, val_x, val_y_log, early_stopping_rounds = 300)

    val_pred = np.expm1(ngb.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    ngb_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')

    # (The unused CatBoost `Pool` built here in every fold was dropped.)
    ngb_pred += ngb.predict(target) / n_splits
print(f'{n_splits}FOLD Mean of NMAE = {np.mean(ngb_val)} & std = {np.std(ngb_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.08054212819588058

2 FOLD Training.....
2 FOLD NMAE = 0.09826314136636571

3 FOLD Training.....
3 FOLD NMAE = 0.090164977172421

4 FOLD Training.....
4 FOLD NMAE = 0.11157223707512959

5 FOLD Training.....
5 FOLD NMAE = 0.09158111031358876

6 FOLD Training.....
6 FOLD NMAE = 0.10199860160269314

7 FOLD Training.....
7 FOLD NMAE = 0.09624270846691245

8 FOLD Training.....
8 FOLD NMAE = 0.08693934033613829

9 FOLD Training.....
9 FOLD NMAE = 0.09367456404230176

10 FOLD Training.....
10 FOLD NMAE = 0.1113452072496042

10FOLD Mean of NMAE = 0.09623240158210355 & std = 0.00947954044057474


In [None]:
# K-fold CV for CatBoostRegressor (MAE loss) on the log1p target, with early
# stopping on the validation fold. Validation NMAE is computed on the original
# price scale; test predictions are accumulated in log space as the mean over folds.
n_splits = kf.get_n_splits()  # keep the fold average in sync with `kf`
cb_pred = np.zeros(target.shape[0])
cb_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y_log = X.iloc[val_idx], y.iloc[val_idx]
    val_y = np.expm1(val_y_log)  # price scale, used only for the NMAE report

    tr_data = Pool(data = tr_x, label = tr_y)
    # The eval set must carry log-scale labels like the training set. It used to
    # get price-scale labels, so the eval MAE (~1.9e5 vs ~0.29 on train) was
    # meaningless and early stopping picked arbitrary iterations.
    val_data = Pool(data = val_x, label = val_y_log)

    cb = CatBoostRegressor(depth = 4, random_state = 42, loss_function = 'MAE', n_estimators = 3000, learning_rate = 0.03, verbose = 0)
    cb.fit(tr_data, eval_set = val_data, early_stopping_rounds = 750, verbose = 1000)

    val_pred = np.expm1(cb.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    cb_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')

    # (The unused `target_data` Pool was dropped; predict takes the frame directly.)
    cb_pred += cb.predict(target) / n_splits
print(f'{n_splits}FOLD Mean of NMAE = {np.mean(cb_val)} & std = {np.std(cb_val)}')

1 FOLD Training.....
0:	learn: 0.2922952	test: 187886.6140975	best: 187886.6140975 (0)	total: 48.6ms	remaining: 2m 25s
Stopped by overfitting detector  (750 iterations wait)

bestTest = 187886.5514
bestIteration = 176

Shrink model to first 177 iterations.
1 FOLD NMAE = 0.08895705291300346

2 FOLD Training.....
0:	learn: 0.2944914	test: 183672.7367272	best: 183672.7367272 (0)	total: 1.65ms	remaining: 4.96s
1000:	learn: 0.0676004	test: 183672.7241542	best: 183672.7231861 (273)	total: 1.25s	remaining: 2.5s
Stopped by overfitting detector  (750 iterations wait)

bestTest = 183672.7232
bestIteration = 273

Shrink model to first 274 iterations.
2 FOLD NMAE = 0.0968972121316015

3 FOLD Training.....
0:	learn: 0.2873596	test: 190826.8657579	best: 190826.8657579 (0)	total: 1.41ms	remaining: 4.22s
1000:	learn: 0.0657954	test: 190826.7982569	best: 190826.7982569 (1000)	total: 1.28s	remaining: 2.55s
Stopped by overfitting detector  (750 iterations wait)

bestTest = 190826.798
bestIteration = 1162

In [None]:
# Mean CV NMAE per model, printed in this order:
# linear, ridge, lasso, elastic net, GBR, random forest, NGBoost, CatBoost.
val_list = [
    lr_val, rg_val, ls_val, el_val,
    gbr_val, rf_val, ngb_val, cb_val,
]
for fold_scores in val_list:
  print(f"{np.mean(fold_scores):.8f}")

0.09765521
0.09765726
0.12055774
0.11609979
0.09985510
0.09683821
0.09623240
0.10569176


In [None]:
# Blend four models (NGBoost, random forest, ridge, GBR) by averaging their
# log-space test predictions, then invert the log1p transform.
# An earlier variant blended ngb + cb + rf + gbr instead.
sub = pd.read_csv(osp.join(data_dir, 'sample_submission.csv'))
blend_log = (ngb_pred + rf_pred + rg_pred + gbr_pred) / 4
sub['target'] = np.expm1(blend_log)
sub['target']

0       335337.654642
1       127904.152693
2       175336.562401
3       259910.851993
4       131738.743520
            ...      
1345    336026.203200
1346    122898.377798
1347     88709.831039
1348    186947.796714
1349    132869.241192
Name: target, Length: 1350, dtype: float64

In [None]:
# Write the submission file. The output folder is created first so a missing
# directory cannot make `to_csv` fail after all models have been trained.
sub_dir = './drive/MyDrive/Colab Notebooks/DACrew/housing/sub'
os.makedirs(sub_dir, exist_ok=True)
sub.to_csv(osp.join(sub_dir, 'baseline_0127.csv'), index=False)  # baseline_0127(1).csv