In [178]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

In [179]:
train_data_path = join('./data/', 'train.csv')
sub_data_path = join('./data/', 'test.csv')

## 1. 데이터 살펴보기
pandas의 read_csv 함수를 사용해 데이터를 읽어오고, 각 변수들이 나타내는 의미를 살펴보겠습니다.
1. ID : 집을 구분하는 번호
2. date : 집을 구매한 날짜
3. price : 타겟 변수인 집의 가격
4. bedrooms : 침실의 수
5. bathrooms : 침실당 화장실 개수
6. sqft_living : 주거 공간의 평방 피트
7. sqft_lot : 부지의 평방 피트
8. floors : 집의 층 수
9. waterfront : 집의 전방에 강이 흐르는지 유무 (a.k.a. 리버뷰)
10. view : 집이 얼마나 좋아 보이는지의 정도
11. condition : 집의 전반적인 상태
12. grade : King County grading 시스템 기준으로 매긴 집의 등급
13. sqft_above : 지하실을 제외한 평방 피트
14. sqft_basement : 지하실의 평방 피트
15. yr_built : 집을 지은 년도
16. yr_renovated : 집을 재건축한 년도
17. zipcode : 우편번호
18. lat : 위도
19. long : 경도
20. sqft_living15 : 2015년 기준 주거 공간의 평방 피트(집을 재건축했다면, 변화가 있을 수 있음)
21. sqft_lot15 : 2015년 기준 부지의 평방 피트(집을 재건축했다면, 변화가 있을 수 있음)

In [180]:
data = pd.read_csv(train_data_path)
sub = pd.read_csv(sub_data_path)
print('train data dim : {}'.format(data.shape))
print('sub data dim : {}'.format(sub.shape))

train data dim : (15035, 21)
sub data dim : (6468, 20)


In [181]:
y = data['price']

del data['price']

In [182]:
train_len = len(data)
data = pd.concat((data, sub), axis=0)

In [183]:
for c in data.columns:
    print('{} : {}'.format(c, len(data.loc[pd.isnull(data[c]), c].values)))

id : 0
date : 0
bedrooms : 0
bathrooms : 0
sqft_living : 0
sqft_lot : 0
floors : 0
waterfront : 0
view : 0
condition : 0
grade : 0
sqft_above : 0
sqft_basement : 0
yr_built : 0
yr_renovated : 0
zipcode : 0
lat : 0
long : 0
sqft_living15 : 0
sqft_lot15 : 0


In [184]:
sub_id = data['id'][train_len:]
del data['id']
data['date'] = data['date'].apply(lambda x : str(x[:6])).astype(int)

In [185]:
# columns_to_drop = data.loc[:, 'zipcode':'sqft_lot15'].columns
# data = data.drop(columns=columns_to_drop) 

In [186]:
data.head()

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201410,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,201502,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2,201502,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
3,201406,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
4,201501,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711


In [187]:
print(data.describe())

                date      bedrooms     bathrooms   sqft_living      sqft_lot  \
count   21503.000000  21503.000000  21503.000000  21503.000000  2.150300e+04   
mean   201438.639539      3.371762      2.116042   2081.441334  1.511715e+04   
std        44.281419      0.930026      0.770018    918.669624  4.147591e+04   
min    201405.000000      0.000000      0.000000    290.000000  5.200000e+02   
25%    201407.000000      3.000000      1.750000   1430.000000  5.040000e+03   
50%    201410.000000      3.000000      2.250000   1914.000000  7.616000e+03   
75%    201502.000000      4.000000      2.500000   2550.000000  1.068600e+04   
max    201505.000000     33.000000      8.000000  13540.000000  1.651359e+06   

             floors    waterfront          view     condition         grade  \
count  21503.000000  21503.000000  21503.000000  21503.000000  21503.000000   
mean       1.495140      0.007580      0.234618      3.409710      7.659164   
std        0.540183      0.086737      0.7

In [188]:
# bedrooms, bathrooms(비율이긴 함), floors(0.5 도 있음), waterfront, view, condition, grade 는 정수형임. 로그스케일 하지 말자
# sqft 붙은 모든 컬럼은 로그스케일. 
# yr_ 에서 이상치 파악해야댐

In [189]:
for col in data.columns:
    zero_count = (data[col] == 0).sum()  # 해당 열에서 0인 값의 개수 계산
    print(f"{col} : {zero_count}")


date : 0
bedrooms : 13
bathrooms : 10
sqft_living : 0
sqft_lot : 0
floors : 0
waterfront : 21340
view : 19385
condition : 0
grade : 0
sqft_above : 0
sqft_basement : 13056
yr_built : 0
yr_renovated : 20590
zipcode : 0
lat : 0
long : 0
sqft_living15 : 0
sqft_lot15 : 0


In [190]:
# yr_renovated 재건축을 안했으면 0으로 표기. 이거 해결해야댐

In [191]:
# 재건축 연도가 0인 경우, 첫 건축 연도로 채우기
data['yr_renovated'] = data.apply(
    lambda row: row['yr_built'] if row['yr_renovated'] == 0 else row['yr_renovated'], axis=1
)

In [192]:
skew_columns = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']

for c in skew_columns:
    data[c] = np.log1p(data[c].values)

In [193]:
data

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201410,3,1.00,7.074117,8.639588,1.0,0,0,3,7,7.074117,0.000000,1955,1955.0,98178,47.5112,-122.257,1340,5650
1,201502,2,1.00,6.647688,9.210440,1.0,0,0,3,6,6.647688,0.000000,1933,1933.0,98028,47.7379,-122.233,2720,8062
2,201502,3,2.00,7.427144,8.997271,1.0,0,0,3,8,7.427144,0.000000,1987,1987.0,98074,47.6168,-122.045,1800,7503
3,201406,3,2.25,7.447751,8.827615,2.0,0,0,3,7,7.447751,0.000000,1995,1995.0,98003,47.3097,-122.327,2238,6819
4,201501,3,1.50,6.966967,9.181118,1.0,0,0,3,7,6.966967,0.000000,1963,1963.0,98198,47.4095,-122.315,1650,9711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6463,201406,3,1.75,7.313887,9.390075,1.0,0,0,3,6,7.313887,0.000000,2014,2014.0,98010,47.3095,-122.002,1320,11303
6464,201501,3,2.00,7.307202,7.027315,3.0,0,0,3,8,7.307202,0.000000,2014,2014.0,98144,47.5699,-122.288,1400,1230
6465,201502,3,2.50,7.178545,7.166266,2.0,0,0,3,8,7.074117,4.875197,2008,2008.0,98116,47.5773,-122.409,1330,1265
6466,201406,2,0.75,6.928538,7.208600,2.0,0,0,3,7,6.928538,0.000000,2009,2009.0,98144,47.5944,-122.299,1020,2007


In [194]:
sub = data.iloc[train_len:, :]
x = data.iloc[:train_len, :]

In [195]:
sub

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201412,3,2.25,7.852050,8.887791,2.0,0,0,3,7,7.682943,5.993961,1951,1991.0,98125,47.7210,-122.319,1690,7639
1,201412,4,3.00,7.581210,8.517393,1.0,0,0,5,7,6.957497,6.814543,1965,1965.0,98136,47.5208,-122.393,1360,5000
2,201405,4,4.50,8.598036,11.532051,1.0,0,0,3,11,8.266421,7.333676,2001,2001.0,98053,47.6561,-122.005,4760,101930
3,201504,3,1.00,7.484930,8.918784,1.0,0,0,3,7,6.957497,6.594413,1960,1960.0,98146,47.5123,-122.337,1780,8113
4,201503,3,2.50,7.544861,8.788898,2.0,0,0,3,7,7.544861,0.000000,2003,2003.0,98038,47.3684,-122.031,2390,7570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6463,201406,3,1.75,7.313887,9.390075,1.0,0,0,3,6,7.313887,0.000000,2014,2014.0,98010,47.3095,-122.002,1320,11303
6464,201501,3,2.00,7.307202,7.027315,3.0,0,0,3,8,7.307202,0.000000,2014,2014.0,98144,47.5699,-122.288,1400,1230
6465,201502,3,2.50,7.178545,7.166266,2.0,0,0,3,8,7.074117,4.875197,2008,2008.0,98116,47.5773,-122.409,1330,1265
6466,201406,2,0.75,6.928538,7.208600,2.0,0,0,3,7,6.928538,0.000000,2009,2009.0,98144,47.5944,-122.299,1020,2007


In [196]:
y = np.log1p(y)

In [197]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# random_state는 모델초기화나 데이터셋 구성에 사용되는 랜덤 시드값입니다. 
#random_state=None    # 이게 초기값입니다. 아무것도 지정하지 않고 None을 넘겨주면 모델 내부에서 임의로 선택합니다.  
random_state=2020        # 하지만 우리는 이렇게 고정값을 세팅해 두겠습니다. 

gboost = GradientBoostingRegressor(random_state=random_state)
xgboost = XGBRegressor(random_state=random_state)
lightgbm = LGBMRegressor(random_state=random_state)
rdforest = RandomForestRegressor(random_state=random_state)

models = [gboost, xgboost, lightgbm, rdforest]



In [198]:
def get_scores(models, train, y):
    df = {}
    for model in models:
        model_name = model.__class__.__name__
        X_train, X_test, y_train, y_test= train_test_split(train, y, test_size= 0.2)
        model.fit(X_train, y_train)
        y_pred= model.predict(X_test)
        df[model_name] = rmse(y_test, y_pred)
        score_df = pd.DataFrame(df, index=['RMSE']).T.sort_values('RMSE', ascending=False)
    return score_df

In [199]:
def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):
    # GridSearchCV 모델로 초기화
    grid_model = GridSearchCV(model, param_grid=param_grid, scoring='neg_mean_squared_error', \
                              cv=5, verbose=verbose, n_jobs=n_jobs)
    
    # 모델 fitting
    grid_model.fit(train, y)

    # 결과값 저장
    params = grid_model.cv_results_['params']
    score = grid_model.cv_results_['mean_test_score']
    
    # 데이터 프레임 생성
    results = pd.DataFrame(params)
    results['score'] = score
    
    # RMSLE 값 계산 후 정렬
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    results = results.sort_values('RMSLE')

    return results

In [200]:
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [1, 10],
}

model = LGBMRegressor(random_state=random_state)
my_GridSearch(model, x, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
3,10,100,-0.027049,0.164466
2,10,50,-0.029348,0.171313
1,1,100,-0.055002,0.234525
0,1,50,-0.073398,0.27092


In [201]:

def save_submission(model, train, y, test, model_name, rmsle=None):
    model.fit(train, y)
    prediction = model.predict(test)
    prediction = np.expm1(prediction)
    data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'
    submission_path = join(data_dir, 'sample_submission.csv')
    submission = pd.read_csv(submission_path)
    submission['price'] = prediction
    submission_csv_path = '{}/submission_{}_RMSLE_{}.csv'.format(data_dir, model_name, rmsle)
    submission.to_csv(submission_csv_path, index=False)
    print('{} saved!'.format(submission_csv_path))

In [202]:
save_submission(model, x, y, sub, 'lgbm', rmsle='0.164466')

/aiffel/aiffel/kaggle_kakr_housing/data/submission_lgbm_RMSLE_0.164466.csv saved!
