## 1. 데이터 살펴보기
pandas의 read_csv 함수를 사용해 데이터를 읽어오고, 각 변수들이 나타내는 의미를 살펴보겠습니다.
1. ID : 집을 구분하는 번호
2. date : 집을 구매한 날짜
3. price : 타겟 변수인 집의 가격
4. bedrooms : 침실의 수
5. bathrooms : 침실당 화장실 개수
6. sqft_living : 주거 공간의 평방 피트
7. sqft_lot : 부지의 평방 피트
8. floors : 집의 층 수
9. waterfront : 집의 전방에 강이 흐르는지 유무 (a.k.a. 리버뷰)
10. view : 집이 얼마나 좋아 보이는지의 정도
11. condition : 집의 전반적인 상태
12. grade : King County grading 시스템 기준으로 매긴 집의 등급
13. sqft_above : 지하실을 제외한 평방 피트
14. sqft_basement : 지하실의 평방 피트
15. yr_built : 집을 지은 연도
16. yr_renovated : 집을 재건축한 연도
17. zipcode : 우편번호
18. lat : 위도
19. long : 경도
20. sqft_living15 : 2015년 기준 주거 공간의 평방 피트(집을 재건축했다면, 변화가 있을 수 있음)
21. sqft_lot15 : 2015년 기준 부지의 평방 피트(집을 재건축했다면, 변화가 있을 수 있음)

In [50]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

# Locate the competition CSV files in the Kaggle input directory.
train_data_path = join('/kaggle/input/aiffel/data/', 'train.csv')
sub_data_path = join('/kaggle/input/aiffel/data/', 'test.csv')

data = pd.read_csv(train_data_path)
sub = pd.read_csv(sub_data_path)
print('train data dim : {}'.format(data.shape))
print('sub data dim : {}'.format(sub.shape))


# Separate the target from the training frame; pop() returns the column
# and removes it from `data` in one step.
y = data.pop('price')

# Stack the train and test rows so the preprocessing below is applied to
# both; train_len marks the position where the test rows start.
train_len = len(data)
data = pd.concat([data, sub])

# Report the number of missing values in each column.
for column in data.columns:
    print('{} : {}'.format(column, data[column].isnull().sum()))

# Keep the ids of the test rows, then drop the identifier column.
sub_id = data.pop('id').iloc[train_len:]
# Keep only the first six characters of the date string and store it as int.
data['date'] = data['date'].str[:6].astype(int)

train data dim : (15035, 21)
sub data dim : (6468, 20)
id : 0
date : 0
bedrooms : 0
bathrooms : 0
sqft_living : 0
sqft_lot : 0
floors : 0
waterfront : 0
view : 0
condition : 0
grade : 0
sqft_above : 0
sqft_basement : 0
yr_built : 0
yr_renovated : 0
zipcode : 0
lat : 0
long : 0
sqft_living15 : 0
sqft_lot15 : 0


In [51]:
# Show the first rows and the summary statistics of the combined frame.
print(data.head())
print(data.describe())
# Count, for every column, how many entries are exactly 0.
for col, zero_count in data.eq(0).sum().items():
    print(f"{col} : {zero_count}")

     date  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  \
0  201410         3       1.00         1180      5650     1.0           0   
1  201502         2       1.00          770     10000     1.0           0   
2  201502         3       2.00         1680      8080     1.0           0   
3  201406         3       2.25         1715      6819     2.0           0   
4  201501         3       1.50         1060      9711     1.0           0   

   view  condition  grade  sqft_above  sqft_basement  yr_built  yr_renovated  \
0     0          3      7        1180              0      1955             0   
1     0          3      6         770              0      1933             0   
2     0          3      8        1680              0      1987             0   
3     0          3      7        1715              0      1995             0   
4     0          3      7        1060              0      1963             0   

   zipcode      lat     long  sqft_living15  sqft_lot15 

In [52]:
# Feature removal (currently disabled)
# data = data.drop(columns=['waterfront'])

# Where the renovation year is 0, fall back to the year the house was built.
# This is done with a single vectorized np.where instead of a Python-level
# row-by-row DataFrame.apply; the selected values are the same.
# NOTE(review): the row-wise apply presumably upcast the result to float
# (the frame mixes int and float columns), whereas np.where keeps the dtype
# of the two year columns - confirm nothing downstream relies on float.
data['yr_renovated'] = np.where(
    data['yr_renovated'] == 0, data['yr_built'], data['yr_renovated']
)

In [53]:
skew_columns = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']

# Apply log(1 + x) to all listed columns in a single array operation.
data[skew_columns] = np.log1p(data[skew_columns].to_numpy())
print(data)

        date  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  \
0     201410         3       1.00     7.074117  8.639588     1.0           0   
1     201502         2       1.00     6.647688  9.210440     1.0           0   
2     201502         3       2.00     7.427144  8.997271     1.0           0   
3     201406         3       2.25     7.447751  8.827615     2.0           0   
4     201501         3       1.50     6.966967  9.181118     1.0           0   
...      ...       ...        ...          ...       ...     ...         ...   
6463  201406         3       1.75     7.313887  9.390075     1.0           0   
6464  201501         3       2.00     7.307202  7.027315     3.0           0   
6465  201502         3       2.50     7.178545  7.166266     2.0           0   
6466  201406         2       0.75     6.928538  7.208600     2.0           0   
6467  201501         3       2.50     7.378384  7.778630     2.0           0   

      view  condition  grade  sqft_abov

In [54]:
# The first train_len rows are the training features; the remainder is the
# test set that was appended when the frames were concatenated.
x, sub = data.iloc[:train_len], data.iloc[train_len:]
# Fit on log(1 + price); predictions are mapped back with expm1 later on.
y = np.log1p(y)

In [55]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# random_state is the seed handed to each model constructor below.
# Passing None would let each model pick its own value; a fixed seed is used
# here instead so every model receives the same one.
random_state = 2020

# Build one estimator per algorithm, all sharing the same seed.
gboost, xgboost, lightgbm, rdforest = (
    estimator_cls(random_state=random_state)
    for estimator_cls in (
        GradientBoostingRegressor,
        XGBRegressor,
        LGBMRegressor,
        RandomForestRegressor,
    )
)

models = [gboost, xgboost, lightgbm, rdforest]

def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5, cv=5):
    """Run a grid search and return every candidate ranked by RMSLE.

    Args:
        model: Estimator passed to GridSearchCV.
        train: Training features passed to ``fit``.
        y: Training target passed to ``fit``. The ``RMSLE`` column is a true
            root-mean-squared-log-error only if ``y`` is already
            log-transformed (this file passes ``np.log1p(price)``).
        param_grid: Parameter grid passed to GridSearchCV.
        verbose: Verbosity level passed to GridSearchCV.
        n_jobs: Number of parallel jobs passed to GridSearchCV.
        cv: Cross-validation setting passed to GridSearchCV. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        A DataFrame with one row per parameter combination: the parameter
        columns, ``score`` (mean test score, i.e. negated MSE) and ``RMSLE``
        (``sqrt(-score)``), sorted by ascending ``RMSLE``.
    """
    # Configure the search; candidates are scored with negated MSE.
    grid_model = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=cv,
        verbose=verbose,
        n_jobs=n_jobs,
    )

    # Fit every parameter combination.
    grid_model.fit(train, y)

    # Collect the tried parameters together with their mean test score.
    cv_results = grid_model.cv_results_
    results = pd.DataFrame(cv_results['params'])
    results['score'] = cv_results['mean_test_score']

    # The score is negated MSE, so flip the sign before taking the root.
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    return results.sort_values('RMSLE')

In [56]:
# Candidate hyperparameter values explored by the grid search.
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[5, 10],
    learning_rate=[0.01, 0.05, 0.1],
)

model = LGBMRegressor(random_state=random_state)
results = my_GridSearch(model, x, y, param_grid=param_grid, verbose=2, n_jobs=5)
# Notebook display of the ranked candidates.
results

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000838 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2401
[LightGBM] [Info] Number of data points in the train set: 15035, number of used features: 19
[LightGBM] [Info] Start training from score 13.048122


Unnamed: 0,learning_rate,max_depth,n_estimators,score,RMSLE
11,0.1,10,300,-0.026212,0.161901
7,0.05,10,300,-0.026432,0.162578
9,0.1,5,300,-0.026623,0.163165
10,0.1,10,100,-0.027049,0.164466
5,0.05,5,300,-0.027323,0.165296
8,0.1,5,100,-0.02863,0.169205
6,0.05,10,100,-0.029186,0.17084
4,0.05,5,100,-0.031367,0.177107
3,0.01,10,300,-0.034103,0.184671
1,0.01,5,300,-0.036606,0.191327


In [57]:
# Take the top-ranked row and keep only the hyperparameter columns.
top_row = results.iloc[0].drop(['score', 'RMSLE'])
# Cast the integer-valued parameters back to int before passing them on.
int_params = ['n_estimators', 'max_depth']
best_params = {
    name: int(value) if name in int_params else value
    for name, value in top_row.to_dict().items()
}
# Build the model with the best parameters and fit it on the training data.
best_model = LGBMRegressor(random_state=random_state, **best_params)
best_model.fit(x, y)
# Predict on the test rows and undo the log1p applied to the target.
prediction = np.expm1(best_model.predict(sub))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001097 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2401
[LightGBM] [Info] Number of data points in the train set: 15035, number of used features: 19
[LightGBM] [Info] Start training from score 13.048122


In [58]:
# Load the sample submission and fill in the predicted prices.
data_dir = '/kaggle/input/aiffel/data/'
submission_path = join(data_dir, 'sample_submission.csv')
submission = pd.read_csv(submission_path).assign(price=prediction)

# Name the output file after the best RMSLE, rounded to five decimals.
best_rmsle = round(results['RMSLE'].iloc[0], 5)
submission_csv_path = 'submission_RMSLE_{}.csv'.format(best_rmsle)
submission.to_csv(submission_csv_path, index=False)
print('{} saved!'.format(submission_csv_path))

submission_RMSLE_0.1619.csv saved!
