In [139]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import os

In [140]:
# Load the train and test CSVs; both paths are relative to the notebook's
# working directory.
train = pd.read_csv('./2019-2nd-ml-month-with-kakr/train.csv')
test = pd.read_csv('./2019-2nd-ml-month-with-kakr/test.csv')

In [141]:
# Keep only the first six characters of the sale date and store them as an
# integer feature (e.g. 201410).
# Casting to str first and slicing through the vectorised `.str` accessor keeps
# this cell safe to re-run: slicing the raw value with a lambda raises
# TypeError once the column has already been converted to int.
train['date'] = train['date'].astype(str).str[:6].astype(int)
train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,201410,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,1,201502,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2,2,201502,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
3,3,201406,257500.0,3,2.25,1715,6819,2.0,0,0,...,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
4,4,201501,291850.0,3,1.5,1060,9711,1.0,0,0,...,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711


In [142]:
# Separate the target column from the features, then discard the row
# identifier so that only model inputs remain in `train`.
y = train.pop('price')
train = train.drop(columns=['id'])
print(train.columns)

Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')


In [143]:
# Apply the same date encoding as the training frame: first six characters of
# the date, stored as an integer.
# `astype(str)` + `.str[:6]` makes the conversion repeatable; slicing the raw
# value with a lambda raises TypeError on a second run, when the column is
# already int.
test['date'] = test['date'].astype(str).str[:6].astype(int)
# Drop the identifier so the test columns match the training features;
# errors='ignore' keeps a re-run from failing once 'id' is already gone.
test = test.drop(columns=['id'], errors='ignore')
print(test.columns)

Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')


In [144]:
# Display the test features after the date conversion and id removal.
test

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201412,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
1,201412,4,3.00,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
2,201405,4,4.50,5420,101930,1.0,0,0,3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930
3,201504,3,1.00,1780,7470,1.0,0,0,3,7,1050,730,1960,0,98146,47.5123,-122.337,1780,8113
4,201503,3,2.50,1890,6560,2.0,0,0,3,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6463,201406,3,1.75,1500,11968,1.0,0,0,3,6,1500,0,2014,0,98010,47.3095,-122.002,1320,11303
6464,201501,3,2.00,1490,1126,3.0,0,0,3,8,1490,0,2014,0,98144,47.5699,-122.288,1400,1230
6465,201502,3,2.50,1310,1294,2.0,0,0,3,8,1180,130,2008,0,98116,47.5773,-122.409,1330,1265
6466,201406,2,0.75,1020,1350,2.0,0,0,3,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007


In [145]:
# Train on log1p(price); rmse() and save_submission() undo this with np.expm1.
# NOTE(review): this cell reassigns `y` from itself, so running it twice
# applies log1p twice. Re-run the cell that defines `y` before re-running it.
y = np.log1p(y)
y

0        12.309987
1        12.100718
2        13.142168
3        12.458779
4        12.583999
           ...    
15030    13.322338
15031    13.822984
15032    12.793862
15033    12.899222
15034    12.691584
Name: price, Length: 15035, dtype: float64

In [146]:
def rmse(y_test, y_pred):
    """Return the RMSE between two log1p-scaled series, measured on the original scale.

    Both arguments are mapped back with ``np.expm1`` before the error is
    computed.

    Parameters
    ----------
    y_test : array-like
        Reference values, log1p-transformed.
    y_pred : array-like
        Predicted values, log1p-transformed.

    Returns
    -------
    float
        Square root of the mean squared error of the back-transformed values.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    squared_error = mean_squared_error(actual, predicted)
    return np.sqrt(squared_error)

In [147]:
# Shared seed: passed to every model here and also read as a global by
# get_scores() for its train/validation split.
random_state=2025
# Baseline regressors; only the seed is set, every other hyper-parameter is
# left at the library default.
gboost = GradientBoostingRegressor(random_state=random_state)
xgboost = XGBRegressor(random_state=random_state)
lightgbm = LGBMRegressor(random_state=random_state)
rdforest = RandomForestRegressor(random_state=random_state)

# Collection handed to get_scores() for comparison.
models = [gboost, xgboost, lightgbm, rdforest]

In [148]:
 
def get_scores(models, train, y):
    """Fit each model on an 80/20 hold-out split and tabulate its RMSE.

    Parameters
    ----------
    models : iterable of estimators
        Objects exposing ``fit(X, y)`` and ``predict(X)``.
    train : DataFrame
        Feature matrix.
    y : Series
        Target aligned with ``train``; scored through ``rmse``, which maps
        values back with ``np.expm1``.

    Returns
    -------
    DataFrame
        One row per model class name with a single ``RMSE`` column, sorted
        from highest to lowest RMSE. Empty (but with the ``RMSE`` column) when
        ``models`` is empty.

    Notes
    -----
    Rows are keyed by class name, so two models of the same class overwrite
    each other. The split seed is the module-level ``random_state``.
    """
    models = list(models)
    # Nothing to evaluate: return an empty table rather than failing on a
    # result variable that was never assigned.
    if not models:
        return pd.DataFrame(columns=['RMSE'])

    # The split depends only on the data and the seed, so it is computed once
    # and shared by all models instead of being redone per model.
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, random_state=random_state, test_size=0.2
    )

    scores = {}
    for model in models:
        model.fit(X_train, y_train)
        scores[model.__class__.__name__] = rmse(y_test, model.predict(X_test))

    # Build the result table once, after every score is known.
    return pd.DataFrame(scores, index=['RMSE']).T.sort_values('RMSE', ascending=False)

# Hold-out RMSE on the original price scale for each baseline model.
get_scores(models, train, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2303
[LightGBM] [Info] Number of data points in the train set: 12028, number of used features: 19
[LightGBM] [Info] Start training from score 13.047062


Unnamed: 0,RMSE
RandomForestRegressor,154829.06273
GradientBoostingRegressor,148196.219625
LGBMRegressor,140783.296114
XGBRegressor,137073.917972


In [149]:
from sklearn.model_selection import GridSearchCV

In [150]:
def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):
    """Run a 5-fold grid search and return every candidate, best first.

    Parameters
    ----------
    model : estimator
        Estimator to tune.
    train : DataFrame
        Feature matrix.
    y : Series
        Target values.
    param_grid : dict
        Parameter names mapped to the lists of values to try.
    verbose : int, default 2
        Forwarded to ``GridSearchCV``.
    n_jobs : int, default 5
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    DataFrame
        One row per parameter combination with the parameter values, ``score``
        (mean negated MSE over the folds) and ``RMSLE`` (square root of the
        negated score), sorted by ascending ``RMSLE``. The column is a true
        RMSLE only when ``y`` is already log-transformed.
    """
    # Configure the search; scoring is negated MSE, so larger is better.
    search = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=verbose,
        n_jobs=n_jobs,
    )

    # Fit every candidate on every fold.
    search.fit(train, y)

    # Collect the parameter sets and their mean test scores into one table,
    # then derive the error column by flipping the sign back.
    cv_results = search.cv_results_
    table = pd.DataFrame(cv_results['params']).assign(
        score=cv_results['mean_test_score'],
        RMSLE=lambda frame: np.sqrt(-1 * frame['score']),
    )

    return table.sort_values('RMSLE')

In [180]:
# Hyper-parameter grid for LightGBM: 1 x 1 x 4 x 3 x 2 = 24 candidates, each
# evaluated with the 5-fold CV configured inside my_GridSearch.
param_grid = {
    'boosting_type': ['gbdt'], 
    'learning_rate': [0.05],  
    'max_depth': [18, 20,25,30], 
    'n_estimators': [500,1000,1500], 
    'num_leaves': [20,40,],  
}

# `model` is reused later by the save_submission call; `best_params_df` holds
# all candidates sorted by ascending RMSLE.
model = LGBMRegressor(random_state=random_state)
best_params_df = my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000992 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 15035, number of used features: 19
[LightGBM] [Info] Start training from score 13.048122


In [181]:
# Candidates sorted by ascending RMSLE, so the first row is the best one.
best_params_df

Unnamed: 0,boosting_type,learning_rate,max_depth,n_estimators,num_leaves,score,RMSLE
2,gbdt,0.05,18,1000,20,-0.025828,0.16071
8,gbdt,0.05,20,1000,20,-0.025828,0.16071
14,gbdt,0.05,25,1000,20,-0.025828,0.16071
20,gbdt,0.05,30,1000,20,-0.025828,0.16071
7,gbdt,0.05,20,500,40,-0.025955,0.161105
4,gbdt,0.05,18,1500,20,-0.025966,0.16114
16,gbdt,0.05,25,1500,20,-0.025966,0.16114
10,gbdt,0.05,20,1500,20,-0.025966,0.16114
22,gbdt,0.05,30,1500,20,-0.025966,0.16114
12,gbdt,0.05,25,500,20,-0.025989,0.161212


In [176]:
# Pull the top-ranked parameter set out of the grid-search table.
# The row is read column by column (one-row frame -> records) instead of via
# `.iloc[0]`: a row Series has a single dtype, so when every parameter column
# is numeric the integer parameters would come back as floats (e.g. 500.0).
best_params = (
    best_params_df
    .drop(columns=['score', 'RMSLE'])
    .iloc[[0]]
    .to_dict(orient='records')[0]
)
best_params

{'boosting_type': 'gbdt',
 'learning_rate': 0.05,
 'max_depth': 20,
 'n_estimators': 500,
 'num_leaves': 40}

In [182]:
"""
아래의 과정을 수행하는 `save_submission(model, train, y, test, model_name, rmsle)` 함수를 구현해 주세요.
1. 모델을 `train`, `y`로 학습시킵니다.
2. `test`에 대해 예측합니다.
3. 예측값을 `np.expm1`으로 변환하고, `submission_model_name_RMSLE_100000.csv` 형태의 `csv` 파일을 저장합니다.
"""
def save_submission(model, train, y, test, model_name,best_params_df, rmsle=None,
                    data_dir='./2019-2nd-ml-month-with-kakr'):
    """Refit ``model`` with the best grid-search parameters and write a submission CSV.

    Parameters
    ----------
    model : estimator
        Object exposing ``set_params``, ``fit`` and ``predict``. It is modified
        in place (parameters set, then refitted).
    train : DataFrame
        Training features.
    y : Series
        Training target; predictions are mapped back with ``np.expm1``, so it
        is expected to be log1p-transformed.
    test : DataFrame
        Features to predict on.
    model_name : str
        Label inserted into the output file name.
    best_params_df : DataFrame
        Grid-search results with ``score`` and ``RMSLE`` columns; the first
        row supplies the parameters.
    rmsle : str or float, optional
        Label inserted into the output file name. When omitted, the first
        row's ``RMSLE`` formatted to six decimals is used.
    data_dir : str, optional
        Directory holding ``sample_submission.csv``; the submission is written
        to the same directory.

    Returns
    -------
    None
        The file is written to disk and its path is printed.
    """
    # Read the first row column by column so each parameter keeps its own
    # dtype. A row Series (`.iloc[0]`) has one dtype and would turn integer
    # parameters into floats when all parameter columns are numeric.
    param_columns = best_params_df.drop(columns=['score', 'RMSLE'])
    best_params = param_columns.iloc[[0]].to_dict(orient='records')[0]

    model.set_params(**best_params)
    model.fit(train, y)
    # Undo the log1p applied to the target before training.
    prediction = np.expm1(model.predict(test))

    # Without an explicit label, use the best score so the file name never
    # ends in "RMSLE_None".
    if rmsle is None:
        rmsle = '{:.6f}'.format(best_params_df['RMSLE'].iloc[0])

    # The sample submission supplies the id column and the row order.
    submission = pd.read_csv('{}/sample_submission.csv'.format(data_dir))
    submission['price'] = prediction
    submission_csv_path = '{}/submission_{}_RMSLE_{}.csv'.format(data_dir, model_name, rmsle)
    submission.to_csv(submission_csv_path, index=False)
    print('{} saved!'.format(submission_csv_path))
# 코드 작성

In [183]:
# Refit the tuned LightGBM model on the full training data and write the
# submission file; the label is the best RMSLE reported by the grid search.
save_submission(
    model,
    train,
    y,
    test,
    model_name='lgbm',
    best_params_df=best_params_df,
    rmsle='0.160710',
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 15035, number of used features: 19
[LightGBM] [Info] Start training from score 13.048122
./2019-2nd-ml-month-with-kakr/submission_lgbm_RMSLE_0.160710.csv saved!


In [184]:
# Read the file written by save_submission back in as a sanity check.
resulte = pd.read_csv('./2019-2nd-ml-month-with-kakr/submission_lgbm_RMSLE_0.160710.csv')
resulte.head()

Unnamed: 0,id,price
0,15035,503892.6
1,15036,460311.0
2,15037,1315737.0
3,15038,300679.9
4,15039,333199.7


- 오늘 노드 내용의 핵심은 grid search를 통해 하이퍼파라미터를 찾는 과정이었으며, 이를 적용하여 캐글 대회에 무작정 시도해 보았다.
- 아쉬운 점은 목표인 11만 점에 도달하지 못한 점이며, 아래는 개선했으면 좋았을 것 같은 내용이다.
- 다양한 모델로 앙상블 모델을 구현하여 적용해 보았으면 좋았을 것 같다.
- 데이터 전처리 과정 중 상관관계가 약한 몇몇 컬럼을 더 제외했으면 좋았을 것 같다.