## 1. 데이터 살펴보기
pandas의 read_csv 함수를 사용해 데이터를 읽어오고, 각 변수들이 나타내는 의미를 살펴보겠습니다.
1. ID : 집을 구분하는 번호
2. date : 집을 구매한 날짜
3. price : 타겟 변수인 집의 가격
4. bedrooms : 침실의 수
5. bathrooms : 침실당 화장실 개수
6. sqft_living : 주거 공간의 평방 피트
7. sqft_lot : 부지의 평방 피트
8. floors : 집의 층 수
9. waterfront : 집의 전방에 강이 흐르는지 유무 (a.k.a. 리버뷰)
10. view : 집이 얼마나 좋아 보이는지의 정도
11. condition : 집의 전반적인 상태
12. grade : King County grading 시스템 기준으로 매긴 집의 등급
13. sqft_above : 지하실을 제외한 평방 피트
14. sqft_basement : 지하실의 평방 피트
15. yr_built : 집을 지은 년도
16. yr_renovated : 집을 재건축한 년도
17. zipcode : 우편번호
18. lat : 위도
19. long : 경도
20. sqft_living15 : 2015년 기준 주거 공간의 평방 피트(집을 재건축했다면, 변화가 있을 수 있음)
21. sqft_lot15 : 2015년 기준 부지의 평방 피트(집을 재건축했다면, 변화가 있을 수 있음)

In [486]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

train_data_path = join('/kaggle/input/aiffel/data/', 'train.csv')
sub_data_path = join('/kaggle/input/aiffel/data/', 'test.csv')

data = pd.read_csv(train_data_path)
sub = pd.read_csv(sub_data_path)
print('train data dim : {}'.format(data.shape))
print('sub data dim : {}'.format(sub.shape))

# data = data.loc[data['id']!=456]
# data = data.loc[data['id']!=2302]
# data = data.loc[data['id']!=4123]
# data = data.loc[data['id']!=7259]
# data = data.loc[data['id']!=2777]

train data dim : (15035, 21)
sub data dim : (6468, 20)


In [487]:
y = data['price']

del data['price']

train_len = len(data)
data = pd.concat((data, sub), axis=0)

for c in data.columns:
    print('{} : {}'.format(c, len(data.loc[pd.isnull(data[c]), c].values)))

sub_id = data['id'][train_len:]
del data['id']
data['date'] = data['date'].apply(lambda x : str(x[:6])).astype(int)

id : 0
date : 0
bedrooms : 0
bathrooms : 0
sqft_living : 0
sqft_lot : 0
floors : 0
waterfront : 0
view : 0
condition : 0
grade : 0
sqft_above : 0
sqft_basement : 0
yr_built : 0
yr_renovated : 0
zipcode : 0
lat : 0
long : 0
sqft_living15 : 0
sqft_lot15 : 0


In [488]:
print(data.head())
print(data.describe())
for col in data.columns:
    zero_count = (data[col] == 0).sum()  # 해당 열에서 0인 값의 개수 계산
    print(f"{col} : {zero_count}")

     date  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  \
0  201410         3       1.00         1180      5650     1.0           0   
1  201502         2       1.00          770     10000     1.0           0   
2  201502         3       2.00         1680      8080     1.0           0   
3  201406         3       2.25         1715      6819     2.0           0   
4  201501         3       1.50         1060      9711     1.0           0   

   view  condition  grade  sqft_above  sqft_basement  yr_built  yr_renovated  \
0     0          3      7        1180              0      1955             0   
1     0          3      6         770              0      1933             0   
2     0          3      8        1680              0      1987             0   
3     0          3      7        1715              0      1995             0   
4     0          3      7        1060              0      1963             0   

   zipcode      lat     long  sqft_living15  sqft_lot15 

In [489]:
# 피쳐 제거
# data = data.drop(columns=['sqft_basement'])

# 재건축 연도가 0인 경우, 첫 건축 연도로 채우기
data['yr_renovated'] = data.apply(
    lambda row: row['yr_built'] if row['yr_renovated'] == 0 else row['yr_renovated'], axis=1
)

In [490]:
data['yr_renovated'] = data['yr_renovated'] - data['yr_built'].min()
data['yr_built'] = data['yr_built'] - data['yr_built'].min()

In [491]:
data['yr_renovated'] = data['yr_renovated'].astype(int)
data['yr_renovated'].unique()

array([ 55,  33,  87,  95,  63,  65,  42,  27,  77,   0,  79,  94,  16,
        21,  69,  47,  68,  85,  41,  15,   9, 103,  29,  81, 102,   4,
        96,  84, 114,  22,  66,  50, 108,  59,  54,  89, 105,  73,  72,
        86,  56,  92,  64,  52,  25,  61, 106,  88, 101,  62,  39,  67,
        75,  10,  83,  91,   5,  80,  71,  99, 110,  45,  24,  78,  26,
        23,  90, 100,  76,  49,   1,  93,  48,  20,  97, 104,  60,  40,
        18,  74,  11,  30,  37,   8,  31,  98,  28,  43,  57,   7, 113,
        53, 111, 112, 107,  12,  17,  58,  46,  51,  32,  44,  82, 109,
        70,  38,  19,   6,  36,  13,   3,   2,  14, 115,  34,  35])

In [492]:
data['year'] = data['date'] // 100  # 100으로 나눈 몫은 연도
data['month'] = data['date'] % 100  # 100으로 나눈 나머지는 월
data['year'].unique()

array([2014, 2015])

In [493]:
data = data.drop(columns=['year', 'date'])

In [494]:
data['zipcode'] = data['zipcode'] - data['zipcode'].min()

In [495]:
# import pandas as pd
# from sklearn.preprocessing import MinMaxScaler

# # 위도와 경도를 2D 좌표로 스케일링
# lat_long = data[['lat', 'long']]
# scaler = MinMaxScaler()
# scaled_lat_long = scaler.fit_transform(lat_long)

# # 결과를 새로운 컬럼에 저장
# data['x_scaled'] = scaled_lat_long[:, 0]  # 스케일링된 위도
# data['y_scaled'] = scaled_lat_long[:, 1]  # 스케일링된 경도

# # 원래 좌표를 드랍하고 확인
# data = data.drop(columns=['lat', 'long'])


In [496]:
# data['long'] = data['long'] - data['long'].min()
# data['lat'] = data['lat'] - data['lat'].min()

In [497]:
skew_columns = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']

for c in skew_columns:
    data[c] = np.log1p(data[c].values)

In [498]:
print(data)

      bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  view  \
0            3       1.00     7.074117  8.639588     1.0           0     0   
1            2       1.00     6.647688  9.210440     1.0           0     0   
2            3       2.00     7.427144  8.997271     1.0           0     0   
3            3       2.25     7.447751  8.827615     2.0           0     0   
4            3       1.50     6.966967  9.181118     1.0           0     0   
...        ...        ...          ...       ...     ...         ...   ...   
6463         3       1.75     7.313887  9.390075     1.0           0     0   
6464         3       2.00     7.307202  7.027315     3.0           0     0   
6465         3       2.50     7.178545  7.166266     2.0           0     0   
6466         2       0.75     6.928538  7.208600     2.0           0     0   
6467         3       2.50     7.378384  7.778630     2.0           0     0   

      condition  grade  sqft_above  sqft_basement  yr_built  yr

In [499]:
sub = data.iloc[train_len:, :]
x = data.iloc[:train_len, :]
y = np.log1p(y)

In [500]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# random_state는 모델초기화나 데이터셋 구성에 사용되는 랜덤 시드값입니다. 
#random_state=None    # 이게 초기값입니다. 아무것도 지정하지 않고 None을 넘겨주면 모델 내부에서 임의로 선택합니다.  
random_state=42       # 하지만 우리는 이렇게 고정값을 세팅해 두겠습니다. 

gboost = GradientBoostingRegressor(random_state=random_state)
xgboost = XGBRegressor(random_state=random_state)
lightgbm = LGBMRegressor(random_state=random_state)
rdforest = RandomForestRegressor(random_state=random_state)



models = [gboost, xgboost, lightgbm, rdforest]

def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):
    # GridSearchCV 모델로 초기화
    grid_model = GridSearchCV(model, param_grid=param_grid, scoring='neg_mean_squared_error', \
                              cv=5, verbose=verbose, n_jobs=n_jobs)
    
    # 모델 fitting
    grid_model.fit(train, y)

    # 결과값 저장
    params = grid_model.cv_results_['params']
    score = grid_model.cv_results_['mean_test_score']
    
    # 데이터 프레임 생성
    results = pd.DataFrame(params)
    results['score'] = score
    
    # RMSLE 값 계산 후 정렬
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    results = results.sort_values('RMSLE')

    return results

In [504]:
best_model = LGBMRegressor(
    feature_fraction=0.2319, learning_rate=0.05, max_bin=80,
    min_data_in_leaf=6, min_sum_hessian_in_leaf=11, n_estimators=10000,
    num_leaves=15, objective='regression', random_state=25, verbosity=2)

best_model.fit(x, y)
prediction = best_model.predict(sub)
prediction = np.expm1(prediction)
data_dir = '/kaggle/input/aiffel/data/'
submission_path = join(data_dir, 'sample_submission.csv')
submission = pd.read_csv(submission_path)
submission['price'] = prediction
# submission_csv_path = 'submission_RMSLE_{}.csv'.format(round(results['RMSLE'].iloc[0], 5))

submission_csv_path = 'submission_best4.csv'
submission.to_csv(submission_csv_path, index=False)
print('{} saved!'.format(submission_csv_path))

submission_best4.csv saved!


In [506]:
param_grid = {
    'feature_fraction': [0.2, 0.2319, 0.3],  # 탐색 범위
    'learning_rate': [0.01, 0.05, 0.1],      # 학습률
    'max_bin': [80],               # max_bin 변경
    'min_data_in_leaf': [5, 6, 10],          # 리프당 최소 데이터 수
    'num_leaves': [15, 20],              # num_leaves 변경
    'n_estimators': [5000, 10000]      # 부스팅 라운드 수
}

model = LGBMRegressor(
    objective='regression',
    random_state=25,
    verbosity=2
)
results= my_GridSearch(model, x, y, param_grid, verbose=2, n_jobs=5)
results

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.834032
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.132921
[LightGBM] [Debug] init for col-wise cost 0.000735 seconds, init for row-wise cost 0.001227 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000923 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 15035, number of used features: 19
[LightGBM] [Info] Start training from score 13.048122
[LightGBM] [Debug] Trained a tree with leaves = 15 and depth = 5
[LightGBM] [Debug] Trained a tree with leaves = 15 and depth = 5
[LightGBM] [Debug] Trained a tree with leaves = 15 and depth = 5
[LightGBM] [Debug] Trained a tree 

Unnamed: 0,feature_fraction,learning_rate,max_bin,min_data_in_leaf,n_estimators,num_leaves,score,RMSLE
78,0.3000,0.01,80,6,10000,15,-0.025832,0.160724
75,0.3000,0.01,80,5,10000,20,-0.025860,0.160812
74,0.3000,0.01,80,5,10000,15,-0.025862,0.160816
82,0.3000,0.01,80,10,10000,15,-0.025873,0.160852
83,0.3000,0.01,80,10,10000,20,-0.025890,0.160904
...,...,...,...,...,...,...,...,...
67,0.2319,0.10,80,6,10000,20,-0.028549,0.168964
35,0.2000,0.10,80,10,10000,20,-0.028797,0.169698
71,0.2319,0.10,80,10,10000,20,-0.028797,0.169698
70,0.2319,0.10,80,10,10000,15,-0.028863,0.169890


In [518]:
# 최적의 파라미터를 딕셔너리로 변환
best_params = results.iloc[5].drop(['score', 'RMSLE']).to_dict()

# 정수형으로 변환해야 할 파라미터 리스트
int_params = ['n_estimators', 'max_bin', 'min_data_in_leaf', 'num_leaves']

# 정수형 파라미터 변환
for param in int_params:
    if param in best_params:
        best_params[param] = int(best_params[param])

# 최적의 파라미터로 모델 초기화 및 학습
best_model = LGBMRegressor(random_state=random_state, **best_params)
best_model.fit(x, y)

# 예측 수행
prediction = best_model.predict(sub)
prediction = np.expm1(prediction)


[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.834032
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.132921
[LightGBM] [Debug] init for col-wise cost 0.000744 seconds, init for row-wise cost 0.001935 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 15035, number of used features: 19
[LightGBM] [Info] Start training from score 13.048122
[LightGBM] [Debug] Trained a tree with leaves = 20 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 20 and depth = 5
[LightGBM] [Debug] Trained a tree with leaves = 20 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 20 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 20 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 20 and depth = 6
[LightGBM] 

In [519]:
data_dir = '/kaggle/input/aiffel/data/'
submission_path = join(data_dir, 'sample_submission.csv')
submission = pd.read_csv(submission_path)
submission['price'] = prediction
# submission_csv_path = 'submission_RMSLE_{}.csv'.format(round(results['RMSLE'].iloc[0], 5))

submission_csv_path = '5u6m15510n.csv'
submission.to_csv(submission_csv_path, index=False)
print('{} saved!'.format(submission_csv_path))

5u6m15510n.csv saved!


# just 구분

In [112]:
param_grid = {
    'n_estimators': [300, 500, 700],
    'max_depth': [10, 15, 20],
    'learning_rate': [0.05, 0.075, 0.1, 0.125, 0.15],
    'subsample': [0.8, 1.0],
}

model = LGBMRegressor(random_state=random_state)
results= my_GridSearch(model, x, y, param_grid, verbose=2, n_jobs=5)
results

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000910 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2401
[LightGBM] [Info] Number of data points in the train set: 15035, number of used features: 19
[LightGBM] [Info] Start training from score 13.048122


Unnamed: 0,learning_rate,max_depth,n_estimators,subsample,score,RMSLE
8,0.05,15,500,0.8,-0.026045,0.161386
9,0.05,15,500,1.0,-0.026045,0.161386
5,0.05,10,700,1.0,-0.026048,0.161393
4,0.05,10,700,0.8,-0.026048,0.161393
16,0.05,20,700,0.8,-0.026063,0.161440
...,...,...,...,...,...,...
89,0.15,20,700,1.0,-0.027323,0.165297
82,0.15,15,700,0.8,-0.027501,0.165833
83,0.15,15,700,1.0,-0.027501,0.165833
76,0.15,10,700,0.8,-0.027509,0.165859


In [113]:
best_params = results.iloc[0].drop(['score', 'RMSLE']).to_dict()
# 정수형 파라미터를 변환
int_params = ['n_estimators', 'max_depth']
for param in int_params:
    if param in best_params:
        best_params[param] = int(best_params[param])
# 최적의 파라미터로 모델 초기화
best_model = LGBMRegressor(random_state=random_state, **best_params)
best_model.fit(x, y)
prediction = best_model.predict(sub)
prediction = np.expm1(prediction)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000736 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2401
[LightGBM] [Info] Number of data points in the train set: 15035, number of used features: 19
[LightGBM] [Info] Start training from score 13.048122


In [130]:
from ngboost import NGBRegressor
from ngboost.distns import Normal
from sklearn.model_selection import GridSearchCV

# NGBoost 모델
model = NGBRegressor(Dist=Normal, random_state=42)

# 하이퍼파라미터 그리드
param_grid = {
    'n_estimators': [700, 800],
    'learning_rate': [0.1, 0.12],
    'minibatch_frac': [0.9, 1.0],
    'col_sample': [0.7, 0.8],
}


# GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
grid_search.fit(x, y)

# 최적의 하이퍼파라미터와 점수
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[iter 0] loss=0.7778 val_loss=0.0000 scale=1.0000 norm=0.7012
[iter 100] loss=-0.4712 val_loss=0.0000 scale=1.0000 norm=0.5369
[iter 200] loss=-0.5819 val_loss=0.0000 scale=1.0000 norm=0.5197
[iter 300] loss=-0.6457 val_loss=0.0000 scale=1.0000 norm=0.5096
[iter 400] loss=-0.6878 val_loss=0.0000 scale=1.0000 norm=0.5049
[iter 500] loss=-0.7288 val_loss=0.0000 scale=1.0000 norm=0.4957
[iter 600] loss=-0.7624 val_loss=0.0000 scale=1.0000 norm=0.4882
[iter 700] loss=-0.7930 val_loss=0.0000 scale=0.5000 norm=0.2417
Best Parameters: {'col_sample': 0.7, 'learning_rate': 0.1, 'minibatch_frac': 0.9, 'n_estimators': 800}
Best Score: -0.02718397197258579


In [131]:
prediction = grid_search.predict(sub)
prediction = np.expm1(prediction)

In [125]:
from ngboost import NGBRegressor
from ngboost.distns import Normal
from ngboost.scores import MLE

model = NGBRegressor(Dist=Normal, verbose=True)
model.fit(x, y)
prediction = model.predict(sub)
prediction = np.expm1(prediction)

[iter 0] loss=0.7792 val_loss=0.0000 scale=1.0000 norm=0.7024
[iter 100] loss=0.2997 val_loss=0.0000 scale=1.0000 norm=0.4999
[iter 200] loss=-0.0405 val_loss=0.0000 scale=2.0000 norm=0.9450
[iter 300] loss=-0.2315 val_loss=0.0000 scale=1.0000 norm=0.4841
[iter 400] loss=-0.3153 val_loss=0.0000 scale=1.0000 norm=0.5038


In [None]:
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'max_features': ['sqrt', None],
    'bootstrap': [True],
}
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# 모델 초기화
rf_model = RandomForestRegressor(random_state=42)

# GridSearchCV 설정
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # 성능 평가 지표
    cv=3,  # 교차 검증
    verbose=3,
    n_jobs=-1  # 병렬 처리
)

# 그리드 서치 실행
grid_search.fit(x, y)

# 최적의 하이퍼파라미터와 점수 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
prediction = grid_search.predict(sub)
prediction = np.expm1(prediction)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Best Score: -0.03185521303320607


TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

In [135]:
prediction = grid_search.predict(sub)
prediction = np.expm1(prediction)

In [136]:
data_dir = '/kaggle/input/aiffel/data/'
submission_path = join(data_dir, 'sample_submission.csv')
submission = pd.read_csv(submission_path)
submission['price'] = prediction
# submission_csv_path = 'submission_RMSLE_{}.csv'.format(round(results['RMSLE'].iloc[0], 5))

submission_csv_path = 'submission_RMSLE_fin.csv'
submission.to_csv(submission_csv_path, index=False)
print('{} saved!'.format(submission_csv_path))

submission_RMSLE_fin.csv saved!
