In [1]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [2]:
# Load the training set, the test set and the sample submission file.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Preview the first rows of each frame.
display(train.head(), test.head(), sample_submission.head())

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
train.info()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,1,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2,2,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
3,3,20140627T000000,257500.0,3,2.25,1715,6819,2.0,0,0,...,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
4,4,20150115T000000,291850.0,3,1.5,1060,9711,1.0,0,0,...,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711


Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,15035,20141209T000000,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
1,15036,20141209T000000,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
2,15037,20140512T000000,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930
3,15038,20150415T000000,3,1.0,1780,7470,1.0,0,0,3,7,1050,730,1960,0,98146,47.5123,-122.337,1780,8113
4,15039,20150312T000000,3,2.5,1890,6560,2.0,0,0,3,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570


Unnamed: 0,id,price
0,15035,100000
1,15036,100000
2,15037,100000
3,15038,100000
4,15039,100000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15035 entries, 0 to 15034
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             15035 non-null  int64  
 1   date           15035 non-null  object 
 2   price          15035 non-null  float64
 3   bedrooms       15035 non-null  int64  
 4   bathrooms      15035 non-null  float64
 5   sqft_living    15035 non-null  int64  
 6   sqft_lot       15035 non-null  int64  
 7   floors         15035 non-null  float64
 8   waterfront     15035 non-null  int64  
 9   view           15035 non-null  int64  
 10  condition      15035 non-null  int64  
 11  grade          15035 non-null  int64  
 12  sqft_above     15035 non-null  int64  
 13  sqft_basement  15035 non-null  int64  
 14  yr_built       15035 non-null  int64  
 15  yr_renovated   15035 non-null  int64  
 16  zipcode        15035 non-null  int64  
 17  lat            15035 non-null  float64
 18  long  

None

In [4]:
# Remove outlier rows; the ids were taken from the kernel below.
# https://www.kaggle.com/code/bluepinetree/57-of-415-eda-stacking-modeling
outlier_ids = [8912, 12346, 7173, 2775]
train = train.loc[~train['id'].isin(outlier_ids)]

In [5]:
# Training-set preprocessing.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Features: everything except the id, the zipcode and the target itself.
X = train.drop(columns=['id', 'zipcode', 'price'])
# The target is modelled in log space; predictions are mapped back with expm1.
y = np.log1p(train['price'])

skew_columns = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']
skew_columns2 = ['sqft_living15', 'sqft_lot15']  # cube root brings these closer to a normal distribution

for c in skew_columns:
    X[c] = np.log1p(X[c].values)

for c in skew_columns2:
    X[c] = np.power(X[c].values, 1/3)

# Keep only the year of the sale date (first four characters of the string).
X['date'] = X['date'].str[:4].astype(int)

# old: sale year minus the build year, or minus the renovation year when the
# house has one (yr_renovated != 0). Vectorised instead of a row-wise apply.
last_work_year = X['yr_built'].where(X['yr_renovated'] == 0, X['yr_renovated'])
X['old'] = X['date'] - last_work_year
X = X.drop(columns='yr_built')

# Binarise the renovation year: 0 (never renovated) / 1 (renovated).
X['yr_renovated'] = (X['yr_renovated'] != 0).astype('int64')

# Min-max normalisation for date, latitude, longitude and the discrete features.
# The scaler is fitted here on the training data only.
minmax = MinMaxScaler()
minmax_list = ['date', 'bedrooms', 'bathrooms', 'lat', 'long', 'floors', 'view', 'condition', 'grade', 'old']
X[minmax_list] = minmax.fit_transform(X[minmax_list])

# Standardisation for the continuous features (also fitted on training data only).
standard = StandardScaler()
standard_list = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']
X[standard_list] = standard.fit_transform(X[standard_list])

display(X.head())


Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_renovated,lat,long,sqft_living15,sqft_lot15,old
0,0.0,0.3,0.125,-1.124226,-0.389653,0.0,0,0.0,0.5,0.5,-0.75464,-0.796131,0,0.571498,0.217608,-1.016721,-0.411073,0.517241
1,1.0,0.2,0.125,-2.127349,0.240957,0.0,0,0.0,0.5,0.416667,-1.748853,-0.796131,0,0.936143,0.237542,1.108108,-0.087504,0.715517
2,1.0,0.3,0.25,-0.29377,0.005473,0.0,0,0.0,0.5,0.583333,0.068439,-0.796131,0,0.741354,0.393688,-0.191456,-0.156035,0.25
3,0.0,0.3,0.28125,-0.245294,-0.181943,0.4,0,0.0,0.5,0.5,0.116484,-0.796131,0,0.247386,0.159468,0.47182,-0.244691,0.172414
4,1.0,0.3,0.1875,-1.376283,0.208564,0.0,0,0.0,0.5,0.5,-1.004458,-0.796131,0,0.407914,0.169435,-0.443269,0.097798,0.456897


In [6]:
def rmse(y_test, y_pred):
    """Return the RMSE between two log1p-transformed value sets, measured on the original scale.

    Both arguments are passed through ``np.expm1`` (the inverse of ``np.log1p``)
    before the mean squared error is computed and square-rooted.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    squared_error = mean_squared_error(actual, predicted)
    return np.sqrt(squared_error)

In [7]:
# Seed shared by every estimator below.
random_state = 2025

# Four regression models, each created with only the seed set.
gboost, xgboost, lightgbm, rdforest = (
    estimator_cls(random_state=random_state)
    for estimator_cls in (
        GradientBoostingRegressor,
        XGBRegressor,
        LGBMRegressor,
        RandomForestRegressor,
    )
)

models = [gboost, xgboost, lightgbm, rdforest]

def get_scores(models, train, y):
    """Fit each model on a hold-out split and report its RMSE.

    Parameters
    ----------
    models : iterable of estimators
        Objects exposing ``fit(X, y)`` and ``predict(X)``. Results are keyed by
        class name, so two models of the same class overwrite each other.
    train : feature matrix passed to ``train_test_split``.
    y : target values, expected in log1p space because ``rmse`` applies
        ``np.expm1`` before scoring.

    Returns
    -------
    pandas.DataFrame
        One row per model class and a single ``RMSE`` column, sorted from the
        highest RMSE to the lowest. Empty when ``models`` is empty.
    """
    # The split uses a fixed seed and does not depend on the model, so it is
    # computed once instead of once per model.
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, random_state=random_state, test_size=0.2
    )

    scores = {}
    for model in models:
        model.fit(X_train, y_train)
        scores[model.__class__.__name__] = rmse(y_test, model.predict(X_test))

    # Build the frame once after the loop; this also keeps the function from
    # failing on an unbound variable when `models` is empty.
    return pd.DataFrame(scores, index=['RMSE']).T.sort_values('RMSE', ascending=False)

In [8]:
#scores = get_scores(models, X, y)

In [10]:
# 다들 비슷한 성능을 가져, XGBRegressor 쓰기로 결정.
#display(scores)

In [11]:
# Hyperparameter grid explored by the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200, 500],
    max_depth=[5, 10],
)

def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5, cv=2):
    """Run a grid search and return one row per parameter combination.

    Parameters
    ----------
    model : estimator handed to ``GridSearchCV``.
    train : feature matrix.
    y : target values. The ``RMSLE`` column is a root-mean-squared *log* error
        only when ``y`` is already log-transformed, as it is in this notebook.
    param_grid : dict mapping parameter names to the lists of values to try.
    verbose, n_jobs : forwarded to ``GridSearchCV``.
    cv : cross-validation setting forwarded to ``GridSearchCV``; defaults to
        the 2 folds that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The parameter columns plus ``score`` (mean negative MSE across folds)
        and ``RMSLE`` (square root of the negated score), sorted by ``RMSLE``
        ascending so the best combination comes first.
    """
    # Set up the grid search.
    grid_model = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=cv,
        verbose=verbose,
        n_jobs=n_jobs,
    )

    # Fit every parameter combination.
    grid_model.fit(train, y)

    # Collect the parameter combinations and their mean test scores.
    results = pd.DataFrame(grid_model.cv_results_['params'])
    results['score'] = grid_model.cv_results_['mean_test_score']

    # The score is a negated MSE, so flip the sign before taking the root.
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    results = results.sort_values('RMSLE')

    return results

In [12]:
"""for model in models:
  display(my_GridSearch(model, X, y, param_grid, verbose=2, n_jobs=5))"""

#display(my_GridSearch(xgboost, X, y, param_grid, verbose=2, n_jobs=5))

'for model in models:\n  display(my_GridSearch(model, X, y, param_grid, verbose=2, n_jobs=5))'

In [13]:
# Apply the same preprocessing to the test set, reusing the scalers that were
# fitted on the training data (transform only, no re-fitting).
X_test = test.drop(columns=['id', 'zipcode'])

for c in skew_columns:
    X_test[c] = np.log1p(X_test[c].values)

for c in skew_columns2:
    X_test[c] = np.power(X_test[c].values, 1/3)

# Keep only the year of the sale date (first four characters of the string).
X_test['date'] = X_test['date'].str[:4].astype(int)

# old: sale year minus the build year, or minus the renovation year when the
# house has one (yr_renovated != 0). Vectorised instead of a row-wise apply.
last_work_year = X_test['yr_built'].where(X_test['yr_renovated'] == 0, X_test['yr_renovated'])
X_test['old'] = X_test['date'] - last_work_year
X_test = X_test.drop(columns='yr_built')

# Binarise the renovation year: 0 (never renovated) / 1 (renovated).
X_test['yr_renovated'] = (X_test['yr_renovated'] != 0).astype('int64')

# Min-max normalisation for date, latitude, longitude and the discrete features.
minmax_list = ['date', 'bedrooms', 'bathrooms', 'lat', 'long', 'floors', 'view', 'condition', 'grade', 'old']
X_test[minmax_list] = minmax.transform(X_test[minmax_list])

# Standardisation for the continuous features.
standard_list = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']
X_test[standard_list] = standard.transform(X_test[standard_list])

display(X_test.head())

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_renovated,lat,long,sqft_living15,sqft_lot15,old
0,0.0,0.3,0.28125,0.705773,-0.115468,0.4,0,0.0,0.5,0.5,0.664832,1.09698,1,0.908959,0.166113,-0.374675,-0.139057,0.206897
1,0.0,0.4,0.375,0.068652,-0.524639,0.0,0,0.0,1.0,0.5,-1.026537,1.35615,0,0.586939,0.104651,-0.977199,-0.513745,0.431034
2,0.0,0.4,0.5625,2.460617,2.805594,0.0,0,0.0,0.5,0.833333,2.025204,1.520111,0,0.804568,0.42691,3.180998,3.762399,0.12069
3,1.0,0.3,0.125,-0.157834,-0.08123,0.0,0,0.0,0.5,0.5,-1.026537,1.286625,0,0.573267,0.151163,-0.224202,-0.081411,0.482759
4,1.0,0.3,0.3125,-0.016854,-0.224712,0.4,0,0.0,0.5,0.5,0.342895,-0.796131,0,0.341805,0.405316,0.681577,-0.147645,0.112069


In [14]:
def AveragingBlending(models, x, y, sub_x):
    """Fit every model on (x, y) and return the mean of their predictions on sub_x.

    ``models`` is a sequence of dicts whose ``'model'`` entry is an estimator
    with ``fit`` and ``predict``. ``x`` and ``sub_x`` must expose ``.values``
    (e.g. pandas DataFrames); the raw arrays are what the estimators receive.
    All models are fitted before any prediction is made.
    """
    # Phase 1: train each estimator on the raw training array.
    for entry in models:
        estimator = entry['model']
        estimator.fit(x.values, y)

    # Phase 2: gather one prediction vector per estimator.
    per_model_predictions = []
    for entry in models:
        estimator = entry['model']
        per_model_predictions.append(estimator.predict(sub_x.values))

    # One column per model; average across the columns for each sample.
    stacked = np.column_stack(per_model_predictions)
    return np.mean(stacked, axis=1)

In [15]:
# Four regression models are blended.
# Their hyperparameters were found with the grid search.
gboost = GradientBoostingRegressor(n_estimators=200, max_depth=5, random_state=random_state)
xgboost = XGBRegressor(n_estimators=100, max_depth=5, random_state=random_state)
lightgbm = LGBMRegressor(n_estimators=200, max_depth=10, random_state=random_state)
rdforest = RandomForestRegressor(n_estimators=500, max_depth=10, random_state=random_state)

# Pair each estimator with a readable label.
labelled_estimators = (
    ('GradientBoosting', gboost),
    ('XGBoost', xgboost),
    ('LightGBM', lightgbm),
    ('RandomForest', rdforest),
)
models = [{'model': estimator, 'name': label} for label, estimator in labelled_estimators]

# Train on the full training set and average the predictions for the test set.
y_pred = AveragingBlending(models, X, y, X_test)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001532 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2200
[LightGBM] [Info] Number of data points in the train set: 15031, number of used features: 18
[LightGBM] [Info] Start training from score 13.047619




In [16]:
# Submission frame: test ids plus predictions mapped back from log space.
result = test[['id']].assign(price=np.expm1(y_pred))

In [17]:
# Write the submission file without the index column, then preview the first rows.
result.to_csv(path_or_buf='submission.csv', index=False)
result.head(5)

Unnamed: 0,id,price
0,15035,495540.3
1,15036,465769.9
2,15037,1413031.0
3,15038,291666.0
4,15039,325644.3
