In [4]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ['dlopen(/Users/kyungmo/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/kyungmo/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


## 데이터 살펴보기 및 전처리

In [3]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.2-py3-none-macosx_10_14_x86_64.macosx_10_15_x86_64.macosx_11_0_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 7.1 MB/s eta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Resolve the data directory under the user's home directory.
# os.path.expanduser('~') is used instead of os.getenv('HOME') because getenv
# returns None when HOME is unset, which makes the string concatenation raise.
data_dir = os.path.expanduser('~') + '/aiffel/kaggle_kakr_housing/data'

train_data_path = join(data_dir, 'train.csv')
test_data_path = join(data_dir, 'test.csv')

# Load the raw train / test tables
train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)

In [None]:
# Preview the first rows of the training table
train.head(n=5)

In [None]:
# Stack train and test so preprocessing is applied to both at once;
# remember how many rows belong to train so they can be split apart again.
train_len = train.shape[0]
data = pd.concat([train, test], axis=0)

In [None]:
# Date preprocessing: keep only the first six characters of the date string
# (presumably YYYYMM - confirm against the raw data) and store them as an integer.
data['date'] = data['date'].str[:6].astype(int)
data.head(n=5)

In [None]:
# Set the target column aside and remove it from the combined feature frame
y = train['price']
data = data.drop(columns=['price'])

print(data.columns)

In [None]:
# Remove the id column from the combined feature frame
data = data.drop(columns=['id'])

print(data.columns)

In [None]:
# Display the target Series taken from train['price']
y

In [None]:
# Kernel density estimate of the target, drawn on an explicitly created axes
price_fig, price_ax = plt.subplots()
sns.kdeplot(y, ax=price_ax)
plt.show()

In [None]:
# The target distribution is skewed to one side, so apply a log(1 + x) transform.
# NOTE: re-running this cell transforms an already transformed y again.
y = np.log1p(y)


In [None]:
# Kernel density estimate of the target after the log1p transform
log_price_fig, log_price_ax = plt.subplots()
sns.kdeplot(y, ax=log_price_ax)
plt.show()

In [None]:
data.info() # check the dtype of every feature

In [None]:
# Check for missing values
msno.matrix(data)

In [None]:
# Visualize the distribution of every feature.
#
# `id` was already removed from `data` in an earlier cell, so nothing has to be
# skipped here: starting at index 1 (as the previous version did) silently
# dropped the first real feature. The grid is sized from the number of columns
# rather than hard-coded.
columns = data.columns

n_cols = 2
n_rows = -(-len(columns) // n_cols)  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), squeeze=False)

# ax.ravel() walks the grid row by row, matching the original nested loop order
for axis, column in zip(ax.ravel(), columns):
    sns.kdeplot(data=data[column], ax=axis)
    axis.set_title(column, fontsize=15)

# Hide any axes left over when the features do not fill the grid exactly
for axis in ax.ravel()[len(columns):]:
    axis.set_visible(False)

In [None]:
# Check the skewness of every feature.
# Only `kurtosis` is imported from scipy: importing `skew` as well was dead code
# because the name is immediately rebound to the dict below.
from scipy.stats import kurtosis
columns = data.columns
# Map each column name to its sample skewness (pandas Series.skew)
skew = {col: data[col].skew() for col in columns}
# Show the features ordered from the lowest to the highest skewness
dict(sorted(skew.items(), key=lambda x : x[1]))

In [None]:
# Log-scale the features whose distribution is skewed (skewness of 1 or more).
# NOTE: re-running this cell applies the transform a second time.
skew_columns = [
    'sqft_lot', 'waterfront', 'sqft_lot15', 'yr_renovated', 'view', 'bedrooms',
    'sqft_basement', 'sqft_living', 'sqft_above', 'sqft_living15', 'condition',
]

for feature in skew_columns:
    # Assign the raw array so the values are written positionally
    data[feature] = np.log1p(data[feature].to_numpy())

In [None]:
# Plot the distribution of every log-scaled feature.
# The grid is sized from len(skew_columns): the previous fixed 5x2 grid had only
# ten axes for eleven features, so the last one was never drawn.
n_cols = 2
n_rows = -(-len(skew_columns) // n_cols)  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 4.8 * n_rows), squeeze=False)

# ax.ravel() walks the grid row by row, matching the original nested loop order
for axis, column in zip(ax.ravel(), skew_columns):
    sns.kdeplot(data=data[column], ax=axis)
    axis.set_title(column, fontsize=15)

# Hide any axes left over when the features do not fill the grid exactly
for axis in ax.ravel()[len(skew_columns):]:
    axis.set_visible(False)

In [None]:
# Check the correlation between the target and the features.
# The current (log-transformed) y is attached as `price` on a copy via assign(),
# so `train` itself keeps its original price column and earlier cells can be
# re-run safely. Non-numeric columns (the raw date string) are dropped
# explicitly because DataFrame.corr() raises on them in pandas >= 2.0.
corr_df = (
    train.assign(price=y)
    .select_dtypes(include='number')
    .corr()
    .round(2)
)
corr_price = corr_df['price']
corr_price

In [None]:
# Show the correlations with price as a one-column table
df = corr_price.to_frame()
df

In [None]:
# Preprocessing is done: split the combined frame back into train and test
# by position, using the row count recorded before concatenation.
train, test = data.iloc[:train_len], data.iloc[train_len:]


## 평가함수 및 모델 함수 선언

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# RMSE helper
def rmse(y_test, y_pred):
    """Return the root mean squared error after undoing the log1p transform.

    Both arguments are passed through ``np.expm1`` before the error is
    computed, so they are expected to be on the log1p scale.

    Args:
        y_test: true target values (log1p scale).
        y_pred: predicted target values (log1p scale).

    Returns:
        Square root of the mean squared error between the back-transformed values.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    return np.sqrt(mean_squared_error(actual, predicted))



In [None]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor



In [None]:
# Seed shared by every model (and by the hold-out split in get_scores)
random_state = 2020

# Instantiate the three regressors in a fixed order, all with the same seed
gboost, xgboost, lightgbm = (
    regressor_cls(random_state=random_state)
    for regressor_cls in (GradientBoostingRegressor, XGBRegressor, LGBMRegressor)
)

models = [gboost, xgboost, lightgbm]



In [None]:

def get_scores(models, train, y):
    """Fit each model on a hold-out split and rank the models by RMSE.

    The data is split once (80% fit / 20% validation, seeded with the
    notebook-level ``random_state``), so every model is scored on the same
    validation rows.

    Args:
        models: iterable of estimators exposing ``fit`` and ``predict``.
        train: feature table accepted by ``train_test_split``.
        y: target values aligned with ``train``; scores are computed by ``rmse``.

    Returns:
        DataFrame indexed by estimator class name with a single ``RMSE``
        column, sorted from the highest to the lowest score. Empty when
        ``models`` is empty.
    """
    # The split depends only on the data and the seed, so it is done once
    # instead of being repeated for every model.
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, random_state=random_state, test_size=0.2)

    scores = {}
    for model in models:
        model_name = model.__class__.__name__  # class name used as the row label
        model.fit(X_train, y_train)
        scores[model_name] = rmse(y_test, model.predict(X_test))

    # Build the result table once, after all models have been scored
    return pd.DataFrame(scores, index=['RMSE']).T.sort_values('RMSE', ascending=False)


In [None]:
# Score every baseline model on the hold-out split
get_scores(models=models, train=train, y=y)

## 하이퍼 파라미터 튜닝

In [None]:

from sklearn.model_selection import GridSearchCV

In [None]:
def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):
    """Run a 5-fold grid search and return one row per parameter combination.

    Args:
        model: estimator to tune.
        train: feature table used for fitting.
        y: target values. The ``RMSLE`` column is the square root of the
            cross-validated MSE, so it is an RMSLE only when ``y`` is
            log-transformed.
        param_grid: parameter grid passed to ``GridSearchCV``.
        verbose: verbosity level passed to ``GridSearchCV``.
        n_jobs: number of parallel jobs passed to ``GridSearchCV``.

    Returns:
        DataFrame with the tried parameters, the mean test ``score``
        (negative MSE) and ``RMSLE``, sorted by ``RMSLE`` ascending.
    """
    search = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=verbose,
        n_jobs=n_jobs,
    )
    search.fit(train, y)

    # One row per parameter combination, plus its mean cross-validated score
    cv_results = search.cv_results_
    results = pd.DataFrame(cv_results['params'])
    results['score'] = cv_results['mean_test_score']

    # The score is a negative MSE: flip the sign before taking the square root
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    return results.sort_values('RMSLE')

In [None]:
def AveragingBlending(models, x, y, sub_x):
    """Fit every model on ``x``/``y`` and average their predictions for ``sub_x``.

    Args:
        models: sequence of dicts, each holding an estimator under the
            ``'model'`` key.
        x: training features; ``x.values`` is passed to ``fit``.
        y: training target.
        sub_x: features to predict on; ``sub_x.values`` is passed to ``predict``.

    Returns:
        1-D array with the per-row mean of all model predictions.
    """
    # Fit all estimators first, then predict, in the order given
    for entry in models:
        entry['model'].fit(x.values, y)

    per_model = [entry['model'].predict(sub_x.values) for entry in models]

    # One column per model; average across the columns
    return np.column_stack(per_model).mean(axis=1)



In [None]:
def save_submission(model, train, y, test, model_name, rmsle=None, data_dir=None):
    """Fit ``model``, predict on ``test`` and write a submission CSV.

    Predictions are passed through ``np.expm1`` before being written, so the
    model is expected to be trained on a log1p-transformed target.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        train: training features.
        y: training target.
        test: features to predict on; must have as many rows as
            ``sample_submission.csv``.
        model_name: label embedded in the output file name.
        rmsle: score embedded in the output file name (``None`` is written
            literally when omitted).
        data_dir: directory holding ``sample_submission.csv`` and receiving the
            output file. Defaults to ``~/aiffel/kaggle_kakr_housing/data``.

    Returns:
        None. The path of the written file is printed.
    """
    if data_dir is None:
        # expanduser('~') rather than os.getenv('HOME'): getenv returns None
        # when HOME is unset, which would make the concatenation raise.
        data_dir = os.path.expanduser('~') + '/aiffel/kaggle_kakr_housing/data'

    # Load the template first so a missing file fails before the model is fitted
    submission_path = join(data_dir, 'sample_submission.csv')
    submission = pd.read_csv(submission_path)

    model.fit(train, y)
    prediction = np.expm1(model.predict(test))  # back to the original price scale

    submission['price'] = prediction
    submission_csv_path = '{}/submission_{}_RMSLE_{}.csv'.format(data_dir, model_name, rmsle)
    submission.to_csv(submission_csv_path, index=False)
    print('{} saved!'.format(submission_csv_path))