<span style="color:blue; font-size:150%">
<b>train.csv를 활용해서 데이터를 뜯어보고 모델을 학습시킨 후, test.csv 파일의 데이터에 대해 price를 예측해서 sample_submission.csv의 형식에 맞는 형태로 캐글에 제출합니다.</b></span>

### 시각화 그래프가 나타날 수 있도록 하기위해

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

print('얍💢')

### Data 가져와서 Read 하기

In [None]:
# Both CSV files sit in the same project data directory below $HOME.
data_root = os.getenv("HOME") + "/aiffel/kaggle_kakr_housing/data"
train_data_path = data_root + "/train.csv"
sub_data_path = data_root + "/test.csv"

In [None]:
data = pd.read_csv(train_data_path)
sub = pd.read_csv(sub_data_path)
print(f'train data dim : {data.shape}')
print(f'sub data dim : {sub.shape}')

In [None]:
# Detach the target column from the feature frame in a single step:
# `pop` returns the 'price' Series and removes it from `data`.
y = data.pop('price')

print(data.columns)
y.shape

In [None]:
train_len = len(data)
data = pd.concat((data, sub), axis=0)

print(len(data))

In [None]:
data.head()

In [None]:
msno.matrix(data)  #missing no

In [None]:
# 1. id 컬럼이 결측치인지 확인합니다.
null_check = pd.isnull(data['id'])
print(null_check)

In [None]:
# 2. 결측치인 데이터만 뽑아냅니다.
null_data = data.loc[null_check, 'id']  #[row index, column index]
null_data.head()

In [None]:
# 3. 결측치인 데이터의 개수를 셉니다.
print(f'id: {len(null_data.values)}')

In [None]:
# Count the missing values of every column in one vectorised pass and
# print them as "<column> : <count>".
for c, n_missing in data.isnull().sum().items():
    print('{} : {}'.format(c, n_missing))

#### train과 sub를 합친 후, id column을 지우기 전에 sub의 id를 따로 보관

In [None]:
sub_id = data['id'][train_len:]
del data['id']

print(data.columns)
data.head()

In [None]:
# Keep only the first six characters of each date string and store the result
# as an integer. The tuning half of this notebook converts `date` the same way
# (`.astype(int)`); leaving it as a string here would put a non-numeric column
# into the feature matrix that is later passed to the models via `x.values`.
data['date'] = data['date'].str[:6].astype(int)

data.head()

In [None]:
fig, ax = plt.subplots(9, 2, figsize=(12, 50))   # If horizontal scrolling makes the plots hard to read, adjust the x value of figsize.

# The first column (index 0) is skipped; the next 18 columns are drawn one
# per axis, filling the 9x2 grid row by row.
columns = data.columns
for axis, name in zip(ax.ravel(), columns[1:19]):
    sns.kdeplot(data=data[name], ax=axis)  # kernel density estimate of a single column
    axis.set_title(name, fontsize=15)

### 한 쪽으로 치우친 분포의 경우에는 로그 변환(log-scaling)을 통해 데이터 분포를 정규분포에 가깝게 만들 수 있음

In [None]:
skew_columns = ['bedrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_lot15', 'sqft_living15']

# Replace every listed column with log(1 + value) in one array operation.
data[skew_columns] = np.log1p(data[skew_columns].to_numpy())

print('얍💢')

In [None]:
columns = data[skew_columns].columns

fig, ax = plt.subplots(4, 2, figsize=(12, 25))   # If horizontal scrolling makes the plots hard to read, adjust the x value of figsize.

# At most seven columns are drawn on the 4x2 grid, so the last axis stays empty.
for axis, name in zip(ax.ravel(), columns[:7]):
    sns.kdeplot(data=data[name], ax=axis)
    axis.set_title(name, fontsize=15)
        

### Why log?

In [None]:
xx = np.linspace(0, 10, 500)  # 0~10사이 500개의 숫자를 순서대로 채우는 함수
yy = np.log(xx)

plt.hlines(0, 0, 10)  # horizontal line (y, xmin, xmax,)
plt.vlines(0, -5, 5)  # vertical line(x, ymin, ymax, )
plt.plot(xx, yy, c='r')
plt.show()

### Price 값 log 비교

 ### 0과 1000000 사이에 대부분의 값들이 몰려있고  ??

In [None]:
sns.kdeplot(y)
plt.show()

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15,3))

# Left panel: price on its original scale.
sns.kdeplot(y, ax=ax[0])

# Right panel: the same values after log(1 + price).
y_log_transformation = np.log1p(y)
sns.kdeplot(y_log_transformation, ax=ax[1])

plt.show()

In [None]:
sub = data.iloc[train_len:, :]  # Test data
x = data.iloc[:train_len, :]   # Train data

print(x.shape)
print(sub.shape)

## Part 1. Introduction to Ensemble Learning

### Average Blending (집값을 예측하는 회귀 문제이므로 각 모델 예측값의 평균을 사용)

In [None]:
gboost = GradientBoostingRegressor(random_state=2019)
xgboost = xgb.XGBRegressor(random_state=2019)
lightgbm = lgb.LGBMRegressor(random_state=2019)

models = [{'model':gboost, 'name':'GradientBoosting'}, {'model':xgboost, 'name':'XGBoost'},
          {'model':lightgbm, 'name':'LightGBM'}]

### K-fold Cross Validation : 학습용/ 평가용 데이터 세트를 나누는 방법론 중의 하나

In [None]:
def get_cv_score(models):
    """Print the mean 5-fold cross-validation score of each model.

    Args:
        models: iterable of dicts with a 'model' entry (the estimator) and a
            'name' entry (the label used in the printed line).

    Reads the notebook-level variables `x` (training features) and `y`
    (target); nothing is returned.
    """
    # Hand the splitter itself to cross_val_score. The previous
    # `KFold(...).get_n_splits(...)` call only returned the integer 5 and
    # threw the configured KFold object away.
    kfold = KFold(n_splits=5)
    features = x.values
    for m in models:
        fold_scores = cross_val_score(m['model'], X=features, y=y, cv=kfold)  # y = price
        print(f"Model: {m['name']}, CV score:{np.mean(fold_scores):.4f}")

### cross_val_score는 (회귀 모델의 기본 scoring 기준으로) $R^2$ 값을 return 함
### $R^2$ : 값이 1에 가까울수록 모델이 잘 학습되었다는 것을 나타냄

In [None]:
get_cv_score(models)  # cv : Cross Validation

### Average Blending - 각 모델을 전체 학습 데이터로 학습시킨 뒤 예측값을 평균 (아래 코드는 Stacking이나 K-Fold를 사용하지 않음)

In [None]:
def AveragingBlending(models, x, y, sub_x):
    """Fit every model on (x, y) and return the mean of their predictions.

    Args:
        models: sequence of dicts whose 'model' entry offers fit/predict.
        x: training features; its `.values` array is passed to `fit`.
        y: training target.
        sub_x: features to predict on; its `.values` array is passed to `predict`.

    Returns:
        Array holding, for each row of `sub_x`, the average of the per-model
        predictions.
    """
    # Train all estimators first ...
    for entry in models:
        entry['model'].fit(x.values, y)

    # ... then collect one prediction column per estimator and average row-wise.
    per_model = [entry['model'].predict(sub_x.values) for entry in models]
    stacked = np.column_stack(per_model)
    return np.mean(stacked, axis=1)

In [None]:
y_pred = AveragingBlending(models, x, y, sub)
print(len(y_pred))
y_pred

In [None]:
data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'

submission_path = join(data_dir, 'sample_submission.csv')
submission = pd.read_csv(submission_path)
submission.head()

In [None]:
result = pd.DataFrame({
    'id' : sub_id, 
    'price' : y_pred
})

result.head()

<span style = "color:green; font-size:150%"> 
<b>Q.앞에서 배웠던 join과 to_csv 메서드를 사용하여 'submission.csv' 파일로 저장해 보세요.</span>

In [None]:
my_submission_path =  join(data_dir, 'submission.csv') # [[YOUR CODE]]

# [[YOUR CODE]]
result.to_csv(my_submission_path, index = False)
print(my_submission_path)

## 최적의 모델을 찾아서, 하이퍼 파라미터 튜닝

<span style = "color: blue; font-size:110%"> 
<b>Train data 전처리하기</span>

In [None]:
data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'

train_data_path = join(data_dir, 'train.csv')
test_data_path = join(data_dir, 'test.csv') 

train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)

In [None]:
train.head()

In [None]:
train['date'] = train['date'].apply(lambda i: i[:6]).astype(int)
train.head()

In [None]:
y = train['price']
del train['price']

print(train.columns)

In [None]:
del train['id']

print(train.columns)

<span style = "color: blue; font-size:110%"> 
<b>Test data 전처리하기</span>

In [None]:
train.shape

<span style = "color:green; font-size:150%"> 
<b>Q. train 데이터와 마찬가지로 test 데이터도 전처리하는 코드를 작성해 보세요.</span>

In [None]:
# [[YOUR CODE]]
test.head()


In [None]:
test['date'] = test['date'].apply(lambda i: i[:6]).astype(int)

In [None]:
test.head()

In [None]:
print(test.columns)

In [None]:
sub_id = test["id"]

In [None]:
del test["id"]
test

In [None]:
test.head()
train.shape, test.shape

In [None]:
y

<span style="color:green; font-size:150%">
<b>Q. seaborn의 `kdeplot`을 활용해 `y`의 분포를 확인해주세요!</b></span>

In [None]:
sns.kdeplot(y)
plt.show()

In [None]:
y = np.log1p(y)
y

In [None]:
sns.kdeplot(y)
plt.show()

In [None]:
train.info()

<span style = "color: blue; font-size:110%"> 
<b>RMSE 계산</span>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
def rmse(y_test, y_pred):
    """Return the RMSE between two series after applying expm1 to both.

    In this notebook the target is stored as log1p(price), so expm1 brings
    both arguments back to that original scale before the error is measured.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    mse = mean_squared_error(actual, predicted)
    return np.sqrt(mse)

In [None]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [None]:
# random_state는 모델초기화나 데이터셋 구성에 사용되는 랜덤 시드값입니다. 
#random_state=None    # 이게 초기값입니다. 아무것도 지정하지 않고 None을 넘겨주면 모델 내부에서 임의로 선택합니다.  
random_state=2020        # 하지만 우리는 이렇게 고정값을 세팅해 두겠습니다. 

gboost = GradientBoostingRegressor(random_state=random_state)
xgboost = XGBRegressor(random_state=random_state)
lightgbm = LGBMRegressor(random_state=random_state)
rdforest = RandomForestRegressor(random_state=random_state)

models = [gboost, xgboost, lightgbm, rdforest]

print('얍💢')

In [None]:
gboost.__class__.__name__

## 다음과 같이 for문 안에서 각 모델 별로 학습 및 예측을 해볼 수 있죠.

In [None]:
def AveragingBlending(models, x, y, sub_x):
    """Fit every model on (x, y) and return the mean of their predictions.

    Args:
        models: sequence of estimators. Each entry may be a bare estimator
            with fit/predict (the format of the `models` list defined in this
            part of the notebook) or a dict holding the estimator under the
            'model' key (the format used in the first part).
        x: training features; its `.values` array is passed to `fit`.
        y: training target.
        sub_x: features to predict on; its `.values` array is passed to `predict`.

    Returns:
        Array holding, for each row of `sub_x`, the average of the per-model
        predictions.

    Raises:
        ValueError: if `models` is empty.
    """
    # Normalise both supported entry formats to a plain list of estimators.
    estimators = [m['model'] if isinstance(m, dict) else m for m in models]
    if not estimators:
        raise ValueError("models must contain at least one estimator")

    for estimator in estimators:
        estimator.fit(x.values, y)  # model training

    predictions = np.column_stack([
        estimator.predict(sub_x.values) for estimator in estimators
    ])
    return np.mean(predictions, axis=1)

In [None]:
def rmse(y_test, y_pred):
    """Root mean squared error computed after applying expm1 to both inputs.

    Same contract as the `rmse` defined a few cells earlier in this notebook.
    """
    squared_error = mean_squared_error(np.expm1(y_test), np.expm1(y_pred))
    return np.sqrt(squared_error)

In [None]:
df = {}

# Split train/validation 8:2 once. The split uses a fixed random_state and
# does not depend on the model, so repeating it per model only wasted time.
X_train, X_test, y_train, y_test = train_test_split(
    train, y, random_state=random_state, test_size=0.2)

for model in models:
    # Use the estimator's class name as its label.
    model_name = model.__class__.__name__

    # Train the model, then predict on the held-out part.
    model.fit(X_train.values, y_train)
    predictions = model.predict(X_test.values)

    # Store the RMSE of this model's predictions.
    df[model_name] = rmse(y_test, predictions)

# Build the score table once all models are evaluated (highest RMSE first).
score_df = pd.DataFrame(df, index=['RMSE']).T.sort_values('RMSE', ascending=False)
    

In [None]:
def get_scores(models, train, y):
    """Evaluate each model on a fixed 8:2 hold-out split and tabulate the RMSE.

    Args:
        models: iterable of estimators offering fit/predict.
        train: feature table.
        y: target aligned with `train`.

    Returns:
        DataFrame indexed by estimator class name with a single 'RMSE' column,
        sorted from highest to lowest RMSE. Empty when `models` is empty
        (previously this case raised UnboundLocalError).

    Uses the notebook-level `random_state` for the split and the
    notebook-level `rmse` helper for scoring.
    """
    # The split is the same for every model, so it is computed only once.
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, random_state=random_state, test_size=0.2)

    scores = {}
    for model in models:
        model.fit(X_train, y_train)
        scores[model.__class__.__name__] = rmse(y_test, model.predict(X_test))

    # Build the table after the loop instead of rebuilding it per model.
    score_df = pd.DataFrame(scores, index=['RMSE']).T.sort_values('RMSE', ascending=False)
    return score_df

predictions.shape, test.shape, train.shape, y.shape, type(test)

get_scores(models, train, y)

### Hyperparameter Tuning : Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [1, 10],
}

In [None]:
model = LGBMRegressor(random_state=random_state)   #LightGBM : Ensemble Boosting 기법중 하나

In [None]:
# Exhaustive search over `param_grid` with 5-fold cross-validation, scored by
# negative mean squared error and run on 5 parallel jobs.
grid_model = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=5,
)

grid_model.fit(train, y)

In [None]:
grid_model.cv_results_

In [None]:
params = grid_model.cv_results_['params']
params

In [None]:
score = grid_model.cv_results_['mean_test_score']
score

In [None]:
results = pd.DataFrame(params)
results['score'] = score
results

In [None]:
results['RMSE'] = np.sqrt(-1 * results['score'])
results

In [None]:
results = results.rename(columns={'RMSE': 'RMSLE'})
results

In [None]:
results = results.sort_values('RMSLE')
results

### Submission

In [None]:
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [1, 10],
}

model = LGBMRegressor(random_state=random_state)

In [None]:
grid_model = GridSearchCV(model, param_grid=param_grid, \
                        scoring='neg_mean_squared_error', \
                        cv=5, verbose=1, n_jobs=5)

grid_model.fit(train, y)

In [None]:
# Build the final model from the best parameter combination found by the grid
# search above instead of copying the winning values in by hand, so the two
# cannot drift apart when the grid changes.
model = LGBMRegressor(**grid_model.best_params_, random_state=random_state)
model.fit(train, y)
prediction = model.predict(test)
prediction

<span style = "color:green; font-size:150%"> 
<b>Q.앞에서 로그 변환을 했던 것을 다시 원래 스케일로 되돌리는 코드를 작성하세요.</span>

In [None]:
prediction = np.expm1(prediction)# [[YOUR CODE]]
prediction

In [None]:
data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'

submission_path = join(data_dir, 'sample_submission.csv')
submission = pd.read_csv(submission_path)
submission.head()

In [None]:
submission['price'] = prediction
submission.head()

In [None]:
submission_csv_path = '{}/submission_{}_RMSLE_{}.csv'.format(data_dir, 'lgbm', '0.164399')
submission.to_csv(submission_csv_path, index=False)
print(submission_csv_path)

In [None]:
def save_submission(model, train, y, test, model_name, rmsle=None):
    """Fit `model`, predict on `test` and write a submission CSV.

    The predictions are passed through expm1 (the inverse of the log1p applied
    to `y` earlier in this notebook) and stored in the 'price' column of the
    sample submission file. The result is written next to it as
    submission_<model_name>_RMSLE_<rmsle>.csv and the path is printed.

    Args:
        model: estimator offering fit/predict.
        train: training features.
        y: training target.
        test: features to predict on.
        model_name: label embedded in the output file name.
        rmsle: score embedded in the output file name.
    """
    model.fit(train, y)
    price = np.expm1(model.predict(test))

    data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'
    submission = pd.read_csv(join(data_dir, 'sample_submission.csv'))
    submission['price'] = price

    submission_csv_path = '{}/submission_{}_RMSLE_{}.csv'.format(data_dir, model_name, rmsle)
    submission.to_csv(submission_csv_path, index=False)
    print('{} saved!'.format(submission_csv_path))

In [None]:
save_submission(model, train, y, test, 'lgbm', rmsle='0.164399')

## Leaderboard 정복하기

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [1, 10],
}

model = LGBMRegressor(random_state=random_state)

In [None]:
param_grid = {
    # LightGBM documents `num_iterations` as an alias of `n_estimators`, so
    # listing both gave two conflicting settings for one parameter and doubled
    # the number of fits. The former `num_iterations` value (200) is merged
    # into this list instead.
    'n_estimators': [50, 100, 200],
    'max_depth': [1, 10],
    'learning_rate': [0.0001, 0.001, 0.1],
    # 'boosting_type': ["gbdt", 'dart'],
    'num_leaves': [20],
}

In [None]:
model = LGBMRegressor(random_state=random_state)

In [None]:
grid_model = GridSearchCV(model, param_grid=param_grid, \
                        scoring='neg_mean_squared_error', \
                        cv=5, verbose=1, n_jobs=5)

grid_model.fit(train, y)

In [None]:
params = grid_model.cv_results_['params']
params

In [None]:
score = grid_model.cv_results_['mean_test_score']
score

In [None]:
results = pd.DataFrame(params)
results['score'] = score
results

In [None]:
results['RMSE'] = np.sqrt(-1 * results['score'])
results

In [None]:
results = results.rename(columns={'RMSE': 'RMSLE'})
results

In [None]:
results = results.sort_values('RMSLE')
results

In [None]:
# Submit the configuration the grid search actually selected. `model` is still
# the untuned default LGBMRegressor at this point, so passing it would ignore
# the search. The file label is the cross-validated score of that
# configuration (sqrt of the negated mean neg-MSE on the log-scaled target),
# computed rather than typed in by hand.
best_rmsle = np.sqrt(-grid_model.best_score_)
save_submission(grid_model.best_estimator_, train, y, test, 'lgbm', rmsle=f'{best_rmsle:.6f}')

learningR = [0.1, 0.01, 0.001]

# Train one LightGBM model per learning rate and collect its test-set
# predictions as one column each. The values stay on the scale the model was
# trained on (y is log1p-transformed above); apply np.expm1 to get prices.
result = pd.DataFrame()
for lr in learningR:
    # Pass the loop value through: the rate used to be hard-coded to 0.1,
    # so all three runs trained the same model.
    model = LGBMRegressor(max_depth=50, n_estimators=150, learning_rate=lr, random_state=random_state)
    model.fit(train, y)
    prediction = model.predict(test)

    # `pd.DataFrame[...]` subscripted the class itself and raised TypeError;
    # store each run as a named column instead.
    result[f'lr_{lr}'] = prediction

result