# Load Dataset

In [None]:
import pandas as pd
import numpy as np

import warnings
# Install a global "ignore" filter so that every warning category is suppressed
# from here on (presumably to keep the notebook output uncluttered).
warnings.filterwarnings(action="ignore")

In [None]:
# Training split; the "datetime" column is parsed into timestamps on load.
train = pd.read_csv(
    "data/train.csv",
    parse_dates=["datetime"],
)
train.shape

In [None]:
# Test split; the "datetime" column is parsed into timestamps on load.
test = pd.read_csv(
    "data/test.csv",
    parse_dates=["datetime"],
)
test.shape

## Feature Engineering

In [None]:
# Expand the timestamp into one column per calendar/time component on the
# training frame. Each column is named after the `.dt` accessor attribute it
# is read from, and the columns are added in the order listed.
for _part in ("year", "month", "day", "hour", "minute", "second", "dayofweek"):
    train[_part] = getattr(train["datetime"].dt, _part)
train.shape

In [None]:
# Expand the timestamp into one column per calendar/time component on the
# test frame. Each column is named after the `.dt` accessor attribute it is
# read from, and the columns are added in the order listed.
for _part in ("year", "month", "day", "hour", "minute", "second", "dayofweek"):
    test[_part] = getattr(test["datetime"].dt, _part)
test.shape

In [None]:
# Replace wind-speed readings of exactly 0 with a fill value
# (presumably zeros stand for missing measurements -- confirm with the data source).
#
# The fill value is the mean of the training "windspeed" column, computed once
# BEFORE any replacement so that train and test receive the same number.
# Each frame is masked with its OWN zero rows: the previous code indexed `test`
# with a condition built from `train`, evaluated after train's zeros had
# already been overwritten, so no test row was ever updated.
windspeed_fill = train["windspeed"].mean()

train.loc[train["windspeed"] == 0, "windspeed"] = windspeed_fill
test.loc[test["windspeed"] == 0, "windspeed"] = windspeed_fill

In [None]:
# Columns fed to the models, in order: categorical calendar/weather flags,
# numeric weather readings, then the parts derived from the timestamp.
feature_names = (
    ["season", "weather", "holiday", "workingday"]
    + ["temp", "atemp", "humidity"]
    + ["year", "hour", "dayofweek"]
)

feature_names

In [None]:
# Training feature matrix: every row, restricted to the selected feature columns.
X_train = train.loc[:, feature_names]

print(X_train.shape)
X_train.head()

In [None]:
# Test feature matrix: every row, restricted to the same feature columns as training.
X_test = test.loc[:, feature_names]

print(X_test.shape)
X_test.head()

In [None]:
# Name of the target column the models are trained to predict.
label_name = "count"

# Training labels: the target column taken from the training frame.
y_train = train.loc[:, label_name]

print(y_train.shape)
y_train.head()

# Score
## RMSLE
과대평가된 항목보다는 과소평가된 항목에 더 큰 패널티를 준다.

예측값과 실제값에 각각 로그를 취한 뒤, 그 오차(Error)를 제곱(Square)해서 평균(Mean)한 값의 제곱근(Root)으로, 값이 작을수록 정밀도가 높다.

0에 가까운 값이 나올 수록 정밀도가 높은 값이다.

Submissions are evaluated on the Root Mean Squared Logarithmic Error (RMSLE).

$$ \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 } $$

* \\({n}\\) is the number of hours in the test set
* \\(p_i\\) is your predicted count
* \\(a_i\\) is the actual count
* \\(\log(x)\\) is the natural logarithm

* 좀 더 자세한 설명은 : [RMSLE cost function](https://www.slideshare.net/KhorSoonHin/rmsle-cost-function)

In [None]:
import numpy as np
from sklearn.metrics import make_scorer
def rmsle(predict, actual):
    """Return the Root Mean Squared Logarithmic Error of two value sequences.

    Computes ``sqrt(mean((log(predict + 1) - log(actual + 1)) ** 2))``.
    The expression is symmetric in its two arguments.

    Args:
        predict: Array-like of predicted values.
        actual: Array-like of observed values, with a shape that matches
            (or broadcasts against) ``predict``.

    Returns:
        The RMSLE as a NumPy float. ``0.0`` means both inputs are identical.
        The result is NaN when any value is below -1 (the logarithm is
        undefined there) or when the inputs are empty.
    """
    predict = np.asarray(predict, dtype=float)
    actual = np.asarray(actual, dtype=float)

    # log1p(x) evaluates log(1 + x) without first rounding 1 + x, so values
    # very close to zero keep their precision instead of collapsing to log(1).
    log_error = np.log1p(predict) - np.log1p(actual)

    return np.sqrt(np.mean(np.square(log_error)))

# Wrap rmsle as a scikit-learn scorer. greater_is_better=True is make_scorer's
# default and is spelled out here: the scorer returns the raw, positive RMSLE,
# which is what the "Score=" printouts in this notebook report.
# NOTE(review): for model selection (e.g. a grid search) an error metric would
# normally be registered with greater_is_better=False -- confirm before reuse.
rmsle_scorer = make_scorer(rmsle, greater_is_better=True)
rmsle_scorer

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold splitter with shuffling; random_state=0 pins the shuffle.
# NOTE(review): the cross_val_score calls visible in this notebook pass cv=20
# rather than this object -- confirm which splitting scheme was intended.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

## RandomForest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Empty list; nothing is appended to it in this cell.
max_depth_list = list()

# Random-forest baseline: 10 trees, n_jobs=-1, fixed random_state.
model = RandomForestRegressor(
    random_state=0,
    n_jobs=-1,
    n_estimators=10,
)
model

In [None]:
# Mean RMSLE across the cross-validation folds requested by cv=20.
# The closer the value is to 0, the better the fit.
score = cross_val_score(
    model,
    X_train,
    y_train,
    cv=20,
    scoring=rmsle_scorer,
).mean()
print("Score= {0:.5f}".format(score))

## xgboost

In [None]:
import xgboost as xgb
# The target ("count") is a numeric quantity scored with RMSLE, so this is a
# regression task. XGBClassifier would treat every distinct count as its own
# class label; XGBRegressor is the matching estimator. All hyper-parameters
# are carried over unchanged.
model = xgb.XGBRegressor(n_estimators=10,
                         max_depth=3,
                         learning_rate=0.1,
                         max_delta_step=8.8,
                         subsample=0.8,
                         nthread=4,
                         seed=0)

In [None]:
# score = cross_val_score(model, X_train, y_train, cv=20, scoring=rmsle_scorer)
# score = score.mean()

# print("Score= {0:.5f}".format(score))

## Lgbm

In [None]:
import lightgbm as lgb

# LightGBM regressor: 1000 boosting rounds at a learning rate of 0.01,
# with trees limited to 10 leaves each.
model = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=10,
    nthread=4,
    seed=0,
)

In [None]:
# Mean RMSLE across the cross-validation folds requested by cv=20.
score = cross_val_score(
    model,
    X_train,
    y_train,
    cv=20,
    scoring=rmsle_scorer,
).mean()

print("Score= {0:.5f}".format(score))

## Train

In [None]:
# Train the model ("fitting", as in fitting clothes): given the features and
# the labels, the estimator learns the mapping on its own.
model.fit(X_train, y_train)

In [None]:
# Predict the target for every row of the test feature matrix.
predictions = model.predict(X_test)

print(predictions.shape)
predictions[:10]

# Submit

In [None]:
# Load the sample submission and overwrite its "count" column with the
# model's predictions (presumably the file already has the expected row
# order and columns -- confirm against the competition format).
submission = pd.read_csv("data/sampleSubmission.csv")

# A count cannot be negative, but a regressor may still emit negative values.
# Floor them at 0, the nearest valid count. The previous np.abs() mirrored
# them instead, turning e.g. -30 into a prediction of 30.
submission["count"] = np.clip(predictions, 0, None)

print(submission.shape)
submission.head()

In [None]:
# Write the submission to disk; index=False leaves the DataFrame index out of the file.
submission.to_csv("data/submission.csv", index=False)