# Bike Rental Prediction


### 1. 시간별 바이크 대여 횟수를 통해 해당 날짜에 바이크 대여가 N대 이하일 경우를 예측한다.
* 계절에 따른 날씨 데이터를 기반하여 예측한다.
* 사람들의 활동시간에 따라 예측한다.
* 휴일이냐 아니냐에 따라 예측한다.

### 2. 대여 업체가 서비스 제공에 미리 대비할 수 있도록, 대비가 필요한 날짜에 관한 사전 정보를 제공한다.
*   목적: 사전에 {연료 충당 & 바이크 정비 & 대체품 수급 & 기타 서비스 준비} 하도록 돕는다.
* 바이크 수요가 증가하는 특정 시간대 or 특정 날씨조건에 대한 정보 제공


#### * 주의사항
1. 모든 날짜가 ***24시간의 데이터를 가지고 있지는 않다.***
2. 데이터의 양이 부족하여 과적합이 일어나기 쉽다.
3. 날씨에 관한 데이터는 시계열성이 존재하므로 train-dataset과 test-dataset을 나눌 때 이를 고려해야 한다.
#### * 해결방법
* Nested Cross-Validation을 이용해본다 (Regular Day-Forward Chaining, 참고: [Time Series Nested Cross-Validation](https://towardsdatascience.com/time-series-nested-cross-validation-76adba623eb9))

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset can be read from it.
from google.colab import drive
drive.mount('/gdrive') 
# Location of the hourly bike-rental CSV on the mounted drive.
file = '/gdrive/My Drive/CodeStates/Section_2/Sprint3/Project/Bike_rental/hour.csv'
# url='https://www.kaggle.com/swatikhedekar/rental-bike-share/download'

In [None]:
import pandas as pd
# df1 keeps the CSV exactly as loaded; all later work is done on the copy `df`.
df1 = pd.read_csv(file)
df = df1.copy()
df.head()

In [None]:
# df['dteday'].value_counts()

In [None]:
%%capture
# !pip install pandas_profiling -U

In [None]:
# from pandas_profiling import ProfileReport
# profile = ProfileReport(df)
# # profile


In [None]:
from datetime import date
# df['dteday'] = pd.to_datetime(df['dteday'])

In [None]:
# How many dates have 24 hourly rows, how many have 23, and so on.
df.groupby('dteday').hr.count().value_counts()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Check the class balance for a trial target definition.
# `proportion` is a boolean mask: True where the hourly rental count is below 100.
proportion = df['cnt'] < 100
# The mean of a boolean mask is the fraction of True values. This replaces
# value_counts(normalize=True)[1], whose integer key on a boolean index is
# resolved differently across pandas versions (label vs. position).
base = proportion.mean()


In [None]:
import matplotlib.pyplot as plt

# Set the font size before drawing; text created earlier is not restyled
# by a later rcParams change.
plt.rcParams.update({'font.size': 35,})

fig = plt.figure(figsize=(8,8))
fig.set_facecolor('white')

# Share of rows with cnt < 100 ("Dayoff") versus the rest ("Open").
# Computed from the mask directly instead of indexing value_counts() by 0/1,
# which does not reliably select the True/False entries.
dayoff_share = proportion.mean()

ax = fig.add_subplot()
ax.pie(x=[dayoff_share, 1 - dayoff_share],
       labels=['Dayoff', 'Open'],
       colors=['lightgray', 'black'],
       autopct='%.2f%%');

# Target Feature 생성 (대여 횟수가 100 미만인지 여부)

In [None]:
# Binary target: 1.0 where the hourly count is below 100, otherwise 0.0.
df['dayoff'] = proportion.astype(float)

In [None]:
# Working frame without the raw count column that the target was derived from.
df_nocnt = df.drop(columns=['cnt'])

In [None]:
df_nocnt

## 데이터의 시계열성을 유지하는 train / test dataset 분리
* 전체 데이터 중 후반 20%를 test dataset으로 구분
* Validation set은 전체 데이터의 60%~80% 구간(test를 제외한 앞 80% 중 마지막 1/4)으로 구분 — 결과적으로 train : validation : test = 60 : 20 : 20
* **일단 'cnt' 특성이 없는 데이터셋으로 먼저 한다.**

In [None]:
import numpy as np

# Chronological 60 / 20 / 20 split: rows are taken in file order with no shuffling.
# Positional .iloc slicing is used so the split does not depend on the frame
# having a default 0..n-1 index, and each part is copied so that later
# in-place edits cannot touch df_nocnt.
n_rows = len(df_nocnt)
split_point = int(n_rows * 4 / 5)   # first row of the test partition
split_val = int(n_rows * 3 / 5)     # first row of the validation partition

df_train1 = df_nocnt.iloc[:split_point].copy()
df_test = df_nocnt.iloc[split_point:].copy()

df_train = df_train1.iloc[:split_val].copy()
df_val = df_train1.iloc[split_val:].copy()
df_train

In [None]:
len(df_train) + len(df_val) + len(df_test) == len(df_nocnt)

In [None]:
# Label column, and every other column of df_nocnt as candidate features.
target = 'dayoff'
feature = df_nocnt.drop(target, axis=1).columns

In [None]:
# Feature matrix and label vector for each chronological partition.
X_train, y_train = df_train[feature], df_train[target]
X_val, y_val = df_val[feature], df_val[target]
X_test, y_test = df_test[feature], df_test[target]

#특성공학 진행 (특성 추가 및 삭제)

In [None]:

def engineer(df, drop_cols=None):
    """Add a combined ``Weekday`` code and drop superseded columns, in place.

    ``Weekday`` encoding:
        0 = Monday-Thursday, 1 = Friday, 2 = Saturday, 3 = Sunday,
        4 = holiday that falls on Monday-Thursday.
    A holiday on Friday, Saturday or Sunday keeps that day's own code.

    Args:
        df: DataFrame containing ``weekday`` (0 = Sunday ... 6 = Saturday),
            ``holiday`` (1 = holiday) and every column listed in ``drop_cols``.
        drop_cols: Columns to remove after the encoding. Defaults to the
            identifier, calendar and leakage columns used for the main model.

    Returns:
        The same ``df`` object; it is modified in place.

    Raises:
        KeyError: If a required or to-be-dropped column is missing.
    """
    if drop_cols is None:
        drop_cols = ['instant', 'weekday', 'workingday', 'holiday', 'dteday',
                     'atemp', 'registered', 'casual']

    day = df['weekday']
    # Only Monday-Thursday holidays get their own code, so the codes never overlap.
    midweek_holiday = day.between(1, 4) & (df['holiday'] == 1)
    # Friday/Saturday/Sunday map to 1/2/3; every other day starts at 0.
    day_code = day.map({5: 1, 6: 2, 0: 3}).fillna(0).astype(int)
    df['Weekday'] = day_code + 4 * midweek_holiday.astype(int)

    df.drop(list(drop_cols), axis=1, inplace=True)
    return df

# engineer() edits each frame in place (it drops with inplace=True), so the
# return values are not reassigned.
engineer(X_train)
engineer(X_val)
engineer(X_test)

In [None]:
%%capture
!pip install pdpbox -U
!pip install eli5 -U
!pip install category_encoders
!pip install xgboost -U

In [None]:
from pdpbox.pdp import pdp_isolate, pdp_plot
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report 
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score


## Baseline model
* 원본 데이터셋에서 target(=1)이 나타나는 비율을 확률로 하여, 각 데이터에 대해 무작위로 1을 예측하는 모델

In [None]:
# Random-guess baseline: predicts 1 for each row with a fixed probability.
def baseline(X, y, p=None):
    """Print accuracy and a classification report for a random baseline.

    Args:
        X: Data whose length decides how many predictions are drawn.
        y: True labels, same length as ``X``.
        p: Probability of predicting 1. Defaults to the module-level ``base``.

    Returns:
        None. The metrics are printed.
    """
    if p is None:
        p = base
    # Re-seeds NumPy's global RNG so repeated calls draw the same predictions.
    np.random.seed(77)
    y_pred = np.random.binomial(n=1, p=p, size=len(X))

    print('Baseline model 검증 정확도: ', accuracy_score(y, y_pred))

    # classification_report expects (y_true, y_pred); the swapped order used
    # before exchanged precision with recall and reported the wrong support.
    print(classification_report(y, y_pred))



In [None]:
baseline(X_val,y_val)

#Model 1

In [None]:
# Untuned reference model: a one-step pipeline around an XGBoost classifier.
# (An OrdinalEncoder step was tried earlier and is not part of the pipeline.)
xgb_params = dict(
    n_estimators=100,
    max_depth=7,
    n_jobs=-1,
    random_state=77,
    learning_rate=0.2,
)
pipe1 = make_pipeline(XGBClassifier(**xgb_params))
pipe1.fit(X_train, y_train);

In [None]:
# Score the untuned pipeline on the validation split.
y_pred = pipe1.predict(X_val)
print('검증 정확도: ', accuracy_score(y_val, y_pred))

# classification_report takes (y_true, y_pred); the reversed order swapped
# precision and recall in the printed table.
print(classification_report(y_val, y_pred))

In [None]:
# Score the untuned pipeline on the held-out test split.
y_pred = pipe1.predict(X_test)
print('테스트 정확도: ', accuracy_score(y_test, y_pred))

# classification_report takes (y_true, y_pred); the reversed order swapped
# precision and recall in the printed table.
print(classification_report(y_test, y_pred))

## RandomizedSearchCV로 최적화된 Hyper-parameter 찾기

In [None]:
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

dists = {
    'xgbclassifier__n_estimators' : range(100, 201, 10),
    'xgbclassifier__max_depth' : range(1,5, 1),
}

# The search space is a finite grid, so never ask for more candidates than it holds.
n_candidates = (len(dists['xgbclassifier__n_estimators'])
                * len(dists['xgbclassifier__max_depth']))

clf1 = RandomizedSearchCV(
    pipe1,
    param_distributions=dists,
    n_iter=min(50, n_candidates),
    # The rows are time-ordered: forward-chaining folds always validate on data
    # that comes after the training fold, unlike plain k-fold.
    cv=TimeSeriesSplit(n_splits=3),
    # Fixed seed so the sampled candidates are reproducible.
    random_state=77,
)

clf1.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the search and its mean cross-validated score.
print('최적 하이퍼파라미터: ', clf1.best_params_)
print(clf1.best_score_)

In [None]:
# Pipeline rebuilt with the best parameters. The search was created with
# scikit-learn's default refit behaviour, so it should already be fitted on
# X_train and the explicit fit below stays disabled.
pipe_renew = clf1.best_estimator_
# pipe_renew.fit(X_train, y_train)

In [None]:
# Score the tuned pipeline on the validation split.
y_pred = pipe_renew.predict(X_val)

print('검증 정확도: ', accuracy_score(y_val, y_pred))

# classification_report takes (y_true, y_pred); the reversed order swapped
# precision and recall in the printed table.
print(classification_report(y_val, y_pred))

In [None]:
# Score the tuned pipeline on the held-out test split.
y_pred_test = pipe_renew.predict(X_test)

print('테스트 정확도: ', accuracy_score(y_test, y_pred_test))

# classification_report takes (y_true, y_pred); the reversed order swapped
# precision and recall in the printed table.
print(classification_report(y_test, y_pred_test))

In [None]:
from sklearn.metrics import roc_curve
y_pred_proba = pipe_renew.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_test, y_pred_proba) #y_pred_proba = pipe.predict_proba(X_val)[:, 1]

roc = pd.DataFrame({
    'FPR(Fall-out)': fpr, 
    'TPRate(Recall)': tpr, 
    'Threshold': threshold
})
roc #Threshold값 변화에 따라 값들을 보여줌

plt.scatter(fpr, tpr)
plt.title('ROC curve')
plt.xlabel('FPR(Fall-out)')
plt.ylabel('TPR(Recall)');



In [None]:
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = threshold[optimal_idx]

print('idx:', optimal_idx, ', threshold:', optimal_threshold)

y_pred_optimal = y_pred_proba >= optimal_threshold

In [None]:
y_pred_optimal = y_pred_optimal.astype(float)

In [None]:
print('테스트 정확도: ', accuracy_score(y_test, y_pred_optimal))

print(classification_report(y_pred_optimal, y_test))

##특성중요도 확인 : Permutation Importances, PDP, or SHAP 

In [None]:
# Permutation importance of the XGBoost step, scored by accuracy on the
# validation split with 5 shuffles per feature.
# NOTE(review): this inspects pipe1 (the untuned model), not the tuned
# pipe_renew — confirm that is intended.
permuter = PermutationImportance(
                                pipe1.named_steps['xgbclassifier'],
                                scoring = 'accuracy',
                                n_iter = 5,
                                random_state = 77
)

permuter.fit(X_val, y_val)

In [None]:
# Importances labelled by feature name, largest first.
feature_names = X_val.columns.tolist()
pd.Series(permuter.feature_importances_, feature_names).sort_values(ascending=False)

In [None]:
# eli5 table of the permutation importances; top=None lists every feature.
eli5.show_weights(
    permuter,
    top=None,
    feature_names = feature_names
)

In [None]:
import plotly.graph_objects as go

# Two unnamed columns: 0 = feature name, 1 = permutation importance.
importance_result = pd.DataFrame(zip(X_val.columns.tolist(), permuter.feature_importances_))

# Grey bars, with the fourth bar highlighted.
colors = ['lightslategray'] * len(importance_result)
colors[3] = 'palegreen'

fig = go.Figure(
    data=[
        go.Bar(
            x=importance_result[0],
            y=importance_result[1],
            text=importance_result[0],
            textposition='auto',
            marker_color=colors,
        )
    ]
)
fig.update_xaxes(title_text='Feature')
fig.update_yaxes(title_text='Importance')
# Title and fixed canvas size are applied in one call; its return value
# (the figure) is what the cell displays.
fig.update_layout(
    title_text='Feature Importance',
    autosize=False,
    width=500,
    height=650,
)

In [None]:
# !pip install shap -U

In [None]:
import shap

# SHAP tree explainer for the XGBoost step of pipe1 (the untuned model).
explainer = shap.TreeExplainer(pipe1.named_steps['xgbclassifier'])
shap.initjs()
# Only the first 100 validation rows are explained and plotted.
shap_values = explainer.shap_values(X_val.iloc[:100])
shap.force_plot(explainer.expected_value, shap_values, X_val.iloc[:100])

#시각화에 필요한 자료

In [None]:
df.head()

In [None]:
# Mean rental count for each hour of the day.
hr_cnt = df.groupby('hr').cnt.mean().reset_index()

In [None]:


# Grey bars, with the bars at positions 8, 17 and 18 highlighted.
highlight = {8: 'lightsalmon', 17: 'crimson', 18: 'royalblue'}
colors = ['lightslategray'] * len(hr_cnt['hr'])
for position, shade in highlight.items():
    colors[position] = shade

fig = go.Figure(
    data=[
        go.Bar(
            x=hr_cnt['hr'],
            y=hr_cnt['cnt'],
            text=hr_cnt['hr'],
            textposition='auto',
            marker_color=colors,
        )
    ]
)
fig.update_xaxes(title_text='Hour')
fig.update_yaxes(title_text='Rental Counts')
# Title, canvas size and font are applied in one call; its return value
# (the figure) is what the cell displays.
fig.update_layout(
    title_text='Average Bike Rental Counts per hour',
    autosize=False,
    width=1400,
    height=700,
    font=dict(size=18),
)

In [None]:
# Mean hourly rental count for each value of the raw `weekday` column.
day_cnt = df.groupby('weekday').cnt.mean().reset_index()

In [None]:
# Grey bars, with the bars at positions 4 and 5 highlighted.
colors = ['lightslategray', ] * len(day_cnt['weekday'])
colors[4] = 'forestgreen'
# colors[17] = 'darkmagenta'
colors[5] = 'blueviolet'

# Bar labels, in the order of the grouped weekday codes (0 first).
# Fixes the misspelled 'Wendsday' label.
day_labels = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

fig = go.Figure(data=[go.Bar(
    x = day_cnt['weekday'],
    y = day_cnt['cnt'],
    text= day_labels,
    textposition='auto',
    marker_color = colors
)])
fig.update_layout(title_text = 'Average Bike Rental Counts per Day of the week')
fig.update_xaxes(
    title_text = 'Weekday'
)
fig.update_yaxes(
    title_text = 'Rental Counts'
)
# The y-axis is deliberately zoomed to 175-200; bar heights are not proportional.
fig.update_layout(yaxis_range=[175,200])
fig.update_layout(
    autosize=False,
    width=900,
    height=700,
    font=dict(
        size=15,))

#결론 :
* 날씨조건과 요일, 시간 데이터를 가지고 자전거 대여 횟수를 예측하여 정비시간을 정할 수 있다.
* 정비 시간으로는 오전 9시부터 오후 5시까지가 적당하고, 요일로는 월요일과 일요일이 적당하다.

#한계 :
* 데이터의 양이 적어서 과적합의 위험이 있다.
* 데이터의 특성이 너무 간단하여 영향력있는 특성을 추출하지 못한다.

## Hour, Weekday 등 중요도가 높은 특성들을 제외하고 모델을 학습해서 날씨 관련 특성들의 영향력을 확인해보자.

In [None]:
# Fresh feature/label selections for the weather-only experiment.
X_train1, y_train1 = df_train[feature], df_train[target]
X_val1, y_val1 = df_val[feature], df_val[target]
X_test1, y_test1 = df_test[feature], df_test[target]

In [None]:
def engineer(df, drop_cols=None):
    """Drop calendar/time columns in place, keeping the remaining features.

    A combined ``Weekday`` code is computed first (0 = Monday-Thursday,
    1 = Friday, 2 = Saturday, 3 = Sunday, 4 = holiday on Monday-Thursday).
    The default ``drop_cols`` removes it again together with ``hr``; pass a
    custom list to keep it.

    Args:
        df: DataFrame containing ``weekday`` (0 = Sunday ... 6 = Saturday),
            ``holiday`` (1 = holiday) and every column listed in ``drop_cols``.
        drop_cols: Columns to remove after the encoding. Defaults to the
            identifier, calendar, hour and leakage columns.

    Returns:
        The same ``df`` object; it is modified in place.

    Raises:
        KeyError: If a required or to-be-dropped column is missing.
    """
    if drop_cols is None:
        drop_cols = ['instant', 'weekday', 'workingday', 'holiday', 'dteday',
                     'hr', 'Weekday', 'atemp', 'registered', 'casual']

    day = df['weekday']
    # Only Monday-Thursday holidays get their own code, so the codes never overlap.
    midweek_holiday = day.between(1, 4) & (df['holiday'] == 1)
    # Friday/Saturday/Sunday map to 1/2/3; every other day starts at 0.
    day_code = day.map({5: 1, 6: 2, 0: 3}).fillna(0).astype(int)
    df['Weekday'] = day_code + 4 * midweek_holiday.astype(int)

    df.drop(list(drop_cols), axis=1, inplace=True)
    return df

# engineer() edits each frame in place (it drops with inplace=True), so the
# return values are not reassigned.
engineer(X_train1)
engineer(X_val1)
engineer(X_test1)

In [None]:
# NOTE: this refits pipe_renew in place on the reduced feature set, so from
# here on it can no longer score the full-feature frames (X_val, X_test).
pipe_renew.fit(X_train1, y_train1)

y_pred_val1 = pipe_renew.predict(X_val1)

# Score against y_val1, the labels selected alongside X_val1.
print('검증 정확도: ', accuracy_score(y_val1, y_pred_val1))

# classification_report takes (y_true, y_pred); the reversed order swapped
# precision and recall in the printed table.
print(classification_report(y_val1, y_pred_val1))

In [None]:
y_train1.value_counts()

In [None]:
# Permutation importance of the refitted weather-only model, scored by
# accuracy on the validation split with 5 shuffles per feature.
permuter1 = PermutationImportance(
    pipe_renew.named_steps['xgbclassifier'],
    scoring='accuracy',
    n_iter=5,
    random_state=77,
)

# Use y_val1, the labels selected alongside X_val1, rather than the y_val of
# the earlier experiment.
permuter1.fit(X_val1, y_val1)

In [None]:
feature_names = X_val1.columns.tolist()
# This sorted Series is not the last expression of the cell, so only the
# eli5 table below is displayed.
pd.Series(permuter1.feature_importances_, feature_names).sort_values(ascending=False)

# eli5 table of the permutation importances; top=None lists every feature.
eli5.show_weights(
    permuter1,
    top=None,
    feature_names = feature_names
)

In [None]:
y_val.value_counts(normalize=True)

## 날짜별로 묶어서 일별 (시간당) 평균 대여횟수 확인

In [None]:
# Per-date mean of every column: `cnt` becomes the mean hourly count and
# `dayoff` the share of hours with cnt < 100 on that date.
bydate = df.groupby('dteday').mean().drop(['hr', 'instant'], axis=1).reset_index()
bydate.head()

In [None]:
# Share of dates whose mean hourly count is above 100. The sort only changes
# row order and does not affect the value counts.
bycnt = bydate['cnt'].sort_values(ascending=False) > 100
bycnt.value_counts(normalize=True)

In [None]:
# Day-level flag built from the per-date share of low-demand hours.
# This rebinds `proportion`, which earlier held the hourly cnt < 100 mask.
# NOTE(review): dayoff_av is 1.0 when FEWER than half of the hours are
# low-demand, which reads as the opposite of a "dayoff" day — confirm whether
# the comparison should be >= 0.5.
proportion = bydate['dayoff'] < 0.5
proportion.value_counts(normalize=True)
bydate['dayoff_av'] = proportion.astype('float')

In [None]:
bydate['dayoff_av'].value_counts()

# 공휴일이거나 주말인 경우에 날씨의 영향 확인

In [None]:
def engineer(df, drop_cols=None):
    """Add a combined ``Weekday`` code and drop superseded columns, in place.

    ``Weekday`` encoding:
        0 = Monday-Thursday, 1 = Friday, 2 = Saturday, 3 = Sunday,
        4 = holiday that falls on Monday-Thursday.
    A holiday on Friday, Saturday or Sunday keeps that day's own code.

    Args:
        df: DataFrame containing ``weekday`` (0 = Sunday ... 6 = Saturday),
            ``holiday`` (1 = holiday) and every column listed in ``drop_cols``.
        drop_cols: Columns to remove after the encoding. Defaults to the
            identifier, calendar, hour and leakage columns; ``Weekday`` is kept.

    Returns:
        The same ``df`` object; it is modified in place.

    Raises:
        KeyError: If a required or to-be-dropped column is missing.
    """
    if drop_cols is None:
        drop_cols = ['instant', 'weekday', 'workingday', 'holiday', 'dteday',
                     'hr', 'atemp', 'registered', 'casual']

    day = df['weekday']
    # Only Monday-Thursday holidays get their own code, so the codes never overlap.
    midweek_holiday = day.between(1, 4) & (df['holiday'] == 1)
    # Friday/Saturday/Sunday map to 1/2/3; every other day starts at 0.
    day_code = day.map({5: 1, 6: 2, 0: 3}).fillna(0).astype(int)
    df['Weekday'] = day_code + 4 * midweek_holiday.astype(int)

    df.drop(list(drop_cols), axis=1, inplace=True)
    return df


# engineer() drops columns in place, so work on a copy: the global `df` keeps
# its original columns and this cell can be re-run without a KeyError.
byday = engineer(df.copy())

In [None]:
# Mean count and mean dayoff rate per Weekday code. Several columns must be
# selected with a list; the bare-tuple form raises an error in pandas 2.x.
byday = byday.groupby('Weekday')[['cnt', 'dayoff']].mean()

In [None]:
byday

In [None]:
# Grey bars, with the second bar highlighted.
colors = ['lightslategray', ] * len(byday)
# colors[4] = 'forestgreen'
# colors[17] = 'darkmagenta'
colors[1] = 'blueviolet'

# NOTE(review): the bars show 1 - mean(dayoff), while the title and the y-axis
# label say "Dayoff" — confirm which quantity is meant to be plotted.
fig = go.Figure(data=[go.Bar(
    x = byday.index,
    y = 1 - byday['dayoff'],
    text= ['Weekday','Friday','Saturday','Sunday','Holiday'],
    textposition='auto',
    marker_color = colors
)])
fig.update_layout(title_text = 'Average Dayoff per Day of the week')
fig.update_xaxes(
    title_text = 'Weekday'
)
fig.update_yaxes(
    title_text = 'Dayoff'
) 
# The y-axis is zoomed to 0.4-0.65, so bar heights are not proportional.
fig.update_layout(yaxis_range=[0.4,0.65])
fig.update_layout(
    autosize=False,
    width=900,
    height=700,
    font=dict(
        size=15,))

In [None]:
# Mean weather conditions for dayoff = 0 versus dayoff = 1. Several columns
# must be selected with a list; the bare-tuple form raises an error in pandas 2.x.
df.groupby('dayoff')[['temp', 'hum', 'windspeed']].mean()

In [None]:
# split_point = int(len(bydate)* 4/ 5)
# df_train1 = bydate.loc[:split_point-1,:]
# df_test = bydate.loc[split_point :,:]

# split_val = int(len(bydate)* 3/5)
# df_train = df_train1.loc[:split_val-1,:]
# df_val = df_train1.loc[split_val:, :]
# df_train

In [None]:
# target = 'dayoff'
# feature = bydate.drop(target, axis=1).columns

# X_train2 = df_train[feature]
# y_train2 = df_train[target]
# X_val2 = df_val[feature]
# y_val2 = df_val[target]
# X_test2 = df_test[feature]
# y_test2 = df_test[target]

In [None]:
# def engineer(df):
#     #1. weekday, holiday, workingday 처리
#     weekday = (df['weekday'] >= 1) & (df['weekday'] <= 4)
#     Friday = df['weekday'] == 5
#     Saturday = df['weekday'] == 6
#     Sunday = df['weekday'] == 0
#     holiday = (weekday) & (df['holiday']==1)

#     Friday = Friday.astype(int)
#     Saturday = Saturday.astype(int)*2
#     Sunday = Sunday.astype(int)*3
#     holiday = holiday.astype(int)*4
#     Sunday.astype(int).value_counts()

#     df['Weekday'] = Friday + Saturday + Sunday + holiday # 'Weekday' = 0:평일, 1:금요일, 2:토요일, 3:일요일, 4:평일 공휴일

#     #2. 처리 후 중복적 특성들 제거
#     dels = ['weekday','workingday', 'holiday','dteday', 'atemp', 'registered', 'casual', 'cnt'] ##weekday, temp를 지울까 말까?
#     df.drop(dels, axis=1, inplace=True)

#     return df

# engineer(X_train2)
# engineer(X_val2)
# engineer(X_test2)

In [None]:
# nontarget = y_train2.value_counts(normalize=True)[0]
# nontarget
# yestarget = y_train2.value_counts(normalize=True)[1]

# ratio = nontarget / yestarget

In [None]:
# model3 = XGBClassifier(
#         n_estimators=120,
#         max_depth=2,
#         scale_pos_weight=ratio,
#         learning_rate=0.15,
#         n_jobs=-1,
# )

In [None]:
# model3.fit(X_train2, y_train2)

In [None]:
# y_pred = model3.predict(X_val2)


# print('검증 정확도: ', accuracy_score(y_val2, y_pred))

# print(classification_report(y_pred, y_val2))

In [None]:
# y_pred = model3.predict(X_test2)


# print('검증 정확도: ', accuracy_score(y_test2, y_pred))

# print(classification_report(y_pred, y_test2))