In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path below the Kaggle input directory, then print one per line.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [None]:
# Read the London bike-sharing CSV, parsing `timestamp` as datetimes, and preview it.
csv_path = '/kaggle/input/london-bike-sharing-dataset/london_merged.csv'
df = pd.read_csv(csv_path, parse_dates=['timestamp'])
df.head()

In [None]:
# Data types and structure: print the shape, the per-column dtypes and the column index.
structure_report = (
    ('데이터의 구조는:', df.shape),
    ('데이터의 타입은:', df.dtypes),
    ('데이터의 칼럼은:', df.columns),
)
for label, value in structure_report:
    print(label, value)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Draw missingno's matrix plot for df to see where values are missing, then render it.
msno.matrix(df)
plt.show()

In [None]:
# Split the timestamp into calendar features, one new column per `.dt` attribute.
for part in ('year', 'month', 'dayofweek', 'hour'):
    df[part] = getattr(df['timestamp'].dt, part)
df.head()

In [None]:
# Frequency of each distinct weather_code value.
df['weather_code'].value_counts()

In [None]:
# Distribution of ride counts per year, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='year', y='cnt', data=df, ax=ax)

In [None]:
# Distribution of ride counts per month, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='month', y='cnt', data=df, ax=ax)

In [None]:
# Distribution of ride counts per day of week, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='dayofweek', y='cnt', data=df, ax=ax)

In [None]:
# Distribution of ride counts per hour of day, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='hour', y='cnt', data=df, ax=ax)

In [None]:
# Helper for the bar charts below

def plot_bar(data, feature):
    """Draw a vertical bar plot of `cnt` against `feature` on a new 12x3 figure.

    Parameters
    ----------
    data : DataFrame holding the `feature` column and a `cnt` column.
    feature : name of the column placed on the x axis.

    Returns nothing; the plot is drawn on the newly created current figure.
    """
    plt.figure(figsize=(12, 3))
    sns.barplot(data=data, x=feature, y='cnt', orient='v', palette='Set3')

In [None]:
# Bar plot of cnt by hour of day.
plot_bar(df, 'hour')

In [None]:
# Bar plot of cnt by day of week.
plot_bar(df, 'dayofweek')

In [None]:
# Bar plot of cnt by month.
plot_bar(df, 'month')

In [None]:
# Bar plot of cnt by year.
plot_bar(df, 'year')

In [None]:
# Outlier removal

def is_outliers(s, n_sigma=3):
    """Flag values lying more than `n_sigma` standard deviations from the mean.

    Parameters
    ----------
    s : pandas Series of numeric values.
    n_sigma : half-width of the accepted band, in standard deviations
        (default 3, the original hard-coded value).

    Returns
    -------
    Boolean Series with the same index as `s`; True marks an outlier.
    Missing values fall outside the band and are therefore flagged.
    When the standard deviation cannot be computed (fewer than two valid
    observations) nothing is flagged.
    """
    center = s.mean()
    spread = s.std()
    if pd.isna(spread):
        # Without a spread estimate, between(NaN, NaN) is all False and every
        # row would be reported as an outlier; report none instead.
        return pd.Series(False, index=s.index, name=s.name, dtype=bool)
    return ~s.between(center - n_sigma * spread, center + n_sigma * spread)

In [None]:
# Flag outliers of `cnt` within each hour-of-day group and keep the remaining rows.
outlier_mask = df.groupby('hour', group_keys=False)['cnt'].apply(is_outliers)
# .copy() makes df_out an independent frame, so the column assignments made to it
# later do not act on a slice of df (SettingWithCopyWarning).
df_out = df[~outlier_mask].copy()

In [None]:
# Compare the frame shape before and after outlier removal.
for label, frame in (('이상치 제거전:', df), ('이상치 제거후:', df_out)):
    print(label, frame.shape)

In [None]:
# Column dtypes of the outlier-filtered frame.
df_out.dtypes

In [None]:
# Cast the discrete features to the pandas `category` dtype.
# A single astype() with a column mapping returns a new frame, which avoids
# assigning column-by-column into a frame that was produced by filtering df.
categorical_columns = ['weather_code', 'season', 'year', 'month', 'hour']
df_out = df_out.astype({column: 'category' for column in categorical_columns})

In [None]:
# Column dtypes again, to check the result of the category casts.
df_out.dtypes

In [None]:
# Display the `season` column.
df_out['season']

In [None]:
# Display the `hour` column.
df_out['hour']

In [None]:
# Display the `year` column.
df_out['year']

In [None]:
# Display the `month` column.
df_out['month']

In [None]:
# Display the `weather_code` column.
df_out['weather_code']

In [None]:
# One-hot encode the categorical features; each listed column is replaced by indicator columns.
one_hot_columns = ['weather_code', 'season', 'year', 'month', 'hour']
df_out = pd.get_dummies(data=df_out, columns=one_hot_columns)
df_out.head()

In [None]:
# Shape of the frame after one-hot encoding.
df_out.shape

In [None]:
# Target is the ride count; features are every column except the target and the raw timestamp.
target_column = 'cnt'
df_y = df_out[target_column]
df_x = df_out.drop(columns=['timestamp', target_column])
df_x.head()

In [None]:
# Preview the target values.
df_y.head()

In [None]:
# Split into training and test data

from sklearn.model_selection import train_test_split

# 70/30 hold-out without shuffling, so rows keep their original order.
# NOTE(review): with shuffle=False the random_state presumably has no effect - confirm.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.3,
    shuffle=False,
    random_state=66,
)

In [None]:
# Report the shape of each train/test partition.
split_parts = (
    ('x_train의 구조는:', x_train),
    ('y_train의 구조는:', y_train),
    ('x_test의 구조는:', x_test),
    ('y_test의 구조는:', y_test),
)
for label, part in split_parts:
    print(label, part.shape)

In [None]:
# Import every Keras name from the same package (tensorflow.keras) instead of
# mixing it with the standalone `keras` distribution.
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Take the input width from the data rather than hard-coding it: the number of
# one-hot columns depends on which category values are present in df_out.
n_features = x_train.shape[1]

# Fully connected regression network: three ReLU hidden layers and one linear output unit.
model = keras.Sequential([
    keras.Input(shape=(n_features,)),
    Dense(units=160, activation='relu'),
    Dense(units=60, activation='relu'),
    Dense(units=20, activation='relu'),
    Dense(units=1, activation='linear'),
])

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# Mean absolute error is both the optimised loss and the reported metric.
model.compile(loss='mae', optimizer='adam', metrics=['mae'])

# Stop on the validation loss produced by validation_split instead of the training
# loss, so that stopping reflects performance on rows the optimiser has not seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

# Hand Keras plain float32 arrays: the one-hot columns may be bool/uint8, and a
# mixed-dtype DataFrame can turn into an object array that cannot become a tensor.
history = model.fit(
    x_train.to_numpy(dtype='float32'),
    y_train.to_numpy(),
    epochs=30,
    batch_size=1,  # NOTE(review): one sample per step makes each epoch slow - confirm this is intended
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Training and validation loss per epoch, drawn on an explicit axes.
fig, ax = plt.subplots()
for curve in ('val_loss', 'loss'):
    ax.plot(history.history[curve], label=curve)
ax.set_title('loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('loss')
ax.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
def RMSE(y_test, y_predict):
    """Return the square root of the mean squared error between targets and predictions.

    Both arguments are passed unchanged to sklearn's `mean_squared_error`.
    """
    mse = mean_squared_error(y_test, y_predict)
    return np.sqrt(mse)
# Compute the network's test-set predictions before scoring them; `y_predict` was
# never assigned, so this cell raised NameError on a fresh kernel. The name is
# reused further down when the models are compared.
y_predict = model.predict(x_test.to_numpy(dtype='float32'))
print('RMSE', RMSE(y_test, y_predict))

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest baseline: fit on the training split, score RMSE on the test split.
rf = RandomForestRegressor(random_state=16, n_estimators=100)
rf.fit(x_train, y_train)
rf_result = rf.predict(x_test)
rf_rmse = RMSE(y_test, rf_result)
print('RMSE', rf_rmse)

In [None]:
from xgboost import XGBRegressor
# XGBoost baseline: fit on the training split, score RMSE on the test split.
xgb = XGBRegressor(random_state=16, n_estimators=100)
xgb.fit(x_train, y_train)
xgb_result = xgb.predict(x_test)
xgb_rmse = RMSE(y_test, xgb_result)
print('RMSE', xgb_rmse)

In [None]:
from lightgbm import LGBMRegressor
# LightGBM baseline: fit on the training split, score RMSE on the test split.
lgb = LGBMRegressor(random_state=16, n_estimators=100)
lgb.fit(x_train, y_train)
lgb_result = lgb.predict(x_test)
lgb_rmse = RMSE(y_test, lgb_result)
print('RMSE', lgb_rmse)

In [None]:
# Wrap each model's predictions in a DataFrame.
# NOTE: rf, xgb and lgb are rebound here from the fitted estimators to prediction frames.
dnn, rf, xgb, lgb = (
    pd.DataFrame(predictions)
    for predictions in (y_predict, rf_result, xgb_result, lgb_result)
)
# Observed counts with a fresh 0..n-1 index, ready to receive the prediction columns.
compare = y_test.to_frame().reset_index(drop=True)

In [None]:
# Preview the comparison frame.
compare.head()

In [None]:
# Add one column of predictions per model next to the observed counts.
for column, predictions in (('dnn', dnn), ('rf', rf), ('xgb', xgb), ('lgb', lgb)):
    compare[column] = predictions
compare.head()

In [None]:
# Overlay the density of the observed counts and of each model's predictions.
# Every curve is labelled and a legend is drawn; without them the five
# overlapping distributions cannot be told apart.
fig, ax = plt.subplots()
curve_colors = {'cnt': 'r', 'dnn': 'b', 'rf': 'y', 'xgb': 'g', 'lgb': 'c'}
for column, color in curve_colors.items():
    sns.kdeplot(compare[column], fill=True, color=color, label=column, ax=ax)
ax.set_title('Observed vs. predicted count distributions')
ax.set_xlabel('cnt')
ax.legend()
plt.show()