In [90]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
import xgboost as xg
import pandas as pd
import numpy as np

# Functions

In [91]:
def transform_date(X):
    """Replace the textual ``date`` column with calendar features.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a ``date`` column formatted as ``DD/MM/YYYY``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``date`` and with ``day`` (day of month),
        ``week_day`` (0 = Monday) and ``month`` appended, in that order.
        The input frame is left untouched.
    """
    parsed = pd.to_datetime(X['date'], format="%d/%m/%Y")
    calendar_features = {
        'day': parsed.dt.day,
        'week_day': parsed.dt.weekday,
        'month': parsed.dt.month,
    }
    # drop() and assign() both return new frames, so X itself is never mutated.
    return X.drop(columns='date').assign(**calendar_features)



In [92]:
def remove_outliers(df, column, factor=1.5):
    """Drop rows whose ``column`` value lies outside the IQR fences.

    A row is kept when its value is within
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (both ends inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numerical column to inspect.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not outliers, with their original index.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    # Inclusive bounds keep values sitting exactly on a fence; in particular a
    # constant column (IQR == 0) keeps all of its rows instead of losing them.
    # Rows with NaN in `column` compare False and are dropped.
    return df[df[column].between(lower_fence, upper_fence)]
    




# Data Wrangling

In [93]:
# Hyper-parameter grid for the XGBoost cross-validated search.
# Keys use scikit-learn's `<pipeline step name>__<parameter>` convention; the
# regressor is registered in the pipeline under the step name 'xgboost', so the
# prefix must be 'xgboost__' for GridSearchCV to accept the grid.
parameters_dict = {
    'xgboost__n_estimators': [600, 700, 800, 900, 1000, 1200, 1400]
}

# Gradient-boosted regressor trained on squared error with 1400 boosting rounds.
xgb = xg.XGBRegressor(
    n_estimators=1400,
    objective='reg:squarederror',
)


In [94]:
# Relative location of the raw bike-rental CSV that is loaded below.
file_path = "../data/SeoulBikeData.csv"

In [95]:
# Maps the raw CSV headers to the short snake_case names used by the rest of
# the notebook (feature lists, target selection, pipeline columns).
dict_rename_col = {
    'Temperature(°C)': 'temp',
    'Humidity(%)': 'humidity',
    'Wind speed (m/s)': 'wind_speed',
    'Visibility (10m)': 'visibility',
    'Dew point temperature(°C)': 'dew_point_temperature',
    'Solar Radiation (MJ/m2)': 'solar_radiation',
    'Rainfall(mm)': 'rainfall',
    'Snowfall (cm)': 'snowfall',
    'Rented Bike Count': 'rented_bike_count',
    'Hour': 'hour',
    'Seasons': 'seasons',
    'Holiday': 'holiday',
    'Functioning Day': 'functioning_day',
    'Date': 'date',
}

In [96]:
# Feature columns (post-rename names); the target 'rented_bike_count' is not listed.
# NOTE(review): no other visible cell references this list — confirm it is still needed.
columstokeep = [
    "hour",
    "temp",
    "humidity",
    "wind_speed",
    "visibility",
    "seasons",
    "holiday",
    "functioning_day",
    "date",
]

In [97]:
# Continuous features: routed to the scaling branch of the column transformer.
num_col = [
    "temp",
    "humidity",
    "wind_speed",
    "visibility",
]
# Discrete features: routed to the one-hot branch. 'day', 'month' and
# 'week_day' do not exist in the raw data; transform_date creates them.
cat_col = [
    "hour",
    "seasons",
    "holiday",
    "functioning_day",
    "day",
    "month",
    "week_day",
]

In [98]:
# Load the raw CSV into a pandas DataFrame.
# The headers contain non-ASCII characters (e.g. the degree sign in
# 'Temperature(°C)'), so the file is decoded explicitly as ISO-8859-1.
df = pd.read_csv(file_path,encoding = "ISO-8859-1")

In [99]:
# Raw CSV headers retained for modelling: the target ('Rented Bike Count'),
# the date/hour, four weather readings and three categorical flags.
rel_cols = [
    'Date',
    'Rented Bike Count',
    'Hour',
    'Temperature(°C)',
    'Humidity(%)',
    'Wind speed (m/s)',
    'Visibility (10m)',
    'Seasons',
    'Holiday',
    'Functioning Day',
]

In [100]:
# Keep only the relevant columns, in the order given by rel_cols, and preview.
df = df.loc[:, rel_cols]
df.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,Winter,No Holiday,Yes


In [101]:
# Sanity check: (rows, columns) after restricting the frame to rel_cols.
df.shape

(8760, 10)

In [102]:
# Switch from the raw CSV headers to the snake_case names in dict_rename_col;
# mapping entries for columns that were not kept are simply ignored.
df = df.rename(dict_rename_col, axis="columns")
df.head()

Unnamed: 0,date,rented_bike_count,hour,temp,humidity,wind_speed,visibility,seasons,holiday,functioning_day
0,01/12/2017,254,0,-5.2,37,2.2,2000,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,Winter,No Holiday,Yes


In [103]:
#removing outliers

In [104]:
# Filter outliers column by column. The steps run sequentially, so each
# column's quartiles are computed on the rows that survived the previous step.
df = (
    df
    .pipe(remove_outliers, 'rented_bike_count')
    .pipe(remove_outliers, 'temp')
    .pipe(remove_outliers, 'humidity')
    .pipe(remove_outliers, 'wind_speed')
    .pipe(remove_outliers, 'visibility')
)

In [105]:
# Renumber rows 0..n-1 after the outlier filter left gaps in the index.
# drop=True discards the old index directly instead of materialising it as a
# column and deleting it by its auto-generated name.
df = df.reset_index(drop=True)

In [106]:
# Preview the cleaned frame after outlier removal and index reset.
df.head()

Unnamed: 0,date,rented_bike_count,hour,temp,humidity,wind_speed,visibility,seasons,holiday,functioning_day
0,01/12/2017,254,0,-5.2,37,2.2,2000,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,Winter,No Holiday,Yes


# The data is now ready for the model pipeline

In [107]:
# Features: every column except the target.
X = df.drop(columns='rented_bike_count')
# Target: log(1 + count), which is defined for zero counts as well.
# np.log1p is the dedicated function for this transform; predictions made on
# this scale are converted back to counts with np.expm1.
y = np.log1p(df['rented_bike_count'])

In [108]:
# Preview the feature frame; 'rented_bike_count' must no longer be present.
X.head()

Unnamed: 0,date,hour,temp,humidity,wind_speed,visibility,seasons,holiday,functioning_day
0,01/12/2017,0,-5.2,37,2.2,2000,Winter,No Holiday,Yes
1,01/12/2017,1,-5.5,38,0.8,2000,Winter,No Holiday,Yes
2,01/12/2017,2,-6.0,39,1.0,2000,Winter,No Holiday,Yes
3,01/12/2017,3,-6.2,40,0.9,2000,Winter,No Holiday,Yes
4,01/12/2017,4,-6.0,36,2.3,2000,Winter,No Holiday,Yes


In [109]:
# Preview the log-transformed target values.
y.head()

0    5.541264
1    5.323010
2    5.159055
3    4.682131
4    4.369448
Name: rented_bike_count, dtype: float64

In [110]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is reproducible. train_test_split is already imported in the notebook's
# first cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Building model pipeline
- date splitting (day, weekday, month)
- one-hot encoding (OHE) of the categorical variables
- scaling of the numerical variables


In [111]:
# Per-type preprocessing sub-pipelines.
# Categorical branch: one-hot encode; categories unseen during fit are ignored
# at transform time instead of raising.
cat_preprocessor = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: standardise with StandardScaler.
num_preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

In [112]:
# Route each column group to its sub-pipeline: cat_col -> one-hot encoding,
# num_col -> scaling.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_preprocessor, cat_col),
        ('num', num_preprocessor, num_col),
    ]
)

In [113]:
# End-to-end model: derive calendar features from 'date', preprocess the
# categorical/numerical columns, then fit the XGBoost regressor.
ex_org_pipe = Pipeline(
    steps=[
        ('date_features', FunctionTransformer(transform_date)),
        ('preprocessor', preprocessor),
        ('xgboost', xgb),
    ]
)

In [114]:
#grid_pipe = GridSearchCV(ex_org_pipe,param_grid=parameters_dict,cv=5,verbose=2)
#grid_pipe.fit(X_train, y_train)

In [115]:
# Fit the full pipeline (date features -> preprocessing -> XGBoost) on the training split.
ex_org_pipe.fit(X_train,y_train)

In [116]:
# In-sample predictions on the training split. The model was fitted on the
# log-transformed target, so these values are on the log(count + 1) scale.
z = ex_org_pipe.predict(X_train)
z

array([4.7805753, 6.4726043, 7.611247 , ..., 6.7627425, 6.794551 ,
       4.684807 ], dtype=float32)

In [117]:
# Score on the held-out split, measured against the log-transformed target.
# NOTE(review): presumably R^2 (the default regressor score) — confirm.
ex_org_pipe.score(X_test,y_test)

0.9525335215338525

In [118]:
# Held-out predictions, still on the log(count + 1) scale (values can be
# negative); apply exp(pred) - 1 to convert them back to bike counts.
ex_org_pipe.predict(X_test)

array([ 7.260506 ,  3.8957443,  6.6269813, ...,  6.956806 ,  5.161492 ,
       -0.2064769], dtype=float32)

In [119]:
#test1 = X_train.iloc[0]
#test1

In [120]:
#pd.DataFrame(test1,index=[0])

In [121]:
# Rebuild the first training row as a one-row DataFrame (index 0) via a plain
# dict, i.e. the shape a single prediction request would have.
test1 = pd.DataFrame(X_train.iloc[0].to_dict(), index=[0])

In [122]:
# Display the single-row sample frame.
test1

Unnamed: 0,date,hour,temp,humidity,wind_speed,visibility,seasons,holiday,functioning_day
0,20/01/2018,3,-0.9,65,0.2,1359,Winter,No Holiday,Yes
