## Introduction to this notebook

Through this notebook I want to generate the first submission to Kaggle for this challenge

In [1]:
import pandas as pd
import numpy as np

# models
from sklearn.linear_model import LinearRegression

# new utils
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.metrics import r2_score

# to visualize the column transformer and pipeline
set_config(display='diagram')

In [2]:
df = pd.read_csv("./data/bike-sharing-demand/train.csv", parse_dates=True, index_col=0)
X_test = pd.read_csv("./data/bike-sharing-demand/test.csv", parse_dates=True, index_col=0)

In [3]:
# Define a function to extract year, month and hour
def extract_year_month_hour(df):
    df = df.copy()
    df["year"] = df.index.year
    df["month"] = df.index.month
    df["hour"] = df.index.hour
    return df

In [4]:
preprocessor_1 = FunctionTransformer(extract_year_month_hour)

In [5]:
numeric_features = ["atemp", "humidity", "windspeed"]
numeric_transformer = StandardScaler()

categorical_features = ["season", "holiday", "workingday", "weather", "year", "month", "hour"]
categorical_transformer = OneHotEncoder()

In [6]:
preprocessor_2 = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)],
    remainder = 'passthrough'
)

In [7]:
linear_reg_pipeline = Pipeline(steps=[
    ("create_new_columns", preprocessor_1),
    ("ColumnTransformer", preprocessor_2),
    ("classifier", LinearRegression())
])

## Train/Test Split

In [8]:
y_train = df["count"]
X_train = df.loc[:, df.columns != "count"].drop(["casual", "registered"], axis=1)

In [14]:
X_train.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0


In [15]:
len(X_test)

6493

In [10]:
linear_reg_pipeline.fit(X_train, y_train)

In [42]:
y_pred_test = linear_reg_pipeline.predict(X_test)

In [43]:
print(f"{round(len(y_pred_test[y_pred_test < 0])/len(y_pred_test), 2)}% of the predicted values are negative. In order to fix this different approaches can be chosen.")

0.1% of the predicted values are negative. In order to fix this different approaches can be chosen.


### 1. Set all negative Values to 1

In [44]:
y_pred_test[y_pred_test < 0] = 1

0

![title](img/0.png)

### 2. Transform numpy-array to absolute values

In [None]:
y_pred_test = linear_reg_pipeline.predict(X_test)
len(y_pred_test[y_pred_test < 0])

In [32]:
y_pred_test = np.absolute(y_pred_test).astype("int")
y_pred_test

array([ 82,  80,  92, ..., 146, 116,  69])

![title](img/abs.png)

### 3. Using the log_values to train the model and reverse the predictions

In [None]:
# Fit the model with the log of y_train
linear_reg_pipeline.fit(X_train, np.log1p(y_train))

In [None]:
# Calculate y_predict for X_test, inverse the log-function and transform it to int
y_pred_test = np.expm1(linear_reg_pipeline.predict(X_test)).astype("int")

![title](img/log.png)

## Create a submission to Kaggle

In [33]:
submission = pd.DataFrame({'datetime':X_test.index,'count':y_pred_test})
submission

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,82
1,2011-01-20 01:00:00,80
2,2011-01-20 02:00:00,92
3,2011-01-20 03:00:00,111
4,2011-01-20 04:00:00,111
...,...,...
6488,2012-12-31 19:00:00,268
6489,2012-12-31 20:00:00,186
6490,2012-12-31 21:00:00,146
6491,2012-12-31 22:00:00,116


In [34]:
filename = 'Bike_Predictions_DB_Abs.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: Bike_Predictions_DB_Abs.csv
