## Introduction to this notebook

In the previous notebook **1_WP_EDA** we discovered that the following factors have the biggest effect on the amount of rented bikes:
* Weather
* Season/month
* Time of the day
* (year)

In [1]:
import pandas as pd
import numpy as np

# models
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# new utils
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.metrics import r2_score

# to visualize the column transformer and pipeline
set_config(display='diagram')

In [2]:
df = pd.read_csv("./data/bike-sharing-demand/train.csv", parse_dates=True, index_col=0)

In [3]:
df.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
# Define a function to extract year, month and hour
def extract_year_month_hour(df):
    df = df.copy()
    df["year"] = df.index.year
    df["month"] = df.index.month
    df["hour"] = df.index.hour
    return df

In [5]:
preprocessor_1 = FunctionTransformer(extract_year_month_hour)

In [6]:
numeric_features = ["atemp", "humidity", "windspeed"]
numeric_transformer = StandardScaler()

categorical_features = ["season", "holiday", "workingday", "weather", "year", "month", "hour"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [7]:
preprocessor_2 = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)],
    remainder = 'passthrough'
)

In [8]:
linear_reg_pipeline = Pipeline(steps=[
    ("create_new_columns", preprocessor_1),
    ("ColumnTransformer", preprocessor_2),
    ("classifier", LinearRegression())
])

## Train/Test Split

In [9]:
y = df["count"]
X = df.loc[:, df.columns != "count"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 85)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8708, 10), (2178, 10), (8708,), (2178,))

## Applying the model

In [11]:
linear_reg_pipeline.fit(X_train, y_train)

In [12]:
print(f"""The train R2-score of the linear regression is: {round(linear_reg_pipeline.score(X_train, y_train), 2)}
The test R2-score of the linear regression is: {round(linear_reg_pipeline.score(X_test, y_test), 2)}""")

The train R2-score of the linear regression is: 1.0
The test R2-score of the linear regression is: 1.0


In [13]:
#y_predict = linear_reg_pipeline.predict(X_test)

In [14]:
#print(f"{round(len(y_predict[y_predict < 0])/len(y_predict), 2)}% of the values are negative. This needs to be corrected since negative values are impossible in the context of our bikesharing example.")

## Avoid negative values

### Using the log-values

In [15]:
linear_reg_pipeline.fit(X_train, np.log1p(y_train))

In [16]:
print(f"""The train R2-score of the linear regression with log-values is: {round(linear_reg_pipeline.score(X_train,np.log1p(y_train)),2)}
The test R2-score of the linear regression with log-values is: {round(linear_reg_pipeline.score(X_test,np.log1p(y_test)),2)}""")

The train R2-score of the linear regression with log-values is: 0.89
The test R2-score of the linear regression with log-values is: 0.89


### Using the log-values but calculating the R2-Score with the reversed log-values

In [17]:
y_predict = linear_reg_pipeline.predict(X_test)
y_predict = np.expm1(y_predict).astype("int")

In [18]:
print(f"The R2 score of the train data after the reversed log-transformation is {round(r2_score(y_predict, y_test), 2)}")

The R2 score of the train data after the reversed log-transformation is 0.74


### Add the 'positive' Argument to the LinearRegression()

In [19]:
# Data Preprocessing was not possible because only dense arrays are supported
linear_reg_pipeline = Pipeline(steps=[
    ("create_new_columns", preprocessor_1),
    ("ColumnTransformer", preprocessor_2),
    ("classifier", LinearRegression(positive=True))
])

In [20]:
linear_reg_pipeline.fit(X_train, y_train)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [None]:
lr = LinearRegression(positive=True).fit(X_train, y_train)

In [None]:
y_predict = lr.predict(X_test)

In [None]:
print(f"The R2 score of the train data after adding the argument 'positive=True' to the 'LinearRegression()' {round(r2_score(y_predict, y_test), 2)}")