## Introduction to this notebook

In this notebook I will try to improve the results of my model through 'Hyperfeature Optimization'.
The R2-score of the current model with 'Linear Regression' was 0.82.

In [1]:
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# new utils
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

# to visualize the column transformer and pipeline
set_config(display='diagram')

sns.set_theme(style="darkgrid")

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("./data/bike-sharing-demand/train.csv", parse_dates=True, index_col=0)

## 1. Define the preprocessing pipeline

In [3]:
# Define a function to extract year, month and hour
def extract_year_month_hour(df):
    df = df.copy()
    df["year"] = df.index.year
    df["month"] = df.index.month
    df["hour"] = df.index.hour
    return df

In [4]:
def feature_expansion(df, column, degree):
    df = df.copy()
    for i in range(2,degree+1):
        df[f"{column}^{i}"] = df[column] ** i
    return df


In [5]:
preprocessor_1 = (FunctionTransformer(extract_year_month_hour))

In [6]:
preprocessor_2 = (FunctionTransformer(feature_expansion, validate=False, kw_args={'column': 'hour', "degree": 3}))

In [7]:
numeric_features = ["atemp", "humidity", "windspeed"]
numeric_transformer = StandardScaler()

categorical_features = ["season", "holiday", "workingday", "weather", "year", "month", "hour", "hour^2", "hour^3"]
categorical_transformer = OneHotEncoder()

#hour_transformer = KBinsDiscretizer(n_bins=4)

In [8]:
preprocessor_3 = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)],
    remainder = 'passthrough'
)

In [9]:
preprocessing_pipeline = Pipeline(steps=[
    ("Create_new_columns", preprocessor_1),
    ("Feature_Expansion", preprocessor_2),
    ("ColumnTransformer", preprocessor_3)
])

## 2. Train/Test Split and apply data preprocessing pipeline to the train data

In [10]:
y = df["count"]
X = df.loc[:, df.columns != "count"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 85)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8708, 10), (2178, 10), (8708,), (2178,))

In [12]:
X_train_fe = preprocessing_pipeline.fit_transform(X_train, y_train)
X_test_fe = preprocessing_pipeline.transform(X_test)

## 3. CrossValidation and GridSearch to test and improve the model

In [13]:
def cross_validation_regression(x_training_fe, y_training, n_cv=5, lasso_alpha=1):
    """ Calculates the 'Train Score' and the mean of the 'Cross Validation' of the LinearRegression, Ridge and Lasso Model in order to assess the robustness of the model
    :param x_training_fe: X_train dataset after Feature Engineering
    :param y_training: y_train dataset
    :param n_cv: Define n-fold cross validation splitting strategy. Default = 5
    :param lasso_alpha: Define hyperparameter 'alpha' for the 'Lasso-Model'. Default = 1
    :return: Prints the 'Train Score' and the mean of the 'Cross Validation'
    """

    dic = {LinearRegression(): {"Train Score": 0, "CV Mean": 0},
           Ridge(): {"Train Score": 0, "CV Mean": 0},
           Lasso(alpha=lasso_alpha): {"Train Score": 0, "CV Mean": 0}
           }

    for model in dic.keys():
        fitted_model = model.fit(x_training_fe, y_training)
        cross_acc = cross_val_score(fitted_model,   # estimator: # the model you want to evaluate
                                    x_training_fe,         # the training input data
                                    y_training,         # the training output data
                                    cv=n_cv,          # number of cross validation datasets
                                    scoring='r2') # evaluation metric
        dic[model]["Train Score"] = round(fitted_model.score(x_training_fe, y_training), 2)
        dic[model]["CV Mean"] = round(cross_acc.mean(), 2)

    print(f"Linear Regression - Train Score:{dic[list(dic.keys())[0]]['Train Score']}, CV Mean:{dic[list(dic.keys())[0]]['CV Mean']}\n"
          f"Ridge             - Train Score:{dic[list(dic.keys())[1]]['Train Score']}, CV Mean:{dic[list(dic.keys())[1]]['CV Mean']}\n"
          f"Lasso             - Train Score:{dic[list(dic.keys())[2]]['Train Score']}, CV Mean:{dic[list(dic.keys())[2]]['CV Mean']}")

#Mean and STD

In [14]:
def grid_search_regression(x_training_fe, y_training, cv=5):
    """
    Executes a GridSearchCV() for either LinearRegression(), Ridge() and Lasso()
    :param x_training_fe: X_train dataset after Feature Engineering
    :param y_training: y_train dataset
    :return: The object of the choosen model with the optimized hyperparameters
    """
    param_lr = {
        'fit_intercept':[True,False],
        'copy_X':[True, False]
    }
    param_ridge={
        "alpha":[0.001, 0.01, 0.1, 1, 10, 100, 1000],
        "fit_intercept":[True, False],
        "normalize": [True, False],
        "copy_X": [True, False],
        "solver":["auto", "svd", "cholesky", "lsqr", "sag", "saga", "lbfgs"]
            }
    param_lasso={
        "alpha":[0.001, 0.01, 0.1, 1, 10, 100, 1000],
        "fit_intercept":[True, False],
        "copy_X": [True, False],
        "warm_start": [True, False]
    }

    user_input = input("On which model would you like to perform GridSearchCV() (LR/R/L - Q to stop)?: ")

    if user_input == "LR":
        a = LinearRegression()
        param = param_lr
    elif user_input == "R":
        a = Ridge()
        param = param_ridge
    elif user_input == "L":
        a = Lasso()
        param = param_lasso
    elif user_input == "q":
        sys.exit("You have stopped the program")

    g = GridSearchCV(a, param, cv=cv, scoring='r2')
    g.fit(x_training_fe, y_training)

    return g.best_estimator_

In [15]:
cross_validation_regression(X_train_fe, np.log1p(y_train), lasso_alpha=0.001)

Linear Regression - Train Score:0.89, CV Mean:0.89
Ridge             - Train Score:0.85, CV Mean:0.85
Lasso             - Train Score:0.89, CV Mean:0.89


In [22]:
best_model = grid_search_regression(X_train_fe, np.log1p(y_train))

SystemExit: You have stopped the program

In [17]:
best_model.fit(X_train_fe, np.log1p(y_train))

In [18]:
print(f"""The train R2-score of the linear regression is: {round(best_model.score(X_train_fe,np.log1p(y_train)),2)}
The test R2-score of the linear regression is: {round(best_model.score(X_test_fe,np.log1p(y_test)),2)}""")

The train R2-score of the linear regression is: 0.89
The test R2-score of the linear regression is: 0.89


**Observation**:
* The R2-score for the train-set has not changed