# Widgetbasierte Regressionsanwendung

## Imports

In [1]:
import datetime
import pandas as pd
import ipywidgets as widgets

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [3]:
import warnings
from sklearn.exceptions import DataConversionWarning
# Silence scikit-learn's DataConversionWarning (dtype conversion notices) for the whole notebook
warnings.filterwarnings("ignore", category=DataConversionWarning)

## Data Cleaning

In [4]:
# Load the raw rental history from the parquet file
raw = pd.read_parquet("escooter_history.parquet", engine="fastparquet")
# Work on a copy so the loaded frame `raw` stays untouched
df = raw.copy()

# Fix data types: nullable booleans for the two flags, categorical for weather
df = df.astype({
    "holiday": "boolean",
    "workingday": "boolean",
    "weather": "category"
})
# Fix the typo ("heacy" -> "heavy") in the weather category label.
# rename_categories returns a new Series; its `inplace` argument was deprecated
# and later removed from pandas, so the result is assigned back explicitly.
df["weather"] = df["weather"].cat.rename_categories(
    {"heacy rain or thunderstorm or snow or ice pallets": "heavy rain or thunderstorm or snow or ice pallets"})

## Preprocessing Pipeline

In [5]:
def pipeline(raw):
    """Aggregate individual rental records into hourly features and a label.

    Parameters
    ----------
    raw : pandas.DataFrame
        One row per rental. Must provide the columns ``datetime`` (a datetime
        column usable with ``.dt``), ``holiday``, ``workingday``, ``weather``,
        ``temp``, ``humidity`` and ``windspeed``. The frame is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        ``(X, y)``. ``X`` has one row per distinct (date, hour) with the mean
        of ``temp``/``humidity``/``windspeed``, the most frequent value of
        ``holiday``/``workingday``/``weather``/``month``/``weekday`` and the
        hour of day as string column ``hour``, passed through
        ``pd.get_dummies``. ``y`` is the number of rentals in that hour.
    """
    def first_mode(values):
        # Series.mode() returns every tied value; keep only the first one so
        # each group always aggregates to exactly one scalar.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else pd.NA

    # create working copy
    df_linreg = raw.copy()
    timestamps = df_linreg["datetime"]
    # extract weekday and month names
    df_linreg["weekday"] = timestamps.dt.day_name()
    df_linreg["month"] = timestamps.dt.month_name()

    # Group keys must come from the frame being processed (not from a global
    # frame), otherwise the timestamps of other data decide the grouping.
    date_key = timestamps.dt.date.rename("date")
    hour_key = timestamps.dt.hour.rename("hour")

    # group by unique hour (MultiIndex of date and hour):
    # - most frequent value of holiday, workingday, weather, month and weekday
    #   (should change rarely or never within an hour => little info lost)
    # - mean of temperature, humidity and windspeed
    # - number of rentals as "count"
    df_linreg = (
        df_linreg.groupby([date_key, hour_key])
        .agg({
            "holiday": first_mode, "workingday": first_mode, "weather": first_mode, "month": first_mode,
            "temp": "mean", "humidity": "mean", "windspeed": "mean",
            "weekday": first_mode, "datetime": "count",
        })
        .rename(columns={"datetime": "count"})
        # drop the date level => relying on date as a feature will fail for predicting the future
        .reset_index(level="date", drop=True)
        # keep the hour level as its own column named "hour"
        .reset_index()
    )
    # treat the hour as a string so that get_dummies one-hot encodes it
    df_linreg["hour"] = df_linreg["hour"].astype("str")
    # designate column count as label
    label = df_linreg["count"]
    # drop count and create dummy variables for the remaining non-numeric columns
    df_linreg = pd.get_dummies(df_linreg.drop("count", axis=1))
    # return (X, y)
    return df_linreg, label
# Build the feature matrix and the label vector from the cleaned data
df_linreg, label = pipeline(raw=df)


In [6]:
# Hold out 20 % of the rows as test set; the fixed random_state keeps the split reproducible
train_x, test_x, train_y, test_y = train_test_split(
    df_linreg,
    label,
    test_size=0.2,
    random_state=1,
)
# Fit the scaler on the training features only, then apply it to both sets
scaler = StandardScaler().fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

## Regression

In [7]:
# Create and train the linear, lasso and ridge regressors on the scaled training data
# (fit() returns the estimator itself, so construction and training can be chained)
lin = LinearRegression().fit(train_x, train_y)
las = Lasso(alpha=0.5).fit(train_x, train_y)
rid = Ridge(alpha=5).fit(train_x, train_y)

rid  # cell output: the fitted ridge estimator

Ridge(alpha=5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

## Evaluation

In [8]:
# Test-set score() of every model (R² for scikit-learn regressors) in a one-row DataFrame
scores = pd.DataFrame(
    {
        name: model.score(test_x, test_y)
        for name, model in (("linear", lin), ("ridge", rid), ("lasso", las))
    },
    index=[0],
)
# Coefficients of every model next to the feature they belong to
coefs = pd.DataFrame(
    {
        "feature": df_linreg.columns,
        **{
            f"{name} coefficients": model.coef_
            for name, model in (("linear", lin), ("ridge", rid), ("lasso", las))
        },
    }
)

scores  # display scores

Unnamed: 0,linear,ridge,lasso
0,0.637204,0.637117,0.63683


In [9]:
#coefs # uncomment to display coefficients

## Interaktive Anwendung

In [10]:
# Date picker (pre-set to the current date/time) and hour slider, shown side by side
date = widgets.DatePicker(value=datetime.datetime.now(), description="Datum:")
hour = widgets.IntSlider(description="Stunde:", min=0, max=23)
datetime_box = widgets.HBox(children=[date, hour])

# Check boxes for the holiday and working-day flags, shown side by side
holiday = widgets.Checkbox(description='Ist Feiertag', value=False)
workingday = widgets.Checkbox(description='Ist Arbeitstag', value=False)
work_box = widgets.HBox(children=[holiday, workingday])

# Dropdown offering the weather values that occur in the data
weather = widgets.Dropdown(description='Wetter:', options=df["weather"].unique())

# Sliders for temperature, humidity and wind speed, shown side by side
# NOTE(review): the labels "Termperatur:" and "Windgeschwindkeit:" look misspelled;
# they are user-facing text and are kept unchanged here.
temperature = widgets.FloatSlider(description="Termperatur:", min=0, max=50)
humidity = widgets.FloatSlider(description="Luftfeuchtigkeit:", min=0, max=100)
windspeed = widgets.FloatSlider(description="Windgeschwindkeit:", min=0, max=100)
weather_box = widgets.HBox(children=[temperature, humidity, windspeed])

# Submit button and the label that shows the regression results
predict_button = widgets.Button(layout=widgets.Layout(left="40%"), description="Treffe Vorhersage")
prediction = widgets.Label(value="Regressionsergebnisse:")

# Render all widgets in order, top to bottom
display(datetime_box, work_box, weather, weather_box, predict_button, prediction)

def check_bounds(pred, lower=0, upper=2000):
    """Round a prediction to one decimal place and clamp it to ``[lower, upper]``.

    Parameters
    ----------
    pred : float
        Raw model prediction.
    lower : number, optional
        Smallest value that is reported (default 0).
    upper : number, optional
        Largest value that is reported (default 2000).

    Returns
    -------
    number
        ``round(pred, 1)`` if it lies within the bounds, otherwise the bound
        that was exceeded.
    """
    return min(upper, max(lower, round(pred, 1)))

def predict(ref_to_caller):
    """Button callback: predict the rental count for the current widget values.

    Collects the widget values into a one-row DataFrame, runs it through
    ``pipeline``, aligns it to the training columns, scales it with the fitted
    ``scaler`` and writes the clamped predictions of the three models into the
    ``prediction`` label.

    Parameters
    ----------
    ref_to_caller : object
        The argument the button passes to its click handler; it is not used.
    """
    # A DatePicker without a selected date has the value None, which
    # datetime.combine cannot handle; ask for a date instead of crashing.
    if date.value is None:
        prediction.value = "Regressionsergebnisse: Bitte Datum wählen"
        return

    prediction.value = "Regressionsergebnisse: Berechne..."
    # collect widget values in a one-row DataFrame
    df_input = pd.DataFrame({"datetime": datetime.datetime.combine(date.value, datetime.time(hour.value)),
                             "holiday": holiday.value,
                             "workingday": workingday.value,
                             "weather": weather.value,
                             "temp": temperature.value,
                             "humidity": humidity.value,
                             "windspeed": windspeed.value
                            }, index=[0])
    # run through pipeline, only select first value in tuple, as label can be discarded
    df_input = pipeline(df_input)[0]
    # Align to the training columns: dummy columns missing in the single row
    # become 0, columns unknown to the training data are dropped. This gives
    # the row exactly the dimensions the scaler was fitted on, without
    # concatenating the whole training frame on every click.
    df_input = df_input.reindex(columns=df_linreg.columns, fill_value=0).fillna(0)
    # scale features
    scaled = scaler.transform(df_input)
    # run regression and display results (clamped to the bounds of check_bounds if infeasible)
    results = " | ".join(
        f"{name}: {check_bounds(model.predict(scaled)[0])}"
        for name, model in (("linear", lin), ("ridge", rid), ("lasso", las))
    )
    prediction.value = f"Regressionsergebnisse: {results}"

# link event handler to button
predict_button.on_click(predict)

HBox(children=(DatePicker(value=datetime.datetime(2021, 11, 13, 21, 25, 44, 529391), description='Datum:'), In…

HBox(children=(Checkbox(value=False, description='Ist Feiertag'), Checkbox(value=False, description='Ist Arbei…

Dropdown(description='Wetter:', options=('clear, few clouds', 'cloudy, mist', 'light snow or rain or thunderst…

HBox(children=(FloatSlider(value=0.0, description='Termperatur:', max=50.0), FloatSlider(value=0.0, descriptio…

Button(description='Treffe Vorhersage', layout=Layout(left='40%'), style=ButtonStyle())

Label(value='Regressionsergebnisse:')