In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go
import plotly.express as px
import xgboost
from sklearn import metrics

from src.features.build_features import build_preprocessing_pipeline
from src.data.constants import DATA_PATH
import utils as U

In [None]:
# Load the dataset bundle used by every cell below (the notebook reads
# data.X_train, data.X_train_p and data.y_train from it).
data = U.load_data()
# NOTE(review): `build_features` is not referenced again in the visible cells — confirm it is still needed.
build_features = build_preprocessing_pipeline()

## Linear Regression

In [None]:
# Load the stored LinearRegression model together with its result frame
# (linreg_df carries the y_true / y_pred columns evaluated below).
# Presumably the model was created once with the disabled call on the next line — confirm.
#train_save(LinearRegression(), "../../models/LinearRegression")
linreg_model, linreg_df = U.load_model("../../models/LinearRegression", data)

In [None]:
# Compare the linear-regression error on the stored test-set predictions with
# the error on the training data.
y_train_pred = linreg_model.predict(data.X_train_p)

# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas sqrt(MSE) works on
# every version. The two are identical for a single target column
# (assumed here — confirm that y_train has exactly one column).
linreg_test_rmse = metrics.mean_squared_error(linreg_df.y_true, linreg_df.y_pred) ** 0.5
linreg_train_rmse = metrics.mean_squared_error(data.y_train, y_train_pred) ** 0.5
print("LinReg test set RMSE:", linreg_test_rmse)
print("LinReg train set RMSE:", linreg_train_rmse)

In [None]:
# Hand a fresh LinearRegression to U.train_save under a dated model path, then
# load the model and its result frame back from that same path.
# NOTE(review): U.train_save presumably fits and persists the estimator — confirm in utils.
U.train_save(LinearRegression(), "../../models/LinearRegression_211103", data)
linreg_model2, linreg_df2 = U.load_model("../../models/LinearRegression_211103", data)

In [None]:
# Training-set predictions of the re-trained model, to compare against its
# stored test-set predictions.
y_train_pred2 = linreg_model2.predict(data.X_train_p)

# One column per split, filled with whatever U.regression_metrics returns.
# NOTE(review): the return type of U.regression_metrics is not visible here —
# presumably a Series/dict keyed by metric name; confirm.
metrics_df = pd.DataFrame()
metrics_df["lin2_test"] = U.regression_metrics(linreg_df2.y_true, linreg_df2.y_pred)
metrics_df["lin2_train"] = U.regression_metrics(data.y_train, y_train_pred2)

In [None]:
# Display the metrics transposed: one row per split (lin2_test / lin2_train).
metrics_df.T

In [None]:
# Scatter plots of the linear-regression results for a first group of attractions.
# NOTE(review): the bare name `scatter_plot` is never defined or imported in this
# notebook (NameError on a fresh kernel); it is assumed to live in `utils`
# next to load_model / train_save — confirm.
U.scatter_plot(
    linreg_df,
    ["Chiapas DIE Wasserbahn", "River Quest", "Crazy Bats", "Taron", "F.L.Y."],
    "Linear Regression",
    col_wrap=3,
    width=1200,
    height=800,
)

In [None]:
# Scatter plots of the linear-regression results for a second group of attractions.
# NOTE(review): the bare name `scatter_plot` is never defined or imported in this
# notebook (NameError on a fresh kernel); it is assumed to live in `utils`
# next to load_model / train_save — confirm.
U.scatter_plot(
    linreg_df,
    [
        "Bolles Flugschule",
        "Feng Ju Palace",
        "Verrücktes Hotel Tartüff",
        "Würmling Express",
        "Black Mamba",
        "Wellenflug",
    ],
    "Linear Regression",
    col_wrap=3,
    width=1200,
    height=800,
)

### Daily Aggregated Diagram

In [None]:
# Mean true / predicted waiting time per day.
# NOTE(review): the original read from `result_df`, which is never defined in this
# notebook; `linreg_df` is assumed to be the intended frame because this section
# sits under "Linear Regression" — confirm.
# Only y_true / y_pred are averaged: taking the mean of non-numeric columns raises
# a TypeError on pandas >= 2.0, and these are the only columns the bar chart uses.
result_agg_df = linreg_df.groupby("date")[["y_true", "y_pred"]].mean()

In [None]:
# Preview the daily means with the date index replaced by a running day number
# exposed as an "index" column.
result_agg_df.reset_index(drop=True).reset_index()

In [None]:
# Long-format view of the daily means: one row per (day number, variable) pair.
long_df = (
    result_agg_df
    .reset_index(drop=True)  # discard the date index
    .reset_index()           # expose the running day number as "index"
    .melt(id_vars=["index"])
)

In [None]:
# Wide frame for plotting: the date index is replaced by a running day number,
# which is also kept as an "index" column.
plot_df = (
    result_agg_df
    .reset_index(drop=True)
    .reset_index()
)

In [None]:
# Inspect the frame that feeds the bar chart below.
plot_df

In [None]:


# Grouped bar chart: for each day (by running number) the mean true waiting time
# next to the mean predicted waiting time.
bar_traces = [
    go.Bar(name=column, x=plot_df.index, y=plot_df[column])
    for column in ("y_true", "y_pred")
]
fig = go.Figure(data=bar_traces)
# Place the two series side by side instead of stacking them.
fig.update_layout(
    title="expected vs predicted mean daily waiting time for all days in the test set",
    yaxis={"title": "mean daily waiting time (min)"},
    barmode="group",
)
fig.show()

### RMSE

## XGBoost

In [None]:
# Load the stored XGBoost regressor together with its result frame.
# The original called a bare `load(path)`, which is not defined anywhere in this
# notebook; the linear-regression cells use `U.load_model(path, data)` for the same
# job, so the same helper is used here.
# Presumably the model was created once with the disabled call on the next line — confirm.
#train_save(xgboost.XGBRegressor(random_state=42, verbosity=2, objective="reg:squarederror"), "../../models/XGBRegressor_r42_reg:squarederror")
xgb_model, xgb_df = U.load_model("../../models/XGBRegressor_r42_reg:squarederror", data)

In [None]:
# XGBoost predictions on the preprocessed training features.
# Note: this rebinds `y_train_pred`, which earlier held the linear-regression predictions.
y_train_pred = xgb_model.predict(data.X_train_p)

In [None]:
# Train-set result frame with y_true / y_pred columns, so the plotting and
# metric cells below can treat it like xgb_df.
# NOTE(review): assumes data.X_train and data.y_train share the same index and row
# order as the prediction array — confirm.
xgb_train_df = pd.DataFrame({
    "date": data.X_train.date,
    "time": data.X_train.half_hour_time,
    "attraction": data.X_train.attraction,
    "y_true": data.y_train.waiting_time,
    "y_pred": y_train_pred.flatten()
})

### Scatterplots

In [None]:
# Scatter plots of the XGBoost results (xgb_df) for a first group of attractions.
# NOTE(review): the bare name `scatter_plot` is never defined or imported in this
# notebook (NameError on a fresh kernel); it is assumed to live in `utils`
# next to load_model / train_save — confirm.
U.scatter_plot(
    xgb_df,
    [
        "Chiapas DIE Wasserbahn",
        "River Quest",
        "Crazy Bats",
        "Taron",
        "F.L.Y.",
    ],
    "XGBoost",
    col_wrap=3,
    width=1200,
    height=800,
)


In [None]:
# Scatter plots of the XGBoost train-set results (xgb_train_df) for a first group
# of attractions.
# NOTE(review): the bare name `scatter_plot` is never defined or imported in this
# notebook (NameError on a fresh kernel); it is assumed to live in `utils`
# next to load_model / train_save — confirm.
U.scatter_plot(
    xgb_train_df,
    [
        "Chiapas DIE Wasserbahn",
        "River Quest",
        "Crazy Bats",
        "Taron",
        "F.L.Y.",
    ],
    "XGBoost (train set)",
    col_wrap=3,
    width=1200,
    height=800,
)


In [None]:
# Scatter plots of the XGBoost results (xgb_df) for a second group of attractions.
# NOTE(review): the bare name `scatter_plot` is never defined or imported in this
# notebook (NameError on a fresh kernel); it is assumed to live in `utils`
# next to load_model / train_save — confirm.
U.scatter_plot(
    xgb_df,
    [
        "Bolles Flugschule",
        "Feng Ju Palace",
        "Verrücktes Hotel Tartüff",
        "Würmling Express",
        "Black Mamba",
        "Wellenflug",
    ],
    "XGBoost",
    col_wrap=3,
    width=1200,
    height=800,
)

In [None]:
# Scatter plots of the XGBoost train-set results (xgb_train_df) for a second group
# of attractions.
# NOTE(review): the bare name `scatter_plot` is never defined or imported in this
# notebook (NameError on a fresh kernel); it is assumed to live in `utils`
# next to load_model / train_save — confirm.
U.scatter_plot(
    xgb_train_df,
    [
        "Bolles Flugschule",
        "Feng Ju Palace",
        "Verrücktes Hotel Tartüff",
        "Würmling Express",
        "Black Mamba",
        "Wellenflug",
    ],
    "XGBoost (train set)",
    col_wrap=3,
    width=1200,
    height=800,
)

### Feature Importances

In [None]:
# Feature scores of the loaded XGBoost model, as a frame for the bar chart below.
# NOTE(review): the bare name `get_xgb_fscores` is never defined or imported in this
# notebook (NameError on a fresh kernel); it is assumed to live in `utils`
# next to load_model / train_save — confirm.
fscore_df = U.get_xgb_fscores(xgb_model)

In [None]:
# Bar chart with one bar per feature_name, bar length and label taken from the
# fscore column; height=1000 sets the figure height in pixels.
px.bar(fscore_df, x="fscore", y="feature_name", text="fscore", height=1000)

### Notes:

- `day_of_year`: may lead the model to memorize individual data points, especially because of the way the train-test split is done.

### Metrics

In [None]:
# Peek at the first rows of the XGBoost result frame loaded with the model.
xgb_df.head()

In [None]:
# Peek at the first rows of the XGBoost train-set result frame built above.
xgb_train_df.head()

In [None]:
# Compare the XGBoost error on the stored test-set predictions with the error on
# the training data.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas sqrt(MSE) works on
# every version and is identical for the single target column used here.
xgb_test_rmse = metrics.mean_squared_error(xgb_df.y_true, xgb_df.y_pred) ** 0.5
xgb_train_rmse = metrics.mean_squared_error(xgb_train_df.y_true, xgb_train_df.y_pred) ** 0.5
print("XGBoost test set RMSE:", xgb_test_rmse)
print("XGBoost train set RMSE:", xgb_train_rmse)