# Try Different Test Train Split Strategies

We found out in the notebook `fm_train_linear_regression_xgboost` that the performance is substantially better on the test set than on the train set. This is a very unusual effect that may be caused by data leakage, i.e. the model effectively looking into the future.

To find out whether this is indeed the problem, we run the following experiment: we limit the data to the months 7, 8 and 9 (for which we have three recorded years each). Then we perform two train-test splits:

1. day-wise random 67:33 split
2. using 2019 and 2020 as train set and 2021 as test set

In [None]:
%load_ext autoreload
%autoreload 2

from types import SimpleNamespace

import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from src.data.create_train_test_split import prepare_waiting_time, prepare_weather, train_test_split_date_based
from src.features.build_features import build_preprocessing_pipeline
from src.data.constants import DATA_PATH

import utils as U

In [None]:
# Accumulator for the results: later cells add one column of regression
# metrics per experiment (e.g. "daywise_lin_test", "daywise_lin_train").
metrics_df = pd.DataFrame()

## Day-wise random 67:33 split (all data)

In [None]:
# Directory with the persisted artifacts of the 67:33 day-wise split
# (split_dfs.joblib, processed.npz and the LinearRegression model are read/written below).
DAYWISE_PATH = "../../experiments/test_train_splits/daywise_random/"

In [None]:
# Read each raw CSV and pass it straight through its project preparation helper.
waiting_time_df = prepare_waiting_time(
    pd.read_csv(
        DATA_PATH / "processed/waiting_times.csv",
        index_col="id",
        parse_dates=["date"],
    )
)
lommersum_df = prepare_weather(
    pd.read_csv(
        DATA_PATH / "processed/weather_station01327_Lommersum.csv",
        index_col="date",
        parse_dates=["date"],
    ),
    "lommersum_",
)
koelnbonn_df = prepare_weather(
    pd.read_csv(
        DATA_PATH / "processed/weather_station02667_Koeln-Bonn.csv",
        index_col="date",
        parse_dates=["date"],
    ),
    "koelnbonn_",
)

# Attach both weather stations to every waiting-time record via its date.
ext_datapoints_df = waiting_time_df.join(lommersum_df, on="date")
ext_datapoints_df = ext_datapoints_df.join(koelnbonn_df, on="date")

# Rows without a waiting time correspond to a closed park: discard them.
ext_datapoints_df = ext_datapoints_df.dropna(subset=["waiting_time"])

# The summer-only restriction is currently disabled, so all data is split:
#summer_datapoints_df = ext_datapoints_df.query("date.dt.month in [7, 8, 9]")
split_dfs = train_test_split_date_based(ext_datapoints_df, 0.33)

# Report the resulting split sizes and the realised test proportion.
n_train = len(split_dfs["X_train"])
n_test = len(split_dfs["X_test"])
print(f"{n_train} train samples, {n_test} test samples")
print(f"proportion of test samples: {n_test/(n_test+n_train):.2%}")


In [None]:
# Load the persisted split; this rebinds `split_dfs` and thereby replaces the
# split computed in the previous cell. The dump call is left commented out,
# presumably so that the stored split is not overwritten on a re-run.
# NOTE(review): joblib.load unpickles the file -- only load artifacts from a trusted source.
#joblib.dump(split_dfs, DAYWISE_PATH + "split_dfs.joblib")
split_dfs = joblib.load(DAYWISE_PATH + "split_dfs.joblib")

In [None]:
# Create the preprocessing pipeline via the project helper; it is fitted in a later cell.
pipeline = build_preprocessing_pipeline()

In [None]:
# Show which entries the split contains (X_train, X_test, y_train and y_test are used below).
split_dfs.keys()

In [None]:
# Fit the preprocessing on the train features only, then apply the fitted
# pipeline to the test features.
X_train_p = pipeline.fit_transform(split_dfs["X_train"])
X_test_p = pipeline.transform(split_dfs["X_test"])

In [None]:
# Load the persisted preprocessed feature arrays for this split.
# Fix: the original np.load call had an unbalanced "(" and raised a SyntaxError;
# the commented-out savez call carried the same stray parenthesis.
# Uncomment the savez line to regenerate the file from X_train_p / X_test_p.
#np.savez(DAYWISE_PATH + "processed.npz", X_train_p=X_train_p, X_test_p=X_test_p)
processed = np.load(DAYWISE_PATH + "processed.npz")

In [None]:
# Bundle the raw split and the preprocessed arrays into one attribute-style
# container; entries from `processed` are applied last and win on name clashes.
data = SimpleNamespace()
for source in (split_dfs, processed):
    for name, value in source.items():
        setattr(data, name, value)

# All arrays have been read from the archive, so the file handle can be released.
processed.close()

In [None]:
# Train a linear regression through the project helper and store it under
# DAYWISE_PATH, then load it back together with `lin_df`.
# `lin_df` exposes `y_true` / `y_pred` (used for the test metrics below) --
# presumably the test-set predictions; confirm in utils.load_model.
U.train_save(LinearRegression(), DAYWISE_PATH + "LinearRegression", data)
lin_model, lin_df = U.load_model(DAYWISE_PATH + "LinearRegression", data)

In [None]:
# Predict on the preprocessed train features to obtain the in-sample fit.
y_pred_train = lin_model.predict(data.X_train_p)

In [None]:
# Record test and train metrics of the 67:33 split side by side so that the
# "test better than train" effect can be compared directly.
metrics_df["daywise_lin_test"] = U.regression_metrics(lin_df.y_true, lin_df.y_pred)
metrics_df["daywise_lin_train"] = U.regression_metrics(data.y_train, y_pred_train)

In [None]:
# Display the metrics collected so far.
metrics_df

## Day-wise random 80:20 split (all data)

In [None]:
# Rebind the artifact directory to the 80:20 experiment; all following cells
# that use DAYWISE_PATH now read/write there.
# NOTE(review): the ":" in the directory name is not a valid path character on
# Windows -- consider a name such as "daywise_random_80_20" if portability matters.
DAYWISE_PATH = "../../experiments/test_train_splits/daywise_random_80:20/"

In [None]:
# Read each raw CSV and pass it straight through its project preparation helper.
waiting_time_df = prepare_waiting_time(
    pd.read_csv(
        DATA_PATH / "processed/waiting_times.csv",
        index_col="id",
        parse_dates=["date"],
    )
)
lommersum_df = prepare_weather(
    pd.read_csv(
        DATA_PATH / "processed/weather_station01327_Lommersum.csv",
        index_col="date",
        parse_dates=["date"],
    ),
    "lommersum_",
)
koelnbonn_df = prepare_weather(
    pd.read_csv(
        DATA_PATH / "processed/weather_station02667_Koeln-Bonn.csv",
        index_col="date",
        parse_dates=["date"],
    ),
    "koelnbonn_",
)

# Attach both weather stations to every waiting-time record via its date.
ext_datapoints_df = waiting_time_df.join(lommersum_df, on="date")
ext_datapoints_df = ext_datapoints_df.join(koelnbonn_df, on="date")

# Rows without a waiting time correspond to a closed park: discard them.
ext_datapoints_df = ext_datapoints_df.dropna(subset=["waiting_time"])

# The summer-only restriction is currently disabled, so all data is split:
#summer_datapoints_df = ext_datapoints_df.query("date.dt.month in [7, 8, 9]")
split_dfs = train_test_split_date_based(ext_datapoints_df, 0.20)

# Report the resulting split sizes and the realised test proportion.
n_train = len(split_dfs["X_train"])
n_test = len(split_dfs["X_test"])
print(f"{n_train} train samples, {n_test} test samples")
print(f"proportion of test samples: {n_test/(n_test+n_train):.2%}")


In [None]:
# Persist the freshly computed 80:20 split; the target directory must already exist.
# Every run of this cell overwrites the stored split with the current one.
# Swap the two lines to reuse the stored split instead.
joblib.dump(split_dfs, DAYWISE_PATH + "split_dfs.joblib")
#split_dfs = joblib.load(DAYWISE_PATH + "split_dfs.joblib")

In [None]:
# Create a new preprocessing pipeline for the 80:20 split; it is fitted in a later cell.
pipeline = build_preprocessing_pipeline()

In [None]:
# Show which entries the 80:20 split contains.
split_dfs.keys()

In [None]:
# Fit the preprocessing on the train features only, then apply the fitted
# pipeline to the test features.
X_train_p = pipeline.fit_transform(split_dfs["X_train"])
X_test_p = pipeline.transform(split_dfs["X_test"])

In [None]:
# Persist the preprocessed arrays and immediately reopen the archive, so the
# following cell reads them the same way as in the 67:33 section.
# The archive is closed in the next cell, after its arrays have been copied out.
np.savez(DAYWISE_PATH + "processed.npz", X_train_p=X_train_p, X_test_p=X_test_p)
processed = np.load(DAYWISE_PATH + "processed.npz")

In [None]:
# Bundle the raw split and the preprocessed arrays into one attribute-style
# container; entries from `processed` are applied last and win on name clashes.
# This rebinds `data`, replacing the namespace built for the 67:33 split.
data = SimpleNamespace()
for source in (split_dfs, processed):
    for name, value in source.items():
        setattr(data, name, value)

# All arrays have been read from the archive, so the file handle can be released.
processed.close()

In [None]:
# Train and store a linear regression for the 80:20 split, then load it back
# together with `lin_df` (which exposes `y_true` / `y_pred` for the metrics below).
U.train_save(LinearRegression(), DAYWISE_PATH + "LinearRegression", data)
lin_model, lin_df = U.load_model(DAYWISE_PATH + "LinearRegression", data)

In [None]:
# Predict on the preprocessed train features of the 80:20 split (in-sample fit).
y_pred_train = lin_model.predict(data.X_train_p)

In [None]:
# Record test and train metrics of the 80:20 split next to the 67:33 results.
metrics_df["daywise_lin_test_80:20"] = U.regression_metrics(lin_df.y_true, lin_df.y_pred)
metrics_df["daywise_lin_train_80:20"] = U.regression_metrics(data.y_train, y_pred_train)

In [None]:
# Display the metrics collected so far.
metrics_df

## Compare with default data

In [None]:
# Load the data set provided by the utils helper; the cells below read
# X_train, X_test, X_train_p and y_train from it.
default_data = U.load_data()

In [None]:
# Give a copy of the current train features the index of the default train
# features, so that compare() reports value differences rather than label mismatches.
# NOTE(review): `data` is whichever namespace was built last (the 80:20 split when
# the notebook is run top to bottom). Assigning the index raises if the two frames
# differ in length -- confirm which split this comparison is meant for.
X_train_curr = data.X_train.copy()
X_train_curr.index = default_data.X_train.index
default_data.X_train.compare(X_train_curr)

In [None]:
# Same check for the test features: align the index with the default test
# features and show the differing values.
# NOTE(review): requires both frames to have the same number of rows; `data` is
# whichever namespace was built last -- confirm which split is intended.
X_test_curr = data.X_test.copy()
X_test_curr.index = default_data.X_test.index
default_data.X_test.compare(X_test_curr)

In [None]:
# Compare the preprocessed train matrices element-wise; both are wrapped in
# DataFrames with default labels, and compare() needs identically-labeled frames.
pd.DataFrame(data.X_train_p).compare(pd.DataFrame(default_data.X_train_p))

In [None]:
# Rebind the artifact directory to the default-data experiment.
# NOTE(review): the name DAYWISE_PATH is reused here although this directory
# holds the default-data model, not a day-wise split.
DAYWISE_PATH = "../../experiments/test_train_splits/default_data/"

In [None]:
# Train and store a linear regression on the default data, then load it back
# together with `lin_df` (which exposes `y_true` / `y_pred` for the metrics below).
U.train_save(LinearRegression(), DAYWISE_PATH + "LinearRegression", default_data)
lin_model, lin_df = U.load_model(DAYWISE_PATH + "LinearRegression", default_data)

In [None]:
# Predict on the preprocessed train features of the default data (in-sample fit).
y_pred_train = lin_model.predict(default_data.X_train_p)

In [None]:
# Record test and train metrics for the default data next to the earlier results.
metrics_df["default_data_lin_test"] = U.regression_metrics(lin_df.y_true, lin_df.y_pred)
metrics_df["default_data_lin_train"] = U.regression_metrics(default_data.y_train, y_pred_train)

In [None]:
# Display the metrics transposed: one row per experiment, one column per metric.
metrics_df.T

## Re-train LinReg with switched test train

In [None]:
# Swap the roles of the two subsets: fit the model on the *test* split.
# NOTE(review): `data` is whichever namespace was built last (the 80:20 split
# when the notebook is run top to bottom).
lin_reg_switch = LinearRegression().fit(data.X_test_p, data.y_test)

In [None]:
# Predict both subsets with the model fitted on the test split: the test
# predictions are in-sample here, the train predictions are out-of-sample.
y_test_pred = lin_reg_switch.predict(data.X_test_p)
y_train_pred = lin_reg_switch.predict(data.X_train_p)

In [None]:
# Record the metrics of the switched experiment. Note that "switched_lin_test"
# is scored on the data the model was fitted on, while "switched_lin_train"
# is scored on data the model has not seen.
metrics_df["switched_lin_test"] = U.regression_metrics(data.y_test, y_test_pred)
metrics_df["switched_lin_train"] = U.regression_metrics(data.y_train, y_train_pred)

In [None]:
# Display all collected metrics transposed: one row per experiment.
metrics_df.T

In [None]:
# Look up the RMSE of the switched model on the original train split
# (row label "rmse", column "switched_lin_train") in a single indexing step.
metrics_df.loc["rmse", "switched_lin_train"]