In [1]:
import sys

# Make the project's packages under ../src importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

In [2]:
from data.ingestion import load_car_sales_data
from data.preparation import generate_datasets
from evaluation.key_performance_indicators import kpi_ml
import pandas as pd

In [3]:
# Read the raw car sales table from the CSV file under ../data.
car_sales_df = load_car_sales_data("../data/norway_new_car_sales_by_make.csv")

# Keyword settings forwarded unchanged to generate_datasets; they are kept in
# one mapping so they are easy to spot and tweak.
dataset_settings = {"x_length": 12, "y_length": 1, "test_loops": 12}

# Produce the train/test feature and target sets used by the cells below.
X_train, y_train, X_test, y_test = generate_datasets(
    df=car_sales_df, **dataset_settings
)

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.tree import DecisionTreeRegressor

In [4]:
# Candidate hyper-parameter values handed to the randomized search.
# max_depth additionally offers None (scikit-learn: no depth limit).
max_depth = [*range(5, 11), None]
min_samples_split = [*range(5, 20)]
min_samples_leaf = [*range(2, 20)]

# Search space keyed by the estimator's parameter names.
param_dist = dict(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [6]:
# Fixed seed so that a fresh "Restart & Run All" reproduces the same search.
# RandomizedSearchCV draws its candidates at random and DecisionTreeRegressor
# randomly permutes features while looking for splits, so both are seeded.
RANDOM_STATE = 42

tree = DecisionTreeRegressor(random_state=RANDOM_STATE)
random_search = RandomizedSearchCV(
    estimator=tree,
    param_distributions=param_dist,
    n_jobs=-1,  # run the fits in parallel on all available cores
    cv=TimeSeriesSplit(n_splits=5),  # time-ordered cross-validation folds
    verbose=1,
    n_iter=100,  # number of parameter settings sampled from param_dist
    scoring="neg_mean_absolute_error",  # negated MAE: higher score is better
    random_state=RANDOM_STATE,
)

# Fit every sampled candidate on the training data; by default the search
# then refits the best estimator, which later predict() calls use.
random_search.fit(X=X_train, y=y_train)
print("Best parameters found: ", random_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'min_samples_split': 17, 'min_samples_leaf': 16, 'max_depth': 7}


In [7]:
# Predict with the fitted search object, first on the training features and
# then on the test features.
y_train_pred, y_test_pred = (
    random_search.predict(X=features) for features in (X_train, X_test)
)

# Report the train/test KPIs for this model under a descriptive label.
kpi_ml(
    name="Decision Tree Regressor with Random Search",
    y_train=y_train,
    y_train_pred=y_train_pred,
    y_test=y_test,
    y_test_pred=y_test_pred,
)

                                             MAE  RMSE  BIAS
Decision Tree Regressor with Random Search                  
Train                                       16.7  40.9   0.0
Test                                        18.7  46.5   3.0
