# Naive models

Implements 2 naive models:  
- NaiveLag
- NaiveRollingMean

In [5]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from utils.utils_models import LightGBMForecastingModel
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor

import utils.utils as utils
import utils.utils_features as utils_features
import utils.utils_preprocessing as utils_preprocessing
import utils.utils_plots as utils_plots
import utils.constants as constants

sns.set_theme(style="darkgrid")

In [6]:
df_kaggle = pd.read_csv(constants.file_path)
df_agg = utils.aggregate_df(df_kaggle)

df_temp = df_agg[df_agg['product_number']==208257]

In [7]:
df_agg = utils_preprocessing.preprocess_columns(df_agg, bottom_up=False).drop_duplicates()


# Generate features
df_agg = utils_features.features_time_related(df_agg)
df_agg = utils_features.features_lag(df=df_agg, col='inventory_units', lags=range(1,16), group_column='product_number')
df_agg = utils_features.features_lag(df=df_agg, col='sales_units', lags=[13, 14, 15], group_column='product_number')
df_agg = utils_features.features_rolling(df=df_agg, col='inventory_units', window_sizes=[4, 8], group_column='product_number')
df_agg = utils_features.features_rolling(df=df_agg, col='sales_units', window_sizes=[4, 8], group_column='product_number')
df_agg = utils_features.create_periods_feature(df_agg, coll_agg='product_number', date_column='date', target_col='inventory_units')

df_agg = df_agg.rename(columns={'inventory_units': 'y'}).set_index('date').sort_index()

df_agg = utils_features.put_na_on_future_lags(df=df_agg, df_key='product_number', ts_name='inventory_units')

  df["first_nonzero_signal"] = df.groupby(coll_agg)["signal_above_zero"].cumsum() > 0
  df["feature_periods"] = df.groupby(coll_agg).cumcount() + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[:] = np.where(np.arange(m)[:,None] > np.arange(n),np.nan,df)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[:] = np.where(np.arange(m)[:,None] > np.arange(n),np.nan,df)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

In [8]:
X_train, X_test, y_train, y_test = utils.train_test_split(df_agg, forecasting_horizon=13*2, target_col="y")
cv_split = TimeSeriesSplit(n_splits=10, test_size=13)

In [None]:
parameters = {
    'objective': ['regression'],
    "boosting_type": ["gbdt"],
    'metric': ['rmse'],
    'importance_type': ['split'],
    'verbosity': [-1],
    "max_depth": [-1, 1, 3, 4, 5, 6, 10],
    "num_leaves": [10, 20, 30, 40, 100, 120, 150, 200, 250],
    "learning_rate": [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
    "n_estimators": [50, 100, 300, 500, 700, 900, 1000],
    "colsample_bytree": [0.3, 0.5, 0.7, 1, 1.2, 1.5],
    "reg_alpha": [0, 0.01, 1, 2, 5, 7, 10, 50, 100],
    "reg_lambda": [0, 0.01, 1, 5, 10, 20, 50, 100]
}


random_search = RandomizedSearchCV(
    estimator=LGBMRegressor(), 
    cv=cv_split, 
    param_distributions=parameters,
    verbose=1
    )
random_search.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 382 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 382 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 382 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 382 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 382 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 382 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work

In [None]:
best_model = random_search.best_estimator_
optimal_params = best_model.get_params()

In [None]:
optimal_params

In [None]:
optimal_params = {
    'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree': 0.3,
    'importance_type': 'split',
    'learning_rate': 0.3,
    'max_depth': 3,
    'min_child_samples': 20,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'n_estimators': 900,
    'n_jobs': None,
    'num_leaves': 200,
    'objective': 'regression',
    'random_state': None,
    'reg_alpha': 5,
    'reg_lambda': 100,
    'subsample': 1.0,
    'subsample_for_bin': 200000,
    'subsample_freq': 0,
    'verbosity': -1,
    'metric': 'rmse'
    }

In [None]:
tss = TimeSeriesSplit(2, test_size=13)
fc_model = LightGBMForecastingModel(hyperparameters=optimal_params, bottom_up=False)
results_cv = fc_model.cross_validate(df_agg, n_splits=2)