In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import pickle
from IPython.display import clear_output

# `Clean dataset`

In [3]:
data = pd.read_csv('sales.csv')
data.shape

(640840, 10)

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,425390,366,4,2013-04-18,517,1,0,0,0,4422
1,291687,394,6,2015-04-11,694,1,0,0,0,8297
2,411278,807,4,2013-08-29,970,1,1,0,0,9729
3,664714,802,2,2013-05-28,473,1,1,0,0,6513
4,540835,726,4,2013-10-10,1068,1,1,0,0,10882


In [5]:
data = data.drop('Unnamed: 0', axis=1)
data.shape

(640840, 9)

In [6]:
data.head()

Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,366,4,2013-04-18,517,1,0,0,0,4422
1,394,6,2015-04-11,694,1,0,0,0,8297
2,807,4,2013-08-29,970,1,1,0,0,9729
3,802,2,2013-05-28,473,1,1,0,0,6513
4,726,4,2013-10-10,1068,1,1,0,0,10882


In [7]:
data.dtypes

store_ID                int64
day_of_week             int64
date                   object
nb_customers_on_day     int64
open                    int64
promotion               int64
state_holiday          object
school_holiday          int64
sales                   int64
dtype: object

In [8]:
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

data.drop('date', axis=1, inplace=True)

In [9]:
data['state_holiday'] = pd.factorize(data['state_holiday'])[0]

In [10]:
data.head()

Unnamed: 0,store_ID,day_of_week,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales,year,month,day
0,366,4,517,1,0,0,0,4422,2013,4,18
1,394,6,694,1,0,0,0,8297,2015,4,11
2,807,4,970,1,1,0,0,9729,2013,8,29
3,802,2,473,1,1,0,0,6513,2013,5,28
4,726,4,1068,1,1,0,0,10882,2013,10,10


In [11]:
data.dtypes

store_ID               int64
day_of_week            int64
nb_customers_on_day    int64
open                   int64
promotion              int64
state_holiday          int64
school_holiday         int64
sales                  int64
year                   int64
month                  int64
day                    int64
dtype: object

# `Filter dataset`

In [12]:
data_open_days = data[data['open'] == 1].copy()

data_open_days.drop('open', axis=1, inplace=True)

data_open_days.shape

(532016, 10)

In [13]:
data_without_store_ID = data.drop('store_ID', axis=1)
data_without_store_ID.shape

(640840, 10)

In [14]:
data_open_days_without_store_ID = data_open_days.drop('store_ID', axis=1)
data_open_days_without_store_ID.shape

(532016, 9)

# `Check correlations`

In [15]:
def display_heatmap(dataframe):
    corr=dataframe.corr()

    mask=np.triu(np.ones_like(corr, dtype=bool))     # generate a mask for the upper triangle

    f, ax=plt.subplots(figsize=(11, 9))                 # set up the matplotlib figure

    cmap=sns.diverging_palette(220, 10, as_cmap=True)   # generate a custom diverging colormap

    sns.heatmap(corr, mask=mask, cmap=cmap,             # draw the heatmap with the mask and correct aspect ratio
                vmax=.3, center=0, square=True,
                linewidths=.5, cbar_kws={"shrink": .5})

## `Not filtered - without store_ID`

In [None]:
display_heatmap(data_without_store_ID)

## `Not filtered - with store_ID`

In [None]:
display_heatmap(data)

## `Filtered - without store_ID`

In [None]:
display_heatmap(data_open_days_without_store_ID)

## `Filtered - with store_ID`

In [None]:
display_heatmap(data_open_days)

# `Test algorithms`

In [16]:
def compare_models(models, dataframe):
    fitted_models = []

    r2_list = []
    mse_list = []
    rmse_list = []
    mae_list = []

    for model in models:

        print(f"====== {model} ======")

        full_df = dataframe.sample(frac=1, random_state=10)

        y = full_df['sales']
        X = full_df.drop('sales', axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

        # Scaling data = X_train
        X_train_transformer = MinMaxScaler().fit(X_train)
        X_train_normalized = X_train_transformer.transform(X_train)
        X_train_normalized = pd.DataFrame(X_train_normalized)

        # Scaling data = X_test
        X_test_transformer = MinMaxScaler().fit(X_test)
        X_test_normalized = X_test_transformer.transform(X_test)
        X_test_normalized = pd.DataFrame(X_test_normalized)

        model.fit(X_train_normalized, y_train)

        fitted_models.append(model)

        # Make predictions on the test data
        y_pred = model.predict(X_test_normalized)

        # R2 validation
        r2 = r2_score(y_test, y_pred)
        print("R2:", r2)

        # MSE validation
        mse=mean_squared_error(y_test, y_pred)
        print("MSE:", mse)

        # RMSE validation
        rmse = np.sqrt(mse)
        print("RMSE:", rmse)

        # MAE validation
        mae=mean_absolute_error(y_test, y_pred)
        print("MAE:", mae)

        r2_list.append(r2)
        mse_list.append(mse)
        rmse_list.append(rmse)
        mae_list.append(mae)


    summary = {
        'Algorithm': model,
        'R2': r2_list,
        'MSE': mse_list,
        'RMSE': rmse_list,
        'MAE': mae_list
    }
    summary = pd.DataFrame(summary)
    # summary.columns = models

    return summary, fitted_models

In [None]:
models_to_test = [
    LinearRegression(n_jobs=-1),
    Lasso(random_state=10),
    Ridge(random_state=10),
    ElasticNet(random_state=10),
    XGBRegressor(),
    LGBMRegressor(n_jobs=-1, random_state=10),
    DecisionTreeRegressor()
]

models_to_test_slow = [
    KNeighborsRegressor(n_jobs=-1),
    MLPRegressor(random_state=10),
    RandomForestRegressor(n_jobs=-1, random_state=10)
]

## `Not filtered - without store_ID`

In [None]:
data_without_store_ID_summary, data_without_store_ID_models = compare_models(models=models_to_test, dataframe=data_without_store_ID)

In [None]:
# ====== LGBMRegressor(random_state=10) ======
# R2: 0.8717880480454334
# MSE: 1920020.1403067634
# RMSE: 1385.6479135432505
# MAE: 934.6317351470149

In [None]:
data_without_store_ID_slow_summary, data_without_store_ID_models_slow = compare_models(models=models_to_test_slow, dataframe=data_without_store_ID)

In [None]:
# ====== MLPRegressor(random_state=10) ======
# R2: 0.8604318711437954
# MSE: 2090082.977941105
# RMSE: 1445.711927716274
# MAE: 980.7829302460166

## `Not filtered - with store_ID`

In [None]:
data_with_store_ID_summary, data_with_store_ID_models = compare_models(models=models_to_test, dataframe=data)

In [None]:
# ====== XGBRegressor() ======
# R2: 0.9494461028054196
# MSE: 757062.8112657383
# RMSE: 870.0935646617198
# MAE: 628.8746932658818

In [None]:
data_with_store_ID_slow_summary, data_with_store_ID_models_slow = compare_models(models=models_to_test_slow, dataframe=data)

In [None]:
# ====== RandomForestRegressor(n_jobs=-1, random_state=10) ======
# R2: 0.9228395853784788
# MSE: 1155504.9888035383
# RMSE: 1074.9441793895803
# MAE: 724.3297227857188

## `Filtered - without store_ID`

In [None]:
data_open_days_without_store_ID_summary, data_open_days_without_store_ID_models = compare_models(models=models_to_test, dataframe=data_open_days_without_store_ID)

In [None]:
# ====== LinearRegression(n_jobs=-1) ======
# R2: 0.7115516521122665
# MSE: 2807809.682227139
# RMSE: 1675.6520170450483
# MAE: 1206.1741847849569

In [None]:
data_open_days_without_store_ID_slow_summary, data_open_days_without_store_ID_models_slow = compare_models(models=models_to_test_slow, dataframe=data_open_days_without_store_ID)

In [None]:
# ====== MLPRegressor(random_state=10) ======
# R2: 0.7423804028651348
# MSE: 2507716.9082911913
# RMSE: 1583.5772504968588
# MAE: 1168.3293276782179

## `Filtered - with store_ID`

In [None]:
data_open_days_with_store_ID_summary, data_open_days_with_store_ID_models = compare_models(models=models_to_test, dataframe=data_open_days)

In [None]:
# ====== XGBRegressor() ======
# R2: 0.9151376668371063
# MSE: 826065.6802371338
# RMSE: 908.8815545697546
# MAE: 689.5817966818616

In [None]:
data_open_days_with_store_ID_slow_summary, data_open_days_with_store_ID_models_slow = compare_models(models=models_to_test_slow, dataframe=data_open_days)

# `For now : Not filtered - with store_ID -> data_with_store_ID_models, XGBRegressor()`

# `Try find best parameters`

In [None]:
def find_best_parameters(model, param_grid, nb_cross_validations):
    grid_search_decision_tree_classifier = GridSearchCV(estimator = model, param_grid = param_grid, cv = nb_cross_validations, n_jobs=-1)

    full_df = data.sample(frac=1, random_state=10)

    y = full_df['sales']
    X = full_df.drop('sales', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

    # Scaling data = X_train
    X_train_transformer = MinMaxScaler().fit(X_train)
    X_train_normalized = X_train_transformer.transform(X_train)
    X_train_normalized = pd.DataFrame(X_train_normalized)

    grid_search_decision_tree_classifier.fit(X_train_normalized, y_train)

    print(f"Best parameters: {grid_search_decision_tree_classifier.best_params_}")
    print(f"Best score: {grid_search_decision_tree_classifier.best_score_}")

    return grid_search_decision_tree_classifier

In [None]:
grid = {
    'booster': ['gbtree', 'gblinear', 'dart', 'gbtree', 'dart'],
    'validate_parameters': [True, False]
}

In [None]:
best_parameters_XGBRegressor_5_cv = find_best_parameters(model=XGBRegressor(), param_grid=grid, nb_cross_validations=5)

In [None]:
best_parameters_XGBRegressor_10_cv = find_best_parameters(model=XGBRegressor(), param_grid=grid, nb_cross_validations=10)

# `Best parameters for XGBRegressor seem to be : 'booster': 'gbtree', 'validate_parameters': True`

In [None]:
XGBRegressor_with_best_parameters_summary, XGBRegressor_with_best_parameters_models = compare_models(models=[XGBRegressor(booster='gbtree', validate_parameters=True, n_jobs=-1)], dataframe=data)

In [None]:
# ====== XGBRegressor(base_score=None, booster='gbtree', colsample_bylevel=None,
#              colsample_bynode=None, colsample_bytree=None,
#              enable_categorical=False, gamma=None, gpu_id=None,
#              importance_type=None, interaction_constraints=None,
#              learning_rate=None, max_delta_step=None, max_depth=None,
#              min_child_weight=None, missing=nan, monotone_constraints=None,
#              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
#              predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
#              scale_pos_weight=None, subsample=None, tree_method=None,
#              validate_parameters=True, verbosity=None) ======
# /opt/anaconda3/lib/python3.9/site-packages/xgboost/data.py:250: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
#   elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):

# R2: 0.9494461028054196
# MSE: 757062.8112657383
# RMSE: 870.0935646617198
# MAE: 628.8746932658818

In [None]:
XGBRegressor_default_parameters_summary, XGBRegressor_default_parameters_models = compare_models(models=[XGBRegressor(n_jobs=-1)], dataframe=data)

In [None]:
# ====== XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
#              colsample_bynode=None, colsample_bytree=None,
#              enable_categorical=False, gamma=None, gpu_id=None,
#              importance_type=None, interaction_constraints=None,
#              learning_rate=None, max_delta_step=None, max_depth=None,
#              min_child_weight=None, missing=nan, monotone_constraints=None,
#              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
#              predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
#              scale_pos_weight=None, subsample=None, tree_method=None,
#              validate_parameters=None, verbosity=None) ======

# R2: 0.9494461028054196
# MSE: 757062.8112657383
# RMSE: 870.0935646617198
# MAE: 628.8746932658818

# `Save XGBRegressor_default_parameters_model`

In [None]:
with open(f'models/XGBRegressor_sales_model.pkl', 'wb') as f:
    pickle.dump(XGBRegressor_default_parameters_models[0], f)

# `Load XGBRegressor_default_parameters_model`

In [None]:
with open(f'models/XGBRegressor_sales_model.pkl', "rb") as f:
    sales_model = pickle.load(f)

In [None]:
data['day_of_week'].unique()

# `Try create models per unique values of specific feature`

In [29]:
def create_models_per_feature(feature_name, dataframe):
    unique_values = dataframe[feature_name].unique()

    # Split dataframe into dataframes per day_of_week
    filtered_df_list = [
        {
            f'{feature_name}': value, 
            'dataframe': dataframe[dataframe[feature_name] == value].drop(feature_name, axis=1)
        }
                                                    for value in unique_values
    ]

    models_per_feature_results = [
        {
            f'{feature_name}': obj[feature_name],
            'summary_and_fitted_model': compare_models(models=[XGBRegressor(n_jobs=-1)], dataframe=obj['dataframe'])
        }
        for obj in filtered_df_list
    ]

    # Create results dataframe
    models_per_feature_summaries = [result['summary_and_fitted_model'][0].drop('Algorithm', axis=1).assign(feature_value = result[feature_name]) for result in models_per_feature_results]
    models_per_feature_comparison = pd.concat(models_per_feature_summaries, axis=0).sort_values(by='RMSE').reset_index().drop('index', axis=1)
    
    clear_output()
    display(models_per_feature_comparison)

    models = [
        {
            'feature_value': result[feature_name],
            'model': result['summary_and_fitted_model'][1]
        }
            for result in models_per_feature_results
    ]

    return models

## `Not filtered`

### `day_of_week`

In [30]:
day_of_week_models = create_models_per_feature('day_of_week', data)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.995456,12768.023144,112.995678,12.142055,7
1,0.950169,519540.221714,720.791386,536.475006,4
2,0.928833,605265.683332,777.988228,578.105801,3
3,0.926281,718277.552113,847.512568,643.58093,5
4,0.911241,746686.391066,864.110173,679.419683,6
5,0.911696,840643.883542,916.866339,698.668529,2
6,0.938929,960774.905541,980.19126,698.465285,1


In [42]:
array = [112.995678, 720.791386, 777.988228, 847.512568, 864.110173, 916.866339, 980.191260]
sum(array) / len(array)

745.7793759999998

In [48]:
data['day_of_week'].value_counts()

5    92138
4    91972
2    91686
3    91651
6    91347
7    91075
1    90971
Name: day_of_week, dtype: int64

### `day`

In [21]:
create_models_per_feature('day', data)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.95002,576622.4,759.356576,528.096424,12
1,0.942226,698372.2,835.686636,585.428555,13
2,0.932194,722833.4,850.196104,574.952052,10
3,0.942128,759670.9,871.591041,589.884215,21
4,0.936128,833746.6,913.097276,626.485654,8
5,0.94157,901049.1,949.236058,670.623839,17
6,0.942395,935668.4,967.299532,658.391503,5
7,0.952223,952644.7,976.035178,637.91643,3
8,0.917964,1042235.0,1020.898971,722.378205,23
9,0.922713,1055931.0,1027.585025,752.794479,28


[{'feature_value': 18,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.300000012,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=100, n_jobs=-1,
                num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
                validate_parameters=1, verbosity=None)]},
 {'feature_value': 11,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.30000

### `month`

In [22]:
create_models_per_feature('month', data)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.960271,515222.0,717.78966,504.383295,1
1,0.95838,520112.9,721.188526,511.365049,2
2,0.963164,572581.6,756.691221,529.751864,4
3,0.953197,613837.7,783.477956,565.279606,7
4,0.955648,665434.6,815.74177,591.837144,11
5,0.945419,690877.8,831.190568,567.340243,8
6,0.951385,746666.7,864.098765,601.220947,3
7,0.944367,874599.0,935.199986,630.350703,6
8,0.964603,902126.9,949.803627,659.746905,12
9,0.926495,914834.4,956.469776,701.318233,10


[{'feature_value': 4,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.300000012,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=100, n_jobs=-1,
                num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
                validate_parameters=1, verbosity=None)]},
 {'feature_value': 8,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.3000000

### `year`

In [23]:
create_models_per_feature('year', data)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.962841,544478.379643,737.887783,517.452278,2013
1,0.955524,655461.91223,809.606023,585.003003,2015
2,0.953916,700198.983172,836.778933,602.549676,2014


[{'feature_value': 2013,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.300000012,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=100, n_jobs=-1,
                num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
                validate_parameters=1, verbosity=None)]},
 {'feature_value': 2015,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.3

### `open`

In [24]:
create_models_per_feature('open', data)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.0,2.618027e-32,1.618032e-16,1.618032e-16,0
1,0.915138,826065.7,908.8816,689.5818,1


[{'feature_value': 1,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.300000012,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=100, n_jobs=-1,
                num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
                validate_parameters=1, verbosity=None)]},
 {'feature_value': 0,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.3000000

### `promotion`

In [31]:
create_models_per_feature('promotion', data)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.969484,365813.420749,604.825116,404.902095,0
1,0.923548,892209.347433,944.568339,722.817392,1


[{'feature_value': 0,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.300000012,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=100, n_jobs=-1,
                num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
                validate_parameters=1, verbosity=None)]},
 {'feature_value': 1,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.3000000

In [43]:
array = [604.825116, 944.568339]
sum(array) / len(array)

774.6967275

In [49]:
data['promotion'].value_counts()

0    396220
1    244620
Name: promotion, dtype: int64

### `state_holiday`

In [32]:
create_models_per_feature('state_holiday', data)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.9869,58716.4,242.314679,30.158677,1
1,0.956299,625277.6,790.744937,565.274805,0
2,0.705125,766517.0,875.509565,91.395999,3
3,-0.016362,1709362.0,1307.425719,125.34813,2


[{'feature_value': 0,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.300000012,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=100, n_jobs=-1,
                num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
                validate_parameters=1, verbosity=None)]},
 {'feature_value': 1,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.3000000

In [44]:
array = [242.314679, 790.744937, 875.509565, 1307.425719]
sum(array) / len(array)

803.9987249999999

In [50]:
data['state_holiday'].value_counts()

0    621160
1     12842
3      4214
2      2624
Name: state_holiday, dtype: int64

### `school_holiday`

In [27]:
create_models_per_feature('school_holiday', data)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.950559,735452.1,857.585029,620.034542,0
1,0.886357,1598503.0,1264.319126,957.316088,1


[{'feature_value': 0,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.300000012,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=100, n_jobs=-1,
                num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
                validate_parameters=1, verbosity=None)]},
 {'feature_value': 1,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.3000000

### `store_ID`

In [34]:
store_ID_models = create_models_per_feature('store_ID', data)

             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.4301251106912337
MSE: 4171902.4422201174
RMSE: 2042.5235475313664
MAE: 1736.4534505262213
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.8428451081171793
MSE: 2038973.7526228232
RMSE: 1427.926382073958
MAE: 1241.5319384785948
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.4785693871432979
MSE: 7416050.619688271
RMSE: 2723.242666324151
MAE: 2465.8571255277184
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9276938459870632
MSE: 603762.2768106055
RMSE: 777.0214133539729
MAE: 602.1956109272695
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.6628422536755758
MSE: 9936902.873850534
RMSE: 3152.2853414388956
MAE: 2615.406804758206
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.4849128365187323
MSE: 8372411.786522363
RMSE: 2893.512015963017
MAE: 2197.963693274729
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9810954696172981
MSE: 153876.08405162516
RMSE: 392.27042209632015
MAE: 279.2238417594664
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.452580656660863
MSE: 11294873.700393988
RMSE: 3360.7846852177227
MAE: 2918.9832402186667
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9588328449116214
MSE: 142460.99796646388
RMSE: 377.44005877286514
MAE: 280.47662576238315
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.2264592128994466
MSE: 4608300.865313978
RMSE: 2146.6953359324134
MAE: 1887.15148841093
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9508125119869466
MSE: 501412.0446426089
RMSE: 708.1045435828024
MAE: 503.807901577516
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.8019714206351602
MSE: 1582594.9932324
RMSE: 1258.0123183945377
MAE: 1039.3378703265878
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9662449867666292
MSE: 91009.64279487438
RMSE: 301.67804493345943
MAE: 222.6414536883851
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.8977059736912189
MSE: 668310.5360511077
RMSE: 817.5026214337834
MAE: 639.8332509007947
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9323660710777069
MSE: 405436.9727647442
RMSE: 636.7393287403756
MAE: 490.0724632012347
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.7209459750977896
MSE: 3842583.064073794
RMSE: 1960.2507656097966
MAE: 1720.3102458397548
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9790996724471364
MSE: 140104.23902323245
RMSE: 374.30500801249303
MAE: 266.73430875004806
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9668178913422416
MSE: 433896.8097758369
RMSE: 658.7084406441418
MAE: 485.92511383304753
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9053839629207485
MSE: 777221.4027369481
RMSE: 881.6016122585916
MAE: 684.1927400534352
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9762814932030731
MSE: 101597.35505931916
RMSE: 318.74340002472076
MAE: 223.01008957446132
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9581159125562269
MSE: 340524.4946881823
RMSE: 583.5447666530669
MAE: 443.1875501675684
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.9495624923727886
MSE: 818019.3866175015
RMSE: 904.4442418510395
MAE: 677.0738282991668
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2: 0.6911394022547038
MSE: 1005453.0629444027
RMSE: 1002.72282458534
MAE: 882.6575504653474
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


KeyboardInterrupt: 

In [19]:
data.nunique()

store_ID                1115
day_of_week                7
nb_customers_on_day     3886
open                       2
promotion                  2
state_holiday              4
school_holiday             2
sales                  20129
year                       3
month                     12
day                       31
dtype: int64

## `Filtered`

### `day_of_week`

In [36]:
create_models_per_feature('day_of_week', data_open_days)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.986296,542901.6,736.818555,487.97711,7
1,0.927612,572452.2,756.605707,561.391085,3
2,0.926124,710112.3,842.681627,611.731718,2
3,0.905648,759884.8,871.7137,709.340111,6
4,0.901061,763439.1,873.750002,660.738164,5
5,0.928569,971569.9,985.682459,709.238823,1
6,0.707552,2281033.0,1510.308989,1263.262342,4


[{'feature_value': 4,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.300000012,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=100, n_jobs=-1,
                num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
                validate_parameters=1, verbosity=None)]},
 {'feature_value': 6,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.3000000

### `day`

In [37]:
create_models_per_feature('day', data_open_days)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.909513,716768.5,846.62181,599.432889,12
1,0.912333,801938.8,895.510373,648.540836,27
2,0.891407,854452.0,924.365712,668.078491,8
3,0.898248,872070.5,933.847165,664.841711,21
4,0.896743,884636.3,940.551051,696.292095,13
5,0.902815,911006.2,954.466448,704.023673,28
6,0.86727,963249.9,981.452938,674.176666,25
7,0.889726,1079837.0,1039.152037,772.077076,23
8,0.866295,1151274.0,1072.97449,776.235674,20
9,0.825874,1269828.0,1126.866258,825.405124,26


[{'feature_value': 18,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.300000012,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=100, n_jobs=-1,
                num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
                validate_parameters=1, verbosity=None)]},
 {'feature_value': 11,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.30000

### `promotion`

In [38]:
create_models_per_feature('promotion', data_open_days)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.914486,591734.0,769.242485,585.383678,0
1,0.656246,3429111.0,1851.785879,1565.467251,1


[{'feature_value': 0,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.300000012,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=100, n_jobs=-1,
                num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
                validate_parameters=1, verbosity=None)]},
 {'feature_value': 1,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.3000000

### `state_holiday`

In [39]:
create_models_per_feature('state_holiday', data_open_days)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.938488,593819.0,770.596491,566.378942,0
1,0.935133,1695367.0,1302.06241,975.83825,1
2,0.961076,2822136.0,1679.92143,1227.219599,3
3,-5.667057,91192000.0,9549.450413,6767.018774,2


[{'feature_value': 0,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.300000012,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=100, n_jobs=-1,
                num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
                validate_parameters=1, verbosity=None)]},
 {'feature_value': 1,
  'model': [XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', learning_rate=0.3000000

# `Save models per day_of_week`

In [61]:
def save_model(model):
    with open(f"models/XGBRegressor_sales_model_day{model['feature_value']}.pkl", 'wb') as f:
        pickle.dump(model['model'], f)

[save_model(model) for model in day_of_week_models]

[None, None, None, None, None, None, None]

# `Load models per day_of_week`

In [62]:
with open(f'models/XGBRegressor_sales_model_day1.pkl', "rb") as f:
    XGBRegressor_sales_model_day1_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day2.pkl', "rb") as f:
    XGBRegressor_sales_model_day2_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day3.pkl', "rb") as f:
    XGBRegressor_sales_model_day3_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day4.pkl', "rb") as f:
    XGBRegressor_sales_model_day4_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day5.pkl', "rb") as f:
    XGBRegressor_sales_model_day5_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day6.pkl', "rb") as f:
    XGBRegressor_sales_model_day6_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day7.pkl', "rb") as f:
    XGBRegressor_sales_model_day7_model = pickle.load(f)

# `Predict sales`

In [51]:
validation_data = pd.read_csv('validation_for_students.csv')
validation_data.shape

(71205, 9)

In [52]:
validation_data.head()

Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday
0,7,764,4,2013-12-26,0,0,0,c,1
1,19,22,3,2013-05-22,449,1,0,0,1
2,31,1087,6,2013-06-29,622,1,0,0,0
3,45,139,6,2013-08-17,314,1,0,0,0
4,56,568,1,2014-04-07,356,1,0,0,0


In [55]:
validation_data.columns = validation_data.columns.str.lower()
validation_data.head()

Unnamed: 0,true_index,store_id,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday
0,7,764,4,2013-12-26,0,0,0,c,1
1,19,22,3,2013-05-22,449,1,0,0,1
2,31,1087,6,2013-06-29,622,1,0,0,0
3,45,139,6,2013-08-17,314,1,0,0,0
4,56,568,1,2014-04-07,356,1,0,0,0


In [56]:
validation_data['date'] = pd.to_datetime(validation_data['date'])
validation_data['year'] = validation_data['date'].dt.year
validation_data['month'] = validation_data['date'].dt.month
validation_data['day'] = validation_data['date'].dt.day

validation_data.drop('date', axis=1, inplace=True)
validation_data.head()

Unnamed: 0,true_index,store_id,day_of_week,nb_customers_on_day,open,promotion,state_holiday,school_holiday,year,month,day
0,7,764,4,0,0,0,c,1,2013,12,26
1,19,22,3,449,1,0,0,1,2013,5,22
2,31,1087,6,622,1,0,0,0,2013,6,29
3,45,139,6,314,1,0,0,0,2013,8,17
4,56,568,1,356,1,0,0,0,2014,4,7


In [57]:
validation_data['state_holiday'] = pd.factorize(validation_data['state_holiday'])[0]
validation_data['state_holiday'].value_counts()

1    69050
2     1405
3      475
0      275
Name: state_holiday, dtype: int64

In [58]:
validation_data.isna().sum()

true_index             0
store_id               0
day_of_week            0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
year                   0
month                  0
day                    0
dtype: int64

In [200]:
unique_day_of_week_validation_data = sorted(validation_data['day_of_week'].unique())
models_day_of_week_validation_data = [
    XGBRegressor_sales_model_day1_model,
    XGBRegressor_sales_model_day2_model,
    XGBRegressor_sales_model_day3_model,
    XGBRegressor_sales_model_day4_model,
    XGBRegressor_sales_model_day5_model,
    XGBRegressor_sales_model_day6_model,
    XGBRegressor_sales_model_day7_model
]

def create_prediction_df(day_of_week, model):
    filtered_df = validation_data[validation_data['day_of_week'] == day_of_week]
    predictions = model.predict(filtered_df.drop(['true_index', 'day_of_week'], axis=1))
    df = pd.concat([filtered_df.reset_index(), pd.DataFrame(predictions, columns=['sales'])], axis=1)

    return df

predictions_df_list = [create_prediction_df(day_of_week, models[0]) for day_of_week, models in zip(unique_day_of_week_validation_data, models_day_of_week_validation_data)]

predicted_data = pd.concat(predictions_df_list, axis=0)

In [201]:
incorrect_predicted_data = predicted_data[(predicted_data['open'] == 0) & (predicted_data['sales'] != 0)]
incorrect_predicted_data['sales'] = 0
incorrect_predicted_data.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incorrect_predicted_data['sales'] = 0


(12100, 13)

In [202]:
correct_predicted_data = predicted_data[(predicted_data['open'] == 0) & (predicted_data['sales'] == 0) | (predicted_data['open'] == 1)]
correct_predicted_data.shape

(59105, 13)

In [203]:
predicted_data = pd.concat([incorrect_predicted_data, correct_predicted_data], axis=0)
predicted_data.shape

(71205, 13)

In [204]:
predicted_data.rename(columns={
    'true_index' : 'True_index',
    'sales': 'Sales'
}, inplace=True)
predicted_data = predicted_data[['True_index', 'Sales']].sort_values(by='True_index')
predicted_data.head()

Unnamed: 0,True_index,Sales
0,7,0.0
0,19,23864.761719
0,31,30475.923828
1,45,30475.923828
0,56,33753.238281


In [205]:
predicted_data.to_csv('predicted_data.csv', index=False)