In [207]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import pickle
from IPython.display import clear_output

# `Clean dataset`

In [208]:
data = pd.read_csv('sales.csv')
data.shape

(640840, 10)

In [209]:
data.head()

Unnamed: 0.1,Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,425390,366,4,2013-04-18,517,1,0,0,0,4422
1,291687,394,6,2015-04-11,694,1,0,0,0,8297
2,411278,807,4,2013-08-29,970,1,1,0,0,9729
3,664714,802,2,2013-05-28,473,1,1,0,0,6513
4,540835,726,4,2013-10-10,1068,1,1,0,0,10882


In [210]:
data = data.drop('Unnamed: 0', axis=1)
data.shape

(640840, 9)

In [211]:
data.head()

Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,366,4,2013-04-18,517,1,0,0,0,4422
1,394,6,2015-04-11,694,1,0,0,0,8297
2,807,4,2013-08-29,970,1,1,0,0,9729
3,802,2,2013-05-28,473,1,1,0,0,6513
4,726,4,2013-10-10,1068,1,1,0,0,10882


In [212]:
data.dtypes

store_ID                int64
day_of_week             int64
date                   object
nb_customers_on_day     int64
open                    int64
promotion               int64
state_holiday          object
school_holiday          int64
sales                   int64
dtype: object

In [213]:
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

data.drop('date', axis=1, inplace=True)

In [214]:
data['state_holiday'] = pd.factorize(data['state_holiday'])[0]

In [215]:
data.head()

Unnamed: 0,store_ID,day_of_week,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales,year,month,day
0,366,4,517,1,0,0,0,4422,2013,4,18
1,394,6,694,1,0,0,0,8297,2015,4,11
2,807,4,970,1,1,0,0,9729,2013,8,29
3,802,2,473,1,1,0,0,6513,2013,5,28
4,726,4,1068,1,1,0,0,10882,2013,10,10


In [216]:
data.dtypes

store_ID               int64
day_of_week            int64
nb_customers_on_day    int64
open                   int64
promotion              int64
state_holiday          int64
school_holiday         int64
sales                  int64
year                   int64
month                  int64
day                    int64
dtype: object

# `Filter dataset`

In [217]:
data_open_days = data[data['open'] == 1].copy()

data_open_days.drop('open', axis=1, inplace=True)

data_open_days.shape

(532016, 10)

In [218]:
data_without_store_ID = data.drop('store_ID', axis=1)
data_without_store_ID.shape

(640840, 10)

In [219]:
data_open_days_without_store_ID = data_open_days.drop('store_ID', axis=1)
data_open_days_without_store_ID.shape

(532016, 9)

# `Check correlations`

In [None]:
def display_heatmap(dataframe):
    corr=dataframe.corr()

    mask=np.triu(np.ones_like(corr, dtype=bool))     # generate a mask for the upper triangle

    f, ax=plt.subplots(figsize=(11, 9))                 # set up the matplotlib figure

    cmap=sns.diverging_palette(220, 10, as_cmap=True)   # generate a custom diverging colormap

    sns.heatmap(corr, mask=mask, cmap=cmap,             # draw the heatmap with the mask and correct aspect ratio
                vmax=.3, center=0, square=True,
                linewidths=.5, cbar_kws={"shrink": .5})

## `Not filtered - without store_ID`

In [None]:
display_heatmap(data_without_store_ID)

## `Not filtered - with store_ID`

In [None]:
display_heatmap(data)

## `Filtered - without store_ID`

In [None]:
display_heatmap(data_open_days_without_store_ID)

## `Filtered - with store_ID`

In [None]:
display_heatmap(data_open_days)

# `Test algorithms`

In [220]:
def compare_models(models, dataframe):
    fitted_models = []

    r2_list = []
    mse_list = []
    rmse_list = []
    mae_list = []

    for model in models:

        print(f"====== {model} ======")

        full_df = dataframe.sample(frac=1, random_state=10)

        y = full_df['sales']
        X = full_df.drop('sales', axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

        # Scaling data = X_train
        X_train_transformer = MinMaxScaler().fit(X_train)
        X_train_normalized = X_train_transformer.transform(X_train)
        X_train_normalized = pd.DataFrame(X_train_normalized)

        # Scaling data = X_test
        X_test_transformer = MinMaxScaler().fit(X_test)
        X_test_normalized = X_test_transformer.transform(X_test)
        X_test_normalized = pd.DataFrame(X_test_normalized)

        model.fit(X_train_normalized, y_train)

        fitted_models.append(model)

        # Make predictions on the test data
        y_pred = model.predict(X_test_normalized)

        # R2 validation
        r2 = r2_score(y_test, y_pred)
        print("R2:", r2)

        # MSE validation
        mse=mean_squared_error(y_test, y_pred)
        print("MSE:", mse)

        # RMSE validation
        rmse = np.sqrt(mse)
        print("RMSE:", rmse)

        # MAE validation
        mae=mean_absolute_error(y_test, y_pred)
        print("MAE:", mae)

        r2_list.append(r2)
        mse_list.append(mse)
        rmse_list.append(rmse)
        mae_list.append(mae)


    summary = {
        'Algorithm': model,
        'R2': r2_list,
        'MSE': mse_list,
        'RMSE': rmse_list,
        'MAE': mae_list
    }
    summary = pd.DataFrame(summary)
    # summary.columns = models

    return summary, fitted_models

In [None]:
models_to_test = [
    LinearRegression(n_jobs=-1),
    Lasso(random_state=10),
    Ridge(random_state=10),
    ElasticNet(random_state=10),
    XGBRegressor(),
    LGBMRegressor(n_jobs=-1, random_state=10),
    DecisionTreeRegressor()
]

models_to_test_slow = [
    KNeighborsRegressor(n_jobs=-1),
    MLPRegressor(random_state=10),
    RandomForestRegressor(n_jobs=-1, random_state=10)
]

## `Not filtered - without store_ID`

In [None]:
data_without_store_ID_summary, data_without_store_ID_models = compare_models(models=models_to_test, dataframe=data_without_store_ID)

In [None]:
# ====== LGBMRegressor(random_state=10) ======
# R2: 0.8717880480454334
# MSE: 1920020.1403067634
# RMSE: 1385.6479135432505
# MAE: 934.6317351470149

In [None]:
data_without_store_ID_slow_summary, data_without_store_ID_models_slow = compare_models(models=models_to_test_slow, dataframe=data_without_store_ID)

In [None]:
# ====== MLPRegressor(random_state=10) ======
# R2: 0.8604318711437954
# MSE: 2090082.977941105
# RMSE: 1445.711927716274
# MAE: 980.7829302460166

## `Not filtered - with store_ID`

In [None]:
data_with_store_ID_summary, data_with_store_ID_models = compare_models(models=models_to_test, dataframe=data)

In [None]:
# ====== XGBRegressor() ======
# R2: 0.9494461028054196
# MSE: 757062.8112657383
# RMSE: 870.0935646617198
# MAE: 628.8746932658818

In [None]:
data_with_store_ID_slow_summary, data_with_store_ID_models_slow = compare_models(models=models_to_test_slow, dataframe=data)

In [None]:
# ====== RandomForestRegressor(n_jobs=-1, random_state=10) ======
# R2: 0.9228395853784788
# MSE: 1155504.9888035383
# RMSE: 1074.9441793895803
# MAE: 724.3297227857188

## `Filtered - without store_ID`

In [None]:
data_open_days_without_store_ID_summary, data_open_days_without_store_ID_models = compare_models(models=models_to_test, dataframe=data_open_days_without_store_ID)

In [None]:
# ====== LinearRegression(n_jobs=-1) ======
# R2: 0.7115516521122665
# MSE: 2807809.682227139
# RMSE: 1675.6520170450483
# MAE: 1206.1741847849569

In [None]:
data_open_days_without_store_ID_slow_summary, data_open_days_without_store_ID_models_slow = compare_models(models=models_to_test_slow, dataframe=data_open_days_without_store_ID)

In [None]:
# ====== MLPRegressor(random_state=10) ======
# R2: 0.7423804028651348
# MSE: 2507716.9082911913
# RMSE: 1583.5772504968588
# MAE: 1168.3293276782179

## `Filtered - with store_ID`

In [None]:
data_open_days_with_store_ID_summary, data_open_days_with_store_ID_models = compare_models(models=models_to_test, dataframe=data_open_days)

In [None]:
# ====== XGBRegressor() ======
# R2: 0.9151376668371063
# MSE: 826065.6802371338
# RMSE: 908.8815545697546
# MAE: 689.5817966818616

In [None]:
data_open_days_with_store_ID_slow_summary, data_open_days_with_store_ID_models_slow = compare_models(models=models_to_test_slow, dataframe=data_open_days)

# `For now : Not filtered - with store_ID -> data_with_store_ID_models, XGBRegressor()`

# `Try find best parameters`

In [None]:
def find_best_parameters(model, param_grid, nb_cross_validations):
    grid_search_decision_tree_classifier = GridSearchCV(estimator = model, param_grid = param_grid, cv = nb_cross_validations, n_jobs=-1)

    full_df = data.sample(frac=1, random_state=10)

    y = full_df['sales']
    X = full_df.drop('sales', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

    # Scaling data = X_train
    X_train_transformer = MinMaxScaler().fit(X_train)
    X_train_normalized = X_train_transformer.transform(X_train)
    X_train_normalized = pd.DataFrame(X_train_normalized)

    grid_search_decision_tree_classifier.fit(X_train_normalized, y_train)

    print(f"Best parameters: {grid_search_decision_tree_classifier.best_params_}")
    print(f"Best score: {grid_search_decision_tree_classifier.best_score_}")

    return grid_search_decision_tree_classifier

In [None]:
grid = {
    'booster': ['gbtree', 'gblinear', 'dart', 'gbtree', 'dart'],
    'validate_parameters': [True, False]
}

In [None]:
best_parameters_XGBRegressor_5_cv = find_best_parameters(model=XGBRegressor(), param_grid=grid, nb_cross_validations=5)

In [None]:
best_parameters_XGBRegressor_10_cv = find_best_parameters(model=XGBRegressor(), param_grid=grid, nb_cross_validations=10)

# `Best parameters for XGBRegressor seem to be : 'booster': 'gbtree', 'validate_parameters': True`

In [None]:
XGBRegressor_with_best_parameters_summary, XGBRegressor_with_best_parameters_models = compare_models(models=[XGBRegressor(booster='gbtree', validate_parameters=True, n_jobs=-1)], dataframe=data)

In [None]:
# ====== XGBRegressor(base_score=None, booster='gbtree', colsample_bylevel=None,
#              colsample_bynode=None, colsample_bytree=None,
#              enable_categorical=False, gamma=None, gpu_id=None,
#              importance_type=None, interaction_constraints=None,
#              learning_rate=None, max_delta_step=None, max_depth=None,
#              min_child_weight=None, missing=nan, monotone_constraints=None,
#              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
#              predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
#              scale_pos_weight=None, subsample=None, tree_method=None,
#              validate_parameters=True, verbosity=None) ======
# /opt/anaconda3/lib/python3.9/site-packages/xgboost/data.py:250: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
#   elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):

# R2: 0.9494461028054196
# MSE: 757062.8112657383
# RMSE: 870.0935646617198
# MAE: 628.8746932658818

In [None]:
XGBRegressor_default_parameters_summary, XGBRegressor_default_parameters_models = compare_models(models=[XGBRegressor(n_jobs=-1)], dataframe=data)

In [None]:
# ====== XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
#              colsample_bynode=None, colsample_bytree=None,
#              enable_categorical=False, gamma=None, gpu_id=None,
#              importance_type=None, interaction_constraints=None,
#              learning_rate=None, max_delta_step=None, max_depth=None,
#              min_child_weight=None, missing=nan, monotone_constraints=None,
#              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
#              predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
#              scale_pos_weight=None, subsample=None, tree_method=None,
#              validate_parameters=None, verbosity=None) ======

# R2: 0.9494461028054196
# MSE: 757062.8112657383
# RMSE: 870.0935646617198
# MAE: 628.8746932658818

# `Save XGBRegressor_default_parameters_model`

In [None]:
with open(f'models/XGBRegressor_sales_model.pkl', 'wb') as f:
    pickle.dump(XGBRegressor_default_parameters_models[0], f)

# `Load XGBRegressor_default_parameters_model`

In [221]:
with open(f'models/XGBRegressor_sales_model.pkl', "rb") as f:
    sales_model = pickle.load(f)

In [None]:
data['day_of_week'].unique()

# `Try create models per unique values of specific feature`

In [None]:
def create_models_per_feature(feature_name, dataframe):
    unique_values = dataframe[feature_name].unique()

    # Split dataframe into dataframes per day_of_week
    filtered_df_list = [
        {
            f'{feature_name}': value, 
            'dataframe': dataframe[dataframe[feature_name] == value].drop(feature_name, axis=1)
        }
                                                    for value in unique_values
    ]

    models_per_feature_results = [
        {
            f'{feature_name}': obj[feature_name],
            'summary_and_fitted_model': compare_models(models=[XGBRegressor(n_jobs=-1)], dataframe=obj['dataframe'])
        }
        for obj in filtered_df_list
    ]

    # Create results dataframe
    models_per_feature_summaries = [result['summary_and_fitted_model'][0].drop('Algorithm', axis=1).assign(feature_value = result[feature_name]) for result in models_per_feature_results]
    models_per_feature_comparison = pd.concat(models_per_feature_summaries, axis=0).sort_values(by='RMSE').reset_index().drop('index', axis=1)
    
    clear_output()
    display(models_per_feature_comparison)

    models = [
        {
            'feature_value': result[feature_name],
            'model': result['summary_and_fitted_model'][1]
        }
            for result in models_per_feature_results
    ]

    return models

## `Not filtered`

### `day_of_week`

In [258]:
day_of_week_models = create_models_per_feature('day_of_week', data)

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.995456,12768.023144,112.995678,12.142055,7
1,0.950169,519540.221714,720.791386,536.475006,4
2,0.928833,605265.683332,777.988228,578.105801,3
3,0.926281,718277.552113,847.512568,643.58093,5
4,0.911241,746686.391066,864.110173,679.419683,6
5,0.911696,840643.883542,916.866339,698.668529,2
6,0.938929,960774.905541,980.19126,698.465285,1


In [259]:
array = [112.995678, 720.791386, 777.988228, 847.512568, 864.110173, 916.866339, 980.191260]
sum(array) / len(array)

745.7793759999998

### `day`

In [None]:
create_models_per_feature('day', data)

### `month`

In [None]:
create_models_per_feature('month', data)

### `year`

In [None]:
create_models_per_feature('year', data)

### `open`

In [None]:
create_models_per_feature('open', data)

### `promotion`

In [None]:
create_models_per_feature('promotion', data)

In [None]:
array = [604.825116, 944.568339]
sum(array) / len(array)

In [None]:
data['promotion'].value_counts()

### `state_holiday`

In [None]:
create_models_per_feature('state_holiday', data)

In [None]:
array = [242.314679, 790.744937, 875.509565, 1307.425719]
sum(array) / len(array)

In [None]:
data['state_holiday'].value_counts()

### `school_holiday`

In [None]:
create_models_per_feature('school_holiday', data)

### `store_ID`

In [None]:
store_ID_models = create_models_per_feature('store_ID', data)

In [None]:
data.nunique()

## `Filtered`

### `day_of_week`

In [None]:
create_models_per_feature('day_of_week', data_open_days)

### `day`

In [None]:
create_models_per_feature('day', data_open_days)

### `promotion`

In [None]:
create_models_per_feature('promotion', data_open_days)

### `state_holiday`

In [None]:
create_models_per_feature('state_holiday', data_open_days)

# `Save models per day_of_week`

In [None]:
def save_model(model):
    with open(f"models/XGBRegressor_sales_model_day{model['feature_value']}.pkl", 'wb') as f:
        pickle.dump(model['model'], f)

[save_model(model) for model in day_of_week_models]

# `Load models per day_of_week`

In [222]:
with open(f'models/XGBRegressor_sales_model_day1.pkl', "rb") as f:
    XGBRegressor_sales_model_day1_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day2.pkl', "rb") as f:
    XGBRegressor_sales_model_day2_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day3.pkl', "rb") as f:
    XGBRegressor_sales_model_day3_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day4.pkl', "rb") as f:
    XGBRegressor_sales_model_day4_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day5.pkl', "rb") as f:
    XGBRegressor_sales_model_day5_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day6.pkl', "rb") as f:
    XGBRegressor_sales_model_day6_model = pickle.load(f)

with open(f'models/XGBRegressor_sales_model_day7.pkl', "rb") as f:
    XGBRegressor_sales_model_day7_model = pickle.load(f)

# `Predict sales`

In [272]:
def predict_data_feature_value(prediction_dataframe, model, feature_name, feature_value):
    filtered_df = prediction_dataframe[prediction_dataframe[feature_name] == feature_value]
    filtered_df_normalized = MinMaxScaler().fit_transform(filtered_df.drop(['true_index', feature_name], axis=1))
    predictions = model.predict(filtered_df_normalized)
    df = pd.concat([filtered_df.reset_index(), pd.DataFrame(predictions, columns=['sales'])], axis=1)

    return df

def predict_data(model_dataframe, prediction_dataframe, feature_name):
    models = create_models_per_feature(feature_name, model_dataframe)
    feature_unique_values = sorted(prediction_dataframe[feature_name].unique())
    predictions_list = [predict_data_feature_value(prediction_dataframe, models['model'][0], feature_name, feature_value) for feature_value, models in zip(feature_unique_values, models)]
    return pd.concat(predictions_list, axis=0)

In [304]:
def get_cleaned_predicted_data(dataframe):
    incorrect_predicted_data = dataframe[dataframe['sales'] < 0]
    incorrect_predicted_data['sales'] = 0

    print(f"Shape incorrect_predictions: {incorrect_predicted_data.shape}")

    correct_predicted_data = dataframe[(dataframe['sales'] >= 0)]
    correct_predicted_data.shape

    print(f"Shape correct_predictions: {correct_predicted_data.shape}")

    cleaned_predictions = pd.concat([incorrect_predicted_data, correct_predicted_data], axis=0)

    print(f"Shape cleaned_predictions: {cleaned_predictions.shape}")

    return cleaned_predictions

In [342]:
def transform_and_save_prediction_dataframe(dataframe, rounded_type):
    # Remove useless columns
    dataframe = dataframe.rename(columns={
        'true_index' : 'True_index',
        'sales': 'Sales'
    })
    # Round values
    if rounded_type == 'round':
        dataframe['Sales'] = dataframe['Sales'].round()
        dataframe['Sales'] = dataframe['Sales'].astype(int)
    elif rounded_type == 'floor':
        dataframe['Sales'] = dataframe['Sales'].apply(np.floor)
        dataframe['Sales'] = dataframe['Sales'].astype(int)

    dataframe = dataframe[['True_index', 'Sales']].sort_values(by='True_index')
    dataframe.to_csv('predicted_data.csv', index=False)

    return dataframe

## `Factorize state_holiday`

In [274]:
validation_data = pd.read_csv('validation_for_students.csv')

validation_data.columns = validation_data.columns.str.lower()

validation_data['date'] = pd.to_datetime(validation_data['date'])
validation_data['year'] = validation_data['date'].dt.year
validation_data['month'] = validation_data['date'].dt.month
validation_data['day'] = validation_data['date'].dt.day

validation_data.drop('date', axis=1, inplace=True)

validation_data['state_holiday'] = pd.factorize(validation_data['state_holiday'])[0]

print(f"Shape: {validation_data.shape}")
validation_data.head()

Shape: (71205, 11)


Unnamed: 0,true_index,store_id,day_of_week,nb_customers_on_day,open,promotion,state_holiday,school_holiday,year,month,day
0,7,764,4,0,0,0,0,1,2013,12,26
1,19,22,3,449,1,0,1,1,2013,5,22
2,31,1087,6,622,1,0,1,0,2013,6,29
3,45,139,6,314,1,0,1,0,2013,8,17
4,56,568,1,356,1,0,1,0,2014,4,7


In [273]:
predicted_data = predict_data(data, validation_data, 'day_of_week')
predicted_data.shape

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.995456,12768.023144,112.995678,12.142055,7
1,0.950169,519540.221714,720.791386,536.475006,4
2,0.928833,605265.683332,777.988228,578.105801,3
3,0.926281,718277.552113,847.512568,643.58093,5
4,0.911241,746686.391066,864.110173,679.419683,6
5,0.911696,840643.883542,916.866339,698.668529,2
6,0.938929,960774.905541,980.19126,698.465285,1


(71205, 13)

In [306]:
array = [112.995678, 720.791386, 777.988228, 847.512568, 864.110173, 916.866339, 980.191260]
np.mean(array)

745.7793759999998

In [283]:
cleaned_predicted_data = get_cleaned_predicted_data(predicted_data)
cleaned_predicted_data.head()

Shape incorrect_predictions: (5284, 13)
Shape correct_predictions: (65921, 13)
Shape cleaned_predictions: (71205, 13)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incorrect_predicted_data['sales'] = 0


Unnamed: 0,index,true_index,store_id,day_of_week,nb_customers_on_day,open,promotion,state_holiday,school_holiday,year,month,day,sales
111,777,7984,1035,1,0,0,0,3,1,2014,4,21,0.0
129,910,9444,72,1,0,0,0,3,1,2013,4,1,0.0
280,2079,20865,858,1,0,0,0,3,1,2013,4,1,0.0
298,2243,22375,218,1,0,0,0,3,1,2013,4,1,0.0
324,2389,23701,105,1,0,0,0,3,1,2014,4,21,0.0


In [294]:
cleaned_predicted_data = transform_and_save_prediction_dataframe(cleaned_predicted_data, 'floor')
cleaned_predicted_data.head()

Unnamed: 0,True_index,Sales
0,7,0.031591
0,19,3977.55249
0,31,7186.211914
1,45,4211.681641
0,56,3944.895752


In [295]:
cleaned_predicted_data['Sales'].mean()

5599.109014585869

In [296]:
max_csv = pd.read_csv('predicted_data_max.csv')
max_csv['0'].mean()

5770.694940110116

## `OneHotEncode state_holiday`

In [334]:
validation_data = pd.read_csv('validation_for_students.csv')

validation_data.columns = validation_data.columns.str.lower()

validation_data['date'] = pd.to_datetime(validation_data['date'])
validation_data['year'] = validation_data['date'].dt.year
validation_data['month'] = validation_data['date'].dt.month
validation_data['day'] = validation_data['date'].dt.day

validation_data.drop('date', axis=1, inplace=True)

validation_data = pd.get_dummies(validation_data, columns=['state_holiday'], prefix='state_holiday')

print(f"Shape: {validation_data.shape}")
validation_data.head()

Shape: (71205, 14)


Unnamed: 0,true_index,store_id,day_of_week,nb_customers_on_day,open,promotion,school_holiday,year,month,day,state_holiday_0,state_holiday_a,state_holiday_b,state_holiday_c
0,7,764,4,0,0,0,1,2013,12,26,0,0,0,1
1,19,22,3,449,1,0,1,2013,5,22,1,0,0,0
2,31,1087,6,622,1,0,0,2013,6,29,1,0,0,0
3,45,139,6,314,1,0,0,2013,8,17,1,0,0,0
4,56,568,1,356,1,0,0,2014,4,7,1,0,0,0


In [335]:
data_one_hot_encoded = pd.read_csv('sales.csv')

data_one_hot_encoded = data_one_hot_encoded.drop('Unnamed: 0', axis=1)

data_one_hot_encoded['date'] = pd.to_datetime(data_one_hot_encoded['date'])
data_one_hot_encoded['year'] = data_one_hot_encoded['date'].dt.year
data_one_hot_encoded['month'] = data_one_hot_encoded['date'].dt.month
data_one_hot_encoded['day'] = data_one_hot_encoded['date'].dt.day

data_one_hot_encoded.drop('date', axis=1, inplace=True)

data_one_hot_encoded = pd.get_dummies(data_one_hot_encoded, columns=['state_holiday'], prefix='state_holiday')

data_one_hot_encoded.head()

Unnamed: 0,store_ID,day_of_week,nb_customers_on_day,open,promotion,school_holiday,sales,year,month,day,state_holiday_0,state_holiday_a,state_holiday_b,state_holiday_c
0,366,4,517,1,0,0,4422,2013,4,18,1,0,0,0
1,394,6,694,1,0,0,8297,2015,4,11,1,0,0,0
2,807,4,970,1,1,0,9729,2013,8,29,1,0,0,0
3,802,2,473,1,1,0,6513,2013,5,28,1,0,0,0
4,726,4,1068,1,1,0,10882,2013,10,10,1,0,0,0


In [336]:
one_hot_encoded_predicted_data = predict_data(data_one_hot_encoded, validation_data, 'day_of_week')
one_hot_encoded_predicted_data.shape

Unnamed: 0,R2,MSE,RMSE,MAE,feature_value
0,0.995456,12768.023144,112.995678,12.142055,7
1,0.951084,510000.329951,714.143074,529.28729,4
2,0.927266,618595.852519,786.508647,582.31351,3
3,0.926778,713429.245514,844.647409,641.613349,5
4,0.911241,746686.391066,864.110173,679.419683,6
5,0.911696,840643.6998,916.866239,698.66789,2
6,0.940662,933499.800487,966.177934,691.350892,1


(71205, 16)

In [337]:
array = [112.995678, 714.143074, 786.508647, 844.647409, 864.110173, 916.866239, 966.177934]
np.mean(array)

743.6355934285715

In [338]:
one_hot_encoded_cleaned_predicted_data = get_cleaned_predicted_data(one_hot_encoded_predicted_data)
one_hot_encoded_cleaned_predicted_data.head()

Shape incorrect_predictions: (5349, 16)
Shape correct_predictions: (65856, 16)
Shape cleaned_predictions: (71205, 16)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incorrect_predicted_data['sales'] = 0


Unnamed: 0,index,true_index,store_id,day_of_week,nb_customers_on_day,open,promotion,school_holiday,year,month,day,state_holiday_0,state_holiday_a,state_holiday_b,state_holiday_c,sales
111,777,7984,1035,1,0,0,0,1,2014,4,21,0,0,1,0,0.0
129,910,9444,72,1,0,0,0,1,2013,4,1,0,0,1,0,0.0
280,2079,20865,858,1,0,0,0,1,2013,4,1,0,0,1,0,0.0
298,2243,22375,218,1,0,0,0,1,2013,4,1,0,0,1,0,0.0
324,2389,23701,105,1,0,0,0,1,2014,4,21,0,0,1,0,0.0


In [346]:
one_hot_encoded_cleaned_predicted_data = transform_and_save_prediction_dataframe(one_hot_encoded_cleaned_predicted_data, 'floor')
one_hot_encoded_cleaned_predicted_data.head()

Unnamed: 0,True_index,Sales
0,7,0
0,19,3978
0,31,7186
1,45,4213
0,56,3956


In [347]:
one_hot_encoded_cleaned_predicted_data['Sales'].mean()

5658.671343304543