# Decathlon Turnover - Forecasting Model

## Get Started

In [26]:
# import packages
import datetime

import numpy as np
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from statsmodels.tsa.arima.model import ARIMA

In [2]:
# data folder locations (relative paths, resolved against the current working directory)
raw_data_folder = "../data/raw"
processed_data_folder = "../data/processed"

## Data Setup

In [3]:
# Feature columns grouped by how transform() handles them.
# categorical features: one-hot encoded
cat_features = [
    "but_num_business_unit",
    "dpt_num_department",
    "but_postcode",
    "but_region_idr_region",
    "zod_idr_zone_dgr",
]
# numerical features: appended to the feature matrix unchanged
num_features = [
    "year",
    "month",
    "week_of_year",
    "day_of_week",
    "quarter",
    "but_latitude",
    "but_longitude",
]

# fraction of the (day_id-sorted) rows used for training; the remaining rows form the validation set
train_size = 0.7

In [4]:
# load the processed train and test datasets (UTF-8, semicolon-separated CSV files)
df_train = pd.read_csv(f"{processed_data_folder}/train.csv", encoding="utf-8", sep=";")
df_test = pd.read_csv(f"{processed_data_folder}/test.csv", encoding="utf-8", sep=";")

In [5]:
# Order the rows by day_id so that the positional train/validation split
# performed later cuts the data along the time axis (day_id values are
# ISO-formatted dates, so string order equals chronological order).
df_train = df_train.sort_values(by="day_id").reset_index(drop=True)
# A former `df_train.sample(frac=0.4)` call was removed here: its result was
# never assigned or displayed, so it only cost time and memory.
# preview the first rows
df_train.head()

Unnamed: 0,day_id,but_num_business_unit,dpt_num_department,turnover,year,month,week_of_year,day_of_week,quarter,but_postcode,but_latitude,but_longitude,but_region_idr_region,zod_idr_zone_dgr
0,2012-12-29,54,73,53.337413,2012,12,52,5,4,13127,43.436266,5.256322,71,10
1,2012-12-29,255,73,0.0,2012,12,52,5,4,73200,45.656025,6.369133,51,4
2,2012-12-29,812,73,43.619478,2012,12,52,5,4,56300,48.054208,-2.946356,7,6
3,2012-12-29,24,73,5.237134,2012,12,52,5,4,59494,50.38001,3.475574,65,4
4,2012-12-29,201,127,825.383999,2012,12,52,5,4,13546,43.508418,5.406423,71,10


## Feature Engineering

In [None]:
def save_encoder(encoder: "OneHotEncoder", path: str = "../models/encoder") -> None:
    """Serialize ``encoder`` with pickle and write it to ``path``.

    Args:
        encoder: The object to persist (the notebook passes the
            ``OneHotEncoder`` fitted in ``transform``).
        path: Destination file. Defaults to ``"../models/encoder"``, the
            location this helper has always written to. Missing parent
            directories are created so a fresh checkout does not fail.

    Raises:
        OSError: If the destination cannot be created or written.
    """
    # Local import keeps this cell self-contained.
    from pathlib import Path

    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("wb") as encoder_file:
        pickle.dump(encoder, encoder_file)

In [6]:
def transform(
    df: pd.DataFrame, cat_features: list[str], num_features: list[str]
) -> tuple[np.ndarray, np.ndarray]:
    """Build the feature matrix and target array from ``df``.

    The categorical columns are one-hot encoded with a newly fitted
    ``OneHotEncoder``, which is then persisted through ``save_encoder``.
    The numerical columns are appended, unchanged, after the encoded ones.

    NOTE: every call refits the encoder and overwrites the saved one, so
    the column layout of the result depends on the categories present in
    ``df``.

    Args:
        df: Frame holding a ``turnover`` column plus every column named in
            ``cat_features`` and ``num_features``.
        cat_features: Names of the columns to one-hot encode.
        num_features: Names of the columns to pass through as-is.

    Returns:
        A ``(features, target)`` pair of NumPy arrays. ``features`` is a
        dense 2-D array (encoded categorical columns first, numerical
        columns last); ``target`` is the ``turnover`` column as a column
        vector of shape ``(n_rows, 1)``.

    Raises:
        KeyError: If ``turnover`` or any requested feature column is
            missing from ``df``.
    """
    target = df[["turnover"]].to_numpy()
    # Drop the target before selecting features so it can never be picked
    # up as an input column.
    features = df.drop("turnover", axis=1)
    cat_block = features[cat_features]
    num_block = features[num_features].to_numpy()

    # Categories unseen at fit time are encoded as all zeros instead of
    # raising when the encoder is reused.
    encoder = OneHotEncoder(handle_unknown="ignore")
    # Densify the encoder output so it can be concatenated with the
    # numerical block.
    encoded_cat = encoder.fit_transform(cat_block).toarray()
    save_encoder(encoder)

    return np.concatenate((encoded_cat, num_block), axis=1), target

In [7]:
# split data to train and validation sets
# Positional hold-out split: df_train is already sorted by day_id, so the
# leading `train_size` share of the rows trains the model and the trailing
# rows validate it.
shape = df_train.shape
X, y = transform(df_train, cat_features, num_features)
(x_train, x_val), (y_train, y_val) = (
    np.split(block, [round(train_size * shape[0])]) for block in (X, y)
)

## Model Training

In [8]:
# shapes of the validation and training feature matrices, in that order
x_val.shape, x_train.shape

((83316, 678), (194403, 678))

In [9]:
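# NOTE: training is commented out; the next cell loads a previously fitted model from disk instead.
# When re-enabling it, consider passing y_train.ravel(): y_train is a column vector of shape
# (n_rows, 1), while scikit-learn expects a 1-D target and warns otherwise.
# regr = RandomForestRegressor(max_depth=10, random_state=42)
# regr.fit(x_train, y_train)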

  regr.fit(x_train, y_train)


In [10]:
# Restore the previously saved model. Despite the .joblib extension the file
# is read with pickle, matching how the notebook writes it.
# pickle.load can execute arbitrary code: only load model files you trust.
with open("../models/turnover_prediction.joblib", "rb") as model_file:
    regr = pickle.load(model_file)

In [11]:
# Persist the model with pickle (the .joblib extension is only part of the
# file name; this cell does not use the joblib library).
with open("../models/turnover_prediction.joblib", "wb") as model_file:
    pickle.dump(regr, model_file)

## Evaluation

In [20]:
# predict turnover for the validation rows
y_pred = regr.predict(x_val)

In [23]:
# compare prediction and target shapes (predictions are 1-D, targets are a column vector)
y_pred.shape, y_val.shape

((83316,), (83316, 1))

In [24]:
# mean squared error on the validation set
mean_squared_error(y_val, y_pred)

1134361.2939378659

In [27]:
# mean absolute error on the validation set
mean_absolute_error(y_val, y_pred)

393.7410072904008