In [43]:
import pandas as pd

In [44]:
data = pd.read_csv("cleaned_sales_data.csv")
data.head()

Unnamed: 0,date,units_sold,price,stock_available,store_id_str_02,store_id_str_03,product_id_pdt_002,product_id_pdt_003,product_id_pdt_004,product_id_pdt_005,...,product_category_Household,product_category_Snack,store_location_Los Angeles,store_location_New York,weekday_Monday,weekday_Saturday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,revenue
0,2023-01-02,50,13.54,51.0,False,False,False,False,False,True,...,False,False,False,True,True,False,False,False,False,677.0
1,2023-01-02,69,27.44,92.0,True,False,False,False,False,True,...,False,False,True,False,True,False,False,False,False,1893.36
2,2023-01-02,60,27.75,14.0,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,1665.0
3,2023-01-03,16,7.27,71.0,False,False,False,True,False,False,...,False,False,False,True,False,False,False,True,False,116.32
4,2023-01-03,46,29.15,60.0,True,False,False,False,True,False,...,True,False,True,False,False,False,False,True,False,1340.9


#### Feature Target Selection

In [45]:
X = data.drop(['units_sold'], axis=1)
y = data['units_sold'] # units sales as target to predict

In [46]:
y.head(8)

0    50
1    69
2    60
3    16
4    46
5    82
6    59
7    97
Name: units_sold, dtype: int64

In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

In [48]:
X_train.head()

Unnamed: 0,date,price,stock_available,store_id_str_02,store_id_str_03,product_id_pdt_002,product_id_pdt_003,product_id_pdt_004,product_id_pdt_005,product_category_Dairy,...,product_category_Household,product_category_Snack,store_location_Los Angeles,store_location_New York,weekday_Monday,weekday_Saturday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,revenue
105,2023-02-15,45.34,44.0,False,True,False,False,True,False,False,...,True,False,False,False,False,False,False,False,True,4352.64
17,2023-01-09,30.77,29.0,True,False,True,False,False,False,False,...,False,True,True,False,True,False,False,False,False,430.78
64,2023-01-28,25.73,46.0,True,False,True,False,False,False,False,...,False,True,True,False,False,True,False,False,False,1054.93
349,2023-06-02,15.55,26.0,True,False,False,False,True,False,False,...,True,False,True,False,False,False,False,False,False,808.6
116,2023-02-21,24.56,80.0,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,761.36


In [49]:
y_train.head()

105    96
17     14
64     41
349    52
116    31
Name: units_sold, dtype: int64

#### Modelling using XGBoost Regressor

In [50]:
# !pip install xgboost

In [51]:
from xgboost import XGBRegressor


xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:date: object

**Error means that XGBoost requires all input features to be numeric. Here 'Date' column is object so i need to change it into numeric**

In [52]:
print(X_train.dtypes)

date                           object
price                         float64
stock_available               float64
store_id_str_02                  bool
store_id_str_03                  bool
product_id_pdt_002               bool
product_id_pdt_003               bool
product_id_pdt_004               bool
product_id_pdt_005               bool
product_category_Dairy           bool
product_category_Frozen          bool
product_category_Household       bool
product_category_Snack           bool
store_location_Los Angeles       bool
store_location_New York          bool
weekday_Monday                   bool
weekday_Saturday                 bool
weekday_Thursday                 bool
weekday_Tuesday                  bool
weekday_Wednesday                bool
revenue                       float64
dtype: object


In [53]:
# Convert 'date' column to datetime and extract features
X_train['date'] = pd.to_datetime(X_train['date'])
X_train['day'] = X_train['date'].dt.day
X_train['month'] = X_train['date'].dt.month
X_train['year'] = X_train['date'].dt.year

# Drop original 'date' column
X_train = X_train.drop(columns=['date'])


X_test['date'] = pd.to_datetime(X_test['date'])
X_test['day'] = X_test['date'].dt.day
X_test['month'] = X_test['date'].dt.month
X_test['year'] = X_test['date'].dt.year

X_test = X_test.drop(columns=['date'])

In [54]:
X_train.head()

Unnamed: 0,price,stock_available,store_id_str_02,store_id_str_03,product_id_pdt_002,product_id_pdt_003,product_id_pdt_004,product_id_pdt_005,product_category_Dairy,product_category_Frozen,...,store_location_New York,weekday_Monday,weekday_Saturday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,revenue,day,month,year
105,45.34,44.0,False,True,False,False,True,False,False,False,...,False,False,False,False,False,True,4352.64,15,2,2023
17,30.77,29.0,True,False,True,False,False,False,False,False,...,False,True,False,False,False,False,430.78,9,1,2023
64,25.73,46.0,True,False,True,False,False,False,False,False,...,False,False,True,False,False,False,1054.93,28,1,2023
349,15.55,26.0,True,False,False,False,True,False,False,False,...,False,False,False,False,False,False,808.6,2,6,2023
116,24.56,80.0,False,True,False,False,False,False,False,False,...,False,False,False,False,True,False,761.36,21,2,2023


In [55]:
xgb_model.fit(X_train, y_train)

In [56]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse},  R2 score: {r2}")

MSE: 23.204287363489087,  R2 score: 0.9660345315933228


In [57]:
import mlflow
import mlflow.xgboost
import pandas as pd
from mlflow.models.signature import infer_signature

xgb_model.fit(X_train, y_train)

# Create an input example (first row of the training data)
input_example = pd.DataFrame(X_train.iloc[0:1])

# Infer the model signature
signature = infer_signature(input_example, xgb_model.predict(input_example))

# Set the experiment
mlflow.set_experiment("Retail-Sales-Forecasting")

with mlflow.start_run():
    # Log the model with signature and input example
    mlflow.xgboost.log_model(xgb_model, name="model", signature=signature, input_example=input_example)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2_score", r2)


  self.get_booster().save_model(fname)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

**Error says the type of integers, bools are not accept mlflow, instead convert them into float64**

In [58]:
X_train.dtypes

price                         float64
stock_available               float64
store_id_str_02                  bool
store_id_str_03                  bool
product_id_pdt_002               bool
product_id_pdt_003               bool
product_id_pdt_004               bool
product_id_pdt_005               bool
product_category_Dairy           bool
product_category_Frozen          bool
product_category_Household       bool
product_category_Snack           bool
store_location_Los Angeles       bool
store_location_New York          bool
weekday_Monday                   bool
weekday_Saturday                 bool
weekday_Thursday                 bool
weekday_Tuesday                  bool
weekday_Wednesday                bool
revenue                       float64
day                             int32
month                           int32
year                            int32
dtype: object

In [59]:
X_train_safe = X_train.copy()
X_train_safe = X_train_safe.astype({col: 'float64' for col in X_train_safe.select_dtypes(include=['int','bool']).columns})

In [60]:
X_train_safe.dtypes

price                         float64
stock_available               float64
store_id_str_02               float64
store_id_str_03               float64
product_id_pdt_002            float64
product_id_pdt_003            float64
product_id_pdt_004            float64
product_id_pdt_005            float64
product_category_Dairy        float64
product_category_Frozen       float64
product_category_Household    float64
product_category_Snack        float64
store_location_Los Angeles    float64
store_location_New York       float64
weekday_Monday                float64
weekday_Saturday              float64
weekday_Thursday              float64
weekday_Tuesday               float64
weekday_Wednesday             float64
revenue                       float64
day                           float64
month                         float64
year                          float64
dtype: object

In [61]:
xgb_model.fit(X_train_safe, y_train)

input_example = X_train_safe.iloc[0:1]
signature = infer_signature(input_example, xgb_model.predict(input_example))


mlflow.set_experiment("Retail-Sales-Forecasting")

with mlflow.start_run():
    mlflow.xgboost.log_model(xgb_model, name="model", signature=signature, input_example=input_example)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2 score", r2)

  self.get_booster().save_model(fname)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

**XGBoost has changed its default save format to UBJSON (a compact JSON format), it is not a error just warnning**

In [62]:
xgb_model.get_booster().save_model("model_01.json")