# **DESCRIPTION**

This project consist in a forecasting of cost for different industrial supplies from a company

# **Project Develop**

## *Install Libraries*

In [4]:
# !pip install pandas
# !pip install numpy
# !pip install openpyxl

## *Import Libraries*

In [5]:
import pandas as pd
import numpy as np
import openpyxl
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit

## *Import data*

Import data from an Excel file provided by the company

In [6]:
df_purchases = pd.read_excel("raw_data.xlsx", index_col=0)

In [7]:
df_purchases.head(10)

Unnamed: 0_level_0,order_date,delivery_date,supplier_name,supply_reference,unit_value,quantity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2019-01-10,2019-02-28,JOHN MONCRIEFF LIMITED,VID 9A/T,21.65,50.0
2,2019-01-10,2019-02-28,JOHN MONCRIEFF LIMITED,TRANSPORTE,220.0,1.0
3,2019-01-11,2019-01-18,"TC Medida y Control de Temperatura, S.A.","TP HILO TIPO A82-KX 2x0,5MM PVC",1.26,150.0
4,2019-01-11,2019-01-18,"TC Medida y Control de Temperatura, S.A.",TRANSPORTE,15.0,1.0
5,2019-01-11,2019-01-16,IBERLABO S.A.,"RA 1/4""XTU6MM C",4.6,6.0
6,2019-01-14,2019-01-25,JACQUET IBERICA S.A.,"PL 8""300X6MM C",18.5,26.0
7,2019-01-14,2019-01-25,JACQUET IBERICA S.A.,"PL 14""300X6MM CL",46.0,2.0
8,2019-01-14,2019-01-25,JACQUET IBERICA S.A.,"PL 16""300X10MM CP",93.0,4.0
9,2019-01-14,2019-01-25,JACQUET IBERICA S.A.,TRANSPORTE,22.0,1.0
10,2019-01-14,2019-01-21,"LAMONS GASKET&BOLT IBERICA,S.L.","T 7/8""X150 B16/GR7",3.28,12.0


## *Data Resume*

In [8]:
df_purchases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10731 entries, 1 to 10890
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   order_date        10731 non-null  datetime64[ns]
 1   delivery_date     10720 non-null  datetime64[ns]
 2   supplier_name     10731 non-null  object        
 3   supply_reference  10731 non-null  object        
 4   unit_value        10731 non-null  float64       
 5   quantity          10731 non-null  float64       
dtypes: datetime64[ns](2), float64(2), object(2)
memory usage: 586.9+ KB


In [9]:
df_purchases.describe()

Unnamed: 0,order_date,delivery_date,unit_value,quantity
count,10731,10720,10731.0,10731.0
mean,2019-03-29 07:00:33.212188928,2019-05-05 06:02:08.955224064,219.373072,69.39104
min,2013-05-16 00:00:00,2013-09-05 00:00:00,0.0,0.0
25%,2016-09-30 00:00:00,2016-10-17 00:00:00,4.09,2.0
50%,2019-01-14 00:00:00,2019-02-08 00:00:00,18.0,7.0
75%,2021-12-09 00:00:00,2022-01-07 00:00:00,82.59,30.0
max,2025-02-06 00:00:00,2025-03-28 00:00:00,124970.27,13040.0
std,,,1510.502165,331.594064


Show number of rows for each product

In [10]:
product_counts = df_purchases['supply_reference'].value_counts()
product_counts.head(50)

Unnamed: 0_level_0,count
supply_reference,Unnamed: 1_level_1
TRANSPORTE,420
CORTE,258
TERMO BIME,155
PORTES,125
CERTIFICADO 3.1,80
PACKING,50
EXTRA COST,46
MECANIZADO VARIOS,45
CALIBRACION BIMETA,44
DOCUMENTATION,38


## *Data Engineering*

Columns that do not provide data of interest are eliminated.

In [11]:
# df_purchases = df_purchases.drop(columns=["supplier_order_id","position_supply","supply_id","discount","pending",
#                     "deliv_date_1","deliv_quant_1","deliv_note_1",
#                     "deliv_date_2","deliv_quant_2","deliv_note_2",
#                     "deliv_date_3","deliv_quant_3","deliv_note_3"])

Fill data for items not delivered with the last day of working before christmas holidays

In [12]:
df_purchases['delivery_date'] = df_purchases['delivery_date'].fillna(pd.Timestamp('2024-12-20'))

Change the order of columns in dataframe

In [13]:
new_column_order = ["order_date", "delivery_date", "supplier_name", "supply_reference","unit_value","quantity"]
df_purchases = df_purchases[new_column_order]


Calculation of the relative change in the unit price of a product compared to previous purchases

In [14]:
df_purchases = df_purchases.sort_values(by=['supply_reference', 'order_date'])

# Calculate the previous unit price for each product
df_purchases['previous_unit_value'] = df_purchases.groupby('supply_reference')['unit_value'].shift(1)

# Calculate the rate of change in the unit price
df_purchases['price_change_rate'] = ((df_purchases['unit_value'] - df_purchases['previous_unit_value']) / df_purchases['previous_unit_value']) * 100

# Fill the NaN values (which appear for the first purchase of each product) with 0 or an appropriate value
df_purchases['price_change_rate'] = df_purchases['price_change_rate'].fillna(0)

Verify if infinite or NaN values in new colum

In [15]:
num_infinite_values = np.isinf(df_purchases['price_change_rate']).sum()
num_nan_values = df_purchases['price_change_rate'].isnull().sum()

print(f"Infinites values: {num_infinite_values}; NaN values: {num_nan_values}")

Infinites values: 48; NaN values: 0


Replacing infinite values

In [16]:
df_purchases['price_change_rate'].replace([np.inf, -np.inf], np.nan, inplace=True)
mean_value = df_purchases['price_change_rate'].mean()
df_purchases['price_change_rate'].fillna(mean_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_purchases['price_change_rate'].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_purchases['price_change_rate'].fillna(mean_value, inplace=True)


In [17]:
num_infinite_values = np.isinf(df_purchases['price_change_rate']).sum()
num_nan_values = df_purchases['price_change_rate'].isnull().sum()

print(f"Infinites values: {num_infinite_values}; NaN values: {num_nan_values}")

Infinites values: 0; NaN values: 0


Coding cathegorical variables. Using Target Encoding to establish to each category the mean of target variable

In [18]:
supplier_avg_cost = df_purchases.groupby("supplier_name")["unit_value"].mean()
supply_ref_avg_cost = df_purchases.groupby("supply_reference")["unit_value"].mean()

df_purchases["supplier_encoded"] = df_purchases["supplier_name"].map(supplier_avg_cost)
df_purchases["supply_ref_encoded"] = df_purchases["supply_reference"].map(supply_ref_avg_cost)

Creation of new categories for time series

In [19]:
df_purchases["lead_time"] = (df_purchases["delivery_date"] - df_purchases["order_date"]).dt.days  # Delivery time in days
df_purchases["month"] = df_purchases["order_date"].dt.month  # Month of order
df_purchases["year"] = df_purchases["order_date"].dt.year # Year of order

Standarization of numeric columns

In [None]:
# Unit_value column is not included because it is the target variable
# Negative values can be obtained of this transformation
scaler = StandardScaler()

df_purchases['quantity'] = scaler.fit_transform(df_purchases[['quantity']])
df_purchases['lead_time'] = scaler.fit_transform(df_purchases[['lead_time']])

df_purchases.head(5)

Unnamed: 0_level_0,order_date,delivery_date,supplier_name,supply_reference,unit_value,quantity,previous_unit_value,price_change_rate,supplier_encoded,supply_ref_encoded,lead_time,month,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
9362,2018-01-25,2018-02-28,CODESOL,"1/2 CLAMP ORBIWELD 76S Ã¸25,40MM",-0.053152,-0.203243,,0.0,2108.802857,139.09,-0.059249,1,2018
4771,2024-02-29,2024-06-18,"Officine Orobiche, S.r.l.",2006.PF.PF.V.XX,2.714871,-0.206259,,0.0,4422.042429,4320.0,0.829483,2,2024
10564,2024-11-11,2025-01-10,"Officine Orobiche, S.r.l.",2016.825.TI.S.XX,4.827995,-0.206259,,0.0,4422.042429,7496.791667,0.244791,11,2024
10565,2024-11-11,2025-01-10,"Officine Orobiche, S.r.l.",2016.825.TI.S.XX,4.415073,-0.206259,7511.73,-8.302881,4422.042429,7496.791667,0.244791,11,2024
10566,2024-11-11,2025-01-10,"Officine Orobiche, S.r.l.",2016.825.TI.S.XX,5.173664,-0.206259,6888.04,16.63463,4422.042429,7496.791667,0.244791,11,2024


Get X and Y variables droping those columns without interesting data

In [None]:
X = df_purchases[['quantity', 'price_change_rate', 'supplier_encoded', 'supply_ref_encoded', 'lead_time', 'month', 'year']]
y = df_purchases['unit_value']

TimeSeriesSplit configuration for time series

In [None]:
n_splits = 5  # Number of divisions (folds)
tscv = TimeSeriesSplit(n_splits=n_splits)

Divide data in train and test. Train and prediction each model looking for the best

In [None]:
# Models to evaluate
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "LightGBM": lgb.LGBMRegressor(random_state=42),
}

# Initialise a dictionary to store the average metrics for each model.
metrics = {name: {"MAE": [], "RMSE": [], "R2": []} for name in models.keys()}

# Cross-Validation
for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    print(f"Fold {fold + 1}/{n_splits}")

    # Divide data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train and evaluate each model
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate metrics
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        # Store metrics
        metrics[name]["MAE"].append(mae)
        metrics[name]["RMSE"].append(rmse)
        metrics[name]["R2"].append(r2)

Fold 1/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 918
[LightGBM] [Info] Number of data points in the train set: 1791, number of used features: 7
[LightGBM] [Info] Start training from score 0.192292
Fold 2/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 985
[LightGBM] [Info] Number of data points in the train set: 3579, number of used features: 7
[LightGBM] [Info] Start training from score 0.122380
Fold 3/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000392 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1013
[LightGBM] [Info] Number of data po

In [None]:
# Average metrics by model
print("\nResultados Promedio:")
for name, model_metrics in metrics.items():
    avg_mae = np.mean(model_metrics["MAE"])
    avg_rmse = np.mean(model_metrics["RMSE"])
    avg_r2 = np.mean(model_metrics["R2"])
    print(f"{name}: MAE = {avg_mae:.2f}, RMSE = {avg_rmse:.2f}, R² = {avg_r2:.2f}")


Resultados Promedio:
Linear Regression: MAE = 0.02, RMSE = 0.13, R² = 0.89
Decision Tree: MAE = 0.03, RMSE = 0.20, R² = 0.79
Random Forest: MAE = 0.02, RMSE = 0.16, R² = 0.87
Gradient Boosting: MAE = 0.03, RMSE = 0.17, R² = 0.86
LightGBM: MAE = 0.04, RMSE = 0.26, R² = 0.70


Select Random Forest as best model and use GridSearch to hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

param_grids = {
    "Random Forest": {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 3],
    },
    "Decision Tree": {
        'max_depth': [None, 5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2'],
    },
    "Gradient Boosting": {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.8, 1.0],
    },
}

# Initialise a dictionary to store the average metrics for each model.
best_models = {}

for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    print(f"Fold {fold + 1}/{n_splits}")

    # Divide data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train and evaluate each model
    for name, model in models.items():
        if name in param_grids:
          print(name)  # If model has defined hyperparameters
          grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name],
                                      cv=3, n_jobs=-1, verbose=1, scoring='neg_mean_absolute_error')
          grid_search.fit(X_train, y_train)
          best_model = grid_search.best_estimator_
          best_models[name] = best_model
        # else:
        #   model.fit(X_train, y_train)
        #   best_models[name] = model  # Save original model

          y_pred = best_models[name].predict(X_test)

          # Calculate metrics
          mae = mean_absolute_error(y_test, y_pred)
          rmse = np.sqrt(mean_squared_error(y_test, y_pred))
          r2 = r2_score(y_test, y_pred)

          # Store metrics
          metrics[name]["MAE"].append(mae)
          metrics[name]["RMSE"].append(rmse)
          metrics[name]["R2"].append(r2)

# Calculate mean metrics
average_metrics = {name: {metric: np.mean(values) for metric, values in metrics[name].items()} for name in metrics}

for name in metrics:
  for metric, values in metrics[name].items():
    print(f"{name}: {metric}: {np.mean(values)}")

Fold 1/5
Decision Tree
Fitting 3 folds for each of 108 candidates, totalling 324 fits


108 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
53 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skl

Random Forest
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Gradient Boosting
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Fold 2/5
Decision Tree
Fitting 3 folds for each of 108 candidates, totalling 324 fits


108 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
23 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skl

Random Forest
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Gradient Boosting
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Fold 3/5
Decision Tree
Fitting 3 folds for each of 108 candidates, totalling 324 fits


108 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
43 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skl

Random Forest
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Gradient Boosting
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Fold 4/5
Decision Tree
Fitting 3 folds for each of 108 candidates, totalling 324 fits


108 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
53 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skl

Random Forest
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Gradient Boosting
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Fold 5/5
Decision Tree
Fitting 3 folds for each of 108 candidates, totalling 324 fits


108 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
43 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skl

Random Forest
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Gradient Boosting
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Linear Regression: MAE: 0.024652008327466994
Linear Regression: RMSE: 0.1270931578092913
Linear Regression: R2: 0.8924788123907762
Decision Tree: MAE: 0.053725502454717224
Decision Tree: RMSE: 0.3266369708305198
Decision Tree: R2: 0.4050654747469354
Random Forest: MAE: 0.02659232482630347
Random Forest: RMSE: 0.20022796443214835
Random Forest: R2: 0.7791530698748032
Gradient Boosting: MAE: 0.02970503068843492
Gradient Boosting: RMSE: 0.1655539723657093
Gradient Boosting: R2: 0.8544411169348158
LightGBM: MAE: 0.03848239016186922
LightGBM: RMSE: 0.25884318004179657
LightGBM: R2: 0.6957637909020054


**Finally the best models are:**

*Linear Regression*
*   MAE: 0.02
*   RMSE: 0.13
*   R2: 0.89

*Random Forest*
*   MAE: 0.02
*   RMSE: 0.16
*   R2: 0.87

*Gradient Boosting:*
*   MAE: 0.03
*   RMSE: 0.17
*   R2: 0.86


**We will select Random Forest due to its versatility**