In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings


def warn(*args, **kwargs):
    """Do-nothing stand-in for ``warnings.warn``.

    Accepts any positional and keyword arguments, ignores them, and
    returns ``None``.
    """
    return None


# Rebind the module attribute so every later `warnings.warn(...)` call that is
# looked up on the `warnings` module hits the no-op above.
warnings.warn = warn

# **DATASET LOADING**

In [4]:
# Read the processed CSV and parse its `date` column into pandas datetimes
# in a single chained expression.
df = (
    pd.read_csv("../data/processed/01_Data_Cleaning.csv")
    .assign(date = lambda frame: pd.to_datetime(frame['date']))
)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,date,sell_quantity,sell_value,buy_quantity,buy_value,daily_balance,inventory,inv_correction,avg_temperature,precipitation,incc,is_holiday
0,2024-01-01,,,,,0,175,0,24.9,0.1,0.0027,1
1,2024-01-02,163.0,32.1,280.0,28.97,117,292,0,25.0,0.2,0.0027,0
2,2024-01-03,215.0,31.13,,28.97,-215,77,0,24.8,4.9,0.0027,0
3,2024-01-04,,,240.0,28.43,240,317,0,24.8,0.0,0.0027,0
4,2024-01-05,55.0,35.5,240.0,28.43,185,502,0,25.1,0.0,0.0027,0


# **FEATURE ENGINEERING**

## **DEMAND, SUPPLY AND STOCK DYNAMICS**

7-day rolling average of the daily selling and buying prices, used to smooth out short-term fluctuations and highlight trends.

In [8]:
# Output column -> source price column. Each feature is the 7-row rolling mean
# of its source; min_periods=1 lets partial windows produce a value.
rolling_price_features = {
    'rolling_mean_sell_value': 'sell_value',
    'rolling_mean_buy_value': 'buy_value',
}
for feature_name, price_col in rolling_price_features.items():
    price_window = df[price_col].rolling(window = 7, min_periods = 1)
    df[feature_name] = price_window.mean()

<br>
Lag features of daily quantity sold (1-day and 7-day) allow the model to detect temporal patterns or autocorrelation in demand. The previous day’s inventory value helps capture day-to-day stock variation, and the 7-day rolling average of inventory reflects smoothed stock availability.
<br>
The 7-day rolling standard deviation of quantity sold measures sales volatility over the past week.

In [10]:
# Quantity sold 1 and 7 rows earlier. These are row-based shifts, so they equal
# "1 day" / "7 days" only if the frame has one row per consecutive calendar day
# (assumed here — TODO confirm).
df['sell_quantity_lag_1'] = df['sell_quantity'].shift(1)
df['sell_quantity_lag_7'] = df['sell_quantity'].shift(7)

# 7-row rolling standard deviation of quantity sold.
# `sell_quantity` contains NaN rows, and rolling() counts only non-NaN
# observations. With the default min_periods (= window size) any window that
# holds a single NaN would yield NaN, leaving this feature almost entirely empty.
# Two observations are the minimum for a sample standard deviation.
df['sell_quantity_std_7d'] = df['sell_quantity'].rolling(window = 7, min_periods = 2).std()

# Inventory on the previous row, and the 7-row rolling mean of inventory.
# The inventory window is left with its default min_periods, so the first six
# rows stay NaN.
df['inventory_lag_1'] = df['inventory'].shift(1)
df['avg_inventory_rolling'] = df['inventory'].rolling(window = 7).mean()

<br>
7-day average of quantity sold — helpful to track recent sales momentum.

In [12]:
# 7-row rolling mean of quantity sold.
# `sell_quantity` contains NaN rows; rolling() ignores NaN when counting
# observations, so the default min_periods (= window size) would return NaN for
# every window that includes one. min_periods=1 averages whatever values are
# available, matching how the rolling price means are computed in this notebook.
df['avg_sell_quantity_last_7d'] = df['sell_quantity'].rolling(window = 7, min_periods = 1).mean()

<br>
Ratio between units sold and purchased — indicates potential inventory pressure or imbalance.

In [14]:
# Units sold divided by units bought. Rows where `buy_quantity` is exactly 0
# are blanked to NaN first, so the ratio is NaN there instead of +/-inf.
nonzero_buy_quantity = df['buy_quantity'].mask(df['buy_quantity'] == 0)
df['sell_to_buy_ratio'] = df['sell_quantity'].div(nonzero_buy_quantity)

<br>
Cumulative sum of product units sold, indicating total sales volume evolution over time.

In [16]:
# Running total of units sold, accumulated in row order. cumsum() skips NaN
# inputs: the total carries on afterwards, but the output is NaN on those rows.
running_units_sold = df['sell_quantity'].cumsum()
df['cumulative_sell_quantity'] = running_units_sold

<br>
Daily percentage change in selling and buying prices, indicating price volatility.

In [18]:
# Row-over-row percentage change of the selling and buying prices.
#
# Both price columns can contain NaN. Calling pct_change() with no arguments
# forward-fills those gaps through the implicit fill_method='pad' default,
# which pandas has deprecated. The forward-fill is therefore done explicitly
# and pct_change is told not to fill, which gives the same numbers without
# depending on the deprecated default.
# +/-inf (change from a zero price) is mapped to NaN.
df['sell_value_pct_change'] = (
    df['sell_value']
    .ffill()
    .pct_change(fill_method = None)
    .replace([np.inf, -np.inf], np.nan)
)
df['buy_value_pct_change'] = (
    df['buy_value']
    .ffill()
    .pct_change(fill_method = None)
    .replace([np.inf, -np.inf], np.nan)
)

<br>
Price ratio per unit (selling price / buying price), used as a proxy for gross margin and profitability: values above 1 mean the product sells for more than it costs.

In [20]:
# Selling price divided by buying price, computed column-wise instead of with a
# Python-level row-by-row apply.
# A zero selling price still yields NaN, as before. A zero buying price is now
# blanked to NaN as well, so the ratio never becomes +/-inf and never raises on
# division by zero.
nonzero_sell_value = df['sell_value'].where(df['sell_value'] != 0)
nonzero_buy_value = df['buy_value'].where(df['buy_value'] != 0)
df['price_margin'] = nonzero_sell_value / nonzero_buy_value

## **EXOGENOUS VARIABLES: WEATHER AND HOLIDAYS**

Flags for moderate (2.5–10 mm) and heavy (≥10 mm) daily rainfall, based on Brazilian climate thresholds, which may affect sales of construction materials like cement.

In [23]:
# Boolean rainfall flags derived from daily precipitation:
#   moderate -> 2.5 <= precipitation < 10
#   heavy    -> precipitation >= 10
# NaN precipitation compares False in both flags.
daily_precipitation = df['precipitation']
df['is_moderate_rain'] = daily_precipitation.ge(2.5) & daily_precipitation.lt(10)
df['is_heavy_rain'] = daily_precipitation.ge(10)

<br>
Temperature bins that categorize daily average temperature into "low" (< 9 °C), "medium" (9 °C – 26 °C), and "high" (> 26 °C) based on UTCI-based thermal stress thresholds defined by INMET.

In [25]:
# Categorise the daily average temperature as:
#   low    -> below 9
#   medium -> 9 to 26, both ends included
#   high   -> above 26
# pd.cut builds right-closed intervals by default, so a plain edge at 9 would
# put exactly 9.0 into "low". Moving that edge to the largest float below 9
# makes 9.0 fall into "medium", while 26.0 stays "medium" and anything above
# 26 is "high".
temperature_edges = [-np.inf, np.nextafter(9.0, -np.inf), 26, np.inf]
df['temperature_bin'] = pd.cut(
    df['avg_temperature'],
    bins = temperature_edges,
    labels = ['low', 'medium', 'high']
)

<br>
Date-based time features that help the model capture seasonal patterns in demand and pricing.

In [27]:
# Calendar components taken from the datetime accessor of `date`:
# month number, ISO-8601 week number and calendar quarter.
date_parts = df['date'].dt
iso_calendar = date_parts.isocalendar()

df['month'] = date_parts.month
df['week'] = iso_calendar.week
df['quarter'] = date_parts.quarter

<br>
Flags to capture potential stockpiling or slowdown before and after holidays.

In [29]:
# (output column, row offset) pairs: shifting `is_holiday` by -1 brings the
# next row's flag onto the current row, shifting by +1 brings the previous
# row's flag. The edge row that has no neighbour is filled with 0 before the
# cast to int.
holiday_neighbour_offsets = (
    ('is_day_before_holiday', -1),
    ('is_day_after_holiday', 1),
)
for flag_name, row_offset in holiday_neighbour_offsets:
    neighbour_flag = df['is_holiday'].shift(row_offset)
    df[flag_name] = neighbour_flag.fillna(0).astype(int)

## **INFLATION-ADJUSTED FEATURES**

Lagged values of monthly INCC inflation rate, allowing the model to capture inflation trends and their delayed effects on pricing.

In [32]:
# INCC rate of the previous calendar month, looked up per row.
# A fixed 30-row shift is not a calendar month: months have 28-31 days, so some
# rows would pick up the value from two months back or from their own month.
# Each row's month is instead encoded as a running integer (year * 12 + month),
# and the rate stored for the integer just before it is fetched.
# Assumes `incc` holds a single value per calendar month (TODO confirm); the
# first non-null value found in each month is used.
# Rows whose previous month is not present in the data get NaN.
lag_month_key = df['date'].dt.year * 12 + df['date'].dt.month
lag_incc_by_month = df['incc'].groupby(lag_month_key).first()
df['incc_lag_month'] = (lag_month_key - 1).map(lag_incc_by_month)

<br>
Month-over-month change in the INCC inflation rate.

In [34]:
# Month-over-month change: this row's INCC rate minus the rate of the previous
# calendar month.
# The previous-month rate is found by calendar month rather than by shifting 30
# rows. Months have 28-31 days, so a fixed row offset can compare a row with
# its own month (change of 0) or with the month before last.
# Assumes `incc` holds a single value per calendar month (TODO confirm); the
# first non-null value found in each month is used.
# Rows whose previous month is not present in the data get NaN.
change_month_key = df['date'].dt.year * 12 + df['date'].dt.month
change_incc_by_month = df['incc'].groupby(change_month_key).first()
previous_month_incc = (change_month_key - 1).map(change_incc_by_month)
df['incc_monthly_change'] = df['incc'] - previous_month_incc

<br>

The `sell_value_pct_vs_incc` feature represents the difference between the daily percentage change in selling price and the INCC inflation rate.
  - A positive value means the selling price increased more than inflation on that day, indicating a potential real price gain;
  - A negative value means the price increased less than inflation, or even decreased, indicating a real loss in value;
  - A value close to zero means the price kept pace with inflation, a neutral adjustment.

In [77]:
# Row-over-row percentage change of the selling price, with missing results
# (first row, changes from a zero price) set to 0.
# NOTE: this overwrites any existing `sell_value_pct_change` column with the
# zero-filled version.
#
# Gaps in `sell_value` are forward-filled explicitly and pct_change is told not
# to fill. This gives the same numbers as the implicit fill_method='pad'
# default, which pandas has deprecated.
df['sell_value_pct_change'] = (
    df['sell_value']
    .ffill()
    .pct_change(fill_method = None)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Price change minus the INCC rate on the same row.
# NOTE(review): the price change is per row (daily) while `incc` is described
# as a monthly rate — confirm that comparing the two directly is intended.
df['sell_value_pct_vs_incc'] = df['sell_value_pct_change'] - df['incc']

# **EXPLORATORY DATA ANALYSIS**

In [79]:
# Side-by-side look at the selling price, its percentage change, the
# inflation-relative change and the INCC rate for the first 40 rows.
inflation_preview_columns = [
    'date',
    'sell_value',
    'sell_value_pct_change',
    'sell_value_pct_vs_incc',
    'incc',
]
df.loc[:, inflation_preview_columns].head(40)

Unnamed: 0,date,sell_value,sell_value_pct_change,sell_value_pct_vs_incc,incc
0,2024-01-01,,0.0,-0.0027,0.0027
1,2024-01-02,32.1,0.0,-0.0027,0.0027
2,2024-01-03,31.13,-0.030218,-0.032918,0.0027
3,2024-01-04,,0.0,-0.0027,0.0027
4,2024-01-05,35.5,0.140379,0.137679,0.0027
5,2024-01-06,33.5,-0.056338,-0.059038,0.0027
6,2024-01-07,,0.0,-0.0027,0.0027
7,2024-01-08,34.53,0.030746,0.028046,0.0027
8,2024-01-09,35.29,0.02201,0.01931,0.0027
9,2024-01-10,34.92,-0.010485,-0.013185,0.0027


In [38]:
# Display the feature-engineered frame as the cell output.
# NOTE(review): the saved output of this cell lists an `incc_cumulative` column
# that none of the cells visible in this notebook creates, and this cell's
# execution count is lower than those of the cells before it. Confirm the
# notebook still reproduces this output after Restart Kernel -> Run All.
df

Unnamed: 0,date,sell_quantity,sell_value,buy_quantity,buy_value,daily_balance,inventory,inv_correction,avg_temperature,precipitation,...,is_heavy_rain,temperature_bin,month,week,quarter,is_day_before_holiday,is_day_after_holiday,incc_lag_month,incc_monthly_change,incc_cumulative
0,2024-01-01,,,,,0,175,0,24.9,0.1,...,False,medium,1,1,1,0,0,,,1.000090
1,2024-01-02,163.0,32.10,280.0,28.97,117,292,0,25.0,0.2,...,False,medium,1,1,1,0,1,,,1.000180
2,2024-01-03,215.0,31.13,,28.97,-215,77,0,24.8,4.9,...,False,medium,1,1,1,0,0,,,1.000270
3,2024-01-04,,,240.0,28.43,240,317,0,24.8,0.0,...,False,medium,1,1,1,0,0,,,1.000360
4,2024-01-05,55.0,35.50,240.0,28.43,185,502,0,25.1,0.0,...,False,medium,1,1,1,0,0,,,1.000450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,2025-04-26,,,,30.22,0,475,0,21.6,1.6,...,False,medium,4,17,2,0,0,0.0039,0.0013,1.089405
482,2025-04-27,,,,30.22,0,475,0,22.3,0.1,...,False,medium,4,17,2,0,0,0.0039,0.0013,1.089593
483,2025-04-28,39.0,35.17,,30.22,-39,436,0,21.7,7.6,...,False,medium,4,18,2,0,0,0.0039,0.0013,1.089782
484,2025-04-29,107.0,34.71,,30.22,-107,329,0,21.0,0.3,...,False,medium,4,18,2,0,0,0.0039,0.0013,1.089971
