In [None]:
%pip install scikit-learn==1.5.1


### Default notebook optimization

This cell configures the logging and warning settings to reduce unnecessary output and focus on critical information. It suppresses specific warnings and logs from the underlying libraries, ensuring a cleaner and more readable notebook experience.

In [6]:
import logging
import warnings
 
logging.getLogger('synapse.ml').setLevel(logging.CRITICAL)
logging.getLogger('mlflow.utils').setLevel(logging.CRITICAL)
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter('ignore', category=UserWarning)

StatementMeta(, cbe7beff-7a69-4de8-8d15-12be7562e773, 13, Finished, Available, Finished)

## Step 1: Load the Data

This cell is responsible for importing the raw data from the specified source into the notebook environment. The data could come from various sources, such as a file or table in your lakehouse.

Once loaded, this data will serve as the input for subsequent steps, such as data transformation, model training, and evaluation.

In [7]:
import re
import pandas as pd
import numpy as np

df = spark.read.format("delta").load(
    "Tables/dbo/despesas_consigcar_normalizada"
).cache()
# Transform to pandas according to the selected models
X = df.limit(100000).toPandas() # Use df.toPandas() to use all the data
X = X.rename(columns = lambda c:re.sub('[^A-Za-z0-9_]+', '_', c))  # Replace not supported characters in column name with underscore to avoid invalid character for model training and saving

target_col = re.sub('[^A-Za-z0-9_]+', '_', "Valor_Despesa")


StatementMeta(, cbe7beff-7a69-4de8-8d15-12be7562e773, 14, Finished, Available, Finished)

In [8]:
display(X)

StatementMeta(, cbe7beff-7a69-4de8-8d15-12be7562e773, 15, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 05419ad3-4bd6-446a-9be1-ace5c0161f41)

#### SImple EXP Smoothing

In [10]:
import pandas as pd
from statsmodels.tsa.api import SimpleExpSmoothing

# --- Step 1: Data Cleaning and Preparation ---

# Clean the target column 'Valor_Despesa' to ensure it's a numeric type
# This step is crucial and was part of the original autoML code
X[target_col] = X[target_col].astype(str) \
                           .str.replace('R$', '', regex=False) \
                           .str.strip() \
                           .str.replace('.', '', regex=False) \
                           .str.replace(',', '.', regex=False)
X[target_col] = pd.to_numeric(X[target_col], errors='coerce')


# Ensure the date column is in datetime format
X['Data_Referencia'] = pd.to_datetime(X['Data_Referencia'])

# Get a list of all unique expense categories
all_categories = X['Categoria_Despesa'].unique()
print(f"Found {len(all_categories)} unique categories to forecast.")

# This list will store each category's forecast DataFrame
all_forecasts = []


# --- Step 2: Loop Through Each Category and Forecast ---

for category in all_categories:
    print(f"--- Forecasting for: {category} ---")
    
    # Filter the data for the current category
    ts_data = X[X['Categoria_Despesa'] == category].copy()

    # Check if there is enough data to forecast
    if ts_data.shape[0] < 2:
        print(f"Skipping '{category}' due to insufficient data.")
        continue

    # Set the date as the index and select the target column
    ts_data = ts_data.set_index('Data_Referencia').sort_index()
    ts_df = ts_data[[target_col]]

    try:
        # -- Train the Model (using the stable Simple Exponential Smoothing) --
        model = SimpleExpSmoothing(ts_df[target_col], initialization_method="estimated")
        results = model.fit()

        # -- Generate Predictions for the next 12 months --
        n_steps = 12
        predicted_values = results.forecast(steps=n_steps)

        # -- Structure Forecast into a DataFrame --
        last_date = ts_df.index.max()
        future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=n_steps, freq='MS')

        forecast_df = pd.DataFrame({
            'Data_Referencia': future_dates,
            'Valor_Despesa_Previsto': predicted_values.values
        })
        
        # Add the other columns for context
        forecast_df['Empresa'] = ts_data['Empresa'].iloc[0]
        forecast_df['Categoria_Despesa'] = category
        forecast_df['Ano'] = forecast_df['Data_Referencia'].dt.year
        forecast_df['Mes_Numero'] = forecast_df['Data_Referencia'].dt.month
        
        # Add the completed forecast to our list
        all_forecasts.append(forecast_df)
        print(f"Successfully forecasted '{category}'.")

    except Exception as e:
        print(f"Could not forecast '{category}'. Error: {e}")


# --- Step 3: Combine All Forecasts into a Single DataFrame ---

if all_forecasts:
    final_forecast_df = pd.concat(all_forecasts, ignore_index=True)

    print("\n--- All forecasts have been combined successfully! ---")
    display(final_forecast_df)
else:
    print("\nNo forecasts were generated.")

StatementMeta(, cbe7beff-7a69-4de8-8d15-12be7562e773, 17, Finished, Available, Finished)

  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


Successfully forecasted 'Rodrigo Oliveira Costa Silva'.

--- All forecasts have been combined successfully! ---


SynapseWidget(Synapse.DataFrame, 4de6bfe8-9b8a-410a-b720-86b79be81060)

In [11]:
import pandas as pd
from statsmodels.tsa.api import SimpleExpSmoothing
from statsmodels.tsa.api import Holt

# --- Step 1: Data Cleaning and Preparation ---

# Clean the target column 'Valor_Despesa' to ensure it's a numeric type
# This step is crucial and was part of the original autoML code
X[target_col] = X[target_col].astype(str) \
                           .str.replace('R$', '', regex=False) \
                           .str.strip() \
                           .str.replace('.', '', regex=False) \
                           .str.replace(',', '.', regex=False)
X[target_col] = pd.to_numeric(X[target_col], errors='coerce')


# Ensure the date column is in datetime format
X['Data_Referencia'] = pd.to_datetime(X['Data_Referencia'])

# Get a list of all unique expense categories
all_categories = X['Categoria_Despesa'].unique()
print(f"Found {len(all_categories)} unique categories to forecast.")

# This list will store each category's forecast DataFrame
all_forecasts = []


# --- Step 2: Loop Through Each Category and Forecast ---

for category in all_categories:
    print(f"--- Forecasting for: {category} ---")
    
    # Filter the data for the current category
    ts_data = X[X['Categoria_Despesa'] == category].copy()

    # Check if there is enough data to forecast
    if ts_data.shape[0] < 2:
        print(f"Skipping '{category}' due to insufficient data.")
        continue

    # Set the date as the index and select the target column
    ts_data = ts_data.set_index('Data_Referencia').sort_index()
    ts_df = ts_data[[target_col]]

    try:
        # -- Train the Model (using the stable Simple Exponential Smoothing) --
        model = Holt(ts_df['Valor_Despesa'], initialization_method="estimated")

        results = model.fit()

        # -- Generate Predictions for the next 12 months --
        n_steps = 12
        predicted_values = results.forecast(steps=n_steps)

        # -- Structure Forecast into a DataFrame --
        last_date = ts_df.index.max()
        future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=n_steps, freq='MS')

        forecast_df = pd.DataFrame({
            'Data_Referencia': future_dates,
            'Valor_Despesa_Previsto': predicted_values.values
        })
        
        # Add the other columns for context
        forecast_df['Empresa'] = ts_data['Empresa'].iloc[0]
        forecast_df['Categoria_Despesa'] = category
        forecast_df['Ano'] = forecast_df['Data_Referencia'].dt.year
        forecast_df['Mes_Numero'] = forecast_df['Data_Referencia'].dt.month
        
        # Add the completed forecast to our list
        all_forecasts.append(forecast_df)
        print(f"Successfully forecasted '{category}'.")

    except Exception as e:
        print(f"Could not forecast '{category}'. Error: {e}")


# --- Step 3: Combine All Forecasts into a Single DataFrame ---

if all_forecasts:
    final_forecast_df = pd.concat(all_forecasts, ignore_index=True)

    print("\n--- All forecasts have been combined successfully! ---")
    display(final_forecast_df)
else:
    print("\nNo forecasts were generated.")

StatementMeta(, cbe7beff-7a69-4de8-8d15-12be7562e773, 18, Finished, Available, Finished)

  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


  self._init_dates(dates, freq)


Successfully forecasted 'Rodrigo Oliveira Costa Silva'.

--- All forecasts have been combined successfully! ---


SynapseWidget(Synapse.DataFrame, 400125e3-f5a6-4f67-8234-ef6f9e68a50f)