In [21]:
import logging
import warnings
 
# Raise the threshold of the synapse.ml and mlflow.utils loggers to CRITICAL
# so that only critical records from them are emitted.
for quiet_logger_name in ('synapse.ml', 'mlflow.utils'):
    logging.getLogger(quiet_logger_name).setLevel(logging.CRITICAL)

# Ignore FutureWarning and UserWarning for the rest of the session.
# Filters are registered in this order (FutureWarning first, UserWarning second).
for ignored_category in (FutureWarning, UserWarning):
    warnings.simplefilter('ignore', category=ignored_category)

StatementMeta(, f47f3b4e-9ceb-4c45-a06d-0214dfd6d5e6, 23, Finished, Available, Finished)

In [22]:
import re
import pandas as pd
import numpy as np

# Read the Delta table and mark the Spark DataFrame as cached.
df = (
    spark.read
    .format("delta")
    .load("Tables/dbo/receita_alucar_otimizada")
    .cache()
)


def _sanitize(name):
    """Replace every run of characters outside [A-Za-z0-9_] with a single underscore."""
    return re.sub('[^A-Za-z0-9_]+', '_', name)


# Bring at most 100000 rows into pandas (use df.toPandas() to take all the data),
# then rename the columns so they contain no characters that are unsupported
# for model training and saving.
X = df.limit(100000).toPandas().rename(columns=_sanitize)

# The target column name goes through the same sanitisation as the column names.
target_col = _sanitize("Valor_Receita")


StatementMeta(, f47f3b4e-9ceb-4c45-a06d-0214dfd6d5e6, 24, Finished, Available, Finished)

In [23]:
# Render the pandas frame X (columns already renamed above) for visual inspection.
display(X)

StatementMeta(, f47f3b4e-9ceb-4c45-a06d-0214dfd6d5e6, 25, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 468d88ca-7fd5-4ab1-9634-4136bde8da93)

In [24]:
# --- Handle 'Estimativa' Data ---

print(f"Original number of rows: {len(X)}")

# Keep only the rows whose client name is not 'Estimativa'.
# .copy() makes the result an independent frame rather than a slice of the
# original: later cells assign columns on X, and doing that on a filtered
# slice raises pandas' SettingWithCopyWarning.
X = X.loc[X['Nome_Cliente'] != 'Estimativa'].copy()

print(f"Rows after removing 'Estimativa': {len(X)}")

StatementMeta(, f47f3b4e-9ceb-4c45-a06d-0214dfd6d5e6, 26, Finished, Available, Finished)

Original number of rows: 88
Rows after removing 'Estimativa': 81


In [25]:
import pandas as pd
from statsmodels.tsa.api import SimpleExpSmoothing
from statsmodels.tsa.api import Holt

# --- Step 1: Data Cleaning and Preparation ---

date_col = 'Data_Receita'

# Coerce the target column to a numeric type; values that cannot be parsed
# become NaN (errors='coerce').
# .assign() returns a new frame instead of writing a column into X in place,
# so no SettingWithCopyWarning is raised when X is a filtered slice.
X = X.assign(**{target_col: pd.to_numeric(X[target_col], errors='coerce')})

# Parse the date column as day/month/year and make it the index.
# Guarded so the cell can be re-run: after the first run the column has
# become the index and is no longer present in X.columns.
if date_col in X.columns:
    X = X.assign(
        **{date_col: pd.to_datetime(X[date_col], format='%d/%m/%Y')}
    ).set_index(date_col)


# --- Step 2: Aggregate Data into a Single Time Series ---

# Resample onto a daily calendar and sum the revenue of each day, turning the
# row-level data into one continuous daily series.
print("Aggregating transactional data into daily total revenue...")
ts_df = X[[target_col]].resample('D').sum()

# Defensive: make sure no day is left as NaN so the series stays continuous.
ts_df = ts_df.fillna(0)


# --- Step 3: Train Model and Forecast ---

try:
    print("Training forecasting model...")
    # -- Train the Model (Holt's linear-trend exponential smoothing) --
    # SimpleExpSmoothing is imported above and accepts the same arguments if a
    # model without a trend component is preferred.
    model = Holt(ts_df[target_col], initialization_method="estimated")

    results = model.fit()

    # -- Generate Predictions for the next 30 days --
    n_steps = 30
    predicted_values = results.forecast(steps=n_steps)
    print(f"Successfully generated forecast for the next {n_steps} days.")

    # -- Structure Forecast into a DataFrame --
    # Future dates start on the day after the last observed date.
    last_date = ts_df.index.max()
    future_dates = pd.date_range(start=last_date + pd.DateOffset(days=1), periods=n_steps, freq='D')

    forecast_df = pd.DataFrame({
        'Data_Prevista': future_dates,
        'Valor_Receita_Previsto': predicted_values.values
    })

    # --- Step 4: Display Final Forecast ---
    print("\n--- Daily Revenue Forecast ---")
    display(forecast_df)

except Exception as e:
    # Best-effort: report the failure instead of stopping the notebook.
    # Note that forecast_df is not (re)defined when this branch runs.
    print(f"\nCould not generate forecast. Error: {e}")

StatementMeta(, f47f3b4e-9ceb-4c45-a06d-0214dfd6d5e6, 27, Finished, Available, Finished)

Aggregating transactional data into daily total revenue...
Training forecasting model...


Successfully generated forecast for the next 30 days.

--- Daily Revenue Forecast ---


SynapseWidget(Synapse.DataFrame, 3029ca7d-bad5-467f-9875-cd8878f8367c)

In [26]:
# --- Verify the Training Data's Date Range ---

# Earliest and latest values found in the index of X.
min_date, max_date = X.index.min(), X.index.max()

# Report both bounds formatted as YYYY-MM-DD.
print("--- Training Data Range ---")
print(f"Start Date: {min_date.strftime('%Y-%m-%d')}")
print(f"End Date:   {max_date.strftime('%Y-%m-%d')}")

StatementMeta(, f47f3b4e-9ceb-4c45-a06d-0214dfd6d5e6, 28, Finished, Available, Finished)

--- Training Data Range ---
Start Date: 2025-01-02
End Date:   2025-05-01
