In [1]:
# Let's convert 'ORDER_MONTH' to datetime and proceed with forecasting for each 'PRODUCT_ID'
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()
 
table_name = '"AGG_QTY_MASTER_DATA"'
 
sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()



df['ORDER_MONTH'] = pd.to_datetime(df['ORDER_MONTH'])

#print(f"{df['ORDER_MONTH']}")

# Sort the DataFrame by 'ORDER_MONTH' to maintain the time series order
df = df.sort_values(by=['PRODUCT_ID', 'ORDER_MONTH'])

# Function to create multiple lag features for ORDER_QTY
def create_lags(df, target_column, num_lags):
    for lag in range(1, num_lags + 1):
        df[f'Lag_{lag}'] = df[target_column].shift(lag)
    return df

# Define a function that applies the prediction process for each product, including RMSE and R² calculation
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


def train_single_model(df, num_lags=6):
    # Create lag features for ORDER_QTY
    df = create_lags(df, 'ORDER_QTY', num_lags)

    # Drop rows with NaN values (due to shifting from lag features)
    df = df.dropna()

    # Convert 'ORDER_MONTH' to an ordinal number to use as a feature
    df['ORDER_MONTH_ORDINAL'] = df['ORDER_MONTH'].apply(lambda x: x.toordinal())

    # One-hot encode 'PART_CODE' and 'CATEGORY'
   # part_code_encoded = pd.get_dummies(df['PART_CODE'], prefix='PART_CODE')
    #category_encoded = pd.get_dummies(df['CATEGORY'], prefix='CATEGORY')

    # Add the encoded columns to the df
   # df = pd.concat([df, part_code_encoded, category_encoded], axis=1)

    # Define the features (Lag_1, Lag_2, ..., Lag_n, UNIT_PRICE, LEAD_TIME_IN_WEEKS, ORDER_MONTH_ORDINAL, encoded columns)
    lag_columns = [f'Lag_{lag}' for lag in range(1, num_lags + 1)]
    #part_code_columns = list(part_code_encoded.columns)
    #category_columns = list(category_encoded.columns)
    feature_columns = lag_columns + ['UNIT_PRICE', 'LEAD_TIME_IN_WEEKS', 'ORDER_MONTH_ORDINAL']

    # Define X (features) and y (target)
    X = df[feature_columns]
    y = df['ORDER_QTY']

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Initialize the Linear Regression model
    model = LinearRegression()

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Calculate RMSE
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    # Calculate R²
    r2 = r2_score(y_test, y_pred)

    return model, rmse, r2, X_test, y_test, df, feature_columns

def forecast_for_each_product(model, product_df, df, feature_columns, num_lags=6, forecast_periods=25):
    # Use the same lag columns and part code/category one-hot encoding
    lag_columns = [f'Lag_{lag}' for lag in range(1, num_lags + 1)]
#    part_code_columns = [col for col in df.columns if col.startswith('PART_CODE')]
#    category_columns = [col for col in df.columns if col.startswith('CATEGORY')]

    # Create forecast for a specific product
    # Create lag features for ORDER_QTY
    product_df = create_lags(product_df, 'ORDER_QTY', num_lags)
    
    # Drop rows with NaN values (due to shifting from lag features)
    product_df = product_df.dropna()

    # Convert 'ORDER_MONTH' to an ordinal number to use as a feature
    product_df['ORDER_MONTH_ORDINAL'] = product_df['ORDER_MONTH'].apply(lambda x: x.toordinal())

    # Add the PART_CODE and CATEGORY encodings directly (from df, since they already exist)
    #product_df = pd.concat([product_df, df[part_code_columns + category_columns]], axis=1)

    # Define X (features) for this product
    X_product = product_df[feature_columns]

    # Forecast for the next 25 months
    future_months = pd.date_range(product_df['ORDER_MONTH'].max(), periods=forecast_periods, freq='MS')

    # Initialize the last known lag values, UNIT_PRICE, LEAD_TIME_IN_WEEKS, and ORDER_MONTH_ORDINAL
    last_lags = list(product_df[lag_columns].iloc[-1])
    last_unit_price = product_df['UNIT_PRICE'].iloc[-1]
    last_lead_time = product_df['LEAD_TIME_IN_WEEKS'].iloc[-1]
    last_order_month_ordinal = product_df['ORDER_MONTH_ORDINAL'].iloc[-1]
    #last_part_code_encoded = product_df[part_code_columns].iloc[-1].values  # Part code one-hot encoding
    #last_category_encoded = product_df[category_columns].iloc[-1].values  # Category one-hot encoding

    # Create an empty list to store predictions
    future_preds = []

    for i in range(forecast_periods):
        # Increment the ORDER_MONTH_ORDINAL for the next future month
        last_order_month_ordinal += 30  # Assuming average month is ~30 days

        # Prepare input features for the next prediction, including lag values, UNIT_PRICE, LEAD_TIME_IN_WEEKS, ORDER_MONTH_ORDINAL, PART_CODE_encoded, CATEGORY_encoded
        future_X = np.array([last_lags + [last_unit_price, last_lead_time, last_order_month_ordinal] ])

        # Predict the next ORDER_QTY
        future_pred = model.predict(future_X)[0]

        # Append the prediction to the list
        future_preds.append(future_pred)

        # Update lag values for the next iteration
        last_lags = [future_pred] + last_lags[:-1]  # Shift the lags with the new prediction

    # Format future_months as 'YYYY-MMM' (e.g., '2024-Aug')
    future_months_formatted = future_months.strftime('%Y-%b')

    # Create a DataFrame for this product's future ORDER_QTY predictions
    future_forecast_df = pd.DataFrame({
        'ORDER_MONTH': future_months_formatted,
        'Predicted_ORDER_QTY': future_preds,
        'PRODUCT_ID': product_df['PRODUCT_ID'].iloc[0],  # Add the PRODUCT_ID to each row
    })

    return future_forecast_df

# Example usage:
# Train the model once on the entire dataset
model, rmse, r2, X_test, y_test, df, feature_columns = train_single_model(df)

#print(f"{rmse},{r2}")

# Loop through each product and forecast its future ORDER_QTY
all_forecasts = []

for product_id, product_df in df.groupby('PRODUCT_ID'):
    forecast_df = forecast_for_each_product(model, product_df, df, feature_columns)
    all_forecasts.append(forecast_df)

# Combine all forecasts into one DataFrame
final_forecast_df = pd.concat(all_forecasts)
final_forecast_df

Unnamed: 0,ORDER_MONTH,Predicted_ORDER_QTY,PRODUCT_ID
0,2024-Aug,113131.033325,Product_001
1,2024-Sep,112961.394587,Product_001
2,2024-Oct,112791.084915,Product_001
3,2024-Nov,112620.105356,Product_001
4,2024-Dec,112448.456610,Product_001
...,...,...,...
20,2026-Apr,31598.557708,Product_100
21,2026-May,31630.769103,Product_100
22,2026-Jun,31661.979743,Product_100
23,2026-Jul,31692.192279,Product_100


In [2]:
from fosforml import register_model

In [None]:
sf_df = my_session.createDataFrame(final_forecast_df)
sf_df.write.mode("overwrite").save_as_table("ORDER_FORECAST_NEW")
my_session.table("ORDER_FORECAST_NEW").show()

In [None]:
register_model(
    model_obj=model,
    session=my_session,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test =y_test,
    y_pred=y_pred,
    source="Notebook",
    dataset_name="AGG_QTY_MASTER_DATA",
    dataset_source="Snowflake",
    name="final_forecast_df",
    description="This is model for order forecast",
    flavour="sklearn",
    model_type="Regression",
    conda_dependencies=["scikit-learn==1.3.2"]
)