In [1]:
pip install fosforml

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install statsmodels

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Assuming 'df' is the DataFrame that contains your data
# df = pd.read_csv('your_data.csv')

from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()
 
table_name = '"AGG_QTY_MASTER_DATA"'
 
sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [5]:
df.columns

Index(['PRODUCT_CODE', 'PRODUCT_DESCRIPTION', 'CATEGORY_CODE', 'CATEGORY',
       'MANUFACTURING_LT', 'MFG_COST', 'LABOR_COST', 'PROFIT_MARGIN',
       'CUSTOMER_CODE', 'PRODUCT_ID', 'Production_Cost', 'Profit', 'Sales',
       'SUM_Part_Cost', 'CUSTOMER_NAME', 'CITY', 'STATE_PROVINCE',
       'COUNTRY_CODE', 'TRANSIT_LT', 'PART_CODE', 'QPA', 'UOM', 'UNIT_PRICE',
       'LEAD_TIME_IN_WEEKS', 'MOQ', 'PROD_LIFE_CYCLE', 'Agg_PRODUCT_ID',
       'ORDER_MONTH', 'ORDER_QTY'],
      dtype='object')

In [6]:
# Convert 'ORDER_MONTH' to datetime
df['ORDER_MONTH'] = pd.to_datetime(df['ORDER_MONTH'])

In [7]:
# Sort data by 'ORDER_MONTH' to maintain time series order
df = df.sort_values(by='ORDER_MONTH')

In [8]:
# Function to create multiple lag features for ORDER_QTY
def create_lags(df, target_column, num_lags):
    for lag in range(1, num_lags + 1):
        df[f'Lag_{lag}'] = df[target_column].shift(lag)
    return df

In [14]:
# Define a function that applies the prediction process for each product, along with RMSE and R² calculation
def forecast_for_product(product_df, num_lags=6, forecast_periods=25):
    # Create lag features for ORDER_QTY
    product_df = create_lags(product_df, 'ORDER_QTY', num_lags)
    
    # Drop rows with NaN values (due to shifting from lag features)
    product_df = product_df.dropna()
    
    # Define the features (Lag_1, Lag_2, ..., Lag_n, UNIT_PRICE, LEAD_TIME_IN_WEEKS) and target (ORDER_QTY)
    lag_columns = [f'Lag_{lag}' for lag in range(1, num_lags + 1)]
    feature_columns = lag_columns + ['UNIT_PRICE', 'LEAD_TIME_IN_WEEKS']
    
    # Define X (features) and y (target)
    X = product_df[feature_columns]
    y = product_df['ORDER_QTY']
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    # Initialize the Linear Regression model
    model = LinearRegression()
    
    # Fit the model to the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate RMSE
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    # Calculate R²
    r2 = r2_score(y_test, y_pred)
    
    # Forecast for the next 25 months
    # Create a new DataFrame for future dates
    future_dates = pd.date_range(product_df['ORDER_MONTH'].max(), periods=forecast_periods, freq='M')
    
    # Initialize the last known lag values, UNIT_PRICE, and LEAD_TIME_IN_WEEKS
    last_lags = list(product_df[lag_columns].iloc[-1])
    last_unit_price = product_df['UNIT_PRICE'].iloc[-1]
    last_lead_time = product_df['LEAD_TIME_IN_WEEKS'].iloc[-1]
    
    # Create an empty list to store predictions
    future_preds = []
    
    for i in range(forecast_periods):
        # Prepare input features for the next prediction, including lag values, UNIT_PRICE, and LEAD_TIME_IN_WEEKS
        future_X = np.array([last_lags + [last_unit_price, last_lead_time]])
        
        # Predict the next ORDER_QTY
        future_pred = model.predict(future_X)[0]
        
        # Append the prediction to the list
        future_preds.append(future_pred)
        
        # Update lag values for the next iteration
        last_lags = [future_pred] + last_lags[:-1]  # Shift the lags with the new prediction

    # Create a DataFrame for this product's future ORDER_QTY predictions
    future_forecast_df = pd.DataFrame({
        'ORDER_DATE': future_dates,
        'Predicted_ORDER_QTY': future_preds,
        'PRODUCT_ID': product_df['Agg_PRODUCT_ID'].iloc[0],  # Add the PRODUCT_ID to each row
        'RMSE': rmse,  # Include the RMSE value for this product
        'R2': r2       # Include the R² value for this product
    })
    
    return future_forecast_df


In [15]:
# Initialize an empty DataFrame to store all product forecasts
all_products_forecast_df = pd.DataFrame()

In [16]:
# Loop over each product group by 'PRODUCT_ID'
for product_id, product_df in df.groupby('Agg_PRODUCT_ID'):
    # Apply the forecast function for each product
    product_forecast_df = forecast_for_product(product_df)
    
    # Append the forecast for this product to the overall forecast DataFrame
    all_products_forecast_df = pd.concat([all_products_forecast_df, product_forecast_df], ignore_index=True)

In [17]:
# Rename the columns
all_products_forecast_df.rename(columns={'Predicted_ORDER_QTY': 'ORDER_QTY'}, inplace=True)

In [18]:
# Display the combined forecast DataFrame for all products
print(all_products_forecast_df)

     ORDER_DATE      ORDER_QTY   PRODUCT_ID         RMSE        R2
0    2024-08-31  113684.640521  Product_001  3685.278099  0.943574
1    2024-09-30  114064.503561  Product_001  3685.278099  0.943574
2    2024-10-31  114439.933293  Product_001  3685.278099  0.943574
3    2024-11-30  114810.731922  Product_001  3685.278099  0.943574
4    2024-12-31  115177.311757  Product_001  3685.278099  0.943574
...         ...            ...          ...          ...       ...
1095 2026-04-30   28825.148999  Product_100  2921.990042  0.587413
1096 2026-05-31   28827.782579  Product_100  2921.990042  0.587413
1097 2026-06-30   28832.630212  Product_100  2921.990042  0.587413
1098 2026-07-31   28836.693347  Product_100  2921.990042  0.587413
1099 2026-08-31   28839.760446  Product_100  2921.990042  0.587413

[1100 rows x 5 columns]
