In [6]:

import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Assuming 'df' is the DataFrame that contains your data
# df = pd.read_csv('your_data.csv')



from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()
 
table_name = '"MASTER_DATA"'
 
sf_df = my_session.sql("select * from {}".format(table_name))
gdf = sf_df.to_pandas()

df = df[df['PRODUCT_ID'] == 'Product_001']

# Convert 'ORDER_DATE' to datetime
df['ORDER_DATE'] = pd.to_datetime(df['ORDER_DATE'])

# Sort data by 'ORDER_DATE' to maintain time series order
df = df.sort_values(by='ORDER_DATE')

# Function to create multiple lag features for ORDER_QTY
def create_lags(df, target_column, num_lags):
    for lag in range(1, num_lags + 1):
        df[f'Lag_{lag}'] = df[target_column].shift(lag)
    return df

# Create lag features for ORDER_QTY, for example, using the last 6 months
num_lags = 6
df = create_lags(df, 'ORDER_QTY', num_lags)

# Drop rows with NaN values (due to shifting from lag features)
df = df.dropna()

# Define the features (Lag_1, Lag_2, ..., Lag_n, UNIT_PRICE, LEAD_TIME_IN_WEEKS) and target (ORDER_QTY)
lag_columns = [f'Lag_{lag}' for lag in range(1, num_lags + 1)]
feature_columns = lag_columns + ['UNIT_PRICE', 'LEAD_TIME_IN_WEEKS']

# Define X (features) and y (target)
X = df[feature_columns]
y = df['ORDER_QTY']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Initialize the Linear Regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Root Mean Squared Error: {rmse}")

# Calculate R-squared (R²)
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R²): {r2}")

# Forecast for the next 25 months
# Create a new DataFrame for future dates
future_dates = pd.date_range(df['ORDER_DATE'].max(), periods=25, freq='M')

# Initialize the last known lag values, UNIT_PRICE, and LEAD_TIME_IN_WEEKS
last_lags = list(df[lag_columns].iloc[-1])
last_unit_price = df['UNIT_PRICE'].iloc[-1]
last_lead_time = df['LEAD_TIME_IN_WEEKS'].iloc[-1]

# Create an empty list to store predictions
future_preds = []

for i in range(25):
    # Prepare input features for the next prediction, including lag values, UNIT_PRICE, and LEAD_TIME_IN_WEEKS
    future_X = np.array([last_lags + [last_unit_price, last_lead_time]])
    
    # Predict the next ORDER_QTY
    future_pred = model.predict(future_X)[0]
    
    # Append the prediction to the list
    future_preds.append(future_pred)
    
    # Update lag values for the next iteration
    last_lags = [future_pred] + last_lags[:-1]  # Shift the lags with the new prediction

# Create a DataFrame to store future ORDER_QTY predictions along with their corresponding ORDER_DATE
future_forecast_df = pd.DataFrame({
    'ORDER_DATE': future_dates,
    'Predicted_ORDER_QTY': future_preds
})

# Display the resulting DataFrame
print(future_forecast_df)

# Optionally, save the DataFrame to a CSV file if needed
# future_forecast_df.to_csv('future_order_qty_forecast.csv', index=False)

Mean Squared Error: 22774.70195849878
Root Mean Squared Error: 150.91289526908818
R-squared (R²): 0.7294089810268344
   ORDER_DATE  Predicted_ORDER_QTY
0  2024-08-31           389.057952
1  2024-09-30           376.217232
2  2024-10-31           384.315002
3  2024-11-30           394.041601
4  2024-12-31           407.482364
5  2025-01-31           393.450357
6  2025-02-28           405.166493
7  2025-03-31           408.368866
8  2025-04-30           412.518463
9  2025-05-31           416.509130
10 2025-06-30           420.249788
11 2025-07-31           423.369700
12 2025-08-31           426.967737
13 2025-09-30           430.225459
14 2025-10-31           433.410018
15 2025-11-30           436.480015
16 2025-12-31           439.437074
17 2026-01-31           442.283738
18 2026-02-28           445.045010
19 2026-03-31           447.705062
20 2026-04-30           450.274218
21 2026-05-31           452.754096
22 2026-06-30           455.147812
23 2026-07-31           457.458473
24 2026-