In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the raw order data into a pandas DataFrame named 'df'.
# (A local CSV could be used instead, e.g. pd.read_csv('your_data.csv').)
from fosforml.model_manager.snowflakesession import get_session

table_name = '"MASTER_DATA"'
my_session = get_session()

# Pull every column of the table through the session and materialise it
# as a pandas DataFrame.
query = "select * from {}".format(table_name)
sf_df = my_session.sql(query)
df = sf_df.to_pandas()

# Parse 'ORDER_DATE' as datetimes, then order the rows chronologically so
# the lag features built further down follow time order.
df = df.assign(ORDER_DATE=pd.to_datetime(df['ORDER_DATE'])).sort_values(by='ORDER_DATE')

def create_lags(df, target_column, num_lags):
    """Return a copy of ``df`` with lagged versions of ``target_column`` added.

    For each ``k`` in ``1..num_lags`` a column ``Lag_k`` is added holding
    ``df[target_column].shift(k)``, i.e. the value found ``k`` rows earlier.
    The first ``k`` rows of ``Lag_k`` are therefore NaN.

    The input frame is left untouched: callers typically pass a groupby
    slice, and writing new columns into it would mutate the caller's data
    (and trigger pandas' SettingWithCopyWarning).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are already in the order the lags should follow.
    target_column : str
        Name of the column to lag.
    num_lags : int
        Number of lag columns to create; values below 1 add no columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus ``Lag_1`` .. ``Lag_n``.
    """
    lag_features = {
        f'Lag_{lag}': df[target_column].shift(lag)
        for lag in range(1, num_lags + 1)
    }
    # assign() builds a new DataFrame, so the caller's frame is not modified.
    return df.assign(**lag_features)

def forecast_for_product(product_df, num_lags=6, forecast_periods=25):
    """Forecast future ORDER_QTY values for a single product.

    A linear regression is fitted on ``num_lags`` lagged ORDER_QTY values plus
    UNIT_PRICE and LEAD_TIME_IN_WEEKS. The model is then rolled forward one
    period at a time, feeding each prediction back in as the newest lag while
    UNIT_PRICE and LEAD_TIME_IN_WEEKS are held at their last observed values.

    Parameters
    ----------
    product_df : pandas.DataFrame
        Rows for one product with ORDER_DATE (datetime-like), ORDER_QTY,
        UNIT_PRICE, LEAD_TIME_IN_WEEKS and PRODUCT_ID columns. Not modified.
    num_lags : int, default 6
        Number of lagged ORDER_QTY features.
    forecast_periods : int, default 25
        Number of month-end periods to forecast.

    Returns
    -------
    pandas.DataFrame
        Columns ORDER_DATE (month-end dates after the last observed month),
        Predicted_ORDER_QTY and PRODUCT_ID, one row per forecast period.

    Raises
    ------
    ValueError
        If fewer than two complete rows remain once the lag features have
        been built and rows containing NaN have been dropped.
    """
    lag_columns = [f'Lag_{lag}' for lag in range(1, num_lags + 1)]
    feature_columns = lag_columns + ['UNIT_PRICE', 'LEAD_TIME_IN_WEEKS']

    # Lags are positional, so make sure the rows are chronological. The stable
    # sort keeps the incoming order for equal dates and returns a new frame,
    # so the caller's data is never written to.
    history = product_df.sort_values(by='ORDER_DATE', kind='stable')
    history = create_lags(history, 'ORDER_QTY', num_lags)

    # Drop rows with NaN values (the first `num_lags` rows have no full lag
    # history; rows with NaN in any other column are dropped as well).
    history = history.dropna()

    if len(history) < 2:
        raise ValueError(
            f"Not enough complete rows to fit a model: {len(history)} left "
            f"after building {num_lags} lag feature(s) and dropping NaN rows"
        )

    X = history[feature_columns]
    y = history['ORDER_QTY']

    # Chronological split: the model is fitted on the earliest 80% of rows.
    # NOTE(review): the held-out 20% is never scored, and the most recent
    # observations are not used for fitting - confirm this is intended.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, shuffle=False)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Seed the lag window for the first *future* period. Relative to that
    # period, Lag_1 is the last observed ORDER_QTY, Lag_2 is the last row's
    # Lag_1, and so on. Using the last row's own lag columns unchanged would
    # re-predict the last observed period and ignore the newest observation.
    newest_first = history[['ORDER_QTY'] + lag_columns].iloc[-1]
    last_lags = list(newest_first)[:num_lags]
    last_unit_price = history['UNIT_PRICE'].iloc[-1]
    last_lead_time = history['LEAD_TIME_IN_WEEKS'].iloc[-1]

    future_preds = []
    for _ in range(forecast_periods):
        # Keep the training column names so scikit-learn can match features
        # by name instead of warning about an unnamed array.
        future_X = pd.DataFrame(
            [last_lags + [last_unit_price, last_lead_time]],
            columns=feature_columns,
        )
        future_pred = model.predict(future_X)[0]
        future_preds.append(future_pred)

        # Slide the window: the new prediction becomes Lag_1 and the oldest
        # lag falls off. Slicing to num_lags also keeps num_lags=0 at length 0.
        last_lags = ([future_pred] + last_lags)[:num_lags]

    # Month-end dates starting with the month *after* the last observation, so
    # the forecast never overlaps a month that already has actual data.
    first_future_month = history['ORDER_DATE'].max() + pd.offsets.MonthBegin(1)
    future_dates = pd.date_range(
        first_future_month,
        periods=forecast_periods,
        freq=pd.offsets.MonthEnd(),
    )

    return pd.DataFrame({
        'ORDER_DATE': future_dates,
        'Predicted_ORDER_QTY': future_preds,
        'PRODUCT_ID': history['PRODUCT_ID'].iloc[0],  # same id on every row
    })

# Collect each product's forecast frame; they are combined in one step below.
product_forecasts = []

# Loop over each product group by 'PRODUCT_ID'
for product_id, product_df in df.groupby('PRODUCT_ID'):
    # Apply the forecast function for each product
    product_forecast_df = forecast_for_product(product_df)
    product_forecasts.append(product_forecast_df)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# whole accumulated frame on every iteration. pd.concat rejects an empty list,
# so fall back to an empty DataFrame when there are no products.
if product_forecasts:
    all_products_forecast_df = pd.concat(product_forecasts, ignore_index=True)
else:
    all_products_forecast_df = pd.DataFrame()

# Rename the prediction column so the forecast uses the same name as the history
all_products_forecast_df = all_products_forecast_df.rename(
    columns={'Predicted_ORDER_QTY': 'ORDER_QTY'}
)

# Display the combined forecast DataFrame for all products
print(all_products_forecast_df)

# Optionally, save the resulting DataFrame to a CSV file
# all_products_forecast_df.to_csv('all_products_order_qty_forecast.csv', index=False)

ModuleNotFoundError: No module named 'statsmodels'