In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from sklearn.metrics import mean_absolute_error
import plotly.graph_objs as go
from prophet.plot import plot_plotly
from sklearn.model_selection import train_test_split
# import xgboost
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from nixtlats import TimeGPT
import logging
from tabulate import tabulate
from statsmodels.tsa.arima.model import ARIMA


In [2]:
# !python -m pip install --upgrade pip setuptools


In [3]:
import os

def read_csv_files(folder_path):
    # Create an empty dictionary to store the DataFrames
    dfs = {}

    # Iterate over the files in the folder
    for filename in os.listdir(folder_path):
        # Check if the file is a CSV file
        if filename.endswith(".csv"):
            # Extract the product ID from the filename
            product_id = filename.split("_")[1].split(".")[0]
            product_id = int(product_id)

            # Read the DataFrame from the CSV file
            df = pd.read_csv(os.path.join(folder_path, filename), index_col='Date')

            # Store the DataFrame in the dictionary
            dfs[product_id] = df

    # Sort the dictionary by product ID
    dfs = {key: dfs[key] for key in sorted(dfs.keys())}

    return dfs


## Products Original

In [4]:
dfs_original_train = read_csv_files('products_original_train')

# Read CSV files from "products_original_test" folder
dfs_original_test = read_csv_files('products_original_test')

In [5]:
dfs_original_train.keys()

dict_keys([1, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 16, 20, 36])

In [6]:
dfs_original_train[4].head()

Unnamed: 0_level_0,PRO27826_org,RohiENERGY1000_org,PRO27840_org,MAB_ELE_PRO826,RohiNATGAS1000_org,Sales_CPI_€
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-01,118.670791,100.222169,112.853256,113.659322,89.570796,394180.84379
2018-11-01,120.467019,84.436807,113.145294,115.088417,97.362468,365752.5837
2018-12-01,105.378705,74.898746,111.823624,101.556108,94.406578,423649.4456
2019-01-01,107.174933,76.204771,109.499725,101.799754,80.055366,473037.88076
2019-02-01,110.64764,80.086039,110.835655,103.495768,69.545042,608251.1958


In [7]:
dfs_original_test.keys()

dict_keys([1, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 16, 20, 36])

## Products Lag

In [8]:
dfs_lag_train = read_csv_files('products_lag_train')

# Read CSV files from "products_original_test" folder
dfs_lag_test = read_csv_files('products_lag_test')

## Product Outlier

In [9]:
dfs_outlier_train = read_csv_files('products_outliers_train')

# Read CSV files from "products_original_test" folder
dfs_outlier_test = read_csv_files('products_outliers_test')

## Test Set

In [10]:
test = pd.read_csv('Case2_Test Set Template.csv', sep =';')

In [11]:
test.head()

Unnamed: 0,Month Year,Mapped_GCK,Sales_EUR
0,Mai 22,#3,
1,Jun 22,#3,
2,Jul 22,#3,
3,Aug 22,#3,
4,Sep 22,#3,


In [12]:
month_map = {
    'Mai': 'May', 'Okt': 'Oct', 'Dez': 'Dec'
}

# Aplicar o mapeamento para converter os nomes dos meses de PT para EN
test['Month Year'] = test['Month Year'].replace(month_map, regex=True)

# Converter 'Month Year' para o tipo datetime e formatar para 'YYYY-MM-DD'
test['Month Year'] = pd.to_datetime(test['Month Year'] + ' 1', format='%b %y %d')

In [13]:
test['Mapped_GCK'] = test['Mapped_GCK'].str.replace('#', '')

In [14]:
print(test['Month Year'].isna().sum())
print(test['Month Year'].unique())

0
<DatetimeArray>
['2022-05-01 00:00:00', '2022-06-01 00:00:00', '2022-07-01 00:00:00',
 '2022-08-01 00:00:00', '2022-09-01 00:00:00', '2022-10-01 00:00:00',
 '2022-11-01 00:00:00', '2022-12-01 00:00:00', '2023-01-01 00:00:00',
 '2023-02-01 00:00:00']
Length: 10, dtype: datetime64[ns]


In [15]:
test.head()

Unnamed: 0,Month Year,Mapped_GCK,Sales_EUR
0,2022-05-01,3,
1,2022-06-01,3,
2,2022-07-01,3,
3,2022-08-01,3,
4,2022-09-01,3,


In [16]:
def date_index_to_datetime(df):
    df.index = pd.to_datetime(df.index, format='%Y-%m-%d')

# Iterate over each DataFrame in the dictionary and convert the index to datetime
for dfs_dict in [dfs_original_train, dfs_original_test, dfs_lag_train, dfs_lag_test, dfs_outlier_train, dfs_outlier_test]:
    for df in dfs_dict.values():
        date_index_to_datetime(df)


## Modelling

In [17]:
def evaluate_model_with_error_table(model, X_train, y_train, X_test, y_test):
    # Predictions
    train_predictions = model.predict(X_train)
    val_predictions = model.predict(X_test)
    
    # Calculate RMSE
    train_rmse = mean_squared_error(y_train, train_predictions, squared=False)  # RMSE for training set
    val_rmse = mean_squared_error(y_test, val_predictions, squared=False)  # RMSE for testing set
    
    # Print the RMSE for both train and test sets
    print(f"Train RMSE: {train_rmse:.3f}")
    print(f"Test RMSE: {val_rmse:.3f}")
    
    # Return the RMSE for both train and test sets
    return train_rmse, val_rmse

In [18]:
# Define RMSE scorer for evaluation
def rmse(y_true, y_pred):
    return mean_squared_error(y_true,y_pred, squared=False)

In [19]:
def plot_predictions(y_val, predictions):

    plt.figure(figsize=(10, 6))
    
    # Ensure the index is correctly sorted for plotting
    sorted_index = y_val.sort_index().index
    sorted_y_val = y_val.sort_index()
    sorted_predictions = pd.Series(predictions, index=sorted_index).sort_index()
    
    # Plot real vs predicted values
    plt.plot(sorted_index, sorted_y_val, label='Real', marker='o', color='blue')
    plt.plot(sorted_index, sorted_predictions, label='Prediction', marker='x', linestyle='--', color='orange')
    
    plt.title('Predictions vs Actual Values')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

### XGBoost

In [20]:
def run_gridsearch_xgboost(X_train, y_train):

    # Define the parameter grid to search
    param_grid = {
        'n_estimators': [100, 500, 1000],  # Number of trees in the forest
        'learning_rate': [0.01, 0.05, 0.1],  # Step size shrinkage used to prevent overfitting
        'max_depth': [3, 5]  # Maximum depth of a tree
    }
    
    # Initialize the model
    model = XGBRegressor(objective='reg:squarederror')  # Objective is to minimize squared error
    
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    
    # Setup GridSearchCV
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring=rmse_scorer, verbose=0)
    grid_search.fit(X_train, y_train)  # Perform the grid search

    best_params = grid_search.best_params_  # Retrieve the best parameters
    
    print("Best Parameters:")  # Print the best parameters found
    print(best_params)
    
    return grid_search.best_estimator_ , best_params # Return the best model

## XGBoost for each Product

In [21]:
#dictionary to store the best RMSE, for each product 
best_rmse_xgboost = {}

In [22]:
def process_data_and_model(dfs_original_train, dfs_original_test, dfs_lag_train, dfs_lag_test, dfs_outlier_train, dfs_outlier_test, model_name):

    for product_id in dfs_original_train.keys():
        print(f"\nProduct ID: {product_id}")

        product_results = {}
        min_product_rmse = float('inf')  # Initialize with a very high value
        min_rmse_df = None  # Initialize with None
        best_parameters = {}

        for df_train, df_test, df_type in zip(
            [dfs_original_train, dfs_lag_train, dfs_outlier_train],
            [dfs_original_test, dfs_lag_test, dfs_outlier_test],
            ['original', 'lag', 'outlier']
        ):
            print(f"\n{df_type}")

            # Get the corresponding train and test DataFrames
            df_train_data = df_train[product_id]
            df_test_data = df_test[product_id]

            # Split data into train and test sets
            X_train, y_train = df_train_data.drop(columns=['Sales_CPI_€']), df_train_data['Sales_CPI_€']
            X_val, y_val = df_test_data.drop(columns=['Sales_CPI_€']), df_test_data['Sales_CPI_€']

            # Perform GridSearchCV to find the best parameters
            best_model, best_params  = run_gridsearch_xgboost(X_train, y_train)

            # Evaluate the model and get RMSE
            train_rmse, val_rmse = evaluate_model_with_error_table(best_model, X_train, y_train, X_val, y_val)
    
            # Update the minimum RMSE for the current product and the corresponding DataFrame
            if val_rmse < min_product_rmse:
                min_product_rmse = val_rmse
                min_rmse_df = df_type
                best_parameters = best_params


        # Store the best RMSE for validation and the corresponding DataFrame
        best_rmse_xgboost[product_id] = {
            'Model': model_name,
            'Dataframe': min_rmse_df,  
            'Best RMSE for Validation': round(min_product_rmse),
            'Best Parameters': best_parameters
        }

        # Print the minimum RMSE for the current product and the corresponding DataFrame
        print(f"\nMinimum RMSE for Product {product_id}: {min_product_rmse:.3f} (DataFrame: {min_rmse_df})")

    # return results

# Call the function with your dictionaries for train and test DataFrames and the model name
process_data_and_model(
    dfs_original_train, dfs_original_test,
    dfs_lag_train, dfs_lag_test,
    dfs_outlier_train, dfs_outlier_test,
    model_name='XGBoost'
)



Product ID: 1

original


Best Parameters:
{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
Train RMSE: 3632897.247
Test RMSE: 6442858.568

lag
Best Parameters:
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Train RMSE: 1699093.585
Test RMSE: 5968853.318

outlier
Best Parameters:
{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
Train RMSE: 1754044.620
Test RMSE: 6086321.346

Minimum RMSE for Product 1: 5968853.318 (DataFrame: lag)

Product ID: 3

original
Best Parameters:
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Train RMSE: 1464198.842
Test RMSE: 3782289.497

lag
Best Parameters:
{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
Train RMSE: 1403810.804
Test RMSE: 3591252.468

outlier
Best Parameters:
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Train RMSE: 387484.945
Test RMSE: 3067808.469

Minimum RMSE for Product 3: 3067808.469 (DataFrame: outlier)

Product ID: 4

original
Best Parameters:
{'learning_rate': 0.01, 'max_depth':

In [23]:
best_rmse_xgboost[1]

{'Model': 'XGBoost',
 'Dataframe': 'lag',
 'Best RMSE for Validation': 5968853,
 'Best Parameters': {'learning_rate': 0.01,
  'max_depth': 3,
  'n_estimators': 100}}

## Prophet for each Product

In [24]:
#dictionary to store the best RMSE, for each product 
best_rmse_prophet = {}

In [25]:
dfs_original_train_copy = dfs_original_train.copy()
dfs_original_test_copy = dfs_original_test.copy()
dfs_lag_train_copy = dfs_lag_train.copy()
dfs_lag_test_copy = dfs_lag_test.copy()
dfs_outlier_train_copy = dfs_outlier_train.copy()
dfs_outlier_test_copy = dfs_outlier_test.copy()

In [26]:
def rename_columns(df):
    df_copy = df.reset_index().rename(columns={'Date': 'ds', 'Sales_CPI_€': 'y'})
    return df_copy

# Rename columns in each DataFrame in the dictionaries
for df_dict in [dfs_original_train_copy, dfs_original_test_copy, dfs_lag_train_copy, dfs_lag_test_copy, dfs_outlier_train_copy, dfs_outlier_test_copy]:
    for product_id in df_dict.keys():
        df_dict[product_id] = rename_columns(df_dict[product_id])

In [27]:
def evaluate_prophet_model(df, forecast):
    # Extract actual values from the original dataframe
    y_true = df['y'].values
    
    # Extract predicted values from the forecast
    y_pred = forecast['yhat'].values[-len(df):]  # Forecasted values for the original period
    
    # Calculate RMSE
    rmse = np.sqrt(np.mean((y_true - y_pred)**2))
    
    return None, rmse  # No train RMSE for Prophet

In [28]:
# Suppressing INFO level logs from cmdstanpy and prophet
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)
logging.getLogger('prophet').setLevel(logging.WARNING)

In [29]:
def process_data_and_model_prophet(df_original_train, df_original_test, df_lag_train, df_lag_test, df_outlier_train, df_outlier_test, model_name):

    for product_id in df_original_train.keys():
        print(f"\nProduct ID: {product_id}")

        min_product_rmse = float('inf')  # Initialize with a very high value
        min_rmse_df = None  # Initialize with None

        
        for df_train, df_test, df_type in zip(
            [df_original_train, df_lag_train, df_outlier_train],
            [df_original_test, df_lag_test, df_outlier_test],
            ['original', 'lag', 'outlier']
        ):
            print(f"\n{df_type}")

            # Get the corresponding train and test DataFrames
            df_train_data = df_train[product_id]
            df_test_data = df_test[product_id]


            # Initialize and fit Prophet model
            model = Prophet()
            model.fit(df_train_data)
            
            # Make future predictions
            future = model.make_future_dataframe(periods=10)  # Extend the dataframe by 1 year
            forecast = model.predict(df_test_data)
            
            # Evaluate the model and get RMSE
            train_rmse, val_rmse = evaluate_prophet_model(df_test_data, forecast)
            
            # Update the minimum RMSE for the current product and the corresponding DataFrame
            if val_rmse < min_product_rmse:
                min_product_rmse = val_rmse
                min_rmse_df = df_type
        
        #Store the best RMSE for validation and the corresponding DataFrame
        best_rmse_prophet[product_id] = {
            'Model': model_name,
            'Dataframe': min_rmse_df,  
            'Best RMSE for Validation': min_product_rmse
        }

        # Print the minimum RMSE for the current product and the corresponding DataFrame
        print(f"\nMinimum RMSE for Product {product_id}: {min_product_rmse:.3f} (DataFrame: {min_rmse_df})")
        
    #return results

# Call the function with your dictionaries for train and test DataFrames and the model name
process_data_and_model_prophet(
    dfs_original_train_copy, dfs_original_test_copy, dfs_lag_train_copy, dfs_lag_test_copy, dfs_outlier_train_copy, dfs_outlier_test_copy,
    model_name='Prophet'
)


Product ID: 1

original


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/q4n0002_.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/xxphjndn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=29194', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/q4n0002_.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/xxphjndn.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelyh4_h_ue/prophet_model-20240409172959.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:29:59 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:29:59 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done pro


lag


17:29:59 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/3dem8y51.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/71lz5sw7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=1577', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/3dem8y51.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/71lz5sw7.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modeluwes074m/prophet_model-20240409172959.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:29:59 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start proc


outlier


17:30:00 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/y9dlmypj.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/whsrytr7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=98089', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/y9dlmypj.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/whsrytr7.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_model2y0ltupp/prophet_model-20240409173000.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:00 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


Minimum RMSE for Product 1: 4436099.624 (DataFrame: original)

Product ID: 3

original


17:30:00 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/_myu0doa.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/aygciz4y.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=72533', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/_myu0doa.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/aygciz4y.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelrp_m8whi/prophet_model-20240409173000.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:00 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


lag


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/3uza8zqy.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/wv5ymles.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=33300', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/3uza8zqy.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/wv5ymles.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modele20tzaod/prophet_model-20240409173000.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:00 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing



outlier


17:30:00 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/z7e7ta1j.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/hk99f49m.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=46624', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/z7e7ta1j.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/hk99f49m.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelcg9x9rqy/prophet_model-20240409173001.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:01 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


Minimum RMSE for Product 3: 2018385.480 (DataFrame: outlier)

Product ID: 4

original


17:30:01 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/ptrtz46c.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/y9h7esgd.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=8543', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/ptrtz46c.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/y9h7esgd.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelt4vbcdfw/prophet_model-20240409173001.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:01 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start proc


lag


17:30:01 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/w9df7ky6.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/8cyxguwe.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=57362', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/w9df7ky6.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/8cyxguwe.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modellcz6pztf/prophet_model-20240409173001.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:01 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


outlier


17:30:01 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/u4je3tqc.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/iszhccfn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=84626', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/u4je3tqc.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/iszhccfn.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modellkyugm5f/prophet_model-20240409173001.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:01 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


Minimum RMSE for Product 4: 195258.513 (DataFrame: original)

Product ID: 5

original


17:30:02 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/q69n6h0k.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/5i32fkp7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=29855', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/q69n6h0k.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/5i32fkp7.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelo5v72vb8/prophet_model-20240409173002.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:02 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


lag


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/dkirrbc9.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/u1toaiuy.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=21848', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/dkirrbc9.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/u1toaiuy.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelkm019te_/prophet_model-20240409173002.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:02 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing



outlier


17:30:02 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/agcqeuf6.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/ya1n_qw8.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=50918', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/agcqeuf6.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/ya1n_qw8.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelxp8a2lwb/prophet_model-20240409173002.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:02 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


Minimum RMSE for Product 5: 4049232.281 (DataFrame: lag)

Product ID: 6

original


17:30:02 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/w17z912_.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/xvgiw7ac.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=52837', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/w17z912_.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/xvgiw7ac.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelgj6vk6uj/prophet_model-20240409173002.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:02 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


lag


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/4usuerz6.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/6e486gav.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=33363', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/4usuerz6.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/6e486gav.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_model5wwb3zod/prophet_model-20240409173003.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:03 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing



outlier


17:30:03 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/f_a7a2c3.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/boj55igw.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=22035', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/f_a7a2c3.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/boj55igw.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelyxzenkp2/prophet_model-20240409173003.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:03 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


Minimum RMSE for Product 6: 281802.000 (DataFrame: original)

Product ID: 8

original


17:30:03 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/ui_9qrx4.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/zj4yef20.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=36153', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/ui_9qrx4.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/zj4yef20.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_model6jwnwax8/prophet_model-20240409173003.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:03 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


lag


17:30:03 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/uvoe5nyr.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/yclajnva.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=28012', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/uvoe5nyr.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/yclajnva.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modeljteenbbi/prophet_model-20240409173003.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:03 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


outlier


17:30:04 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/zjymw0q7.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/_lgr1ton.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=10205', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/zjymw0q7.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/_lgr1ton.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelipy3daks/prophet_model-20240409173004.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:04 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


Minimum RMSE for Product 8: 610963.283 (DataFrame: lag)

Product ID: 9

original

lag


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/23rohms6.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/1pjutpmv.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=53139', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/23rohms6.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/1pjutpmv.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelqpymhxo2/prophet_model-20240409173004.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:04 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:30:04 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done pro


outlier

Minimum RMSE for Product 9: 8852.540 (DataFrame: outlier)

Product ID: 11

original


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/284jll15.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/g213y4dl.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=17560', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/284jll15.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/g213y4dl.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelnwhr_k61/prophet_model-20240409173004.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:04 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:30:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done pro


lag

outlier


DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=28728', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/xcsjc350.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/3dto74xv.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_model2rsdkamf/prophet_model-20240409173005.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:05 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:30:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/71rb9ziw.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/vyf9qwlt.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_thread


Minimum RMSE for Product 11: 780346.628 (DataFrame: original)

Product ID: 12

original

lag


17:30:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/nual1xti.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/y12nfg42.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=15630', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/nual1xti.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/y12nfg42.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_models_tr4rg2/prophet_model-20240409173005.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:05 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


outlier

Minimum RMSE for Product 12: 120494.566 (DataFrame: lag)

Product ID: 13

original


17:30:06 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/uom76xmw.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/bcncenv5.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=69745', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/uom76xmw.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/bcncenv5.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelof0f_pax/prophet_model-20240409173006.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:06 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


lag


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/rtxbyeqb.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/kkiq99ld.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=39380', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/rtxbyeqb.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/kkiq99ld.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modeloa11_pwi/prophet_model-20240409173006.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:06 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:30:06 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done pro


outlier


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/64unvstu.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/crbfi9h7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=17598', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/64unvstu.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/crbfi9h7.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_model4stzqgc9/prophet_model-20240409173006.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:06 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:30:06 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done pro


Minimum RMSE for Product 13: 17281.824 (DataFrame: original)

Product ID: 14

original


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/1dyi61uh.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/rjpcpp_2.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=72088', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/1dyi61uh.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/rjpcpp_2.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_model497eoa7m/prophet_model-20240409173006.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:06 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:30:06 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done pro


lag

outlier


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/50y8n6uw.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=85203', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/cmh80q3s.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/50y8n6uw.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelnyclgzl1/prophet_model-20240409173007.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:07 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:30:07 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/4al839


Minimum RMSE for Product 14: 17703.230 (DataFrame: lag)

Product ID: 16

original


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/l5avtpx_.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/4_p39d0y.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=46278', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/l5avtpx_.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/4_p39d0y.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelcs1ti_n_/prophet_model-20240409173007.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:07 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:30:07 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done pro


lag

outlier


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/x82us509.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/8qkrgczi.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=76230', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/x82us509.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/8qkrgczi.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_model7piq0rc6/prophet_model-20240409173007.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:07 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:30:08 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done pro


Minimum RMSE for Product 16: 169896.884 (DataFrame: lag)

Product ID: 20

original


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/bv0rs89d.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/5qojg9sh.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=5195', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/bv0rs89d.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/5qojg9sh.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelx2l_giwz/prophet_model-20240409173008.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:08 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:30:08 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done proc


lag

outlier


17:30:08 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/lwv0gjad.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/vreys_qb.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=32178', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/lwv0gjad.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/vreys_qb.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_model7oe_cnfy/prophet_model-20240409173008.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:08 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start pro


Minimum RMSE for Product 20: 3364.168 (DataFrame: original)

Product ID: 36

original

lag


DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/h8gn2fqk.json
DEBUG:cmdstanpy:input tempfile: /var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/o_wo5jhf.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/ritamatias/anaconda3/envs/DM2324/lib/python3.11/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=15593', 'data', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/h8gn2fqk.json', 'init=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/o_wo5jhf.json', 'output', 'file=/var/folders/g8/89ybwtg53nb272y2m1cv_qnc0000gn/T/tmplosni2kv/prophet_modelkv5ghgac/prophet_model-20240409173008.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:30:08 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:30:08 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done pro


outlier


17:30:09 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing



Minimum RMSE for Product 36: 16031.966 (DataFrame: outlier)


## LLM for each product

In [30]:
dfs_original_train_copy = dfs_original_train.copy()
dfs_original_test_copy = dfs_original_test.copy()
dfs_lag_train_copy = dfs_lag_train.copy()
dfs_lag_test_copy = dfs_lag_test.copy()
dfs_outlier_train_copy = dfs_outlier_train.copy()
dfs_outlier_test_copy = dfs_outlier_test.copy()

In [31]:
def rename_columns(df):
    df_copy = df.reset_index()
    return df_copy

# Rename columns in each DataFrame in the dictionaries
for df_dict in [dfs_original_train_copy, dfs_original_test_copy, dfs_lag_train_copy, dfs_lag_test_copy, dfs_outlier_train_copy, dfs_outlier_test_copy]:
    for product_id in df_dict.keys():
        df_dict[product_id] = rename_columns(df_dict[product_id])

In [32]:
timegpt = TimeGPT(token='oQLhmPAiLjJdTk9sQBjLizwpE6w8YTOtzSVujoenJyEPH6TiJWz5ZEwJ3fk9lxhFt0sESeDkWNkdkClmhor1ikFbsa4I08qQwMEtb6lzAPCGDzuHvSA2rJWvYs8fuHkPpBSvTPMqOsUbp0ip8Okp3ZxTC4j6DByjMF2GAb2htzo1P4MRfzFtGCLsqqlI1r1EM5pC4qeZmvDNNSLCkyu7SUp8GCruLQUInMqB0MfEEsnwnHQnxJYiatoWvgdb9yie')



In [33]:
#dictionary to store the best RMSE, for each product 
best_rmse_timegpt = {}

In [34]:
logging.getLogger('nixtlats.timegpt').setLevel(logging.WARNING)

In [35]:
def process_data_and_model(df_original_train, df_original_test, df_lag_train,
                           df_lag_test, df_outlier_train, df_outlier_test,
                           model_name):

    for product_id in df_original_train.keys():
        print(f"\nProduct ID: {product_id}")

        product_results = {}
        min_product_rmse = float('inf')  # Initialize with a very high value
        min_rmse_df = None  # Initialize with None

        for df_train, df_test, df_type in zip(
                [df_original_train, df_lag_train, df_outlier_train],
                [df_original_test, df_lag_test, df_outlier_test],
                ['original', 'lag', 'outlier']
        ):
            print(f"\n{df_type}")

            # Get the corresponding train and test DataFrames
            train_df = df_train[product_id]
            test_df = df_test[product_id]

            # Make forecasts using TimeGPT
            forecasts = timegpt.forecast(train_df, time_col="Date", h=len(test_df), freq='M', target_col='Sales_CPI_€')

            # Get true values from the test dataframe
            true_values = test_df['Sales_CPI_€'].values

            # Get forecasted values from the TimeGPT forecasts
            forecasted_values = forecasts['TimeGPT'].values

            # Calculate RMSE
            test_rmse = rmse(true_values, forecasted_values)
            print(f"Test RMSE: {test_rmse}")

            # Update the minimum RMSE for the current product and the corresponding DataFrame
            if test_rmse < min_product_rmse:
                min_product_rmse = test_rmse
                min_rmse_df = df_type

        # Store the results for the product
        best_rmse_timegpt[product_id] = {
            'Model': model_name,
            'Dataframe': min_rmse_df,
            'Best RMSE for Validation': min_product_rmse
        }

        # Print the minimum RMSE for the current product and the corresponding DataFrame
        print(f"\nMinimum RMSE for Product {product_id}: {min_product_rmse:.3f} (DataFrame: {min_rmse_df})")


# Call the function with your dictionaries for train and test DataFrames and the model name
process_data_and_model(
    dfs_original_train_copy, dfs_original_test_copy, dfs_lag_train_copy, dfs_lag_test_copy, dfs_outlier_train_copy, dfs_outlier_test_copy,
    model_name='TimeGPT'
)


INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS



Product ID: 1

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 4849114.18174757

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 5684870.045848313

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 4849114.18174757

Minimum RMSE for Product 1: 4849114.182 (DataFrame: original)

Product ID: 3

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 2800589.884279119

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 3638206.4689079877

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 2134890.853863158

Minimum RMSE for Product 3: 2134890.854 (DataFrame: outlier)

Product ID: 4

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 202880.98959241904

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 163770.153573528

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 202880.98959241904

Minimum RMSE for Product 4: 163770.154 (DataFrame: lag)

Product ID: 5

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 4243791.135265702

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 3242274.3109364375

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 4243791.135265702

Minimum RMSE for Product 5: 3242274.311 (DataFrame: lag)

Product ID: 6

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 274856.36973037757

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 276829.10649591434

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 274856.36973037757

Minimum RMSE for Product 6: 274856.370 (DataFrame: original)

Product ID: 8

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 627906.9656185085

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 697116.6720730268

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 627906.9656185085

Minimum RMSE for Product 8: 627906.966 (DataFrame: original)

Product ID: 9

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 8657.509145033717

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 5876.297885137595

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 7316.952158684977

Minimum RMSE for Product 9: 5876.298 (DataFrame: lag)

Product ID: 11

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 826243.4087104176

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 800266.9018866945

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 826243.4087104176

Minimum RMSE for Product 11: 800266.902 (DataFrame: lag)

Product ID: 12

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 145677.67790386878

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 118115.73931067564

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 145677.67790386878

Minimum RMSE for Product 12: 118115.739 (DataFrame: lag)

Product ID: 13

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 12985.393664317

lag


INFO:nixtlats.nixtla_client:Attempt 1 failed...
INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Attempt 1 failed...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 15926.981331237172

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 12985.393664317

Minimum RMSE for Product 13: 12985.394 (DataFrame: original)

Product ID: 14

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 15649.523391593008

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 16837.348247659334

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 17709.232077619658

Minimum RMSE for Product 14: 15649.523 (DataFrame: original)

Product ID: 16

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 89172.7919704339

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 89587.56567986694

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 89172.7919704339

Minimum RMSE for Product 16: 89172.792 (DataFrame: original)

Product ID: 20

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 2249.737731928441

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 2290.789873168815

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 2249.737731928441

Minimum RMSE for Product 20: 2249.738 (DataFrame: original)

Product ID: 36

original


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 17391.40781079768

lag


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...
INFO:nixtlats.nixtla_client:Inferred freq: MS


Test RMSE: 17145.747474047996

outlier


INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...


Test RMSE: 16264.211009613986

Minimum RMSE for Product 36: 16264.211 (DataFrame: outlier)


### Choose the best model for each product

In [36]:
# Initialize a dictionary to store the best RMSE values for each product and its corresponding DataFrame
best_rmse_values = {}

# Combine all product IDs from the three dictionaries
all_product_ids = set(best_rmse_timegpt.keys()) | set(best_rmse_prophet.keys()) | set(best_rmse_xgboost.keys())

# Iterate over all product IDs
for product_id in all_product_ids:
    # Initialize variables to store the best RMSE value and model information
    best_rmse = float('inf')
    best_model = None
    best_dataframe = None
    best_parameters = None
    
    # Check if the product ID exists in the best_rmse_timegpt dictionary
    if product_id in best_rmse_timegpt:
        rmse_timegpt = best_rmse_timegpt[product_id]['Best RMSE for Validation']
        if rmse_timegpt < best_rmse:
            best_rmse = rmse_timegpt
            best_model = 'TimeGPT'
            best_dataframe = best_rmse_timegpt[product_id]['Dataframe']
    
    # Check if the product ID exists in the best_rmse_prophet dictionary
    if product_id in best_rmse_prophet:
        rmse_prophet = best_rmse_prophet[product_id]['Best RMSE for Validation']
        if rmse_prophet < best_rmse:
            best_rmse = rmse_prophet
            best_model = 'Prophet'
            best_dataframe = best_rmse_prophet[product_id]['Dataframe']
    
    # Check if the product ID exists in the best_rmse_xgboost dictionary
    if product_id in best_rmse_xgboost:
        rmse_xgboost = best_rmse_xgboost[product_id]['Best RMSE for Validation']
        if rmse_xgboost < best_rmse:
            best_rmse = rmse_xgboost
            best_model = 'XGBoost'
            best_dataframe = best_rmse_xgboost[product_id]['Dataframe']
            best_parameters = best_rmse_xgboost[product_id]['Best Parameters']
    
    # Store the best RMSE value and model information for the current product
    best_rmse_values[product_id] = {
        'Model': best_model,
        'Dataframe': best_dataframe,
        'Best RMSE for Validation': round(best_rmse),
        'Best Parameters': best_parameters
    }

# Now best_rmse_values contains the best RMSE value for each product among the three models


In [37]:
# df_best_rmse = pd.DataFrame(best_rmse_values).T.reset_index()

# # Rename the columns for better readability
# df_best_rmse.columns = ['Product_ID', 'Model', 'Dataframe', 'Best_RMSE', 'Best_Parameters']

# # Display the DataFrame
# df_best_rmse

In [38]:
# Assume best_rmse_values is your dictionary
table_data = []
for product_id, values in best_rmse_values.items():
    row = [product_id, values['Model'], values['Dataframe'], values['Best RMSE for Validation']]
    if values.get('Best Parameters'):
        parameters = ', '.join([f"{key}={value}" for key, value in values['Best Parameters'].items()])
        row.append(parameters)
    else:
        row.append('')
    table_data.append(row)

# Define the headers for the table
headers = ["Product ID", "Model", "Dataframe", "Best RMSE", "Best Parameters"]

# Print the table
print(tabulate(table_data, headers=headers, tablefmt="grid"))


+--------------+---------+-------------+-------------+----------------------------------------------------+
|   Product ID | Model   | Dataframe   |   Best RMSE | Best Parameters                                    |
|            1 | Prophet | original    |     4436100 |                                                    |
+--------------+---------+-------------+-------------+----------------------------------------------------+
|            3 | Prophet | outlier     |     2018385 |                                                    |
+--------------+---------+-------------+-------------+----------------------------------------------------+
|            4 | XGBoost | lag         |      115017 | learning_rate=0.05, max_depth=3, n_estimators=1000 |
+--------------+---------+-------------+-------------+----------------------------------------------------+
|            5 | TimeGPT | lag         |     3242274 |                                                    |
+--------------+---------+--

# Predict features for each product for test

In [39]:
dfs_original_train[4].head()

Unnamed: 0_level_0,PRO27826_org,RohiENERGY1000_org,PRO27840_org,MAB_ELE_PRO826,RohiNATGAS1000_org,Sales_CPI_€
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-01,118.670791,100.222169,112.853256,113.659322,89.570796,394180.84379
2018-11-01,120.467019,84.436807,113.145294,115.088417,97.362468,365752.5837
2018-12-01,105.378705,74.898746,111.823624,101.556108,94.406578,423649.4456
2019-01-01,107.174933,76.204771,109.499725,101.799754,80.055366,473037.88076
2019-02-01,110.64764,80.086039,110.835655,103.495768,69.545042,608251.1958


In [40]:
# Initialize the remerged_data dictionary
remerged_data = {}

# Concatenate train and test sets for each product and add them to the remerged_data dictionary
remerged_data[1] = pd.concat([dfs_original_train[1].copy(), dfs_original_test[1].copy()], axis=0)
remerged_data[3] = pd.concat([dfs_outlier_train[3].copy(), dfs_outlier_test[3].copy()], axis=0)
remerged_data[4] = pd.concat([dfs_lag_train[4].copy(), dfs_lag_test[4].copy()], axis=0)
remerged_data[5] = pd.concat([dfs_lag_train[5].copy(), dfs_lag_test[5].copy()], axis=0)
remerged_data[6] = pd.concat([dfs_original_train[6].copy(), dfs_original_test[6].copy()], axis=0)
remerged_data[8] = pd.concat([dfs_lag_train[8].copy(), dfs_lag_test[8].copy()], axis=0)
remerged_data[9] = pd.concat([dfs_lag_train[9].copy(), dfs_lag_test[9].copy()], axis=0)
remerged_data[11] = pd.concat([dfs_original_train[11].copy(), dfs_original_test[11].copy()], axis=0)
remerged_data[12] = pd.concat([dfs_lag_train[12].copy(), dfs_lag_test[12].copy()], axis=0)
remerged_data[13] = pd.concat([dfs_original_train[13].copy(), dfs_original_test[13].copy()], axis=0)
remerged_data[14] = pd.concat([dfs_original_train[14].copy(), dfs_original_test[14].copy()], axis=0)
remerged_data[16] = pd.concat([dfs_original_train[16].copy(), dfs_original_test[16].copy()], axis=0)
remerged_data[20] = pd.concat([dfs_original_train[20].copy(), dfs_original_test[20].copy()], axis=0)
remerged_data[36] = pd.concat([dfs_outlier_train[36].copy(), dfs_outlier_test[36].copy()], axis=0)


In [41]:
remerged_data[1].tail(10)

Unnamed: 0_level_0,MAB_ELE_PRO276,PRO27826_org,MAB_ELE_PRO756,MAB_ELE_PRO826,PRO27840_org,Sales_CPI_€
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-07-01,110.680184,122.742241,93.286137,105.426099,108.732765,38392420.0
2021-08-01,100.880844,116.395569,96.965037,99.283383,109.376541,37581700.0
2021-09-01,115.043611,130.525896,100.643938,115.833771,109.221512,37657480.0
2021-10-01,108.420507,117.832552,102.663969,99.135838,113.523598,46395780.0
2021-11-01,115.95156,127.053188,104.683999,109.856972,113.067886,35725830.0
2021-12-01,118.103281,121.065762,106.704029,101.273544,109.624107,35545720.0
2022-01-01,94.55061,112.324119,103.49926,95.003541,111.36467,39915980.0
2022-02-01,103.987916,115.55733,100.294492,98.458412,114.6884,38470660.0
2022-03-01,121.308119,145.254965,97.089723,121.993915,115.164093,43478260.0
2022-04-01,99.522205,114.359844,97.36045,95.266502,112.158089,44665880.0


In [42]:
from scipy.stats import shapiro

# Define a function to check normality using Shapiro-Wilk test for each variable in each DataFrame
def check_normality_for_all_products(remerged_data, best_rmse_values):
    # Initialize a dictionary to store the results
    all_normality_results = {}

    # Get models that are not "timegpt"
    non_timegpt_models = [product_id for product_id, values in best_rmse_values.items() if values['Model'] != 'TimeGPT']

    # Iterate through each product in remerged_data
    for product_id, df in remerged_data.items():
        if product_id in non_timegpt_models:
            normality_results = {}

            # Exclude 'Sales_CPI_€' column from the DataFrame
            df_exog = df.drop(columns=['Sales_CPI_€'])

            # Iterate over each column in the DataFrame
            for column in df_exog.columns:
                # Perform Shapiro-Wilk test
                stat, p_value = shapiro(df_exog[column].dropna()) # Drop NaN values

                # Store the result
                normality_results[column] = {
                    'Test Statistic': stat,
                    'p-value': p_value
                }

            # Store the normality results for the current product
            all_normality_results[product_id] = normality_results

    return all_normality_results

# Check normality for each variable in each product's DataFrame
all_normality_results = check_normality_for_all_products(remerged_data, best_rmse_values)

# Print normality results
for product_id, product_results in all_normality_results.items():
    print(f"Product ID: {product_id}")
    for column, result in product_results.items():
        print(f"    Variable: {column}")
        print(f"    Test Statistic: {result['Test Statistic']}")
        print(f"    p-value: {result['p-value']}")
        print()


Product ID: 1
    Variable: MAB_ELE_PRO276
    Test Statistic: 0.965643048286438
    p-value: 0.2221372276544571

    Variable: PRO27826_org
    Test Statistic: 0.9144018292427063
    p-value: 0.0035255765542387962

    Variable: MAB_ELE_PRO756
    Test Statistic: 0.992559015750885
    p-value: 0.993213951587677

    Variable: MAB_ELE_PRO826
    Test Statistic: 0.9129245281219482
    p-value: 0.003161100437864661

    Variable: PRO27840_org
    Test Statistic: 0.8624692559242249
    p-value: 0.00010922953515546396

Product ID: 3
    Variable: PRO27826_org
    Test Statistic: 0.9144018292427063
    p-value: 0.0035255765542387962

    Variable: MAB_ELE_PRO156
    Test Statistic: 0.9585809707641602
    p-value: 0.12300286442041397

    Variable: RohiNATGAS1000_org
    Test Statistic: 0.7715831995010376
    p-value: 9.260044748771179e-07

    Variable: PRI27826_org
    Test Statistic: 0.9333119988441467
    p-value: 0.015125780366361141

    Variable: PRO27756_org
    Test Statistic: 0.964

In [43]:
from statsmodels.tsa.stattools import adfuller

def check_stationarity_for_all_products(remerged_data, best_rmse_values):
    """
    Check stationarity using ADF test for each variable in each DataFrame, excluding variables associated with TimeGPT model.
    
    Parameters:
        remerged_data (dict): Dictionary containing DataFrames for each product.
        best_rmse_values (dict): Dictionary containing best RMSE values for each product and its associated model.
    
    Returns:
        dict: A dictionary where keys are product IDs and values are dictionaries containing ADF test results for each variable.
    """
    all_stationarity_results = {}

    # Get models that are not "TimeGPT"
    non_timegpt_models = [product_id for product_id, values in best_rmse_values.items() if values['Model'] != 'TimeGPT']

    # Iterate through each product in remerged_data
    for product_id, df in remerged_data.items():
        if product_id in non_timegpt_models:
            stationarity_results = {}

            # Exclude 'Sales_CPI_€' column from the DataFrame
            df_exog = df.drop(columns=['Sales_CPI_€'])

            # Iterate over each column in the DataFrame
            for column in df_exog.columns:
                # Perform ADF test
                result = adfuller(df_exog[column].dropna())  # Drop NaN values
                
                # Extract ADF test results
                adf_statistic = result[0]
                p_value = result[1]
                is_stationary = p_value < 0.05  # True if the variable is stationary
                
                # Store the result
                stationarity_results[column] = {
                    'ADF Statistic': adf_statistic,
                    'p-value': p_value,
                    'Stationary': is_stationary
                }

            # Store the stationarity results for the current product
            all_stationarity_results[product_id] = stationarity_results
    
    return all_stationarity_results

all_stationarity_results = check_stationarity_for_all_products(remerged_data, best_rmse_values)

# Print stationarity results
for product_id, product_results in all_stationarity_results.items():
    print(f"Product ID: {product_id}")
    for column, result in product_results.items():
        print(f"    Variable: {column}")
        print(f"    ADF Statistic: {result['ADF Statistic']}")
        print(f"    p-value: {result['p-value']}")
        print(f"    Stationary: {result['Stationary']}")
        print()

Product ID: 1
    Variable: MAB_ELE_PRO276
    ADF Statistic: -2.747344219173471
    p-value: 0.06619779592967073
    Stationary: False

    Variable: PRO27826_org
    ADF Statistic: -4.853463077754567
    p-value: 4.2926977771570865e-05
    Stationary: True

    Variable: MAB_ELE_PRO756
    ADF Statistic: -2.145974448389574
    p-value: 0.22643414880279056
    Stationary: False

    Variable: MAB_ELE_PRO826
    ADF Statistic: -2.2277679399093704
    p-value: 0.19632207403368973
    Stationary: False

    Variable: PRO27840_org
    ADF Statistic: -2.0725769483770753
    p-value: 0.2556835310614358
    Stationary: False

Product ID: 3
    Variable: PRO27826_org
    ADF Statistic: -4.853463077754567
    p-value: 4.2926977771570865e-05
    Stationary: True

    Variable: MAB_ELE_PRO156
    ADF Statistic: -0.19620286804902076
    p-value: 0.9389727145274538
    Stationary: False

    Variable: RohiNATGAS1000_org
    ADF Statistic: 0.4599340270660138
    p-value: 0.9836030810225652
    Stat

## Predict 

### Product 1

In [44]:
test_1 = pd.DataFrame(index=test[test['Mapped_GCK'] == '1']['Month Year'], columns=remerged_data[1].columns)

In [45]:
test_1

Unnamed: 0_level_0,MAB_ELE_PRO276,PRO27826_org,MAB_ELE_PRO756,MAB_ELE_PRO826,PRO27840_org,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,,,,,,
2022-06-01,,,,,,
2022-07-01,,,,,,
2022-08-01,,,,,,
2022-09-01,,,,,,
2022-10-01,,,,,,
2022-11-01,,,,,,
2022-12-01,,,,,,
2023-01-01,,,,,,
2023-02-01,,,,,,


#### MAB_ELE_PRO276

Has a p-value > 0.05, so it follows a normal distribution

In [46]:
from scipy.stats import norm

In [47]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[1]['MAB_ELE_PRO276'].mean()
std_value = remerged_data[1]['MAB_ELE_PRO276'].std()

#  Get the number of missing values
num_missing = test_1['MAB_ELE_PRO276'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_1[test_1['MAB_ELE_PRO276'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'MAB_ELE_PRO276' column with the predictions
test_1['MAB_ELE_PRO276'].fillna(predictions_series, inplace=True)

#### PRO27826_org

Has a p-value < 0.05, so it does not follow a normal distribution and is stacionary so we are gonna apply Simple Exponencial Smoothing to predict the values

In [48]:
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

# Extract non-missing values
series = remerged_data[1]['PRO27826_org'].dropna()

# Fit the simple exponential smoothing model
model = SimpleExpSmoothing(series)
fitted_model = model.fit()

# Forecast future values
n_periods = test_1['PRO27826_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_1[test_1['PRO27826_org'].isnull()].index

# Replace NaN values in the 'PRO27826_org' column with the forecasted values
test_1.loc[missing_indexes, 'PRO27826_org'] = forecast


  self._init_dates(dates, freq)


#### MAB_ELE_PRO756

Has a p-value > 0.05, so it follows a normal distribution

In [49]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[1]['MAB_ELE_PRO756'].mean()
std_value = remerged_data[1]['MAB_ELE_PRO756'].std()

#  Get the number of missing values
num_missing = test_1['MAB_ELE_PRO756'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_1[test_1['MAB_ELE_PRO756'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'MAB_ELE_PRO756' column with the predictions
test_1['MAB_ELE_PRO756'].fillna(predictions_series, inplace=True)

#### MAB_ELE_PRO826

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [50]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[1]['MAB_ELE_PRO826'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_1['MAB_ELE_PRO826'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_1[test_1['MAB_ELE_PRO826'].isnull()].index

# Replace NaN values in the 'MAB_ELE_PRO826' column with the forecasted values
test_1.loc[missing_indexes, 'MAB_ELE_PRO826'] = forecast


  self._init_dates(dates, freq)


#### PRO27840_org

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [51]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#  Extract non-missing values
series = remerged_data[1]['PRO27840_org'].dropna()

# Fit the exponential smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='multiplicative')
fitted_model = model.fit()

# Forecast future values
n_periods = test_1['PRO27840_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_1[test_1['PRO27840_org'].isnull()].index

# Replace NaN values in the 'PRO27840_org' column with the forecasted values
test_1.loc[missing_indexes, 'PRO27840_org'] = forecast


  self._init_dates(dates, freq)


In [52]:
test_1

Unnamed: 0_level_0,MAB_ELE_PRO276,PRO27826_org,MAB_ELE_PRO756,MAB_ELE_PRO826,PRO27840_org,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,117.602286,118.670726,105.263948,97.98079,112.919318,
2022-06-01,120.2926,118.670726,84.042132,109.658616,113.11048,
2022-07-01,116.126088,118.670726,98.029081,105.869498,112.167111,
2022-08-01,109.638478,118.670726,86.736721,97.97361,113.532076,
2022-09-01,111.056307,118.670726,110.153464,113.899246,112.481588,
2022-10-01,116.416852,118.670726,108.12662,107.709413,116.403894,
2022-11-01,112.829967,118.670726,98.291681,110.977378,116.65317,
2022-12-01,110.486227,118.670726,93.018922,101.840492,114.148308,
2023-01-01,123.828164,118.670726,86.20777,96.865944,114.958416,
2023-02-01,96.984587,118.670726,98.231947,101.037951,116.28118,


### Product 3

In [53]:
test_3 = pd.DataFrame(index=test[test['Mapped_GCK'] == '3']['Month Year'], columns=remerged_data[3].columns)

In [54]:
test_3

Unnamed: 0_level_0,PRO27826_org,MAB_ELE_PRO156,RohiNATGAS1000_org,PRI27826_org,PRO27756_org,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,,,,,,
2022-06-01,,,,,,
2022-07-01,,,,,,
2022-08-01,,,,,,
2022-09-01,,,,,,
2022-10-01,,,,,,
2022-11-01,,,,,,
2022-12-01,,,,,,
2023-01-01,,,,,,
2023-02-01,,,,,,


#### PRO27826_org

In [55]:
test_3['PRO27826_org'] = test_1['PRO27826_org']

#### MAB_ELE_PRO156

Has a p-value > 0.05, so it follows a normal distribution

In [56]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[3]['MAB_ELE_PRO156'].mean()
std_value = remerged_data[3]['MAB_ELE_PRO156'].std()

#  Get the number of missing values
num_missing = test_3['MAB_ELE_PRO156'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_3[test_3['MAB_ELE_PRO156'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'MAB_ELE_PRO156' column with the predictions
test_3['MAB_ELE_PRO156'].fillna(predictions_series, inplace=True)

#### RohiNATGAS1000_org

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [57]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#  Extract non-missing values
series = remerged_data[3]['RohiNATGAS1000_org'].dropna()

# Fit the exponential smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='multiplicative')
fitted_model = model.fit()

# Forecast future values
n_periods = test_3['RohiNATGAS1000_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_3[test_3['RohiNATGAS1000_org'].isnull()].index

# Replace NaN values in the 'RohiNATGAS1000_org' column with the forecasted values
test_3.loc[missing_indexes, 'RohiNATGAS1000_org'] = forecast


  self._init_dates(dates, freq)


#### PRI27826_org

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [58]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#  Extract non-missing values
series = remerged_data[3]['PRI27826_org'].dropna()

# Fit the exponential smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='multiplicative')
fitted_model = model.fit()

# Forecast future values
n_periods = test_3['PRI27826_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_3[test_3['PRI27826_org'].isnull()].index

# Replace NaN values in the 'PRI27826_org' column with the forecasted values
test_3.loc[missing_indexes, 'PRI27826_org'] = forecast


  self._init_dates(dates, freq)


#### PRO27756_org

Has a p-value > 0.05, so it follows a normal distribution

In [59]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[3]['PRO27756_org'].mean()
std_value = remerged_data[3]['PRO27756_org'].std()

#  Get the number of missing values
num_missing = test_3['PRO27756_org'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_3[test_3['PRO27756_org'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'PRO27756_org' column with the predictions
test_3['PRO27756_org'].fillna(predictions_series, inplace=True)

In [60]:
test_3

Unnamed: 0_level_0,PRO27826_org,MAB_ELE_PRO156,RohiNATGAS1000_org,PRI27826_org,PRO27756_org,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,118.670726,249.876055,266.988303,119.865031,107.373152,
2022-06-01,118.670726,263.394005,292.098955,120.661201,95.448685,
2022-07-01,118.670726,231.094229,345.070741,121.179187,85.02855,
2022-08-01,118.670726,279.640592,402.373417,121.447896,113.682701,
2022-09-01,118.670726,263.411072,543.230004,121.262017,103.701788,
2022-10-01,118.670726,279.404055,660.634007,120.83341,90.546631,
2022-11-01,118.670726,304.171919,638.454951,120.78039,105.949542,
2022-12-01,118.670726,291.971129,729.592793,120.951953,84.445655,
2023-01-01,118.670726,260.481567,629.929812,120.967418,102.000027,
2023-02-01,118.670726,221.379915,654.606808,121.08973,101.039066,


### Product 4

In [61]:
test_4 = pd.DataFrame(index=test[test['Mapped_GCK'] == '4']['Month Year'], columns=remerged_data[4].columns)

In [62]:
test_4

Unnamed: 0_level_0,Sales_CPI_€_rolling_mean_3,WKLWEUR840_org,Sales_CPI_€_lag_1,PRO27826_org,RohiNATGAS1000_org,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,,,,,,
2022-06-01,,,,,,
2022-07-01,,,,,,
2022-08-01,,,,,,
2022-09-01,,,,,,
2022-10-01,,,,,,
2022-11-01,,,,,,
2022-12-01,,,,,,
2023-01-01,,,,,,
2023-02-01,,,,,,


#### Sales_CPI_€_rolling_mean_3

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [63]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#  Extract non-missing values
series = remerged_data[4]['Sales_CPI_€_rolling_mean_3'].dropna()

# Fit the exponential smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='multiplicative')
fitted_model = model.fit()

# Forecast future values
n_periods = test_4['Sales_CPI_€_rolling_mean_3'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_4[test_4['Sales_CPI_€_rolling_mean_3'].isnull()].index

# Replace NaN values in the 'Sales_CPI_€_rolling_mean_3' column with the forecasted values
test_4.loc[missing_indexes, 'Sales_CPI_€_rolling_mean_3'] = forecast


  self._init_dates(dates, freq)


#### WKLWEUR840_org

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [64]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#  Extract non-missing values
series = remerged_data[4]['WKLWEUR840_org'].dropna()

# Fit the exponential smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='multiplicative')
fitted_model = model.fit()

# Forecast future values
n_periods = test_4['WKLWEUR840_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_4[test_4['WKLWEUR840_org'].isnull()].index

# Replace NaN values in the 'WKLWEUR840_org' column with the forecasted values
test_4.loc[missing_indexes, 'WKLWEUR840_org'] = forecast


  self._init_dates(dates, freq)


#### Sales_CPI_€_lag_1

Has a p-value < 0.05, so it does not follow a normal distribution and is stacionary so we are gonna apply Simple Exponencial Smoothing to predict the values

In [65]:
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

# Extract non-missing values
series = remerged_data[4]['Sales_CPI_€_lag_1'].dropna()

# Fit the simple exponential smoothing model
model = SimpleExpSmoothing(series)
fitted_model = model.fit()

# Forecast future values
n_periods = test_4['Sales_CPI_€_lag_1'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_4[test_4['Sales_CPI_€_lag_1'].isnull()].index

# Replace NaN values in the 'Sales_CPI_€_lag_1' column with the forecasted values
test_4.loc[missing_indexes, 'Sales_CPI_€_lag_1'] = forecast


  self._init_dates(dates, freq)


#### PRO27826_org

In [66]:
test_4['PRO27826_org'] = test_3['PRO27826_org']

#### RohiNATGAS1000_org

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [67]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#  Extract non-missing values
series = remerged_data[4]['RohiNATGAS1000_org'].dropna()

# Fit the exponential smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='multiplicative')
fitted_model = model.fit()

# Forecast future values
n_periods = test_4['RohiNATGAS1000_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_4[test_4['RohiNATGAS1000_org'].isnull()].index

# Replace NaN values in the 'RohiNATGAS1000_org' column with the forecasted values
test_4.loc[missing_indexes, 'RohiNATGAS1000_org'] = forecast

  self._init_dates(dates, freq)


In [68]:
test_4.dtypes

Sales_CPI_€_rolling_mean_3    object
WKLWEUR840_org                object
Sales_CPI_€_lag_1             object
PRO27826_org                  object
RohiNATGAS1000_org            object
Sales_CPI_€                   object
dtype: object

In [69]:
test_4

Unnamed: 0_level_0,Sales_CPI_€_rolling_mean_3,WKLWEUR840_org,Sales_CPI_€_lag_1,PRO27826_org,RohiNATGAS1000_org,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,335907.773903,1.078756,345999.356954,118.670726,268.771808,
2022-06-01,408108.492055,1.080986,345999.356954,118.670726,296.259275,
2022-07-01,411033.375776,1.069669,345999.356954,118.670726,350.833367,
2022-08-01,346191.424686,1.068663,345999.356954,118.670726,409.139941,
2022-09-01,328025.511084,1.055818,345999.356954,118.670726,553.162939,
2022-10-01,338836.046619,1.043714,345999.356954,118.670726,688.013522,
2022-11-01,376768.09474,1.032455,345999.356954,118.670726,640.143764,
2022-12-01,289693.331625,1.034,345999.356954,118.670726,739.017461,
2023-01-01,286136.982277,1.026457,345999.356954,118.670726,629.926231,
2023-02-01,321608.848767,1.01224,345999.356954,118.670726,659.910012,


### Product 6

In [70]:
test_6 = pd.DataFrame(index=test[test['Mapped_GCK'] == '6']['Month Year'], columns=remerged_data[6].columns)

In [71]:
test_6

Unnamed: 0_level_0,PRO27276_org,MAB_ELE_SHP840,MAB_ELE_PRO276,MAB_ELE_PRO380,MAB_ELE_SHP826,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,,,,,,
2022-06-01,,,,,,
2022-07-01,,,,,,
2022-08-01,,,,,,
2022-09-01,,,,,,
2022-10-01,,,,,,
2022-11-01,,,,,,
2022-12-01,,,,,,
2023-01-01,,,,,,
2023-02-01,,,,,,


#### PRO27276_org

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [72]:
#  Extract non-missing values
series = remerged_data[6]['PRO27276_org'].dropna()

# Fit the exponential smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='multiplicative')
fitted_model = model.fit()

# Forecast future values
n_periods = test_6['PRO27276_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_6[test_6['PRO27276_org'].isnull()].index

# Replace NaN values in the 'PRO27276_org' column with the forecasted values
test_6.loc[missing_indexes, 'PRO27276_org'] = forecast

  self._init_dates(dates, freq)
  return err.T @ err


#### MAB_ELE_SHP840

Has a p-value > 0.05, so it follows a normal distribution

In [73]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[6]['MAB_ELE_SHP840'].mean()
std_value = remerged_data[6]['MAB_ELE_SHP840'].std()

#  Get the number of missing values
num_missing = test_6['MAB_ELE_SHP840'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_6[test_6['MAB_ELE_SHP840'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'MAB_ELE_SHP840' column with the predictions
test_6['MAB_ELE_SHP840'].fillna(predictions_series, inplace=True)

#### MAB_ELE_PRO276

In [74]:
test_6['MAB_ELE_PRO276'] = test_1['MAB_ELE_PRO276']

#### MAB_ELE_PRO380

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [75]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#  Extract non-missing values
series = remerged_data[6]['MAB_ELE_PRO380'].dropna()

# Fit the exponential smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='multiplicative')
fitted_model = model.fit()

# Forecast future values
n_periods = test_6['MAB_ELE_PRO380'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_6[test_6['MAB_ELE_PRO380'].isnull()].index

# Replace NaN values in the 'MAB_ELE_PRO380' column with the forecasted values
test_6.loc[missing_indexes, 'MAB_ELE_PRO380'] = forecast

  self._init_dates(dates, freq)


#### MAB_ELE_SHP826

Has a p-value < 0.05, so it does not follow a normal distribution and is stacionary so we are gonna apply Simple Exponencial Smoothing to predict the values

In [76]:
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

# Extract non-missing values
series = remerged_data[6]['MAB_ELE_SHP826'].dropna()

# Fit the simple exponential smoothing model
model = SimpleExpSmoothing(series)
fitted_model = model.fit()

# Forecast future values
n_periods = test_6['MAB_ELE_SHP826'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_6[test_6['MAB_ELE_SHP826'].isnull()].index

# Replace NaN values in the 'MAB_ELE_SHP826' column with the forecasted values
test_6.loc[missing_indexes, 'MAB_ELE_SHP826'] = forecast


  self._init_dates(dates, freq)


In [77]:
test_6

Unnamed: 0_level_0,PRO27276_org,MAB_ELE_SHP840,MAB_ELE_PRO276,MAB_ELE_PRO380,MAB_ELE_SHP826,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,101.990985,135.021423,117.602286,120.169624,109.680343,
2022-06-01,104.508181,136.111567,120.2926,118.998123,109.680343,
2022-07-01,108.800287,121.875341,116.126088,125.429775,109.680343,
2022-08-01,101.307528,130.72424,109.638478,64.286661,109.680343,
2022-09-01,112.743424,130.196612,111.056307,120.583328,109.680343,
2022-10-01,112.765987,138.504734,116.416852,123.489779,109.680343,
2022-11-01,113.011056,145.02663,112.829967,116.992908,109.680343,
2022-12-01,92.435647,107.480376,110.486227,98.153986,109.680343,
2023-01-01,101.612179,120.257369,123.828164,101.62835,109.680343,
2023-02-01,105.899404,130.623601,96.984587,113.962894,109.680343,


### Product 8

In [78]:
test_8 = pd.DataFrame(index=test[test['Mapped_GCK'] == '8']['Month Year'], columns=remerged_data[8].columns)

In [79]:
test_8

Unnamed: 0_level_0,Sales_CPI_€_rolling_mean_3,Sales_CPI_€_lag_6,WKLWEUR840_org,Sales_CPI_€_lag_3,PRO271000_org,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,,,,,,
2022-06-01,,,,,,
2022-07-01,,,,,,
2022-08-01,,,,,,
2022-09-01,,,,,,
2022-10-01,,,,,,
2022-11-01,,,,,,
2022-12-01,,,,,,
2023-01-01,,,,,,
2023-02-01,,,,,,


#### Sales_CPI_€_rolling_mean_3

In [80]:
test_8['Sales_CPI_€_rolling_mean_3'] = test_4['Sales_CPI_€_rolling_mean_3']

#### Sales_CPI_€_lag_6

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [81]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#  Extract non-missing values
series = remerged_data[8]['Sales_CPI_€_lag_6'].dropna()

# Fit the exponential smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='multiplicative')
fitted_model = model.fit()

# Forecast future values
n_periods = test_8['Sales_CPI_€_lag_6'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_8[test_8['Sales_CPI_€_lag_6'].isnull()].index

# Replace NaN values in the 'Sales_CPI_€_lag_6' column with the forecasted values
test_8.loc[missing_indexes, 'Sales_CPI_€_lag_6'] = forecast

  self._init_dates(dates, freq)
  return err.T @ err


#### WKLWEUR840_org

In [82]:
test_8['WKLWEUR840_org']= test_4['WKLWEUR840_org']

#### Sales_CPI_€_lag_3

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [83]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#  Extract non-missing values
series = remerged_data[8]['Sales_CPI_€_lag_3'].dropna()

# Fit the exponential smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='multiplicative')
fitted_model = model.fit()

# Forecast future values
n_periods = test_8['Sales_CPI_€_lag_3'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_8[test_8['Sales_CPI_€_lag_3'].isnull()].index

# Replace NaN values in the 'Sales_CPI_€_lag_3' column with the forecasted values
test_8.loc[missing_indexes, 'Sales_CPI_€_lag_3'] = forecast

  self._init_dates(dates, freq)


#### PRO271000_org

Has a p-value > 0.05, so it follows a normal distribution

In [84]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[8]['PRO271000_org'].mean()
std_value = remerged_data[8]['PRO271000_org'].std()

#  Get the number of missing values
num_missing = test_8['PRO271000_org'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_8[test_8['PRO271000_org'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'PRO271000_org' column with the predictions
test_8['PRO271000_org'].fillna(predictions_series, inplace=True)

In [85]:
test_8

Unnamed: 0_level_0,Sales_CPI_€_rolling_mean_3,Sales_CPI_€_lag_6,WKLWEUR840_org,Sales_CPI_€_lag_3,PRO271000_org,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,335907.773903,3846001720836854043508281233367909434654253643...,1.078756,2090191.441812,121.611326,
2022-06-01,408108.492055,6726324878095722402774802958121662069486320033...,1.080986,2254740.945709,131.418696,
2022-07-01,411033.375776,5168856161856411592498945013984906419191210967...,1.069669,1540927.718676,130.825563,
2022-08-01,346191.424686,2444192948225237828862370246402072429219907024...,1.068663,2016097.115437,140.669796,
2022-09-01,328025.511084,3949053624485848548468021044636594861706565197...,1.055818,2045379.462427,119.908817,
2022-10-01,338836.046619,2241446741695006730500772314280135146448775669...,1.043714,1885793.879506,110.813092,
2022-11-01,376768.09474,9537181765595268114760994492413111527193132459...,1.032455,2511826.396807,105.823003,
2022-12-01,289693.331625,1316936817734429492113200715204914944196794740...,1.034,4965394.464856,125.015413,
2023-01-01,286136.982277,1561878744974975379084242665779693918416671939...,1.026457,544431.309797,132.357487,
2023-02-01,321608.848767,5948720723738194707677757672210105814956055531...,1.01224,2889135.744967,98.252199,


### Product 9

In [86]:
test_9 = pd.DataFrame(index=test[test['Mapped_GCK'] == '9']['Month Year'], columns=remerged_data[9].columns)

In [87]:
test_9

Unnamed: 0_level_0,Sales_CPI_€_rolling_mean_3,PRO271000_org,PRO27826_org,MAB_ELE_SHP840,MAB_ELE_PRO250,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,,,,,,
2022-06-01,,,,,,
2022-07-01,,,,,,
2022-08-01,,,,,,
2022-09-01,,,,,,
2022-10-01,,,,,,
2022-11-01,,,,,,
2022-12-01,,,,,,
2023-01-01,,,,,,
2023-02-01,,,,,,


#### Sales_CPI_€_rolling_mean_3

In [88]:
test_9['Sales_CPI_€_rolling_mean_3']= test_4['Sales_CPI_€_rolling_mean_3']

#### PRO271000_org

In [89]:
test_9['PRO271000_org'] = test_8['PRO271000_org']

#### PRO27826_org

In [90]:
test_9['PRO27826_org'] = test_3['PRO27826_org']

#### MAB_ELE_SHP840

In [91]:
test_9['MAB_ELE_SHP840'] = test_6['MAB_ELE_SHP840']

#### MAB_ELE_PRO250

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [92]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#  Extract non-missing values
series = remerged_data[9]['MAB_ELE_PRO250'].dropna()

# Fit the exponential smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='multiplicative')
fitted_model = model.fit()

# Forecast future values
n_periods = test_9['MAB_ELE_PRO250'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_9[test_9['MAB_ELE_PRO250'].isnull()].index

# Replace NaN values in the 'MAB_ELE_PRO250' column with the forecasted values
test_9.loc[missing_indexes, 'MAB_ELE_PRO250'] = forecast

  self._init_dates(dates, freq)


In [93]:
test_9

Unnamed: 0_level_0,Sales_CPI_€_rolling_mean_3,PRO271000_org,PRO27826_org,MAB_ELE_SHP840,MAB_ELE_PRO250,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,335907.773903,121.611326,118.670726,135.021423,87.311013,
2022-06-01,408108.492055,131.418696,118.670726,136.111567,105.80699,
2022-07-01,411033.375776,130.825563,118.670726,121.875341,101.931434,
2022-08-01,346191.424686,140.669796,118.670726,130.72424,67.507146,
2022-09-01,328025.511084,119.908817,118.670726,130.196612,107.763417,
2022-10-01,338836.046619,110.813092,118.670726,138.504734,109.150758,
2022-11-01,376768.09474,105.823003,118.670726,145.02663,104.497997,
2022-12-01,289693.331625,125.015413,118.670726,107.480376,107.440707,
2023-01-01,286136.982277,132.357487,118.670726,120.257369,96.131115,
2023-02-01,321608.848767,98.252199,118.670726,130.623601,102.152693,


### Product 11

In [94]:
test_11 = pd.DataFrame(index=test[test['Mapped_GCK'] == '11']['Month Year'], columns=remerged_data[11].columns)

In [95]:
test_11

Unnamed: 0_level_0,RohiNATGAS1000_org,MAB_ELE_SHP840,MAB_ELE_PRO756,PRI27826_org,PRI27380_org,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,,,,,,
2022-06-01,,,,,,
2022-07-01,,,,,,
2022-08-01,,,,,,
2022-09-01,,,,,,
2022-10-01,,,,,,
2022-11-01,,,,,,
2022-12-01,,,,,,
2023-01-01,,,,,,
2023-02-01,,,,,,


#### MAB_ELE_SHP840 

Has a p-value > 0.05, so it follows a normal distribution

In [96]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[11]['MAB_ELE_SHP840'].mean()
std_value = remerged_data[11]['MAB_ELE_SHP840'].std()

#  Get the number of missing values
num_missing = test_11['MAB_ELE_SHP840'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_11[test_11['MAB_ELE_SHP840'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'MAB_ELE_PRO840' column with the predictions
test_11['MAB_ELE_SHP840'].fillna(predictions_series, inplace=True)

#### MAB_ELE_PRO756

Has a p-value > 0.05, so it follows a normal distribution

In [97]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[11]['MAB_ELE_PRO756'].mean()
std_value = remerged_data[11]['MAB_ELE_PRO756'].std()

#  Get the number of missing values
num_missing = test_11['MAB_ELE_PRO756'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_11[test_11['MAB_ELE_PRO756'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'MAB_ELE_PRO840' column with the predictions
test_11['MAB_ELE_PRO756'].fillna(predictions_series, inplace=True)

####  RohiNATGAS1000_org
Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [98]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[11]['RohiNATGAS1000_org'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_11['RohiNATGAS1000_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_11[test_11['RohiNATGAS1000_org'].isnull()].index

# Replace NaN values in the 'RohiNATGAS1000_org' column with the forecasted values
test_11.loc[missing_indexes, 'RohiNATGAS1000_org'] = forecast


  self._init_dates(dates, freq)


####  PRI27826_org
Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [99]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[11]['PRI27826_org'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_11['PRI27826_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_11[test_11['PRI27826_org'].isnull()].index

# Replace NaN values in the 'PRI27826_org' column with the forecasted values
test_11.loc[missing_indexes, 'PRI27826_org'] = forecast


  self._init_dates(dates, freq)


####  PRI27380_org
Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [100]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[11]['PRI27380_org'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_11['PRI27380_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_11[test_11['PRI27380_org'].isnull()].index

# Replace NaN values in the 'PRI27380_org' column with the forecasted values
test_11.loc[missing_indexes, 'PRI27380_org'] = forecast


  self._init_dates(dates, freq)


### Product 12

In [101]:
test_12 = pd.DataFrame(index=test[test['Mapped_GCK'] == '12']['Month Year'], columns=remerged_data[12].columns)

In [102]:
test_12

Unnamed: 0_level_0,Sales_CPI_€_rolling_mean_3,MAB_ELE_PRO250,MAB_ELE_PRO156,Sales_CPI_€_lag_3,MAB_ELE_SHP380,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,,,,,,
2022-06-01,,,,,,
2022-07-01,,,,,,
2022-08-01,,,,,,
2022-09-01,,,,,,
2022-10-01,,,,,,
2022-11-01,,,,,,
2022-12-01,,,,,,
2023-01-01,,,,,,
2023-02-01,,,,,,


#### MAB_ELE_SHP380 

Has a p-value > 0.05, so it follows a normal distribution


In [103]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[12]['MAB_ELE_SHP380'].mean()
std_value = remerged_data[12]['MAB_ELE_SHP380'].std()

#  Get the number of missing values
num_missing = test_12['MAB_ELE_SHP380'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_12[test_12['MAB_ELE_SHP380'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'MAB_ELE_PRO840' column with the predictions
test_12['MAB_ELE_SHP380'].fillna(predictions_series, inplace=True)

#### Sales_CPI_€_lag_3

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values


In [104]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[12]['Sales_CPI_€_lag_3'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_12['Sales_CPI_€_lag_3'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_12[test_12['Sales_CPI_€_lag_3'].isnull()].index

# Replace NaN values in the 'MAB_ELE_PRO826' column with the forecasted values
test_12.loc[missing_indexes, 'Sales_CPI_€_lag_3'] = forecast


  self._init_dates(dates, freq)


#### Sales_CPI_€_rolling_mean_3

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [105]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[12]['Sales_CPI_€_rolling_mean_3'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_12['Sales_CPI_€_rolling_mean_3'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_12[test_12['Sales_CPI_€_rolling_mean_3'].isnull()].index

# Replace NaN values in the 'Sales_CPI_€_rolling_mean_3' column with the forecasted values
test_12.loc[missing_indexes, 'Sales_CPI_€_rolling_mean_3'] = forecast


  self._init_dates(dates, freq)


#### MAB_ELE_PRO250 

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values


In [106]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[12]['MAB_ELE_PRO250'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_12['MAB_ELE_PRO250'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_12[test_12['MAB_ELE_PRO250'].isnull()].index

# Replace NaN values in the 'MAB_ELE_PRO250' column with the forecasted values
test_12.loc[missing_indexes, 'MAB_ELE_PRO250'] = forecast


  self._init_dates(dates, freq)


#### MAB_ELE_PRO156 

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [107]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[12]['MAB_ELE_PRO156'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_12['MAB_ELE_PRO156'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_12[test_12['MAB_ELE_PRO156'].isnull()].index

# Replace NaN values in the 'MAB_ELE_PRO156' column with the forecasted values
test_12.loc[missing_indexes, 'MAB_ELE_PRO156'] = forecast


  self._init_dates(dates, freq)


### Product 13

In [108]:
test_13 = pd.DataFrame(index=test[test['Mapped_GCK'] == '13']['Month Year'], columns=remerged_data[13].columns)

In [109]:
test_13

Unnamed: 0_level_0,PRI27840_org,PRO27756_org,MAB_ELE_PRO826,MAB_ELE_PRO276,RohiENERGY1000_org,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,,,,,,
2022-06-01,,,,,,
2022-07-01,,,,,,
2022-08-01,,,,,,
2022-09-01,,,,,,
2022-10-01,,,,,,
2022-11-01,,,,,,
2022-12-01,,,,,,
2023-01-01,,,,,,
2023-02-01,,,,,,


#### PRO27756_org 
Has a p-value > 0.05, so it follows a normal distribution


In [110]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[13]['PRO27756_org'].mean()
std_value = remerged_data[13]['PRO27756_org'].std()

#  Get the number of missing values
num_missing = test_13['PRO27756_org'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_13[test_13['PRO27756_org'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'MAB_ELE_PRO840' column with the predictions
test_13['PRO27756_org'].fillna(predictions_series, inplace=True)

#### MAB_ELE_PRO276

Has a p-value > 0.05, so it follows a normal distribution


In [111]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[13]['MAB_ELE_PRO276'].mean()
std_value = remerged_data[13]['MAB_ELE_PRO276'].std()

#  Get the number of missing values
num_missing = test_13['MAB_ELE_PRO276'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_13[test_13['MAB_ELE_PRO276'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'MAB_ELE_PRO840' column with the predictions
test_13['MAB_ELE_PRO276'].fillna(predictions_series, inplace=True)

#### PRI27840_org

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values


In [112]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[13]['PRI27840_org'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_13['PRI27840_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_13[test_13['PRI27840_org'].isnull()].index

# Replace NaN values in the 'PRI27840_org' column with the forecasted values
test_13.loc[missing_indexes, 'PRI27840_org'] = forecast


  self._init_dates(dates, freq)


#### MAB_ELE_PRO826

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [113]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[13]['MAB_ELE_PRO826'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_13['MAB_ELE_PRO826'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_13[test_13['MAB_ELE_PRO826'].isnull()].index

# Replace NaN values in the 'MAB_ELE_PRO826' column with the forecasted values
test_13.loc[missing_indexes, 'MAB_ELE_PRO826'] = forecast


  self._init_dates(dates, freq)


#### RohiENERGY1000_org

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [114]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[13]['RohiENERGY1000_org'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_13['RohiENERGY1000_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_13[test_13['RohiENERGY1000_org'].isnull()].index

# Replace NaN values in the 'RohiENERGY1000_org' column with the forecasted values
test_13.loc[missing_indexes, 'RohiENERGY1000_org'] = forecast


  self._init_dates(dates, freq)


### Product 36

In [115]:
test_36 = pd.DataFrame(index=test[test['Mapped_GCK'] == '36']['Month Year'], columns=remerged_data[36].columns)

In [116]:
test_36

Unnamed: 0_level_0,MAB_ELE_PRO840,RohiNATGAS1000_org,PRO27826_org,RohiBASEMET1000_org,MAB_ELE_PRO756,Sales_CPI_€
Month Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,,,,,,
2022-06-01,,,,,,
2022-07-01,,,,,,
2022-08-01,,,,,,
2022-09-01,,,,,,
2022-10-01,,,,,,
2022-11-01,,,,,,
2022-12-01,,,,,,
2023-01-01,,,,,,
2023-02-01,,,,,,


#### MAB_ELE_PRO840

Has a p-value > 0.05, so it follows a normal distribution

In [117]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[36]['MAB_ELE_PRO840'].mean()
std_value = remerged_data[36]['MAB_ELE_PRO840'].std()

#  Get the number of missing values
num_missing = test_36['MAB_ELE_PRO840'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_36[test_36['MAB_ELE_PRO840'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'MAB_ELE_PRO840' column with the predictions
test_36['MAB_ELE_PRO840'].fillna(predictions_series, inplace=True)

#### MAB_ELE_PRO756

Has a p-value > 0.05, so it follows a normal distribution


In [118]:
#  Calculate mean and standard deviation of the non-missing values
mean_value = remerged_data[36]['MAB_ELE_PRO756'].mean()
std_value = remerged_data[36]['MAB_ELE_PRO756'].std()

#  Get the number of missing values
num_missing = test_36['MAB_ELE_PRO756'].isnull().sum()

# Generate a sequence of random values from a normal distribution
predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)

# Get the indices of the missing values
missing_indexes = test_36[test_36['MAB_ELE_PRO756'].isnull()].index

# Create a pd.Series with the predictions for the missing indexes
predictions_series = pd.Series(predictions, index=missing_indexes)

# Fill the missing values in the 'MAB_ELE_PRO840' column with the predictions
test_36['MAB_ELE_PRO756'].fillna(predictions_series, inplace=True)

#### RohiNATGAS1000_org

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [119]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[36]['RohiNATGAS1000_org'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_36['RohiNATGAS1000_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_36[test_36['RohiNATGAS1000_org'].isnull()].index

# Replace NaN values in the 'RohiNATGAS1000_org' column with the forecasted values
test_36.loc[missing_indexes, 'RohiNATGAS1000_org'] = forecast


  self._init_dates(dates, freq)


#### RohiBASEMET1000_org

Has a p-value < 0.05, so it does not follow a normal distribution and is non stacionary so we are gonna apply Holt Winters to predict the values

In [120]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Extract non-missing values
series = remerged_data[36]['RohiBASEMET1000_org'].dropna()

# Fit the Holt-Winters Exponential Smoothing model
model = ExponentialSmoothing(series, seasonal='multiplicative', trend='additive', seasonal_periods=12)
fitted_model = model.fit()

# Forecast future values
n_periods = test_36['RohiBASEMET1000_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_36[test_36['RohiBASEMET1000_org'].isnull()].index

# Replace NaN values in the 'RohiNATGAS1000_org' column with the forecasted values
test_36.loc[missing_indexes, 'RohiBASEMET1000_org'] = forecast


  self._init_dates(dates, freq)


#### PRO27826_org

Has a p-value < 0.05, so it does not follow a normal distribution and is stacionary so we are gonna apply Simple Exponencial Smoothing to predict the values


In [121]:
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

# Extract non-missing values
series = remerged_data[36]['PRO27826_org'].dropna()

# Fit the simple exponential smoothing model
model = SimpleExpSmoothing(series)
fitted_model = model.fit()

# Forecast future values
n_periods = test_36['PRO27826_org'].isnull().sum()
forecast = fitted_model.forecast(steps=n_periods)

# Get the indices of the missing values
missing_indexes = test_36[test_36['PRO27826_org'].isnull()].index

# Replace NaN values in the 'PRO27826_org' column with the forecasted values
test_36.loc[missing_indexes, 'PRO27826_org'] = forecast


  self._init_dates(dates, freq)


In [122]:

# Define output directory for test DataFrames
test_output_dir = 'products_test'

# Create the directory if it doesn't exist
os.makedirs(test_output_dir, exist_ok=True)

# Assuming you have your test DataFrames created and named as follows:
test_dataframes = {
    'test_1': test_1,   # Replace pd.DataFrame() with your actual DataFrame
    'test_3': test_3,
    'test_4': test_4,
    'test_5': remerged_data[5],
    'test_6': test_6,
    'test_8': test_8,
    'test_9': test_9,
    'test_11': test_11,
    'test_12': test_12,
    'test_13': test_13,
    'test_14': remerged_data[14],
    'test_16': remerged_data[16],
    'test_20': remerged_data[20],
    'test_36': test_36
}

# Iterate over the dictionary and save each DataFrame to a CSV file in the specified directory
for test_name, df in test_dataframes.items():
    file_path = os.path.join(test_output_dir, f'{test_name}.csv')
    df.to_csv(file_path, index=False)  # Set index=False if you don't want to save DataFrame index in the CSV

print("CSV files have been created in the directory:", test_output_dir)





CSV files have been created in the directory: products_test
