In [1]:
import pandas as pd

In [87]:
# Load one store's beer SKU file for a quick visual check of the column layout.
# The path is project-relative (same `./ref_impl/...` convention as the main
# data-loading cell) instead of an absolute path inside one user's home directory,
# so the cell can run on other machines.
# NOTE(review): assumes the notebook is started from the `ai-planning` project root - confirm.
beer = pd.read_csv("./ref_impl/data/raw/SKU_data_store_236117_beer.csv")
beer

Unnamed: 0,236117_0_1_18200_106-UNITS,236117_0_1_18200_11047-UNITS,236117_0_1_18200_15168-UNITS,236117_0_1_18200_16-UNITS,236117_0_1_18200_466-UNITS,236117_0_1_18200_468-UNITS,236117_0_1_18200_53047-UNITS,236117_0_1_18200_769-UNITS,236117_0_1_18200_771-UNITS,236117_0_1_18200_784-UNITS,...,236117_0_2_80660_95785-F4,236117_0_2_83820_12393-F4,236117_6_1_31030_11-F4,236117_6_1_31030_22-F4,236117_6_1_31030_44-F4,236117_6_1_35985_10062-F4,236117_7_1_23830_9-F4,236117_7_1_2770_1-F4,236117_7_1_62274_81393-F4,236117_7_1_86150_1-F4
0,3.0,20.0,2.000000,5.000000,5.000000,5.000000,11.000000,10.000000,12.000000,7.0,...,0,0,0,0,0,0,0,0,0,0
1,4.0,4.0,2.665689,1.000000,5.991525,5.000000,4.000000,3.000000,7.029851,4.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,1.0,1.000000,6.000000,3.000000,1.000000,3.000000,1.000000,7.000000,5.0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,10.0,2.665689,5.351955,3.000000,2.000000,11.000000,7.000000,6.000000,5.0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,15.0,1.000000,2.000000,5.000000,2.000000,6.000000,8.000000,8.000000,9.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,4.0,2.0,1.000000,4.000000,5.991525,4.167192,2.000000,1.000000,2.000000,7.0,...,0,0,0,0,0,0,0,0,0,0
361,2.0,2.0,2.000000,2.000000,10.000000,1.000000,2.000000,5.000000,3.000000,1.0,...,0,0,0,0,0,0,0,0,0,0
362,1.0,3.0,2.000000,3.000000,4.000000,10.000000,2.000000,6.654286,7.029851,5.0,...,0,0,0,0,0,0,0,0,0,0
363,7.0,3.0,1.000000,1.000000,1.000000,2.000000,2.000000,1.000000,2.000000,4.0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score


#Data preprocessing 
def prepare_and_engineer_features(df):
    """Forward-fill the frame and add lag-1 and promotional-index features.

    Steps:
      1. Missing values are forward-filled (leading NaNs, if any, remain).
      2. For every column whose name contains 'PRICE', '-D' or '-F', a
         ``<column>_lag1`` column holding the previous row's value is added,
         so the previous time point can be used as a predictor.
      3. ``promotional_index`` is the row-wise sum of the current-period
         '-D' / '-F' columns (treated here as promotion indicators), giving
         one consolidated measure of promotional activity.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with string column names. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame: the forward-filled original columns, then the ``_lag1``
        columns, then ``promotional_index``.
    """
    df_filled = df.ffill()

    lag_source_cols = [
        col for col in df.columns
        if 'PRICE' in col or '-D' in col or '-F' in col
    ]
    promo_cols = [col for col in df.columns if '-D' in col or '-F' in col]

    # Shift all source columns at once. The first row has no predecessor and
    # is filled with 0.
    # NOTE(review): 0 is not a meaningful previous *price*; consider dropping
    # the first row before modelling instead.
    lagged_df = df_filled[lag_source_cols].shift(1).fillna(0).add_suffix('_lag1')

    # Sum only the original promo columns. Selecting by pattern after the lag
    # columns are attached would also match names such as '...-F4_lag1' and
    # double count promotions with their own lagged values.
    promotional_index = df_filled[promo_cols].sum(axis=1)

    df_features = pd.concat([df_filled, lagged_df], axis=1)
    df_features['promotional_index'] = promotional_index

    return df_features

def load_data(file_path):
    """Read a SKU CSV and append the per-row average of its unit columns.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus an ``avg_units_sold`` column: the row-wise mean
        over every column whose name contains 'UNITS'.
    """
    frame = pd.read_csv(file_path)
    unit_columns = list(filter(lambda name: 'UNITS' in name, frame.columns))
    # Target variable: mean units across all SKUs in the row, used as a proxy
    # for demand (assumes demand == units sold; the author flagged this as an
    # open modelling assumption).
    frame['avg_units_sold'] = frame.loc[:, unit_columns].mean(axis=1)
    return frame

# Model 
def model_analysis(df, target_variable, feature_columns,
                   test_size=0.2, random_state=42, shuffle=True):
    """Fit an OLS regression on a train split and report hold-out metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target and feature columns.
    target_variable : str
        Name of the column to predict.
    feature_columns : list of str
        Names of the predictor columns; an intercept term is added to them.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed for the shuffled split (ignored when ``shuffle`` is False).
    shuffle : bool, default True
        Whether rows are shuffled before splitting. The default keeps the
        original behaviour. When the features are lagged values of a
        time-ordered frame, ``shuffle=False`` gives a chronological hold-out
        (the last ``test_size`` fraction of rows) instead.

    Returns
    -------
    None
        Prints the statsmodels summary of the fitted model, followed by the
        mean absolute error and R-squared computed on the held-out rows.
    """
    X = sm.add_constant(df[feature_columns])
    y = df[target_variable]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle
    )

    # Ordinary least squares, fitted on the training rows only.
    model = sm.OLS(y_train, X_train).fit()
    y_pred = model.predict(X_test)

    # Both metrics below are out-of-sample (test rows); the R-squared inside
    # model.summary() is the in-sample value for the training rows.
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(model.summary())
    print(f"Mean Absolute Error: {mae}")
    print(f"R-squared: {r2}")


# Data loading and execution
# One pipeline (load -> prepare -> pick features) is applied to every product
# category instead of being copy-pasted per category.
# NOTE(review): the directory name ("indense") is kept exactly as written in
# the original paths; confirm it matches the folder on disk before renaming.
RAW_DATA_DIR = './ref_impl/demand model with promotion indense index/data/raw'
STORE_ID = '657979'
CATEGORIES = ('beer', 'milk', 'mayo', 'yogurt')

# assuming "Units" columns as the target variable
target_variable = 'avg_units_sold'


def select_feature_columns(prepared_df, use_lags=True, use_promotional_index=False):
    """Pick the predictor columns for one feature-set experiment.

    The three experiments tried in this notebook map to:
      * lag features only (current default):  use_lags=True,  use_promotional_index=False
      * lag features + promotional index:     use_lags=True,  use_promotional_index=True
      * promotional index only:               use_lags=False, use_promotional_index=True

    Columns are returned in the frame's own column order.
    """
    return [
        col for col in prepared_df.columns
        if (use_lags and 'lag' in col)
        or (use_promotional_index and col == 'promotional_index')
    ]


raw_frames = {
    category: load_data(f'{RAW_DATA_DIR}/SKU_data_store_{STORE_ID}_{category}.csv')
    for category in CATEGORIES
}
prepared_frames = {
    category: prepare_and_engineer_features(frame)
    for category, frame in raw_frames.items()
}
# EXPERIMENTS WITH FEATURE COLUMNS: change the keyword arguments here to switch
# feature set for all categories at once.
feature_columns = {
    category: select_feature_columns(frame, use_lags=True, use_promotional_index=False)
    for category, frame in prepared_frames.items()
}

# Per-category names used by the analysis cells below.
beer_df, milk_df, mayo_df, yogurt_df = (raw_frames[c] for c in CATEGORIES)
beer_prepared, milk_prepared, mayo_prepared, yogurt_prepared = (
    prepared_frames[c] for c in CATEGORIES
)
(feature_columns_beer, feature_columns_milk,
 feature_columns_mayo, feature_columns_yogurt) = (
    feature_columns[c] for c in CATEGORIES
)

# NOTE: this model might be overfitting - with lag features only, the beer fit
# reports 164 regressors for 292 training observations.

In [80]:
# Perform model analysis
# Beer: fit OLS on the selected beer feature columns. Prints the statsmodels
# summary, then MAE and R-squared computed on the held-out split.
print("Beer Dataset:")
model_analysis(beer_prepared, target_variable, feature_columns_beer)

Beer Dataset:
                            OLS Regression Results                            
Dep. Variable:         avg_units_sold   R-squared:                       0.736
Model:                            OLS   Adj. R-squared:                  0.396
Method:                 Least Squares   F-statistic:                     2.164
Date:                Fri, 01 Mar 2024   Prob (F-statistic):           3.80e-06
Time:                        05:23:25   Log-Likelihood:                -398.80
No. Observations:                 292   AIC:                             1128.
Df Residuals:                     127   BIC:                             1734.
Df Model:                         164                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------

In [83]:
# Milk: fit OLS on the selected milk feature columns. Prints the statsmodels
# summary, then MAE and R-squared computed on the held-out split.
print('Milk analysis')
model_analysis(milk_prepared, target_variable, feature_columns_milk)

Milk analysis
                            OLS Regression Results                            
Dep. Variable:         avg_units_sold   R-squared:                       0.406
Model:                            OLS   Adj. R-squared:                  0.327
Method:                 Least Squares   F-statistic:                     5.168
Date:                Fri, 01 Mar 2024   Prob (F-statistic):           4.08e-15
Time:                        05:24:34   Log-Likelihood:                -1096.3
No. Observations:                 292   AIC:                             2263.
Df Residuals:                     257   BIC:                             2391.
Df Model:                          34                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------

In [84]:
# Mayo: fit OLS on the selected mayo feature columns. Prints the statsmodels
# summary, then MAE and R-squared computed on the held-out split.
print("Mayo Dataset:")
model_analysis(mayo_prepared, target_variable, feature_columns_mayo)

Mayo Dataset:
                            OLS Regression Results                            
Dep. Variable:         avg_units_sold   R-squared:                       0.252
Model:                            OLS   Adj. R-squared:                  0.193
Method:                 Least Squares   F-statistic:                     4.324
Date:                Fri, 01 Mar 2024   Prob (F-statistic):           5.62e-09
Time:                        05:24:46   Log-Likelihood:                -550.21
No. Observations:                 292   AIC:                             1144.
Df Residuals:                     270   BIC:                             1225.
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------

In [85]:
# Yogurt: fit OLS on the selected yogurt feature columns. Prints the statsmodels
# summary, then MAE and R-squared computed on the held-out split.
print('Yogurt dataset:')
model_analysis(yogurt_prepared, target_variable, feature_columns_yogurt)

Yogurt dataset:
                            OLS Regression Results                            
Dep. Variable:         avg_units_sold   R-squared:                       0.417
Model:                            OLS   Adj. R-squared:                  0.243
Method:                 Least Squares   F-statistic:                     2.393
Date:                Fri, 01 Mar 2024   Prob (F-statistic):           9.32e-07
Time:                        05:25:26   Log-Likelihood:                -963.18
No. Observations:                 292   AIC:                             2062.
Df Residuals:                     224   BIC:                             2312.
Df Model:                          67                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------