In [19]:
import pandas as pd
import numpy as np

# Load dataset
data_path = '../../data/processed/Retail_Sales_Data_with_Products.csv'
df = pd.read_csv(data_path)

# Filter data for Retailer1 only
df_retailer1 = df[df['retailer'] == 'retail1'].copy()
print("Data Sample:")
print(df_retailer1.head())
print("\nData Types:")
print(df_retailer1.dtypes)


Data Sample:
  retailer  store_id  week                   product  \
0  retail1         1     1       MintyFresh Mint 18g   
1  retail1         1     1    ChocoDelight Dark 200g   
2  retail1         1     1   ChocoDelight White 350g   
3  retail1         1     1  NuttyCream Hazelnuts 80g   
4  retail1         1     1        DarkDream Dark 60g   

                                         description  regular_price  \
0                    MintyFresh refresh mint 18grams      10.308645   
1                           200g ChocoDelight smooth       7.947956   
2                        350g ChocoDelight delicious      13.439368   
3                delightful Hazelnuts 80g NuttyCream       5.112777   
4  A decadent DarkDream dark chocolate ice cream ...      14.085425   

   competition_1_regular_price  competition_2_regular_price  \
0                    10.624815                    10.013451   
1                     7.748726                     7.589198   
2                    13.751508    

In [20]:
# Fill NaN promo prices with 0
promo_columns = [col for col in df_retailer1.columns if 'promo_price' in col]
df_retailer1[promo_columns] = df_retailer1[promo_columns].fillna(0)

# Drop non-numeric and irrelevant columns
df_retailer1 = df_retailer1.drop(columns=['retailer', 'product', 'description'])
print("\nRemaining Columns:")
print(df_retailer1.dtypes)



Remaining Columns:
store_id                          int64
week                              int64
regular_price                   float64
competition_1_regular_price     float64
competition_2_regular_price     float64
competition_3_regular_price     float64
competition_4_regular_price     float64
competition_5_regular_price     float64
competition_6_regular_price     float64
competition_7_regular_price     float64
competition_8_regular_price     float64
competition_9_regular_price     float64
competition_10_regular_price    float64
promo_price                     float64
competition_1_promo_price       float64
competition_2_promo_price       float64
competition_3_promo_price       float64
competition_4_promo_price       float64
competition_5_promo_price       float64
competition_6_promo_price       float64
competition_7_promo_price       float64
competition_8_promo_price       float64
competition_9_promo_price       float64
competition_10_promo_price      float64
value               

In [21]:
# Calculate average competition price
comp_price_cols = [col for col in df_retailer1.columns if 'competition_' in col and 'regular_price' in col]
df_retailer1['avg_competition_price'] = df_retailer1[comp_price_cols].mean(axis=1)
df_retailer1['price_difference'] = df_retailer1['regular_price'] - df_retailer1['avg_competition_price']

# Seasonality indicators
df_retailer1['sin_week'] = np.sin(2 * np.pi * df_retailer1['week'] / max(df_retailer1['week']))
df_retailer1['cos_week'] = np.cos(2 * np.pi * df_retailer1['week'] / max(df_retailer1['week']))

# Interaction term between price difference and promo price
df_retailer1['price_promo_interaction'] = df_retailer1['price_difference'] * df_retailer1['promo_price']


In [22]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df_retailer1.drop(columns=['quantity'])
y = df_retailer1['quantity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Create pipeline with imputation and Gradient Boosting Regressor
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('gbr', GradientBoostingRegressor(random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Baseline Model - Mean Squared Error (MSE): {mse}")
print(f"Baseline Model - Mean Absolute Error (MAE): {mae}")
print(f"Baseline Model - R-squared (R²): {r2}")


ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'DarkDream'