In [1]:
import pandas as pd
from scipy.stats import ttest_ind, f_oneway
from sklearn.preprocessing import LabelEncoder

# Load the raw sales table; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/raw/sales_data.csv")

# errors="coerce" converts unparseable date strings to NaT instead of raising.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

# Surface missing/coerced dates explicitly: the later groupby("Date") drops NaT
# keys by default, so such rows would otherwise disappear from the daily
# aggregate without any visible signal.
n_missing_dates = int(df["Date"].isna().sum())
if n_missing_dates:
    print(
        f"Warning: {n_missing_dates} of {len(df)} rows have a missing or "
        "unparseable Date and will be excluded from date-based aggregations"
    )

# Column used as the dependent variable in the tests and aggregation below.
TARGET_COL = "Units Sold"

In [2]:
# Split the target by the Promotion flag and compare the two samples with a
# two-sample t-test that does not assume equal variances (equal_var=False).
promo_mask = df["Promotion"] == 1
promo = df.loc[promo_mask, TARGET_COL]
non_promo = df.loc[df["Promotion"] == 0, TARGET_COL]

promo_test = ttest_ind(promo, non_promo, equal_var=False)
t_stat, p_value = promo_test.statistic, promo_test.pvalue
t_stat, p_value

(np.float64(59.41426736156105), np.float64(0.0))

In [3]:
# Same comparison as for promotions, this time splitting the target on the
# Epidemic flag; equal_var=False again selects the unequal-variance t-test.
epidemic_flag = df["Epidemic"]
epi = df.loc[epidemic_flag.eq(1), TARGET_COL]
normal = df.loc[epidemic_flag.eq(0), TARGET_COL]

t_stat, p_value = ttest_ind(epi, normal, equal_var=False)
t_stat, p_value

(np.float64(-101.9186722582504), np.float64(0.0))

In [4]:
# One-way ANOVA of the target across the levels of "Category".
#
# A single groupby pass replaces one boolean scan of the frame per category.
# sort=False keeps the groups in first-appearance order (the same order
# unique() produced), and rows with a missing Category are dropped instead of
# yielding an empty group, which would make f_oneway return NaN.
groups = [units for _, units in df.groupby("Category", sort=False)[TARGET_COL]]
f_stat, p_value = f_oneway(*groups)
f_stat, p_value

(np.float64(2325.192824806394), np.float64(0.0))

In [5]:
# Columns that are converted to integer label codes below.
categorical_features = [
    "Category", "Region", "Weather Condition", "Seasonality"
]

# Work on a copy so the raw frame `df` keeps its original values.
df_fe = df.copy()

# Keep every fitted encoder, keyed by column name, so the integer codes can be
# mapped back to the original labels later (label_encoders[col].classes_ or
# .inverse_transform). Previously each encoder was overwritten on the next
# iteration and the code -> label mapping was lost.
label_encoders = {}

for col in categorical_features:
    le = LabelEncoder()
    df_fe[col] = le.fit_transform(df_fe[col])
    label_encoders[col] = le

# NOTE(review): label codes are arbitrary integers assigned per distinct value;
# the daily aggregation further down averages these columns, and a mean of
# codes has no natural interpretation for nominal categories. Consider one-hot
# encoding if downstream consumers can accept different column names.

In [6]:
# Non-categorical columns that are carried into the daily aggregate together
# with the target column and the encoded categorical columns.
features = [
    "Inventory Level",
    "Units Ordered",
    "Price",
    "Discount",
    "Promotion",
    "Competitor Pricing",
    "Epidemic",
    "Demand",
]

In [7]:
# Collapse the row-level frame to one row per calendar date by averaging every
# selected column.
#
# The target is referenced through TARGET_COL (instead of a second hard-coded
# "Units Sold") so the aggregate cannot drift from the column used in the
# hypothesis tests above.
#
# Caveats:
# - groupby drops rows whose Date is NaT (pandas default dropna=True).
# - The categorical columns hold integer label codes at this point, so their
#   daily value is a mean of codes, not a category.
daily_columns = [TARGET_COL] + features + categorical_features

daily_df = (
    df_fe
    .groupby("Date")[daily_columns]
    .mean()
    .sort_index()
)

daily_df.head()

Unnamed: 0_level_0,Units Sold,Inventory Level,Units Ordered,Price,Discount,Promotion,Competitor Pricing,Epidemic,Demand,Category,Region,Weather Condition,Seasonality
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-01-01,86.01,193.83,206.35,68.669,6.45,0.15,70.85,0.0,100.6,2.24,1.4,1.8,3.0
2022-01-02,76.84,123.85,188.01,70.3806,7.35,0.2,72.4986,0.0,108.14,2.24,1.4,1.4,3.0
2022-01-03,59.65,103.56,97.22,69.791,9.15,0.35,72.1262,0.0,113.17,2.24,1.4,1.4,3.0
2022-01-04,78.59,180.71,97.89,69.8443,10.3,0.45,72.0545,0.0,114.69,2.24,1.4,1.4,3.0
2022-01-05,96.21,271.52,73.92,66.819,14.35,0.7,68.6097,0.0,117.24,2.24,1.4,1.8,3.0


In [8]:
# The previous cell already builds `daily_df`; recomputing the identical
# aggregate here was redundant. Instead, verify the invariants the saved file
# relies on before it is written to disk.
assert not daily_df.empty, "daily_df is empty - check Date parsing and the groupby"
assert daily_df.index.is_unique, "expected exactly one row per Date"
assert daily_df.index.is_monotonic_increasing, "daily_df must be sorted by Date"

# Show the size of the aggregate (days x columns) as the cell output.
daily_df.shape

In [9]:
import os

# Create the output folder if it is missing (no error when it already exists),
# then write the daily aggregate; to_csv keeps the Date index as the first
# column by default.
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)

output_path = "../data/processed/daily_features.csv"
daily_df.to_csv(output_path)

print("File daily_features.csv berhasil disimpan")

File daily_features.csv berhasil disimpan


In [10]:
import os

# os.listdir returns directory entries in arbitrary order; sort them so the
# displayed listing is identical on every run and on every machine.
sorted(os.listdir("../data/processed"))

['daily_features.csv']

## Feature Engineering Summary

- Fitur dipilih berdasarkan hasil EDA dan hypothesis testing.
- Seluruh variabel dikonversi ke format numerik dan diagregasi
  pada level harian untuk kebutuhan forecasting.