游릱 Load Libraries & Cleaned Dataset

In [1]:
import pandas as pd
import numpy as np

# Read the cleaned dataset. The five order-lifecycle timestamp columns are
# parsed as datetimes at load time so they can be used with the `.dt`
# accessor and subtracted from one another.
df = pd.read_csv(
    filepath_or_buffer="../Data/cleaned_dataset.csv",
    parse_dates=[
        "order_purchase_timestamp",
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ],
)


游릴 1. Date Features (Year, Month, Weekday)

In [3]:
# Calendar features derived from the purchase timestamp:
# the year, the month number, and the weekday name (via `day_name()`).
purchase_ts = df["order_purchase_timestamp"]
df["order_year"] = purchase_ts.dt.year
df["order_month"] = purchase_ts.dt.month
df["order_weekday"] = purchase_ts.dt.day_name()


游릴 2. Delivery Time (in Days)

In [4]:
# Elapsed time from purchase to delivery at the customer, reduced to the
# whole-day component of the timedelta (`.dt.days` does not round up
# partial days).
df["delivery_time_days"] = df["order_delivered_customer_date"].sub(
    df["order_purchase_timestamp"]
).dt.days


游릴 3. Delivery Delay (Actual vs Estimated)

In [5]:
# Actual delivery date minus estimated delivery date, as the day component
# of the timedelta. Negative values mean the order arrived before the
# estimate; positive values mean it arrived at least one full day after it.
df["delay_days"] = df["order_delivered_customer_date"].sub(
    df["order_estimated_delivery_date"]
).dt.days


游릴 4. Product Volume (cm췁)

In [6]:
# Box volume of the product: length x height x width, element-wise per row.
# A missing value in any of the three dimensions yields a missing volume.
df["product_volume_cm3"] = (
    df["product_length_cm"]
    .mul(df["product_height_cm"])
    .mul(df["product_width_cm"])
)


游릴 5. Product Density

In [7]:
# Product density: weight divided by volume (g per cm^3, going by the
# column names).
# A volume of exactly 0 would make the division return +/-inf, which then
# gets written to the output CSV and distorts any later mean/scale/model
# step. Mask zero volumes to NaN first so those rows get a missing density
# instead of an infinite one. Non-zero and already-missing volumes are
# left untouched.
df["product_density"] = df["product_weight_g"] / df["product_volume_cm3"].where(
    df["product_volume_cm3"] != 0
)


游릴 6. Customer Order Count

In [8]:
# Number of DISTINCT orders placed by each customer, broadcast back onto
# every row belonging to that customer.
# "nunique" is used rather than "count": "count" tallies rows, so a single
# order that occupies several rows (e.g. one row per purchased item) would
# be counted several times and make a one-time buyer look like a repeat
# customer.
# NOTE(review): if the table is guaranteed to hold exactly one row per
# order_id, both aggregations give the same result.
df["customer_order_count"] = (
    df.groupby("customer_unique_id")["order_id"].transform("nunique")
)


游릴 7. Customer Total Spent

In [9]:
# Sum of `price` over all rows that share a customer_unique_id, repeated on
# each of that customer's rows (transform keeps the original row count).
df["customer_total_spent"] = (
    df["price"].groupby(df["customer_unique_id"]).transform("sum")
)


游릴 8. Shipping Ratio

In [11]:
# Freight cost relative to the item price (freight_value / price).
# A price of exactly 0 would make the division return +/-inf; mask zero
# prices to NaN first so such rows end up with a missing ratio rather than
# an infinite one that would skew downstream statistics. Non-zero and
# already-missing prices are left untouched.
df["shipping_ratio"] = df["freight_value"] / df["price"].where(df["price"] != 0)


游릴 9. Repeat Customer Flag

In [13]:
# 1 when the customer's order count is greater than 1, otherwise 0.
# Vectorised comparison instead of a per-row Python lambda via `.apply`:
# same values (a missing count compares False and therefore maps to 0),
# same int64 result, without calling Python once per row.
df["is_repeat_customer"] = df["customer_order_count"].gt(1).astype("int64")


游릴 10. Outlier Treatment (Delivery Time)

In [14]:
# Keep only rows whose delivery time lies in the closed range 0..90 days.
# This discards negative durations and extreme outliers; rows where
# delivery_time_days is missing are dropped as well, because a missing
# value never satisfies the range check.
df = df[df["delivery_time_days"].between(0, 90)]


游릱 Save Engineered Dataset

In [16]:
# Write the engineered table to disk; index=False leaves the DataFrame's
# row index out of the CSV.
df.to_csv(
    path_or_buf="../Data/engineered_dataset.csv",
    index=False,
)
