In [None]:
# =====================================================
# Brazilian E-Commerce Public Dataset (Olist)
# End-to-End Data Analysis
# =====================================================


"""
Objective:
Analyze Brazilian e-commerce orders to understand delivery performance,
customer satisfaction, and key operational drivers.
"""

# -------------------------------
# 1. Imports & Data Loading
# -------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import skrub
from skrub import ToDatetime
from skrub import TableReport
#from monkey_patching_v02_data_provenance import set_provenance, enter_provenance_mode_dataop, enter_provenance_mode_var
#set_provenance(skrub._data_ops._evaluation,"evaluate", provenance_func=enter_provenance_mode_dataop)
#set_provenance(skrub._data_ops._data_ops.Var,"compute", provenance_func=enter_provenance_mode_var)
sns.set(style="whitegrid")

# Dataset directory when the notebook runs on Kaggle.
base_path = "/kaggle/input/brazilian-ecommerce"

# Local copy of the dataset, used when the Kaggle input directory is absent.
local_data_path = 'C:/Users/teodo/Desktop/github/rdepro_skrub_provenance/monkey_patching_v02/data_provenance/kagglePipelines/data/datasets/olistbr/brazilian-ecommerce/versions/2'

# Resolve the directory once instead of repeating the absolute path in
# every read_csv call; prefer the Kaggle location when it exists.
data_dir = base_path if os.path.isdir(base_path) else local_data_path

# Every table except `orders` is wrapped as a named skrub variable right away;
# `orders` is loaded as a plain DataFrame here.
customers = skrub.var("customers", pd.read_csv(f"{data_dir}/olist_customers_dataset.csv"))
orders = pd.read_csv(f"{data_dir}/olist_orders_dataset.csv")
order_items = skrub.var("order_items", pd.read_csv(f"{data_dir}/olist_order_items_dataset.csv"))
payments = skrub.var("payment", pd.read_csv(f"{data_dir}/olist_order_payments_dataset.csv"))
reviews = skrub.var("reviews", pd.read_csv(f"{data_dir}/olist_order_reviews_dataset.csv"))

# -------------------------------
# 2. Data Preparation & Merging
# -------------------------------
# The orders table becomes the root skrub variable (registered under the
# name "df"); every other table is left-joined onto it so that order rows
# without a match are still kept.
# NOTE(review): joining on "order_id" repeats an order once per matching
# item / payment / review row -- confirm this fan-out is intended.
orders = skrub.var("df", orders)
review_scores = reviews[["order_id", "review_score"]]
df = (
    orders
    .merge(customers, on="customer_id", how="left")
    .merge(order_items, on="order_id", how="left")
    .merge(payments, on="order_id", how="left")
    .merge(review_scores, on="order_id", how="left")
)
# -------------------------------
# 3. Date Handling & Filtering
# -------------------------------
# Columns that are converted to datetimes before any date arithmetic.
date_cols = [
    "order_purchase_timestamp",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]

# skrub's ToDatetime, applied to the listed columns only, stands in for the
# per-column pd.to_datetime loop of the plain-pandas version.
toDateTimeEncoder = ToDatetime()
df = df.skb.apply(toDateTimeEncoder, cols=date_cols)

# Delay analysis is only meaningful for orders that were actually delivered.
delivered_mask = df["order_status"] == "delivered"
df = df[delivered_mask]

# -------------------------------
# 4. Feature Engineering
# -------------------------------
# delivery_delay: whole days between the actual and the estimated delivery
# date (positive means the order arrived after the estimate).
df = df.assign(
    delivery_delay=(
        df["order_delivered_customer_date"]
        - df["order_estimated_delivery_date"]
    ).dt.days
)

# order_value: item price plus freight. The freight term had been dropped,
# which made order_value an exact duplicate of the "price" column.
df = df.assign(order_value=df["price"] + df["freight_value"])

# is_delayed: 1 when the order arrived after the estimated date, else 0.
df = df.assign(is_delayed=(df["delivery_delay"] > 0).astype(int))
# -------------------------------
# 5. Data Cleaning
# -------------------------------
# Turn infinities into NaN so the dropna below removes them as well.
df = df.replace([np.inf, -np.inf], np.nan)

# A row is discarded when any of these columns is missing.
required_cols = [
    "order_value",
    "freight_value",
    "payment_installments",
    "review_score",
    "delivery_delay",
]
df = df.dropna(subset=required_cols)

# Keep a realistic delivery window: from 20 days early to 60 days late,
# both bounds included.
delay = df["delivery_delay"]
df = df[(delay >= -20) & (delay <= 60)]

# -------------------------------
# 6. Exploratory Data Analysis (EDA)
# -------------------------------
# NOTE: the plots below are disabled because seaborn rejected the skrub
# DataOp passed to it; the error each call raised is recorded under the call.
# Presumably they would work on a materialised DataFrame (e.g. the result of
# df.skb.eval()) -- TODO confirm.

#plt.figure(figsize=(8,5))
#sns.histplot(df["order_value"], bins=50)
#   -> error: dataop type cannot be interpreted as integer
#plt.title("Order Value Distribution")
#plt.show()

#plt.figure(figsize=(8,5))
#sns.boxplot(x=df["delivery_delay"])
#   -> same error as the histogram above
#plt.title("Delivery Delay Distribution (Days)")
#plt.show()

#plt.figure(figsize=(6,4))
#sns.countplot(x="review_score", data=df)
#   -> error: Data source must be a DataFrame or Mapping, not <class 'skrub._data_ops._data_ops.DataOp'>.
#plt.title("Review Score Distribution")
#plt.show()

#plt.figure(figsize=(6,4))
#sns.countplot(x="is_delayed", data=df)
#   -> error: Data source must be a DataFrame or Mapping, not <class 'skrub._data_ops._data_ops.DataOp'>.
#plt.title("Delayed vs On-Time Orders")
#plt.show()

# -------------------------------
# 7. Modeling – Classification (Business-Oriented)
# -------------------------------
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Predictor columns; the label column "is_delayed" is appended when the
# modelling frame is selected.
feature_cols = [
    "freight_value",
    "price",
    "payment_installments",
    "order_value",
    "review_score",
]
model_df = df[feature_cols + ["is_delayed"]]

# Mark the label and the design matrix so skrub knows which parts of the
# DataOp play the roles of y and X.
y = model_df["is_delayed"].skb.mark_as_y()
X = model_df.drop("is_delayed", axis=1).skb.mark_as_X()

# Train and evaluate through skrub instead of calling scikit-learn directly:
# the estimator is attached to the DataOp, the DataOp is split into train and
# test parts, and a learner built from it is fitted on the train part.
model = LogisticRegression(max_iter=1000)
clf = X.skb.apply(model, y=y)

# NOTE(review): the plain-pandas version this replaces called
# train_test_split(X, y, test_size=0.2, random_state=42, stratify=y);
# only the seed is passed here, so the other split settings fall back to
# their defaults -- confirm that is intended.
split = clf.skb.train_test_split(random_state=42)

learner = clf.skb.make_learner()
learner.fit(split["train"])

# The score used to be computed and thrown away; report it instead.
test_score = learner.score(split["test"])
print(f"Test score: {test_score}")

y_pred = learner.predict(split["test"])

# Per-class precision / recall on the held-out part.
print(classification_report(split["y_test"], y_pred))

# -------------------------------
# 8. Model Interpretation
# -------------------------------
#coef_df = pd.DataFrame(
 #   learner.coef_[0],
  #  X.columns,
   # columns=["Coefficient"]
#).sort_values(by="Coefficient", ascending=False)

#coef_df
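# NOTE: the coefficient table above is disabled because, according to the
# documentation, SkrubLearner does not expose a .coef_ attribute itself.
# Possibly the fitted LogisticRegression can be reached by naming the apply
# step (.skb.set_name) and calling learner.find_fitted_estimator -- TODO confirm.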
# -------------------------------
# 9. Business Insights
# -------------------------------
"""
Key Insights:
- Higher freight costs significantly increase the probability of delivery delays.
- Payment installments correlate with delayed deliveries, indicating potential operational risk.
- Delayed orders are strongly associated with lower review scores.
- Delivery performance is a critical driver of customer satisfaction.
"""

# -------------------------------
# 10. Conclusion
# -------------------------------
"""
This project presents a realistic e-commerce analytics workflow using 100k+ orders.
By combining multiple datasets, engineering meaningful features, and shifting from
regression to classification, the analysis delivers actionable business insights
focused on logistics performance and customer experience.
"""

# Build skrub's full report for the final DataOp `df`; the run logs one
# "Generating report for node N" line per node while it works.
report = df.skb.full_report()
# Bare expression so the notebook displays the returned value.
report

              precision    recall  f1-score   support

           0       0.93      1.00      0.96     22759
           1       0.41      0.01      0.01      1800

    accuracy                           0.93     24559
   macro avg       0.67      0.50      0.49     24559
weighted avg       0.89      0.93      0.89     24559

Generating report for node 0
Generating report for node 1
Generating report for node 2
Generating report for node 3
Generating report for node 4
Generating report for node 5
Generating report for node 6
Generating report for node 7
Generating report for node 8
Generating report for node 9
Generating report for node 10
Generating report for node 11
Generating report for node 12
Generating report for node 13
Generating report for node 14
Generating report for node 15
Generating report for node 16
Generating report for node 18
Generating report for node 19
Generating report for node 20
Generating report for node 21
Generating report for node 22
Generating report for n

{'result':                                 order_id                       customer_id  \
 0       e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
 1       e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
 2       e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
 3       53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
 4       47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
 ...                                  ...                               ...   
 119136  880675dff2150932f1601e1c07eadeeb  47cd45a6ac7b9fb16537df2ccffeb5ac   
 119137  9c5dedf39a927c1b2549525ed64a053c  39bd1228ee8140590ac3aca26f2dfe00   
 119138  63943bddc261676b46f01ca7ac2f7bd8  1fca14ff2861355f6e5f14306ff977a7   
 119139  83c1379a015df1e13d02aae0204711ab  1aa71eb042121263aafbe80c1b562c9c   
 119142  66dea50a8b16d9b4dee7af250b4be1a5  edb027a75a1449115f6b43211ae02a24   
 
        order_status order_purchase_time