# Setup

In [30]:
# Basics
import html
import json
import re
from pathlib import Path
import warnings

# Data Manipulation
import pandas as pd

# EDA
from unidecode import unidecode
from klib import convert_datatypes
from ydata_profiling import ProfileReport
import ppscore as pps
import dtreeviz

# Feature Engineering
from feature_engine.encoding import OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder as SkOrdinalEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.encoding import CountFrequencyEncoder, OneHotEncoder
from feature_engine.pipeline import make_pipeline

# Machine Learning
from sklearn.tree import DecisionTreeRegressor

In [31]:
# Constants
# Directory layout, anchored one level above the current working directory.
PROJECT_PATH = Path.cwd().parent.resolve()
DATA_PATH = PROJECT_PATH / "data"
DATA_RAW_PATH = DATA_PATH / "raw"  # input files read by this notebook
DATA_PROCESSED_PATH = DATA_PATH / "processed"  # parquet outputs written by this notebook
RESOURCES_PATH = PROJECT_PATH / "resources"  # reports and visualizations

In [32]:
# Data anonymization: the store brand and store name columns are replaced by
# ordinal codes assigned with the "arbitrary" encoding method.
store_name_encoder = OrdinalEncoder(
    variables=["marca_da_loja", "nome_da_loja"],
    encoding_method="arbitrary",
)

In [None]:
# Metadata
# The data dictionary is read explicitly as UTF-8: it contains non-ASCII keys
# (e.g. "descrição") and the platform default encoding is not UTF-8 everywhere.
with open(
    DATA_RAW_PATH.joinpath("raw_data_dict.json"), encoding="utf-8"
) as json_file:
    metadata = json.load(json_file)

# Raw column name -> dtype, for every field that is not a datetime
# (datetime fields are handled through `date_columns` instead).
dtype_mapping = {
    item["nome_do_campo_antes_limpeza"]: item["tipo"]
    for item in metadata
    if item["tipo"] != "datetime64[ns]"
}

# Raw names of the datetime fields.
date_columns = [
    item["nome_do_campo_antes_limpeza"]
    for item in metadata
    if item["tipo"] == "datetime64[ns]"
]

# Cleaned column name -> description.
variables_descriptions = {
    item["nome_do_campo_apos_limpeza"]: item["descrição"] for item in metadata
}

# Raw column name -> cleaned column name.
column_names_mapping = {
    item["nome_do_campo_antes_limpeza"]: item["nome_do_campo_apos_limpeza"]
    for item in metadata
}

# Cleaned names of the fields typed as "category".
categorical_columns = [
    item["nome_do_campo_apos_limpeza"]
    for item in metadata
    if item["tipo"] == "category"
]

# Cleaned names of the fields flagged as available at order time.
columns_to_keep = [
    item["nome_do_campo_apos_limpeza"]
    for item in metadata
    if item["disponivel_na_hora_do_pedido"]
]

# Ingestion

In [34]:
# Ingestion
# Each monthly/periodic Excel export is read, its headers and categorical values
# are cleaned, and the pieces are concatenated before filtering and encoding.
temp_dfs = []

# Iterate in sorted order: glob yields paths in a filesystem-dependent order, and
# the row order of the concatenated frame should not depend on the directory
# listing (it can influence order-sensitive steps such as the arbitrary encoding).
for file in sorted(DATA_RAW_PATH.glob("relatorio-logistica_*.xlsx")):
    with warnings.catch_warnings():
        # Silence only the UserWarning raised from openpyxl's stylesheet module.
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            module=re.escape("openpyxl.styles.stylesheet"),
        )
        df_file = (
            pd.read_excel(
                io=file,
                dtype=dtype_mapping,
                engine="openpyxl",
                na_values="",
                parse_dates=date_columns,
                date_format="%Y-%m-%d %H:%M:%S",
            )
            # Column headers may contain HTML entities; decode them so they match
            # the keys of `column_names_mapping`.
            .pipe(
                lambda _df: _df.set_axis(
                    [html.unescape(col) for col in _df.columns], axis=1
                )
            )
            .rename(columns=column_names_mapping)
            # Decode HTML entities and strip accents from categorical values,
            # leaving missing values untouched. `col=col` binds the loop variable
            # at definition time so each lambda targets its own column.
            .assign(
                **{
                    col: lambda _df, col=col: _df[col].apply(
                        lambda x: unidecode(html.unescape(str(x))) if pd.notna(x) else x
                    )
                    for col in categorical_columns
                }
            )
        )
        temp_dfs.append(df_file)

# Drop cancelled orders, keep only the selected columns, convert dtypes, encode the
# store identifiers and finally store the encoded columns as categoricals.
df = store_name_encoder.fit_transform(
    X=convert_datatypes(
        pd.concat(temp_dfs, ignore_index=True)
        .loc[lambda _df: _df["status_final_do_pedido"].ne("CANCELADO"), columns_to_keep]
        .drop(
            columns=[
                # Business decision
                "tempo_prometido_de_entrega_min",
                "tempo_de_atraso_em_relacao_ao_tempo_prometido_de_entrega_min",
                "frete_cobrado_do_restaurante_apenas_sob_demanda",
                # No value added
                "id_da_loja",
                "id_curto_do_pedido",
                "id_completo_do_pedido",
                # Constant after filtering
                "status_final_do_pedido",
            ]
        )
    )
).assign(
    marca_da_loja=lambda _df: _df["marca_da_loja"].astype("category"),
    nome_da_loja=lambda _df: _df["nome_da_loja"].astype("category"),
)

# Preprocessing

In [35]:
# Datetime features creation
# Calendar and time-of-day features, all derived from the order timestamp column.
df = df.assign(
    **{
        "dia_da_semana": lambda frame: frame["data_e_hora_do_pedido"].dt.dayofweek,
        "dia_do_mes": lambda frame: frame["data_e_hora_do_pedido"].dt.day,
        "hora": lambda frame: frame["data_e_hora_do_pedido"].dt.hour,
        "minuto": lambda frame: frame["data_e_hora_do_pedido"].dt.minute,
        # hour * 60 + minute
        "minutos_desde_meia_noite": lambda frame: (
            frame["data_e_hora_do_pedido"].dt.hour * 60
            + frame["data_e_hora_do_pedido"].dt.minute
        ),
    }
)

# Feature Engineering

In [36]:
def count_orders_60min_vectorized(group):
    """Count, for every row, the orders placed in the preceding 60 minutes.

    For each timestamp ``t`` in ``group["data_e_hora_do_pedido"]`` the count is
    the number of rows of ``group`` whose timestamp lies in ``[t - 60 min, t)``.
    The row itself, and rows sharing exactly the same timestamp, are not counted.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a ``data_e_hora_do_pedido`` datetime column. Rows may be in
        any order.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with a ``pedidos_ultimos_60min`` column added; the
        input frame is not modified.
    """
    datetimes = group["data_e_hora_do_pedido"].values
    window = pd.Timedelta(minutes=60).to_timedelta64()

    # Binary search on a sorted copy replaces the pairwise comparison of all rows.
    ordered = datetimes.copy()
    ordered.sort()
    # Number of timestamps strictly earlier than each row's timestamp ...
    strictly_before = ordered.searchsorted(datetimes, side="left")
    # ... minus those that fall before the start of the 60-minute window.
    before_window = ordered.searchsorted(datetimes - window, side="left")

    group = group.copy()
    group["pedidos_ultimos_60min"] = strictly_before - before_window
    return group

In [37]:
# Add the per-store count of orders in the preceding 60 minutes. Rows are ordered
# by store and timestamp for the grouped computation, then put back in index order.
df = (
    df.sort_values(by=["nome_da_loja", "data_e_hora_do_pedido"])
    .groupby(by="nome_da_loja", observed=True, group_keys=False)[df.columns]
    .apply(count_orders_60min_vectorized)
    .sort_index()
)

# Overall Report

In [38]:
# Build the profiling report for the prepared frame and write it as HTML under
# the resources folder.
ProfileReport(
    df=df,
    title="deliveries_data",
    variables={"descriptions": variables_descriptions},
    vars={"cat": {"length": False}},
    interactions={"continuous": True, "targets": ["tempo_da_entrega_realizada_min"]},
).to_file(
    output_file=RESOURCES_PATH / "reports" / "deliveries_data_profile_report.html"
)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:00<00:00, 271.37it/s]



Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Predictive Power Score

In [39]:
# Encoders for Preprocessing
# Shift and order priority get an explicit category order; brand/store are
# count-encoded; the logistic service is one-hot encoded with the last level dropped.
shift_encoder = SklearnTransformerWrapper(
    variables=["turno"],
    transformer=SkOrdinalEncoder(
        categories=[["MANHA", "ALMOCO", "TARDE", "JANTAR", "CEIA", "MADRUGADA"]]
    ),
)
priority_encoder = SklearnTransformerWrapper(
    variables=["prioridade_do_pedido"],
    transformer=SkOrdinalEncoder(categories=[["PADRAO", "RAPIDA"]]),
)
brand_and_store_encoder = CountFrequencyEncoder(
    variables=["marca_da_loja", "nome_da_loja"],
    encoding_method="count",
)
logistic_service_encoder = OneHotEncoder(
    variables=["servico_logistico"],
    drop_last=True,
)

pipeline = make_pipeline(
    shift_encoder,
    priority_encoder,
    brand_and_store_encoder,
    logistic_service_encoder,
)

# The raw timestamp column is excluded before encoding.
df_encoded = pipeline.fit_transform(df.drop(columns=["data_e_hora_do_pedido"]))

# Predictive Power Score of every column against the delivery time; only strictly
# positive scores are kept, sorted from strongest to weakest.
df_pps_predictors = (
    pps.predictors(df=df_encoded, y="tempo_da_entrega_realizada_min")
    .loc[lambda scores: scores["ppscore"].gt(0), :]
    .drop(columns=["model", "y", "case", "is_valid_score", "metric"])
    .sort_values(by="ppscore", ascending=False)
)

df_pps_predictors.to_parquet(
    path=DATA_PROCESSED_PATH / "pps_predictors.parquet",
    engine="pyarrow",
    index=False,
)

In [40]:
# Visualization
# Show the PPS table built in the previous cell (positive scores only, sorted
# by descending ppscore) as rich cell output.
display(df_pps_predictors)

Unnamed: 0,x,ppscore,baseline_score,model_score
0,taxa_de_entrega_paga_pelo_cliente_reais,0.04133,12.422264,11.908855
1,servico_logistico_FULL_SERVICE,0.003133,12.422264,12.38334


# Tree Visualization

In [41]:
# Fit a shallow regression tree (max depth 4) on the encoded frame and export its
# diagram as SVG. Features and target are built once and shared by the fit and
# the visualization so both see exactly the same data, with a 1-D target in both.
tree_features = df_encoded.drop(columns=["tempo_da_entrega_realizada_min"])
tree_target = df_encoded["tempo_da_entrega_realizada_min"]

tree_regressor = DecisionTreeRegressor(max_depth=4, random_state=42)
tree_regressor.fit(X=tree_features, y=tree_target)

with warnings.catch_warnings():
    # UserWarnings raised while building and rendering the diagram are suppressed.
    warnings.filterwarnings("ignore", category=UserWarning)
    tree_viz_model = dtreeviz.model(
        model=tree_regressor,
        X_train=tree_features,
        y_train=tree_target,
        target_name="tempo_da_entrega_realizada_min",
        feature_names=tree_features.columns.tolist(),
    )

    # Left-to-right layout, saved under the resources folder.
    tree_viz_model.view(orientation="LR").save(
        filename=str(RESOURCES_PATH.joinpath("visualizations", "tree_regressor.svg"))
    )

# Exporting

In [42]:
# Persist the prepared dataset (without its index) for downstream use.
df.to_parquet(
    path=DATA_PROCESSED_PATH / "deliveries.parquet",
    index=False,
    engine="pyarrow",
)