In [1]:
from pyspark.sql import functions as f
from pyspark.sql import Window
import sys
from pathlib import Path

In [2]:
#voltando para a raiz do projeto para poder trazer a funcao get_spark_session do Utils
PROJECT_ROOT = Path().resolve().parents[0]
sys.path.append(str(PROJECT_ROOT))

In [3]:
from src.ifood_case.utils import get_spark_session

spark = get_spark_session(app_name="ifood_case_exploration")

[32m2025-12-25 01:58:55.623[0m | [1mINFO    [0m | [36msrc.ifood_case.utils[0m:[36mget_spark_session[0m:[36m18[0m - [1mSubindo Spark Session: ifood_case_exploration[0m
your 131072x1 screen size is bogus. expect trouble
25/12/25 01:58:57 WARN Utils: Your hostname, DESKTOP-HH1RONB resolves to a loopback address: 127.0.1.1; using 172.24.183.91 instead (on interface eth0)
25/12/25 01:58:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/25 01:58:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
#informando caminho para leitura dos arquivos parquet
ORDERS_PATH = PROJECT_ROOT / "data" / "processed" / "orders"
CONSUMERS_PATH = PROJECT_ROOT / "data" / "processed" / "consumers"
RESTAURANTS_PATH = PROJECT_ROOT / "data" / "processed" / "restaurants"
AB_TEST_PATH = PROJECT_ROOT / "data" / "processed" / "ab_test_ref"
ORDERS_ITENS_PATH = PROJECT_ROOT / "data" / "processed" / "orders_itens"

orders = spark.read.parquet(str(ORDERS_PATH))
consumers = spark.read.parquet(str(CONSUMERS_PATH))
restaurants = spark.read.parquet(str(RESTAURANTS_PATH))
ab_test = spark.read.parquet(str(AB_TEST_PATH))
orders_itens = spark.read.parquet(str(ORDERS_ITENS_PATH))

In [16]:
#Calculando métricas por Estado
df_geo_base = orders.join(ab_test, "customer_id")

geo_pivot = (
    df_geo_base
    .groupBy("delivery_address_state")
    .pivot("is_target", ["control", "target"])
    .agg(
        f.count_distinct("customer_id").alias("users"),
        f.count("order_id").alias("orders"),
        f.avg("order_total_amount").alias("ticket_avg")
    )
)

geo_final = (
    geo_pivot
    .withColumn("total_users_state", f.col("control_users") + f.col("target_users"))
    .withColumn("total_users_br", f.sum("total_users_state").over(Window.partitionBy()))
    .withColumn("Share_Base (%)", f.round((f.col("total_users_state") / f.col("total_users_br")) * 100, 2))
    .withColumn("Freq_Control", f.round(f.col("control_orders") / f.col("control_users"), 2))
    .withColumn("Freq_Target", f.round(f.col("target_orders") / f.col("target_users"), 2))
    .withColumn("Lift_Freq (%)", f.round(((f.col("Freq_Target") / f.col("Freq_Control")) - 1) * 100, 2))
    .withColumn("Ticket_Control", f.round(f.col("control_ticket_avg"), 2))
    .withColumn("Ticket_Target", f.round(f.col("target_ticket_avg"), 2))
    .select(
        f.col("delivery_address_state").alias("UF"),
        f.col("Share_Base (%)"),
        f.col("Freq_Control"),
        f.col("Freq_Target"),
        f.col("Lift_Freq (%)"),
        f.col("Ticket_Control"),
        f.col("Ticket_Target")
    )
    .orderBy(f.col("Share_Base (%)").desc())
)

geo_final.show()



+---+--------------+------------+-----------+-------------+--------------+-------------+
| UF|Share_Base (%)|Freq_Control|Freq_Target|Lift_Freq (%)|Ticket_Control|Ticket_Target|
+---+--------------+------------+-----------+-------------+--------------+-------------+
| SP|          42.3|         2.7|       3.06|        13.33|         51.02|         51.1|
| RJ|          18.8|        3.03|       3.45|        13.86|         51.63|         51.9|
| MG|          5.65|        2.47|       2.83|        14.57|         37.88|        37.46|
| PR|          4.88|        2.59|       2.89|        11.58|         41.46|        41.34|
| PE|          3.96|        2.78|       3.04|         9.35|         44.86|        44.79|
| CE|          3.68|        3.56|       3.95|        10.96|         39.45|        39.51|
| DF|          3.58|        2.69|       3.07|        14.13|         44.56|        44.74|
| RS|          3.05|        2.67|       2.95|        10.49|         44.01|        43.79|
| BA|          2.94| 

                                                                                

O comportamento do usuário nos maiores mercados é praticamente idêntico à média nacional. Isso gera previsibilidade financeira. Não foi um estado pequeno (como AL com 21% de lift) que distorceu a média.

In [15]:
#Calculando métricas por dispositivo
df_device_base = orders.join(ab_test, "customer_id")

device_pivot = (
    df_device_base
    .groupBy("origin_platform")
    .pivot("is_target", ["control", "target"])
    .agg(
        f.count_distinct("customer_id").alias("users"),
        f.count("order_id").alias("orders"),
        f.avg("order_total_amount").alias("ticket_avg")
    )
)

device_final = (
    device_pivot
    .withColumn("total_users_plat", f.col("control_users") + f.col("target_users"))
    .withColumn("total_users_br", f.sum("total_users_plat").over(Window.partitionBy()))
    .withColumn("Share_Base (%)", f.round((f.col("total_users_plat") / f.col("total_users_br")) * 100, 2))
    .withColumn("Freq_Control", f.round(f.col("control_orders") / f.col("control_users"), 2))
    .withColumn("Freq_Target", f.round(f.col("target_orders") / f.col("target_users"), 2))
    .withColumn("Lift_Freq (%)", f.round(((f.col("Freq_Target") / f.col("Freq_Control")) - 1) * 100, 2))
    .withColumn("Ticket_Control", f.round(f.col("control_ticket_avg"), 2))
    .withColumn("Ticket_Target", f.round(f.col("target_ticket_avg"), 2))
    .select(
        f.col("origin_platform").alias("Device"),
        f.col("Share_Base (%)"),
        f.col("Freq_Control"),
        f.col("Freq_Target"),
        f.col("Lift_Freq (%)"),
        f.col("Ticket_Control"),
        f.col("Ticket_Target")
    )
    .filter(f.col("Device").isNotNull())
    .orderBy(f.col("Share_Base (%)").desc())

)

device_final.show(truncate=False)



+-------------+--------------+------------+-----------+-------------+--------------+-------------+
|Device       |Share_Base (%)|Freq_Control|Freq_Target|Lift_Freq (%)|Ticket_Control|Ticket_Target|
+-------------+--------------+------------+-----------+-------------+--------------+-------------+
|ANDROID      |42.16         |2.53        |2.85       |12.65        |43.45         |43.03        |
|IOS          |40.86         |2.89        |3.23       |11.76        |52.66         |52.77        |
|DESKTOP      |16.02         |2.48        |2.79       |12.5         |46.31         |46.37        |
|WINDOWS_PHONE|0.95          |2.23        |2.37       |6.28         |39.61         |40.37        |
+-------------+--------------+------------+-----------+-------------+--------------+-------------+



                                                                                

In [None]:
#calculando a frequência individual de cada cliente para analisar lealdade no periodo
df_freq = orders.groupBy("customer_id").agg(f.count("order_id").alias("freq_total"))

#Critério: 
# - Bronze: 1 pedido (Risco de Churn/Não engajado)
# - Prata: 2 a 4 pedidos (Usuário em desenvolvimento)
# - Ouro: 5+ pedidos (Heavy User/Fiel)
df_segments = df_freq.withColumn(
    "segmento",
    f.when(f.col("freq_total") == 1, "1. Bronze (1 pedido)")
     .when((f.col("freq_total") >= 2) & (f.col("freq_total") <= 4), "2. Prata (2-4 pedidos)")
     .otherwise("3. Ouro (5+ pedidos)")
)

df_seg_analise = df_segments.join(ab_test, "customer_id")

resultado_seg = (
    df_seg_analise
    .groupBy("segmento", "is_target")
    .agg(f.count("customer_id").alias("total_usuarios"))
    .withColumn("share_percent", 
        f.round(
            f.col("total_usuarios") / f.sum("total_usuarios").over(Window.partitionBy("is_target")) * 100, 
        2)
    )
    .orderBy("segmento", "is_target")
)

resultado_seg.show(truncate=False)

                                                                                

+----------------------+---------+--------------+-------------+
|segmento              |is_target|total_usuarios|share_percent|
+----------------------+---------+--------------+-------------+
|1. Bronze (1 pedido)  |control  |188824        |52.37        |
|1. Bronze (1 pedido)  |target   |188996        |42.38        |
|2. Prata (2-4 pedidos)|control  |114075        |31.64        |
|2. Prata (2-4 pedidos)|target   |171132        |38.38        |
|3. Ouro (5+ pedidos)  |control  |57629         |15.98        |
|3. Ouro (5+ pedidos)  |target   |85781         |19.24        |
+----------------------+---------+--------------+-------------+

