In [0]:
# Importando libs
from pyspark.sql.functions import to_date
from pyspark.sql import functions as F
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu

In [0]:
#Importando as bases
order = spark.table('workspace.tabelas_ifood.order').dropDuplicates()
order_details = spark.table('workspace.tabelas_ifood.order_details').dropDuplicates()
ab_ref = spark.table('workspace.tabelas_ifood.ab_test_ref').dropDuplicates()

# Estimando preço dos pedidos de acordo com o preço unitário de cada item e guarnição 
order = order.withColumn('order_created_at', to_date('order_created_at'))
order = order.withColumn('order_scheduled_date', to_date('order_scheduled_date'))
order_details = order_details.withColumn('order_created_at', to_date('order_created_at'))

order_calc_price = order_details.select('order_id','order_created_at','item_sequence','item_externalId','garnish_externalId',
'item_quantity','unit_price','garnish_quantity','garnish_unit_price').dropDuplicates().fillna({'order_id': '', 'order_created_at': '', 'item_sequence': '', 'item_externalId': '', 'garnish_externalId': '', 'item_quantity': 0, 'unit_price': 0, 'garnish_quantity': 0, 'garnish_unit_price': 0})

order_calc_price = order_calc_price.withColumn('unit_price', F.col('unit_price').cast('double')/100)
order_calc_price = order_calc_price.withColumn('garnish_unit_price', F.col('garnish_unit_price').cast('double')/100)

order_calc_price = order_calc_price.withColumn('total_item_calc_price', F.round(F.col('item_quantity') * F.col('unit_price'), 2))
order_calc_price = order_calc_price.withColumn('total_garnish_calc_price', F.round(F.col('garnish_quantity') * F.col('garnish_unit_price'), 2))

In [0]:
#Agrupando guarnições de um mesmo item
order_calc_total_garnish = order_calc_price.groupBy('order_id','order_created_at','item_sequence','item_externalId').agg(F.round(F.sum('total_garnish_calc_price'), 2).alias('total_garnish_calc_price'))

#Calculando preço por item+guarnições
order_calc_price = order_calc_price.select('order_id','order_created_at','item_sequence','item_externalId','total_item_calc_price').dropDuplicates()
order_calc_price = order_calc_price.join(order_calc_total_garnish, on=['order_id','order_created_at','item_sequence','item_externalId'])
order_calc_price = order_calc_price.withColumn('total_item_cost', F.round(F.col('total_item_calc_price') + F.col('total_garnish_calc_price'), 2))

#Calculando preço total do pedido
order_calc_price_total = order_calc_price.groupBy('order_id','order_created_at').agg(F.round(F.sum('total_item_cost'), 2).alias('total_order_amount_calc'))

#Base unificada e com grupos de teste
order_calc_price = order_calc_price.join(order_calc_price_total, on=['order_id','order_created_at'], how='left')
order = order.select('customer_id', 'order_id', 'merchant_id', 'order_created_at', 'order_total_amount')
order_complete = order.join(order_calc_price, on=['order_id', 'order_created_at'], how='left')
order_complete = order_complete.join(ab_ref, on=['customer_id'], how='left')

#Separando bases por grupo
order_prices_target = order_complete.filter(order_complete.is_target == 'target')
order_prices_control = order_complete.filter(order_complete.is_target == 'control')

In [0]:
#Verificando se os preços pagos pelos clientes do grupo de teste foram significativamente menores que os estimados
order_prices_target_pd = order_prices_target.toPandas()

In [0]:
order_prices_target_pd['diff_price'] = np.where(order_prices_target_pd['order_total_amount'] != order_prices_target_pd['total_order_amount_calc'], 'yes', 'no')
order_prices_diff =  order_prices_target_pd[['order_id','order_created_at','diff_price']].drop_duplicates()
print('difference_percentage: ',order_prices_diff['diff_price'].value_counts()['yes']/order_prices_diff['diff_price'].value_counts()['no'])

difference_percentage:  0.10567292909964093


In [0]:
# Aplicando o teste Mann-Whitney para comparação dos grupos

stat, p_val = mannwhitneyu(
    order_prices_target_pd['order_total_amount'].dropna(),
    order_prices_target_pd['total_order_amount_calc'].dropna(),
    alternative='two-sided'
)
print(f"Mann-Whitney U test para avg_order_amount (p-valor): {p_val:.4f}")

Mann-Whitney U test para avg_order_amount (p-valor): 0.0000


In [0]:
#Verificando se há diferença de preço entre itens de mesmo id dos grupos
diff_items = order_prices_target.select('item_externalId','total_item_calc_price').dropDuplicates().join(order_prices_control.select('item_externalId','total_item_calc_price').dropDuplicates().withColumnRenamed('total_item_calc_price','total_item_calc_price_control'), on=['item_externalId'], how='left')
diff_items_pd = diff_items.toPandas()

# Aplicando o teste Mann-Whitney para comparação dos grupos

stat, p_val = mannwhitneyu(
    diff_items_pd['total_item_calc_price'].dropna(),
    diff_items_pd['total_item_calc_price_control'].dropna(),
    alternative='two-sided'
)
print(f"Mann-Whitney U test para avg_order_amount (p-valor): {p_val:.4f}")

Mann-Whitney U test para avg_order_amount (p-valor): 0.0000


IOStream.flush timed out
