In [1]:
from analystcommunity import read_connection_data_warehouse

In [2]:
import pandas as pd
import numpy as np
from datetime import date
import dill
import re

# Settings

In [3]:
from_date = '2022-09-01'
to_date = '2022-09-30'

In [4]:
offer_date = '202209'

# Extra functions

In [5]:
def normalize_text(raw_text):
    """
    Normalize a label: lower-case it, strip whitespace and noise characters,
    and fold common accented Latin letters to plain ASCII.

    The text is lower-cased *before* the accent folding because the
    substitution patterns only list lower-case letters; this way upper-case
    accented letters (e.g. "Á", "Ç") are folded as well.

    Parameters
    ----------
    raw_text : str
        Text to normalize.

    Returns
    -------
    str
        Lower-case text without whitespace, "/" or "�", with accents folded
        (e.g. "ÁGUA / AÇÚCAR" -> "aguaacucar").
    """
    text = raw_text.lower()
    text = text.replace("�", "").replace("/", "")
    # A single pass removes every kind of whitespace (spaces, tabs, newlines).
    text = re.sub(r"\s+", "", text)
    accent_folds = (
        (r"[àáâãäå]", 'a'),
        (r"[èéêë]", 'e'),
        (r"[ìíîï]", 'i'),
        (r"[òóôõö]", 'o'),
        (r"[ùúûü]", 'u'),
        (r"[ýÿ]", 'y'),
        (r"[ß]", 'ss'),
        (r"[ñ]", 'n'),
        (r"[ç]", 'c'),
    )
    for pattern, replacement in accent_folds:
        text = re.sub(pattern, replacement, text)
    return text

In [6]:
def organize_text(x):
    """
    Turn a free-text label into a compact lower-case matching key.

    Steps: lower-case, strip whitespace / "/" / "�" and fold accents (nested
    helper), replace "&" with "y", drop "," and "ç", and finally collapse
    "<meat>fresco" / "<meat>congelado" (for pollo, res, cerdo) to the meat
    name alone.

    Parameters
    ----------
    x : str
        Label to transform (e.g. "Pollo congelado").

    Returns
    -------
    str
        The matching key (e.g. "pollo").
    """

    def normalize_text(raw_text):
        """
        Drop whitespace, "/" and "�", fold common accents, and lower-case.
        """
        cleaned = raw_text.replace(" ", "").replace("�", "").replace("/", "")
        cleaned = re.sub(r'\s*', '', cleaned)
        for pattern, plain in (
            (r"[àáâãäå]", 'a'),
            (r"[èéêë]", 'e'),
            (r"[ìíîï]", 'i'),
            (r"[òóôõö]", 'o'),
            (r"[ùúûü]", 'u'),
            (r"[ýÿ]", 'y'),
            (r"[ß]", 'ss'),
            (r"[ñ]", 'n'),
        ):
            cleaned = re.sub(pattern, plain, cleaned)
        return cleaned.lower()

    key = normalize_text(x.lower())
    # Note: "ç" is removed outright here (the nested helper has no "ç" rule).
    for old, new in (("&", "y"), (",", ""), ("ç", "")):
        key = key.replace(old, new)
    for meat in ("pollo", "res", "cerdo"):
        for state in ("fresco", "congelado"):
            key = key.replace(f"{meat}{state}", meat)

    return key

# Data segments

In [7]:
# Customer-level targeting data for the campaign month in `offer_date`: one row per
# (offer, customer segment, customer) for automatically added, percent-off, order-item
# "NB" offers, with the discount value, usage limits, target SKUs and the segment label,
# subcategory, month and region parsed out of the offer name.
# The sandbox-tier checks are applied per table (bo, boic, bocsx), each on its own alias.
# The backslash in the ILIKE pattern is written as "\\" so Python does not see "\_" as an
# invalid string escape; the SQL text that is sent is the same.
q_segment = f"""
SELECT DISTINCT
        bo.offer_id,
        --bo.offer_name,
        --TO_CHAR((left(bo.offer_name,4)||'-'||right(left(bo.offer_name,6),2)||'-01')::DATE, 'YYYY-MM') AS mes,
        bocsx.customer_segment_id,
        bo.offer_name,
        bo.offer_value::int as value_discount,
        bo.max_uses_per_customer,
        bo.max_uses as max_uses_per_order,
        substring(bo.offer_name, 14, strpos(bo.offer_name, '_NB_')-14) AS ms,
        substring(bo.offer_name, strpos(bo.offer_name, '_NB_') + 4, strpos(bo.offer_name, '_W') - strpos(bo.offer_name, '_NB_') - 4) AS subcategory,
        substring(bo.offer_name, 1, 6) AS campaign_month,
        substring(bo.offer_name, 10, 3) AS region_code,
        REPLACE(SPLIT_PART(SPLIT_PART(boic.order_item_match_rule, '[',2),']',1),'"','') AS skus,
        cust.external_identifier as customer_source_id

    FROM postgres_broadleaf_federate."broadleaf.blc_offer"                          bo
    LEFT JOIN postgres_broadleaf_federate."broadleaf.blc_offer_customer_seg_xref"   bocsx   ON (bo.offer_id = bocsx.offer_id AND ((bocsx.archived is NULL OR bocsx.archived='N')))
    LEFT JOIN postgres_broadleaf_federate."broadleaf.blc_customer_offer_xref"       bcox    ON (bo.offer_id = bcox.offer_id  )
    LEFT JOIN postgres_broadleaf_federate."broadleaf.blc_tar_crit_offer_xref"       btcofx  ON (btcofx.offer_id = bo.offer_id)
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_offer_item_criteria"      boic    ON (btcofx.offer_item_criteria_id = boic.offer_item_criteria_id AND (boic.archived is NULL OR boic.archived ='N'))
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_site"                     bs      ON bs.site_id = bo.catalog_disc
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_customer_customer_seg_xref" cust  ON cust.customer_segment_id = bocsx.customer_segment_id

    WHERE
        bo.offer_name like '{offer_date}%'
        -- bo.end_date >= current_date
        ---TIME VALIDATIONS
        AND extract(year from bo.date_created) = extract(year from current_date)
        -- AND right(left(bo.offer_name,6),2) = TO_CHAR(current_date, 'MM')
        -------------------
        AND bo.offer_discount_type = 'PERCENT_OFF'
        AND bo.offer_type = 'ORDER_ITEM'
        AND bo.sndbx_id is NULL
        AND (bo.archived is NULL OR bo.archived='N')
        AND (bo.sndbx_tier is NULL OR bo.sndbx_tier = 999999)
        AND bo.automatically_added = 'true'
        AND (boic.sndbx_tier is NULL OR boic.sndbx_tier = 999999)
        AND (bocsx.sndbx_tier is NULL OR bocsx.sndbx_tier = 999999)
        AND bocsx.customer_segment_id IS NOT NULL
        AND bo.offer_name ILIKE '%\\_NB_%'
"""

In [8]:
data_segment = read_connection_data_warehouse.run_read_prod_query(q_segment)

In [9]:
data_segment

Unnamed: 0,offer_id,customer_segment_id,offer_name,value_discount,max_uses_per_customer,max_uses_per_order,ms,subcategory,campaign_month,region_code,skus,customer_source_id
0,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,202209,BAQ,281192440369,146869951
1,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,202209,BAQ,281192440369,158649897
2,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,202209,BAQ,281192440369,204518842
3,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,202209,BAQ,281192440369,79497049
4,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,202209,BAQ,281192440369,191965864
...,...,...,...,...,...,...,...,...,...,...,...,...
2153170,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,202209,VCP,"430899,430898,430895,430896,430643,430642,4306...",210368542
2153171,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,202209,VCP,"430899,430898,430895,430896,430643,430642,4306...",175826304
2153172,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,202209,VCP,"430899,430898,430895,430896,430643,430642,4306...",173718466
2153173,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,202209,VCP,"430899,430898,430895,430896,430643,430642,4306...",158778666


In [10]:
# Strip the "_exp_50%" suffix from the segment label. The vectorized, literal
# (non-regex) replace leaves missing values untouched instead of raising on them.
data_segment['ms'] = data_segment['ms'].str.replace("_exp_50%", "", regex=False)

In [11]:
def fix_data_segment(data_segment):
    """
    Return a typed copy of the segment frame.

    The id / counter columns are cast to ``np.int32`` and ``campaign_month``
    (a "YYYYMM" string) is parsed into ``datetime.date`` objects set to the
    first day of that month. The input frame is left untouched.

    Parameters
    ----------
    data_segment : pandas.DataFrame
        Frame with the columns offer_id, customer_segment_id,
        customer_source_id, value_discount, max_uses_per_customer,
        max_uses_per_order and campaign_month.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and the dtypes described above.
    """
    int_columns = (
        'offer_id',
        'customer_segment_id',
        'customer_source_id',
        'value_discount',
        'max_uses_per_customer',
        'max_uses_per_order',
    )
    # astype returns a new frame, so the caller's object is not modified.
    fixed = data_segment.astype({col: np.int32 for col in int_columns})
    fixed['campaign_month'] = pd.to_datetime(fixed['campaign_month'], format='%Y%m').dt.date
    return fixed

In [12]:
data_segment = fix_data_segment(data_segment)

In [13]:
# Offers whose campaign month is before 2022-09-01 are always "Treatment"; from that
# month on, an offer is "Control" only when its name contains "_control".
# Computed as one vectorized mask instead of a Python-level apply over every row.
campaign_start = date(2022, 9, 1)
is_control = (
    (data_segment["campaign_month"] >= campaign_start)
    & data_segment["offer_name"].str.contains("_control", regex=False)
)
data_segment["group"] = np.where(is_control, "Control", "Treatment")

In [14]:
# Keep only the offers whose name starts with the 2022-09-01 campaign date.
starts_on_campaign_date = data_segment.offer_name.str.startswith('20220901')
data_segment = data_segment.loc[starts_on_campaign_date]

In [15]:
data_segment.head(10)

Unnamed: 0,offer_id,customer_segment_id,offer_name,value_discount,max_uses_per_customer,max_uses_per_order,ms,subcategory,campaign_month,region_code,skus,customer_source_id,group
0,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,146869951,Treatment
1,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,158649897,Treatment
2,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,204518842,Treatment
3,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,79497049,Treatment
4,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,191965864,Treatment
5,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,46340250,Treatment
6,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,165017192,Treatment
7,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,145548473,Treatment
8,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,189493392,Treatment
9,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,215050563,Treatment


In [16]:
set(data_segment.group.values)

{'Control', 'Treatment'}

# Data Merged

## Data broadleaf

In [17]:
q_broadleaf = f"""
SELECT DISTINCT
        bo.order_id,
        ffg.close_date,
        s.site_identifier_value AS region_code,
        CASE
            WHEN ( (LOWER(boida.adjustment_reason) ILIKE '%merma%') AND ((COALESCE(bcat2.name, bcat.name) = 'Frutas & Verduras') OR (COALESCE(bcat2.name, bcat.name) = 'Frutas e Verduras')) ) THEN 'MERMA FRUVER'
            WHEN ( (LOWER(boida.adjustment_reason) ILIKE '%merma%') AND (COALESCE(bcat2.name, bcat.name) NOT IN ('Frutas & Verduras','Frutas e Verduras')) ) THEN 'MERMA 3PL'
            WHEN ( (LOWER(boida.adjustment_reason) ILIKE '%acm%') OR (LOWER(boida.adjustment_reason) ILIKE '%kof%') OR (LOWER(boida.adjustment_reason) ILIKE '%campana%') ) THEN 'MONETIZACION'
            WHEN boida.adjustment_reason IS NULL THEN 'NO DISCOUNTS'
            ELSE 'GROWTH & OTHERS'
        END AS responsable,
        CASE
            WHEN LOWER(boida.adjustment_reason) ILIKE '%hoo%' THEN 'Hooks'
            WHEN ( (boida.adjustment_reason ILIKE '%Spend-All%') OR (boida.adjustment_reason ILIKE '%Spend-BMS%') OR (boida.adjustment_reason ILIKE '%NB%') ) THEN 'SUPER DISCOUNTS'
            ELSE 'Others'
        END AS growth_owners,
        boipd.order_item_id,
        boida.offer_name,
        boida.adjustment_reason,
        boida.adjustment_value*boipd.quantity*foi.step_unit as discount_offer_local,
        case    
            when fr.country_code = 'MX' then 0.05089059 --antes 0.052110
            when fr.country_code = 'CO' then 0.00026483 --antes 0.000333
            when fr.country_code = 'BR' then 0.21052632 --antes 0.257732
            else 0
        end as coefficient
        
    FROM postgres_broadleaf_federate."broadleaf.blc_order" bo
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_fulfillment_group" bfg        ON bfg.order_id = bo.order_id
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_fulfillment_order" bfo        ON bfo.fulfillment_group_id = bfg.fulfillment_group_id
    INNER JOIN postgres_broadleaf_federate."broadleaf.fb_fulfillment_group" ffg         ON ffg.fulfillment_group_id = bfg.fulfillment_group_id
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_site"                     s       ON s.site_id = bo.site_disc
    INNER JOIN postgres_broadleaf_federate."broadleaf.fb_region"                    fr ON fr.region_code = s.site_identifier_value
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_order_item"               boi     ON boi.order_id=bo.order_id
    INNER JOIN postgres_broadleaf_federate."broadleaf.fb_order_item"                foi     ON boi.order_item_id= foi.order_item_id
    INNER JOIN postgres_broadleaf_federate."broadleaf.fb_order"                     fo      ON fo.order_id = bo.order_id
    LEFT JOIN  postgres_broadleaf_federate."broadleaf.fb_order_type" fot                ON fot.fb_order_type_id=fo.fb_order_type_id
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_order_item_price_dtl"     boipd   ON boipd.order_item_id=boi.order_item_id
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_discrete_order_item"      bdoi    ON bdoi.order_item_id = boi.order_item_id
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_sku"                      bs      ON bs.sku_id = bdoi.sku_id
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_product"                  bp      ON bs.addl_product_id = bp.product_id
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_category"                 bcat    ON bcat.category_id = bp.default_category_id 
    INNER JOIN  postgres_broadleaf_federate."broadleaf.blc_category_xref"            bcx     ON bcx.sub_category_id = bp.default_category_id AND bcx.archived='N' AND bcx.sndbx_tier is NULL
    INNER JOIN  postgres_broadleaf_federate."broadleaf.blc_category"                 bcat2   ON bcx.category_id = bcat2.category_id
    INNER JOIN postgres_broadleaf_federate."broadleaf.blc_order_item_dtl_adj"        boida   ON boida.order_item_price_dtl_id=boipd.order_item_price_dtl_id
    LEFT JOIN postgres_broadleaf_federate."broadleaf.blc_offer"                 blc_offer  ON boida.offer_id = blc_offer.offer_id
    LEFT JOIN postgres_broadleaf_federate."broadleaf.blc_admin_user"            id         ON blc_offer.created_by = id.admin_user_id
    where 1=1
        and bo.submit_date is not null
        and bo.external_id is null
        and fo.fb_order_status_id in(1, 6, 7, 8)
        and bo.order_status = 'SUBMITTED'
        and (bfo.status is null or bfo.status not in('ARCHIVED', 'CANCELLED'))
        and (fot.name is null or fot.name <> 'REFUND')
        and ffg.close_date is not null
        and ffg.close_date between '{from_date}' AND '{to_date}'
       -- FIX SUPER DESCUENTOS
        AND bcat2.category_id not in ('110873','-1000','100768','100765','100815') --ids de super descuentos en cada país
        AND bcat.name <> 'Oferton Frubana' AND bcat2.name <> 'Oferton Frubana'
        -- FILTER OFFER
        AND boida.offer_name like '{offer_date}%'
"""

In [18]:
data_broadleaf = read_connection_data_warehouse.run_read_prod_query(q_broadleaf)

In [19]:
data_broadleaf["discount_offer_usd"] = data_broadleaf["discount_offer_local"] * data_broadleaf["coefficient"]

In [20]:
data_broadleaf.head(10)

Unnamed: 0,order_id,close_date,region_code,responsable,growth_owners,order_item_id,offer_name,adjustment_reason,discount_offer_local,coefficient,discount_offer_usd
0,14621458,2022-09-13,SPO,GROWTH & OTHERS,SUPER DISCOUNTS,64476218,20220901_SPO_pratododia_piloto_NB_refrigerante...,20220901_SPO_pratododia_piloto_NB_refrigerante...,6.0,0.21052632,1.26315792
1,14437495,2022-09-07,SPO,GROWTH & OTHERS,SUPER DISCOUNTS,63794553,20220901_SPO_pratododia_piloto_NB_refrigerante...,20220901_SPO_pratododia_piloto_NB_refrigerante...,3.0,0.21052632,0.63157896
2,14665194,2022-09-14,SPO,GROWTH & OTHERS,SUPER DISCOUNTS,64634396,20220901_SPO_pratododia_piloto_NB_refrigerante...,20220901_SPO_pratododia_piloto_NB_refrigerante...,8.64,0.21052632,1.8189474048
3,15131889,2022-09-29,SPO,GROWTH & OTHERS,SUPER DISCOUNTS,66518690,20220901_SPO_pratododia_piloto_NB_refrigerante...,20220901_SPO_pratododia_piloto_NB_refrigerante...,8.28,0.21052632,1.7431579296
4,14475261,2022-09-07,SPO,GROWTH & OTHERS,SUPER DISCOUNTS,63908179,20220901_SPO_pratododia_piloto_NB_refrigerante...,20220901_SPO_pratododia_piloto_NB_refrigerante...,4.8,0.21052632,1.010526336
5,15132958,2022-09-28,SPO,GROWTH & OTHERS,SUPER DISCOUNTS,66421179,20220901_SPO_pratododia_piloto_NB_refrigerante...,20220901_SPO_pratododia_piloto_NB_refrigerante...,15.84,0.21052632,3.3347369088
6,14631599,2022-09-14,SPO,GROWTH & OTHERS,SUPER DISCOUNTS,64509074,20220901_SPO_pratododia_piloto_NB_refrigerante...,20220901_SPO_pratododia_piloto_NB_refrigerante...,12.24,0.21052632,2.5768421568
7,14430846,2022-09-09,SPO,GROWTH & OTHERS,SUPER DISCOUNTS,63998863,20220901_SPO_pratododia_piloto_NB_refrigerante...,20220901_SPO_pratododia_piloto_NB_refrigerante...,7.56,0.21052632,1.5915789792
8,15198699,2022-09-30,SPO,GROWTH & OTHERS,SUPER DISCOUNTS,66687438,20220901_SPO_pratododia_piloto_NB_refrigerante...,20220901_SPO_pratododia_piloto_NB_refrigerante...,11.52,0.21052632,2.4252632064
9,15129425,2022-09-29,SPO,GROWTH & OTHERS,SUPER DISCOUNTS,66408574,20220901_SPO_pratododia_piloto_NB_refrigerante...,20220901_SPO_pratododia_piloto_NB_refrigerante...,2.64,0.21052632,0.5557894848


## Data fact sales

In [21]:
# GMV per order item closed between `from_date` and `to_date`, with customer, SKU,
# (sub)category and the currency-conversion coefficient valid on the close date.
# The close-date filter is a half-open range so that the whole of `to_date` is included
# even if order_close_date carries a time component (NOTE(review): the SELECT casts it
# with ::date, which suggests it does - confirm the column type).
# NOTE(review): the NOT IN filter on dfscat.parent_description also discards rows with
# no matching category, so the LEFT JOIN on dim_category behaves like an inner join.
q_fact_sales = f"""SELECT
    fs.order_id,
    fs.order_item_id,
    ds.identifier_value as region_code,
    fs.product_discount as discount_item_local,
    dcus.source_id AS customer_source_id,
    dp.source_id as sku_id,
    dfscat.description AS subcategory,
    dfscat.parent_description AS category,
    fs.order_close_date::date as close_date,
    dcc.coefficient,
    sum(fs.gmv_local) as gmv_local

FROM
    dpr_sales.fact_sales fs
    INNER JOIN dpr_shared.dim_product dp on fs.dim_product=dp.product_id
    INNER JOIN dpr_shared.dim_site ds ON ds.site_id = fs.dim_site
    INNER JOIN dpr_sales.dim_status dst ON fs.dim_status = dst.status_id
    INNER JOIN dpr_shared.dim_customer dcus ON fs.dim_customer = dcus.customer_id
    LEFT JOIN dpr_shared.dim_category dfscat ON fs.dim_category = dfscat.category_id
    INNER JOIN dpr_shared.dim_country  dc  on dc.country_id  = ds.country_id
    INNER JOIN dpr_shared.dim_currency dcy on dcy.currency_id = dc.currency_id
    INNER JOIN dpr_shared.dim_currency_conversion dcc on dcc.currency_id = dcy.currency_id and fs.order_close_date >= dcc.start_date and fs.order_close_date <= dcc.end_date
where 1=1

    AND gmv_enabled=true
    AND fs.order_close_date >= '{from_date}'
    AND fs.order_close_date < ('{to_date}'::date + 1)
    AND dfscat.parent_description not in ('Super descontos!', 'Súper descuentos')
GROUP BY 1,2,3,4,5,6,7,8,9,10"""

In [22]:
data_fact_sales = read_connection_data_warehouse.run_read_dwd_query(q_fact_sales)

In [23]:
for col in ["discount_item", "gmv"]:
    data_fact_sales[f"{col}_usd"] = data_fact_sales[f"{col}_local"] * data_fact_sales["coefficient"]

In [24]:
data_fact_sales["subcategory_transformed"] = data_fact_sales["subcategory"].apply(organize_text)

In [25]:
data_fact_sales.head(10)

Unnamed: 0,order_id,order_item_id,region_code,discount_item_local,customer_source_id,sku_id,subcategory,category,close_date,coefficient,gmv_local,discount_item_usd,gmv_usd,subcategory_transformed
0,14164327,62996281,BHZ,0.0,76700003,313549,Legumes,Frutas e verduras,2022-09-01,0.21052632,3.92,0.0,0.8252631744,legumes
1,14275073,63094235,CMX,0.0,207121608,-303834,Tubérculos,Frutas & verduras,2022-09-01,0.05089059,18.0,0.0,0.91603062,tuberculos
2,14253729,63010671,SPO,2.34,19798250,112095,Verduras,Frutas e verduras,2022-09-01,0.21052632,28.62,0.4926315888,6.0252632784,verduras
3,14238425,63000104,SPO,2.88,128511128,-314737,Legumes,Frutas e verduras,2022-09-01,0.21052632,57.0,0.6063158016,12.00000024,legumes
4,14277370,63103277,BAQ,6124.8,215373918,132652,Pollo congelado,"Carne, pollo & pescados",2022-09-01,0.00026483,122496.0,1.622030784,32.44061568,pollo
5,14176531,62928643,PBC,0.0,120238268,415804,Verduras,Frutas & verduras,2022-09-01,0.05089059,49.5,0.0,2.519084205,verduras
6,14133201,62931973,SPO,0.0,58839977,74216,Verduras,Frutas e verduras,2022-09-01,0.21052632,5.28,0.0,1.1115789696,verduras
7,14198610,62775231,SPO,12.6,118411340,73559,Leite,Laticínios e ovos,2022-09-01,0.21052632,74.16,2.652631632,15.6126318912,leite
8,14250878,62991347,SPO,0.0,85930796,-314857,Legumes,Frutas e verduras,2022-09-01,0.21052632,9.63,0.0,2.0273684616,legumes
9,14281161,63117724,SPO,0.0,37584273,172017,Temperos,Mercearia,2022-09-01,0.21052632,21.98,0.0,4.6273685136,temperos


In [26]:
def fix_data_fact_sales(data_fact_sales):
    """
    Return a typed copy of the fact-sales frame.

    Id columns are cast to ``np.int32``, the discount / GMV amounts to
    ``np.float32`` and ``close_date`` ("YYYY-MM-DD") is parsed into
    ``datetime.date`` objects. The input frame is left untouched.

    Parameters
    ----------
    data_fact_sales : pandas.DataFrame
        Frame with the columns order_id, order_item_id, customer_source_id,
        sku_id, close_date, discount_item_local, discount_item_usd,
        gmv_local and gmv_usd.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and the dtypes described above.
    """
    int_columns = ('order_id', 'order_item_id', 'customer_source_id', 'sku_id')
    float_columns = ('discount_item_local', 'discount_item_usd', 'gmv_local', 'gmv_usd')

    dtypes = {col: np.int32 for col in int_columns}
    dtypes.update({col: np.float32 for col in float_columns})

    # astype returns a new frame, so the caller's object is not modified.
    fixed = data_fact_sales.astype(dtypes)
    fixed['close_date'] = pd.to_datetime(fixed['close_date'], format='%Y-%m-%d').dt.date

    return fixed

In [27]:
data_fact_sales = fix_data_fact_sales(data_fact_sales)

## Merge

In [28]:
data_merged = data_fact_sales.merge(data_broadleaf,how='left',on=['order_id','order_item_id','region_code','close_date'])

In [29]:
# Both merge inputs carry a `coefficient` column; keep the left-hand one
# (suffix "_x") under its plain name and discard the right-hand one ("_y").
data_merged = (
    data_merged
    .drop(columns=["coefficient_y"])
    .rename(columns={"coefficient_x": "coefficient"})
)

In [30]:
# Rows without a matching adjustment get an explicit "No Discount" label.
# The filled columns are assigned back: `df[col].fillna(..., inplace=True)` acts on a
# temporary Series and is not guaranteed to update the DataFrame (under pandas
# Copy-on-Write it never does).
label_columns = ['responsable', 'growth_owners', 'offer_name', 'adjustment_reason']
data_merged[label_columns] = data_merged[label_columns].fillna("No Discount")

In [31]:
# Rows without a matching adjustment have no offer discount: fill with 0.
# Assigned back instead of a chained `fillna(..., inplace=True)`, which acts on a
# temporary Series and does not reliably update the DataFrame.
discount_columns = ['discount_offer_local', 'discount_offer_usd']
data_merged[discount_columns] = data_merged[discount_columns].fillna(0)

In [32]:
data_merged.head(5)

Unnamed: 0,order_id,order_item_id,region_code,discount_item_local,customer_source_id,sku_id,subcategory,category,close_date,coefficient,gmv_local,discount_item_usd,gmv_usd,subcategory_transformed,responsable,growth_owners,offer_name,adjustment_reason,discount_offer_local,discount_offer_usd
0,14164327,62996281,BHZ,0.0,76700003,313549,Legumes,Frutas e verduras,2022-09-01,0.21052632,3.92,0.0,0.825263,legumes,No Discount,No Discount,No Discount,No Discount,0,0
1,14275073,63094235,CMX,0.0,207121608,-303834,Tubérculos,Frutas & verduras,2022-09-01,0.05089059,18.0,0.0,0.916031,tuberculos,No Discount,No Discount,No Discount,No Discount,0,0
2,14253729,63010671,SPO,2.34,19798250,112095,Verduras,Frutas e verduras,2022-09-01,0.21052632,28.620001,0.492632,6.025263,verduras,No Discount,No Discount,No Discount,No Discount,0,0
3,14238425,63000104,SPO,2.88,128511128,-314737,Legumes,Frutas e verduras,2022-09-01,0.21052632,57.0,0.606316,12.0,legumes,No Discount,No Discount,No Discount,No Discount,0,0
4,14277370,63103277,BAQ,6124.799805,215373918,132652,Pollo congelado,"Carne, pollo & pescados",2022-09-01,0.00026483,122496.0,1.622031,32.440617,pollo,No Discount,No Discount,No Discount,No Discount,0,0


# converted

In [33]:
data_converted = data_merged.loc[
    data_merged.growth_owners == 'SUPER DISCOUNTS',
    ['customer_source_id','offer_name']
].drop_duplicates()

In [34]:
data_converted['label'] = 1

In [35]:
dataset = data_segment.merge(
    data_converted,
    how='left',
    on=['offer_name','customer_source_id'],
)

In [36]:
# Customers with no matching row in data_converted did not convert: label 0.
# Assigned back instead of a chained `fillna(..., inplace=True)`, which acts on a
# temporary Series and does not reliably update the DataFrame.
dataset["label"] = dataset["label"].fillna(0)

In [37]:
dataset

Unnamed: 0,offer_id,customer_segment_id,offer_name,value_discount,max_uses_per_customer,max_uses_per_order,ms,subcategory,campaign_month,region_code,skus,customer_source_id,group,label
0,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,146869951,Treatment,0.0
1,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,158649897,Treatment,0.0
2,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,204518842,Treatment,0.0
3,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,79497049,Treatment,0.0
4,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,191965864,Treatment,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2153170,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,2022-09-01,VCP,"430899,430898,430895,430896,430643,430642,4306...",210368542,Control,0.0
2153171,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,2022-09-01,VCP,"430899,430898,430895,430896,430643,430642,4306...",175826304,Control,0.0
2153172,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,2022-09-01,VCP,"430899,430898,430895,430896,430643,430642,4306...",173718466,Control,0.0
2153173,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,2022-09-01,VCP,"430899,430898,430895,430896,430643,430642,4306...",158778666,Control,0.0


In [38]:
dataset["label"].mean()

0.04289386603504128

In [40]:
dataset.groupby("region_code").label.mean()

region_code
BAQ    0.063973
BHZ    0.029062
BOG    0.049430
CMX    0.040661
CWB    0.034606
GDL    0.031688
MDE    0.039016
PBC    0.048030
SPO    0.045070
VCP    0.043984
Name: label, dtype: float64

In [41]:
dataset.groupby("group").label.mean()

group
Control      0.012268
Treatment    0.050562
Name: label, dtype: float64

# Should convert: customers who bought an SKU covered by their offer

In [43]:
data_segment_skus = data_segment.loc[
    (data_segment.offer_name.str.contains('20220901')),
    ['customer_segment_id', 'region_code', 'skus']
].drop_duplicates()

In [44]:
data_segment_skus['skus_list'] = data_segment_skus.skus.apply(lambda x: x.split(","))

In [45]:
data_segment_skus_explode = data_segment_skus.explode('skus_list')

In [46]:
data_segment_skus_explode.rename(columns={'skus_list':'sku_id'},inplace=True)

In [47]:
data_segment_skus_explode['sku_id'] = data_segment_skus_explode['sku_id'].astype(int)

In [48]:
data_merged_segment = data_merged.loc[:,['customer_source_id','region_code','sku_id']].drop_duplicates().merge(
    data_segment_skus_explode,
    how='inner',
    on=['region_code','sku_id']
)

In [49]:
data_should_convert = data_merged_segment.loc[:,['customer_source_id','customer_segment_id','region_code']].merge(
    data_segment,
    how='inner',
    on=['customer_source_id','customer_segment_id','region_code']
)[['offer_name','customer_source_id']].drop_duplicates()

In [50]:
data_should_convert["label_should"] = 1

In [51]:
dataset = dataset.merge(data_should_convert,
                       how='left',
                       on=['offer_name','customer_source_id'])

In [52]:
dataset

Unnamed: 0,offer_id,customer_segment_id,offer_name,value_discount,max_uses_per_customer,max_uses_per_order,ms,subcategory,campaign_month,region_code,skus,customer_source_id,group,label,label_should
0,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,146869951,Treatment,0.0,
1,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,158649897,Treatment,0.0,
2,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,204518842,Treatment,0.0,
3,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,79497049,Treatment,0.0,
4,-14458086,80197,20220901_BAQ_other_piloto_NB_arroz_W35,7,1,5,other_piloto,arroz,2022-09-01,BAQ,281192440369,191965864,Treatment,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2153170,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,2022-09-01,VCP,"430899,430898,430895,430896,430643,430642,4306...",210368542,Control,0.0,
2153171,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,2022-09-01,VCP,"430899,430898,430895,430896,430643,430642,4306...",175826304,Control,0.0,
2153172,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,2022-09-01,VCP,"430899,430898,430895,430896,430643,430642,4306...",173718466,Control,0.0,
2153173,-14459945,79276,20220901_VCP_other_control_NB_feijao_W35,10,1,1,other_control,feijao,2022-09-01,VCP,"430899,430898,430895,430896,430643,430642,4306...",158778666,Control,0.0,


In [53]:
# Rows with no match in data_should_convert get label_should = 0.
# Assigned back instead of a chained `fillna(..., inplace=True)`, which acts on a
# temporary Series and does not reliably update the DataFrame.
dataset["label_should"] = dataset["label_should"].fillna(0)

In [54]:
dataset.customer_source_id.nunique()

103395

In [55]:
dataset[dataset.group == "Treatment"].customer_source_id.nunique()

83324

In [56]:
dataset[dataset.group == "Control"].customer_source_id.nunique()

21320

In [57]:
(1638249+13829+69910)*0.040598

69909.268824

In [58]:
1721988/83324

20.666170611108445

In [59]:
dataset.groupby(['label','group','label_should']).customer_source_id.count()

label  group      label_should
0.0    Control    0.0              411653
                  1.0               14244
       Treatment  0.0             1621381
                  1.0               13539
1.0    Control    1.0                5290
       Treatment  1.0               87068
Name: customer_source_id, dtype: int64

In [60]:
dataset.loc[
    (dataset.label==0)&
    (dataset.label_should==1)
].groupby('region_code').customer_source_id.count()/dataset.loc[dataset.label_should==1].groupby('region_code').customer_source_id.count()

region_code
BAQ    0.215503
BHZ    0.231067
BOG    0.251431
CMX    0.213585
CWB    0.223427
GDL    0.234968
MDE    0.317078
PBC    0.204781
SPO    0.219060
VCP    0.241813
Name: customer_source_id, dtype: float64

In [61]:
dataset.loc[
    (dataset.label==1) &
    (dataset.label_should.isin([0,1]))
].groupby('region_code').customer_source_id.count()/dataset.loc[dataset.label_should.isin([0,1])].groupby('region_code').customer_source_id.count()

region_code
BAQ    0.063973
BHZ    0.029062
BOG    0.049430
CMX    0.040661
CWB    0.034606
GDL    0.031688
MDE    0.039016
PBC    0.048030
SPO    0.045070
VCP    0.043984
Name: customer_source_id, dtype: float64

In [62]:
len(dataset),len(data_segment)

(2153175, 2153175)

# Should convert (subcategory): customers who bought in their offer's subcategory

In [63]:
data_should_convert_subcategory = data_merged.loc[
    :,['customer_source_id','region_code','subcategory_transformed']
].drop_duplicates().merge(
    data_segment,
    how='inner',
    left_on=['customer_source_id','region_code','subcategory_transformed'],
    right_on=['customer_source_id','region_code','subcategory'],
)[['offer_name','customer_source_id']].drop_duplicates()

In [64]:
data_should_convert_subcategory

Unnamed: 0,offer_name,customer_source_id
0,20220901_SPO_comidabrasileiraysaudavel_piloto_...,105462511
1,20220901_SPO_pratododia_control_NB_azeitesoleo...,165339230
2,20220901_SPO_other_control_NB_verduras_W35,211480746
3,20220901_BOG_cafeteria_piloto_NB_azucaryendulz...,54162330
4,20220901_BOG_comidarapida_piloto_NB_detergente...,63060118
...,...,...
154214,20220901_BOG_comidaespecializada_piloto_NB_vas...,45724658
154215,20220901_MDE_other_control_NB_tuberculos_W35,135340886
154216,20220901_SPO_pratododia_piloto_NB_azeitesoleos...,22279199
154217,20220901_SPO_pratododia_piloto_NB_feijao_W35,50896991


In [65]:
data_should_convert_subcategory["label_should_subcategory"] = 1

In [66]:
dataset = dataset.merge(data_should_convert_subcategory,
                       how='left',
                       on=['offer_name','customer_source_id'])

In [67]:
# Rows with no match in data_should_convert_subcategory get label 0.
# Assigned back instead of a chained `fillna(..., inplace=True)`, which acts on a
# temporary Series and does not reliably update the DataFrame.
dataset["label_should_subcategory"] = dataset["label_should_subcategory"].fillna(0)

In [68]:
dataset.groupby(['label','label_should','label_should_subcategory']).customer_source_id.count()

label  label_should  label_should_subcategory
0.0    0.0           0.0                         1998947
                     1.0                           34087
       1.0           0.0                               2
                     1.0                           27781
1.0    1.0           0.0                               7
                     1.0                           92351
Name: customer_source_id, dtype: int64

In [69]:
len(dataset),len(data_segment)

(2153175, 2153175)

In [70]:
# Number of distinct customers in the Treatment group.
dataset.loc[dataset["group"] == "Treatment", "customer_source_id"].nunique()

83324

In [71]:
# Number of distinct customers in the Control group.
dataset.loc[dataset["group"] == "Control", "customer_source_id"].nunique()

21320

In [72]:
# Distinct Control customers with at least one row where label_should == 1.
control_should_rows = dataset["group"].eq("Control") & dataset["label_should"].eq(1)
dataset.loc[control_should_rows, "customer_source_id"].nunique()

8952

In [73]:
# Distinct Treatment customers with at least one row where label_should == 1.
treatment_should_rows = dataset["group"].eq("Treatment") & dataset["label_should"].eq(1)
dataset.loc[treatment_should_rows, "customer_source_id"].nunique()

39352

In [74]:
def _subcategory_hits_per_customer(group_name):
    """Sum label_should_subcategory per customer for one experiment group.

    Returns a frame with one row per customer_source_id in `group_name` and
    the summed flag in the `label_should_subcategory` column.
    """
    group_rows = dataset.loc[dataset["group"] == group_name]
    per_customer = group_rows.groupby("customer_source_id")["label_should_subcategory"].sum()
    return per_customer.reset_index()


Treatment_sub = _subcategory_hits_per_customer("Treatment")
Control_sub = _subcategory_hits_per_customer("Control")

In [75]:
# Control customers whose summed subcategory flag is positive.
control_has_hit = Control_sub["label_should_subcategory"] > 0
Control_sub.loc[control_has_hit, "customer_source_id"].nunique()

10318

In [76]:
# Treatment customers whose summed subcategory flag is positive.
treatment_has_hit = Treatment_sub["label_should_subcategory"] > 0
Treatment_sub.loc[treatment_has_hit, "customer_source_id"].nunique()

42968

In [92]:
# Uplift, in percentage points, of the share of customers with a positive
# subcategory flag: Treatment share minus Control share.
# The four counts are recomputed from the frames rather than copied by hand
# from earlier cell outputs, so the figure cannot go stale when the data or the
# date range changes. With the current data they equal 42968, 83324, 10318, 21320.
treatment_total = dataset.loc[dataset["group"] == "Treatment", "customer_source_id"].nunique()
control_total = dataset.loc[dataset["group"] == "Control", "customer_source_id"].nunique()
treatment_hits = Treatment_sub.loc[
    Treatment_sub["label_should_subcategory"] > 0, "customer_source_id"
].nunique()
control_hits = Control_sub.loc[
    Control_sub["label_should_subcategory"] > 0, "customer_source_id"
].nunique()

(treatment_hits / treatment_total - control_hits / control_total) * 100

3.1715031257984916

In [78]:
# Treatment-minus-Control difference of two customer shares, as a fraction
# (not multiplied by 100). The denominators 83324 / 21320 equal the Treatment /
# Control unique-customer counts printed earlier in this notebook.
# NOTE(review): the numerators 27042 and 6002 are hard-coded and not derived in
# the visible cells — confirm where they come from.
27042/83324 - 6002/21320

0.0430206487066514

In [79]:
# Treatment-minus-Control difference of two customer shares, as a fraction
# (not multiplied by 100). The denominators 83324 / 21320 equal the Treatment /
# Control unique-customer counts printed earlier in this notebook.
# NOTE(review): the numerators 17160 and 3515 are hard-coded and not derived in
# the visible cells — confirm where they come from.
17160/83324 - 3515/21320

0.04107439770590143

# ARPU analysis

In [80]:
# Distinct (customer, region, group, skus) combinations among the data_segment
# rows whose offer_name contains '20220901'.
offer_name_matches = data_segment['offer_name'].str.contains('20220901')
sku_columns = ['customer_source_id', 'region_code', 'group', 'skus']

data_segment_skus_2 = (
    data_segment
    .loc[offer_name_matches, sku_columns]
    .drop_duplicates()
)

In [81]:
# Split the comma-separated `skus` string into a list of SKU tokens per row.
# The vectorised `.str.split` accessor replaces the per-row Python lambda and
# propagates missing values instead of raising AttributeError on them.
data_segment_skus_2['skus_list'] = data_segment_skus_2['skus'].str.split(",")

In [82]:
# One row per (customer, region, group, skus, single SKU token); the original
# row index is repeated for every token of the same list.
data_segment_skus_explode_2 = data_segment_skus_2.explode(column='skus_list')

In [83]:
# After exploding, the column holds a single SKU per row, so name it sku_id.
data_segment_skus_explode_2 = data_segment_skus_explode_2.rename(
    columns={'skus_list': 'sku_id'}
)

In [84]:
# Cast the SKU tokens (strings produced by the split) to integers.
data_segment_skus_explode_2 = data_segment_skus_explode_2.assign(
    sku_id=lambda frame: frame['sku_id'].astype(int)
)

In [85]:
# Drop fully identical rows, keeping the first occurrence of each.
data_segment_skus_explode_2 = data_segment_skus_explode_2.drop_duplicates(keep='first')

In [86]:
# Display the exploded, de-duplicated frame (one row per customer / offer SKU list / sku_id).
data_segment_skus_explode_2

Unnamed: 0,customer_source_id,region_code,group,skus,sku_id
0,146869951,BAQ,Treatment,281192440369,281192
0,146869951,BAQ,Treatment,281192440369,440369
1,158649897,BAQ,Treatment,281192440369,281192
1,158649897,BAQ,Treatment,281192440369,440369
2,204518842,BAQ,Treatment,281192440369,281192
...,...,...,...,...,...
2153174,211113451,VCP,Control,"430899,430898,430895,430896,430643,430642,4306...",430896
2153174,211113451,VCP,Control,"430899,430898,430895,430896,430643,430642,4306...",430643
2153174,211113451,VCP,Control,"430899,430898,430895,430896,430643,430642,4306...",430642
2153174,211113451,VCP,Control,"430899,430898,430895,430896,430643,430642,4306...",430657


In [87]:
# Total gmv_usd and discount_item_usd per
# (region, customer, SKU, transformed subcategory, responsable).
# The aggregations are given as the string 'sum' instead of np.sum: passing the
# NumPy callable to agg is deprecated since pandas 2.1 (FutureWarning) and is
# planned to stop mapping to the optimised groupby sum.
data_merged_2 = (
    data_merged
    .groupby(by=['region_code', 'customer_source_id', 'sku_id', 'subcategory_transformed', 'responsable'])
    .agg({'gmv_usd': 'sum', 'discount_item_usd': 'sum'})
    .reset_index()
)

In [88]:
# Keep only the aggregated rows whose (region, SKU, customer) also appears in
# the exploded offer-SKU frame; this adds the `group` and `skus` columns.
sku_customer_keys = ['region_code', 'sku_id', 'customer_source_id']
data_merged_segment_2 = pd.merge(
    data_merged_2,
    data_segment_skus_explode_2,
    on=sku_customer_keys,
    how='inner',
)

In [89]:
def _summarise_offer_group(rows):
    """Compute the KPIs for one (region, subcategory, group) slice.

    Returns a Series with the distinct customer count, total GMV, total
    discount, both totals divided by the customer count, and the discount as a
    percentage of GMV.
    """
    n_customers = rows["customer_source_id"].nunique()
    gmv_total = rows["gmv_usd"].sum()
    discount_total = rows["discount_item_usd"].sum()
    return pd.Series(
        {
            "customers": n_customers,
            "gmv_usd": gmv_total,
            "discount_item_usd": discount_total,
            "avg_gmv": gmv_total / n_customers,
            "avg_discount": float(discount_total / n_customers),
            "% dct": float(discount_total) * 100.0 / gmv_total,
        }
    )


# Original author's note: a future change is expected at this groupby,
# involving padre_sku_id.
(
    data_merged_segment_2
    .groupby(by=['region_code', 'subcategory_transformed', 'group'])
    .apply(_summarise_offer_group)
    .reset_index()
)

Unnamed: 0,region_code,subcategory_transformed,group,customers,gmv_usd,discount_item_usd,avg_gmv,avg_discount,% dct
0,BAQ,aceitesygrasas,Control,85.0,2276.999023,58.248310,26.788224,0.685274,2.558117
1,BAQ,aceitesygrasas,Treatment,354.0,11176.505859,431.038757,31.572050,1.217624,3.856650
2,BAQ,alimentoscongelados,Control,9.0,639.522888,29.043541,71.058099,3.227060,4.541439
3,BAQ,alimentoscongelados,Treatment,30.0,562.991821,37.747879,18.766394,1.258263,6.704872
4,BAQ,arroz,Control,33.0,713.183350,35.022312,21.611617,1.061282,4.910702
...,...,...,...,...,...,...,...,...,...
602,VCP,sucosechas,Treatment,46.0,428.939087,28.648420,9.324763,0.622792,6.678902
603,VCP,tuberculos,Control,32.0,131.233383,9.368421,4.101043,0.292763,7.138748
604,VCP,tuberculos,Treatment,141.0,688.632568,67.597893,4.883919,0.479418,9.816250
605,VCP,verduras,Control,37.0,149.592926,13.442106,4.043052,0.363300,8.985790


In [90]:
# Display the merged frame that the per-group summary above is computed from.
data_merged_segment_2

Unnamed: 0,region_code,customer_source_id,sku_id,subcategory_transformed,responsable,gmv_usd,discount_item_usd,group,skus
0,BAQ,-254535,-302280,tuberculos,GROWTH & OTHERS,15.006261,2.535588,Treatment,"-302280,-302028,-300387,-302025,-302949,18446"
1,BAQ,-254535,387458,implementosdeaseo,No Discount,0.763277,0.180190,Treatment,57867577855775557761387472387471387458
2,BAQ,-253984,18439,verduras,GROWTH & OTHERS,0.998409,0.195762,Treatment,"258696,73211,73267,258690,-300645,-300657,4013..."
3,BAQ,-253984,63788,verduras,GROWTH & OTHERS,3.879760,0.800846,Treatment,"258696,73211,73267,258690,-300645,-300657,4013..."
4,BAQ,-253984,222614,aceitesygrasas,GROWTH & OTHERS,31.597624,1.588344,Treatment,222614
...,...,...,...,...,...,...,...,...,...
172096,VCP,215028610,509542,ovos,GROWTH & OTHERS,7.603158,1.031579,Treatment,509542
172097,VCP,215037634,434987,frutas,GROWTH & OTHERS,1.357895,0.218947,Treatment,"434987,434983,434895,434925,434929,435001,4348..."
172098,VCP,215096568,428403,leite,GROWTH & OTHERS,16.686316,2.298947,Treatment,428403428402429117429118
172099,VCP,215096568,430316,cervejas,GROWTH & OTHERS,30.075790,2.374737,Treatment,"430296,430295,430297,430290,430292,430291,4303..."
