In [1]:
#//----------------------------
#//LIBRARIES
    #Math
import math
    #Numeric Python
import numpy as np
    #Pandas (dataframes)
import pandas as pd
    #datetime for fate manipulation
from datetime import date, datetime, timedelta  
    #Regex for advanced string matching
import re
    #for time related stuff
import time 
    #json library
import json
    #Analyst tools
import sys
sys.path.append('../')
from analysts_tools.growth import *
    #Procurement tools
from analysts_tools.redash_methods import *
from analystcommunity.read_connection_data_warehouse import run_read_dwd_query, run_read_prod_query

import random
import datetime

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from procurement_lib import GoogleSheet

In [2]:
todays_date = datetime.datetime.today().strftime('%Y-%m-%d')

In [3]:
query = """
select
    CASE WHEN competitor.competitor_name ILIKE '%assaí%' THEN 'assai' ELSE 'atacadao' END AS competitor_name,
    site.identifier_value as site_code,
    quotation_date.full_date AS quotation_date,
    su.source_id,
    ROUND(MEDIAN(cpp.product_selected_price),2)::float as price
from dpr_product_pricing.fact_collected_product_prices cpp
    inner join dpr_shared.dim_date quotation_date
        on cpp.dim_quotation_date = quotation_date.date_id
    inner join dpr_shared.dim_time quotation_time
        on cpp.dim_quotation_time = quotation_time.time_id
    inner join dpr_shared.dim_site site
        on cpp.dim_site = site.site_id
    inner join dpr_shared.dim_category cat
        on cpp.dim_category = cat.category_id
    inner join dpr_product_pricing.dim_product_outlier_type outlier_type
        on cpp.dim_outlier_type = outlier_type.outlier_type_id
    inner join dpr_product_pricing.dim_product_source_type source_type
        on cpp.dim_source_type = source_type.source_type_id
    inner join dpr_product_pricing.dim_product_competitor competitor
        on cpp.dim_competitor = competitor.competitor_id
    inner join dpr_product_pricing.dim_product_competitor_type competitor_type
        on(
            case
                when cpp.super_category = 'Fruver'
                    then competitor.product_competitor_type_id_fruver = competitor_type.competitor_type_id
                when cpp.super_category = 'Multicategoría'
                    then competitor.product_competitor_type_id_multicategoria = competitor_type.competitor_type_id
            end
        )
    inner join dpr_shared.dim_stock_unit su
        on cpp.dim_stock_unit = su.stock_unit_id
where quotation_date.full_date >= DATE(CURRENT_DATE-14)
    --AND quotation_date.full_date <= '2024-05-22'
    AND competitor.competitor_name NOT ILIKE '%cayena%'
    AND site.identifier_value IN ('SPO','CWB','VCP','BHZ')
    AND (competitor.competitor_name ILIKE '%assaí%' OR competitor.competitor_name ILIKE '%atacadao%' OR competitor.competitor_name ILIKE '%atacadão%')
GROUP BY 1,2,3,4
"""
df_zkkkkk = run_read_dwd_query(query)

df_zkkkkk = df_zkkkkk.dropna().reset_index(drop=True)
df_zkkkkk['lifetime'] = 8

In [4]:
# Ensure dataframe is sorted by 'quotation_date'
df_zkkkkk = df_zkkkkk.sort_values(by='quotation_date')

# Generate the required rows for missing dates
new_rows = []

for (competitor, source_id), group in df_zkkkkk.groupby(['competitor_name', 'source_id']):
    group = group.sort_values(by='quotation_date')
    last_known_price = None
    last_known_date = None
    lifetime = 8
    
    for current_index in range(len(group)):
        current_date = group.iloc[current_index]['quotation_date']
        price = group.iloc[current_index]['price']
        
        # If this is not the first iteration, fill in missing dates
        if last_known_date is not None:
            days_diff = (current_date - last_known_date).days
            if days_diff > 1:
                for j in range(1, min(days_diff, lifetime + 1)):
                    new_date = last_known_date + timedelta(days=j)
                    new_row = {
                        'site_code': group.iloc[current_index]['site_code'],
                        'quotation_date': new_date,
                        'competitor_name': competitor,
                        'source_id': source_id,
                        'price': last_known_price,
                        'lifetime': lifetime - j
                    }
                    new_rows.append(new_row)
                    
                    # Stop if we reach a new datapoint date
                    if new_date + timedelta(days=1) == current_date:
                        break
        
        # Update the last known values and reset lifetime
        last_known_price = price
        last_known_date = current_date
        lifetime = 8  # Reset lifetime

    # After processing all known dates for the group, continue generating rows until lifetime reaches 0
    while lifetime > 0:
        last_known_date += timedelta(days=1)
        new_row = {
            'site_code': group.iloc[-1]['site_code'],
            'quotation_date': last_known_date,
            'competitor_name': competitor,
            'source_id': source_id,
            'price': last_known_price,
            'lifetime': lifetime - 1
        }
        new_rows.append(new_row)
        lifetime -= 1

# Append new rows to the dataframe
df_zkkkkk = df_zkkkkk.append(new_rows, ignore_index=True)

# Sort the final dataframe
df_zkkkkk = df_zkkkkk.sort_values(by=['competitor_name', 'source_id', 'quotation_date'])
df_zkkkkk['replica'] = df_zkkkkk['lifetime'] == 8

In [5]:
df_bench = df_zkkkkk.copy()
df_bench['quotation_date'] = pd.to_datetime(df_bench['quotation_date'])

In [6]:
df_bench.site_code.value_counts()

SPO    29615
BHZ    16935
VCP    15567
CWB    12518
Name: site_code, dtype: int64

In [7]:
query = """
WITH RECURSIVE calendar(calendar_date) AS (
  SELECT DATE_TRUNC('day', DATE(GETDATE()) - INTERVAL '700 day')
  UNION ALL
  SELECT calendar_date + INTERVAL '1 day'
  FROM calendar
  WHERE calendar_date BETWEEN DATE_TRUNC('day', DATE(GETDATE()) - INTERVAL '700 day') AND DATE(GETDATE() - 1) 
),

info AS (
SELECT
    DATE(coalesce(prices.last_modified_at, prices.created_at)) as created_at,
    pp.frida_id as source_id,
    MIN(coalesce(tiers.tax_price, prices.tax_price)) as price,
    MIN(coalesce(tiers.sale_price, prices.sale_price)) as net_price

FROM postgres_growth."growth_pricing.prices_history" prices
LEFT JOIN postgres_growth."growth_pricing.price_tiers_history" tiers ON prices.id = tiers.price_history_id
LEFT JOIN postgres_growth."growth_pricing.skus" skus ON prices.sku_id = skus.id
LEFT JOIN postgres_main_co."purchase_orders.products" p ON skus.sku_id = p.frida_id
LEFT JOIN postgres_main_co."purchase_orders.products" pp ON COALESCE(p.parent_id, p.id) = pp.id

WHERE DATE(prices.created_at) >= DATE_TRUNC('day', DATE(GETDATE()) - INTERVAL '700 day')
 AND p.region_code IN ('SPO','BHZ','CWB','VCP')
 AND p.deleted_at IS NULL
 AND prices.created_by NOT ILIKE '%CATALOG%'
 AND pp.product_category_id IN (5,6,7,8,9,10,13,18) -- 1 ES FRUVER
 AND pp.frida_id IN {skus}
GROUP BY 1,2--,3,4
),

done AS (
SELECT
  DATE(c.calendar_date) AS quotation_date,
  --s.region,
  --s.parent_product_name,
  (s.source_id)::int as source_id,
  LAG(i.price IGNORE NULLS) OVER (PARTITION BY s.source_id ORDER BY c.calendar_date)::FLOAT AS p_price_tool,
  LAG(i.net_price IGNORE NULLS) OVER (PARTITION BY s.source_id ORDER BY c.calendar_date)::FLOAT AS net_price_tool


FROM calendar c
CROSS JOIN (SELECT DISTINCT source_id FROM info) s
LEFT JOIN info i ON c.calendar_date = i.created_at-1 AND s.source_id = i.source_id
)

SELECT *
FROM done
WHERE net_price_tool IS NOT NULL
 AND quotation_date >= DATE(CURRENT_DATE - 14)
""".format(skus=tuple(df_bench.source_id.unique()))
df = run_read_prod_query(query)  
df['quotation_date'] = pd.to_datetime(df['quotation_date'])

In [8]:
df = pd.merge(df_bench, df, left_on=['source_id','quotation_date'], right_on=['source_id','quotation_date'], how='inner')

In [10]:
query = """
SELECT
    s.identifier_value AS site_code,
    DATE(fs.order_submitted_date) AS quotation_date,
    cat.parent_description AS cat,
    sup.source_id,
    sup.card_description,
    (SUM(fs.gmv_pxq_local)/4.75)::FLOAT AS gmv_usd,
    gmv_usd/SUM(gmv_usd) OVER (PARTITION BY s.identifier_value)::FLOAT AS gmv_mix,
    AVG(COALESCE(inventory_p_fin,cogs_p_day))::FLOAT AS costo

FROM dpr_sales.fact_sales                   fs
INNER JOIN dpr_shared.dim_site              s   ON s.site_id = fs.dim_site
INNER JOIN dpr_shared.dim_product           dp  ON dp.product_id = fs.dim_product
INNER JOIN dpr_shared.dim_category          cat ON cat.category_id = dp.category_id
INNER JOIN dpr_shared.dim_stock_unit        su  ON su.product_id = fs.dim_product
INNER JOIN dpr_shared.dim_stock_unit        sup  ON nvl(nullif(su.source_parent_id,0),su.source_id) = sup.source_id
LEFT JOIN dpr_cross_business.fact_cross_business_insights m ON m.dim_stock_unit = sup.stock_unit_id AND m.dim_date = fs.dim_submitted_date

WHERE 
    fs.gmv_enabled = TRUE
    AND fulfillment_order_status NOT IN ('CANCELLED', 'ARCHIVED','No value')
    AND fs.fb_order_status_id IN (1,6,7,8)
    AND fs.is_deleted = FALSE
    AND cat.super_category = 'Multicategoría'
    AND fs.dim_status = 1
    AND dp.is_slot = 'false'
    AND fs.gmv_pxq_local > 0
    AND s.identifier_value IN ('SPO','CWB','VCP','BHZ')
    AND DATE(fs.order_submitted_date) >= DATE(CURRENT_DATE-14)
GROUP BY 1,2,3,4,5
HAVING costo > 0
"""
df_gmv = run_read_dwd_query(query)
df_gmv['quotation_date'] = pd.to_datetime(df_gmv['quotation_date'])

In [11]:
query = """
 SELECT 
        sup.source_id,
        type AS cluster,
        ROW_NUMBER() OVER (PARTITION BY sup.source_id ORDER BY c.last_modified_at DESC, cluster DESC) AS rn
    FROM 
        dpr_product_pricing.dim_sku_cluster_period c
     INNER JOIN dpr_shared.dim_stock_unit        su  ON su.sku = c.sku
     INNER JOIN dpr_shared.dim_stock_unit        sup  ON nvl(nullif(su.source_parent_id,0),su.source_id) = sup.source_id
    where 
        c.site_id in (4,6,9,11)
"""
df_type = run_read_dwd_query(query)
df_type = df_type.loc[df_type.rn == 1,['source_id','cluster']].copy()

In [12]:
df_gmv = pd.merge(df_gmv, df_type, left_on=['source_id'], right_on=['source_id'], how='left')
df_gmv['cluster'] = df_gmv['cluster'].fillna('TAIL')

In [13]:
df_gmv

Unnamed: 0,site_code,quotation_date,cat,source_id,card_description,gmv_usd,gmv_mix,costo,cluster
0,BHZ,2024-06-20,Mercearia,634043,Ervilha Olé Lata 170g,10.4757,1.642768e-05,1.6994,MID
1,BHZ,2024-06-20,Bebidas,453807,Refrigerante Guaraná Antarctica Sem Açúcar 350...,185.3810,2.907089e-04,2.6312,KVI
2,BHZ,2024-06-20,Laticínios e ovos,438839,Queijo Mussarela 3.7kg - Gardingo,1575.0231,2.469903e-03,122.2451,KVI
3,BHZ,2024-06-20,"Carnes, aves e peixes",263385,Salsicha Congelada - SEARA,161.8315,2.537793e-04,7.1436,KVI
4,BHZ,2024-06-20,Mercearia,640998,Molho de Tomate Tradicional Colonial 190g,1.3136,2.059948e-06,1.1976,MID
...,...,...,...,...,...,...,...,...,...
20758,SPO,2024-06-30,"Carnes, aves e peixes",417749,Linguiça Reta Defumada - Rezende,35.2894,9.417088e-06,18.5032,TAIL
20759,SPO,2024-06-30,Bebidas,484693,Kapo Laranja 200ml,6.0126,1.604481e-06,2.1499,TAIL
20760,SPO,2024-06-30,Mercearia,633849,Cogumelos Inteiros Olé 80g,3.0126,8.039219e-07,3.5825,TAIL
20761,SPO,2024-06-30,Mercearia,509498,Palmito Golden Palm Salada Flex 200g,21.2547,5.671884e-06,7.7635,TAIL


In [31]:
df_all = pd.merge(df_gmv, df.groupby(['site_code', 'quotation_date', 'source_id']).agg(
           price=('price',np.min),
           p_price_tool=('p_price_tool',np.mean),
           net_price_tool=('net_price_tool',np.mean)
       ).reset_index(), left_on=['site_code','source_id','quotation_date'], right_on=['site_code','source_id','quotation_date'], how='left')

df_assai = pd.merge(df_gmv, df.loc[df.competitor_name=='assai'], left_on=['site_code','source_id','quotation_date'], right_on=['site_code','source_id','quotation_date'], how='left')
df_atacado = pd.merge(df_gmv, df.loc[df.competitor_name=='atacadao'], left_on=['site_code','source_id','quotation_date'], right_on=['site_code','source_id','quotation_date'], how='left')

In [32]:
df_assai['gpi'] = df_assai.p_price_tool/df_assai.price
df_assai['npi'] = df_assai.net_price_tool/df_assai.price

df_assai['mg'] = 1-(df_assai.costo/df_assai.p_price_tool)
df_assai['nmg'] = 1-(df_assai.costo/df_assai.net_price_tool)

df_assai['comp_mg'] = 1-(df_assai.costo/df_assai.price)

# Filtrar filas donde gpi está fuera del rango 50 a 200
df_assai = df_assai[(df_assai['gpi'] >= 0.5) & (df_assai['gpi'] <= 2)]

In [33]:
df_atacado['gpi'] = df_atacado.p_price_tool/df_atacado.price
df_atacado['npi'] = df_atacado.net_price_tool/df_atacado.price

df_atacado['mg'] = 1-(df_atacado.costo/df_atacado.p_price_tool)
df_atacado['nmg'] = 1-(df_atacado.costo/df_atacado.net_price_tool)

df_atacado['comp_mg'] = 1-(df_atacado.costo/df_atacado.price)

# Filtrar filas donde gpi está fuera del rango 50 a 200
df_atacado = df_atacado[(df_atacado['gpi'] >= 0.5) & (df_atacado['gpi'] <= 2)]

In [34]:
df_all['gpi'] = df_all.p_price_tool/df_all.price
df_all['npi'] = df_all.net_price_tool/df_all.price

df_all['mg'] = 1-(df_all.costo/df_all.p_price_tool)
df_all['nmg'] = 1-(df_all.costo/df_all.net_price_tool)

df_all['comp_mg'] = 1-(df_all.costo/df_all.price)

# Filtrar filas donde gpi está fuera del rango 50 a 200
df_all = df_all[(df_all['gpi'] >= 0.5) & (df_all['gpi'] <= 2)]

In [35]:
# Define a function to apply the custom aggregation
def custom_ventas(group, column):
    return (100.00 * group[column] * group['gmv_mix']).sum()/group['gmv_mix'].sum()
merge_df2 = df_assai.dropna(how='any')
# Group by 'city' and 'year_week', aggregate multiple columns with custom aggregation
df_final_assai = merge_df2.loc[merge_df2.site_code == 'SPO'].groupby(['site_code','cat']).apply(lambda group: pd.Series({
    'gmv_mix': 100.00*group['gmv_mix'].sum(),  # Compute sum of gmv_mix directly
    'skus': group['source_id'].nunique(),
    'mg': custom_ventas(group, 'mg'),
    'nmg': custom_ventas(group, 'nmg'),
    'comp_mg': custom_ventas(group, 'comp_mg'),
    'gpi': custom_ventas(group, 'gpi'),
    'npi': custom_ventas(group, 'npi')
})).reset_index()
print('ASSAI INFO: MEDIANA')
df_final_assai

ASSAI INFO: MEDIANA


Unnamed: 0,site_code,cat,gmv_mix,skus,mg,nmg,comp_mg,gpi,npi
0,SPO,Bebidas,10.48618,97.0,13.001807,10.590935,15.537719,96.803129,94.368793
1,SPO,"Carnes, aves e peixes",7.643672,56.0,11.830235,10.512014,13.692956,98.18893,96.675231
2,SPO,Congelados,1.888974,22.0,24.365667,23.806535,19.947963,105.983719,105.18911
3,SPO,Descartáveis,0.328306,12.0,18.226126,14.110539,15.203116,103.359449,98.716412
4,SPO,Laticínios e ovos,17.281282,47.0,8.19398,6.267232,7.327215,100.781536,98.706217
5,SPO,Limpeza e higiene,2.39242,46.0,15.87456,15.362842,19.29598,95.69509,95.151016
6,SPO,Mercearia,31.713431,222.0,11.281868,8.868589,8.026143,103.563452,100.895524


In [42]:
# Define a function to apply the custom aggregation
def custom_ventas(group, column):
    return (100.00 * group[column] * group['gmv_mix']).sum()/group['gmv_mix'].sum()
merge_df2 = df_atacado.dropna(how='any')
# Group by 'city' and 'year_week', aggregate multiple columns with custom aggregation
df_final_atacado = merge_df2.loc[merge_df2.site_code == 'SPO'].groupby(['site_code','cat','cluster']).apply(lambda group: pd.Series({
    'gmv_mix': 100.00*group['gmv_mix'].sum(),  # Compute sum of gmv_mix directly
    'skus': group['source_id'].nunique(),
    'mg': custom_ventas(group, 'mg'),
    'nmg': custom_ventas(group, 'nmg'),
    'comp_mg': custom_ventas(group, 'comp_mg'),
    'gpi': custom_ventas(group, 'gpi'),
    'npi': custom_ventas(group, 'npi')
})).reset_index()
print('ATACADAO INFO: AVG')
df_final_atacado

ATACADAO INFO: AVG


Unnamed: 0,site_code,cat,cluster,gmv_mix,skus,mg,nmg,comp_mg,gpi,npi
0,SPO,Bebidas,KVI,9.444806,24.0,12.620504,9.532974,15.718092,96.293401,93.290232
1,SPO,Bebidas,MID,2.866529,36.0,11.556654,10.643386,14.578327,96.468299,95.556399
2,SPO,Bebidas,TAIL,0.711381,47.0,14.908068,14.409903,16.036781,98.584666,98.017331
3,SPO,"Carnes, aves e peixes",KVI,6.844145,19.0,10.975184,9.646852,13.852305,97.064899,95.70978
4,SPO,"Carnes, aves e peixes",MID,1.260945,20.0,13.052316,11.423872,4.131285,109.916143,108.00378
5,SPO,"Carnes, aves e peixes",TAIL,0.687261,30.0,16.165781,16.114717,17.544286,98.886245,98.811903
6,SPO,Congelados,KVI,1.66906,4.0,25.57292,24.940116,25.51567,100.080623,99.233655
7,SPO,Congelados,MID,0.176014,7.0,9.620109,9.620109,3.412678,106.899729,106.899729
8,SPO,Congelados,TAIL,0.045779,9.0,20.458394,20.458394,11.103338,113.955484,113.955484
9,SPO,Descartáveis,KVI,0.102565,2.0,13.224815,11.003187,6.455989,107.864736,105.187891


In [37]:
# Define a function to apply the custom aggregation
def custom_ventas(group, column):
    return (100.00 * group[column] * group['gmv_mix']).sum()/group['gmv_mix'].sum()
merge_df2 = df_all.dropna(how='any')
# Group by 'city' and 'year_week', aggregate multiple columns with custom aggregation
df_final_all = merge_df2.loc[merge_df2.site_code == 'SPO'].groupby(['site_code','cat']).apply(lambda group: pd.Series({
    'gmv_mix': 100.00*group['gmv_mix'].sum(),  # Compute sum of gmv_mix directly
    'skus': group['source_id'].nunique(),
    'mg': custom_ventas(group, 'mg'),
    'nmg': custom_ventas(group, 'nmg'),
    'comp_mg': custom_ventas(group, 'comp_mg'),
    'gpi': custom_ventas(group, 'gpi'),
    'npi': custom_ventas(group, 'npi')
})).reset_index()
print('ALL INFO: MIN de las MEDIANAS')
df_final_all

ALL INFO: MIN de las MEDIANAS


Unnamed: 0,site_code,cat,gmv_mix,skus,mg,nmg,comp_mg,gpi,npi
0,SPO,Bebidas,13.342381,118.0,12.764507,10.354245,13.64395,98.743264,96.283397
1,SPO,"Carnes, aves e peixes",8.884257,72.0,11.779402,10.521022,10.773841,101.466253,100.083369
2,SPO,Congelados,1.928071,22.0,24.059688,23.511893,18.744719,107.064024,106.284163
3,SPO,Descartáveis,0.404818,14.0,18.40972,14.606514,14.752291,104.181101,99.8519
4,SPO,Laticínios e ovos,19.242179,57.0,8.398946,6.453782,7.129715,101.182265,99.077138
5,SPO,Limpeza e higiene,2.70074,48.0,15.540708,15.087409,14.044249,101.548727,101.054415
6,SPO,Mercearia,36.65971,271.0,11.519115,9.246424,6.923004,105.115782,102.574479


In [43]:
df_atacado.to_csv('info_cami.csv',index=False)