In [1]:
#//----------------------------
#//LIBRARIES
    #Math
import math
    #Numeric Python
import numpy as np
    #Pandas (dataframes)
import pandas as pd
    #datetime for date manipulation
from datetime import date, datetime, timedelta  
    #Regex for advanced string matching
import re
    #for time related stuff
import time 
    #json library
import json
    #Analyst tools
import sys
sys.path.append('../')
from analysts_tools.growth import *
    #Procurement tools
from procurement_lib import send_slack_notification
from procurement_lib import redash
from analysts_tools.redash_methods import *
from analystcommunity.read_connection_data_warehouse import run_read_dwd_query

import random
import time
import datetime

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from procurement_lib import GoogleSheet

In [3]:
# Today's local date rendered as a 'YYYY-MM-DD' string.
todays_date = datetime.date.today().isoformat()

In [4]:
# Competitor quotes from the last 10 days for sites SPO/CWB/VCP/BHZ, averaged per
# competitor, site, quotation date and source_id; competitors whose name matches
# '%cayena%' are excluded.
query = """
select
    competitor.competitor_name,
    site.identifier_value as site_code,
    quotation_date.full_date AS quotation_date,
    su.source_id,
    ROUND(AVG(cpp.product_selected_price),2)::float as price
from dpr_product_pricing.fact_collected_product_prices cpp
    inner join dpr_shared.dim_date quotation_date
        on cpp.dim_quotation_date = quotation_date.date_id
    inner join dpr_shared.dim_time quotation_time
        on cpp.dim_quotation_time = quotation_time.time_id
    inner join dpr_shared.dim_site site
        on cpp.dim_site = site.site_id
    inner join dpr_shared.dim_category cat
        on cpp.dim_category = cat.category_id
    inner join dpr_product_pricing.dim_product_outlier_type outlier_type
        on cpp.dim_outlier_type = outlier_type.outlier_type_id
    inner join dpr_product_pricing.dim_product_source_type source_type
        on cpp.dim_source_type = source_type.source_type_id
    inner join dpr_product_pricing.dim_product_competitor competitor
        on cpp.dim_competitor = competitor.competitor_id
    inner join dpr_product_pricing.dim_product_competitor_type competitor_type
        on(
            case
                when cpp.super_category = 'Fruver'
                    then competitor.product_competitor_type_id_fruver = competitor_type.competitor_type_id
                when cpp.super_category = 'Multicategoría'
                    then competitor.product_competitor_type_id_multicategoria = competitor_type.competitor_type_id
            end
        )
    inner join dpr_shared.dim_stock_unit su
        on cpp.dim_stock_unit = su.stock_unit_id
where quotation_date.full_date >= current_date - 10
    AND competitor.competitor_name NOT ILIKE '%cayena%'
    AND site.identifier_value IN ('SPO','CWB','VCP','BHZ')
GROUP BY 1,2,3,4
"""

# Load the quotes, discard rows with any missing value, renumber the index and
# tag every observed quote with the full lifetime of 8.
df_zkkkkk = (
    run_read_dwd_query(query)
    .dropna()
    .reset_index(drop=True)
    .assign(lifetime=8)
)

In [5]:
# Carry every competitor quote forward in time: a quote observed on day D is
# replicated on the following days until either a newer quote exists for the same
# (competitor, source_id) or `max_lifetime` days have passed. The 'lifetime'
# column counts the days the carried price has left (observed rows hold the
# maximum, generated rows count down to 0).
max_lifetime = 8  # days; same value the observed rows carry in 'lifetime'

# Ensure dataframe is sorted by 'quotation_date'
df_zkkkkk = df_zkkkkk.sort_values(by='quotation_date')

# Generated rows are collected as dicts and turned into a frame once at the end
new_rows = []

for (competitor, source_id), group in df_zkkkkk.groupby(['competitor_name', 'source_id']):
    group = group.sort_values(by='quotation_date')
    last_known_price = None
    last_known_date = None
    last_site_code = None

    for site_code, current_date, price in zip(group['site_code'], group['quotation_date'], group['price']):
        # From the second observation on, fill the gap since the previous one.
        if last_known_date is not None:
            days_diff = (current_date - last_known_date).days
            # The range stops one day before the new observation and never
            # produces more than `max_lifetime` carried days.
            for j in range(1, min(days_diff, max_lifetime + 1)):
                new_rows.append({
                    'site_code': site_code,
                    'quotation_date': last_known_date + timedelta(days=j),
                    'competitor_name': competitor,
                    'source_id': source_id,
                    'price': last_known_price,
                    'lifetime': max_lifetime - j
                })

        # Remember the latest observation; its lifetime starts over
        last_known_price = price
        last_known_date = current_date
        last_site_code = site_code

    # After the last observation of the group, keep carrying the price until
    # its lifetime reaches 0.
    for j in range(1, max_lifetime + 1):
        new_rows.append({
            'site_code': last_site_code,
            'quotation_date': last_known_date + timedelta(days=j),
            'competitor_name': competitor,
            'source_id': source_id,
            'price': last_known_price,
            'lifetime': max_lifetime - j
        })

# Append new rows to the dataframe. DataFrame.append no longer exists in
# pandas 2.x, so the rows are concatenated instead.
if new_rows:
    df_zkkkkk = pd.concat([df_zkkkkk, pd.DataFrame(new_rows)], ignore_index=True)

# Sort the final dataframe
df_zkkkkk = df_zkkkkk.sort_values(by=['competitor_name', 'source_id', 'quotation_date'])
# NOTE(review): 'replica' is True for rows still at full lifetime, i.e. the
# observed quotes, not the carried-forward copies - confirm the flag is not inverted.
df_zkkkkk['replica'] = df_zkkkkk['lifetime'] == max_lifetime

In [6]:
# Benchmark frame: an independent copy of the filled quotes whose
# 'quotation_date' column is converted with pd.to_datetime.
df_bench = df_zkkkkk.copy().assign(
    quotation_date=lambda quotes: pd.to_datetime(quotes['quotation_date'])
)

In [7]:
# Number of distinct competitors with a benchmark price today, per source_id.
df_c = (
    df_bench[df_bench['quotation_date'] == todays_date]
    .groupby('source_id')
    .agg(competitors=('competitor_name', lambda names: names.nunique()))
    .reset_index()
)

In [8]:
# Own prices for today, one row per parent product: list price, sale price, net
# price, cost, cluster and benchmark price.
#  - all_prices: prices created in the last 60 days for active 'Multicategoría'
#    stock units, skipping descriptions that contain the listed tags.
#  - clusters: rows of dim_sku_cluster_period for sites 4, 6, 9 and 11, numbered
#    per sku by most recent modification.
# NOTE(review): the date joins hang off the left-joined cost table `c`, and the
#   WHERE clause filters on `rn = 1`, so products without cost or without a
#   cluster row appear to drop out despite the left joins - confirm this is intended.
query = """
with all_prices as(
    select
        date.full_date as fecha,
        site.identifier_value as region,
        sup.source_id as product_ids,
        sup.description as product_name,
        min_base_price as price,
        min_sale_price as sale_price,
        min_pricing_price as price_hook_acmktpl,
        -- pp.product_price_id,
        row_number() over(partition by sup.source_id,date_created_at_id,time_created_at_id order by su.min_weight_unit desc, price asc) as family_order
    from dpr_product_pricing.dim_product_current_price pp
        inner join dpr_shared.dim_stock_unit su
            on pp.stock_unit_id = su.stock_unit_id
        inner join dpr_shared.dim_stock_unit sup
            on nvl(nullif(su.source_parent_id,0),su.source_id) = sup.source_id
        inner join dpr_shared.dim_date date
            on pp.date_created_at_id = date.date_id
        inner join dpr_shared.dim_site site
            on su.site_id = site.site_id
        inner join dpr_shared.dim_category cat
            on sup.category_id = cat.category_id
    where date.full_date >= current_date - 60
        and date.full_date <= current_date
        and su.description not like '%(S - I)%'
        and cat.super_category = 'Multicategoría'
        and su.active = 1
        and su.description not like '%( S - I)%'
        and su.description not like '%FDL%'
        and su.description not like '%Robin-Food%'
        and su.description not like '%Robin Food%'
        and su.description not like '%RF%'
        and su.description not like '%PPY%'
        and su.description not like '%KAM%'
        and su.description not like '%SIN IVA%'
        and su.description not like '%Oakberry%'
),

clusters as (
 SELECT 
        batch_id,
        created_at,
        last_modified_at,
        sku,
        stock_unit_id,
        source_id,
        type AS cluster,
        site_id,
        ROW_NUMBER() OVER (PARTITION BY sku ORDER BY last_modified_at DESC, cluster DESC) AS rn
    FROM 
        dpr_product_pricing.dim_sku_cluster_period
    where 
        site_id in (4,6,9,11)
)


Select
    fecha.full_date as quotation_date,
    all_prices.region,
    su.source_id,
    --su.description as product_name,
    clusters.cluster,
    price::FLOAT AS price,
    sale_price::FLOAT AS sale_price,
    price_hook_acmktpl::FLOAT AS precio_neto,
    (CASE WHEN c.dtd_cost_local = 0 THEN m.inventory_p_fin ELSE c.dtd_cost_local END)::FLOAT  AS costo,
    min_price::FLOAT AS bench
    
from all_prices
    inner join dpr_shared.dim_stock_unit su 
        on all_prices.product_ids = su.source_id
        
    left join dpr_cross_business.int_dtd_cost c 
        on c.dim_stock_unit = su.stock_unit_id
    
    inner join dpr_shared.dim_date fecha_id 
        on c.dim_date_dtd = fecha_id.date_id
    inner join dpr_shared.dim_date fecha 
        on fecha_id.full_date+1 = fecha.full_date

    left join dpr_cross_business.fact_cross_business_insights m 
        on m.dim_stock_unit = su.stock_unit_id AND m.dim_date = fecha.date_id
    
    left join clusters on clusters.sku = su.sku
    
    left join dpr_product_pricing.obt_benchmark_product_prices b
        on b.stock_unit_id = su.stock_unit_id AND b.benchmark_date = fecha.full_date

where current_date = fecha.full_date
and family_order = 1
and rn = 1
"""

# Run the query and convert 'quotation_date' so it compares cleanly with df_bench.
df = run_read_dwd_query(query).assign(
    quotation_date=lambda prices: pd.to_datetime(prices['quotation_date'])
)

In [9]:
# Attach today's competitor count to every product (NaN when nobody quoted it).
df = df.merge(df_c, on='source_id', how='left')

In [11]:
# Percentile-style score of one of our prices within the competitor benchmark
def calcular_percentil(row, bench_df, price_col='precio_neto'):
    """Score a row's price against the benchmark prices of the same product and day.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'source_id', 'quotation_date' and the column named by
        ``price_col``.
    bench_df : pandas.DataFrame
        Benchmark prices with columns 'source_id', 'quotation_date' and 'price'.
    price_col : str, default 'precio_neto'
        Name of the field in ``row`` that holds the price to score.

    Returns
    -------
    float
        * NaN when the price is missing or there is no benchmark price for that
          source_id and quotation_date.
        * A negative value, ``-100 * (1 - price / min)``, when the price is
          below the cheapest benchmark price.
        * A value above 100, ``100 * price / max``, when the price is above the
          most expensive benchmark price.
        * Otherwise ``scipy.stats.percentileofscore`` of the price.
    """
    precio = row[price_col]

    # A missing price cannot be ranked. Without this guard the result depends on
    # the SciPy version (older releases return 0.0, which reads as "cheapest").
    if pd.isna(precio):
        return np.nan

    # Benchmark prices for the same product on the same day
    same_product_and_day = (
        (bench_df['source_id'] == row['source_id'])
        & (bench_df['quotation_date'] == row['quotation_date'])
    )
    bench_precios = bench_df.loc[same_product_and_day, 'price']

    # No reference data for that source_id and quotation_date
    if bench_precios.empty:
        return np.nan

    bench_precios = bench_precios.to_numpy(dtype=np.float64)
    precio_minimo = np.min(bench_precios)
    precio_maximo = np.max(bench_precios)

    # Prices outside the benchmark range get an extrapolated score instead of
    # being clamped to 0 / 100.
    if precio < precio_minimo:
        return -100 * (1 - (precio / precio_minimo))
    if precio > precio_maximo:
        return 100 * (precio / precio_maximo)

    return stats.percentileofscore(bench_precios, precio)

# Score every row of df against the benchmark frame
df['percentil'] = df.apply(lambda fila: calcular_percentil(fila, df_bench), axis=1)

In [13]:
def calcular_precio_por_percentil_row(row, bench_df, desired_percentil):
    """Return the benchmark price located at ``desired_percentil`` for a row.

    The benchmark is restricted to the rows of ``bench_df`` that share the row's
    'source_id' and 'quotation_date'. When no such rows exist the result is NaN;
    otherwise it is ``np.percentile`` of their 'price' values.
    """
    same_product = bench_df['source_id'] == row['source_id']
    same_day = bench_df['quotation_date'] == row['quotation_date']
    matching_prices = bench_df.loc[same_product & same_day, 'price']

    # Guard clause: nothing to compare against
    if matching_prices.empty:
        return np.nan

    # Sorted float copy of the matching prices
    ordered_prices = np.sort(matching_prices.to_numpy(dtype=np.float64))
    return np.percentile(ordered_prices, desired_percentil)

In [14]:
# Target position inside the competitor benchmark (25 = 25th percentile)
desired_percentil = 25

# Benchmark price at that percentile, computed row by row
df['p25'] = df.apply(
    calcular_precio_por_percentil_row,
    axis=1,
    args=(df_bench, desired_percentil),
)

In [16]:
# Margin at the current net price and at the p25 benchmark price: 1 - cost / price
df['mg'] = 1 - df['costo'] / df['precio_neto']
df['mg_p25'] = 1 - df['costo'] / df['p25']

In [17]:
# GMV of the last 15 days per city, parent category, parent source_id and
# product name, restricted to 'Multicategoría' sales in SPO/CWB/VCP/BHZ.
# NOTE(review): the sum is divided by 4.75 - presumably a fixed local-currency
#   to USD rate; confirm the value and consider moving it to a named constant.
# NOTE(review): dim_stock_unit is joined on product_id only; if a product has
#   several stock units the sales rows would be repeated - verify against the
#   warehouse schema.
query = """
SELECT
    s.identifier_value AS city,
    cat.parent_description AS cat,
    sup.source_id,
    s.identifier_value || ' || ' || sup.card_description AS product_name,
    (SUM(fs.gmv_pxq_local)/4.75)::FLOAT AS gmv_usd
FROM dpr_sales.fact_sales                   fs
INNER JOIN dpr_shared.dim_site              s   ON s.site_id = fs.dim_site
INNER JOIN dpr_shared.dim_product           dp  ON dp.product_id = fs.dim_product
INNER JOIN dpr_shared.dim_category          cat ON cat.category_id = dp.category_id
INNER JOIN dpr_shared.dim_stock_unit        su  ON su.product_id = fs.dim_product
INNER JOIN dpr_shared.dim_stock_unit        sup  ON nvl(nullif(su.source_parent_id,0),su.source_id) = sup.source_id

WHERE 
    fs.gmv_enabled = TRUE
    AND fulfillment_order_status NOT IN ('CANCELLED', 'ARCHIVED','No value')
    AND fs.fb_order_status_id IN (1,6,7,8)
    AND fs.is_deleted = FALSE
    AND cat.super_category = 'Multicategoría'
    AND fs.dim_status = 1
    AND dp.is_slot = 'false'
    AND fs.gmv_pxq_local > 0
    AND s.identifier_value IN ('SPO','CWB','VCP','BHZ')
    AND DATE(fs.order_submitted_date) >= CURRENT_DATE - 15
GROUP BY 1,2,3,4
"""
# One row per (city, cat, source_id, product_name) with its gmv_usd
df_gmv = run_read_dwd_query(query)

In [18]:
# Pricing attributes joined onto the GMV rows (products without pricing keep NaN)
merge_df = df_gmv.merge(df, on='source_id', how='left')

# Share of each row in the total GMV of its city
merge_df['gmv_mix'] = merge_df['gmv_usd'].div(
    merge_df.groupby('city')['gmv_usd'].transform('sum')
)

# Keep only the columns used from here on, in reporting order
merge_df = merge_df[[
    'city', 'quotation_date', 'cat', 'source_id', 'cluster', 'product_name',
    'competitors', 'gmv_usd', 'gmv_mix',
    'price', 'sale_price', 'precio_neto', 'bench', 'costo',
    'percentil', 'p25', 'mg', 'mg_p25',
]]

In [19]:
# uplift: how much the net price would have to rise, in %, to reach p25
merge_df['uplift'] = 100.00 * ((merge_df['p25'] / merge_df['precio_neto']) - 1)
# Price index against the benchmark, at the current net price and at p25
merge_df['npi'] = merge_df['precio_neto'] / merge_df['bench']
merge_df['npi_p'] = merge_df['p25'] / merge_df['bench']

In [20]:
# GMV share per city of products with at least 5 competitors and a percentile of 25 or lower
merge_df[merge_df['competitors'].ge(5) & merge_df['percentil'].le(25)].groupby('city')['gmv_mix'].sum()

city
BHZ    0.373028
CWB    0.280513
SPO    0.322843
VCP    0.304743
Name: gmv_mix, dtype: float64

In [62]:
# Number of products per city with at least 5 competitors and a percentile of 25 or lower
merge_df[merge_df['competitors'].ge(5) & merge_df['percentil'].le(25)].groupby('city')['source_id'].count()

city
BHZ     80
CWB     52
SPO    204
VCP     33
Name: source_id, dtype: int64

In [63]:
# Same count, restricted to products in the 'KVI' cluster
merge_df[
    merge_df['competitors'].ge(5)
    & merge_df['percentil'].le(25)
    & merge_df['cluster'].eq('KVI')
].groupby('city')['source_id'].count()

city
BHZ    26
CWB    26
SPO    63
VCP    15
Name: source_id, dtype: int64

In [21]:
# Candidates for a price increase: at least 5 competitors and percentile of 25 or lower
df_x = merge_df[merge_df['competitors'].ge(5) & merge_df['percentil'].le(25)].copy()

# Elasticity code (by Gaby)

In [27]:
from analystcommunity.read_connection_data_warehouse import run_read_dwd_query, run_read_prod_query
import pandas as pd
from decimal import Decimal

def get_fruver_frida_id(region_code: str) -> pd.Series:
    """Return the source_id values of the top-level 'Fruver' categories of a region.

    Parameters
    ----------
    region_code : str
        Site identifier (matched against ``dim_site.identifier_value``). The
        value is interpolated into the SQL text, so it must come from trusted code.

    Returns
    -------
    pandas.Series
        The 'source_id' column of the query result. It is always a Series, even
        when the query returns exactly one row, so callers can iterate over it.
    """

    query = f"""
        SELECT
            source_id
                as source_id
        FROM
            dpr_shared.dim_category c
            INNER JOIN dpr_shared.dim_site s
                ON c.site_id = s.site_id
        WHERE
            1=1
            AND s.identifier_value = '{region_code}'
            AND c.super_category = 'Fruver'
            AND c.source_parent_id = c.source_id
        """

    # Squeeze the column axis only: a plain .squeeze() collapses a one-row
    # result into a scalar, which the caller cannot iterate.
    result = run_read_dwd_query(query).squeeze(axis="columns")
    return result

def get_available_elasticity_products(region_code: str) -> pd.DataFrame:
    """List the non-Fruver products of a region that have demand estimations.

    Parameters
    ----------
    region_code : str
        Region to query; interpolated into the SQL text, so it must come from
        trusted code.

    Returns
    -------
    Whatever ``run_read_prod_query`` returns for the query: distinct
    product_sku, model_mape, category_id, sub_category_id,
    product_participation and created_at rows of
    ``lnd_ops.product_price_demand_estimations``.
    """

    # First we get the ID of fruver category to use it as filter later.
    # np.atleast_1d accepts a Series as well as a bare scalar, so a
    # single-category answer is handled like any other.
    fruver_category_ids = np.atleast_1d(get_fruver_frida_id(region_code=region_code))

    # Defining fruver filter to get only multicategory products. With no Fruver
    # category the filter is left out, because "not in ()" is invalid SQL.
    if fruver_category_ids.size:
        fruver_category_ids_str = ','.join(str(category_id) for category_id in fruver_category_ids)
        fruver_filter = f'AND category_id not in ({fruver_category_ids_str})'
    else:
        fruver_filter = ''

    # Check if there are current estimations for the products: the batch uuid
    # must be present AND non-empty (with OR the empty-string test never
    # excluded anything).
    query = f"""
                SELECT DISTINCT
                    product_sku
                    , model_mape
                    , category_id
                    , sub_category_id
                    , product_participation
                    , created_at
                FROM
                    lnd_ops.product_price_demand_estimations
                WHERE
                    1=1
                    AND region = '{region_code}'
                    {fruver_filter}
                    AND (
                        elasticity_batch_uuid is not null
                        AND elasticity_batch_uuid != ''
                    )
                """

    records = run_read_prod_query(query)

    return records


def get_product_sensibility(product_sku: str) -> pd.DataFrame:
    """Query the mean relative demand variation of one SKU.

    The SQL reads the SKU's rows with positive estimated demand from
    ``lnd_ops.product_price_demand_estimations``, ordered by
    product_participation (descending) and reference_variation (ascending).
    For each row it takes the absolute demand difference to the previous and to
    the next row, divides both by the row's own demand and averages the two
    ratios. The outer select averages that value over all rows except the first
    and the last one, grouped by product_sku and model_mape.

    ``product_sku`` is interpolated into the SQL text. The return value is
    whatever ``run_read_prod_query`` yields for the statement.
    """
    sql = f"""
        WITH
            estimation_differences AS (
                SELECT
                    ROW_NUMBER()
                        OVER(
                        ORDER BY product_participation DESC, reference_variation ASC
                    )
                        as row
                    , product_sku
                    , model_mape
                    , estimated_demand
                    , reference_variation
                    , local_price
                    , COALESCE(LAG(estimated_demand, 1)
                            OVER (
                                ORDER BY product_participation DESC, reference_variation ASC
                        ), 0)
                        AS estimated_demand_pre
                    , ABS(
                        COALESCE(estimated_demand - LAG(estimated_demand, 1)
                            OVER (
                                ORDER BY product_participation DESC, reference_variation ASC
                        ), 0)
                    )
                        AS estimated_demand_diff_pre
                    , ABS(
                        COALESCE(estimated_demand - LEAD(estimated_demand, 1)
                            OVER (
                                ORDER BY product_participation DESC, reference_variation ASC
                        ), 0)
                    )
                        AS estimated_demand_diff_post
                    , COALESCE(estimated_demand_diff_pre/estimated_demand, 0)
                        AS estimated_demand_percentual_variation_pre
                    , COALESCE(estimated_demand_diff_post/estimated_demand, 0)
                        AS estimated_demand_percentual_variation_post
                    , (estimated_demand_percentual_variation_pre+estimated_demand_percentual_variation_post)/2.0
                        AS estimated_demand_percentual_variation
                    -- , count(*)
                    --     AS row_len
                FROM
                    lnd_ops.product_price_demand_estimations
                WHERE
                    1=1
                    AND product_sku = '{product_sku}'
                    AND estimated_demand > 0
                -- GROUP BY
                --     1,2,3,4,5,6,7,8,9,10,11,12
                ORDER BY
                    product_participation DESC,
                    reference_variation ASC
            ),

            row_stats AS (
                SELECT
                    COUNT(*)
                        as row_count
                FROM
                    estimation_differences
            )

        SELECT
            product_sku
            , model_mape
            , AVG(estimated_demand_percentual_variation)
                AS mean_estimated_demand_percentual_variation
            -- , CASE
            --     WHEN mean_estimated_demand_percentual_variation > 0.05 THEN 'VERY ELASTIC'
            --     WHEN mean_estimated_demand_percentual_variation > 0.01 THEN 'ELASTIC'
            --     WHEN mean_estimated_demand_percentual_variation = 0.01 THEN 'UNITARY'
            --     ELSE 'INELASTIC'
            -- END
            --    AS elasticity_type
        FROM
            estimation_differences
        WHERE
            row > 1
            and row < (select row_count from row_stats)
        GROUP BY
            1,2
    """

    # Hand the statement to the production reader and pass its result straight through
    return run_read_prod_query(sql)

In [28]:
# Collect, city by city, the mean estimated demand variation ('mean_edpv') of
# every product that has an elasticity estimation.
final_elasticity = []

for city in ['SPO', 'BHZ', 'CWB', 'VCP']:
    region_code = city  # Assuming region_code should be the same as city code
    available_products = get_available_elasticity_products(region_code=region_code)

    # One query per SKU
    sensibilities = [
        get_product_sensibility(product_sku=product_sku)
        for product_sku in available_products['product_sku'].unique()
    ]

    # A city without estimated products has nothing to add; pd.concat would
    # raise on the empty list.
    if not sensibilities:
        continue

    all_sensibilities = pd.concat(sensibilities, ignore_index=True)

    full_sensibilities = pd.merge(
        all_sensibilities[['product_sku', 'mean_estimated_demand_percentual_variation']],
        available_products,
        on='product_sku',
        how='inner'
    )

    full_sensibilities = full_sensibilities.rename(
        columns={'mean_estimated_demand_percentual_variation': 'mean_edpv'}
    )
    full_sensibilities['mean_edpv'] = full_sensibilities['mean_edpv'].astype('float')

    final_elasticity.append(full_sensibilities)

# Concatenate all cities' data into a single DataFrame; fall back to an empty
# frame with the expected columns when no city returned anything.
if final_elasticity:
    final_elasticity_df = pd.concat(final_elasticity, ignore_index=True)
else:
    final_elasticity_df = pd.DataFrame(
        columns=['product_sku', 'mean_edpv', 'model_mape', 'category_id',
                 'sub_category_id', 'product_participation', 'created_at']
    )

In [29]:
# Map every SKU that has an elasticity estimate to its source_id.
elasticity_skus = list(final_elasticity_df.product_sku.unique())

if elasticity_skus:
    # Build the IN list element by element: formatting a Python tuple directly
    # leaves a trailing comma for a single SKU, which is invalid SQL.
    query = """
SELECT
sku AS product_sku,
source_id
FROM dpr_shared.dim_stock_unit 
WHERE sku in ({skus})
""".format(skus=", ".join(repr(sku) for sku in elasticity_skus))

    df_s = run_read_dwd_query(query)
else:
    # Nothing to look up: skip the query ("in ()" is invalid SQL as well)
    df_s = pd.DataFrame(columns=['product_sku', 'source_id'])

In [32]:
# mean_edpv per source_id: elasticities joined to the SKU -> source_id mapping
dfg = final_elasticity_df.merge(df_s, on='product_sku', how='left')[['source_id', 'mean_edpv']]

In [33]:
# Price-increase candidates enriched with their elasticity (NaN when unknown)
df3 = df_x.merge(dfg, on='source_id', how='left')

# New price

In [42]:
from scipy.stats import percentileofscore

# Function to calculate new_uplift for each city
def calculate_new_uplift(df, min_uplift=2, max_uplift=10):
    """Cap each row's uplift according to how elastic the product is.

    Every 'mean_edpv' value is ranked among the non-missing values of ``df``
    (average rank as a fraction, the same figure
    ``percentileofscore(..., kind='rank') / 100`` gives on data without NaN).
    The rank is mapped linearly so the lowest values get a cap close to
    ``max_uplift`` and the highest value gets ``min_uplift``; 'new_uplift' is the
    smaller of that cap and the row's 'uplift'.

    Rows without 'mean_edpv' get NaN and no longer distort the ranking of the
    other rows (a NaN inside the reference sample makes percentileofscore
    return NaN, or a deflated rank on older SciPy releases).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'mean_edpv' and 'uplift' columns. It is modified in place.
    min_uplift, max_uplift : number, default 2 and 10
        Bounds of the cap, in percent.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the 'new_uplift' column added.
    """
    # Fractional rank in (0, 1]; NaN stays NaN and is excluded from the count
    percentiles = df['mean_edpv'].rank(pct=True)

    # Apply the transformation (1 - percentile) and map it to [min_uplift, max_uplift]
    mapped_values = min_uplift + (1 - percentiles) * (max_uplift - min_uplift)

    # Calculate new_uplift as the minimum between the mapped value and uplift
    df['new_uplift'] = np.minimum(mapped_values, df['uplift'])

    return df

# Apply the calculation for each city separately. The cities are processed in
# an explicit loop instead of groupby().apply(), whose output index and handling
# of the grouping column differ between pandas versions; this way df3 keeps its
# index, row order and 'city' column.
new_uplift_by_row = pd.Series(np.nan, index=df3.index, dtype='float64')
for _, city_rows in df3.groupby('city'):
    # Work on a copy so the function's in-place column assignment never touches df3
    new_uplift_by_row.loc[city_rows.index] = calculate_new_uplift(city_rows.copy())['new_uplift']
df3['new_uplift'] = new_uplift_by_row

# Handle rows with NaN in mean_edpv separately if needed
df3_na = df3[df3['mean_edpv'].isna()].copy()

# Assign new_uplift for NaN mean_edpv based on cluster logic:
# cap of 5 for 'KVI', 7 for 'MID' and 10 for every other cluster
df3_na['new_uplift'] = np.where(df3_na.cluster == 'KVI', np.minimum(5, df3_na.uplift),
                       np.where(df3_na.cluster == 'MID', np.minimum(7, df3_na.uplift),
                       np.minimum(10, df3_na.uplift)))

# Update the original DataFrame with new_uplift for NaN mean_edpv
df3.loc[df3_na.index, 'new_uplift'] = df3_na['new_uplift']

In [47]:
# Percentile-style score of the proposed price ('new_price') within the benchmark
def calcular_percentil2(row, bench_df):
    """Score a row's 'new_price' against the benchmark of the same product and day.

    Same scoring as ``calcular_percentil``, but it reads the price from
    'new_price' instead of 'precio_neto'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'source_id', 'quotation_date' and 'new_price'.
    bench_df : pandas.DataFrame
        Benchmark prices with columns 'source_id', 'quotation_date' and 'price'.

    Returns
    -------
    float
        * NaN when the price is missing or there is no benchmark price for that
          source_id and quotation_date.
        * A negative value, ``-100 * (1 - price / min)``, when the price is
          below the cheapest benchmark price.
        * A value above 100, ``100 * price / max``, when the price is above the
          most expensive benchmark price.
        * Otherwise ``scipy.stats.percentileofscore`` of the price.
    """
    precio = row['new_price']

    # A missing price cannot be ranked. Without this guard the result depends on
    # the SciPy version (older releases return 0.0, which reads as "cheapest").
    if pd.isna(precio):
        return np.nan

    # Benchmark prices for the same product on the same day
    same_product_and_day = (
        (bench_df['source_id'] == row['source_id'])
        & (bench_df['quotation_date'] == row['quotation_date'])
    )
    bench_precios = bench_df.loc[same_product_and_day, 'price']

    # No reference data for that source_id and quotation_date
    if bench_precios.empty:
        return np.nan

    bench_precios = bench_precios.to_numpy(dtype=np.float64)
    precio_minimo = np.min(bench_precios)
    precio_maximo = np.max(bench_precios)

    # Prices outside the benchmark range get an extrapolated score instead of
    # being clamped to 0 / 100.
    if precio < precio_minimo:
        return -100 * (1 - (precio / precio_minimo))
    if precio > precio_maximo:
        return 100 * (precio / precio_maximo)

    return stats.percentileofscore(bench_precios, precio)

In [48]:
# Proposed price and the metrics it implies: price index against the benchmark,
# margin, and position within the competitor benchmark.
df3['new_price'] = df3['precio_neto'] * (1 + df3['new_uplift'] / 100)
df3['new_npi'] = df3['new_price'] / df3['bench']
df3['new_mg'] = 1 - (df3['costo'] / df3['new_price'])
df3['new_percentil'] = df3.apply(lambda fila: calcular_percentil2(fila, df_bench), axis=1)

In [61]:
# Spot check: distribution of the benchmark prices of one product on one day.
# The product and date are named once so the filter and the title cannot drift
# apart (the title used to mention a different source id than the filter).
inspected_source_id = 285599
inspected_date = '2024-06-18'

filtered_data = df_bench[
    (df_bench['source_id'] == inspected_source_id)
    & (df_bench['quotation_date'] == inspected_date)
]

# Extract the 'price' column from filtered_data
prices = filtered_data['price']

# Draw the boxplot with matplotlib, which this notebook already imports as `plt`;
# the previous `px` name (presumably plotly.express) is never imported here.
fig, ax = plt.subplots()
ax.boxplot(prices.to_numpy())
ax.set_ylabel('price')
ax.set_title(f"Boxplot of Prices on {inspected_date} for Source ID {inspected_source_id}")

# Show the plot
plt.show()

In [50]:
# Export the final recommendation table (index included) to 'a.xlsx'
df3.to_excel(excel_writer='a.xlsx')