In [21]:
import math
    #Numeric Python
import numpy as np
    #Pandas (dataframes)
import pandas as pd
    #datetime for date manipulation
from datetime import date, datetime, timedelta  
    #Regex for advanced string matching
import re
    #for time related stuff
import time
    #json library
import json
    #Analyst tools
import os
import sys
# Make modules that live in the user's home directory importable.
# Python does not expand '~' in sys.path entries, so the literal string would
# point at a relative directory named "~"; expand it to the real home path.
sys.path.append(os.path.expanduser('~'))
from analysts_tools.growth import *
    #Procurement tools
from analystcommunity.read_connection_data_warehouse import run_read_dwd_query
from analysts_tools.redash_methods import *
from procurement_lib import redash, send_slack_notification, GoogleSheet

import random
import time
import datetime

In [22]:
# Query parameters. They are used in BOTH the `customs` CTE and the main
# SELECT, so they are defined once here to keep the numerator and the
# denominator of the penetration metric on the same region and time window.
REGION_CODE = 'SPO'      # value matched against dim_site.identifier_value
LOOKBACK_DAYS = 60       # orders submitted within the last N days

# The `customs` CTE counts the distinct customers of the region over the
# filtered sales (the penetration denominator).
# The main SELECT groups the same filtered sales by region, category,
# sub-category and parent stock unit (`sup`: the row whose source_id equals
# source_parent_id, or the unit's own source_id when the parent id is
# NULL/0) and returns:
#   - penetration: 100 * distinct customers of the group / all_customers
#   - gmv_usd:     SUM(fs.gmv_pxq)
# NOTE(review): the filter tests gmv_pxq_local > 0 while the sum uses gmv_pxq;
# the alias suggests gmv_pxq is in USD -- confirm.
# NOTE(review): the main SELECT inner-joins dim_stock_unit but the CTE does
# not, so the two customer bases may differ slightly -- confirm intended.
query = f"""
WITH customs AS (
SELECT
    s.identifier_value AS region_code,
    COUNT(DISTINCT fs.dim_customer) AS all_customers

FROM dpr_sales.fact_sales                   fs
INNER JOIN dpr_shared.dim_site              s   ON s.site_id = fs.dim_site
INNER JOIN dpr_shared.dim_product           dp  ON dp.product_id = fs.dim_product
INNER JOIN dpr_shared.dim_category          cat ON cat.category_id = dp.category_id

WHERE 
    fs.gmv_enabled = TRUE
    AND fulfillment_order_status NOT IN ('CANCELLED', 'ARCHIVED','No value')
    AND fs.fb_order_status_id IN (1,6,7,8)
    AND cat.super_category = 'Multicategoría'
    AND fs.is_deleted = FALSE
    AND fs.dim_status = 1
    AND dp.is_slot = 'false'
    AND fs.gmv_pxq_local > 0
    AND s.identifier_value = '{REGION_CODE}'
    AND DATE(fs.order_submitted_date) >= current_date - {LOOKBACK_DAYS}
GROUP BY 1
)

SELECT
    s.identifier_value AS region_code,
    cat.parent_description AS cat,
    cat.description AS subcat,
    sup.source_id,
    sup.description AS product_name,
    100.00*COUNT(DISTINCT fs.dim_customer)/c.all_customers::FLOAT AS penetration,
    SUM(fs.gmv_pxq)::FLOAT AS gmv_usd

FROM dpr_sales.fact_sales                   fs
INNER JOIN dpr_shared.dim_site              s   ON s.site_id = fs.dim_site
INNER JOIN dpr_shared.dim_product           dp  ON dp.product_id = fs.dim_product
INNER JOIN dpr_shared.dim_category          cat ON cat.category_id = dp.category_id
INNER JOIN dpr_shared.dim_stock_unit        su  ON su.product_id = fs.dim_product
INNER JOIN dpr_shared.dim_stock_unit        sup ON nvl(nullif(su.source_parent_id,0),su.source_id) = sup.source_id
INNER JOIN customs                          c   ON c.region_code = s.identifier_value
WHERE 
    fs.gmv_enabled = TRUE
    AND fulfillment_order_status NOT IN ('CANCELLED', 'ARCHIVED','No value')
    AND fs.fb_order_status_id IN (1,6,7,8)
    AND cat.super_category = 'Multicategoría'
    AND fs.is_deleted = FALSE
    AND fs.dim_status = 1
    AND dp.is_slot = 'false'
    AND fs.gmv_pxq_local > 0
    AND s.identifier_value = '{REGION_CODE}'
    AND DATE(fs.order_submitted_date) >= current_date - {LOOKBACK_DAYS}
GROUP BY 1,2,3,4,5,c.all_customers
"""

# Run the query through the project's warehouse helper; the result is used
# downstream as a DataFrame with one row per parent SKU.
df = run_read_dwd_query(query)

In [23]:
from sklearn.cluster import KMeans
import plotly.express as px

In [26]:
# Segment SKUs into four tiers (SUPER KVI / KVI / MID / TAIL) with K-means on
# GMV and penetration. One SKU is kept out of the fit and placed in the top
# tier by hand.
FORCED_SUPER_KVI_ID = 384239                       # source_id excluded from the fit
TIER_LABELS = ['SUPER KVI', 'KVI', 'MID', 'TAIL']  # ordered from highest to lowest tier

# Exclude the forced SKU so it does not influence the centroids.
df_clusters = df[df['source_id'] != FORCED_SUPER_KVI_ID].copy()

# Fit K-means with one cluster per tier.
# NOTE(review): the two features are passed unscaled; K-means is
# distance-based, so the feature with the larger magnitude will likely
# dominate the partition -- confirm this is intended.
X = df_clusters[['gmv_usd', 'penetration']]
kmeans = KMeans(n_clusters=len(TIER_LABELS), random_state=0)
df_clusters['Cluster'] = kmeans.fit_predict(X)

# Mean GMV and penetration of each cluster.
cluster_means = df_clusters.groupby('Cluster')[['gmv_usd', 'penetration']].mean()

# Rank the clusters from highest to lowest mean GMV (ties broken by penetration).
cluster_order = cluster_means.sort_values(by=['gmv_usd', 'penetration'], ascending=False).index

# K-means cluster ids are arbitrary, so map them to tier names by rank.
cluster_labels = dict(zip(cluster_order, TIER_LABELS))

# Attach the tier name to every clustered SKU.
df_clusters['Cluster_Label'] = df_clusters['Cluster'].map(cluster_labels)

# Re-add the excluded SKU directly in the top tier. `.copy()` makes this an
# independent frame, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
df_super_kvi = df[df['source_id'] == FORCED_SUPER_KVI_ID].copy()
df_super_kvi['Cluster_Label'] = 'SUPER KVI'

# Combine both parts; the forced SKU has no numeric 'Cluster' value (NaN).
df_final = pd.concat([df_clusters, df_super_kvi], ignore_index=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [27]:
# Scatter plot of GMV against penetration, one colour per tier; the product
# name is included in the hover tooltip.
fig = px.scatter(
    data_frame=df_final,
    x="gmv_usd",
    y="penetration",
    color="Cluster_Label",
    hover_data=["product_name", "gmv_usd", "penetration"],
    labels={"gmv_usd": "GMV (USD)", "penetration": "Penetración"},
    title="Distribución de GMV y Penetración por Cluster",
)

# Render the figure.
fig.show()

In [31]:
# Per-tier summary: total GMV, mean SKU penetration and number of distinct
# SKUs. String aggregation names are used instead of numpy / pandas callables,
# which recent pandas versions deprecate inside `agg`; the result is the same.
result = (
    df_final
    .groupby('Cluster_Label')
    .agg({
        'gmv_usd': 'sum',          # total GMV of the tier
        'penetration': 'mean',     # unweighted mean of SKU penetration
        'source_id': 'nunique',    # number of distinct source_id values
    })
    .reset_index()
)

result

Unnamed: 0,Cluster_Label,gmv_usd,penetration,source_id
0,KVI,3080870.0,9.815377,25
1,MID,3849992.0,5.526708,90
2,SUPER KVI,4891165.0,26.128135,8
3,TAIL,3513994.0,1.119838,858
