In [103]:
import math
    #Numeric Python
import numpy as np
    #Pandas (dataframes)
import pandas as pd
    #datetime for fate manipulation
from datetime import date, datetime, timedelta  
    #Regex for advanced string matching
import re
    #for time related stuff
import time
    #json library
import json
    #Analyst tools
import sys
sys.path.append('~')
from analysts_tools.growth import *
    #Procurement tools
from analystcommunity.read_connection_data_warehouse import run_read_dwd_query
from analysts_tools.redash_methods import *
from procurement_lib import redash, dw, send_message, send_slack_notification, GoogleSheet, logging

In [104]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import mplcursors
from scipy import stats

import seaborn as sns

In [105]:
city = 'BHZ'

In [106]:
def info_query(mes):    
    query = """
    SELECT
        s.identifier_value AS region_code,
        TO_CHAR(DATE_TRUNC('month', DATE(CURRENT_DATE) - INTERVAL '{mes} month'),'YYYY-MM') AS month,
        --fs.order_submitted_date AS fecha,
        --fs.order_id,
        dc.source_id AS customer_id,
        --cat.parent_description AS cat,
        --cat.description AS subcat,
        --dp.card_id,
        --dp.card_description AS product_name,
        fs.order_id,
        --SUM(CASE WHEN cat.super_category = 'Fruver' AND fs.order_submitted_date > current_date - 15 THEN 1 ELSE 0 END) AS fruver,
        COUNT(sup.card_id) AS skus,
        SUM(fs.gmv_pxq_local)/4.75 AS gmv_usd,
        SUM(COALESCE(fsd.product_discount,0))/4.75 AS discount_applied,
        100.00*discount_applied/gmv_usd AS per_dct,
        (1-(SUM(cogs_p_mtd*fs.product_quantity_x_step_unit)/SUM(fs.product_price*fs.product_quantity_x_step_unit)))*100.00 AS margin,
        gmv_usd*margin/100.00 AS cash_margin,
        cash_margin-discount_applied AS net_cash_margin,
        100.00*net_cash_margin/gmv_usd AS net_margin
        --100.00*SUM((product_price*fs.product_quantity_x_step_unit)+fs.product_tax_iva)/SUM(pb.min_price*fs.product_quantity_x_step_unit) AS gpi,
        --100.00*SUM((product_price_discount*fs.product_quantity_x_step_unit)+fs.product_tax_iva)/SUM(pb.min_price*fs.product_quantity_x_step_unit) AS npi

    FROM dpr_sales.fact_sales                   fs
    INNER JOIN dpr_shared.dim_customer          dc  ON dc.customer_id = fs.dim_customer
    INNER JOIN dpr_shared.dim_site              s   ON s.site_id = fs.dim_site
    --INNER JOIN dpr_shared.dim_product           dp  ON dp.product_id = fs.dim_product
    --INNER JOIN dpr_shared.dim_category          cat ON cat.category_id = dp.category_id
    LEFT JOIN dpr_sales.fact_sales_discounts    fsd ON fs.order_item_id = fsd.order_item_id 
    INNER JOIN dpr_shared.dim_stock_unit        su  ON su.product_id = fs.dim_product
    INNER JOIN dpr_shared.dim_stock_unit        sup  ON nvl(nullif(su.source_parent_id,0),su.source_id) = sup.source_id
    LEFT JOIN dpr_cross_business.fact_cross_business_insights m ON m.dim_stock_unit = sup.stock_unit_id AND m.dim_date = fs.dim_submitted_date
    --LEFT JOIN dpr_product_pricing.obt_benchmark_product_prices  pb  ON pb.stock_unit_id = sup.stock_unit_id AND DATE(fs.order_submitted_date) = pb.benchmark_date

    WHERE 
        fs.gmv_enabled = TRUE
        AND fulfillment_order_status NOT IN ('CANCELLED', 'ARCHIVED','No value')
        AND fs.fb_order_status_id IN (1,6,7,8)
        AND fs.is_deleted = FALSE
        AND fs.dim_status = 1
        --AND dp.is_slot = 'false'
        AND fs.gmv_pxq_local > 0
        --AND s.identifier_value IN ('SPO','CWB','VCP','BHZ')
        AND s.identifier_value = '{city}'
        AND cogs_p_mtd > 0
        AND DATE(fs.order_submitted_date) >= DATE_TRUNC('month', DATE(CURRENT_DATE) - INTERVAL '{mes} month')
        AND DATE(fs.order_submitted_date) < DATE_TRUNC('month', DATE(CURRENT_DATE) - INTERVAL '{mes2} month')
    GROUP BY 1,2,3,4
    --HAVING margin < 90 AND
    """.format(city=city, mes=mes, mes2=mes-1)

    return run_read_dwd_query(query)

In [109]:
 # Initialize an empty list to store DataFrames
dfs = []

#1 es el mes pasado, en este caso MARZO
for m in range(1,13):
    df = info_query(m)
    print(m, len(df))
    # Append the DataFrame to the list
    dfs.append(df)

# Concatenate all DataFrames in the list
df_sells = pd.concat(dfs, ignore_index=True)  # Set ignore_index to True to reindex the resulting DataFrame

1 18262
2 18296
3 18033
4 18053
5 20349
6 20433
7 23476
8 24851
9 21600
10 22033
11 19936
12 18349


In [110]:
df_sells[['gmv_usd','discount_applied', 'per_dct', 'margin', 'cash_margin','net_cash_margin', 'net_margin','skus']] = df_sells[['gmv_usd','discount_applied', 'per_dct', 'margin', 'cash_margin',
       'net_cash_margin', 'net_margin','skus']].astype(float)

In [111]:
df_sells.month.min()

'2023-06'

In [112]:
df_sells = df_sells.sort_values(by=['customer_id', 'month','order_id'], ascending=True).reset_index(drop=True)
df_sells.head(5)

Unnamed: 0,region_code,month,customer_id,order_id,skus,gmv_usd,discount_applied,per_dct,margin,cash_margin,net_cash_margin,net_margin
0,BHZ,2023-06,59556171,22011234,1.0,38.5768,1.1368,2.9468,-0.59,-0.227603,-1.364403,-3.5368
1,BHZ,2023-06,59556171,22338514,2.0,26.6315,0.1263,0.4742,7.82,2.082583,1.956283,7.3457
2,BHZ,2023-07,59556171,23061442,3.0,67.6715,2.8252,4.1748,13.91,9.413105,6.587905,9.7351
3,BHZ,2023-07,59556171,23228147,2.0,41.9663,1.1368,2.7088,6.32,2.65227,1.51547,3.6111
4,BHZ,2023-08,59556171,23574324,6.0,86.3263,2.3473,2.7191,11.73,10.126074,7.778774,9.0108


In [113]:
today = datetime.today()
first_day_next_month = (today.replace(day=1) - timedelta(days=30)).replace(day=1)
todays_date = first_day_next_month.strftime('%Y-%m-%d')
todays_date

'2024-05-01'

In [114]:
# Convert 'month' column to datetime type
df_sells['month'] = pd.to_datetime(df_sells['month'])

# Filter to include only the last 5 months
last_5_months = pd.date_range(end=todays_date, periods=3, freq='MS')
filtered_df_sells = df_sells[df_sells['month'].isin(last_5_months)]

# Group by 'customer_id' and count the unique months for each customer
customer_purchase_counts = filtered_df_sells.groupby('customer_id')['month'].nunique()

# Create a boolean mask indicating if each customer has bought at least once in each of the last 5 months
mask = customer_purchase_counts >= 2

# Create a new column 'ok' and assign 'ok' if the condition is met, otherwise 'not ok'
df_sells['ok'] = df_sells['customer_id'].map(mask).replace({True: 'ok', False: 'not ok'})

In [115]:
# Group by customer_id and count the orders, then reset the index
customer_orders = df_sells.groupby('customer_id').size().reset_index(name='orders')

# Merge the counts back to the original DataFrame based on customer_id
df_sells = df_sells.merge(customer_orders, on='customer_id', how='left')

In [116]:
df_sells.loc[df_sells.ok == 'ok'].groupby('customer_id').count().reset_index().sort_values('region_code',ascending=False).head(30).customer_id.unique()

array([280962774, 345347860,  95756413, 229505680, 108339090, 156537169,
        94825427, 337735131, 135402819, 181408603, 123571123, 207521145,
        69259424, 143127011, 195116332,  73570548,  81029840, 151490375,
       174142649, 346830440, 114688872, 189290415,  85952818, 344298823,
        69810679, 190117457, 107319334, 176991501,  90182082, 127766577])

In [117]:
order_q = df_sells.orders.quantile(q=0.15)
order_q

16.0

In [118]:
df_sells['pvalue_net_margin'] = "na"

# Group data by customer_id
grouped = df_sells.loc[(df_sells.orders > order_q)].groupby('customer_id')

count=0
for customer_id, customer_data in grouped:
    if len(customer_data) < int(order_q/2):
        pass
    
    print(len(customer_data))
    # # Calculate the cutoff index for 70% and 30%
    # cut_70 = int(len(customer_data) * 0.5)
    
    # # Separate data into first 70% and last 30%
    # first_70 = customer_data.iloc[:cut_70]
    # last_30 = customer_data.iloc[cut_70:]

    # Calculate the cutoff index for 70% and 30%
    cut_70 = int(len(customer_data.month.unique())*0.75)
    
    # Separate data into first 70% and last 30%
    first_70 = customer_data.loc[customer_data.month.isin(customer_data.month.unique()[:cut_70])]
    last_30 = customer_data.loc[customer_data.month.isin(customer_data.month.unique()[cut_70:])]
    
    #Plot distributions for first 70% and last 30%
    # fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # sns.histplot(first_70['net_margin'], kde=True, ax=axes[0])
    # axes[0].set_title(f'Distribution of net_margin % (Customer {customer_id}) - First 70%')

    # sns.histplot(last_30['net_margin'], kde=True, ax=axes[1])
    # axes[1].set_title(f'Distribution of net_margin % (Customer {customer_id}) - Last 30%')

    # plt.tight_layout()
    # plt.show()
    
    # Perform statistical comparison
    try:
        statistic, p_value = stats.kstest(first_70['net_margin'], last_30['net_margin'])
        print(first_70['net_margin'].mean(),first_70['net_margin'].median())
        print(last_30['net_margin'].mean(),last_30['net_margin'].median())

        # Make decision based on p-value
        if p_value < 0.05:
            if last_30['net_margin'].mean() > first_70['net_margin'].mean():
                df_sells.loc[df_sells['customer_id'] == customer_id, 'pvalue_net_margin'] = "improve"
                print(f"Customer {customer_id}: net_margin % distribution has improved significantly.")
            elif last_30['net_margin'].mean() < first_70['net_margin'].mean():
                print(f"Customer {customer_id}: net_margin % distribution has worsened.")
                df_sells.loc[df_sells['customer_id'] == customer_id, 'pvalue_net_margin'] = "worse"
            else:
                print(f"Customer {customer_id}: net_margin % distribution remains the same.")
                df_sells.loc[df_sells['customer_id'] == customer_id, 'pvalue_net_margin'] = "equal"
        else:
            print(f"Customer {customer_id}: No significant difference in net margin distributions.")
    except ValueError:
        print(f"Unable to perform statistical comparison for Customer {customer_id}: Insufficient data after removing NaN values.")

26
7.471445 7.9741
9.760466666666668 9.044
Customer 59556171: No significant difference in net margin distributions.
67
3.2186745098039222 2.4862
4.8650625000000005 4.2097
Customer 60763282: No significant difference in net margin distributions.
115
9.415811235955056 9.8351
11.745465384615386 11.556650000000001
Customer 63773239: net_margin % distribution has improved significantly.
52
18.201319047619045 19.037
13.885770000000003 16.9256
Customer 63775372: No significant difference in net margin distributions.
75
25.820854545454548 26.9163
22.80542222222222 23.1745
Customer 63785947: No significant difference in net margin distributions.
42
10.481621052631581 9.86205
10.39115 10.5875
Customer 63790041: No significant difference in net margin distributions.
56
5.601812195121951 5.572
6.715046666666667 7.0253
Customer 63941332: No significant difference in net margin distributions.
32
6.209314814814815 5.0702
3.2326799999999998 1.9167
Customer 63945277: No significant difference in net m

In [119]:
df_sells['pvalue_gmv_usd'] = "na"

# Group data by customer_id
grouped = df_sells.loc[(df_sells.orders >= order_q)].groupby('customer_id')

count=0
for customer_id, customer_data in grouped:
    if len(customer_data) < int(order_q/2):
        pass
    
    print(len(customer_data))
    # # Calculate the cutoff index for 70% and 30%
    # cut_70 = int(len(customer_data) * 0.5)
    
    # # Separate data into first 70% and last 30%
    # first_70 = customer_data.iloc[:cut_70]
    # last_30 = customer_data.iloc[cut_70:]

    # Calculate the cutoff index for 70% and 30%
    cut_70 = int(len(customer_data.month.unique())*0.75)
    
    # Separate data into first 70% and last 30%
    first_70 = customer_data.loc[customer_data.month.isin(customer_data.month.unique()[:cut_70])]
    last_30 = customer_data.loc[customer_data.month.isin(customer_data.month.unique()[cut_70:])]
    
    # Plot distributions for first 70% and last 30%
    # fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # sns.histplot(first_70['gmv_usd'], kde=True, ax=axes[0])
    # axes[0].set_title(f'Distribution of gmv_usd % (Customer {customer_id}) - First 70%')

    # sns.histplot(last_30['gmv_usd'], kde=True, ax=axes[1])
    # axes[1].set_title(f'Distribution of gmv_usd % (Customer {customer_id}) - Last 30%')

    # plt.tight_layout()
    # plt.show()
    
    # Perform statistical comparison
    try:
        statistic, p_value = stats.kstest(first_70['gmv_usd'], last_30['gmv_usd'])
        #print(first_70['gmv_usd'].mean(),first_70['gmv_usd'].median())
        #print(last_30['gmv_usd'].mean(),last_30['gmv_usd'].median())

        # Make decision based on p-value
        if p_value < 0.05:
            if last_30['gmv_usd'].mean() > first_70['gmv_usd'].mean():
                df_sells.loc[df_sells['customer_id'] == customer_id, 'pvalue_gmv_usd'] = "improve"
                print(f"Customer {customer_id}: gmv_usd % distribution has improved significantly.")
            elif last_30['gmv_usd'].mean() < first_70['gmv_usd'].mean():
                print(f"Customer {customer_id}: gmv_usd % distribution has worsened.")
                df_sells.loc[df_sells['customer_id'] == customer_id, 'pvalue_gmv_usd'] = "worse"
            else:
                print(f"Customer {customer_id}: gmv_usd % distribution remains the same.")
                df_sells.loc[df_sells['customer_id'] == customer_id, 'pvalue_gmv_usd'] = "equal"
        else:
            print(f"Customer {customer_id}: No significant difference in net margin distributions.")
    except ValueError:
        print(f"Unable to perform statistical comparison for Customer {customer_id}: Insufficient data after removing NaN values.")

26
Customer 59556171: No significant difference in net margin distributions.
67
Customer 60763282: gmv_usd % distribution has improved significantly.
115
Customer 63773239: gmv_usd % distribution has improved significantly.
52
Customer 63775372: gmv_usd % distribution has improved significantly.
75
Customer 63785947: No significant difference in net margin distributions.
42
Customer 63790041: No significant difference in net margin distributions.
56
Customer 63941332: No significant difference in net margin distributions.
32
Customer 63945277: No significant difference in net margin distributions.
18
Customer 63963775: gmv_usd % distribution has improved significantly.
30
Customer 63964000: No significant difference in net margin distributions.
39
Customer 63964025: No significant difference in net margin distributions.
17
Customer 63989855: No significant difference in net margin distributions.
54
Customer 64518395: gmv_usd % distribution has worsened.
91
Customer 64526650: No signifi

In [120]:
df_sells['pvalue_skus'] = "na"

# Group data by customer_id
grouped = df_sells.loc[(df_sells.orders >= order_q)].groupby('customer_id')

count=0
for customer_id, customer_data in grouped:
    if len(customer_data) <  int(order_q/2):
        pass
    
    print(len(customer_data))
    # # Calculate the cutoff index for 70% and 30%
    # cut_70 = int(len(customer_data) * 0.5)
    
    # # Separate data into first 70% and last 30%
    # first_70 = customer_data.iloc[:cut_70]
    # last_30 = customer_data.iloc[cut_70:]

    # Calculate the cutoff index for 70% and 30%
    cut_70 = int(len(customer_data.month.unique())*0.75)
    
    # Separate data into first 70% and last 30%
    first_70 = customer_data.loc[customer_data.month.isin(customer_data.month.unique()[:cut_70])]
    last_30 = customer_data.loc[customer_data.month.isin(customer_data.month.unique()[cut_70:])]
    
    # Plot distributions for first 70% and last 30%
    # fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # sns.histplot(first_70['skus'], kde=True, ax=axes[0])
    # axes[0].set_title(f'Distribution of skus % (Customer {customer_id}) - First 70%')

    # sns.histplot(last_30['skus'], kde=True, ax=axes[1])
    # axes[1].set_title(f'Distribution of skus % (Customer {customer_id}) - Last 30%')

    # plt.tight_layout()
    # plt.show()
    
    # Perform statistical comparison
    try:
        statistic, p_value = stats.kstest(first_70['skus'], last_30['skus'])
        #print(first_70['skus'].mean(),first_70['skus'].median())
        #print(last_30['skus'].mean(),last_30['skus'].median())

        # Make decision based on p-value
        if p_value < 0.05:
            if last_30['skus'].mean() > first_70['skus'].mean():
                df_sells.loc[df_sells['customer_id'] == customer_id, 'pvalue_skus'] = "improve"
                print(f"Customer {customer_id}: skus % distribution has improved significantly.")
            elif last_30['skus'].mean() < first_70['skus'].mean():
                print(f"Customer {customer_id}: skus % distribution has worsened.")
                df_sells.loc[df_sells['customer_id'] == customer_id, 'pvalue_skus'] = "worse"
            else:
                print(f"Customer {customer_id}: skus % distribution remains the same.")
                df_sells.loc[df_sells['customer_id'] == customer_id, 'pvalue_skus'] = "equal"
        else:
            print(f"Customer {customer_id}: No significant difference in net margin distributions.")
    except ValueError:
        print(f"Unable to perform statistical comparison for Customer {customer_id}: Insufficient data after removing NaN values.")

26
Customer 59556171: No significant difference in net margin distributions.
67
Customer 60763282: No significant difference in net margin distributions.
115
Customer 63773239: No significant difference in net margin distributions.
52
Customer 63775372: No significant difference in net margin distributions.
75
Customer 63785947: skus % distribution has worsened.
42
Customer 63790041: No significant difference in net margin distributions.
56
Customer 63941332: No significant difference in net margin distributions.
32
Customer 63945277: No significant difference in net margin distributions.
18
Customer 63963775: No significant difference in net margin distributions.
30
Customer 63964000: No significant difference in net margin distributions.
39
Customer 63964025: skus % distribution has worsened.
17
Customer 63989855: No significant difference in net margin distributions.
54
Customer 64518395: No significant difference in net margin distributions.
91
Customer 64526650: No significant dif

In [121]:
df_c = df_sells.groupby(['customer_id','pvalue_gmv_usd','pvalue_net_margin','pvalue_skus','orders']).agg(
                                   gmv_usd=('gmv_usd', np.sum),
                                   cash_margin=('cash_margin', np.sum),
                                   net_cash_margin=('net_cash_margin', np.sum)
                                   
).reset_index()

In [122]:
df_c['net_margin'] = df_c.net_cash_margin / df_c.gmv_usd

In [123]:
# Define quadrant categories
categories = {
    ('na','na'): 'na',
    ('worse','na'): 'na',
    ('improve','na'): 'na',
    ('na','worse'): 'na',
    ('na','improve'): 'na',
    ('worse', 'worse'): 'Quadrant 1',
    ('worse', 'improve'): 'Quadrant 2',
    ('improve', 'worse'): 'Quadrant 3',
    ('improve', 'improve'): 'Quadrant 4'
}

# Assign quadrant categories
df_c['quadrant'] = df_c.apply(lambda row: categories[(row['pvalue_gmv_usd'], row['pvalue_net_margin'])], axis=1)

In [136]:
query = """
WITH info AS (
SELECT 
    s.identifier_value AS region_code,
    dc.source_id AS customer_id,
    SUM(fs.gmv_pxq) AS gmv_usd_60d,
    gmv_usd_60d/SUM(SUM(fs.gmv_pxq)) OVER (PARTITION BY region_code) AS gmv_mix
    
FROM dpr_sales.fact_sales                   fs
INNER JOIN dpr_shared.dim_customer          dc  ON dc.customer_id = fs.dim_customer
INNER JOIN dpr_shared.dim_site              s   ON s.site_id = fs.dim_site
INNER JOIN dpr_shared.dim_product           dp  ON dp.product_id = fs.dim_product   

WHERE 
    fs.gmv_enabled = TRUE
    AND fulfillment_order_status NOT IN ('CANCELLED', 'ARCHIVED','No value')
    AND fs.fb_order_status_id  IN (1,6,7,8)
    AND fs.is_deleted = FALSE
    AND fs.dim_status = 1
    AND dp.is_slot = 'false'
    --AND fsd.is_enabled = true
    AND s.identifier_value = '{ciudad}'
    AND DATE(fs.order_submitted_date) >= current_date - 60
GROUP BY 1,2
)

SELECT 
*
FROM info
WHERE customer_id IN {clientes}
""".format(ciudad=city,clientes=tuple(df_c.customer_id.unique()))
df_g = run_read_dwd_query(query)

In [137]:
df_final = df_c.merge(df_g, on='customer_id', how='left')

In [142]:
df_final.customer_id.nunique()

12107

In [139]:
df_final.to_excel(f"{city}_analisis_2.xlsx")