In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pathlib
import warnings
warnings.filterwarnings("ignore")

import pathlib
import pandas as pd
from pathlib import Path
import numpy as np
from datetime import timedelta

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.offline as pyo
import mmh3
from typing import Dict



from src.download_AB import download_experiment_data, download_recprice_data, download_order_data, download_bid_data
from src.metrics_AB import calculate_absolute_metrics, get_switchback_results, calculate_ratio_metrics
from src.prepare import prepare_recprice_data, prepare_order_data, prepare_bid_data, get_orders_with_recprice_df, get_hex
from src.visualization import plot_switches_matrix, plot_conversions_by_time, plot_prices_by_time
from src.visualization import plot_metric_by_time, plot_metric_by_hex

## SB Results: Bad Bids

- link

### Костыли

In [7]:
# salt - рандомная строка длиной 8 хранится в experiment.salt
# distribution_map - словарь вида {"GroupA": 50, "Control": 50}, где сумма значений должна быть 100
#  Порядок важен! Control всегда последний
def group_name(split_id: str, salt: str, distribution_map: Dict[str, int]):
    if sum(distribution_map.values()) != 100:
        raise Exception("Incorrect distribution")

    if list(distribution_map.keys())[-1] != "Control":
        raise Exception("Last group should be Control")

    if len(salt) != 8:
        raise Exception("Incorrect salt")

    # Хэш по соль экспа + split_id (id пользователя или девайса)
    murmur_hash: int = mmh3.hash(f"{salt}{split_id}", 1, False)
    reminder = murmur_hash % 100
    s = 0
    for k, v in distribution_map.items():
        s += v
        if reminder < s:
            return k

### Parameters

__Mutable__

In [4]:
EXP_ID = 2641
USER_NAME = 'nusuev_ab'+str(EXP_ID)

ORDER_TYPE = 'auto_econom'
ORDER_TYPE_ID = 1

DAYS_BEFORE = 0

__Immutable__

In [5]:
DATA_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}')
if not DATA_ROOT_PATH.exists():
    DATA_ROOT_PATH.mkdir(parents=True, exist_ok=True)
    

PLOT_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}/plots')
if not PLOT_ROOT_PATH.exists():
    PLOT_ROOT_PATH.mkdir(parents=True, exist_ok=True)

### Experiment Data

__Download__

In [10]:
df_exp = download_experiment_data(exp_id=EXP_ID, user_name=USER_NAME)

# df_exp['hour'] = df_exp['switch_start_dttm'].dt.hour
# df_exp['hour'] = df_exp['hour'].astype('category')
# df_exp['weekday_name'] = df_exp['switch_start_dttm'].dt.day_name()
# df_exp['weekday_name'] = df_exp['weekday_name'].astype('category')

df_exp.to_parquet(DATA_ROOT_PATH / 'df_exp.pqt')

EXP_START_DATE = df_exp.utc_start_dttm.dt.date.astype('str').iloc[0]
EXP_STOP_DATE = df_exp.utc_finish_dttm.dt.date.astype('str').iloc[0]
BEFORE_START_DATE = (df_exp.utc_start_dttm.dt.date - timedelta(days=DAYS_BEFORE)).astype('str').iloc[0]
CITY_ID = df_exp.city_id.iloc[0]
# EXP_NAME = df_exp.exp_name.iloc[0]

EXP_SALT = df_exp['exp_salt'].iloc[0]
DISTRIBUTION_MAP = {"GroupA": 50, "Control": 50}

print(
    f"""
    before_start_date: {BEFORE_START_DATE}
    exp_start_date: {EXP_START_DATE}
    exp_stop_date: {EXP_STOP_DATE}
    city_id: {CITY_ID}
    exp_salt: {EXP_SALT}
    distribution_map: {DISTRIBUTION_MAP}
    """
)


    before_start_date: 2025-02-28
    exp_start_date: 2025-02-28
    exp_stop_date: 2025-03-28
    city_id: 4241
    exp_salt: QKLy5R6k
    distribution_map: {'GroupA': 50, 'Control': 50}
    


__Validity__

### Metrics Data

__Recprice__

In [11]:
df_recprice = download_recprice_data(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    user_name=USER_NAME,
    # printBool=True
)

# Применяем функцию group_name к каждой строке, используя user_id как split_id
df_recprice['recprice_group_name'] = df_recprice.apply(
    lambda row: 'Before' if row['utc_recprice_dttm'] < pd.Timestamp(EXP_START_DATE, tz='UTC') 
    else group_name(row['user_id'], EXP_SALT, DISTRIBUTION_MAP),
    axis=1
)

df_recprice.to_parquet(DATA_ROOT_PATH / 'df_recprice.pqt')

In [12]:
df_recprice = pd.read_parquet(DATA_ROOT_PATH / 'df_recprice.pqt')
df_recprice_prepared = prepare_recprice_data(df_recprice)
df_recprice_prepared.to_parquet(DATA_ROOT_PATH / 'df_recprice_prepared.pqt')

__Bids__

In [13]:
df_bids = download_bid_data(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    user_name=USER_NAME,
    # printBool=True
)

# Применяем функцию group_name к каждой строке, используя user_id как split_id
df_bids['bid_group_name'] = df_bids.apply(
    lambda row: 'Before' if row['utc_order_dttm'] < pd.Timestamp(EXP_START_DATE, tz='UTC') 
    else group_name(row['user_id'], EXP_SALT, DISTRIBUTION_MAP),
    axis=1
)

df_bids.to_parquet(DATA_ROOT_PATH / 'df_bids.pqt')

In [14]:
df_bids = pd.read_parquet(DATA_ROOT_PATH / 'df_bids.pqt')
df_bids_prepared = prepare_bid_data(df_bids)
df_bids_prepared.to_parquet(DATA_ROOT_PATH / 'df_bids_prepared.pqt')

__Orders (with recprice)__

In [15]:
df_orders = download_order_data(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    user_name=USER_NAME,
    # printBool=False
)

# Применяем функцию group_name к каждой строке, используя user_id как split_id
df_orders['order_group_name'] = df_orders.apply(
    lambda row: 'Before' if row['utc_order_dttm'] < pd.Timestamp(EXP_START_DATE, tz='UTC') 
    else group_name(row['user_id'], EXP_SALT, DISTRIBUTION_MAP),
    axis=1
)

df_orders.to_parquet(DATA_ROOT_PATH / 'df_orders.pqt')

In [16]:
df_orders = pd.read_parquet(DATA_ROOT_PATH / 'df_orders.pqt')
df_orders_prepared = prepare_order_data(df_orders)
df_orders_prepared.to_parquet(DATA_ROOT_PATH / 'df_orders_prepared.pqt')

In [17]:
df_orders_with_recprice = get_orders_with_recprice_df(df_orders_prepared, df_recprice_prepared)
df_orders_with_recprice['group_name'] = df_orders_with_recprice['recprice_group_name']
df_orders_with_recprice.to_parquet(DATA_ROOT_PATH / 'df_orders_with_recprice.pqt')

только уникальные ордера? – True
доля оставшихся ордеров: 0.9728


### Results

In [18]:
# EXP_ID = 2574
# DATA_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}')

# df_recprice_prepared = pd.read_parquet(DATA_ROOT_PATH / 'df_recprice_prepared.pqt')
# df_orders_with_recprice = pd.read_parquet(DATA_ROOT_PATH / 'df_orders_with_recprice.pqt')

In [14]:
pd.set_option('display.max_rows', None)
# pd.reset_option('display.max_rows')

In [6]:
df_recprice_prepared = pd.read_parquet(DATA_ROOT_PATH / 'df_recprice_prepared.pqt')
df_orders_with_recprice = pd.read_parquet(DATA_ROOT_PATH / 'df_orders_with_recprice.pqt')
df_bids_prepared = pd.read_parquet(DATA_ROOT_PATH / 'df_bids_prepared.pqt')

__Total__

In [16]:
df_metrics_total = calculate_absolute_metrics(
    df_recprice_prepared,
    df_orders_with_recprice,
    df_bids_prepared,
    group_cols=['group_name', 'order_uuid'],
)

metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05)[
    ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
]

metrics_total_tbl.head()

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant
0,drivers_per_order,1.926175,1.926713,0.000279,0.836709,False
1,bids_per_order,2.111355,2.110571,-0.000371,0.818043,False
2,bids_per_driver,1.096139,1.095426,-0.000651,0.292706,False
3,bids_per_order_with_bid,2.385093,2.380252,-0.00203,0.17062,False
4,bids_per_done_order,2.327085,2.321976,-0.002195,0.160554,False


In [18]:
metrics_total_tbl

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant
0,drivers_per_order,1.926175,1.926713,0.000279,0.836709,False
1,bids_per_order,2.111355,2.110571,-0.000371,0.818043,False
2,bids_per_driver,1.096139,1.095426,-0.000651,0.292706,False
3,bids_per_order_with_bid,2.385093,2.380252,-0.00203,0.17062,False
4,bids_per_done_order,2.327085,2.321976,-0.002195,0.160554,False
5,drivers_per_orders_with_bid,2.175905,2.172902,-0.00138,0.248046,False
6,bids_accepted_per_driver,0.4066644,0.407492,0.002035,0.143567,False
7,price_highrate_usd,2.970135,2.97055,0.000139,0.830372,False
8,price_highrate_usd_rides,2.841368,2.841246,-4.3e-05,0.936752,False
9,price_highrate_usd_orders_without_bids,3.606307,3.615327,0.002501,0.435466,False


__By segment__

In [17]:
condition_on_recprice = df_recprice_prepared.order_type_id == ORDER_TYPE_ID
condition_on_orders = df_orders_with_recprice.order_type == ORDER_TYPE
condition_on_bids = df_bids_prepared.order_type == ORDER_TYPE

df_metrics_total = calculate_absolute_metrics(
    df_recprice_prepared[condition_on_recprice].copy(),
    df_orders_with_recprice[condition_on_orders].copy(),
    df_bids_prepared[condition_on_bids].copy(),
    group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
)

metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05)[
    ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
]

metrics_total_tbl

KeyError: 'switch_start_dttm'