# AB Results: Bad Bids 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pathlib
import warnings
warnings.filterwarnings("ignore")

import pathlib
import pandas as pd
from pathlib import Path
import numpy as np
from datetime import timedelta

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.offline as pyo
import mmh3
from typing import Dict



from src.download_AB import download_experiment_data, download_recprice_data, download_order_data, download_bid_data
from src.metrics_AB import calculate_absolute_metrics, get_AB_results, calculate_ratio_metrics
from src.prepare import prepare_recprice_data, prepare_order_data, prepare_bid_data, get_orders_with_recprice_df, get_hex
from src.visualization import plot_switches_matrix, plot_conversions_by_time, plot_prices_by_time
from src.visualization import plot_metric_by_time, plot_metric_by_hex

## __Cucuta [-, 4196]__

- https://ab.aws.indriverapp.com/switchbacks/new_order/-----

### Костыли

In [3]:
# salt - рандомная строка длиной 8 хранится в experiment.salt
# distribution_map - словарь вида {"GroupA": 50, "Control": 50}, где сумма значений должна быть 100
#  Порядок важен! Control всегда последний
def group_name(split_id: str, salt: str, distribution_map: Dict[str, int]):
    if sum(distribution_map.values()) != 100:
        raise Exception("Incorrect distribution")

    if list(distribution_map.keys())[-1] != "Control":
        raise Exception("Last group should be Control")

    if len(salt) != 8:
        raise Exception("Incorrect salt")

    # Хэш по соль экспа + split_id (id пользователя или девайса)
    murmur_hash: int = mmh3.hash(f"{salt}{split_id}", 1, False)
    reminder = murmur_hash % 100
    s = 0
    for k, v in distribution_map.items():
        s += v
        if reminder < s:
            return k

### Parameters

Bad Bids Experiment Specific Parameters

In [4]:
t = 244
alpha = 0.125

Mutable

In [5]:
EXP_ID = 2640
USER_NAME = 'nusuev_ab'+str(EXP_ID)

ORDER_TYPE = 'auto_econom'
ORDER_TYPE_ID = 1

DAYS_BEFORE = 15

Immutable

In [6]:
DATA_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}')
if not DATA_ROOT_PATH.exists():
    DATA_ROOT_PATH.mkdir(parents=True, exist_ok=True)
    

PLOT_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}/plots')
if not PLOT_ROOT_PATH.exists():
    PLOT_ROOT_PATH.mkdir(parents=True, exist_ok=True)

### __Experiment Data__

#### Download

In [7]:
df_exp = download_experiment_data(exp_id=EXP_ID, user_name=USER_NAME)

# df_exp['hour'] = df_exp['switch_start_dttm'].dt.hour
# df_exp['hour'] = df_exp['hour'].astype('category')
# df_exp['weekday_name'] = df_exp['switch_start_dttm'].dt.day_name()
# df_exp['weekday_name'] = df_exp['weekday_name'].astype('category')

df_exp.to_parquet(DATA_ROOT_PATH / 'df_exp.pqt')

EXP_START_DATE = df_exp.utc_start_dttm.dt.date.astype('str').iloc[0]
EXP_STOP_DATE = df_exp.utc_finish_dttm.dt.date.astype('str').iloc[0]
BEFORE_START_DATE = (df_exp.utc_start_dttm.dt.date - timedelta(days=DAYS_BEFORE)).astype('str').iloc[0]
CITY_ID = df_exp.city_id.iloc[0]
# EXP_NAME = df_exp.exp_name.iloc[0]

EXP_SALT = df_exp['exp_salt'].iloc[0]
DISTRIBUTION_MAP = {"GroupA": 50, "Control": 50}

print(
    f"""
    before_start_date: {BEFORE_START_DATE}
    exp_start_date: {EXP_START_DATE}
    exp_stop_date: {EXP_STOP_DATE}
    city_id: {CITY_ID}
    """
)


    before_start_date: 2025-02-13
    exp_start_date: 2025-02-28
    exp_stop_date: 2025-03-28
    city_id: 4196
    


#### Check Validity

Switch Splitting. Total.

In [8]:
df_exp

Unnamed: 0,exp_id,city_id,multiple_cities,status,utc_start_dttm,utc_finish_dttm,exp_salt,conditions,utc_start_dttm_unix
0,2640,4196,False,completed,2025-02-28 13:15:00+00:00,2025-03-28 13:15:00+00:00,G82FXbXL,"{""and"": [{""or"": [{""=="": [{""var"": ""order_type_i...",1740748500


In [9]:
df_exp.groupby('group_name').size()

KeyError: 'group_name'

### __Metrics Data__

#### Recprice

In [10]:
df_recprice = download_recprice_data(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    user_name=USER_NAME,
    # printBool=False
)

# Применяем функцию group_name к каждой строке, используя user_id как split_id
df_recprice['recprice_group_name'] = df_recprice.apply(
    lambda row: 'Before' if row['utc_recprice_dttm'] < pd.Timestamp(EXP_START_DATE, tz='UTC') 
    else group_name(row['user_id'], EXP_SALT, DISTRIBUTION_MAP),
    axis=1
)

df_recprice.to_parquet(DATA_ROOT_PATH / 'df_recprice.pqt')

In [11]:
df_recprice = pd.read_parquet(DATA_ROOT_PATH / 'df_recprice.pqt')
df_recprice_prepared = prepare_recprice_data(df_recprice)
df_recprice_prepared.to_parquet(DATA_ROOT_PATH / 'df_recprice_prepared.pqt')

#### Bids

In [12]:
df_bids = download_bid_data(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    user_name=USER_NAME,
    # printBool=False
)

# Применяем функцию group_name к каждой строке, используя user_id как split_id
df_bids['bid_group_name'] = df_bids.apply(
    lambda row: 'Before' if row['utc_order_dttm'] < pd.Timestamp(EXP_START_DATE, tz='UTC') 
    else group_name(row['user_id'], EXP_SALT, DISTRIBUTION_MAP),
    axis=1
)

df_bids.to_parquet(DATA_ROOT_PATH / 'df_bids.pqt')

In [13]:
df_bids = pd.read_parquet(DATA_ROOT_PATH / 'df_bids.pqt')
df_bids_prepared = prepare_bid_data(df_bids)
df_bids_prepared.to_parquet(DATA_ROOT_PATH / 'df_bids_prepared.pqt')

#### Orders (with recprice)

In [14]:
df_orders = download_order_data(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    user_name=USER_NAME,
    # printBool=False
)

# Применяем функцию group_name к каждой строке, используя user_id как split_id
df_orders['order_group_name'] = df_orders.apply(
    lambda row: 'Before' if row['utc_order_dttm'] < pd.Timestamp(EXP_START_DATE, tz='UTC') 
    else group_name(row['user_id'], EXP_SALT, DISTRIBUTION_MAP),
    axis=1
)

df_orders.to_parquet(DATA_ROOT_PATH / 'df_orders.pqt')

In [15]:
df_orders = pd.read_parquet(DATA_ROOT_PATH / 'df_orders.pqt')
df_orders_prepared = prepare_order_data(df_orders)
df_orders_prepared.to_parquet(DATA_ROOT_PATH / 'df_orders_prepared.pqt')

In [16]:
df_orders_with_recprice = get_orders_with_recprice_df(df_orders_prepared, df_recprice_prepared)
df_orders_with_recprice['group_name'] = df_orders_with_recprice['recprice_group_name']
df_orders_with_recprice.to_parquet(DATA_ROOT_PATH / 'df_orders_with_recprice.pqt')

только уникальные ордера? – True
доля оставшихся ордеров: 0.9697


### __Results__

#### Get Data

In [17]:
try:
    del df_recprice_prepared, df_orders_with_recprice, df_bids_prepared
    del df_recprice, df_orders, df_bids
except:
    pass

df_recprice_prepared = pd.read_parquet(DATA_ROOT_PATH / 'df_recprice_prepared.pqt')
df_orders_with_recprice = pd.read_parquet(DATA_ROOT_PATH / 'df_orders_with_recprice.pqt')
df_bids_prepared = pd.read_parquet(DATA_ROOT_PATH / 'df_bids_prepared.pqt')

#### Total Results

In [18]:
pd.reset_option('display.max_rows')

df_metrics_total = calculate_absolute_metrics(
    df_recprice_prepared,
    df_orders_with_recprice,
    df_bids_prepared,
    group_cols=['group_name', 'order_uuid'],
)

metrics_total_tbl = get_AB_results(df_metrics_total, alpha=0.05)[
    ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
]

metrics_total_tbl.head()

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant
0,drivers_per_order,2.158959,2.156357,-0.001205,0.503646,False
1,bids_per_order,2.513711,2.517022,0.001317,0.547782,False
2,bids_per_driver,1.164317,1.167256,0.002525,0.007658,True
3,bids_per_order_with_bid,2.772797,2.78024,0.002684,0.194284,False
4,bids_per_done_order,2.645025,2.648241,0.001216,0.604742,False


In [24]:
condition_on_recprice = df_recprice_prepared.order_type_id == ORDER_TYPE_ID
condition_on_orders = df_orders_with_recprice.order_type == ORDER_TYPE
condition_on_bids = df_bids_prepared.order_type == ORDER_TYPE

pd.reset_option('display.max_rows')

df_metrics_total = calculate_absolute_metrics(
    df_recprice_prepared[condition_on_recprice].copy(),
    df_orders_with_recprice[condition_on_orders].copy(),
    df_bids_prepared[condition_on_bids].copy(),
    group_cols=['group_name', 'order_uuid'],
)

metrics_total_tbl = get_AB_results(df_metrics_total, alpha=0.05)[
    ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
]

metrics_total_tbl.head()

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant
0,drivers_per_order,2.173707,2.171585,-0.000976,0.611214,False
1,bids_per_order,2.540495,2.543543,0.0012,0.609009,False
2,bids_per_driver,1.168738,1.171284,0.002178,0.031988,True
3,bids_per_order_with_bid,2.78751,2.796225,0.003127,0.159371,False
4,bids_per_done_order,2.65299,2.658433,0.002052,0.415397,False


In [None]:
plot_normalized_distributions(df_metrics_grouped, 
                              plot_root_path=PLOT_ROOT_PATH,
                              is_before=True,
                              is_show=True,)

In [None]:
condition_on_recprice = df_recprice_prepared.order_type_id == ORDER_TYPE_ID
condition_on_orders = df_orders_with_recprice.order_type == ORDER_TYPE
condition_on_bids = df_bids_prepared.order_type == ORDER_TYPE

# Calculate Absolute Metrics
df_metrics_grouped = calculate_absolute_metrics(
    df_recprice_prepared[condition_on_recprice].copy(),
    df_orders_with_recprice[condition_on_orders].copy(),
    df_bids_prepared[condition_on_bids].copy(),
    group_cols=GROUP_COLS,
)

# Calculate Ratio Metrics
df_metrics_grouped = calculate_ratio_metrics(df_metrics_grouped)

In [None]:
plot_normalized_distributions(df_metrics_grouped, 
                              plot_root_path=PLOT_ROOT_PATH,
                              is_before=True,
                              is_show=True,)

In [None]:
condition_on_recprice = df_recprice_prepared.order_type_id == ORDER_TYPE_ID
condition_on_orders = df_orders_with_recprice.order_type == ORDER_TYPE
condition_on_bids = df_bids_prepared.order_type == ORDER_TYPE

# Calculate Absolute Metrics
df_metrics_grouped = calculate_absolute_metrics(
    df_recprice_prepared[condition_on_recprice].copy(),
    df_orders_with_recprice[condition_on_orders].copy(),
    df_bids_prepared[condition_on_bids].copy(),
    group_cols=GROUP_COLS,
)

# Calculate Ratio Metrics
df_metrics_grouped = calculate_ratio_metrics(df_metrics_grouped)

In [None]:
plot_normalized_distributions(df_metrics_grouped, 
                              plot_root_path=PLOT_ROOT_PATH,
                              is_before=True,
                              is_show=True,)

#### Results by Segment

In [19]:
condition_on_recprice = df_recprice_prepared.order_type_id == ORDER_TYPE_ID
condition_on_orders = df_orders_with_recprice.order_type == ORDER_TYPE
condition_on_bids = df_bids_prepared.order_type == ORDER_TYPE

In [None]:
pd.set_option('display.max_rows', None)

df_metrics_total = calculate_absolute_metrics(
    df_recprice_prepared[condition_on_recprice].copy(),
    df_orders_with_recprice[condition_on_orders].copy(),
    df_bids_prepared[condition_on_bids].copy(),
    group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
)

metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05)[
    ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
]

metrics_total_tbl.to_csv(DATA_ROOT_PATH / 'metrics_total_tbl.csv', index=False)
print(f"""csv file saved to {DATA_ROOT_PATH / 'metrics_total_tbl.csv'}""")
metrics_total_tbl

#### __Plots__

##### Heatmap of the Algorithm

Prepare Data

In [None]:
df_bids_filtered = df_bids_prepared.dropna(
    subset=['eta', 'duration_in_min', 'price_highrate_value', 'price_start_value', 'available_prices_currency']
)
df_bids_filtered = add_algo_name_new(df_bids_filtered, t, alpha)

Check Validity

In [None]:
# Создаем три разных подсчета
total_counts = df_bids_filtered.groupby('group_name').size()
mph_counts = df_bids_filtered[df_bids_filtered['bidding_algorithm_name'] == 'bid_mph'].groupby('group_name').size()
algo_mph_counts = df_bids_filtered[df_bids_filtered['algo_name_new'] == 'algo_bid_mph'].groupby('group_name').size()

# Объединяем результаты в один датафрейм
result = pd.DataFrame({
    'Всего строк': total_counts,
    'Строк bid_mph': mph_counts,
    'Строк algo_bid_mph': algo_mph_counts
})

# Заполняем NaN нулями, если какие-то группы не имеют значений
result = result.fillna(0).astype(int)

# Сортируем по общему количеству строк
result = result.sort_values('Всего строк', ascending=False)

print("Распределение по group_name:")
print(result)

# Добавим итоговую строку
print("\nИтого:")
print(result.sum())

del total_counts, mph_counts, algo_mph_counts, result

Heatmap

In [None]:
condition_on_bids = df_bids_filtered.order_type == ORDER_TYPE
plot_algo_heatmap(df_bids_filtered[condition_on_bids].copy(), min_samples=20)  # показать только бины с 20+ записями

##### Time Series

Get Data

In [None]:
GROUP_COLS = ['group_name', 'time']

In [None]:
condition_on_recprice = df_recprice_prepared.order_type_id == ORDER_TYPE_ID
condition_on_orders = df_orders_with_recprice.order_type == ORDER_TYPE
condition_on_bids = df_bids_prepared.order_type == ORDER_TYPE

# Calculate Absolute Metrics
df_metrics_grouped = calculate_absolute_metrics(
    df_recprice_prepared[condition_on_recprice].copy(),
    df_orders_with_recprice[condition_on_orders].copy(),
    df_bids_prepared[condition_on_bids].copy(),
    group_cols=GROUP_COLS,
)

# Calculate Ratio Metrics
df_metrics_grouped = calculate_ratio_metrics(df_metrics_grouped)

Conversions

In [None]:
plot_conversions_by_time(
    df_metrics_grouped,
    grouped_column='time',
    plot_root_path=PLOT_ROOT_PATH,
    is_before=True,
    is_show=True,
)

Prices

In [None]:
plot_prices_by_time(
    df_metrics_grouped,
    grouped_column='time',
    plot_root_path=PLOT_ROOT_PATH,
    is_before=True,
    is_show=True,
)

Times

In [None]:
plot_times_by_time(
    df_metrics_grouped,
    grouped_column='time',
    plot_root_path=PLOT_ROOT_PATH,
    is_before=True,
    is_show=True
)

#### Results by Duration

In [None]:
print(f"""Included ETA: {t}sec""")

In [31]:
condition_on_recprice = (df_recprice_prepared.order_type_id == ORDER_TYPE_ID) & (df_recprice_prepared.log_duration_in_min <= t/60)
condition_on_orders = (df_orders_with_recprice.order_type == ORDER_TYPE) & (df_orders_with_recprice.log_duration_in_min <= t/60)
condition_on_bids = (df_bids_prepared.order_type == ORDER_TYPE) & (df_bids_prepared.duration_in_min <= t/60)

In [None]:
pd.set_option('display.max_rows', None)

df_metrics_total = calculate_absolute_metrics(
    df_recprice_prepared[condition_on_recprice].copy(),
    df_orders_with_recprice[condition_on_orders].copy(),
    df_bids_prepared[condition_on_bids].copy(),
    group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
)

metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05)[
    ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
]

metrics_total_tbl.to_csv(DATA_ROOT_PATH / 'metrics_total_tbl.csv', index=False)
print(f"""csv file saved to {DATA_ROOT_PATH / 'metrics_total_tbl.csv'}""")
metrics_total_tbl

#### __Plots__

##### Time Series

Get Data

In [37]:
GROUP_COLS = ['group_name', 'time']

In [38]:
condition_on_recprice = (df_recprice_prepared.order_type_id == ORDER_TYPE_ID) & (df_recprice_prepared.log_duration_in_min <= t/60)
condition_on_orders = (df_orders_with_recprice.order_type == ORDER_TYPE) & (df_orders_with_recprice.log_duration_in_min <= t/60)
condition_on_bids = (df_bids_prepared.order_type == ORDER_TYPE) & (df_bids_prepared.duration_in_min <= t/60)

# Calculate Absolute Metrics
df_metrics_grouped = calculate_absolute_metrics(
    df_recprice_prepared[condition_on_recprice].copy(),
    df_orders_with_recprice[condition_on_orders].copy(),
    df_bids_prepared[condition_on_bids].copy(),
    group_cols=GROUP_COLS,
)

# Calculate Ratio Metrics
df_metrics_grouped = calculate_ratio_metrics(df_metrics_grouped)

Conversions

In [None]:
plot_conversions_by_time(
    df_metrics_grouped,
    grouped_column='time',
    plot_root_path=PLOT_ROOT_PATH,
    is_before=True,
    is_show=True,
)

Prices

In [None]:
plot_prices_by_time(
    df_metrics_grouped,
    grouped_column='time',
    plot_root_path=PLOT_ROOT_PATH,
    is_before=True,
    is_show=True,
)

Times

In [None]:
plot_times_by_time(
    df_metrics_grouped,
    grouped_column='time',
    plot_root_path=PLOT_ROOT_PATH,
    is_before=True,
    is_show=True
)

# Foot note