In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `src` package used below) are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pathlib
import warnings
from datetime import timedelta
from pathlib import Path
from typing import Dict

# Kept ahead of the third-party imports so warnings raised while importing
# them are silenced as well.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import seaborn as sns
from plotly.subplots import make_subplots

from src.download import download_experiment_data, download_recprice_data, download_order_data, download_bid_data
from src.metrics import calculate_absolute_metrics, get_switchback_results, calculate_ratio_metrics
from src.prepare import prepare_recprice_data, prepare_order_data, prepare_bid_data, get_orders_with_recprice_df, get_hex
from src.visualization import plot_switches_matrix, plot_conversions_by_time, plot_prices_by_time
from src.visualization import plot_metric_by_time, plot_metric_by_hex

ImportError: cannot import name 'prepare_bid_data' from 'src.prepare' (/Users/georgiinusuev/PycharmProjects/work/badbids/exp_anal/AB/src/prepare.py)

## SB Results: Bad Bids

- link

### Костыли

In [None]:
def group_name(split_id: str, salt: str, distribution_map: Dict[str, int]):
    """Deterministically assign ``split_id`` to an experiment group.

    The salt and the split id are concatenated, hashed with ``mmh3.hash``
    (seed 1, unsigned result) and reduced modulo 100. The resulting bucket is
    matched against the cumulative percentages of ``distribution_map`` in
    insertion order.

    Args:
        split_id: Id of the user or device being split. It is formatted into
            the hashed string with an f-string, so non-``str`` ids work too.
        salt: Random 8-character string stored in ``experiment.salt``.
        distribution_map: Mapping such as ``{"GroupA": 50, "Control": 50}``.
            The values must sum to 100 and the order matters: ``"Control"``
            must always be the last key.

    Returns:
        The key of ``distribution_map`` whose cumulative range contains the
        hash bucket.

    Raises:
        ValueError: If the percentages do not sum to 100, the last group is
            not ``"Control"``, or the salt is not 8 characters long.
            (``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` are unaffected.)
    """
    if sum(distribution_map.values()) != 100:
        raise ValueError("Incorrect distribution")

    if list(distribution_map.keys())[-1] != "Control":
        raise ValueError("Last group should be Control")

    if len(salt) != 8:
        raise ValueError("Incorrect salt")

    # Imported here because the notebook's import cell does not import mmh3;
    # repeated imports are served from sys.modules, so this is cheap per call.
    import mmh3

    # Hash of experiment salt + split_id (user or device id).
    # Positional arguments are seed=1 and signed=False.
    murmur_hash: int = mmh3.hash(f"{salt}{split_id}", 1, False)
    remainder = murmur_hash % 100

    # Walk the cumulative distribution; the first group whose upper bound
    # exceeds the bucket wins.
    cumulative = 0
    for name, share in distribution_map.items():
        cumulative += share
        if remainder < cumulative:
            return name

### Parameters

__Mutable__

In [3]:
# Experiment under analysis; USER_NAME is passed as `user_name` to the
# download helpers below.
EXP_ID = 2640
USER_NAME = f'nusuev_ab{EXP_ID}'

# Order type used to filter the "By segment" results (by id for recprice
# rows, by name for orders and bids).
ORDER_TYPE_ID = 1
ORDER_TYPE = 'auto_econom'

# Number of days subtracted from the experiment start date to obtain
# BEFORE_START_DATE.
DAYS_BEFORE = 0

__Immutable__

In [4]:
# Per-experiment cache directory for downloaded and prepared parquet files.
DATA_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}')
# Plots live under the data root; deriving the path keeps the two in sync.
PLOT_ROOT_PATH = DATA_ROOT_PATH / 'plots'

# `exist_ok=True` already makes mkdir a no-op for existing directories, so no
# separate exists() check is needed. Unlike that check, mkdir raises
# FileExistsError if the path exists but is not a directory.
DATA_ROOT_PATH.mkdir(parents=True, exist_ok=True)
PLOT_ROOT_PATH.mkdir(parents=True, exist_ok=True)

### Experiment Data

__Download__

In [5]:
# Download the experiment description / switch schedule.
df_exp = download_experiment_data(exp_id=EXP_ID, user_name=USER_NAME)

# Calendar features of each switch start, stored as categoricals.
df_exp['hour'] = df_exp['switch_start_dttm'].dt.hour.astype('category')
df_exp['weekday_name'] = df_exp['switch_start_dttm'].dt.day_name().astype('category')

df_exp.to_parquet(DATA_ROOT_PATH / 'df_exp.pqt')

# Experiment-level constants are read from the first row of df_exp.
# Dates are kept as 'YYYY-MM-DD' strings.
EXP_NAME = df_exp['exp_name'].iloc[0]
CITY_ID = df_exp['city_id'].iloc[0]
EXP_SALT = df_exp['exp_salt'].iloc[0]

EXP_START_DATE = df_exp['utc_start_dttm'].dt.date.astype('str').iloc[0]
EXP_STOP_DATE = df_exp['utc_finish_dttm'].dt.date.astype('str').iloc[0]
BEFORE_START_DATE = (
    (df_exp['utc_start_dttm'].dt.date - timedelta(days=DAYS_BEFORE))
    .astype('str')
    .iloc[0]
)

# Group shares in percent; group_name() requires "Control" to be the last key.
DISTRIBUTION_MAP = {"GroupA": 50, "Control": 50}

print(
    f"""
    before_start_date: {BEFORE_START_DATE}
    exp_start_date: {EXP_START_DATE}
    exp_stop_date: {EXP_STOP_DATE}
    city_id: {CITY_ID}
    exp_name: {EXP_NAME}
    exp_salt: {EXP_SALT}
    distribution_map: {DISTRIBUTION_MAP}
    """
)


    before_start_date: 2025-02-28
    exp_start_date: 2025-02-28
    exp_stop_date: 2025-03-28
    city_id: 4180
    exp_name: (re-) [4180, Veracruz] Bad Bids v0
    


__Validity__

### Metrics Data

__Recprice__

In [9]:
df_recprice = download_recprice_data(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    user_name=USER_NAME,
    printBool=True
)

# Label each recprice row: 'Before' when it precedes the experiment start,
# otherwise the split group from group_name(), using user_id as split_id.
# The start timestamp is parsed once and compared column-wise instead of
# inside a row-wise DataFrame.apply, which rebuilt it (and a Series) per row.
# NOTE(review): the boundary is midnight UTC of EXP_START_DATE, not the exact
# utc_start_dttm - confirm this is intended.
exp_start_ts = pd.Timestamp(EXP_START_DATE, tz='UTC')
is_before = (df_recprice['utc_recprice_dttm'] < exp_start_ts).to_numpy()
df_recprice['recprice_group_name'] = [
    'Before' if before else group_name(user_id, EXP_SALT, DISTRIBUTION_MAP)
    for before, user_id in zip(is_before, df_recprice['user_id'])
]

df_recprice.to_parquet(DATA_ROOT_PATH / 'df_recprice.pqt')


    WITH
    recprice_tbl AS (
        SELECT *
        FROM (
            SELECT DISTINCT
                t1.city_id                                            AS city_id,
                t1.order_type_name                                    AS order_type,
                t1.order_type_id                                      AS order_type_id,
                t1.id                                                 AS calcprice_uuid,
                t1.user_id                                            AS user_id,
                TIMESTAMP(DATETIME(t1.calculation_dttm, t2.timezone)) AS local_recprice_dttm,
                t1.calculation_dttm                                   AS utc_recprice_dttm,
                t1.base_price / t3.usd_value                          AS price_base_usd,
                t1.price / t3.usd_value                               AS recprice_usd,
                t1.min_price / t3.usd_value                           AS minprice_usd,
                t1.surge         

In [10]:
# Re-read the cached raw recprice frame so this cell can run without
# repeating the download, then prepare it and cache the result.
df_recprice = pd.read_parquet(DATA_ROOT_PATH.joinpath('df_recprice.pqt'))
df_recprice_prepared = df_recprice.pipe(prepare_recprice_data)
df_recprice_prepared.to_parquet(DATA_ROOT_PATH.joinpath('df_recprice_prepared.pqt'))

__Bids__

In [11]:
# Download raw bids for the same city and date window as the recprice data,
# then cache them. With printBool=True the cell output shows the SQL text
# (presumably the flag echoes the query - confirm in src.download).
df_bids = download_bid_data(
    city_id=CITY_ID,
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    user_name=USER_NAME,
    printBool=True,
)
df_bids.to_parquet(DATA_ROOT_PATH.joinpath('df_bids.pqt'))


    WITH
    details_prepare AS (
        SELECT *
        FROM (
            SELECT
                city_id                                                                     AS city_id,
                order_type                                                                  AS order_type,
                order_uuid                                                                  AS order_uuid,
                tender_uuid                                                                 AS tender_uuid,
                user_id                                                                     AS user_id,
                order_timestamp                                                             AS local_order_dttm,
                TIMESTAMP(FORMAT_TIMESTAMP('%Y-%m-%d %H:%M:%S', order_timestamp), timezone) AS utc_order_dttm,
                price_highrate_usd                                                          AS price_highrate_usd,
                price_start_usd              

In [12]:
# Re-read the cached raw bids, prepare them and cache the prepared frame.
df_bids = pd.read_parquet(DATA_ROOT_PATH.joinpath('df_bids.pqt'))
df_bids_prepared = df_bids.pipe(prepare_bid_data)
df_bids_prepared.to_parquet(DATA_ROOT_PATH.joinpath('df_bids_prepared.pqt'))

__Orders (with recprice)__

In [13]:
df_orders = download_order_data(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    user_name=USER_NAME,
    printBool=False
)

# Label each order: 'Before' when it precedes the experiment start, otherwise
# the split group from group_name(), using user_id as split_id.
# The start timestamp is parsed once and compared column-wise instead of
# inside a row-wise DataFrame.apply, which rebuilt it (and a Series) per row.
# NOTE(review): the boundary is midnight UTC of EXP_START_DATE, not the exact
# utc_start_dttm - confirm this is intended.
exp_start_ts = pd.Timestamp(EXP_START_DATE, tz='UTC')
is_before = (df_orders['utc_order_dttm'] < exp_start_ts).to_numpy()
df_orders['order_group_name'] = [
    'Before' if before else group_name(user_id, EXP_SALT, DISTRIBUTION_MAP)
    for before, user_id in zip(is_before, df_orders['user_id'])
]

df_orders.to_parquet(DATA_ROOT_PATH / 'df_orders.pqt')

In [14]:
# Re-read the cached raw orders, prepare them and cache the prepared frame.
df_orders = pd.read_parquet(DATA_ROOT_PATH.joinpath('df_orders.pqt'))
df_orders_prepared = df_orders.pipe(prepare_order_data)
df_orders_prepared.to_parquet(DATA_ROOT_PATH.joinpath('df_orders_prepared.pqt'))

In [15]:
# Combine prepared orders with prepared recprice rows; the experiment group of
# the combined frame is taken from the recprice side (`recprice_group_name`).
df_orders_with_recprice = get_orders_with_recprice_df(
    df_orders_prepared,
    df_recprice_prepared,
).assign(group_name=lambda merged: merged['recprice_group_name'])
df_orders_with_recprice.to_parquet(DATA_ROOT_PATH.joinpath('df_orders_with_recprice.pqt'))

только уникальные ордера? – True
доля оставшихся ордеров: 0.9764


### Results

In [16]:
# Disabled shortcut: uncommenting these lines would point EXP_ID and
# DATA_ROOT_PATH at experiment 2574 and load its cached prepared frames
# instead of the ones produced above.
# TODO(review): delete this cell or turn it into a documented parameter.
# EXP_ID = 2574
# DATA_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}')

# df_recprice_prepared = pd.read_parquet(DATA_ROOT_PATH / 'df_recprice_prepared.pqt')
# df_orders_with_recprice = pd.read_parquet(DATA_ROOT_PATH / 'df_orders_with_recprice.pqt')

In [17]:
# Remove the row limit so result tables are displayed in full.
pd.options.display.max_rows = None
# To restore the default: pd.reset_option('display.max_rows')

In [18]:
# Reload every prepared frame used by the Results section from the parquet
# cache, so the section can run on a fresh kernel without re-downloading.
# df_bids_prepared is included because calculate_absolute_metrics below takes
# it; previously it was only available as leftover kernel state.
df_recprice_prepared = pd.read_parquet(DATA_ROOT_PATH / 'df_recprice_prepared.pqt')
df_orders_with_recprice = pd.read_parquet(DATA_ROOT_PATH / 'df_orders_with_recprice.pqt')
df_bids_prepared = pd.read_parquet(DATA_ROOT_PATH / 'df_bids_prepared.pqt')

__Total__

In [19]:
# Metrics over all order types, computed per group and switch window.
df_metrics_total = calculate_absolute_metrics(
    df_recprice_prepared, df_orders_with_recprice, df_bids_prepared,
    group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
)

# Switchback comparison at alpha=0.05, keeping only the reporting columns.
metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05).loc[
    :,
    ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant'],
]

metrics_total_tbl.head()

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant
0,cp2order,0.52989,0.526056,-0.007235,0.3017821,False
1,order2bid,0.808849,0.786727,-0.02735,1.314748e-07,True
2,order2start_price_bid,0.501349,0.490406,-0.021827,0.003015706,True
3,order2accept,0.675857,0.678282,0.003589,0.576055,False
4,order2done,0.621156,0.623055,0.003057,0.6386437,False


__By segment__

In [31]:
# Row filters for the configured order type: recprice rows are matched by id,
# orders and bids by name.
condition_on_recprice = df_recprice_prepared['order_type_id'] == ORDER_TYPE_ID
condition_on_orders = df_orders_with_recprice['order_type'] == ORDER_TYPE
condition_on_bids = df_bids_prepared['order_type'] == ORDER_TYPE

# Same computation as the "Total" cell, on copies of the filtered frames.
# NOTE(review): this reuses (and overwrites) the names df_metrics_total and
# metrics_total_tbl from the "Total" cell.
df_metrics_total = calculate_absolute_metrics(
    df_recprice_prepared.loc[condition_on_recprice].copy(),
    df_orders_with_recprice.loc[condition_on_orders].copy(),
    df_bids_prepared.loc[condition_on_bids].copy(),
    group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
)

# Switchback comparison at alpha=0.05, keeping only the reporting columns.
metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05).loc[
    :,
    ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant'],
]

metrics_total_tbl

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant
0,cp2order,0.580886,0.578977,-0.003285,0.3897196,False
1,cp2bid,0.292237,0.284917,-0.025046,0.0009855999,True
2,cp2start_price_bid,0.292237,0.284917,-0.025046,0.0009855999,True
3,cp2accept,0.394283,0.394455,0.000437,0.9481515,False
4,cp2done,0.362401,0.362365,-9.8e-05,0.9885232,False
5,drivers_per_order,0.926124,0.885638,-0.043715,1.681477e-06,True
6,bids_per_order,1.745393,1.48087,-0.151555,1.1758579999999998e-50,True
7,bids_per_driver,1.884622,1.672094,-0.11277,1.077098e-277,True
8,bids_per_order_with_bid,2.150611,1.875856,-0.127757,1.9619029999999998e-100,True
9,bids_per_done_order,2.023008,1.761413,-0.12931,1.164887e-102,True
