In [8]:
%load_ext autoreload
%autoreload 2

import pathlib
import warnings
warnings.filterwarnings("ignore")

import mmh3
from typing import Dict
from datetime import timedelta
import pandas as pd

from src.download import download_experiment_data, download_recprice_data, download_order_data
from src.metrics import calculate_absolute_metrics, get_switchback_results, calculate_ratio_metrics
from src.prepare import prepare_recprice_data, prepare_order_data, get_orders_with_recprice_df, get_hex

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# salt - рандомная строка длиной 8 хранится в experiment.salt
# distribution_map - словарь вида {"GroupA": 50, "Control": 50}, где сумма значений должна быть 100
#  Порядок важен! Control всегда последний
def group_name(split_id: str, salt: str, distribution_map: Dict[str, int]):
    if sum(distribution_map.values()) != 100:
        raise Exception("Incorrect distribution")

    if list(distribution_map.keys())[-1] != "Control":
        raise Exception("Last group should be Control")

    if len(salt) != 8:
        raise Exception("Incorrect salt")

    # Хэш по соль экспа + split_id (id пользователя или девайса)
    murmur_hash: int = mmh3.hash(f"{salt}{split_id}", 1, False)
    reminder = murmur_hash % 100
    s = 0
    for k, v in distribution_map.items():
        s += v
        if reminder < s:
            return k

In [10]:
USER_NAME = 'nusuev_ab2608'
EXP_ID = 2608
ORDER_TYPE = 'auto_econom'
ORDER_TYPE_ID = 1
DAYS_BEFORE = 0

In [11]:
DATA_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}')
if not DATA_ROOT_PATH.exists():
    DATA_ROOT_PATH.mkdir(parents=True, exist_ok=True)
    

PLOT_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}/plots')
if not PLOT_ROOT_PATH.exists():
    PLOT_ROOT_PATH.mkdir(parents=True, exist_ok=True)

In [12]:
df_exp = download_experiment_data(EXP_ID, USER_NAME)
df_exp['city_id'] = 4269
df_exp

Unnamed: 0,exp_id,city_id,multiple_cities,status,utc_start_dttm,utc_finish_dttm,exp_salt,conditions,utc_start_dttm_unix
0,2608,4269,True,completed,2025-02-21 17:00:00+00:00,2025-03-14 17:00:00+00:00,XGm1V6Hj,"{""and"": [{""or"": [{""version_greater_or_equal"": ...",1740157200


In [14]:
EXP_START_DATE = df_exp.utc_start_dttm.dt.date.astype('str').iloc[0]
EXP_STOP_DATE = df_exp.utc_finish_dttm.dt.date.astype('str').iloc[0]
BEFORE_START_DATE = (df_exp.utc_start_dttm.dt.date - timedelta(days=DAYS_BEFORE)).astype('str').iloc[0]
CITY_ID = df_exp.city_id.iloc[0]
# EXP_NAME = df_exp.exp_na me.iloc[0]

exp_salt = df_exp['exp_salt'].iloc[0]
distribution_map = {"GroupA": 50, "Control": 50}

# Скачиваем данные по рекомендациям
df_recprice = download_recprice_data(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    user_name=USER_NAME,
)

# Применяем функцию group_name к каждой строке, используя user_id как split_id
df_recprice['recprice_group_name'] = df_recprice.apply(
    lambda row: 'Before' if row['utc_recprice_dttm'] < pd.Timestamp(EXP_START_DATE, tz='UTC') else group_name(row['user_id'], exp_salt, distribution_map),
    axis=1
)

df_recprice.to_parquet(DATA_ROOT_PATH / 'df_recprice.pqt')
df_recprice_prepared = prepare_recprice_data(df_recprice)
df_recprice_prepared.to_parquet(DATA_ROOT_PATH / 'df_recprice_prepared.pqt')

# Скачиваем данные по заказам
df_orders = download_order_data(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    user_name=USER_NAME,
)

# Применяем функцию group_name к каждой строке, используя user_id как split_id
df_orders['order_group_name'] = df_orders.apply(
    lambda row: 'Before' if row['utc_order_dttm'] < pd.Timestamp(EXP_START_DATE, tz='UTC') else group_name(row['user_id'], exp_salt, distribution_map),
    axis=1
)

df_orders.to_parquet(DATA_ROOT_PATH / 'df_orders.pqt')
df_orders_prepared = prepare_order_data(df_orders)
df_orders_prepared.to_parquet(DATA_ROOT_PATH / 'df_orders_prepared.pqt')

df_orders_with_recprice = get_orders_with_recprice_df(df_orders_prepared, df_recprice_prepared)
df_orders_with_recprice['group_name'] = df_orders_with_recprice['recprice_group_name']
df_orders_with_recprice.to_parquet(DATA_ROOT_PATH / 'df_orders_with_recprice.pqt')



только уникальные ордера? – True
доля оставшихся ордеров: 0.9626


In [15]:
condition_on_recprice = df_recprice_prepared.order_type_id == ORDER_TYPE_ID
condition_on_orders = df_orders_with_recprice.order_type == ORDER_TYPE

df_metrics_total = calculate_absolute_metrics(
    df_recprice_prepared[condition_on_recprice].copy(),
    df_orders_with_recprice[condition_on_orders].copy(),
    group_cols=['user_id', 'group_name'],
    # group_cols=['order_uuid', 'group_name'],
)

metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05, groups={"control": "Control", "treatment": "GroupA"})[
    ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
]

metrics_total_tbl

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant
0,cp2order,0.673654,0.669024,-0.006873,0.19305,False
1,order2bid,0.852694,0.854784,0.002451,0.385646,False
2,order2start_price_bid,0.523493,0.524519,0.00196,0.746073,False
3,order2accept,0.698956,0.700015,0.001514,0.747028,False
4,order2done,0.617964,0.620386,0.003919,0.465236,False
5,bid2accept,0.819703,0.818938,-0.000934,0.746334,False
6,bids_per_accepted,1.129754,1.126747,-0.002661,0.19867,False
7,bid_price_avg,2.761878,2.765046,0.001147,0.833348,False
8,bid_per_order,1.014751,1.015143,0.000386,0.914479,False
9,bid2done,0.72472,0.725781,0.001465,0.693452,False


In [18]:
df_metrics_total.groupby('group_name').sum().transpose()

group_name,Control,GroupA
user_id,3666793198458.0,3619487651157.0
calcprices_count,243931.0,245489.0
orders_count,164325.0,164238.0
tenders_count,325143.0,323997.0
orders_with_bids_count,140119.0,140388.0
start_price_bid_orders_count,86023.0,86146.0
start_price_bid_accepted_orders_count,74344.0,74584.0
start_price_bid_rides_count,66123.0,66410.0
accepted_orders_count,114856.0,114969.0
rides_count,101547.0,101891.0


In [None]:
df_metrics_total.groupby('group_name')[['tenders_count', 'rides_count']].sum()

Unnamed: 0_level_0,tenders_count,rides_count
group_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Before,492446,135033
Control,524294,160687
GroupA,528258,160785


In [22]:
df_recprice_prepared[condition_on_recprice]['utc_recprice_dttm'].min()
# df_orders_with_recprice[condition_on_orders]

Timestamp('2025-02-18 00:00:01+0000', tz='UTC')

In [52]:
old_df_recprice_prepared = df_recprice_prepared

In [59]:
df_recprice_prepared[condition_on_recprice]['utc_recprice_dttm'].min()


Timestamp('2025-02-28 00:00:02+0000', tz='UTC')