In [21]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
import warnings
warnings.filterwarnings('ignore')

import pathlib
import pandas as pd
import numpy as np
from datetime import timedelta

import plotly.graph_objects as go
import pandas as pd



from src.download import download_experiment_data, download_recprice_data, download_order_data
from src.prepare import prepare_recprice_data, prepare_order_data, get_full_df, prepare_my
from src.metrics import calculate_metrics, get_switchback_results, get_switchback_results_new

from src.draw import draw_lines

In [23]:
def binned_percentile(df, x, bin_start='bin_start', frequency='frequency'):
    """Estimate a percentile from binned (histogram-like) data by linear interpolation.

    Rows of ``df`` are used in their existing order, so they are expected to be
    sorted by ``bin_start`` ascending. The target bin is the first row whose
    running total of ``frequency`` reaches ``x * total``. The result is
    interpolated between the previous row's ``bin_start`` (0 for the first row)
    and the target row's ``bin_start``, proportionally to how far into the
    target row's frequency the percentile falls.

    Args:
        df: DataFrame with one row per bin. It is not modified.
        x: Percentile expressed as a fraction in [0, 1] (0.5 is the median).
        bin_start: Name of the column holding the bin edge values.
        frequency: Name of the column holding per-bin counts or shares.

    Returns:
        The interpolated percentile value as a float.

    Raises:
        ValueError: If ``x`` is outside [0, 1], ``df`` is empty, or the
            frequencies do not add up to a positive number.
    """
    if not 0 <= x <= 1:
        raise ValueError(f'x must be a fraction in [0, 1], got {x!r}')

    data = df.reset_index(drop=True)
    if data.empty:
        raise ValueError('df must contain at least one bin')

    cumulative = data[frequency].cumsum()

    # Take the total from the running sum itself: a separate .sum() may differ
    # from the last cumulative value by float rounding, which would leave no
    # bin satisfying the comparison below for x == 1.0.
    total_frequency = cumulative.max()
    if not total_frequency > 0:
        raise ValueError('frequencies must add up to a positive number')

    percentile_freq = x * total_frequency

    # Position of the first bin whose cumulative frequency reaches the target.
    reached = (cumulative >= percentile_freq).to_numpy()
    bin_index = int(reached.argmax())

    if bin_index > 0:
        previous_cumulative = cumulative.iloc[bin_index - 1]
        previous_bin_start = data[bin_start].iloc[bin_index - 1]
    else:
        previous_cumulative = 0
        previous_bin_start = 0

    current_bin_start = data[bin_start].iloc[bin_index]
    current_frequency = data[frequency].iloc[bin_index]

    excess = percentile_freq - previous_cumulative
    bin_range = current_bin_start - previous_bin_start

    # A zero-frequency target bin (possible only for x == 0) would give 0/0;
    # stay at the lower edge instead of returning NaN.
    fraction = excess / current_frequency if current_frequency else 0.0

    return float(previous_bin_start + fraction * bin_range)

In [24]:
# Notebook configuration: everything a reader may want to change lives here.
USER_NAME = 'nusuev'  # forwarded as user_name to the src.download helpers

EXP_ID = 2103
DAYS_BEFORE = 0  # days subtracted from the experiment start date for the download window

# Per-experiment directory for downloaded artefacts.
DATA_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}')
# exist_ok=True already makes this a no-op for an existing directory, so no
# separate exists() check is needed (and there is no check-then-create race).
DATA_ROOT_PATH.mkdir(parents=True, exist_ok=True)

In [25]:
# Fetch the experiment data for EXP_ID.
df_exp = download_experiment_data(exp_id=EXP_ID, user_name=USER_NAME)

# Hour of day of each switch start, stored as a categorical column.
df_exp['hour'] = df_exp['switch_start_dttm'].dt.hour.astype('category')

# Keep a local copy next to the other artefacts of this experiment.
df_exp.to_parquet(DATA_ROOT_PATH / 'df_exp.pqt')

In [26]:
# Experiment-level constants are read from the first row of df_exp.
start_dates = df_exp['utc_start_dttm'].dt.date
finish_dates = df_exp['utc_finish_dttm'].dt.date

EXP_START_DATE = str(start_dates.iloc[0])
EXP_STOP_DATE = str(finish_dates.iloc[0])
# Start of the download window: DAYS_BEFORE days ahead of the experiment start.
BEFORE_START_DATE = str(start_dates.iloc[0] - timedelta(days=DAYS_BEFORE))
CITY_ID = df_exp['city_id'].iloc[0]
ORDER_TYPE = df_exp['order_type'].iloc[0]
EXP_NAME = df_exp['exp_name'].iloc[0]

print(
    f'''
    exp_start_date: {EXP_START_DATE}
    exp_stop_date: {EXP_STOP_DATE}
    before_start_date: {BEFORE_START_DATE}
    city_id: {CITY_ID}
    order_type: {ORDER_TYPE}
    exp_name: {EXP_NAME}
    '''
)


    exp_start_date: 2024-11-12
    exp_stop_date: 2024-12-04
    before_start_date: 2024-11-12
    city_id: 4227
    order_type: auto_econom
    exp_name: MEP-1945: Ciudad 2comp surge
    


In [27]:
# Short label used in the plot title: the part of EXP_NAME after the
# "<prefix>:" part and before the '2comp surge' suffix.
# str.partition is used instead of str.find + slicing because find() returns
# -1 when a marker is missing, which made the slice silently drop characters;
# with partition a missing marker simply leaves that side of the name intact.
name_prefix, colon, name_tail = EXP_NAME.partition(':')
label_source = name_tail if colon else EXP_NAME
JPEG_NAME = label_source.partition('2comp surge')[0].strip()

In [28]:
# Recprice data for the experiment's city and order type, from
# BEFORE_START_DATE to EXP_STOP_DATE.
recprice_query = dict(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    order_type=ORDER_TYPE,
    user_name=USER_NAME,
)
df_recprice = download_recprice_data(**recprice_query)

In [29]:
# Order data for the same city, order type and date window as the recprice download.
order_query = dict(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    order_type=ORDER_TYPE,
    user_name=USER_NAME,
)
df_orders = download_order_data(**order_query)

In [30]:
# Prepare the raw downloads and build the joined frame used by the metrics.
df_recprice_prepared = prepare_recprice_data(df_recprice)
df_orders_prepared = prepare_order_data(df_orders)
df_full = get_full_df(df_orders_prepared, df_recprice_prepared)
# The experiment group is taken from the recprice side of the join.
df_full['group_name'] = df_full['recprice_group_name']

report_columns = ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
headline_metrics = ['cp2done', 'order2done', 'price_done_usd']

# Metrics per (group, switch interval), then the switchback comparison at alpha=0.05.
df_metrics_total = calculate_metrics(
    df_recprice_prepared,
    df_orders_prepared,
    df_full,
    group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
)
metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05)[report_columns]

results = [metrics_total_tbl]
df_results = pd.concat(results, ignore_index=True)

# Show only the headline metrics.
df_results.loc[df_results['metric'].isin(headline_metrics)]

только уникальные ордера? – True
доля оставшихся ордеров: 0.9768


Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant
4,order2done,0.417513,0.43093,0.032135,0.067067,False
12,cp2done,0.292752,0.301903,0.031261,0.085485,False
21,price_done_usd,4.267804,4.241501,-0.006163,0.351599,False


In [31]:
bin_size = 300
df_full['orders_distance_bin'] = (df_full['distance'] // bin_size) * bin_size

# Prepare data: mean recprice per (distance bin, group) and the share of
# df_full rows that fall into each distance bin.
grouped = df_full.groupby(['orders_distance_bin', 'group_name'])['recprice'].mean().reset_index()
frequency = df_full['orders_distance_bin'].value_counts(normalize=True).sort_index()
cumulative_frequency = frequency.cumsum()


def _last_bin_within_share(share):
    """Return the last distance bin whose cumulative share is <= ``share``.

    Falls back to the first bin when even that one already exceeds ``share``;
    a plain ``.index[-1]`` on the empty selection would raise IndexError.
    """
    bins_below = cumulative_frequency.index[cumulative_frequency <= share]
    return bins_below[-1] if len(bins_below) else cumulative_frequency.index[0]


# Parameters: axis limits are cut at the bin covering 99% of the rows.
max_bin = _last_bin_within_share(0.99)
recprice_max = grouped[grouped['orders_distance_bin'] == max_bin]['recprice'].max()
freq_max = frequency.max()

# Separate data for 'A' and 'Control'
data_a = grouped[grouped['group_name'] == 'A']
data_control = grouped[grouped['group_name'] == 'Control']

# Merge the data on 'orders_distance_bin' for comparison
merged = pd.merge(data_a, data_control, on='orders_distance_bin', suffixes=('_a', '_control'))

# Calculate the intersection using linear interpolation
merged['diff'] = merged['recprice_a'] - merged['recprice_control']
# A negative product of neighbouring differences means the curves crossed
# between the two bins. The first row never matches (shift() gives NaN there),
# so `idx - 1` below always refers to an existing row.
sign_change = (merged['diff'].shift() * merged['diff']) < 0
intersection_points = merged[sign_change]

# If there's an intersection, use the first one
if not intersection_points.empty:
    # Each curve is treated as a straight segment between the two bins around
    # the sign change; the crossing point solves the two line equations.
    idx = intersection_points.index[0]
    x1, y1_a, y1_control = merged.loc[idx - 1, ['orders_distance_bin', 'recprice_a', 'recprice_control']]
    x2, y2_a, y2_control = merged.loc[idx, ['orders_distance_bin', 'recprice_a', 'recprice_control']]
    slope_a = (y2_a - y1_a) / (x2 - x1)
    slope_control = (y2_control - y1_control) / (x2 - x1)
    intercept_a = y1_a - slope_a * x1
    intercept_control = y1_control - slope_control * x1
    x_intersect = (intercept_control - intercept_a) / (slope_a - slope_control)
    y_intersect = slope_a * x_intersect + intercept_a
else:
    x_intersect, y_intersect = None, None

X = x_intersect

# Create the figure
fig = go.Figure()

# Add a line for each group
for group_name in grouped[~grouped['group_name'].isin(['Before'])]['group_name'].unique():
    group_data = grouped[grouped['group_name'] == group_name]
    fig.add_trace(go.Scatter(
        x=group_data['orders_distance_bin'],
        y=group_data['recprice'],
        mode='lines',
        name=f'{group_name}',
        line=dict(width=2)
    ))

# Add the normalized frequency as a line without showing in the legend
fig.add_trace(go.Scatter(
    x=frequency.index,
    y=frequency.values,
    mode='lines',
    name='Normalized Frequency',  # Not shown in legend
    line=dict(color='gray', dash='dot', width=2),
    yaxis='y2',
    showlegend=False  # Hide from legend
))

# Everything that depends on the intersection is drawn only when one exists;
# comparing the bin index with None would otherwise raise a TypeError.
if x_intersect is not None:
    # Shaded area under the frequency curve up to the intersection; its legend
    # entry reports the share of rows in those bins.
    filtered_frequency = frequency[frequency.index <= X]
    cum_freq_value = filtered_frequency.sum()  # Calculate cumulative frequency in the area
    fig.add_trace(go.Scatter(
        x=list(filtered_frequency.index) + [X],
        y=list(filtered_frequency.values) + [0],
        fill='tozeroy',
        mode='lines',
        line=dict(color='rgba(255, 127, 80, 0.3)', width=0),
        name=f'{cum_freq_value:.0%} of rides',
        yaxis='y2',
        showlegend=True
    ))

    # Highlight the intersection point
    fig.add_trace(go.Scatter(
        x=[x_intersect],
        y=[y_intersect],
        mode='markers+text',
        marker=dict(color='red', size=10),
        text=[f'{x_intersect:.0f} m <br>{y_intersect:.0f} cur'],
        textposition='top right',
        showlegend=False
    ))

# Update layout for dual axes
fig.update_layout(
    title=JPEG_NAME+': Recprice vs Orders Distance',
    xaxis=dict(
        title='Bin, meters',
        range=[0, max_bin]),
    yaxis=dict(
        title='Recprice, currency',
        range=[0, recprice_max]),
    yaxis2=dict(
        title='Normalized Frequency',
        overlaying='y',
        side='right',
        range=[0, freq_max],
        showgrid=False,
        showline=False,   # Hide the line for yaxis
        ticks='',         # Remove ticks
        visible=False
    ),
    legend_title='Group',
    width=800,
    height=500
)

# Add the vertical lines and text annotations at selected cumulative shares
for perc in [0.25, 0.50, 0.75, 0.99]:
    bin_line = _last_bin_within_share(perc)
    fig.add_shape(
        type="line",
        x0=bin_line,
        x1=bin_line,
        y0=0,
        y1=1,
        xref="x",
        yref="paper",  # Extend line across the y-axis range
        line=dict(color="grey", dash="dash", width=1)
    )

    # Add text annotation near the bottom of the plot
    fig.add_trace(go.Scatter(
        x=[bin_line],
        y=[recprice_max*0.1],
        text=[f'{perc:.0%}'],
        mode="text",
        showlegend=False
    ))

# Show the figure
fig.show()


In [32]:
# Metrics split by surge segment: rows with original_dynamic_surge_updated <= 1.0
# are labelled 'NS', all others 'S' (presumably non-surge / surge — confirm).
#
# .assign() builds new frames, so df_recprice_prepared / df_orders_prepared are
# left untouched. The previous plain assignment only created an alias, and the
# new columns were written into the shared upstream frame.
df_recprice_prepared_1gr = df_recprice_prepared.assign(
    surge_group=lambda d: np.where(d['original_dynamic_surge_updated'] <= 1.0, 'NS', 'S'),
    axis_surge_group=lambda d: d['surge_group'],
)

# Orders get recprice and the surge value from their calcprice via a left join.
# NOTE(review): orders without a matching calcprice have a NaN surge value;
# `NaN <= 1.0` is False, so they are labelled 'S' — confirm this is intended.
df_orders_prepared_1gr = (
    df_orders_prepared
    .merge(
        df_recprice_prepared_1gr[['calcprice_uuid', 'recprice', 'original_dynamic_surge_updated']],
        on=['calcprice_uuid'],
        how='left',
    )
    .assign(
        surge_group=lambda d: np.where(d['original_dynamic_surge_updated'] <= 1.0, 'NS', 'S'),
        axis_surge_group=lambda d: d['surge_group'],
    )
)

df_full_1gr = get_full_df(df_orders_prepared_1gr, df_recprice_prepared_1gr)
df_full_1gr['group_name'] = df_full_1gr['recprice_group_name']

results = []

# One switchback comparison per segment present in the joined frame.
for axis_group in df_full_1gr['axis_surge_group'].unique():
    df_metrics_total = calculate_metrics(
        df_recprice_prepared_1gr[(df_recprice_prepared_1gr['axis_surge_group'] == axis_group)],
        df_orders_prepared_1gr[(df_orders_prepared_1gr['axis_surge_group'] == axis_group)],
        df_full_1gr[(df_full_1gr['axis_surge_group'] == axis_group)],
        group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
    )

    # .assign() tags the table with its segment without writing into a
    # column-subset of another frame.
    metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05)[
        ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
    ].assign(axis_surge_group=axis_group)

    results.append(metrics_total_tbl)

df_results = pd.concat(results, ignore_index=True)
df_results[df_results['metric'].isin(['cp2done', 'order2done', 'price_done_usd'])]

только уникальные ордера? – True
доля оставшихся ордеров: 0.9768


Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
4,order2done,0.517234,0.512341,-0.00946,0.375557,False,NS
12,cp2done,0.357391,0.356389,-0.002803,0.778847,False,NS
21,price_done_usd,3.892126,3.911224,0.004907,0.320577,False,NS
37,order2done,0.355388,0.373538,0.05107,0.007515,True,S
45,cp2done,0.251507,0.263022,0.045784,0.029523,True,S
54,price_done_usd,4.608431,4.560853,-0.010324,0.11956,False,S


In [33]:
# Metrics split by two axes:
#   * price side: recprice below the intersection price -> 'L', otherwise 'R';
#   * surge: original_dynamic_surge_updated <= 1.0 -> 'NS', otherwise 'S'.
# The segment label is '<price side>_<surge>' (e.g. 'L_NS').
axis_price = y_intersect
if axis_price is None:
    # Without this check np.where(recprice < None, ...) fails with an obscure TypeError.
    raise ValueError('No intersection price available (y_intersect is None); cannot split by price.')

# .assign() builds new frames, so df_recprice_prepared / df_orders_prepared are
# left untouched. The previous plain assignment only created an alias, and the
# new columns were written into the shared upstream frame.
df_recprice_prepared_gr2 = df_recprice_prepared.assign(
    axis_group=lambda d: np.where(d['recprice'] < axis_price, 'L', 'R'),
    surge_group=lambda d: np.where(d['original_dynamic_surge_updated'] <= 1.0, 'NS', 'S'),
    axis_surge_group=lambda d: d['axis_group'] + '_' + d['surge_group'],
)

# Orders get recprice and the surge value from their calcprice via a left join.
# NOTE(review): orders without a matching calcprice have NaN in both columns;
# comparisons with NaN are False, so they are labelled 'R_S' — confirm this is intended.
df_orders_prepared_gr2 = (
    df_orders_prepared
    .merge(
        df_recprice_prepared_gr2[['calcprice_uuid', 'recprice', 'original_dynamic_surge_updated']],
        on=['calcprice_uuid'],
        how='left',
    )
    .assign(
        axis_group=lambda d: np.where(d['recprice'] < axis_price, 'L', 'R'),
        surge_group=lambda d: np.where(d['original_dynamic_surge_updated'] <= 1.0, 'NS', 'S'),
        axis_surge_group=lambda d: d['axis_group'] + '_' + d['surge_group'],
    )
)

df_full_gr2 = get_full_df(df_orders_prepared_gr2, df_recprice_prepared_gr2)
df_full_gr2['group_name'] = df_full_gr2['recprice_group_name']


results = []

# One switchback comparison per segment present in the joined frame.
for axis_group in df_full_gr2['axis_surge_group'].unique():
    df_metrics_total = calculate_metrics(
        df_recprice_prepared_gr2[(df_recprice_prepared_gr2['axis_surge_group'] == axis_group)],
        df_orders_prepared_gr2[(df_orders_prepared_gr2['axis_surge_group'] == axis_group)],
        df_full_gr2[(df_full_gr2['axis_surge_group'] == axis_group)],
        group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
    )

    # .assign() tags the table with its segment without writing into a
    # column-subset of another frame.
    metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05)[
        ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
    ].assign(axis_surge_group=axis_group)

    results.append(metrics_total_tbl)

df_results = pd.concat(results, ignore_index=True)

df_results[df_results['metric'].isin(['cp2done', 'order2done', 'price_done_usd'])]

только уникальные ордера? – True
доля оставшихся ордеров: 0.9768


Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
4,order2done,0.47015,0.466414,-0.007947,0.4919285,False,R_NS
12,cp2done,0.317568,0.318157,0.001853,0.8686822,False,R_NS
21,price_done_usd,5.087486,5.107432,0.003921,0.329615,False,R_NS
37,order2done,0.330755,0.350995,0.061196,0.001683657,True,R_S
45,cp2done,0.234601,0.247753,0.056058,0.01068063,True,R_S
54,price_done_usd,5.185062,5.05775,-0.024554,2.341887e-07,True,R_S
70,order2done,0.486312,0.507696,0.043972,0.0001472074,True,L_S
78,cp2done,0.340092,0.352372,0.036108,0.0009388288,True,L_S
87,price_done_usd,2.523979,2.516479,-0.002972,0.2323617,False,L_S
103,order2done,0.585679,0.579813,-0.010016,0.3138662,False,L_NS


In [34]:
# Distinct metric names present in the results table, in order of first appearance.
pd.unique(df_results['metric'])

array(['cp2order', 'order2bid', 'order2start_price_bid', 'order2accept',
       'order2done', 'bid2accept', 'bid2done', 'start_price_bid2accept',
       'start_price_bid2done', 'cp2bid', 'cp2start_price_bid',
       'cp2accept', 'cp2done', 'minprice_usd', 'price_base_usd',
       'recprice_usd', 'price_highrate_usd', 'rides_price_highrate_usd',
       'price_start_usd', 'rides_price_start_usd', 'price_tender_usd',
       'price_done_usd', 'done2rec', 'surge', 'dynamic_surge',
       'original_dynamic_surge_updated', 'good_rate', 'balance',
       'orders_by_minprice_share', 'cp2order_by_orders_by_minprice',
       'order2done_by_orders_by_minprice',
       'cp2done_by_orders_by_minprice', 'orders_by_recprice_share'],
      dtype=object)

In [35]:
# cp2done for every segment.
df_results.loc[df_results['metric'].eq('cp2done')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
12,cp2done,0.317568,0.318157,0.001853,0.868682,False,R_NS
45,cp2done,0.234601,0.247753,0.056058,0.010681,True,R_S
78,cp2done,0.340092,0.352372,0.036108,0.000939,True,L_S
111,cp2done,0.418653,0.415377,-0.007824,0.389176,False,L_NS


In [36]:
# cp2order for every segment.
df_results.loc[df_results['metric'].eq('cp2order')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
0,cp2order,0.675461,0.682134,0.009879,0.022044,True,R_NS
33,cp2order,0.709292,0.705858,-0.004841,0.300938,False,R_S
66,cp2order,0.69933,0.694062,-0.007532,0.212225,False,L_S
99,cp2order,0.714817,0.716399,0.002214,0.598923,False,L_NS


In [37]:
# order2done for every segment.
df_results.loc[df_results['metric'].eq('order2done')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
4,order2done,0.47015,0.466414,-0.007947,0.491928,False,R_NS
37,order2done,0.330755,0.350995,0.061196,0.001684,True,R_S
70,order2done,0.486312,0.507696,0.043972,0.000147,True,L_S
103,order2done,0.585679,0.579813,-0.010016,0.313866,False,L_NS


In [38]:
# orders_by_minprice_share for every segment.
df_results.loc[df_results['metric'].eq('orders_by_minprice_share')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
28,orders_by_minprice_share,0.100311,0.102071,0.017541,0.325992,False,R_NS
61,orders_by_minprice_share,0.057538,0.058304,0.013297,0.496692,False,R_S
94,orders_by_minprice_share,0.076109,0.07906,0.038772,0.158786,False,L_S
127,orders_by_minprice_share,0.107887,0.111065,0.029452,0.092523,False,L_NS


In [39]:
# orders_by_recprice_share for every segment.
df_results.loc[df_results['metric'].eq('orders_by_recprice_share')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
32,orders_by_recprice_share,0.680425,0.680969,0.000799,0.842363,False,R_NS
65,orders_by_recprice_share,0.367139,0.379718,0.034263,0.058112,False,R_S
98,orders_by_recprice_share,0.615604,0.594861,-0.033696,9.3e-05,True,L_S
131,orders_by_recprice_share,0.794954,0.79114,-0.004797,0.100959,False,L_NS


In [40]:
# price_done_usd for every segment.
df_results.loc[df_results['metric'].eq('price_done_usd')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
21,price_done_usd,5.087486,5.107432,0.003921,0.329615,False,R_NS
54,price_done_usd,5.185062,5.05775,-0.024554,2.341887e-07,True,R_S
87,price_done_usd,2.523979,2.516479,-0.002972,0.2323617,False,L_S
120,price_done_usd,2.497234,2.497583,0.00014,0.9599346,False,L_NS
