In [21]:
# Load IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the project's `src.*` helpers are picked up live.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including pandas warnings about assignments on slices — consider narrowing it.
warnings.filterwarnings('ignore')

import pathlib
import pandas as pd
import numpy as np
from datetime import timedelta

import plotly.graph_objects as go
import pandas as pd



from src.download import download_experiment_data, download_recprice_data, download_order_data
from src.prepare import prepare_recprice_data, prepare_order_data, get_full_df, prepare_my
from src.metrics import calculate_metrics, get_switchback_results, get_switchback_results_new

from src.draw import draw_lines

In [23]:
def binned_percentile(df, x, bin_start='bin_start', frequency='frequency'):
    """Estimate the value at quantile ``x`` of a binned (histogram-like) table.

    Rows of ``df`` are used in their existing order. The first row whose
    running frequency total reaches ``x * total`` is located, and the result
    is linearly interpolated between the previous row's ``bin_start`` value
    and that row's ``bin_start`` value. When the located row is the first
    one, both the previous running total and the previous bin start are
    taken as 0.

    Args:
        df: Table with one row per bin; it is not modified.
        x: Quantile expressed as a fraction of the total frequency (e.g. 0.5).
        bin_start: Name of the column holding each bin's position.
        frequency: Name of the column holding each bin's count or weight.

    Returns:
        The interpolated value at quantile ``x``.

    Raises:
        IndexError: If no row's running total reaches ``x * total`` (for
            example when ``df`` is empty).
    """
    table = df.reset_index(drop=True)
    table['cumulative_frequency'] = table[frequency].cumsum()

    target = x * table[frequency].sum()

    # Rows at or past the target; the first of them contains the quantile.
    reached = table['cumulative_frequency'] >= target
    pos = table.index[reached][0]
    hit_row = table.iloc[pos]

    if pos > 0:
        prev_row = table.iloc[pos - 1]
        cum_before = prev_row['cumulative_frequency']
        lower_edge = prev_row[bin_start]
    else:
        cum_before = 0
        lower_edge = 0

    # Fraction of the located bin's frequency needed to reach the target.
    share_of_bin = (target - cum_before) / hit_row[frequency]
    span = hit_row[bin_start] - lower_edge
    return lower_edge + share_of_bin * span

In [24]:
# --- Experiment configuration ---
# Login passed as `user_name` to the `src.download` helpers.
USER_NAME = 'nusuev'

# Identifier of the experiment analysed in this notebook.
EXP_ID = 2104
# Days subtracted from the experiment start date to get BEFORE_START_DATE.
DAYS_BEFORE = 0

# Local folder for this experiment's artefacts. `mkdir(..., exist_ok=True)` is
# already a no-op for an existing directory, so no separate existence check is
# needed; a stray regular *file* at this path now fails loudly here instead of
# being silently accepted and breaking later writes.
DATA_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}')
DATA_ROOT_PATH.mkdir(parents=True, exist_ok=True)

In [25]:
# Fetch the experiment data for EXP_ID.
df_exp = download_experiment_data(user_name=USER_NAME, exp_id=EXP_ID)

# Hour of day at which each switch starts, stored as a categorical column.
df_exp['hour'] = df_exp['switch_start_dttm'].dt.hour.astype('category')

# Keep a local parquet snapshot next to the experiment's other artefacts.
df_exp.to_parquet(DATA_ROOT_PATH / 'df_exp.pqt')

In [26]:
# Every constant below is read from the first row of `df_exp`.
_exp_start_day = df_exp['utc_start_dttm'].dt.date.iloc[0]
_exp_stop_day = df_exp['utc_finish_dttm'].dt.date.iloc[0]

# Dates are kept as strings.
EXP_START_DATE = str(_exp_start_day)
EXP_STOP_DATE = str(_exp_stop_day)
# Start of the download window: experiment start minus DAYS_BEFORE days.
BEFORE_START_DATE = str(_exp_start_day - timedelta(days=DAYS_BEFORE))

CITY_ID = df_exp['city_id'].iloc[0]
ORDER_TYPE = df_exp['order_type'].iloc[0]
EXP_NAME = df_exp['exp_name'].iloc[0]

# Echo the resolved parameters so the run is self-describing.
print(
    f'''
    exp_start_date: {EXP_START_DATE}
    exp_stop_date: {EXP_STOP_DATE}
    before_start_date: {BEFORE_START_DATE}
    city_id: {CITY_ID}
    order_type: {ORDER_TYPE}
    exp_name: {EXP_NAME}
    '''
)


    exp_start_date: 2024-11-12
    exp_stop_date: 2024-12-04
    before_start_date: 2024-11-12
    city_id: 4231
    order_type: auto_econom
    exp_name: MEP-1945: San Luis Potosi 2comp surge
    


In [27]:
# Short experiment label used in figure titles: the part of EXP_NAME after the
# first ':' and before the '2comp surge' suffix, with surrounding whitespace
# removed (e.g. 'MEP-1945: San Luis Potosi 2comp surge' -> 'San Luis Potosi').
#
# `str.partition` replaces the previous `str.find` index arithmetic: `find`
# returns -1 for a missing marker, which silently chopped the first or last
# character off the name when the ':' prefix or the suffix was absent, and the
# fixed "+ 2" offset dropped a real character when no space followed the colon.
_prefix, _colon, _after_colon = EXP_NAME.partition(':')
_label = _after_colon if _colon else EXP_NAME
JPEG_NAME = _label.partition('2comp surge')[0].strip()

In [28]:
# Download recprice data for the experiment's city and order type, from
# BEFORE_START_DATE through EXP_STOP_DATE.
_recprice_query = dict(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    order_type=ORDER_TYPE,
    user_name=USER_NAME,
)
df_recprice = download_recprice_data(**_recprice_query)

In [29]:
# Download order data for the same city, order type and date window as the
# recprice download.
_orders_query = dict(
    start_date=BEFORE_START_DATE,
    stop_date=EXP_STOP_DATE,
    city_id=CITY_ID,
    order_type=ORDER_TYPE,
    user_name=USER_NAME,
)
df_orders = download_order_data(**_orders_query)

In [30]:
# Prepare the raw downloads and join them into a single frame.
df_recprice_prepared = prepare_recprice_data(df_recprice)
df_orders_prepared = prepare_order_data(df_orders)
df_full = get_full_df(df_orders_prepared, df_recprice_prepared)
# The group label of every joined row is taken from the recprice-side column.
df_full['group_name'] = df_full['recprice_group_name']

# Metrics for the whole experiment, grouped by group and switch window.
df_metrics_total = calculate_metrics(
    df_recprice_prepared,
    df_orders_prepared,
    df_full,
    group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
)

# Switchback comparison at alpha = 0.05, reduced to the reporting columns.
_switchback_tbl = get_switchback_results(df_metrics_total, alpha=0.05)
metrics_total_tbl = _switchback_tbl[
    ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
]

# One-element result list; concatenating renumbers the index from 0.
results = [metrics_total_tbl]
df_results = pd.concat(results, ignore_index=True)

# Display only the headline metrics.
_headline_rows = df_results['metric'].isin(['cp2done', 'order2done', 'price_done_usd'])
df_results[_headline_rows]

только уникальные ордера? – True
доля оставшихся ордеров: 0.9766


Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant
4,order2done,0.509698,0.51805,0.016385,0.104776,False
12,cp2done,0.336738,0.339484,0.008156,0.449541,False
21,price_done_usd,3.725504,3.715907,-0.002576,0.66232,False


In [31]:
# Recprice-vs-distance chart. Draws the mean recprice per distance bin for each
# group, the share of rides per bin on a hidden secondary axis, the first point
# where the 'A' and 'Control' curves cross, and vertical markers at selected
# cumulative-share levels.
# Leaves `x_intersect` / `y_intersect` set to the crossing point, or to None
# when the curves never cross.
bin_size = 300  # width of a distance bin; the x axis is titled 'Bin, meters'
df_full['orders_distance_bin'] = (df_full['distance'] // bin_size) * bin_size

# Prepare data
grouped = df_full.groupby(['orders_distance_bin', 'group_name'])['recprice'].mean().reset_index()
frequency = df_full['orders_distance_bin'].value_counts(normalize=True).sort_index()
cumulative_frequency = frequency.cumsum()

# Parameters: axis limits are taken at the last bin within 99% of the rides.
max_bin = cumulative_frequency[cumulative_frequency <= 0.99].index[-1]
recprice_max = grouped[grouped['orders_distance_bin'] == max_bin]['recprice'].max()
freq_max = frequency.max()

# Separate data for 'A' and 'Control'
data_a = grouped[grouped['group_name'] == 'A']
data_control = grouped[grouped['group_name'] == 'Control']

# Merge the data on 'orders_distance_bin' for comparison
merged = pd.merge(data_a, data_control, on='orders_distance_bin', suffixes=('_a', '_control'))

# Find where the A-minus-Control difference changes sign between neighbouring bins
merged['diff'] = merged['recprice_a'] - merged['recprice_control']
sign_change = (merged['diff'].shift() * merged['diff']) < 0  # Detect sign change
intersection_points = merged[sign_change]

# If there's an intersection
if not intersection_points.empty:
    # Linear interpolation between the two bins around the first sign change
    idx = intersection_points.index[0]
    x1, y1_a, y1_control = merged.loc[idx - 1, ['orders_distance_bin', 'recprice_a', 'recprice_control']]
    x2, y2_a, y2_control = merged.loc[idx, ['orders_distance_bin', 'recprice_a', 'recprice_control']]
    slope_a = (y2_a - y1_a) / (x2 - x1)
    slope_control = (y2_control - y1_control) / (x2 - x1)
    intercept_a = y1_a - slope_a * x1
    intercept_control = y1_control - slope_control * x1
    x_intersect = (intercept_control - intercept_a) / (slope_a - slope_control)
    y_intersect = slope_a * x_intersect + intercept_a
else:
    x_intersect, y_intersect = None, None

# Alias of x_intersect; None when the curves never cross.
X = x_intersect

# Create the figure
fig = go.Figure()

# Add a line for each group
for group_name in grouped[~grouped['group_name'].isin(['Before'])]['group_name'].unique():
    group_data = grouped[grouped['group_name'] == group_name]
    fig.add_trace(go.Scatter(
        x=group_data['orders_distance_bin'],
        y=group_data['recprice'],
        mode='lines',
        name=f'{group_name}',
        line=dict(width=2)
    ))

# Add the normalized frequency as a line without showing in the legend
fig.add_trace(go.Scatter(
    x=frequency.index,
    y=frequency.values,
    mode='lines',
    name='Normalized Frequency',  # Not shown in legend
    line=dict(color='gray', dash='dot', width=2),
    yaxis='y2',
    showlegend=False  # Hide from legend
))

# Everything that depends on the crossing point is drawn only when one exists.
# Previously the shaded area was built unconditionally, so a run without a
# crossing failed with a TypeError on `frequency.index <= None`.
if x_intersect is not None:
    # Shade the share of rides at or below the crossing distance
    filtered_frequency = frequency[frequency.index <= X]
    cum_freq_value = filtered_frequency.sum()  # Cumulative share inside the shaded area
    fig.add_trace(go.Scatter(
        x=list(filtered_frequency.index) + [X],
        y=list(filtered_frequency.values) + [0],
        fill='tozeroy',
        mode='lines',
        line=dict(color='rgba(255, 127, 80, 0.3)', width=0),
        name=f'{cum_freq_value:.0%} of rides',
        yaxis='y2',
        showlegend=True
    ))

    # Highlight the intersection point
    fig.add_trace(go.Scatter(
        x=[x_intersect],
        y=[y_intersect],
        mode='markers+text',
        marker=dict(color='red', size=10),
        text=[f'{x_intersect:.0f} m <br>{y_intersect:.0f} cur'],
        textposition='top right',
        showlegend=False
    ))

# Update layout for dual axes
fig.update_layout(
    title=JPEG_NAME+': Recprice vs Orders Distance',
    xaxis=dict(
        title='Bin, meters',
        range=[0, max_bin]),
    yaxis=dict(
        title='Recprice, currency',
        range=[0, recprice_max]),
    yaxis2=dict(
        title='Normalized Frequency',
        overlaying='y',
        side='right',
        range=[0, freq_max],
        showgrid=False,
        showline=False,   # Hide the line for yaxis
        ticks='',         # Remove ticks
        visible=False
    ),
    legend_title='Group',
    width=800,
    height=500
)

# Add the vertical lines and text annotations
for perc in [0.25, 0.50, 0.75, 0.99]:
    bins_within = cumulative_frequency[cumulative_frequency <= perc]
    if bins_within.empty:
        # The first bin alone already exceeds this share: nothing to mark
        # (indexing [-1] on the empty selection would raise IndexError).
        continue
    bin_line = bins_within.index[-1]
    fig.add_shape(
        type="line",
        x0=bin_line,
        x1=bin_line,
        y0=0,
        y1=1,
        xref="x",
        yref="paper",  # Extend line across the y-axis range
        line=dict(color="grey", dash="dash", width=1)
    )

    # Add text annotation near the bottom of the price axis
    fig.add_trace(go.Scatter(
        x=[bin_line],
        y=[recprice_max*0.1],
        text=[f'{perc:.0%}'],
        mode="text",
        showlegend=False
    ))

# Show the figure
fig.show()


In [32]:
# Metrics split by surge state: rows with original_dynamic_surge_updated <= 1.0
# are labelled 'NS', all other rows 'S'.
#
# `.assign` returns new frames, so the shared `df_recprice_prepared` frame is no
# longer modified from this cell (the previous `x_1gr = x` assignment was only
# an alias, so the columns were added to the shared frame in place).
df_recprice_prepared_1gr = df_recprice_prepared.assign(
    surge_group=lambda d: np.where(d['original_dynamic_surge_updated'] <= 1.0, 'NS', 'S'),
    axis_surge_group=lambda d: d['surge_group'],
)

# Bring recprice and surge onto the orders via calcprice_uuid, then label them.
# NOTE(review): orders with no matching calcprice_uuid get NaN surge after the
# left merge and therefore fall into 'S' (NaN <= 1.0 is False) — confirm intended.
df_orders_prepared_1gr = (
    df_orders_prepared
    .merge(
        df_recprice_prepared_1gr[['calcprice_uuid', 'recprice', 'original_dynamic_surge_updated']],
        on=['calcprice_uuid'],
        how='left',
    )
    .assign(
        surge_group=lambda d: np.where(d['original_dynamic_surge_updated'] <= 1.0, 'NS', 'S'),
        axis_surge_group=lambda d: d['surge_group'],
    )
)

df_full_1gr = get_full_df(df_orders_prepared_1gr, df_recprice_prepared_1gr)
df_full_1gr['group_name'] = df_full_1gr['recprice_group_name']

results = []

# One switchback comparison per surge segment, tagged with the segment label.
for axis_group in df_full_1gr['axis_surge_group'].unique():
    df_metrics_total = calculate_metrics(
        df_recprice_prepared_1gr[(df_recprice_prepared_1gr['axis_surge_group'] == axis_group)],
        df_orders_prepared_1gr[(df_orders_prepared_1gr['axis_surge_group'] == axis_group)],
        df_full_1gr[(df_full_1gr['axis_surge_group'] == axis_group)],
        group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
    )

    # `.assign` adds the label on a fresh frame instead of writing into a
    # column-sliced frame.
    metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05)[
        ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
    ].assign(axis_surge_group=axis_group)

    results.append(metrics_total_tbl)

df_results = pd.concat(results, ignore_index=True)
df_results[df_results['metric'].isin(['cp2done', 'order2done', 'price_done_usd'])]

только уникальные ордера? – True
доля оставшихся ордеров: 0.9766


Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
4,order2done,0.48932,0.497543,0.016805,0.105745,False,S
12,cp2done,0.323026,0.325555,0.007828,0.492868,False,S
21,price_done_usd,3.820315,3.818092,-0.000582,0.914811,False,S
37,order2done,0.562,0.568008,0.010691,0.337912,False,NS
45,cp2done,0.371994,0.373574,0.004246,0.687761,False,NS
54,price_done_usd,3.51378,3.497806,-0.004546,0.545022,False,NS


In [33]:
# Metrics split into four segments: price side ('L' when recprice is below the
# crossing price, otherwise 'R') combined with surge state ('NS' when
# original_dynamic_surge_updated <= 1.0, otherwise 'S'), e.g. 'L_S'.
axis_price = y_intersect
if axis_price is None:
    # Without a crossing point there is no threshold; fail with a clear message
    # instead of a TypeError from comparing a column with None.
    raise ValueError(
        "y_intersect is None: the 'A' and 'Control' recprice curves do not cross, "
        "so there is no price to split the L/R segments on."
    )

# `.assign` returns new frames, so the shared `df_recprice_prepared` frame is no
# longer modified from this cell (the previous `x_gr2 = x` assignment was only
# an alias, so the columns were added to the shared frame in place).
df_recprice_prepared_gr2 = df_recprice_prepared.assign(
    axis_group=lambda d: np.where(d['recprice'] < axis_price, 'L', 'R'),
    surge_group=lambda d: np.where(d['original_dynamic_surge_updated'] <= 1.0, 'NS', 'S'),
    axis_surge_group=lambda d: d['axis_group'] + '_' + d['surge_group'],
)

# Bring recprice and surge onto the orders via calcprice_uuid, then label them.
# NOTE(review): orders with no matching calcprice_uuid get NaN recprice/surge
# after the left merge and therefore fall into 'R_S' (both comparisons are
# False for NaN) — confirm intended.
df_orders_prepared_gr2 = (
    df_orders_prepared
    .merge(
        df_recprice_prepared_gr2[['calcprice_uuid', 'recprice', 'original_dynamic_surge_updated']],
        on=['calcprice_uuid'],
        how='left',
    )
    .assign(
        axis_group=lambda d: np.where(d['recprice'] < axis_price, 'L', 'R'),
        surge_group=lambda d: np.where(d['original_dynamic_surge_updated'] <= 1.0, 'NS', 'S'),
        axis_surge_group=lambda d: d['axis_group'] + '_' + d['surge_group'],
    )
)

df_full_gr2 = get_full_df(df_orders_prepared_gr2, df_recprice_prepared_gr2)
df_full_gr2['group_name'] = df_full_gr2['recprice_group_name']

results = []

# One switchback comparison per segment, tagged with the segment label.
for axis_group in df_full_gr2['axis_surge_group'].unique():
    df_metrics_total = calculate_metrics(
        df_recprice_prepared_gr2[(df_recprice_prepared_gr2['axis_surge_group'] == axis_group)],
        df_orders_prepared_gr2[(df_orders_prepared_gr2['axis_surge_group'] == axis_group)],
        df_full_gr2[(df_full_gr2['axis_surge_group'] == axis_group)],
        group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
    )

    # `.assign` adds the label on a fresh frame instead of writing into a
    # column-sliced frame.
    metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05)[
        ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
    ].assign(axis_surge_group=axis_group)

    results.append(metrics_total_tbl)

df_results = pd.concat(results, ignore_index=True)

df_results[df_results['metric'].isin(['cp2done', 'order2done', 'price_done_usd'])]

только уникальные ордера? – True
доля оставшихся ордеров: 0.9766


Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
4,order2done,0.489455,0.49058,0.002299,0.850401,False,R_S
12,cp2done,0.315205,0.316337,0.003591,0.795727,False,R_S
21,price_done_usd,4.756448,4.692636,-0.013416,0.000194,True,R_S
37,order2done,0.489138,0.507631,0.037807,0.000128,True,L_S
45,cp2done,0.333977,0.3394,0.016236,0.079009,False,L_S
54,price_done_usd,2.581943,2.593152,0.004342,0.089745,False,L_S
70,order2done,0.58559,0.590964,0.009178,0.401097,False,L_NS
78,cp2done,0.400242,0.400562,0.000798,0.936423,False,L_NS
87,price_done_usd,2.567872,2.561756,-0.002382,0.568984,False,L_NS
103,order2done,0.528732,0.535435,0.012677,0.306616,False,R_NS


In [34]:
# Distinct metric names available in the current results table.
df_results.loc[:, 'metric'].unique()

array(['cp2order', 'order2bid', 'order2start_price_bid', 'order2accept',
       'order2done', 'bid2accept', 'bid2done', 'start_price_bid2accept',
       'start_price_bid2done', 'cp2bid', 'cp2start_price_bid',
       'cp2accept', 'cp2done', 'minprice_usd', 'price_base_usd',
       'recprice_usd', 'price_highrate_usd', 'rides_price_highrate_usd',
       'price_start_usd', 'rides_price_start_usd', 'price_tender_usd',
       'price_done_usd', 'done2rec', 'surge', 'dynamic_surge',
       'original_dynamic_surge_updated', 'good_rate', 'balance',
       'orders_by_minprice_share', 'cp2order_by_orders_by_minprice',
       'order2done_by_orders_by_minprice',
       'cp2done_by_orders_by_minprice', 'orders_by_recprice_share'],
      dtype=object)

In [35]:
# Results rows for the `cp2done` metric, one per segment.
df_results.loc[df_results['metric'].eq('cp2done')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
12,cp2done,0.315205,0.316337,0.003591,0.795727,False,R_S
45,cp2done,0.333977,0.3394,0.016236,0.079009,False,L_S
78,cp2done,0.400242,0.400562,0.000798,0.936423,False,L_NS
111,cp2done,0.335058,0.337917,0.008532,0.476594,False,R_NS


In [36]:
# Results rows for the `cp2order` metric, one per segment.
df_results.loc[df_results['metric'].eq('cp2order')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
0,cp2order,0.643993,0.644823,0.001289,0.7831749,False,R_S
33,cp2order,0.682786,0.668595,-0.020785,1.610177e-07,True,L_S
66,cp2order,0.683486,0.677811,-0.008303,0.1065184,False,L_NS
99,cp2order,0.633702,0.631108,-0.004093,0.5003588,False,R_NS


In [37]:
# Results rows for the `order2done` metric, one per segment.
df_results.loc[df_results['metric'].eq('order2done')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
4,order2done,0.489455,0.49058,0.002299,0.850401,False,R_S
37,order2done,0.489138,0.507631,0.037807,0.000128,True,L_S
70,order2done,0.58559,0.590964,0.009178,0.401097,False,L_NS
103,order2done,0.528732,0.535435,0.012677,0.306616,False,R_NS


In [38]:
# Results rows for the `orders_by_minprice_share` metric, one per segment.
df_results.loc[df_results['metric'].eq('orders_by_minprice_share')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
28,orders_by_minprice_share,0.043609,0.043298,-0.007134,0.792342,False,R_S
61,orders_by_minprice_share,0.042362,0.038298,-0.09595,0.000152,True,L_S
94,orders_by_minprice_share,0.077391,0.080932,0.045758,0.0638,False,L_NS
127,orders_by_minprice_share,0.077023,0.07572,-0.016917,0.528992,False,R_NS


In [39]:
# Results rows for the `orders_by_recprice_share` metric, one per segment.
df_results.loc[df_results['metric'].eq('orders_by_recprice_share')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
32,orders_by_recprice_share,0.462543,0.488902,0.056986,7.227287e-06,True,R_S
65,orders_by_recprice_share,0.747248,0.710784,-0.048797,1.071823e-16,True,L_S
98,orders_by_recprice_share,0.880173,0.878243,-0.002193,0.3487392,False,L_NS
131,orders_by_recprice_share,0.74619,0.744288,-0.002549,0.5915368,False,R_NS


In [40]:
# Results rows for the `price_done_usd` metric, one per segment.
df_results.loc[df_results['metric'].eq('price_done_usd')]

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,axis_surge_group
21,price_done_usd,4.756448,4.692636,-0.013416,0.000194,True,R_S
54,price_done_usd,2.581943,2.593152,0.004342,0.089745,False,L_S
87,price_done_usd,2.567872,2.561756,-0.002382,0.568984,False,L_NS
120,price_done_usd,4.991229,4.963779,-0.0055,0.314916,False,R_NS
