In [47]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (here: the local `src` package) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
import warnings
warnings.filterwarnings("ignore")

import pathlib
import pandas as pd
import numpy as np
from datetime import timedelta

import plotly.express as px


from src.download import download_experiment_data, download_recprice_data, download_order_data
from src.prepare import prepare_recprice_data, prepare_order_data, get_full_df, prepare_my
from src.metrics import calculate_metrics, get_switchback_results

from src.draw import draw_lines


## Switchback Results (Example)

### Parameters

In [49]:
# Login forwarded to every download_* helper as `user_name`.
USER_NAME = 'nusuev'

# Identifier of the switchback experiment analysed in this notebook.
EXP_ID = 2103
# Days subtracted from the experiment start date to obtain the start of the
# download window (0 = start exactly at the experiment start date).
DAYS_BEFORE = 0

# Directory that receives every parquet snapshot written by this notebook.
# mkdir(parents=True, exist_ok=True) is already a no-op for an existing
# directory, so a separate exists() check is unnecessary.
DATA_ROOT_PATH = pathlib.Path(f'data/exp_id={EXP_ID}')
DATA_ROOT_PATH.mkdir(parents=True, exist_ok=True)

### Data

__Download__

In [50]:
# Download the experiment frame for EXP_ID.
df_exp = download_experiment_data(user_name=USER_NAME, exp_id=EXP_ID)

# Hour of day of each switch start, stored as a categorical column.
df_exp['hour'] = df_exp['switch_start_dttm'].dt.hour.astype('category')

# Snapshot the frame alongside the other artefacts of this experiment.
df_exp.to_parquet(DATA_ROOT_PATH.joinpath('df_exp.pqt'))

In [51]:
# Distinct switch start timestamps present in the experiment frame.
df_exp['switch_start_dttm'].unique()

<DatetimeArray>
['2024-11-12 15:40:00+00:00', '2024-11-12 16:00:00+00:00',
 '2024-11-12 16:20:00+00:00', '2024-11-12 16:40:00+00:00',
 '2024-11-12 17:00:00+00:00', '2024-11-12 17:20:00+00:00',
 '2024-11-12 17:40:00+00:00', '2024-11-12 18:00:00+00:00',
 '2024-11-12 18:20:00+00:00', '2024-11-12 18:40:00+00:00',
 ...
 '2024-12-03 20:40:00+00:00', '2024-12-03 21:00:00+00:00',
 '2024-12-03 21:20:00+00:00', '2024-12-03 21:40:00+00:00',
 '2024-12-03 22:00:00+00:00', '2024-12-03 22:20:00+00:00',
 '2024-12-03 22:40:00+00:00', '2024-12-03 23:00:00+00:00',
 '2024-12-03 23:20:00+00:00', '2024-12-03 23:40:00+00:00']
Length: 1537, dtype: datetime64[us, UTC]

In [52]:
# Experiment attributes are read from the first row of df_exp
# (assumes they are identical on every row — TODO confirm).
# The scalar is extracted first, so only one timestamp is converted instead of
# turning whole columns into dates/strings just to keep row 0.
exp_start_dttm = df_exp['utc_start_dttm'].iloc[0]
exp_finish_dttm = df_exp['utc_finish_dttm'].iloc[0]

# Dates as 'YYYY-MM-DD' strings; the download window starts DAYS_BEFORE days
# before the experiment start.
EXP_START_DATE = str(exp_start_dttm.date())
EXP_STOP_DATE = str(exp_finish_dttm.date())
BEFORE_START_DATE = str(exp_start_dttm.date() - timedelta(days=DAYS_BEFORE))
CITY_ID = df_exp['city_id'].iloc[0]
ORDER_TYPE = df_exp['order_type'].iloc[0]
EXP_NAME = df_exp['exp_name'].iloc[0]

print(
    f"""
    exp_start_date: {EXP_START_DATE}
    exp_stop_date: {EXP_STOP_DATE}
    before_start_date: {BEFORE_START_DATE}
    city_id: {CITY_ID}
    order_type: {ORDER_TYPE}
    exp_name: {EXP_NAME}
    """
)


    exp_start_date: 2024-11-12
    exp_stop_date: 2024-12-04
    before_start_date: 2024-11-12
    city_id: 4227
    order_type: auto_econom
    exp_name: MEP-1945: Ciudad 2comp surge
    


In [53]:
# Download recprice data for the window BEFORE_START_DATE..EXP_STOP_DATE,
# restricted to the experiment's city and order type, and snapshot it.
recprice_request = {
    'start_date': BEFORE_START_DATE,
    'stop_date': EXP_STOP_DATE,
    'city_id': CITY_ID,
    'order_type': ORDER_TYPE,
    'user_name': USER_NAME,
}
df_recprice = download_recprice_data(**recprice_request)
df_recprice.to_parquet(DATA_ROOT_PATH.joinpath('df_recprice.pqt'))

In [54]:
# Download order data for the window BEFORE_START_DATE..EXP_STOP_DATE,
# restricted to the experiment's city and order type, and snapshot it.
orders_request = {
    'start_date': BEFORE_START_DATE,
    'stop_date': EXP_STOP_DATE,
    'city_id': CITY_ID,
    'order_type': ORDER_TYPE,
    'user_name': USER_NAME,
}
df_orders = download_order_data(**orders_request)
df_orders.to_parquet(DATA_ROOT_PATH.joinpath('df_orders.pqt'))

__Prepare__

In [55]:
# Run the project's preparation step on the raw recprice frame and snapshot it.
df_recprice_prepared = prepare_recprice_data(df_recprice)
df_recprice_prepared.to_parquet(DATA_ROOT_PATH.joinpath('df_recprice_prepared.pqt'))

In [56]:
# Run the project's preparation step on the raw orders frame and snapshot it.
df_orders_prepared = prepare_order_data(df_orders)
df_orders_prepared.to_parquet(DATA_ROOT_PATH.joinpath('df_orders_prepared.pqt'))

In [57]:
# Disabled variant, kept for reference only: it differs from the active
# get_full_df call in the following cell by passing the frames in the opposite
# order (recprice first, orders second). Nothing in this cell is executed.
# #df_full = get_full_df(df_orders_prepared, df_recprice_prepared)
# df_full = get_full_df(df_recprice_prepared, df_orders_prepared)
# df_full['group_name'] = df_full['recprice_group_name']
# #df_full.to_parquet(DATA_ROOT_PATH / 'df_full.pqt')

In [58]:
# Combine prepared orders with prepared recprice rows (orders first).
df_full = get_full_df(df_orders_prepared, df_recprice_prepared)
# The experiment group of a combined row is the one recorded on the recprice side.
df_full['group_name'] = df_full.recprice_group_name
# Snapshot intentionally disabled:
# df_full.to_parquet(DATA_ROOT_PATH / 'df_full.pqt')

только уникальные ордера? – True
доля оставшихся ордеров: 0.9768


### Metrics

#### Total Metrics 


In [59]:
# Metrics aggregated per experiment group and switch window, over all data.
switch_window_keys = ['group_name', 'switch_start_dttm', 'switch_finish_dttm']
df_metrics_total = calculate_metrics(
    df_recprice_prepared, df_orders_prepared, df_full, group_cols=switch_window_keys
)

# Keep only the columns needed to read the test outcome (alpha = 0.05).
report_columns = ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
metrics_total_tbl = get_switchback_results(df_metrics_total, alpha=0.05)[report_columns]

metrics_total_tbl


Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant
0,cp2order,0.701179,0.700586,-0.000847,0.7749183,False
1,order2bid,0.662988,0.683511,0.030955,0.02762588,True
2,order2start_price_bid,0.272425,0.278281,0.021496,0.255011,False
3,order2accept,0.476911,0.491699,0.031007,0.0746468,False
4,order2done,0.417513,0.43093,0.032135,0.06706727,False
5,bid2accept,0.719335,0.719372,5e-05,0.9898488,False
6,bid2done,0.629744,0.630465,0.001144,0.7908825,False
7,start_price_bid2accept,0.832754,0.835046,0.002752,0.1386049,False
8,start_price_bid2done,0.738976,0.74085,0.002536,0.3289369,False
9,cp2bid,0.191019,0.19496,0.020631,0.2876415,False


#### Metrics, by distance 

In [60]:
# Produce the "*_merged" versions of the three prepared frames. Later cells
# filter these frames on their `surge_bin` and `orders_distance_bin` columns.
binning_options = dict(
    bound_dynamic_surge=0.0,
    step_surge_bin=0.5,
    step_orders_distance_bin=5,
    filtered_surge_bin=np.unique([1.0, 1.5]),
    filtered_dist_bins=np.arange(0, 25 + 1, 5),
)
(
    df_recprice_prepared_merged,
    df_orders_prepared_merged,
    df_full_prepared_merged,
) = prepare_my(df_recprice_prepared, df_orders_prepared, df_full, **binning_options)

lower_bound_dynamic_surge:  0.0
step_surge_bin:  0.5
step_distance_bin:  5
surge_bins:  [1.  1.5]
dist_bins:  [ 0  5 10 15 20 25]
df_full...
df_recprice...
df_orders...


In [61]:
# Switchback results computed separately for every order-distance bin.
step_orders_distance_bin = 5
filtered_dist_bins = np.arange(0, 25 + 1, step_orders_distance_bin)

results = []
for dist_bin in filtered_dist_bins:
    # Loop-local names: the total-metrics objects built earlier
    # (df_metrics_total / metrics_total_tbl) must not be overwritten here,
    # otherwise the totals table shown above would no longer match them.
    df_metrics_bin = calculate_metrics(
        df_recprice_prepared_merged[df_recprice_prepared_merged['orders_distance_bin'] == dist_bin],
        df_orders_prepared_merged[df_orders_prepared_merged['orders_distance_bin'] == dist_bin],
        df_full_prepared_merged[df_full_prepared_merged['orders_distance_bin'] == dist_bin],
        group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
    )

    metrics_bin_tbl = get_switchback_results(df_metrics_bin, alpha=0.05)[
        ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
    ]

    # .assign returns a new frame, so the bin label is never written into a
    # column-selection slice (the original triggered SettingWithCopyWarning).
    results.append(metrics_bin_tbl.assign(dist_bin=dist_bin))

# One long table: all metrics for all distance bins.
df_results = pd.concat(results, ignore_index=True)
df_results

    

Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,dist_bin
0,cp2order,1.235204,1.224397,-0.008749,0.013160,True,0
1,order2bid,0.689290,0.709440,0.029232,0.022990,True,0
2,order2start_price_bid,0.336311,0.348677,0.036769,0.029882,True,0
3,order2accept,0.528133,0.545560,0.032998,0.039395,True,0
4,order2done,0.469953,0.486123,0.034408,0.034431,True,0
...,...,...,...,...,...,...,...
247,surge_le_1_cp2order,1.227264,1.236774,0.007749,0.379130,False,25
248,surge_gr_1_order2done,0.353622,0.356865,0.009171,0.748109,False,25
249,surge_le_1_order2done,0.495139,0.491292,-0.007771,0.664354,False,25
250,surge_gr_1_cp2done,0.464584,0.467019,0.005241,0.820109,False,25


In [62]:
# Names of all metrics available in the results table.
df_results['metric'].unique()

array(['cp2order', 'order2bid', 'order2start_price_bid', 'order2accept',
       'order2done', 'bid2accept', 'bid2done', 'start_price_bid2accept',
       'start_price_bid2done', 'cp2bid', 'cp2start_price_bid',
       'cp2accept', 'cp2done', 'minprice_usd', 'price_base_usd',
       'recprice_usd', 'price_highrate_usd', 'rides_price_highrate_usd',
       'price_start_usd', 'rides_price_start_usd', 'price_tender_usd',
       'price_done_usd', 'done2rec', 'surge', 'dynamic_surge',
       'original_dynamic_surge_updated', 'good_rate', 'balance',
       'orders_by_minprice_share', 'cp2order_by_orders_by_minprice',
       'order2done_by_orders_by_minprice',
       'cp2done_by_orders_by_minprice', 'surge_gr_1_orders_share',
       'surge_le_1_orders_share', 'surge_gr_1_rides_share',
       'surge_le_1_rides_share', 'surge_gr_1_cp2order',
       'surge_le_1_cp2order', 'surge_gr_1_order2done',
       'surge_le_1_order2done', 'surge_gr_1_cp2done',
       'surge_le_1_cp2done'], dtype=object)

In [63]:
# Control vs. experimental value of selected metrics across distance bins.
# Points with a statistically significant difference are overdrawn with stars.
filtered_df = df_results[df_results['metric'].isin(
    ['cp2order', 'order2done', 'price_highrate_usd', 'done2rec', 'dynamic_surge', 'balance', 'surge_gr_1_orders_share', 'surge_le_1_orders_share', 'surge_gr_1_cp2order', 'surge_le_1_cp2order', 'surge_gr_1_cp2done', 'surge_le_1_cp2done']
)]

# Long format: one row per (metric, distance bin, group value).
df_melted = filtered_df.melt(
    id_vars=["metric", "dist_bin", "is_significant"],
    value_vars=["control_value", "experimental_value"],
    var_name="value_type",
    value_name="value"
)

# One figure per metric.
unique_metrics = df_melted["metric"].unique()

for metric in unique_metrics:
    # The former "marker_style" helper column was removed: it was written onto
    # a slice of df_melted (SettingWithCopyWarning) and never read afterwards.
    metric_data = df_melted[df_melted["metric"] == metric]

    fig = px.line(
        metric_data,
        x="dist_bin",
        y="value",
        color="value_type",
        markers=True,
        title=f"{metric}",
        labels={"value": "Value", "dist_bin": "Distance Bin", "value_type": "Group"}
    )

    # Colour assigned by plotly to each line, keyed by trace name (= value_type).
    line_colors = {trace.name: trace.line.color for trace in fig.data if 'line' in trace}

    # Star markers on the significant points, in the colour of their line.
    # Groups without significant points simply get no extra trace.
    significant_points = metric_data[metric_data["is_significant"]]
    for value_type, value_data in significant_points.groupby("value_type", sort=False):
        fig.add_scatter(
            x=value_data["dist_bin"],
            y=value_data["value"],
            mode="markers",
            marker=dict(
                symbol="star",
                size=10,
                color=line_colors[value_type]
            ),
            showlegend=False,  # stars are an overlay, not a separate series
            hoverinfo='skip'   # hover is already provided by the line trace
        )

    fig.update_layout(
        legend_title="Group",
        xaxis_title="Distance Bin",
        yaxis_title="",
        width=800,
        height=500
    )

    fig.show()


#### Metrics, by distance AND surge

In [64]:
# Switchback results for every (surge bin, distance bin) combination.
step_orders_distance_bin = 5
# The step constant is now actually used (it was defined but ignored before).
filtered_dist_bins = np.arange(0, 25 + 1, step_orders_distance_bin)

step_surge_bin = 0.5  # also read by the heatmap cell for the y-axis tick spacing
filtered_surge_bin = np.unique([1.0, 1.5])


def _select_bin(frame, surge_bin, dist_bin):
    """Return the rows of `frame` that fall into the given surge and distance bins."""
    in_bin = (frame['surge_bin'] == surge_bin) & (frame['orders_distance_bin'] == dist_bin)
    return frame[in_bin]


results = []
for surge_bin in filtered_surge_bin:
    for dist_bin in filtered_dist_bins:
        # Loop-local names, so the total-metrics objects built earlier
        # (df_metrics_total / metrics_total_tbl) are not overwritten.
        df_metrics_bin = calculate_metrics(
            _select_bin(df_recprice_prepared_merged, surge_bin, dist_bin),
            _select_bin(df_orders_prepared_merged, surge_bin, dist_bin),
            _select_bin(df_full_prepared_merged, surge_bin, dist_bin),
            group_cols=['group_name', 'switch_start_dttm', 'switch_finish_dttm'],
        )

        # Progress trace: the bin pair before the test, 'done' after it.
        print(surge_bin, dist_bin)
        metrics_bin_tbl = get_switchback_results(df_metrics_bin, alpha=0.05)[
            ['metric', 'control_value', 'experimental_value', 'uplift_rel', 'pvalue', 'is_significant']
        ]
        print('done')

        # .assign returns a new frame instead of writing into a
        # column-selection slice (avoids SettingWithCopyWarning).
        results.append(metrics_bin_tbl.assign(dist_bin=dist_bin, surge_bin=surge_bin))

# One long table: all metrics for all (surge bin, distance bin) pairs.
df_results = pd.concat(results, ignore_index=True)
df_results

1.0 0
done
1.0 5
done
1.0 10
done
1.0 15
done
1.0 20
done
1.0 25
done
1.5 0
done
1.5 5
done
1.5 10
done
1.5 15
done
1.5 20
done
1.5 25
done


Unnamed: 0,metric,control_value,experimental_value,uplift_rel,pvalue,is_significant,dist_bin,surge_bin
0,cp2order,1.251188,1.239593,-0.009267,0.004621,True,0,1.0
1,order2bid,0.662075,0.672949,0.016425,0.094796,False,0,1.0
2,order2start_price_bid,0.315245,0.329465,0.045107,0.000424,True,0,1.0
3,order2accept,0.496352,0.507056,0.021566,0.075606,False,0,1.0
4,order2done,0.440224,0.450451,0.023230,0.055066,False,0,1.0
...,...,...,...,...,...,...,...,...
499,surge_le_1_cp2order,,,,,False,25,1.5
500,surge_gr_1_order2done,0.229143,0.236071,0.030232,0.690784,False,25,1.5
501,surge_le_1_order2done,,,,,False,25,1.5
502,surge_gr_1_cp2done,0.318787,0.321350,0.008042,0.903775,False,25,1.5


In [65]:
# Relative uplift per (surge bin, distance bin): one heatmap per metric.
# A trailing '*' marks cells whose difference is statistically significant.
filtered_df = df_results[df_results['metric'].isin(
    ['cp2order', 'order2done', 'price_highrate_usd', 'done2rec', 'dynamic_surge', 
     'balance', 'surge_gr_1_orders_share', 'surge_gr_1_cp2order', 'surge_gr_1_cp2done']
)]
metrics = filtered_df['metric'].unique()

for metric in metrics:
    metric_data = filtered_df[filtered_df['metric'] == metric]

    # Rows = surge bins, columns = distance bins.
    heatmap_data = metric_data.pivot(index='surge_bin', columns='dist_bin', values='uplift_rel')
    significance_data = metric_data.pivot(index='surge_bin', columns='dist_bin', values='is_significant')

    # Build the cell labels as strings from the start. Writing strings into a
    # copy of the float matrix is an incompatible-dtype assignment that recent
    # pandas versions warn about.
    text_data = heatmap_data.apply(lambda col: col.map("{:.2f}".format))
    # eq(True) treats a missing flag (NaN after the pivot) as "not significant";
    # a plain truthiness test would add a star, because NaN is truthy.
    is_starred = significance_data.eq(True)
    text_data = text_data.where(~is_starred, text_data + "*")

    # text_auto is not needed: the labels are set explicitly right below.
    fig_rel = px.imshow(
        heatmap_data,
        color_continuous_scale='Viridis',
        aspect='auto',
        title=f"{metric}: Uplift Relative Heatmap",
        labels={"x": "Distance Bin", "y": "Surge Bin", "color": "Uplift Relative"}
    )

    fig_rel.update_traces(text=text_data.values, texttemplate="%{text}")
    # tick0 is the position of the first tick and must be a single value
    # (the whole index was passed before); dtick is the spacing of surge bins.
    fig_rel.update_layout(
        width=800,
        height=500,
        yaxis={"tick0": float(heatmap_data.index.min()), "dtick": step_surge_bin},
    )
    fig_rel.update_coloraxes(showscale=False)

    fig_rel.show()

In [66]:
# Control vs. experimental value across distance bins, one line per
# (group value, surge bin). Significant points are overdrawn with stars.
filtered_df = df_results[df_results['metric'].isin(
    ['cp2order', 'order2done', 'price_highrate_usd', 'done2rec', 'dynamic_surge', 
     'balance', 'surge_gr_1_orders_share', 'surge_gr_1_cp2order', 'surge_gr_1_cp2done']
)]

# Long format: one row per (metric, distance bin, surge bin, group value).
df_melted = filtered_df.melt(
    id_vars=["metric", "dist_bin", "is_significant", "surge_bin"],
    value_vars=["control_value", "experimental_value"],
    var_name="value_type",
    value_name="value"
)

# One figure per metric.
unique_metrics = df_melted["metric"].unique()

for metric in unique_metrics:
    # Legend label combining group value and surge bin ("<br>" = line break).
    # .assign builds a new frame, so nothing is written onto a slice of
    # df_melted (the original triggered SettingWithCopyWarning).
    metric_data = df_melted[df_melted["metric"] == metric].assign(
        grouping=lambda rows: rows["value_type"] + "<br>(Surge: " + rows["surge_bin"].astype(str) + ")"
    )

    fig = px.line(
        metric_data,
        x="dist_bin",
        y="value",
        color="grouping",
        markers=True,
        title=f"{metric}",
        labels={"value": "Value", "dist_bin": "Distance Bin", "grouping": "Group"}
    )

    # Colour assigned by plotly to each line, keyed by trace name (= grouping).
    line_colors = {trace.name: trace.line.color for trace in fig.data if 'line' in trace}

    # Star markers on the significant points. Grouping by the very column that
    # names the traces guarantees that the colour lookup key matches the trace
    # name, instead of rebuilding the label with separate string formatting.
    significant_points = metric_data[metric_data["is_significant"]]
    for grouping_key, value_data in significant_points.groupby("grouping", sort=False):
        marker_color = line_colors.get(grouping_key, 'black')  # fallback kept for safety

        fig.add_scatter(
            x=value_data["dist_bin"],
            y=value_data["value"],
            mode="markers",
            marker=dict(
                symbol="star",
                size=10,
                color=marker_color
            ),
            showlegend=False,  # stars are an overlay, not a separate series
            hoverinfo='skip'   # hover is already provided by the line trace
        )

    fig.update_layout(
        legend_title="Group",
        xaxis_title="Distance Bin",
        yaxis_title="",
        width=800,
        height=500
    )

    fig.show()


In [67]:
# Line charts drawn by the project's draw_lines helper.
# Each entry looks like [metric name, numerator column, denominator column]
# — presumably; verify against src.draw.draw_lines.
line_specs = [
    ["rides_price_highrate_usd", "rides_price_highrate_usd_sum", "rides_count"],
    ["price_highrate_usd", "price_highrate_usd_sum", "orders_count"],
    ["price_done_usd", "price_done_usd_sum", "rides_count"],
    ["done2rec", "price_done_usd_sum", "rides_price_highrate_usd_sum"],
    ["good_rate", "good_orders_count", "orders_count"],
    ["balance", "good_orders_count", "rides_count"],
    ["original_dynamic_surge_updated", "original_dynamic_surge_updated_sum", "calcprices_count"],
    ["cp2order", "orders_count", "calcprices_count"],
    ["cp2done", "rides_count", "calcprices_count"],
]
draw_lines(
    df_recprice_prepared_merged,
    df_orders_prepared_merged,
    df_full_prepared_merged,
    line_specs,
)

# Open notes:
# mph
# dynamic surge is recalculated under 2comp (see OG)
# dr OR rides_cnt

0

In [68]:
# Mean dynamic surge per experiment group in every switch window.
group_cols = ['group_name', 'switch_start_dttm']
df_grouped = (
    df_recprice_prepared
    .groupby(group_cols)
    .agg(avg_surge=('dynamic_surge', 'mean'))
    .reset_index()
)

fig = px.line(
    df_grouped,
    x="switch_start_dttm",
    y="avg_surge",
    color="group_name",
    markers=False,
    title="Average Surge by Group and Switch Start Time",
    labels={
        "switch_start_dttm": "Switch Start Time",
        "avg_surge": "Average Surge",
        "group_name": "Group Name"
    }
)

# Customize layout
fig.update_layout(
    xaxis=dict(title="Switch Start Time", tickangle=-45),
    yaxis=dict(title="Average Surge"),
    legend=dict(title="Group Name"),
    #height=500,
    #width=800
)

fig.show()

# Save the interactive chart next to the other artefacts of this experiment.
# The previous absolute, user-specific path ('/Users/<name>/...') only existed
# on one machine and made the cell fail everywhere else.
fig.write_html(str(DATA_ROOT_PATH / 'Average Surge.html'))


# Observations:
# there are no NaN values in the logs
# there are no values < 1 in the logs

In [69]:
# Distribution of dynamic surge per experiment group, in bins of width 0.1.
# A dedicated constant is used so that `step_surge_bin` (read by the heatmap
# cell for its tick spacing) is not overwritten by this cell.
surge_hist_step = 0.1

# Bin index = floor(surge / step). The ratio is rounded first because plain
# float floor-division mis-bins exact multiples of the step
# (e.g. 1.2 // 0.1 == 11.0, which would put 1.2 into the 1.1 bin).
surge_bin_index = np.floor((df_recprice_prepared['dynamic_surge'] / surge_hist_step).round(6))

# A small two-column frame is built instead of adding a `surge_bin` column to
# df_recprice_prepared itself: the original aliased that frame as `df` and
# mutated it in place.
surge_hist = (
    pd.DataFrame({
        'group_name': df_recprice_prepared['group_name'],
        'surge_bin': (surge_bin_index * surge_hist_step).round(6),
    })
    .groupby(['group_name', 'surge_bin'])
    .size()  # rows with a missing surge have a missing bin and are not counted
    .reset_index(name='cnt_rows')
)

# Share of each bin within its experiment group.
surge_hist['freq'] = (
    surge_hist['cnt_rows'] / surge_hist.groupby('group_name')['cnt_rows'].transform('sum')
)

fig = px.line(
    surge_hist,
    x="surge_bin",
    y="freq",
    color="group_name",
    markers=True,
    # Keys must be column names; the former "x"/"y" keys had no effect.
    labels={"surge_bin": "Dynamic surge bin", "freq": "Share of rows"},
    title="Dynamic surge distribution by group"
)

fig.update_layout(
    width=800,
    height=500,
    legend_title="Group",
)

fig.show()

In [70]:
# Distribution of order distance for completed orders, per experiment group,
# in bins of width 0.25. A dedicated constant is used so that
# `step_orders_distance_bin` from the metrics cells is not overwritten.
distance_hist_step = 0.25

# Completed orders only.
done_orders = df_orders_prepared[df_orders_prepared['is_order_done'] == True]

# A small two-column frame is built instead of adding a column to the filtered
# slice of df_orders_prepared (that assignment raised SettingWithCopyWarning).
distance_hist = (
    pd.DataFrame({
        'group_name': done_orders['group_name'],
        'orders_distance_bin': (done_orders['distance_in_km'] // distance_hist_step) * distance_hist_step,
    })
    .groupby(['group_name', 'orders_distance_bin'])
    .size()  # rows with a missing distance have a missing bin and are not counted
    .reset_index(name='cnt_rows')
)

# Share of each bin within its experiment group.
distance_hist['freq'] = (
    distance_hist['cnt_rows'] / distance_hist.groupby('group_name')['cnt_rows'].transform('sum')
)

fig = px.line(
    distance_hist,
    x="orders_distance_bin",
    y="freq",
    color="group_name",
    markers=False,
    # Keys must be column names; the former "x"/"y" keys had no effect.
    labels={"orders_distance_bin": "Order distance bin, km", "freq": "Share of completed orders"},
    title="Completed-order distance distribution by group"
)

fig.update_layout(
    width=800,
    height=500,
    legend_title="Group",
)

fig.show()