In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
XLSX_URL = "https://huggingface.co/datasets/flodussart/getaround_xls_certif/resolve/main/get_around_delay_analysis.xlsx"
df = pd.read_excel(XLSX_URL, sheet_name=0)  

CSV_URL = "https://huggingface.co/datasets/flodussart/getaround_pricing_project/resolve/main/get_around_pricing_project.csv"
dataset_pricing = pd.read_csv(CSV_URL) 

In [6]:
# Define clipping range for delay values (to handle extreme outliers)
CLIP_MIN, CLIP_MAX = -500, 1000  

# Column name constants (for cleaner and more readable code)
COL_DELAY    = "delay_at_checkout_in_minutes"       # Delay between scheduled and actual checkout
COL_CHECKIN  = "checkin_type"                       # Type of check-in (mobile or connect)
COL_STATE    = "state"                              # Rental status (ended / canceled)
COL_GAP      = "time_delta_with_previous_rental_in_minutes"  # Time gap between two consecutive rentals
COL_RENTAL   = "rental_id"                          # Unique rental identifier
COL_PREV_ID  = "previous_ended_rental_id"           # ID of the previous rental (if exists)

# Maximum Y-axis value used in some plots
Y_MAX = 1000  

# Define color palette for check-in types
COLOR_CI = {
    "mobile": "#4cc9f0",     # Light blue for mobile check-in
    "connect": "#ffb703"     # Orange for connect check-in
}

# Visualization and threshold parameters
NBINS = 60                        # Number of bins for histograms
THRESHOLDS = [60, 120]            # Thresholds in minutes (e.g., <1h, <2h)
THRESHOLDS_ROI = np.arange(0, 721, 15)  # Range of interest for gap analysis (0 to 12h in 15-min steps)
SEUILS = list(range(0, 401, 10))        # Thresholds for sensitivity analysis (0 to 400 min by step of 10)
SEUIL_METRIQUE = 60                     # Metric threshold (e.g., 1 hour buffer)
T_DISPLAY = 60                          # Time display unit for plots (in minutes)


In [7]:
# Work copy
df_base = df.copy()

# Numeric delay + clipping for analysis (not the outlier viz zoom)
df_base["delay_clipped"] = df_base["delay_at_checkout_in_minutes"].clip(CLIP_MIN, CLIP_MAX)

# Canonical perimeter for delay analyses: ended only
df_ended = df_base[df_base["state"].eq("ended")].copy()

# Sanity snapshot
print(
    f"Base: {len(df_base):,} rows | Ended: {len(df_ended):,} | "
    f"Missing delay in ended: {df_ended['delay_clipped'].isna().sum():,}"
)


Base: 21,310 rows | Ended: 18,045 | Missing delay in ended: 1,700


In [None]:
def get_scoped(df_ended: pd.DataFrame, scope: str = "all") -> pd.DataFrame:
    """
    Return a scoped view of df_ended:
      - 'all'      : no filter
      - 'connect'  : connect only
      - 'mobile'   : mobile only
    """
    if scope == "connect":
        return df_ended[df_ended["checkin_type"] == "connect"].copy()
    if scope == "mobile":
        return df_ended[df_ended["checkin_type"] == "mobile"].copy()
    return df_ended.copy()


def make_df_gap(df_ended: pd.DataFrame) -> pd.DataFrame:
    """
    Build df_gap on ended only:
      - keep rows with both delay and gap
      - restrict to expected flows (mobile/connect)
      - add 'gap' as float and 'was_conflict' flag (delay > gap)
    """
    out = df_ended.dropna(
        subset=["delay_clipped", "time_delta_with_previous_rental_in_minutes"]
    ).copy()
    out["gap"] = out["time_delta_with_previous_rental_in_minutes"]
    out = out[out["checkin_type"].isin(["mobile", "connect"])]
    out["was_conflict"] = out["delay_clipped"] > out["gap"]
    return out


def make_df_next(df_ended: pd.DataFrame) -> pd.DataFrame:
    """
    Map previous_ended_rental_id -> rental_id to retrieve the next rental.
    Adds a 'next_rental_id' column.
    """
    if {"rental_id", "previous_ended_rental_id"}.issubset(df_ended.columns):
        mapping = (
            df_ended.dropna(subset=["previous_ended_rental_id"])
            .set_index("previous_ended_rental_id")["rental_id"]
            .to_dict()
        )
        out = df_ended.copy()
        out["next_rental_id"] = out["rental_id"].map(mapping)
        return out
    out = df_ended.copy()
    out["next_rental_id"] = pd.NA
    return out


In [10]:
# Define a single scope for consistency across all sections
SCOPE = "all"  # options: 'all' | 'connect' | 'mobile'

# Apply the chosen scope:
# - Section 1: Delay distribution analysis
# - Section 3: Gap-based curves (masked / avoided rentals)
# - Section 2: Propagation to the next rental
df_scope = get_scoped(df_ended, SCOPE)   
df_gap   = make_df_gap(df_scope)         
df_next  = make_df_next(df_scope)        

# Sanity check: display key row counts for each dataset
print(
    f"Scope '{SCOPE}': "
    f"ended rows={len(df_scope):,} | "
    f"gap rows={len(df_gap):,} | "
    f"with next={df_next['next_rental_id'].notna().sum():,}"
)



Scope 'all': ended rows=18,045 | gap rows=1,515 | with next=1,612


In [None]:
# Punctuality donuts by flow (ALL / CONNECT / MOBILE) 
# We reuse the canonical df_ended prepared earlier.

# Fixed label order + consistent colors across panels
labels = ["À l'heure/avance", "En retard", "Non renseigné"]
colors = ["#FF6B6B", "#3B5BDB", "#74C0FC"]  # on-time/early, late, not recorded

panels = [
    ("Flux : ALL",      get_scoped(df_ended, "all")),
    ("Flux : CONNECT",  get_scoped(df_ended, "connect")),
    ("Flux : MOBILE",   get_scoped(df_ended, "mobile")),
]

fig = make_subplots(
    rows=1, cols=3,
    specs=[[{"type": "pie"}, {"type": "pie"}, {"type": "pie"}]],
    subplot_titles=[title for title, _ in panels],
)

for idx, (title, dfi) in enumerate(panels, start=1):
    # Counts per status on the same perimeter
    # - on_time_or_early: delay <= 0 (includes early)
    # - late: delay > 0
    # - missing: NaN delay (not recorded yet)
    count_on_time = int((dfi["delay_clipped"] <= 0).sum())
    count_late    = int((dfi["delay_clipped"] > 0).sum())
    count_missing = int(dfi["delay_clipped"].isna().sum())

    fig.add_trace(
        go.Pie(
            labels=labels,
            values=[count_on_time, count_late, count_missing],
            hole=0.40,
            textinfo="percent+label",
            showlegend=(idx == 1),  # legend only once
            marker=dict(colors=colors),
        ),
        row=1, col=idx
    )

fig.update_layout(
    height=500,
    width=1600,
    title_text="Ponctualité par flux : ALL vs CONNECT vs MOBILE (avec Non renseigné)",
    title_x=0.5,
    legend=dict(orientation="h", y=-0.05),
    margin=dict(l=40, r=40, t=80, b=60),
)
fig.show()

# Quick distribution sanity check on the same canonical perimeter
print("Distribution checkin_type (ended):")
print(df_ended["checkin_type"].value_counts(dropna=False))


Distribution checkin_type (ended):
checkin_type
mobile     14536
connect     3509
Name: count, dtype: int64


In [None]:
# Histogramme des retards (minutes) 

def plot_delay_histogram(df_ended: pd.DataFrame, scope: str = "all") -> None:
    """
    Plot the distribution of positive delays (minutes) for a given scope.
    Scope options: "all" | "connect" | "mobile".
    - Uses the canonical perimeter df_ended (state == 'ended').
    - Keeps only strictly positive observed delays (delay_clipped > 0).
    """
    # Map for prettier title
    scope_label = {"all": "Tous flux", "connect": "Connect", "mobile": "Mobile"}.get(scope, scope)

    # Scope filter (single entry point)
    df_scope = get_scoped(df_ended, scope)

    # Keep positive delays only (observed)
    delays = df_scope["delay_clipped"]
    df_delays = df_scope.loc[delays.notna() & (delays > 0), ["delay_clipped"]].copy()

    if df_delays.empty:
        print("Aucune ligne éligible pour cet histogramme (retards > 0).")
        return

    # KPIs
    median_delay = float(df_delays["delay_clipped"].median())
    share_over = {t: (df_delays["delay_clipped"] > t).mean() * 100 for t in THRESHOLDS}
    n_rows = len(df_delays)

    # Plot
    fig = px.histogram(
        df_delays,
        x="delay_clipped",
        nbins=NBINS,
        range_x=[0, CLIP_MAX],
        labels={"delay_clipped": "Retard au checkout (minutes, borné)"},
        title=(
            f"Distribution des retards (mn) — scope : {scope_label}"
            f" — bornage [{CLIP_MIN}, {CLIP_MAX}]"
        ),
    )
    fig.update_layout(title_x=0.5, plot_bgcolor="white")

    # Median + vertical thresholds
    fig.add_vline(x=median_delay, line_dash="dash", line_color="black", opacity=0.8)
    fig.add_annotation(
        x=median_delay, y=1.02, xref="x", yref="paper",
        text=f"Médiane ≈ {median_delay:.0f} mn", showarrow=False
    )
    for t in THRESHOLDS:
        fig.add_vline(x=t, line_dash="dot", opacity=0.6)
        fig.add_annotation(x=t, y=1.02, xref="x", yref="paper", text=f"{t} mn", showarrow=False)

    fig.show()

    # 5) Text KPIs
    print(f"Lignes (retards > 0) : {n_rows:,}")
    print(f"Médiane du retard   : {median_delay:.1f} mn")
    for t, pct in share_over.items():
        print(f"Part > {t:>3} mn       : {pct:.1f} %")

    # 6) Missing delays in the same scope (context)
    missing = int(df_scope["delay_clipped"].isna().sum())
    total_scope = len(df_scope)
    print(f"Non renseignés dans ce scope : {missing:,} ({missing / total_scope * 100:.1f} %)")

 
plot_delay_histogram(df_ended, scope="all")   # "all" | "connect" | "mobile"


Lignes (retards > 0) : 9,404
Médiane du retard   : 53.0 mn
Part >  60 mn       : 46.6 %
Part > 120 mn       : 27.1 %
Non renseignés dans ce scope : 1,700 (9.4 %)


In [12]:
# Partie 2 : Impact du délai entre deux locations (masquées / évités) 

def build_gap_curves(df_ended: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Build long-format curves (% masked and % avoided conflicts) by threshold.
    Definitions:
      - Masked (per product rule): gap < threshold (base = all eligible rows)
      - Historical conflict: delay_clipped > gap
      - Avoided conflicts: historical_conflict & (gap < threshold) (base = conflicts)
    Returns:
      loss_curve_long, solved_curve_long, df_gap (cleaned eligible base)
    """
    # Reuse existing helper to build the eligible base
    df_gap = make_df_gap(df_ended).copy()

    rows_loss, rows_solved = [], []
    for t in SEUILS:
        for ci in ("mobile", "connect"):
            sub = df_gap[df_gap[COL_CHECKIN] == ci]

            denom_loss = len(sub)                         # base for "% masked"
            denom_solved = int(sub["was_conflict"].sum()) # base for "% avoided"

            masked = int((sub["gap"] < t).sum())          # rule: masked if gap < t
            avoided = int(((sub["gap"] < t) & sub["was_conflict"]).sum())

            rows_loss.append({
                "Seuil (min)": t,
                "variable": f"Masquées {ci} (%)",
                "value": (masked / denom_loss * 100) if denom_loss else 0.0,
            })
            rows_solved.append({
                "Seuil (min)": t,
                "variable": f"Évités {ci} (%)",
                "value": (avoided / denom_solved * 100) if denom_solved else 0.0,
            })

    loss_curve_long = pd.DataFrame(rows_loss)
    solved_curve_long = pd.DataFrame(rows_solved)
    return loss_curve_long, solved_curve_long, df_gap



def sanity_check_curves(loss_curve_long: pd.DataFrame, solved_curve_long: pd.DataFrame) -> None:
    """Basic safety: zero at t=0 and monotonic non-decreasing curves."""
    t0_loss = loss_curve_long.query("`Seuil (min)` == 0")["value"].sum()
    t0_solved = solved_curve_long.query("`Seuil (min)` == 0")["value"].sum()
    assert abs(t0_loss) < 1e-9, "À t=0, % masquées devrait être 0."
    assert abs(t0_solved) < 1e-9, "À t=0, % évités devrait être 0."

    for df_ in (loss_curve_long, solved_curve_long):
        for var in df_["variable"].unique():
            y = (df_.loc[df_["variable"].eq(var)]
                   .sort_values("Seuil (min)")["value"]
                   .to_numpy())
            if len(y) > 1:
                # tolerate tiny numeric noise only
                assert (np.diff(y) >= -1e-9).all(), f"Courbe non-monotone détectée pour {var}."


def pick_value_at(df_long: pd.DataFrame, label: str, threshold: int) -> float:
    """Safe extractor for a single label at a given threshold."""
    row = df_long[(df_long["Seuil (min)"] == threshold) & (df_long["variable"] == label)]
    return float(row["value"].iloc[0]) if not row.empty else 0.0


# Build curves on the canonical perimeter (ENDED only)
loss_curve_long, solved_curve_long, df_gap = build_gap_curves(df_ended)

# Sanity checks
sanity_check_curves(loss_curve_long, solved_curve_long)

# Plots
title_suffix = f"(bornage [{CLIP_MIN}, {CLIP_MAX}] mn, scope : ended)"

fig_loss = px.line(
    loss_curve_long,
    x="Seuil (min)", y="value", color="variable",
    title=f"🔻 % de locations masquées vs seuil {title_suffix}",
    markers=True,
)
fig_loss.update_layout(title_x=0.5, yaxis_range=[0, 100], plot_bgcolor="white", legend_title_text="")
fig_loss.show()

fig_solved = px.line(
    solved_curve_long,
    x="Seuil (min)", y="value", color="variable",
    title=f"% de conflits historiques évités vs seuil {title_suffix}",
    markers=True,
)
fig_solved.update_layout(title_x=0.5, yaxis_range=[0, 100], plot_bgcolor="white", legend_title_text="")
fig_solved.show()

# KPIs at SEUIL_METRIQUE
masq_mobile  = pick_value_at(loss_curve_long,   "Masquées mobile (%)",  SEUIL_METRIQUE)
masq_connect = pick_value_at(loss_curve_long,   "Masquées connect (%)", SEUIL_METRIQUE)
evit_mobile  = pick_value_at(solved_curve_long, "Évités mobile (%)",    SEUIL_METRIQUE)
evit_connect = pick_value_at(solved_curve_long, "Évités connect (%)",   SEUIL_METRIQUE)

print(f"Basé sur {len(df_gap):,} lignes avec 'gap' connu {title_suffix}.")
print(f"Seuil = {SEUIL_METRIQUE} min")
print(f"  Masquées mobile  : {masq_mobile:5.2f} %")
print(f"  Masquées connect : {masq_connect:5.2f} %")
print(f"  Conflits évités mobile  : {evit_mobile:5.2f} %")
print(f"  Conflits évités connect : {evit_connect:5.2f} %")


Basé sur 1,515 lignes avec 'gap' connu (bornage [-500, 1000] mn, scope : ended).
Seuil = 60 min
  Masquées mobile  : 21.66 %
  Masquées connect : 22.69 %
  Conflits évités mobile  : 59.47 %
  Conflits évités connect : 78.75 %


In [None]:
# Partie 3 : Impact des retards sur la location suivante (strict 'ended') 

# Build next-rental mapping within ENDED only
ended_base = make_df_next(df_ended).copy()

delay_map = ended_base.set_index(COL_RENTAL)["delay_clipped"].to_dict()

df_next = ended_base.dropna(subset=["next_rental_id"]).copy()
df_next["next_rental_id"] = df_next["next_rental_id"].astype("Int64")
df_next["next_delay"] = df_next["next_rental_id"].map(delay_map)

# Keep valid A→B pairs and expected flows
df_plot = df_next.dropna(subset=["delay_clipped", "next_delay"]).copy()
df_plot = df_plot[df_plot[COL_CHECKIN].isin(["mobile", "connect"])].copy()

# Keep non-negative next delays and clip Y for readability
df_plot = df_plot[df_plot["next_delay"] >= 0].copy()
df_plot["next_delay"] = df_plot["next_delay"].clip(0, Y_MAX)

# Scatter plot
fig_scatter = px.scatter(
    df_plot,
    x="delay_clipped",
    y="next_delay",
    color=COL_CHECKIN,
    color_discrete_map=COLOR_CI,
    labels={
        "delay_clipped": "Retard au checkout (mn, borné)",
        "next_delay": "Retard de la suivante (mn, borné)",
        COL_CHECKIN: "Type de check-in",
    },
    title="Propagation du retard : location actuelle → location suivante (périmètre ended uniquement)",
)
fig_scatter.add_hline(y=0, line_dash="dash", line_color="#999", opacity=0.6)
fig_scatter.add_vline(x=0, line_dash="dash", line_color="#999", opacity=0.6)
fig_scatter.update_layout(title_x=0.5, plot_bgcolor="white")
fig_scatter.show()

# KPIs
pct_late_next = (df_plot["next_delay"] > 0).mean() * 100 if len(df_plot) else 0.0
avg_next_delay = (
    df_plot.loc[df_plot["next_delay"] > 0, "next_delay"].mean()
    if (df_plot["next_delay"] > 0).any()
    else 0.0
)

print(f"Lignes utilisées pour la viz : {len(df_plot):,}")
print(f"% de locations suivantes en retard : {pct_late_next:.1f} %")
print(f"Retard moyen de la suivante si en retard : {avg_next_delay:.1f} min")


Lignes utilisées pour la viz : 790
% de locations suivantes en retard : 98.2 %
Retard moyen de la suivante si en retard : 117.5 min


In [None]:
# Partie 4 : ROI du seuil — gain marginal & efficacité 

THRESHOLDS_ROI = np.arange(0, 721, 15, dtype=int)  # 0..720 by 15 minutes
T_DISPLAY = 60  # threshold used for KPI display

# Data preparation 
imp = df_ended.dropna(subset=["delay_clipped", COL_GAP]).copy()
imp["gap"] = imp[COL_GAP].astype(float)
imp = imp[imp[COL_CHECKIN].isin(["mobile", "connect"])].copy()

def masked_counts(sub: pd.DataFrame, t: int) -> int:
    """Count rentals hidden by the product rule (gap < t). Base = all eligible rows."""
    return int((sub["gap"] < t).sum())

def solved_counts(sub: pd.DataFrame, t: int) -> int:
    """Count historical conflicts avoided by threshold t."""
    was_conflict = sub["delay_clipped"] > sub["gap"]
    avoided = was_conflict & (sub["gap"] < t)
    return int(avoided.sum())

rows_eff, rows_marg = [], []
prev_total_solved = 0

for t in THRESHOLDS_ROI:
    for ci in ("mobile", "connect"):
        sub = imp[imp[COL_CHECKIN] == ci]
        solved_ci = solved_counts(sub, t)
        masked_ci = masked_counts(sub, t)
        efficiency = solved_ci / masked_ci if masked_ci > 0 else 0.0
        rows_eff.append({"Seuil (min)": t, "variable": f"Efficacité {ci}", "value": efficiency})

    solved_total_t = solved_counts(imp, t)
    gain = solved_total_t - prev_total_solved
    rows_marg.append({"Seuil (min)": t, "Gain marginal (Total)": gain})
    prev_total_solved = solved_total_t

df_eff = pd.DataFrame(rows_eff)
df_marg = pd.DataFrame(rows_marg)


# Sweet spot = threshold maximizing the average efficiency across flows
df_eff_tot = (
    df_eff.pivot(index="Seuil (min)", columns="variable", values="value")
          .assign(Efficacité_totale=lambda x: x.filter(like="Efficacité").mean(axis=1))
          .reset_index()
)
sweet_row = df_eff_tot.loc[df_eff_tot["Efficacité_totale"].idxmax()]
sweet_t = int(sweet_row["Seuil (min)"])
sweet_eff = float(sweet_row["Efficacité_totale"])

# Plots
fig_marg = px.line(
    df_marg, x="Seuil (min)", y="Gain marginal (Total)",
    title="Gain marginal de cas résolus (entre seuils consécutifs)", markers=True
)
fig_marg.add_vline(x=sweet_t, line_dash="dot", line_color="#888")
fig_marg.update_layout(title_x=0.5, plot_bgcolor="white")
fig_marg.show()

fig_eff = px.line(
    df_eff, x="Seuil (min)", y="value", color="variable",
    title="Efficacité = cas résolus / locations masquées", markers=True
)
fig_eff.add_vline(x=sweet_t, line_dash="dot", line_color="#888")
fig_eff.update_layout(
    title_x=0.5,
    yaxis_title="résolus par location masquée",
    plot_bgcolor="white",
    legend_title_text=""
)
fig_eff.show()

# KPIs at one threshold (T_DISPLAY)
def pick_metric_eff(df_long: pd.DataFrame, label: str, t: int) -> float:
    """Pick a single efficiency value from df_long at threshold t."""
    row = df_long[(df_long["Seuil (min)"] == t) & (df_long["variable"] == label)]
    return float(row["value"].iloc[0]) if not row.empty else 0.0

eff_mobile = pick_metric_eff(df_eff, "Efficacité mobile", T_DISPLAY)
eff_connect = pick_metric_eff(df_eff, "Efficacité connect", T_DISPLAY)

sub_mobile = imp[imp[COL_CHECKIN] == "mobile"]
sub_connect = imp[imp[COL_CHECKIN] == "connect"]

print(f"Seuil recommandé (sweet spot) : {sweet_t} min  |  Efficacité moyenne : {sweet_eff:.3f}")
print(f"[{T_DISPLAY} min]  Efficacité mobile  : {eff_mobile:.3f}  |  connect : {eff_connect:.3f}")
print(f"[{T_DISPLAY} min]  Masquées mobile : {masked_counts(sub_mobile, T_DISPLAY):,}  |  connect : {masked_counts(sub_connect, T_DISPLAY):,}")
print(f"[{T_DISPLAY} min]  Résolus  mobile : {solved_counts(sub_mobile, T_DISPLAY):,}  |  connect : {solved_counts(sub_connect, T_DISPLAY):,}")


Seuil recommandé (sweet spot) : 15 min  |  Efficacité moyenne : 0.592
[60 min]  Efficacité mobile  : 0.611  |  connect : 0.420
[60 min]  Masquées mobile : 185  |  connect : 150
[60 min]  Résolus  mobile : 113  |  connect : 63


In [21]:
# Partie 5 : Scénarios business — estimation du CA impacté
def run_business_scenario(
    df_ended: pd.DataFrame,
    threshold_min: int = 60,
    avg_duration_days: float = 1.5,
    loss_rate: float = 0.60,         # 60% of conflicts effectively impact revenue
    mean_daily_price: float = 120.0, # € per day (adjust if needed)
    scope: str = "all",              # "all" | "connect" | "mobile"
) -> dict:
    """
    Compute business KPIs for a given time buffer (threshold_min).

    Parameters
    ----------
    df_ended : pd.DataFrame
        Canonical dataset (state == "ended") with 'delay_clipped' and 'gap' available.
    threshold_min : int
        Minimum buffer between two rentals in minutes.
    avg_duration_days : float
        Average rental duration (business assumption, in days).
    loss_rate : float
        Share of conflicts that effectively lead to revenue loss (0–1).
    mean_daily_price : float
        Average daily rental price in euros (from pricing dataset or assumption).
    scope : str
        "all", "connect", or "mobile" — same logic as get_scoped().

    Returns
    -------
    dict
        Dictionary containing computed KPIs and revenue estimates.
    """

    # Apply the chosen scope using existing helper
    df = get_scoped(df_ended, scope)

    # Keep only rows with both delay and gap defined
    eligible = df.dropna(subset=["delay_clipped", COL_GAP]).copy()
    if eligible.empty:
        print("Aucune ligne éligible (retard & gap manquants).")
        return {}

    # Ensure gap is numeric (safeguard, normally already float)
    eligible["gap"] = eligible[COL_GAP]

    # Compute delay after applying the buffer rule (delay - threshold)
    eligible["overrun_after_buffer"] = eligible["delay_clipped"] - threshold_min

    # Identify rentals affected by the product rule (hidden if delay > gap + threshold)
    affected_mask = eligible["overrun_after_buffer"] > eligible["gap"]
    n_eligible    = len(eligible)
    n_affected    = int(affected_mask.sum())
    pct_affected  = (n_affected / n_eligible * 100) if n_eligible else 0

    # Identify historical conflicts resolved by applying the buffer
    conflict_before = eligible["delay_clipped"] > eligible["gap"]
    conflict_after  = eligible["overrun_after_buffer"] > eligible["gap"]
    resolved_mask   = conflict_before & (~conflict_after)

    n_problematic = int(conflict_before.sum())
    n_resolved    = int(resolved_mask.sum())
    pct_resolved  = (n_resolved / n_problematic * 100) if n_problematic else 0

    # Estimate revenue impact based on assumed loss rate and pricing
    baseline_gmv = n_eligible * mean_daily_price * avg_duration_days
    lost_gmv     = n_affected * loss_rate * mean_daily_price * avg_duration_days
    share_revenue_affected = (lost_gmv / baseline_gmv * 100) if baseline_gmv else 0

    # Aggregate results
    result = {
        "scope": scope,
        "threshold_min": threshold_min,
        "n_eligible": n_eligible,
        "n_affected": n_affected,
        "pct_affected": pct_affected,
        "n_problematic": n_problematic,
        "n_resolved": n_resolved,
        "pct_resolved": pct_resolved,
        "avg_duration_days": avg_duration_days,
        "loss_rate": loss_rate,
        "mean_daily_price": mean_daily_price,
        "share_revenue_affected_pct": share_revenue_affected,
        "lost_gmv_euros": lost_gmv,
        "baseline_gmv_euros": baseline_gmv,
    }

    # Display a clear summary 
    print(f"=== Scénario business (scope = '{scope}') ===")
    print(f"- Seuil (buffer)              : {threshold_min} min")
    print(f"- Lignes éligibles            : {n_eligible:,}")
    print(f"- Locations affectées         : {n_affected:,}  ({pct_affected:.1f} %)")
    print(f"- Conflits historiques        : {n_problematic:,}")
    print(f"- Cas résolus par le seuil    : {n_resolved:,}  ({pct_resolved:.1f} %)")
    print(
        f"- Hypothèses CA → durée={avg_duration_days} j, "
        f"%perte={loss_rate*100:.0f} %, prix/j={mean_daily_price:.0f} €"
    )
    print(f"- Part du CA affectée (proxy) : {share_revenue_affected:.1f} %")
    print(
        f"- CA perdu (proxy)            : {lost_gmv:,.0f} €  "
        f"|  baseline {baseline_gmv:,.0f} €"
    )

    return result


# Run business scenarios for different scopes

# every flow
_ = run_business_scenario(
    df_ended,
    threshold_min=60,
    avg_duration_days=1.5,
    loss_rate=0.60,
    mean_daily_price=121,
    scope="all",
)

# Connect only
_ = run_business_scenario(
    df_ended,
    threshold_min=60,
    avg_duration_days=1.5,
    loss_rate=0.60,
    mean_daily_price=121,
    scope="connect",
)


=== Scénario business (scope = 'all') ===
- Seuil (buffer)              : 60 min
- Lignes éligibles            : 1,515
- Locations affectées         : 141  (9.3 %)
- Conflits historiques        : 270
- Cas résolus par le seuil    : 129  (47.8 %)
- Hypothèses CA → durée=1.5 j, %perte=60 %, prix/j=121 €
- Part du CA affectée (proxy) : 5.6 %
- CA perdu (proxy)            : 15,355 €  |  baseline 274,972 €
=== Scénario business (scope = 'connect') ===
- Seuil (buffer)              : 60 min
- Lignes éligibles            : 1,515
- Locations affectées         : 141  (9.3 %)
- Conflits historiques        : 270
- Cas résolus par le seuil    : 129  (47.8 %)
- Hypothèses CA → durée=1.5 j, %perte=60 %, prix/j=121 €
- Part du CA affectée (proxy) : 5.6 %
- CA perdu (proxy)            : 15,355 €  |  baseline 274,972 €
