# Notebook to compute the real autonomy

Here you can find the approach that was used to estimate the real autonomy of vehicles.
Brief explanation:
- We calculate the number of km traveled per SoC point for each discharge phase
- We only keep vehicles that have traveled at least 500 km under direct monitoring
- We look at the mean/median km-per-SoC ratio by brand/model for different km ranges and discharge sizes (in SoC points)
- We filter out aberrant values for each model
- For each VIN we compute a weighted average of the km traveled per SoC point, which we multiply by 100 to obtain the estimated real autonomy

In [None]:
import pandas as pd
from core.sql_utils import get_connection
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from core.s3.s3_utils import S3Service
from core.s3.settings import S3Settings
from core.spark_utils import create_spark_session
from sqlalchemy import text
from pyspark.sql.functions import col
from plotly.subplots import make_subplots
import plotly.figure_factory as ff


# Shared handles used by the cells below: S3 settings, a Spark session created
# from the S3 key/secret, and the S3 service wrapper.
settings = S3Settings()
spark = create_spark_session(settings.S3_KEY, settings.S3_SECRET)
s3 = S3Service()

### load data

In [None]:
# Readings from vehicle_data joined to the VIN of their vehicle.
VEHICLE_DATA_QUERY = """SELECT vd.speed, vd.soh, vd.consumption, vd.timestamp, v.vin FROM vehicle_data vd
            left join vehicle v
            on v.id = vd.vehicle_id;"""
VEHICLE_DATA_COLUMNS = ["speed", "soh", "consumption", "timestamp", "VIN"]

with get_connection() as con:
    cursor = con.cursor()
    try:
        # The statement has no placeholders, so no parameter argument is passed.
        cursor.execute(VEHICLE_DATA_QUERY)
        dbeaver = pd.DataFrame(cursor.fetchall(), columns=VEHICLE_DATA_COLUMNS)
    finally:
        # Release the cursor even if the query or the fetch fails.
        cursor.close()

# Parse the timestamps first so the sort below orders real datetimes; the
# merge_asof cell requires this frame to be sorted on `timestamp`.
dbeaver['timestamp'] = pd.to_datetime(dbeaver['timestamp'])
dbeaver = dbeaver.drop_duplicates()
dbeaver = dbeaver.sort_values('timestamp')
dbeaver["soh"] = dbeaver['soh'].astype(float)

In [None]:
# One phase-result parquet per manufacturer, read in the order listed here.
_PHASE_SOURCES = (
    "tesla_fleet_telemetry",
    "bmw",
    "stellantis",
    "ford",
    "renault",
    "kia",
    "mercedes_benz",
    "volkswagen",
    "volvo_cars",
)

(
    tesla_phase,
    bmw_phase,
    stellantis_phase,
    ford_phase,
    renault_phase,
    kia_phase,
    mercedes_phase,
    volkswagen_phase,
    volvo_phase,
) = (
    s3.read_parquet_df_spark(spark, f"result_phases/result_phases_{source}.parquet")
    for source in _PHASE_SOURCES
)

In [None]:
def clean(df):
    """Select the useful phase columns, keep driving discharges, return pandas.

    Args:
        df: Spark DataFrame of phases. When it has no ``ODOMETER_DIFF`` column,
            one is derived as ``ODOMETER_LAST - ODOMETER_FIRST``.

    Returns:
        A pandas DataFrame limited to the selected columns, without rows where
        ``ODOMETER_DIFF``, ``RANGE`` or ``SOC_DIFF`` is null, keeping only rows
        with ``SOC_DIFF < 0`` and ``ODOMETER_DIFF > 0``. ``DATETIME_BEGIN`` is
        parsed to datetime and the numeric columns are cast to float.
    """
    selected_columns = [
        "DATETIME_BEGIN", "BATTERY_NET_CAPACITY", "CONSUMPTION", "DATETIME_END",
        "MAKE", "MODEL", "ODOMETER_DIFF", "ODOMETER_FIRST", "ODOMETER_LAST",
        "SOC_DIFF", "SOC_FIRST", "SOC_LAST", "RANGE", "VIN", "VERSION",
    ]
    numeric_columns = [
        "BATTERY_NET_CAPACITY", "CONSUMPTION", "ODOMETER_DIFF",
        "ODOMETER_FIRST", "ODOMETER_LAST", "SOC_DIFF", "SOC_FIRST", "SOC_LAST", "RANGE",
    ]

    has_odometer_diff = "ODOMETER_DIFF" in df.columns
    if not has_odometer_diff:
        df = df.withColumn("ODOMETER_DIFF", col("ODOMETER_LAST") - col("ODOMETER_FIRST"))

    is_driving_discharge = (col("SOC_DIFF") < 0) & (col("ODOMETER_DIFF") > 0)
    phases = (
        df.select(*selected_columns)
        .na.drop(subset=["ODOMETER_DIFF", "RANGE", "SOC_DIFF"])
        .filter(is_driving_discharge)
        .toPandas()
    )

    phases['DATETIME_BEGIN'] = pd.to_datetime(phases['DATETIME_BEGIN'])
    phases[numeric_columns] = phases[numeric_columns].astype('float')
    return phases
    

In [None]:
# Clean every manufacturer's phases, then stack them into a single frame.
_cleaned_frames = [
    clean(phase)
    for phase in (
        tesla_phase,
        bmw_phase,
        renault_phase,
        ford_phase,
        stellantis_phase,
        kia_phase,
        mercedes_phase,
        volkswagen_phase,
        volvo_phase,
    )
]
(
    tesla_clean,
    bmw_clean,
    renault_clean,
    ford_clean,
    stellantis_clean,
    kia_clean,
    mercedes_clean,
    volkswagen_clean,
    volvo_clean,
) = _cleaned_frames
df_clean = pd.concat(_cleaned_frames)


In [None]:
# For each phase, attach the vehicle_data reading of the same VIN whose
# timestamp is nearest to the phase start (no tolerance is set).
phases_by_start = df_clean.sort_values('DATETIME_BEGIN')
result = pd.merge_asof(
    phases_by_start,
    dbeaver,
    left_on='DATETIME_BEGIN',
    right_on='timestamp',
    by='VIN',
    direction='nearest',
)

# Study distribution

Compute the number of km driven per SoC point.

In [None]:
# km driven per SoC point consumed during the phase.
km_per_soc_point = result['ODOMETER_DIFF'] / result['SOC_DIFF'].abs()
result['ratio_km_soc'] = km_per_soc_point
# Same ratio scaled by 100 and divided by the RANGE column.
result['ratio_km_soc_autonomy'] = 100 * result['ratio_km_soc'] / (result['RANGE'])

In [None]:

# Distribution of ratio_km_soc_autonomy restricted to the open interval (0, 10),
# on rows that also have a SoH value.
autonomy_ratio = result['ratio_km_soc_autonomy']
in_window = (autonomy_ratio > 0) & (autonomy_ratio < 10)
window_values = (
    result.loc[in_window, ['soh', 'ratio_km_soc_autonomy']]
    .dropna()['ratio_km_soc_autonomy']
    .values
)
fig = ff.create_distplot(
    [window_values],
    ['ratio_km_soc_autonomy'],
    colors=['green'],
    bin_size=.01,
)
fig.show()

In [None]:
# SoH vs km per SoC point, coloured by make, keeping only ratios in (0, 3).
px.scatter(result[(result['ratio_km_soc'] > 0) & (result['ratio_km_soc'] < 3)], x='soh', y='ratio_km_soc', color="MAKE")

We may need to filter out ratios below 1 km per SoC point; we will see during the autonomy computation.

##### Consumption

Some consumption values are unrealistic. A filter is needed for the moment, but every consumption value will be checked later to clean this up.

In [None]:
# Last value per VIN (in the current row order of `result`) of SoH, RANGE and net capacity.
_last_per_vin = {
    "soh": ('soh', 'last'),
    "wltp": ('RANGE', 'last'),
    "net_capacity": ('BATTERY_NET_CAPACITY', 'last'),
}
vin_gb = result.groupby("VIN", as_index=False).agg(**_last_per_vin)

We compute a distance-weighted mean (weights = `ODOMETER_DIFF`) for the consumption and for `ratio_km_soc`.  
`consumption` is always filtered;  
`ratio_km_soc` is computed both with and without the extra "> 1 km per SoC point" filter.

In [None]:
def weighted_mean(df, group_col, value_col, weight_col):
    """Return the weighted average of ``value_col`` for each group.

    The result for a group is ``sum(value * weight) / sum(weight)``. Rows whose
    product is NaN are skipped in the numerator, while their weight (if not
    NaN) still counts in the denominator. A group whose weights sum to zero
    yields NaN or an infinite value.

    Args:
        df: Input DataFrame.
        group_col: Column label (or list of labels) to group by.
        value_col: Column holding the values to average.
        weight_col: Column holding the weights.

    Returns:
        DataFrame with the grouping column(s) and a ``weighted_avg_<value_col>``
        column, one row per group, sorted by group key. An empty input gives an
        empty frame with those same columns.
    """
    product_col = "__weighted_value__"
    # Vectorised sums per group instead of a Python lambda per group.
    sums = (
        df.assign(**{product_col: df[value_col] * df[weight_col]})
        .groupby(group_col)[[product_col, weight_col]]
        .sum()
    )
    averages = sums[product_col] / sums[weight_col]
    return averages.rename(f"weighted_avg_{value_col}").reset_index()


In [None]:
# Row filters applied before averaging (all bounds are strict).
_conso_ok = (result['CONSUMPTION'] > 0) & (result['CONSUMPTION'] < 50)
_ratio_positive = (result['ratio_km_soc'] < 15) & (result['ratio_km_soc'] > 0)
_ratio_above_one = (result['ratio_km_soc'] < 15) & (result['ratio_km_soc'] > 1)

# Distance-weighted (ODOMETER_DIFF) averages per VIN.
wh_conso = weighted_mean(result[_conso_ok], "VIN", "CONSUMPTION", "ODOMETER_DIFF")
wh_km_soc = weighted_mean(result[_ratio_positive], "VIN", "ratio_km_soc", "ODOMETER_DIFF")
wh_km_soc_filter = weighted_mean(
    result[_ratio_above_one], "VIN", "ratio_km_soc", "ODOMETER_DIFF"
).rename(columns={'weighted_avg_ratio_km_soc': 'weighted_avg_ratio_km_soc_filter'})

In [None]:
# Combine the per-VIN tables. merge() without `on` joins on the column names the
# two frames share, and the default inner join drops VINs missing from any table.
info = vin_gb.merge(wh_conso).merge(wh_km_soc).merge(wh_km_soc_filter)

## Which method?

### Autonomy computation based on consumption

In [None]:
def find_real_autonomy_soh(conso, soh, capacity):
    """Return ``capacity * soh * 100 / conso`` with ``soh`` capped at 1.

    Args:
        conso: Consumption used as the divisor.
        soh: State of health; values above 1 are treated as 1.
        capacity: Battery capacity.

    Returns:
        The computed range estimate.
    """
    capped_soh = 1 if soh > 1 else soh
    return capacity * capped_soh * 100 / conso

def find_real_autonomy(conso, capacity):
    """Return ``capacity * 100 / conso`` (no state-of-health correction).

    Args:
        conso: Consumption used as the divisor.
        capacity: Battery capacity.

    Returns:
        The computed range estimate.
    """
    return capacity * 100 / conso

In [None]:
def _range_from_conso(row):
    """Range from weighted consumption and net capacity for one `info` row."""
    return find_real_autonomy(row['weighted_avg_CONSUMPTION'], row["net_capacity"])


def _range_from_conso_soh(row):
    """Same as `_range_from_conso`, additionally scaled by the row's SoH."""
    return find_real_autonomy_soh(row['weighted_avg_CONSUMPTION'], row['soh'], row["net_capacity"])


info['range_based_conso'] = info.apply(_range_from_conso, axis=1)
info['range_based_conso_soh'] = info.apply(_range_from_conso_soh, axis=1)
print(info.shape)


In [None]:
# Consumption-based range against SoH, coloured by the RANGE value kept as `wltp`.
px.scatter(info, y="range_based_conso", x="soh", color="wltp")

In [None]:
# Side-by-side distributions of the two consumption-based range estimates.
fig = make_subplots(rows=1, cols=2, subplot_titles=[
    "range based conso",
    "range based conso soh",
])

# (column to plot, subplot row, subplot column)
vars_to_plot = [
    ("range_based_conso", 1, 1),
    ("range_based_conso_soh", 1, 2),
]

# The loop variables must not be named `col`: that would overwrite the pyspark
# `col` function imported at the top and break `clean()` on a later re-run.
for var, subplot_row, subplot_col in vars_to_plot:
    filtered = info.dropna(subset=[var])[var].values
    dist = ff.create_distplot([filtered], [var], colors=['green'], bin_size=25, show_rug=False)

    # Copy the traces of the stand-alone distplot into the shared figure.
    for trace in dist['data']:
        fig.add_trace(trace, row=subplot_row, col=subplot_col)

fig.update_layout(
    title="Distribution des proportions",
    showlegend=False,
    height=500,
    width=1400
)

fig.show()


In [None]:
# Summary statistics of the two consumption-based range estimates.
info[['range_based_conso', 'range_based_conso_soh']].describe()

### Autonomy based on ratio km/soc

In [None]:
# Range estimate = weighted km per SoC point x 100, for both ratio variants.
for ratio_column, range_column in (
    ('weighted_avg_ratio_km_soc', 'range_based_ratio'),
    ('weighted_avg_ratio_km_soc_filter', 'range_based_ratio_filter'),
):
    info[range_column] = info[ratio_column] * 100

In [None]:
# Side-by-side distributions of the two ratio-based range estimates.
# The subplot titles name the ratio-based columns that are actually plotted.
fig = make_subplots(rows=1, cols=2, subplot_titles=[
    "range_based_ratio",
    "range_based_ratio_filter",
])

# (column to plot, subplot row, subplot column)
vars_to_plot = [
    ("range_based_ratio", 1, 1),
    ("range_based_ratio_filter", 1, 2),
]

# The loop variables must not be named `col`: that would overwrite the pyspark
# `col` function imported at the top and break `clean()` on a later re-run.
for var, subplot_row, subplot_col in vars_to_plot:
    filtered = info.dropna(subset=[var])[var].values
    dist = ff.create_distplot([filtered], [var], colors=['green'], bin_size=25, show_rug=False)

    # Copy the traces of the stand-alone distplot into the shared figure.
    for trace in dist['data']:
        fig.add_trace(trace, row=subplot_row, col=subplot_col)

fig.update_layout(
    title="Distribution des proportions",
    showlegend=False,
    height=500,
    width=1400
)

fig.show()


In [None]:
# Ratio-based range against SoH, coloured by the RANGE value kept as `wltp`.
px.scatter(info, y="range_based_ratio", x="soh", color="wltp")

In [None]:
# Summary statistics of the two ratio-based range estimates.
info[['range_based_ratio', 'range_based_ratio_filter']].describe()

In [None]:
# Each estimated range divided by wltp * soh.
_wltp_times_soh = info["wltp"] * info['soh']
prop_ratio = pd.DataFrame({
    'proportion_conso': info["range_based_conso"] / _wltp_times_soh,
    'proportion_conso_soh': info["range_based_conso_soh"] / _wltp_times_soh,
    'proportion_ratio': info["range_based_ratio"] / _wltp_times_soh,
})


In [None]:
# Summary statistics of the estimated-range / (wltp * soh) proportions.
prop_ratio.describe()

# Range estimation

For the range estimation we will use `ratio_km_soc`.  
We only keep the VINs that we monitored over more than 500 km of driving.

In [None]:
# Odometer span covered by the phases of each VIN, smallest span first.
vin = (
    result.groupby("VIN")
    .agg(
        odometer_start=("ODOMETER_FIRST", "min"),
        odometer_end=("ODOMETER_LAST", "max"),
    )
    .assign(odometer_diff=lambda spans: spans["odometer_end"] - spans["odometer_start"])
    .sort_values("odometer_diff")
)

In [None]:
# Keep only the VINs whose odometer span is strictly above 500.
vin = vin[vin['odometer_diff'] > 500]

In [None]:
# `vin` is indexed by VIN (groupby key), so the index is the list of retained VINs.
valid_vin = vin.index.to_list()

### Study of ratio_km_soc 

In [None]:
# Midpoint between the SoC at the start and at the end of the phase.
result['SOC_MEAN'] = (result['SOC_FIRST'] + result['SOC_LAST'] )/ 2

In [None]:
# km per SoC point against the phase's mean SoC for the retained VINs, with an OLS trendline.
px.scatter(result[(result['VIN'].isin(valid_vin))][['SOC_MEAN', 'ratio_km_soc', 'ODOMETER_DIFF']], x="SOC_MEAN", y="ratio_km_soc", trendline='ols')

In [None]:
# km per SoC point against the distance driven in the phase, for the retained VINs.
px.scatter(result[result['VIN'].isin(valid_vin)][['SOC_MEAN', 'ratio_km_soc', 'ODOMETER_DIFF']], x="ODOMETER_DIFF", y="ratio_km_soc")

In [None]:
# km per SoC point against SOC_DIFF, coloured by mean SoC; retained VINs and
# phases with 10 < ODOMETER_DIFF < 1000 only.
px.scatter(result[(result['VIN'].isin(valid_vin)) & (result['ODOMETER_DIFF']<1000) & (result['ODOMETER_DIFF']>10) ][['SOC_MEAN', 'ratio_km_soc', 'ODOMETER_DIFF', 'SOC_DIFF']], x="SOC_DIFF", y="ratio_km_soc", color="SOC_MEAN")

In [None]:
# km per SoC point against mean SoC, coloured by SOC_DIFF; retained VINs and
# phases with 10 < ODOMETER_DIFF < 1000 only.
px.scatter(result[(result['VIN'].isin(valid_vin)) & (result['ODOMETER_DIFF']<1000) & (result['ODOMETER_DIFF']>10) ][['SOC_MEAN', 'ratio_km_soc', 'ODOMETER_DIFF', 'SOC_DIFF']], x="SOC_MEAN", y="ratio_km_soc", color='SOC_DIFF')

We can clearly see that ODOMETER_DIFF has an impact on ratio_km_soc.  
We can also see that the smaller the SOC_DIFF, the higher the ratio_km_soc.


#### odometer diff study

In [None]:
# Phases with a ratio, a make and a distance; MAKE is lower-cased and trimmed.
df = result.dropna(subset=["ratio_km_soc", "MAKE", "ODOMETER_DIFF"])
df = df.assign(MAKE=df["MAKE"].str.lower().str.strip())

# Distance buckets for ODOMETER_DIFF.
# NOTE(review): the last edge is 1000, so phases above 1000 get a NaN km_range
# although the last label reads ">200 km" - confirm this is intended.
bins = [0, 10, 25, 50, 100, 200, 1000]
labels = ["<10 km", "10–25 km", "25–50 km", "50–100 km", "100–200 km", ">200 km"]
df = df.assign(km_range=pd.cut(df["ODOMETER_DIFF"], bins=bins, labels=labels, include_lowest=True))

# (statistic column, legend name, bar colour)
bar_specs = (
    ("mean", "Moyenne", 'steelblue'),
    ("median", "Médiane", 'darkorange'),
)

# One grouped bar chart per distance bucket, comparing the makes.
for km_range in labels:
    df_range = df[df["km_range"] == km_range]
    if df_range.empty:
        continue

    stats = df_range.groupby("MAKE")["ratio_km_soc"].agg(["mean", "median"]).reset_index()

    fig = go.Figure()
    for stat_name, legend, colour in bar_specs:
        fig.add_trace(go.Bar(
            x=stats["MAKE"],
            y=stats[stat_name],
            name=legend,
            text=stats[stat_name].round(2),
            textposition="outside",
            marker_color=colour
        ))

    fig.update_layout(
        title=f"Ratio_km_soc par marque - {km_range}",
        xaxis_title="Marque",
        yaxis_title="Ratio_km_soc",
        barmode='group',
        template="plotly_white"
    )
    fig.show()


*Within the same km range, the means/medians of the different OEMs can differ substantially.*

In [None]:
# Phases with a ratio, a make and a distance; MAKE is lower-cased and trimmed.
df = result.dropna(subset=["ratio_km_soc", "MAKE", "ODOMETER_DIFF"])
df = df.assign(MAKE=df["MAKE"].str.lower().str.strip())

# Distance buckets for ODOMETER_DIFF.
# NOTE(review): the last edge is 1000, so phases above 1000 get a NaN km_range
# although the last label reads ">200 km" - confirm this is intended.
bins = [0, 10, 25, 50, 100, 200, 1000]
labels = ["<10 km", "10–25 km", "25–50 km", "50–100 km", "100–200 km", ">200 km"]
df = df.assign(km_range=pd.cut(df["ODOMETER_DIFF"], bins=bins, labels=labels, include_lowest=True))

# (statistic column, legend name, bar colour)
bar_specs = (
    ("mean", "Moyenne", 'steelblue'),
    ("median", "Médiane", 'darkorange'),
)

# One grouped bar chart per make, comparing the distance buckets.
for make in df["MAKE"].unique():
    df_make = df[df["MAKE"] == make]
    if df_make.empty:
        continue

    # reindex(labels) puts the buckets in their natural order.
    stats = (
        df_make.groupby("km_range")["ratio_km_soc"]
        .agg(["mean", "median"])
        .reindex(labels)
        .reset_index()
    )

    fig = go.Figure()
    for stat_name, legend, colour in bar_specs:
        fig.add_trace(go.Bar(
            x=stats["km_range"],
            y=stats[stat_name],
            name=legend,
            text=stats[stat_name].round(2),
            textposition="outside",
            marker_color=colour
        ))

    fig.update_layout(
        title=f"Ratio_km_soc pour {make}",
        xaxis_title="Tranche km",
        yaxis_title="Ratio_km_soc",
        barmode='group',
        template="plotly_white"
    )
    fig.show()


In [None]:
# Make x distance-bucket heatmaps of ratio_km_soc: first the mean, then the median.
for aggfunc, heatmap_title in (
    ("mean", "Heatmap moyenne ratio_km_soc"),
    ("median", "Heatmap median ratio_km_soc"),
):
    pivot = df.pivot_table(index="MAKE", columns="km_range", values="ratio_km_soc", aggfunc=aggfunc)

    heatmap = go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale='Blues',
        zmin=0,
        zmax=pivot.max().max(),
        text=pivot.round(2).values,
        texttemplate="%{text}"
    )
    fig = go.Figure(data=heatmap)

    fig.update_layout(
        title=heatmap_title,
        xaxis_title="Tranche km",
        yaxis_title="Marque",
        width=800,
        height=600
    )
    fig.show()

**In general, ratio_km_soc is lower for the <10 km range and higher for the >200 km range.**

In [None]:
# Mean, median, standard deviation and row count of ratio_km_soc per make and distance bucket.
stats_tranche = df.groupby(["MAKE","km_range"])["ratio_km_soc"].agg(["mean", "median", "std", "count"]).reset_index()


In [None]:


# Phases with a ratio, a make and a distance; MAKE is lower-cased and trimmed.
df = result.dropna(subset=["ratio_km_soc", "MAKE", "ODOMETER_DIFF"])
df = df.assign(MAKE=df["MAKE"].str.lower().str.strip())

# Distance buckets for ODOMETER_DIFF.
# NOTE(review): the last edge is 1000, so phases above 1000 get a NaN km_range
# although the last label reads ">200 km" - confirm this is intended.
bins = [0, 10, 25, 50, 100, 200, 1000]
labels = ["<10 km", "10–25 km", "25–50 km", "50–100 km", "100–200 km", ">200 km"]
df = df.assign(km_range=pd.cut(df["ODOMETER_DIFF"], bins=bins, labels=labels, include_lowest=True))

# For each make, keep the buckets whose mean ratio lies within +/-20% of the
# median of that make's per-bucket means.
valid_tranches_km = []
for make in df["MAKE"].unique():
    df_make = df[df["MAKE"] == make]
    stats = df_make.groupby("km_range")["ratio_km_soc"].mean().reset_index()

    median_mean = stats["ratio_km_soc"].median()
    lower, upper = 0.8 * median_mean, 1.2 * median_mean

    in_band = stats["ratio_km_soc"].between(lower, upper)
    valid = stats.loc[in_band, "km_range"]
    valid_tranches_km.extend((make, km) for km in valid)

# Restrict the phases to the retained (make, bucket) pairs.
valid_pairs_km = pd.DataFrame(valid_tranches_km, columns=["MAKE", "km_range"])
df_valid = df.merge(valid_pairs_km, on=["MAKE", "km_range"], how="inner")

summary = df_valid.groupby(["MAKE", "km_range"])["ratio_km_soc"].agg(["count", "mean", "median", "std"])



In [None]:


# One chart per make, restricted to the retained distance buckets.
bar_options = dict(
    x="km_range",
    y=["mean", "median"],
    text_auto=".2f",
    labels={"value": "Ratio_km_soc", "km_range": "Tranche km"},
    barmode="group",
)

for make in df_valid["MAKE"].unique():
    df_make = df_valid[df_valid["MAKE"] == make]

    # Mean and median ratio per distance bucket for this make.
    stats = df_make.groupby("km_range")["ratio_km_soc"].agg(["mean", "median"]).reset_index()

    # Grouped bar plot (mean next to median).
    fig = px.bar(stats, title=f"Ratio_km_soc moyen et médian pour {make}", **bar_options)
    fig.update_layout(template="plotly_white")
    fig.show()



#### SOC_DIFF study

In [None]:


# Cleaning and preparation: phases with a ratio, a make and a SOC_DIFF;
# MAKE is lower-cased and trimmed.
df = result.dropna(subset=["ratio_km_soc", "MAKE", "SOC_DIFF"])
df = df.assign(MAKE=df["MAKE"].str.lower().str.strip())

# SOC_DIFF buckets; the edges are negative because SOC_DIFF is negative.
bins = [-100, -50, -25, -15, -10, -5, 0]
labels = ["<-50%", "-50% à -25%", "-25% à -15%", "-15% à -10%", "-10% à -5%", "-5% à 0%"]
df = df.assign(soc_range=pd.cut(df["SOC_DIFF"], bins=bins, labels=labels, include_lowest=True))

# (statistic column, legend name, bar colour)
mean_median_specs = (
    ("mean", "Moyenne", 'steelblue'),
    ("median", "Médiane", 'darkorange'),
)

# One grouped bar chart per SoC bucket, comparing the makes.
for soc_range in labels:
    df_range = df[df["soc_range"] == soc_range]
    if df_range.empty:
        continue

    stats = df_range.groupby("MAKE")["ratio_km_soc"].agg(["mean", "median"]).reset_index()

    fig = go.Figure()
    for stat_name, legend, colour in mean_median_specs:
        fig.add_trace(go.Bar(
            x=stats["MAKE"],
            y=stats[stat_name],
            name=legend,
            text=stats[stat_name].round(2),
            textposition="outside",
            marker_color=colour
        ))

    fig.update_layout(
        title=f"Ratio_km_soc par marque - {soc_range}",
        xaxis_title="Marque",
        yaxis_title="Ratio_km_soc",
        barmode='group',
        template="plotly_white"
    )
    fig.show()

# Heatmap of the mean ratio per make and SoC bucket.
pivot = df.pivot_table(index="MAKE", columns="soc_range", values="ratio_km_soc", aggfunc="mean")

heatmap = go.Heatmap(
    z=pivot.values,
    x=pivot.columns,
    y=pivot.index,
    colorscale='Blues',
    zmin=0,
    zmax=pivot.max().max(),
    text=pivot.round(2).values,
    texttemplate="%{text}"
)
fig = go.Figure(data=heatmap)
fig.update_layout(
    title="Heatmap moyenne ratio_km_soc par tranche de SOC_DIFF",
    xaxis_title="Tranche SOC_DIFF",
    yaxis_title="Marque",
    width=800,
    height=600
)
fig.show()

# Statistics per make and SoC bucket.
stats_tranche = df.groupby(["MAKE", "soc_range"])["ratio_km_soc"].agg(["count", "mean", "median", "std"]).reset_index()

# For each make, keep the buckets whose mean ratio lies within +/-20% of the
# median of that make's per-bucket means.
valid_tranches_soc = []
for make in df["MAKE"].unique():
    df_make = df[df["MAKE"] == make]

    # Mean ratio per bucket for this make, and the median of those means.
    stats = df_make.groupby("soc_range")["ratio_km_soc"].mean().reset_index()
    median_mean = stats["ratio_km_soc"].median()

    # +/-20% band around that median.
    lower, upper = 0.8 * median_mean, 1.2 * median_mean

    in_band = stats["ratio_km_soc"].between(lower, upper)
    valid = stats.loc[in_band, "soc_range"]
    valid_tranches_soc.extend((make, soc) for soc in valid)

# Restrict the phases to the retained (make, bucket) pairs.
valid_pairs_soc = pd.DataFrame(valid_tranches_soc, columns=["MAKE", "soc_range"])
df_valid_soc = df.merge(valid_pairs_soc, on=["MAKE", "soc_range"], how="inner")

# (statistic column, legend name, bar colour) for the four plotted metrics
four_stat_specs = (
    ("mean", "Moyenne", 'steelblue'),
    ("median", "Médiane", 'darkorange'),
    ("min", "Minimum", 'red'),
    ("max", "Maximum", 'green'),
)

# One chart per make with mean, median, min and max on the retained buckets.
for make in df_valid_soc["MAKE"].unique():
    df_make = df_valid_soc[df_valid_soc["MAKE"] == make]

    stats = df_make.groupby("soc_range")["ratio_km_soc"].agg([
        "mean", "median", "min", "max"
    ]).reset_index()

    fig = go.Figure()
    for stat_name, legend, colour in four_stat_specs:
        fig.add_trace(go.Bar(
            x=stats["soc_range"],
            y=stats[stat_name],
            name=legend,
            text=stats[stat_name].round(2),
            textposition="outside",
            marker_color=colour
        ))

    fig.update_layout(
        title=f"Ratio_km_soc statistiques pour {make} (tranches valides)",
        xaxis_title="Tranche SOC",
        yaxis_title="Ratio_km_soc",
        barmode="group",
        template="plotly_white",
        showlegend=True
    )
    fig.show()



In [None]:
# Detailed ratio_km_soc statistics per make and SoC bucket on the retained
# buckets, rounded to 2 decimals.
stats_detail = df_valid_soc.groupby(["MAKE", "soc_range"])["ratio_km_soc"].agg([
    "count", "mean", "median", "min", "max", "std"
]).round(2)

In [None]:
# Display the table.
stats_detail

## Compute 

In [None]:
# Phases with a ratio, a make, a SOC_DIFF and a distance; MAKE lower-cased and trimmed.
df_result = result.dropna(subset=["ratio_km_soc", "MAKE", "SOC_DIFF", "ODOMETER_DIFF"])
df_result = df_result.assign(MAKE=df_result["MAKE"].str.lower().str.strip())

# Bucket edges and labels for distance (ODOMETER_DIFF) and SoC drop (SOC_DIFF).
bins_km = [0, 10, 25, 50, 100, 200, 1000]
labels_km = ["<10 km", "10–25 km", "25–50 km", "50–100 km", "100–200 km", ">200 km"]
bins_soc = [-100, -50, -25, -15, -10, -5, 0]
labels_soc = ["<-50%", "-50% à -25%", "-25% à -15%", "-15% à -10%", "-10% à -5%", "-5% à 0%"]

df_result = df_result.assign(
    km_range=pd.cut(df_result["ODOMETER_DIFF"], bins=bins_km, labels=labels_km, include_lowest=True)
)
df_result = df_result.assign(
    soc_range=pd.cut(df_result["SOC_DIFF"], bins=bins_soc, labels=labels_soc, include_lowest=True)
)

In [None]:
# Join the phases with the retained (MAKE, km_range) and (MAKE, soc_range) pairs.
# NOTE(review): both right-hand frames contain only the join keys and both merges
# use how="left", so no row is dropped and no column is added - the "valid bucket"
# selection is not applied here. The earlier study cells use how="inner" for this
# filter; confirm whether "left" is intentional.
df_valid = df_result.merge(pd.DataFrame(valid_tranches_km, 
                                        columns=["MAKE", "km_range"]),
                           on=["MAKE", "km_range"], how="left").merge(
                               pd.DataFrame(valid_tranches_soc, 
                                            columns=["MAKE", "soc_range"]), 
                               on=["MAKE", "soc_range"], how="left")

In [None]:
# Dictionary holding the ratio_km_soc bounds for each "model | version" key
model_bounds = {}

# Total number of observations, used to normalise the per-model counts
total_obs = len(df_valid)

# For each model, then for each version of that model
for model in df_valid["MODEL"].unique():

    for version in df_valid[df_valid["MODEL"] == model]["VERSION"].unique():
        df_version = df_valid[(df_valid["MODEL"] == model) & (df_valid["VERSION"] == version)]
        median = df_version["ratio_km_soc"].median()
        # NOTE(review): counts the rows of the whole model (all versions), not of
        # this version only - confirm this is intended.
        n_points = len( df_valid[df_valid["MODEL"] == model])
        
        # Share of all observations NOT belonging to this model
        confidence_factor = 1 - (n_points / total_obs) 
        
        # Relative half-width of the accepted band, clamped to [0.10, 0.30]; it
        # equals 0.30 whenever the model holds at most 70% of the observations.
        adjustment = max(0.10, min(0.30, confidence_factor))
        # NOTE(review): string concatenation - presumably MODEL and VERSION are
        # never null here; a null value would raise a TypeError.
        model_name = model + ' | ' + version
        model_bounds[model_name] = {
            "borne_inf": median * (1 - adjustment),
            "mediane": median,
            "borne_sup": median * (1 + adjustment),
            "borne_inf_bis": median * 0.8,  # -20%
            "borne_sup_bis": median * 1.2,  # +20%
            "nombre_points": n_points,
            "min_observe": df_version["ratio_km_soc"].min(),
            "max_observe": df_version["ratio_km_soc"].max()
        }

# One row per "model | version" key; the MODEL column holds the combined key
# with its outer whitespace removed.
bounds_df = pd.DataFrame([
    {
        "MODEL": model.strip(),
        "mediane": bounds["mediane"],
        "borne_inf": bounds["borne_inf"],
        "borne_sup": bounds["borne_sup"],
        "borne_inf_bis": bounds["borne_inf_bis"],
        "borne_sup_bis": bounds["borne_sup_bis"],
        "nombre_points": bounds["nombre_points"],
        "min_observe": bounds["min_observe"],
        "max_observe": bounds["max_observe"]
    }
    for model, bounds in model_bounds.items()
])

# Largest counts first
bounds_df = bounds_df.sort_values("nombre_points", ascending=False)


# One marker per key: the median with asymmetric error bars from borne_inf to borne_sup
fig = go.Figure()

for _, row in bounds_df.iterrows():
    fig.add_trace(go.Scatter(
        x=[row["nombre_points"]],
        y=[row["mediane"]],
        mode="markers",
        name=row["MODEL"],
        error_y=dict(
            type="data",
            symmetric=False,
            array=[row["borne_sup"] - row["mediane"]],
            arrayminus=[row["mediane"] - row["borne_inf"]]
        )
    ))

fig.update_layout(
    title="Intervalles de confiance par modèle",
    xaxis_title="Nombre d'observations",
    yaxis_title="Ratio km/soc"
)
fig.show()

In [None]:
# bounds_df['MODEL'] holds "<model> | <version>" keys; split them back into two
# trimmed columns. The guard makes the cell safe to re-run: once VERSION exists,
# MODEL no longer contains the separator and must not be split again.
if 'VERSION' not in bounds_df.columns:
    # n=1 splits on the first separator only, so there are always exactly two
    # parts even when a version name itself contains '|'.
    df_split = bounds_df['MODEL'].str.split('|', n=1, expand=True)
    bounds_df[['MODEL', 'VERSION']] = df_split.apply(lambda part: part.str.strip())



In [None]:
# First 30 rows of the bounds table, MODEL in descending order.
bounds_df.sort_values("MODEL", ascending=False).head(30)

## Compute for each vehicle

In [None]:
# The bounds are defined per (MODEL, VERSION). Joining on VERSION alone would
# pair a phase with the bounds of every model sharing that version name,
# duplicating rows and applying the wrong bounds, so join on both keys.
bounds_lookup = (
    bounds_df[['MODEL', 'VERSION', 'borne_inf', 'borne_sup']]
    .drop_duplicates(subset=['MODEL', 'VERSION'])
    .rename(columns={'MODEL': '_model_key', 'VERSION': '_version_key'})
)

# bounds_df keys are whitespace-trimmed, so trim the phase keys the same way
# (in temporary columns, leaving MODEL/VERSION untouched).
df_merged = (
    result.assign(
        _model_key=result['MODEL'].str.strip(),
        _version_key=result['VERSION'].str.strip(),
    )
    .merge(bounds_lookup, on=['_model_key', '_version_key'], how='left')
    .drop(columns=['_model_key', '_version_key'])
)

# Keep the phases whose ratio lies inside [borne_inf, borne_sup]; phases with
# no matching bounds have NaN bounds and are dropped by the comparison.
df_filtered = df_merged[
    (df_merged['ratio_km_soc'] >= df_merged['borne_inf']) &
    (df_merged['ratio_km_soc'] <= df_merged['borne_sup'])
]

df_filtered = df_filtered.drop(columns=['borne_inf', 'borne_sup'])

In [None]:

# Distance-weighted km per SoC point for each VIN, on the in-bounds phases.
weighted_ratio = weighted_mean(df_filtered, "VIN", "ratio_km_soc", "ODOMETER_DIFF")

# Last SoH plus plain mean/median of the ratio for each VIN.
_per_vin_stats = {
    "soh": ("soh", "last"),
    "ratio_soc_mean": ("ratio_km_soc", "mean"),
    "ratio_soc_median": ("ratio_km_soc", "median"),
}
soh = df_filtered.groupby("VIN").agg(**_per_vin_stats).reset_index()

# Make / model information attached to each VIN.
# NOTE(review): a VIN with several distinct values in these columns produces
# several rows after the merges below - confirm they are constant per VIN.
vin_info = df_filtered[['VIN', 'MAKE', 'MODEL', 'VERSION', 'RANGE', 'BATTERY_NET_CAPACITY']].drop_duplicates()
weighted_ratio_with_info = weighted_ratio.merge(vin_info, on='VIN', how='left')

# Potential range = km per SoC point x 100.
weighted_ratio_with_info['autonomie_potentielle'] = weighted_ratio_with_info['weighted_avg_ratio_km_soc'] * 100

_vin_labels = df_filtered[['VIN', 'MAKE', 'MODEL', 'VERSION', 'RANGE']].drop_duplicates()
weighted_ratio = (
    weighted_ratio
    .merge(soh, on='VIN', how='left')
    .merge(_vin_labels, on='VIN', how='left')
)
weighted_ratio['Potential_range'] = round(weighted_ratio['weighted_avg_ratio_km_soc'] * 100)


def _supposed_range(vehicle):
    """RANGE scaled by SoH when SoH is below 1, otherwise RANGE unchanged."""
    if vehicle['soh'] < 1:
        return vehicle['RANGE'] * vehicle['soh']
    return vehicle['RANGE']


weighted_ratio['Supposed_range'] = weighted_ratio.apply(_supposed_range, axis=1)
weighted_ratio['ratio_autonomy'] = weighted_ratio['Potential_range'] / weighted_ratio['Supposed_range']

In [None]:
# Number of distinct VINs with an estimated range.
weighted_ratio.VIN.nunique()

In [None]:
# Estimated potential range against the RANGE column, coloured by version.
px.scatter(weighted_ratio, x='RANGE', y='Potential_range', color='VERSION')

In [None]:
# Gap between the distance-weighted mean ratio and the plain median ratio of each VIN.
weighted_ratio["diff_mean_median"] = weighted_ratio["weighted_avg_ratio_km_soc"] - weighted_ratio["ratio_soc_median"]