# Analyse de l'autonomie réelle constatée vs WLTP


In [None]:
# Imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from core.s3.s3_utils import S3Service
from core.spark_utils import create_spark_session
from core.pandas_utils import series_start_end_diff
from core.sql_utils import get_sqlalchemy_engine, text
import os
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Display configuration: show all columns and up to 100 rows when printing DataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)


In [None]:
# Service handles used by the notebook: S3 storage, Spark session, SQL engine.
s3 = S3Service()
s3_key, s3_secret = (os.environ.get(name) for name in ('S3_KEY', 'S3_SECRET'))
spark_session = create_spark_session(s3_key, s3_secret)
engine = get_sqlalchemy_engine()
# NOTE(review): this connection is opened here but the visible cells only use
# `with engine.connect()`; confirm whether `con` is still needed, and close it if so.
con = engine.connect()


## Load data

In [None]:
# Load the vehicle_data records together with the model, battery and OEM
# attributes of each vehicle.
query = """
SELECT 
    vd.*,
    v.vin,
    vm.model_name,
    vm.type as version,
    vm.autonomy as wltp_range,
    b.battery_chemistry,
    b.capacity,
    b.net_capacity,
    o.oem_name as make
FROM vehicle_data vd
JOIN vehicle v ON v.id = vd.vehicle_id
JOIN vehicle_model vm ON vm.id = v.vehicle_model_id
JOIN battery b ON b.id = vm.battery_id
JOIN oem o ON o.id = vm.oem_id
"""

with engine.connect() as connection:
    vehicle_df = pd.read_sql(text(query), connection)

vehicle_df['vin'] = vehicle_df['vin'].astype('category').astype(str)
# Parse the timestamps BEFORE sorting: sorting the raw column first may order
# the rows by their raw representation instead of chronologically, and the
# time-based merge performed later needs a chronological order.
vehicle_df['timestamp'] = pd.to_datetime(vehicle_df['timestamp'])
vehicle_df.sort_values('timestamp', inplace=True)
# Fall back to the OEM-provided SoH where the primary SoH is missing.
vehicle_df['soh'] = vehicle_df['soh'].fillna(vehicle_df['soh_oem'])
print(f"Données véhicules chargées : {len(vehicle_df)} lignes")
print(f"Nombre de VINs uniques : {vehicle_df['vin'].nunique()}")


In [None]:
def _read_phases(parquet_key):
    """Read one result-phases parquet file from S3 through the Spark session."""
    return s3.read_parquet_df_spark(spark_session, parquet_key)


# One phases table per make.
tesla_phase = _read_phases("result_phases/result_phases_tesla_fleet_telemetry.parquet")
bmw_phase = _read_phases("result_phases/result_phases_bmw.parquet")
stellantis_phase = _read_phases("result_phases/result_phases_stellantis.parquet")
ford_phase = _read_phases("result_phases/result_phases_ford.parquet")
renault_phase = _read_phases("result_phases/result_phases_renault.parquet")
kia_phase = _read_phases("result_phases/result_phases_kia.parquet")
mercedes_phase = _read_phases("result_phases/result_phases_mercedes_benz.parquet")
volkswagen_phase = _read_phases("result_phases/result_phases_volkswagen.parquet")
volvo_phase = _read_phases("result_phases/result_phases_volvo_cars.parquet")

In [None]:
from pyspark.sql.functions import col
def clean(df):
    """Filter one make's phases table and return it as a pandas DataFrame.

    Missing ``ODOMETER_DIFF`` / ``RATIO_KM_SOC`` columns are derived, the
    columns used by the analysis are selected, rows with a null
    ``ODOMETER_DIFF``, ``RANGE`` or ``SOC_DIFF`` are dropped, and only phases
    with ``SOC_DIFF < -5``, ``5 < ODOMETER_DIFF < 1000`` and
    ``RATIO_KM_SOC < 10`` are kept.

    Args:
        df: Spark DataFrame of phases for one make.

    Returns:
        pandas DataFrame with both datetime columns parsed and the numeric
        columns cast to float.
    """
    numeric_columns = [
        "RATIO_KM_SOC", "BATTERY_NET_CAPACITY", "CONSUMPTION", "ODOMETER_DIFF",
        "ODOMETER_FIRST", "ODOMETER_LAST", "SOC_DIFF", "SOC_FIRST", "SOC_LAST", "RANGE",
    ]

    if "ODOMETER_DIFF" not in df.columns:
        df = df.withColumn("ODOMETER_DIFF", col("ODOMETER_LAST") - col("ODOMETER_FIRST"))
    if "RATIO_KM_SOC" not in df.columns:
        # The filter below keeps only rows with SOC_DIFF < -5 and ODOMETER_DIFF > 5,
        # so ODOMETER_DIFF / SOC_DIFF would be negative on every retained row
        # (making the "< 10" bound vacuous and the derived range negative).
        # Negate SOC_DIFF so the ratio is a positive number of km per SoC point.
        # NOTE(review): assumes a pre-existing RATIO_KM_SOC column is positive — confirm.
        df = df.withColumn("RATIO_KM_SOC", col("ODOMETER_DIFF") / -col("SOC_DIFF"))

    df_clean = df.select(
        "DATETIME_BEGIN", "BATTERY_NET_CAPACITY", "CONSUMPTION", "DATETIME_END",
        "MAKE", "MODEL", "ODOMETER_DIFF", "ODOMETER_FIRST", "ODOMETER_LAST",
        "SOC_DIFF", "SOC_FIRST", "SOC_LAST", "RANGE", "VIN", "VERSION", "RATIO_KM_SOC",
    )
    df_clean = df_clean.na.drop(subset=["ODOMETER_DIFF", "RANGE", "SOC_DIFF"])

    # "ODOMETER_DIFF > 5" already implies "> 0", so a single lower bound is enough.
    df_filtered = df_clean.filter(
        (col("SOC_DIFF") < -5)
        & (col("ODOMETER_DIFF") > 5)
        & (col("ODOMETER_DIFF") < 1000)
        & (col("RATIO_KM_SOC") < 10)
    )

    df_pandas = df_filtered.toPandas()
    # DATETIME_END is the key of the time-based merge with the vehicle data,
    # so it is parsed the same way as DATETIME_BEGIN.
    for datetime_column in ("DATETIME_BEGIN", "DATETIME_END"):
        df_pandas[datetime_column] = pd.to_datetime(df_pandas[datetime_column])
    df_pandas[numeric_columns] = df_pandas[numeric_columns].astype('float')
    return df_pandas
    
# Clean every make's phases table (same order as the concat below), then stack them.
(
    tesla_clean, bmw_clean, renault_clean, ford_clean, stellantis_clean,
    kia_clean, mercedes_clean, volkswagen_clean, volvo_clean,
) = [
    clean(phases)
    for phases in (
        tesla_phase, bmw_phase, renault_phase, ford_phase, stellantis_phase,
        kia_phase, mercedes_phase, volkswagen_phase, volvo_phase,
    )
]
df_clean = pd.concat([
    tesla_clean, bmw_clean, renault_clean, ford_clean, stellantis_clean,
    kia_clean, mercedes_clean, volkswagen_clean, volvo_clean,
])


In [None]:
# Attach the static vehicle attributes to each phase.
# vehicle_df carries a timestamp column and can hold several rows per VIN;
# merging on the VIN alone would then repeat every phase once per vehicle row.
# Keep a single row per VIN so the merge stays one phase -> one row.
vehicle_attributes = vehicle_df[
    ['vin', 'model_name', 'version', 'wltp_range', 'battery_chemistry', 'capacity', 'net_capacity', 'make']
].drop_duplicates(subset='vin')
df = df_clean.merge(vehicle_attributes, left_on='VIN', right_on='vin', how='left')

In [None]:
# Merge each phase with the vehicle record of the same VIN that is closest in time.
# merge_asof requires both frames to be sorted on their time key, with
# datetime-typed, non-null keys, so both sides are prepared explicitly here
# instead of relying on how they were built earlier.
phases_sorted = (
    df_clean
    .assign(DATETIME_END=pd.to_datetime(df_clean['DATETIME_END']))
    .dropna(subset=['DATETIME_END'])
    .sort_values('DATETIME_END')
)
vehicle_sorted = (
    vehicle_df[['timestamp', 'vin', 'soh', 'model_name', 'version', 'wltp_range',
                'battery_chemistry', 'capacity', 'net_capacity', 'make']]
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .dropna(subset=['timestamp'])
    .sort_values('timestamp', kind='stable')
)
df = pd.merge_asof(
    phases_sorted,
    vehicle_sorted,
    left_on='DATETIME_END',
    right_on='timestamp',
    left_by='VIN',
    right_by='vin',
    direction='nearest',
    suffixes=('', '_vehicle')
)




In [None]:
# Inspect the columns available on the merged frame.
df.columns

In [None]:
# Range extrapolated to a full discharge: km per SoC point times 100 points.
df['supposed_autonomy'] = 100 * df['RATIO_KM_SOC']

# Extrapolated range expressed as a percentage of the WLTP range.
df['ratio_supposed_wltp'] = df['supposed_autonomy'].div(df['wltp_range']).mul(100)

# Forward-fill the SoH within each vehicle.
df['soh'] = df.groupby('vin')['soh'].transform(lambda values: values.ffill())

# Season derived from the month in which the phase starts.
df['month'] = df['DATETIME_BEGIN'].dt.month
months_by_season = {
    'Hiver': (12, 1, 2),
    'Printemps': (3, 4, 5),
    'Été': (6, 7, 8),
    'Automne': (9, 10, 11),
}
season_by_month = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}
df['season'] = df['month'].map(season_by_month)

In [None]:
# Outlier filtering: keep only the phases whose extrapolated range is strictly
# between 15 % and 150 % of the WLTP range.
df_clean = df[
    (df['ratio_supposed_wltp'] > 15) &
    (df['ratio_supposed_wltp'] < 150)
].copy()

# Guard the percentage against an empty input frame.
kept_share = len(df_clean) / len(df) * 100 if len(df) else 0.0

print(f"Données nettoyées : {len(df_clean)} phases ({kept_share:.1f}% des données)")
print(f"Nombre de véhicules : {df_clean['vin'].nunique()}")
print(f"Nombre de marques : {df_clean['make'].nunique()}")
print(f"Nombre de modèles : {df_clean['model_name'].nunique()}")


In [None]:
# One row per vehicle: medians for the measured quantities, first value for
# the per-vehicle attributes. The list order fixes the output column order.
vin_level_columns = [
    'ratio_supposed_wltp', 'supposed_autonomy', 'wltp_range', 'capacity',
    'net_capacity', 'battery_chemistry', 'RATIO_KM_SOC', 'make',
    'model_name', 'version', 'ODOMETER_FIRST',
]
median_columns = {'ratio_supposed_wltp', 'supposed_autonomy', 'RATIO_KM_SOC', 'ODOMETER_FIRST'}
vin_aggregations = {
    name: ('median' if name in median_columns else 'first')
    for name in vin_level_columns
}
df_vin = df_clean.groupby('vin', as_index=False).agg(vin_aggregations)

## 1. Analyses descriptives


### 1.1 Distribution globale du ratio Autonomie réelle / WLTP


In [None]:
# Global statistics of the per-vehicle ratio.
vin_ratio = df_vin['ratio_supposed_wltp']

print("=== RATIO AUTONOMIE RÉELLE / WLTP ===")
print(f"Moyenne : {vin_ratio.mean():.1f}%")
print(f"Médiane : {vin_ratio.median():.1f}%")
print(f"Écart-type : {vin_ratio.std():.1f}%")
print("\nPercentiles :")
for label, quantile in (("10%", 0.10), ("25%", 0.25), ("75%", 0.75), ("90%", 0.90)):
    print(f"  {label} : {vin_ratio.quantile(quantile):.1f}%")


In [None]:
# Histogram of the ratio at discharge-phase level.
phase_ratio = df_clean['ratio_supposed_wltp']

fig = px.histogram(
    df_clean,
    x='ratio_supposed_wltp',
    nbins=50,
    title='Distribution du ratio Autonomie constatée / WLTP (échelle de la charge)',
    labels={'ratio_supposed_wltp': 'Ratio Autonomie constatée / WLTP (%)'},
    color_discrete_sequence=['#636EFA']
)

# Vertical reference lines: mean, median and the WLTP value itself.
mean_ratio = phase_ratio.mean()
median_ratio = phase_ratio.median()
reference_lines = [
    (mean_ratio, "red", f"Moyenne: {mean_ratio:.1f}%"),
    (median_ratio, "purple", f"Median: {median_ratio:.1f}%"),
    (100, "green", "WLTP (100%)"),
]
for position, colour, caption in reference_lines:
    fig.add_vline(x=position, line_dash="dash", line_color=colour,
                  annotation_text=caption)

fig.update_layout(
    xaxis_title="Ratio (%)",
    yaxis_title="Nombre de phases de décharge",
    height=500
)

fig.show()

# fig.write_html("graph/hist_ratio_wltp_per_charge.html")


In [None]:
# Histogram of the ratio at vehicle (VIN) level.
fig = px.histogram(
    df_vin,
    x='ratio_supposed_wltp',
    nbins=50,
    title='Distribution du ratio Autonomie constatée / WLTP (échelle du VIN)',
    labels={'ratio_supposed_wltp': 'Ratio Autonomie constatée / WLTP (%)'},
    color_discrete_sequence=['#636EFA']
)

# Vertical reference line for the mean
mean_ratio = df_vin['ratio_supposed_wltp'].mean()
fig.add_vline(x=mean_ratio, line_dash="dash", line_color="red", 
              annotation_text=f"Moyenne: {mean_ratio:.1f}%")

# Vertical reference line for the median
median_ratio = df_vin['ratio_supposed_wltp'].median()

fig.add_vline(x=median_ratio, line_dash="dash", line_color="purple", 
              annotation_text=f"Median: {median_ratio:.1f}%")

# Vertical reference line at 100 % of the WLTP range
fig.add_vline(x=100, line_dash="dash", line_color="green", 
              annotation_text="WLTP (100%)")

fig.update_layout(
    xaxis_title="Ratio (%)",
    # Axis label typo fixed ("vehciules" -> "véhicules").
    yaxis_title="Nombre de véhicules",
    height=500
)

fig.show()

# fig.write_html("graph/hist_ratio_wltp_per_vin.html")


### 1.2 Comparaison par marque


In [None]:
# Per-make statistics computed over discharge phases.
make_aggregations = {
    'ratio_supposed_wltp': ['mean', 'median', 'std', 'count'],
    'supposed_autonomy': 'mean',
    'wltp_range': 'mean',
    'vin': 'nunique'
}
stats_by_make = df_clean.groupby('make').agg(make_aggregations).round(1)

# Flatten the two-level column index into "<column>_<aggregation>" names.
stats_by_make.columns = [
    '_'.join(level_names).strip() for level_names in stats_by_make.columns.values
]

display_names = {
    'ratio_supposed_wltp_mean': 'Ratio moyen (%)',
    'ratio_supposed_wltp_median': 'Ratio médian (%)',
    'ratio_supposed_wltp_std': 'Écart-type (%)',
    'ratio_supposed_wltp_count': 'Nb phases',
    'supposed_autonomy_mean': 'Autonomie réelle (km)',
    'wltp_range_mean': 'WLTP moyen (km)',
    'vin_nunique': 'Nb véhicules'
}
stats_by_make = (
    stats_by_make
    .rename(columns=display_names)
    .sort_values('Ratio moyen (%)', ascending=False)
)

print("\n=== STATISTIQUES PAR MARQUE ===")
print(stats_by_make)


In [None]:
# Per-make statistics computed at vehicle level.
# The count column of this table is labelled as a number of vehicles, so the
# aggregation must run on df_vin (one row per VIN); on df_clean the count
# would be a number of discharge phases.
stats_by_make = df_vin.groupby('make').agg({
    'ratio_supposed_wltp': ['mean', 'median', 'std', 'count'],
    'supposed_autonomy': 'mean',
    'wltp_range': 'mean',
}).round(1)

# Flatten the two-level column index into "<column>_<aggregation>" names.
stats_by_make.columns = ['_'.join(level_names).strip() for level_names in stats_by_make.columns.values]
stats_by_make = stats_by_make.rename(columns={
    'ratio_supposed_wltp_mean': 'Ratio moyen (%)',
    'ratio_supposed_wltp_median': 'Ratio médian (%)',
    'ratio_supposed_wltp_std': 'Écart-type (%)',
    'ratio_supposed_wltp_count': 'Nb vehicules',
    'supposed_autonomy_mean': 'Autonomie réelle (km)',
    'wltp_range_mean': 'WLTP moyen (km)',
})

stats_by_make = stats_by_make.sort_values('Ratio moyen (%)', ascending=False)
print("\n=== STATISTIQUES PAR MARQUE ===")
print(stats_by_make)


In [None]:
# Box plot of the ratio for each make, with the WLTP level as reference.
make_box_labels = {'make': 'Marque', 'ratio_supposed_wltp': 'Ratio (%)'}
fig = px.box(
    df_clean,
    x='make',
    y='ratio_supposed_wltp',
    color='make',
    title='Distribution du ratio Autonomie réelle / WLTP par marque',
    labels=make_box_labels,
)
fig.add_hline(y=100, line_dash="dash", line_color="gray", annotation_text="WLTP (100%)")
fig.update_layout(xaxis_tickangle=-45, height=600, showlegend=False)
fig.show()
# fig.write_html("graph/boxplot_ratio_wltp_per_make.html")

### 1.3 Comparaison par modèle


In [None]:
# Most represented models (by number of phases). The printed header is built
# from this value so it always matches what is actually selected.
n_top_models = 10
top_models = df_clean['model_name'].value_counts().head(n_top_models).index
df_top_models = df_clean[df_clean['model_name'].isin(top_models)]

# Per-model statistics
stats_by_model = df_top_models.groupby('model_name').agg({
    'ratio_supposed_wltp': ['mean', 'median', 'std', 'count'],
    'supposed_autonomy': 'mean',
    'wltp_range': 'mean',
    'make': 'first',
    'vin': 'nunique'
}).round(1)

# Flatten the two-level column index into "<column>_<aggregation>" names.
stats_by_model.columns = [
    '_'.join(level_names).strip() if level_names[1] else level_names[0]
    for level_names in stats_by_model.columns.values
]
stats_by_model = stats_by_model.sort_values('ratio_supposed_wltp_mean', ascending=False)

print(f"\n=== TOP {n_top_models} MODÈLES - STATISTIQUES ===")
stats_by_model


In [None]:
# Box plot for the most represented models.
# The title reports the number of models actually present in df_top_models
# instead of a hard-coded figure.
n_models_shown = df_top_models['model_name'].nunique()

fig = px.box(
    df_top_models,
    x='model_name',
    y='ratio_supposed_wltp',
    title=f'Ratio Autonomie réelle / WLTP - Top {n_models_shown} modèles',
    labels={'model_name': 'Modèle', 'ratio_supposed_wltp': 'Ratio (%)'},
    color='make'
)

fig.add_hline(y=100, line_dash="dash", line_color="gray", 
              annotation_text="WLTP (100%)")

fig.update_layout(
    xaxis_tickangle=-45,
    height=600
)

fig.show()


## 2. Facteurs d'influence


### 2.1 Influence de la saison


In [None]:
# Ratio statistics per season, listed in calendar order starting with winter.
season_order = ['Hiver', 'Printemps', 'Été', 'Automne']

print("\n=== INFLUENCE DE LA SAISON ===")

stats_season = (
    df_clean
    .groupby('season')
    .agg({'ratio_supposed_wltp': ['mean', 'median', 'std', 'count']})
    .round(1)
    .reindex(season_order)
)

print("\nStatistiques par saison :")
print(stats_season)


In [None]:
# Box plot of the ratio per season, with the WLTP level as reference.
season_axis_order = {'season': ['Hiver', 'Printemps', 'Été', 'Automne']}
fig = px.box(
    df_clean,
    x='season',
    y='ratio_supposed_wltp',
    color='season',
    category_orders=season_axis_order,
    title='Impact de la saison sur le ratio Autonomie réelle / WLTP',
    labels={'season': 'Saison', 'ratio_supposed_wltp': 'Ratio (%)'},
)
fig.add_hline(y=100, line_dash="dash", line_color="gray", annotation_text="WLTP (100%)")
fig.update_layout(height=500, showlegend=False)
fig.show()


In [None]:
# Mean ratio for every (make, season) pair, shown as grouped bars.
stats_make_season = (
    df_clean
    .groupby(['make', 'season'])
    .agg({'ratio_supposed_wltp': 'mean'})
    .reset_index()
)

season_axis_order = {'season': ['Hiver', 'Printemps', 'Été', 'Automne']}
fig = px.bar(
    stats_make_season,
    x='make',
    y='ratio_supposed_wltp',
    color='season',
    barmode='group',
    category_orders=season_axis_order,
    title='Ratio moyen Autonomie réelle / WLTP par marque et saison',
    labels={'make': 'Marque', 'ratio_supposed_wltp': 'Ratio moyen (%)'},
)
fig.add_hline(y=100, line_dash="dash", line_color="gray")
fig.update_layout(xaxis_tickangle=-45, height=600)
fig.show()

# fig.write_html("graph/bar_ratio_wltp_per_make_and_season.html")


### 2.2 Influence du kilométrage (âge)


In [None]:
# Ratio statistics per mileage category.
print("\n=== INFLUENCE DU KILOMÉTRAGE ===")

# Mileage categories. include_lowest=True puts an odometer of exactly 0 km in
# the first bin; by default the first interval is open on the left and such
# rows would get no category at all.
df_clean['km_category'] = pd.cut(
    df_clean['ODOMETER_FIRST'],
    bins=[0, 20000, 50000, 100000, float('inf')],
    labels=['0-20k km', '20-50k km', '50-100k km', '>100k km'],
    include_lowest=True
)

# observed=False keeps one row per category, including empty ones.
stats_km = df_clean.groupby('km_category', observed=False).agg({
    'ratio_supposed_wltp': ['mean', 'median', 'std', 'count'],
    'ODOMETER_FIRST': 'mean'
}).round(1)

print("\nStatistiques par catégorie de kilométrage :")
print(stats_km)


In [None]:
# Box plot of the ratio per mileage category, with the WLTP level as reference.
km_box_labels = {'km_category': 'Kilométrage', 'ratio_supposed_wltp': 'Ratio (%)'}
fig = px.box(
    df_clean,
    x='km_category',
    y='ratio_supposed_wltp',
    color='km_category',
    title='Impact du kilométrage sur le ratio Autonomie réelle / WLTP',
    labels=km_box_labels,
)
fig.add_hline(y=100, line_dash="dash", line_color="gray")
fig.update_layout(height=500, showlegend=False)
fig.show()


### 2.3 Comparaison par type de batterie


In [None]:
# Ratio by battery chemistry, restricted to phases with a known chemistry.
has_chemistry = df_clean['battery_chemistry'].notna()
df_battery = df_clean[has_chemistry].copy()

chemistry_aggregations = {
    'ratio_supposed_wltp': ['mean', 'median', 'std', 'count'],
    'supposed_autonomy': 'mean',
    'wltp_range': 'mean',
    'capacity': 'mean',
    'vin': 'nunique'
}
stats_battery = df_battery.groupby('battery_chemistry').agg(chemistry_aggregations).round(1)

# Flatten the two-level column index into "<column>_<aggregation>" names.
stats_battery.columns = [
    '_'.join(level_names).strip() for level_names in stats_battery.columns.values
]
# Keep only chemistries with at least 30 phases, best mean ratio first.
enough_phases = stats_battery['ratio_supposed_wltp_count'] >= 30
stats_battery = stats_battery[enough_phases]
stats_battery = stats_battery.sort_values('ratio_supposed_wltp_mean', ascending=False)


# Box plot limited to the chemistries retained above.
battery_types = stats_battery.index.tolist()
df_battery_filtered = df_battery[df_battery['battery_chemistry'].isin(battery_types)]

if battery_types:
    fig = px.box(
        df_battery_filtered,
        x='battery_chemistry',
        y='ratio_supposed_wltp',
        color='battery_chemistry',
        title='Impact du type de batterie sur le ratio Autonomie réelle / WLTP',
        labels={'battery_chemistry': 'Type de batterie', 'ratio_supposed_wltp': 'Ratio (%)'},
    )
    fig.add_hline(y=100, line_dash="dash", line_color="gray")
    fig.update_layout(height=500, showlegend=False, xaxis_tickangle=-45)
    fig.show()

# fig.write_html("graph/boxplot_ratio_wltp_per_battery_type.html")


### 2.4 Comparaison par capacité de batterie


In [None]:

# Ratio by battery capacity, restricted to phases with a known capacity.
df_capacity = df_clean[df_clean['capacity'].notna()].copy()

capacity_labels = ['<40 kWh', '40-60 kWh', '60-80 kWh', '80-100 kWh', '>100 kWh']

# Capacity categories
df_capacity['capacity_category'] = pd.cut(
    df_capacity['capacity'],
    bins=[0, 40, 60, 80, 100, float('inf')],
    labels=capacity_labels
)

stats_capacity = df_capacity.groupby('capacity_category').agg({
    'ratio_supposed_wltp': ['mean', 'median', 'std', 'count'],
    'capacity': 'mean',
    'supposed_autonomy': 'mean',
    'wltp_range': 'mean',
    'vin': 'nunique'
}).round(1)

# Number of phases per category, used in the annotations
counts_by_category = df_capacity.groupby('capacity_category').size()
# Highest ratio per category, only for the categories that actually hold data
max_ratio_by_category = (
    df_capacity
    .groupby('capacity_category', observed=True)['ratio_supposed_wltp']
    .max()
)

# Box plot per capacity category
fig = px.box(
    df_capacity,
    x='capacity_category',
    y='ratio_supposed_wltp',
    title='Impact de la capacité batterie sur le ratio Autonomie réelle / WLTP',
    labels={'capacity_category': 'Capacité batterie', 'ratio_supposed_wltp': 'Ratio (%)'},
    color='capacity_category',
    category_orders={'capacity_category': capacity_labels}
)
fig.add_hline(y=100, line_dash="dash", line_color="gray")

# Annotate each box with its phase count. The vertical position comes from the
# data rather than from fig.data[i]: the i-th trace is not guaranteed to be the
# i-th label when a category has no rows, which misplaces the annotation or
# raises an IndexError.
for category, top_ratio in max_ratio_by_category.items():
    fig.add_annotation(
        x=category,
        y=top_ratio,
        text=f"n={counts_by_category[category]}",
        showarrow=False,
        yshift=10,
        font=dict(size=10, color="black")
    )

fig.update_layout(height=500, showlegend=False)
fig.show()
# fig.write_html("graph/boxplot_ratio_wltp_per_capacity.html")

# Scatter plot of capacity vs ratio on a reproducible sample of at most 5000 phases
sample_size = min(5000, len(df_capacity))
df_sample = df_capacity.sample(n=sample_size, random_state=42)

fig = px.scatter(
    df_sample,
    x='capacity',
    y='ratio_supposed_wltp',
    title=f'Relation entre capacité batterie et ratio Autonomie réelle / WLTP (n={len(df_capacity)}, échantillon: {sample_size})',
    labels={'capacity': 'Capacité batterie (kWh)', 'ratio_supposed_wltp': 'Ratio (%)'},
    opacity=0.5,
    color='model_name'
)
fig.add_hline(y=100, line_dash="dash", line_color="gray")
fig.update_layout(height=500)
fig.show()
# fig.write_html("graph/scatter_ratio_wltp_per_capacity.html")

## 3. Analyses comparatives


### 3.1 Identification des sur-performants et sous-performants


In [None]:
# Per-model statistics, restricted to models with enough phases.
min_observations = 5

model_aggregations = {
    'ratio_supposed_wltp': ['mean', 'median', 'std', 'count'],
    'supposed_autonomy': 'mean',
    'wltp_range': 'mean',
    'vin': ['nunique']
}
model_stats = df_clean.groupby(['make', 'model_name']).agg(model_aggregations).round(1)

# Flatten the two-level column index into "<column>_<aggregation>" names.
model_stats.columns = [
    '_'.join(level_names).strip() for level_names in model_stats.columns.values
]
enough_observations = model_stats['ratio_supposed_wltp_count'] >= min_observations
model_stats = model_stats[enough_observations]

# Ranking by mean ratio, best first
model_stats_sorted = model_stats.sort_values('ratio_supposed_wltp_mean', ascending=False)

print(f"\n=== CLASSEMENT DES MODÈLES (min {min_observations} observations) ===")
print(f"\nNombre de modèles analysés : {len(model_stats_sorted)}")


In [None]:
# The ten models with the highest mean ratio.
print("\n=== TOP 10 SUR-PERFORMANTS (autonomie réelle proche ou supérieure au WLTP) ===")
top_performers = model_stats_sorted.head(10)
performer_columns = [
    'ratio_supposed_wltp_mean',
    'ratio_supposed_wltp_median',
    'supposed_autonomy_mean',
    'wltp_range_mean',
    'vin_nunique',
]
top_performers[performer_columns]


In [None]:
# The ten models with the lowest mean ratio.
print("\n=== TOP 10 SOUS-PERFORMANTS (autonomie réelle bien inférieure au WLTP) ===")
bottom_performers = model_stats_sorted.tail(10)
performer_columns = [
    'ratio_supposed_wltp_mean',
    'ratio_supposed_wltp_median',
    'supposed_autonomy_mean',
    'wltp_range_mean',
    'vin_nunique',
]
bottom_performers[performer_columns]


In [None]:
# Horizontal bar chart of the five best and five worst models, with the number
# of vehicles written on each bar.
if len(model_stats_sorted) >= 10:
    best_five = top_performers.head(5).assign(category='Top 5 sur-performants')
    worst_five = bottom_performers.tail(5).assign(category='Top 5 sous-performants')
    top_bottom = pd.concat([best_five, worst_five]).reset_index()

    # Axis label combining make and model name
    top_bottom['model_label'] = top_bottom['make'] + ' - ' + top_bottom['model_name']

    category_colours = {'Top 5 sur-performants': 'green', 'Top 5 sous-performants': 'red'}
    fig = px.bar(
        top_bottom,
        x='ratio_supposed_wltp_mean',
        y='model_label',
        orientation='h',
        color='category',
        color_discrete_map=category_colours,
        text='vin_nunique',  # number of vehicles shown on the bars
        title='Sur-performants vs Sous-performants : Ratio Autonomie réelle / WLTP',
        labels={'model_label': 'Modèle', 'ratio_supposed_wltp_mean': 'Ratio moyen (%)'},
    )

    # Format the bar text as "n=<vehicle count>" outside the bars
    fig.update_traces(texttemplate='n=%{text}', textposition='outside')

    fig.add_vline(x=100, line_dash="dash", line_color="gray",
                  annotation_text="WLTP (100%)")

    fig.update_layout(height=600)
    fig.show()

In [None]:
# Horizontal bar chart of the five best and five worst models (no bar text).
if len(model_stats_sorted) >= 10:
    best_five = top_performers.head(5).assign(category='Top 5 sur-performants')
    worst_five = bottom_performers.tail(5).assign(category='Top 5 sous-performants')
    top_bottom = pd.concat([best_five, worst_five]).reset_index()

    # Axis label combining make and model name
    top_bottom['model_label'] = top_bottom['make'] + ' - ' + top_bottom['model_name']

    category_colours = {'Top 5 sur-performants': 'green', 'Top 5 sous-performants': 'red'}
    fig = px.bar(
        top_bottom,
        x='ratio_supposed_wltp_mean',
        y='model_label',
        orientation='h',
        color='category',
        color_discrete_map=category_colours,
        title='Sur-performants vs Sous-performants : Ratio Autonomie réelle / WLTP',
        labels={'model_label': 'Modèle', 'ratio_supposed_wltp_mean': 'Ratio moyen (%)'},
    )

    fig.add_vline(x=100, line_dash="dash", line_color="gray",
                  annotation_text="WLTP (100%)")

    fig.update_layout(height=600)
    fig.show()


## 4. Synthèse et conclusions


In [None]:
# Final summary of the analysis.
print("\n" + "="*70)
print("SYNTHÈSE DE L'ANALYSE AUTONOMIE RÉELLE VS WLTP")
print("="*70)

phase_ratio = df_clean['ratio_supposed_wltp']

print("\n1. STATISTIQUES GLOBALES")
print(f"   - Ratio moyen : {phase_ratio.mean():.1f}%")
print(f"   - Ratio médian : {phase_ratio.median():.1f}%")
print(f"   - Écart au WLTP : {100 - phase_ratio.mean():.1f}% en moyenne")
print(f"   - Nombre de phases analysées : {len(df_clean)}")
print(f"   - Nombre de véhicules : {df_clean['vin'].nunique()}")

print("\n2. MEILLEURE ET PIRE MARQUE")
# Pick the extremes from the values themselves rather than from the row order
# of stats_by_make, and skip the section when no make has a mean ratio.
make_mean_ratio = stats_by_make['Ratio moyen (%)'].dropna()
if not make_mean_ratio.empty:
    best_make = make_mean_ratio.idxmax()
    worst_make = make_mean_ratio.idxmin()
    print(f"   - Meilleure : {best_make} ({make_mean_ratio.loc[best_make]:.1f}%)")
    print(f"   - Pire : {worst_make} ({make_mean_ratio.loc[worst_make]:.1f}%)")

if len(model_stats_sorted) > 0:
    print("\n3. MODÈLES REMARQUABLES")
    # model_stats_sorted is indexed by (make, model_name) and sorted best first.
    best_model = model_stats_sorted.index[0]
    worst_model = model_stats_sorted.index[-1]
    print(f"   - Sur-performant : {best_model[0]} {best_model[1]} ({model_stats_sorted.iloc[0]['ratio_supposed_wltp_mean']:.1f}%)")
    print(f"   - Sous-performant : {worst_model[0]} {worst_model[1]} ({model_stats_sorted.iloc[-1]['ratio_supposed_wltp_mean']:.1f}%)")

# NOTE(review): the three lines below are fixed strings, not computed results;
# no SoH analysis is visible in this notebook — confirm the SoH statement.
print("\n4. FACTEURS D'INFLUENCE IDENTIFIÉS")
print("   - Saison : Impact observé (variations saisonnières)")
print("   - SoH : Corrélation identifiée avec le ratio")
print("   - Kilométrage : Impact sur l'autonomie réelle")

# Optional factors: only mentioned when the column exists with more than 100 values.
if 'temp_outside_mean' in df_clean.columns and df_clean['temp_outside_mean'].notna().sum() > 100:
    print("   - Température : Données disponibles, impact analysé")

if 'trip_type' in df_clean.columns and df_clean['trip_type'].notna().sum() > 100:
    print("   - Type de trajet : Données disponibles, impact analysé")

if 'battery_chemistry' in df_clean.columns and df_clean['battery_chemistry'].notna().sum() > 100:
    print("   - Type de batterie : Données disponibles, comparaison effectuée")

print("\n" + "="*70)
