In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from itertools import combinations
from core.sql_utils import *

from core.pandas_utils import *
from core.spark_utils import create_spark_session
from core.s3.s3_utils import S3Service
from core.s3.settings import S3Settings
from transform.processed_phases.config import LEVEL_1_MAX_POWER, LEVEL_2_MAX_POWER

# Imports PySpark nécessaires
from pyspark.sql import Window
import  pyspark.sql.functions as F
# S3 settings object; its S3_KEY / S3_SECRET attributes are passed to the Spark session below.
settings = S3Settings()

# Spark session created with the S3 key and secret taken from the settings.
spark = create_spark_session(
    settings.S3_KEY,
    settings.S3_SECRET
)

# S3 helper used by the following cells to read the raw time-series parquet files.
s3 = S3Service()

In [None]:
def _read_raw_ts(source):
    """Read the raw time-series parquet of one data source from S3 as a Spark DataFrame."""
    return s3.read_parquet_df_spark(spark, f"raw_ts/{source}/time_series/raw_ts_spark.parquet")


# Telemetry field names -> snake_case names used by the rest of the notebook.
_TESLA_RENAMES = {
    "BatteryLevel": "battery_level",
    "InsideTemp": "inside_temp",
    "DCChargingEnergyIn": "charge_energy_added",
    "DCChargingPower": "charger_power",
    "EstBatteryRange": "est_battery_range",
}

raw_tss_tesla = _read_raw_ts("tesla-fleet-telemetry")
for _old_name, _new_name in _TESLA_RENAMES.items():
    raw_tss_tesla = raw_tss_tesla.withColumnRenamed(_old_name, _new_name)

# NOTE(review): every row is stamped with this single VIN literal, so all per-VIN
# windows and groupings below see one vehicle only — confirm this is intended.
raw_tss_tesla = raw_tss_tesla.withColumn("vin", F.lit("XP7YGCES9SB557941"))

# Same parquet path as raw_tss_tesla, read again without the renames / VIN override.
raw_tss_tesla_fleet = _read_raw_ts("tesla-fleet-telemetry")
raw_tss_ford = _read_raw_ts("ford")
raw_tss_renault = _read_raw_ts("renault")
raw_tss_stellantis = _read_raw_ts("stellantis")
raw_tss_mercedez = _read_raw_ts("mercedes-benz")
raw_tss_bmw = _read_raw_ts("bmw")


In [None]:
# Per-VIN window ordered by timestamp.
w = Window.partitionBy("vin").orderBy("date")

# delta: battery-level change relative to the previous sample of the same VIN
# (null on the first sample of each VIN, where there is no previous row).
previous_level = F.lag("battery_level", 1).over(w)
df_charging = raw_tss_tesla.withColumn("delta", F.col("battery_level") - previous_level)

# charging: True where the level went up (delta > 0); null where delta is null.
df_charging = df_charging.withColumn("charging", (F.col("delta") > 0).cast("boolean"))



In [None]:

# 1) Flag state changes: 1 when `charging` differs from the previous row of the
#    same VIN, 0 when it is equal, null when either side is null.
state_changed = F.col("charging") != F.lag("charging", 1).over(w)
df_phase = df_charging.withColumn("change_flag", state_changed.cast("int"))

# 2) Running sum of the flags per VIN gives a phase identifier.
running_frame = w.rowsBetween(Window.unboundedPreceding, 0)
df_phase = df_phase.withColumn("phase_id", F.sum("change_flag").over(running_frame))

# Replace null delta / change_flag by 0 (pandas-style fillna), then drop the
# helper flag column (optional clean-up).
raw_tss_tesla = df_phase.fillna({"delta": 0, "change_flag": 0}).drop("change_flag")

In [None]:
## compute charging with Spark
# Cast battery_level to a numeric type.
raw_tss_spark = raw_tss_tesla.withColumn("battery_level", F.col("battery_level").cast("double"))

# Per-VIN window ordered by date, plus an explicit row-based running frame
# (first row of the VIN up to the current row) for the cumulative computations.
window_spec = Window.partitionBy("vin").orderBy("date")
running_window = window_spec.rowsBetween(Window.unboundedPreceding, Window.currentRow)

# delta: battery-level change since the previous sample; 0 when it cannot be computed.
raw_tss_spark = raw_tss_spark.withColumn(
    "delta",
    F.coalesce(
        F.col("battery_level") - F.lag("battery_level", 1).over(window_spec),
        F.lit(0),
    ),
)

# Charging state: True after a rise, False after a drop. Flat samples (delta == 0)
# carry the last known state forward, like the pandas `detect_charging` reference
# below. The state is null on flat samples and forward-filled with
# last(..., ignorenulls=True); lagging the pre-existing `charging` column would only
# look one row back at a stale value. Samples before the first rise/drop are False.
known_state = F.when(F.col("delta") > 0, F.lit(True)).when(F.col("delta") < 0, F.lit(False))
raw_tss_spark = raw_tss_spark.withColumn(
    "charging",
    F.coalesce(F.last(known_state, ignorenulls=True).over(running_window), F.lit(False)),
)

# charging_change: 1 when the state differs from the previous row, else 0
# (the first row of each VIN has no previous row and therefore gets 0).
raw_tss_spark = raw_tss_spark.withColumn(
    "charging_change",
    F.when(F.col("charging") != F.lag("charging", 1).over(window_spec), 1).otherwise(0),
)

# phase_id: running count of state changes. The explicit row frame keeps rows that
# share the same `date` from all receiving the same cumulative value.
raw_tss_spark = raw_tss_spark.withColumn("phase_id", F.sum("charging_change").over(running_window))

## compute charging
def detect_charging(df: pd.DataFrame, level_col: str = "battery_level", ts_col: str = "date") -> pd.DataFrame:
    """Flag, sample by sample, whether the battery is charging.

    The frame is sorted by ``ts_col``, ``level_col`` is coerced to numeric
    (unparseable values become NaN) and two columns are added:

    * ``delta``: level change since the previous sample (0 when undefined);
    * ``charging``: True after a rise, False after a drop; flat samples keep the
      last known state, and samples before the first rise/drop are False.

    Parameters
    ----------
    df : frame holding at least ``level_col`` and ``ts_col``; not modified.
    level_col : name of the battery-level column.
    ts_col : name of the column used to order the samples.

    Returns
    -------
    A new, re-indexed frame sorted by ``ts_col`` with ``delta`` and ``charging`` added.
    """
    df = df.sort_values(ts_col).reset_index(drop=True)
    df[level_col] = pd.to_numeric(df[level_col], errors="coerce")
    df["delta"] = df[level_col].diff().fillna(0)

    # +1 on a rise, -1 on a drop, 0 on a flat sample.
    direction = np.sign(df["delta"].astype(float))
    # Blank the flat samples and forward-fill the last rise/drop; leading blanks
    # stay NaN and compare as False, i.e. "not charging" by default.
    df["charging"] = direction.mask(direction == 0).ffill().gt(0)
    return df

def add_phase_id(df):
    """Return a copy of ``df`` with a ``phase_id`` column.

    ``phase_id`` increases by one each time ``charging`` differs from the previous
    row; the first row always counts as a change, so ids start at 1.
    """
    state = df["charging"]
    starts_new_phase = state.ne(state.shift(1))
    return df.assign(phase_id=starts_new_phase.cumsum())

# pandas reference implementation of the Spark computation above.
# NOTE(review): `raw_tss_pd` is not created anywhere in the visible notebook, so on a
# fresh kernel this statement raised NameError and stopped "Run All". It now only
# runs when a pandas frame of that name is already present — confirm whether the
# pandas path should be kept at all.
if "raw_tss_pd" in globals():
    raw_tss_pd = (
        raw_tss_pd.groupby("vin", group_keys=False)
        .apply(detect_charging)
        .groupby("vin", group_keys=False)
        .apply(add_phase_id)
        .reset_index(drop=True)
    )

## Initialization - Get right dataframe


In [None]:
# Keep only the samples flagged as charging.
is_charging = F.col("charging") == True
raw_tss_charging = raw_tss_spark.where(is_charging)

In [None]:
engine = get_sqlalchemy_engine()

# Vehicle model, SoH measurements and battery capacity for every vehicle_data row.
vehicle_soh_query = text("""SELECT vm.model_name, vm.type, vm.version, vd.soh, vd.timestamp, v.vin, b.capacity FROM vehicle_data vd
            join vehicle v
            on v.id = vd.vehicle_id
            join vehicle_model vm 
            on vm.id = v.vehicle_model_id
            join battery b
            on b.id = vm.battery_id;""")

# Read through the context-managed connection so it is closed once the frame is
# loaded. The previous code opened a second, never-closed connection (`con`) and
# left the managed one unused.
with engine.connect() as connection:
    dbeaver_df = pd.read_sql(vehicle_soh_query, connection)

dbeaver_df.head()

In [None]:
# Convert the pandas frame to Spark so it can be joined with the telemetry
dbeaver_spark = spark.createDataFrame(dbeaver_df)

# Cast both time columns to timestamp
raw_tss_charging = raw_tss_charging.withColumn("date", F.col("date").cast("timestamp"))
dbeaver_spark = dbeaver_spark.withColumn("timestamp", F.col("timestamp").cast("timestamp"))

# As-of join in Spark (a join with a time-proximity condition)
# To keep it simple: join on vin and keep the closest database row in the future
# row_id identifies each telemetry row so the closest match can be ranked per row;
# db_id is only created here and dropped again at the end of the chain.
raw_tss_charging = raw_tss_charging.withColumn("row_id", F.monotonically_increasing_id())
dbeaver_spark = dbeaver_spark.withColumn("db_id", F.monotonically_increasing_id())

# Join on vin restricted to database timestamps >= the telemetry date (left join, so
# telemetry rows without any later database row are kept with null database columns),
# then rank the candidates of each telemetry row by time distance and keep the nearest.
# NOTE(review): the join produces one row per (telemetry row, later database row)
# pair before the rank filter — presumably acceptable for the data volume; verify.
raw_tss_charging = raw_tss_charging.alias("raw").join(
    dbeaver_spark.alias("db"),
    (F.col("raw.vin") == F.col("db.vin")) & (F.col("db.timestamp") >= F.col("raw.date")),
    "left"
).withColumn(
    "time_diff_join",
    (F.col("db.timestamp").cast("long") - F.col("raw.date").cast("long"))
).withColumn(
    "rn",
    F.row_number().over(Window.partitionBy("raw.row_id").orderBy("time_diff_join"))
).filter(
    F.col("rn") == 1
).drop(F.col("db.vin")).drop("rn", "row_id", "db_id", "time_diff_join")

In [None]:
# Per-phase aggregation with Spark.
# F.first / F.last depend on row order, which is not guaranteed inside a group after
# the joins and shuffles above. "First" and "last" values are therefore taken
# explicitly from the earliest / latest sample of the phase by `date`.
def _at_earliest(col_name):
    """Aggregate: value of `col_name` on the row with the smallest `date` of the group."""
    ordered = F.struct(F.col("date").alias("_ts"), F.col(col_name).alias("_value"))
    return F.min(ordered)["_value"]


def _at_latest(col_name):
    """Aggregate: value of `col_name` on the row with the largest `date` of the group."""
    ordered = F.struct(F.col("date").alias("_ts"), F.col(col_name).alias("_value"))
    return F.max(ordered)["_value"]


# NOTE(review): assumes `date` is never null inside a charging phase — confirm.
ts = raw_tss_charging.groupBy("vin", "phase_id").agg(
    F.min("charge_energy_added").alias("energy_added_min"),
    _at_latest("charge_energy_added").alias("energy_added_end"),
    F.mean("inside_temp").alias("inside_temp"),
    _at_earliest("capacity").alias("capacity"),
    _at_earliest("odometer").alias("odometer"),
    _at_earliest("type").alias("type"),
    F.count("battery_level").alias("size"),
    _at_earliest("model_name").alias("model"),
    F.min("date").alias("date"),
    F.percentile_approx("charger_power", 0.5).alias("charger_power"),
    _at_earliest("version").alias("version"),
    F.min("date").alias("start_date"),
    _at_earliest("battery_level").alias("soc_min"),
    _at_latest("battery_level").alias("soc_end"),
    _at_latest("est_battery_range").alias("est_battery_range"),
    _at_latest("soh").alias("soh")
)
ts = ts.cache()

# Keep only the VINs with more than 3 charging phases (the comparison is strict).
vin_counts = ts.groupBy("vin").count().filter(F.col("count") > 3)
index = [row["vin"] for row in vin_counts.select("vin").collect()]

ts = ts.filter(F.col("vin").isin(index))

# Derived columns: SoC gained during the phase, split by charging level according
# to the median charger power of the phase.
ts = ts.withColumn("soc_diff", F.col("soc_end") - F.col("soc_min"))
ts = ts.withColumn("level_1",
    F.when(F.col("charger_power") < LEVEL_1_MAX_POWER, F.col("soc_diff") / 100).otherwise(0))
ts = ts.withColumn("level_2",
    F.when((F.col("charger_power") >= LEVEL_1_MAX_POWER) & (F.col("charger_power") <= LEVEL_2_MAX_POWER),
         F.col("soc_diff") / 100).otherwise(0))
ts = ts.withColumn("level_3",
    F.when(F.col("charger_power") > LEVEL_2_MAX_POWER, F.col("soc_diff") / 100).otherwise(0))
ts = ts.withColumn("date", F.to_date(F.col("date")))



In [None]:

# Collect the 0.1 % sample to pandas explicitly before plotting, so the cell does not
# depend on how the installed plotly version treats a Spark DataFrame.
charger_power_sample = (
    ts.select("charger_power")
    .sample(withReplacement=False, fraction=.001, seed=42)
    .toPandas()
)
px.histogram(charger_power_sample, x="charger_power", nbins=1000)

Il y a deux pics un peu inexpliqués à 7 kW et 8 kW.

## Compute & Plot

In [None]:
# Time and SoC increments between consecutive samples of the same charging phase (Spark).
window_phase = Window.partitionBy("phase_id", "vin").orderBy("date")

previous_ts = F.lag("date", 1).over(window_phase).cast("long")
previous_soc = F.lag("battery_level", 1).over(window_phase)

raw_tss_charging = (
    raw_tss_charging
    # seconds elapsed since the previous sample (null on the first sample of a phase)
    .withColumn("time_diff_ts", F.col("date").cast("long") - previous_ts)
    # SoC gained since the previous sample (null on the first sample of a phase)
    .withColumn("soc_diff", F.col("battery_level") - previous_soc)
    # same as time_diff_ts with the leading null replaced by 0
    .withColumn("time_diff", F.coalesce(F.col("time_diff_ts"), F.lit(0)))
)

In [None]:
# Keep only the rows of the VINs selected in `index` (VINs with more than 3 charges).
belongs_to_selected_vin = F.col("vin").isin(index)
tss_charging = raw_tss_charging.where(belongs_to_selected_vin)

In [None]:
# Make sure each phase is associated with the right charging power.
# Keep only the samples where the SoC actually changed and the battery level is known.
tss_filtered = tss_charging.filter((F.col("soc_diff") != 0) & F.col("battery_level").isNotNull())

# Median charger power of each phase
power_by_phase = tss_filtered.groupBy("vin", "phase_id").agg(
    F.percentile_approx("charger_power", 0.5).alias("charging_power_median")
)

# Attach the phase median to every remaining sample
tss_with_power = tss_filtered.join(power_by_phase, ["vin", "phase_id"])

# Look ahead to the next SoC level of the same phase (samples ordered by SoC)
window_soc = Window.partitionBy("vin", "phase_id").orderBy("battery_level")
tss_with_power = (
    tss_with_power
    .withColumn("next_soc", F.lead("battery_level", 1).over(window_soc))
    .withColumn("next_time_diff", F.lead("time_diff", 1).over(window_soc))
    .withColumn("next_soc_diff", F.lead("soc_diff", 1).over(window_soc))
)

# Integer bounds of the SoC points covered between this sample and the next one.
soc_from = F.col("battery_level").cast("int")
soc_to = F.col("next_soc").cast("int") - 1

# Keep only the valid rows. The last condition drops pairs that do not span a whole
# SoC point: F.sequence counts *downwards* when its start is greater than its stop,
# which would otherwise emit spurious SoC points (e.g. 50.2 -> 50.7 gave [50, 49]).
tss_with_power = tss_with_power.filter(
    F.col("next_soc").isNotNull()
    & F.col("next_time_diff").isNotNull()
    & F.col("next_soc_diff").isNotNull()
    & (F.col("next_soc_diff") != 0)
    & (soc_to >= soc_from)
)

# Seconds per SoC point between this sample and the next one.
# NOTE(review): `time_diff` was computed against the previous raw sample, before the
# soc_diff != 0 filter above, so time spent on flat samples is presumably not
# included here — confirm this is the intended measure.
tss_with_power = tss_with_power.withColumn(
    "time_step",
    F.col("next_time_diff") / F.col("next_soc_diff")
)

# Sequence of SoC points from battery_level up to (but excluding) next_soc
tss_with_power = tss_with_power.withColumn("soc_range", F.sequence(soc_from, soc_to))

# Explode into one row per SoC point
df_expanded = tss_with_power.select(
    F.col("vin"),
    F.col("phase_id"),
    F.col("time_step").alias("time_diff"),
    F.explode("soc_range").alias("battery_level"),
    F.col("charging_power_median").alias("charger_power")
).withColumn("battery_level", F.col("battery_level").cast("double"))

In [None]:
# Drop the SoC points whose time per point is zero, negative or null.
has_positive_duration = F.col("time_diff") > 0
df_expanded = df_expanded.where(has_positive_duration)

In [None]:
# Bucket each row by charger power. The conditions are evaluated in order, so each
# threshold only has to state its upper bound.
# Note: rows with a null charger_power match no condition and therefore also fall
# into the final "level_3 > 100" bucket, as does a power of exactly 100.
power = F.col("charger_power")
level_charge = (
    F.when(power < 5, "under 5")
    .when(power <= 8, "5 to 8")
    .when(power <= 15, "close to 11")
    .when(power < 45, "level_2")
    .when(power < 100, "level_3 < 100")
    .otherwise("level_3 > 100")
)
df_expanded = df_expanded.withColumn("level_charge", level_charge).cache()

In [None]:
# Median SoH per VIN; VINs without any SoH value are dropped.
soh_by_vin = (
    ts.groupBy("vin")
    .agg(F.percentile_approx("soh", 0.5).alias("median_soh"))
    .where(F.col("median_soh").isNotNull())
    .cache()
)

# VINs ordered from the lowest to the highest median SoH.
vin_stats = soh_by_vin.orderBy("median_soh").collect()
vin_sorted = [row["vin"] for row in vin_stats]

# The 150 lowest and the 150 highest VINs.
# NOTE(review): with fewer than 300 VINs the two slices overlap, so the same VIN
# ends up in both the "worst" and the "best" group — confirm the fleet size.
worst_vin_list, best_vin_list = vin_sorted[:150], vin_sorted[-150:]


In [None]:
# Build the comparison frame: mean time per SoC point, per VIN and charging level,
# for the worst-SoH and best-SoH vehicles.
def _mean_time_per_soc(vins):
    """Mean `time_diff` per (battery_level, vin, level_charge) for the given VINs."""
    return (
        df_expanded.where(F.col("vin").isin(vins))
        .groupBy("battery_level", "vin", "level_charge")
        .agg(F.mean("time_diff").alias("time_diff"))
    )


df_expanded_worst = _mean_time_per_soc(worst_vin_list).withColumn("cat", F.lit("worst"))

# Only the "best" side drops null means, as before.
df_expanded_best = (
    _mean_time_per_soc(best_vin_list)
    .where(F.col("time_diff").isNotNull())
    .withColumn("cat", F.lit("best"))
)

df = df_expanded_worst.union(df_expanded_best)

In [None]:
# Collect the worst/best comparison frame to pandas for plotting.
df_pd = df.toPandas()


In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One subplot row per charging level present in the data.
levels = list(df_pd["level_charge"].unique())
n_rows = len(levels)
fig = make_subplots(rows=n_rows, cols=1, subplot_titles=levels)

# (value of `cat`, legend label, marker colour)
trace_specs = (
    ("worst", "Worst SoH", "red"),
    ("best", "Best SoH", "green"),
)

for row_idx, level in enumerate(levels, start=1):
    level_rows = df_pd[df_pd["level_charge"] == level]

    for cat, label, color in trace_specs:
        # Mean time per SoC point over the VINs of this group (computed once per trace).
        mean_time = (
            level_rows[level_rows["cat"] == cat]
            .groupby(["battery_level"], as_index=False)["time_diff"]
            .mean()
        )
        fig.add_trace(
            go.Scatter(
                x=mean_time["battery_level"],
                y=mean_time["time_diff"],
                mode="markers+text",
                name=label,
                marker=dict(color=color),
                # one legend entry per group instead of one per subplot
                legendgroup=label,
                showlegend=(row_idx == 1),
            ),
            row=row_idx, col=1
        )

fig.update_layout(title_text='Time spent by SoC point between best SoH and Worst SoH by charging power',
                  height=1000, width=1400)

# Axis titles follow the actual number of rows: x title on the last row, y title on
# the middle row. With 6 levels this is rows 6 and 4, as previously hard-coded.
fig.update_xaxes(title_text='SoC point', row=n_rows, col=1)
fig.update_yaxes(title_text='Time passed in seconds', row=n_rows // 2 + 1, col=1)

fig.show()

### Study by slice 99 / 95-99/ 90-95/ 85-90/ sub 85

In [None]:
# Highest odometer reading per VIN, attached to every per-phase row of `ts`.
max_odometer = F.max("odometer").alias("odometer_max")
odometer_max = tss_charging.groupBy("vin").agg(max_odometer)
df_merge = ts.join(odometer_max, on="vin", how="left")

In [None]:
# Add the per-VIN maximum odometer as a column of the expanded frame.
# NOTE(review): the cell reassigns df_expanded from itself; re-running it presumably
# adds a second `odometer_max` column — run it once per kernel.
df_expanded = df_expanded.join(odometer_max, on="vin", how="left")

In [None]:
def _vins_with_soh_between(lower, upper):
    """Distinct VINs of df_merge having at least one phase with lower <= soh < upper."""
    in_band = (F.col("soh") >= lower) & (F.col("soh") < upper)
    matching = df_merge.filter(in_band).select("vin").distinct().collect()
    return [row["vin"] for row in matching]


vin_under_95 = _vins_with_soh_between(0.9, 0.95)
vin_under_99 = _vins_with_soh_between(0.95, 0.99)

In [None]:
# SoH category of each phase. The conditions are evaluated in order, so each one
# only has to state its upper bound.
# NOTE(review): a null `soh` matches no condition and is therefore labelled
# "soh > 99" by the final branch — confirm this is intended.
soh = F.col("soh")
cat_soh = (
    F.when(soh < 0.9, "soh < 90")
    .when(soh < 0.95, "90 < soh < 95")
    .when(soh < 0.99, "95 < soh < 99")
    .otherwise("soh > 99")
)
df_merge = df_merge.withColumn("cat_soh", cat_soh)

In [None]:
# Colour associated with each SoH category label produced in `cat_soh`.
# NOTE(review): the plotting cell below hard-codes its own marker colours ("blue" for
# '95 < soh < 99', "#2A9D8F" for 'soh > 99') and does not read this mapping.
colors = {
    'soh < 90': 'red',
    '90 < soh < 95': 'orange',
    '95 < soh < 99': 'green',
    'soh > 99': 'blue'
}

In [None]:
# One SoH value and SoH category per VIN, taken from the VIN's latest charging phase.
# max() over a struct that starts with `start_date` picks a single existing row, so
# `soh` and `cat_soh` always come from the same phase and the result does not depend
# on row order (two independent F.last calls guaranteed neither).
latest_phase = F.max(
    F.struct(
        F.col("start_date").alias("_ts"),
        F.col("soh").alias("soh"),
        F.col("cat_soh").alias("cat_soh"),
    )
)
soh_cat = (
    df_merge.groupBy("vin")
    .agg(latest_phase.alias("_latest"))
    .select(
        "vin",
        F.col("_latest.soh").alias("soh"),
        F.col("_latest.cat_soh").alias("cat_soh"),
    )
)
df_expanded = df_expanded.join(soh_cat, "vin", "left")

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Collect to pandas for the visualisation
df_expanded_pd = df_expanded.toPandas()

# One subplot row per charging level, in this fixed order.
level_order = ['under 5', '5 to 8', 'close to 11','level_2', 'level_3 < 100', 'level_3 > 100']
fig = make_subplots(rows=6, cols=1, subplot_titles=level_order)

# (cat_soh value, legend name, marker colour, scatter mode, restrict to odometer_max < 100 000)
soh_traces = [
    ('soh < 90', 'SoH < 90', "red", "markers", False),
    ('90 < soh < 95', '90 < SoH < 95', "orange", "markers+text", False),
    ('95 < soh < 99', '95 < SoH < 99', "blue", "markers+text", False),
    ('soh > 99', 'SoH > 99', "#2A9D8F", "markers+text", True),
]

for i, level in enumerate(level_order, start=1):
    temp = df_expanded_pd[df_expanded_pd['level_charge'] == level]

    for cat_value, trace_name, trace_color, trace_mode, low_mileage_only in soh_traces:
        selected = temp['cat_soh'] == cat_value
        if low_mileage_only:
            # the top category is additionally limited to vehicles under 100 000 on the odometer
            selected = selected & (temp['odometer_max'] < 100_000)

        # mean time per SoC point for this category at this charging level
        mean_time = temp[selected].groupby(['battery_level'], as_index=False)['time_diff'].mean()
        fig.add_trace(
            go.Scatter(
                x=mean_time['battery_level'],
                y=mean_time['time_diff'],
                mode=trace_mode,
                name=trace_name,
                marker=dict(color=trace_color)
            ),
            row=i, col=1
        )


fig.update_layout(
    title_text='Time spent by SoC point between best SoH and Worst SoH by charging power',
    height=1000, width=1400,
    showlegend=True
)


fig.update_xaxes(title_text='SoC point', row=6, col=1)

fig.update_yaxes(title_text='Time passed in seconds', row=4, col=1)

fig.show()

On peut voir que plus la charge est puissante, plus les véhicules à SoH faible ont un temps de charge élevé par rapport aux autres.

In [None]:
# Check the charging power per charging level and SoH category with Spark.
# Spark has no direct equivalent of the pandas describe() layout, so the same
# statistics are built with explicit aggregations.
charging_power_stats = df_expanded.groupBy("level_charge", "cat_soh").agg(
    F.count("charger_power").alias("count"),
    F.mean("charger_power").alias("mean"),
    F.stddev("charger_power").alias("std"),
    # the "min" column was previously computed with F.mean and duplicated "mean"
    F.min("charger_power").alias("min"),
    F.percentile_approx("charger_power", 0.25).alias("25%"),
    F.percentile_approx("charger_power", 0.5).alias("50%"),
    F.percentile_approx("charger_power", 0.75).alias("75%"),
    F.max("charger_power").alias("max")
)

charging_power_stats.toPandas()

Les puissances semblent être les mêmes pour l'ensemble des catégories. Elles sont un peu plus faibles pour les SoH < 90 sur les charges de niveau 3 supérieures à 100 kW.

In [None]:
# Distinct SoC points observed for each (charging level, SoH category) pair.
soc_points = F.collect_set("battery_level").alias("soc_set")
df_soc_by_charge_spark = df_expanded.groupBy("level_charge", "cat_soh").agg(soc_points)

# Collect to pandas: the next step relies on Python set operations.
df_soc_by_charge = df_soc_by_charge_spark.toPandas()
df_soc_by_charge['soc'] = df_soc_by_charge['soc_set'].map(set)

In [None]:
# For each charging level, list the SoC points that are not observed in every SoH
# category, so that the categories are later compared on common SoC points only.
# A SoC point is kept only if all categories present at that level contain it:
# "union minus intersection" equals the union of all pairwise differences.
# Only the categories that actually have data at a level are compared; looking up
# every global category previously raised IndexError on a missing combination.
d = {}
for level_charge, level_rows in df_soc_by_charge.groupby("level_charge", sort=False):
    soc_sets = [set(soc) for soc in level_rows["soc"]]
    seen_in_any = set().union(*soc_sets)
    seen_in_all = set.intersection(*soc_sets) if soc_sets else set()
    d[level_charge] = sorted(seen_in_any - seen_in_all)



In [None]:
# Remove from df_expanded the (charging level, SoC) pairs listed in `d`.
# The dictionary is flattened into records, turned into a Spark DataFrame via pandas,
# and used as the right side of an anti join.
exclusion_list = [
    {"level_charge": level_charge, "soc_to_exclude": float(soc_val)}
    for level_charge, soc_list in d.items()
    for soc_val in soc_list
]

if not exclusion_list:
    # nothing to exclude
    df_expanded_clean = df_expanded
else:
    exclusion_df = spark.createDataFrame(pd.DataFrame(exclusion_list))

    same_level = df_expanded.level_charge == exclusion_df.level_charge
    same_soc = df_expanded.battery_level == exclusion_df.soc_to_exclude

    # left_anti keeps only the rows of df_expanded without a match in exclusion_df
    df_expanded_clean = df_expanded.join(exclusion_df, same_level & same_soc, "left_anti")

In [None]:
# Note: trimmed_in_charge_idx does not exist in the code above, so phase_id is used.
# Total charging time and SoC span of each phase. The span is max - min of the SoC
# points: F.last - F.first depends on row order, which is not guaranteed inside a
# group and could yield arbitrary (even negative) spans.
time = df_expanded_clean.groupBy("vin", "phase_id").agg(
    F.sum("time_diff").alias("total_time"),
    (F.max("battery_level") - F.min("battery_level")).alias("total_soc_diff"),
    F.first("level_charge").alias("level_charge"),
    F.first("cat_soh").alias("soh_cat")
)

In [None]:
# Mean time per SoC point of each phase: total time divided by the SoC span.
# NOTE(review): phases with a zero span divide by zero here — check how the result
# (null / inf) is handled by the summary below.
time = time.withColumn("mean_time_by_soc", F.col("total_time") / F.col("total_soc_diff"))

In [None]:
time.columns  # Display the Spark column names

In [None]:
# Average time per SoC point by charging level and SoH category, over the phases with
# a usable value. PySpark has no `isinf` function, so infinities are matched
# explicitly; null and NaN values are excluded as well.
mean_time = F.col("mean_time_by_soc")
is_usable = (
    mean_time.isNotNull()
    & ~F.isnan(mean_time)
    & ~mean_time.isin(float("inf"), float("-inf"))
)

time_summary = time.filter(is_usable).groupBy("level_charge", "soh_cat").agg(
    F.round(F.mean("mean_time_by_soc"), 0).alias("mean_time_by_soc")
)

time_summary.toPandas()



Pour 1 point de SoC, si la charge est de niveau 3 avec une puissance comprise entre 45 et 100 kW, il y a une différence d'environ 8 secondes entre un véhicule avec un SoH supérieur à 99 % et un véhicule avec un SoH inférieur à 90 %, soit 9 min 20 s pour une charge de 20 % à 90 %, et de 4 secondes avec les autres catégories, soit 4 min 40 s.
Pour 1 point de SoC, si la charge est de niveau 3 avec une puissance supérieure à 100 kW, il y a une différence d'environ 2 secondes entre un véhicule avec un SoH supérieur à 99 % et un véhicule avec un SoH inférieur à 90 %, soit 2 min 20 s pour une charge de 20 % à 90 %.


## Check répartitions au sein de chacun des groupes 

In [None]:
# SoH category of each VIN (the category is attached per VIN, so any row gives it),
# joined with the database rows. Functions are reached through the `F` alias used
# everywhere else; bare `first` / `col` are not imported in this notebook.
cat_soh_by_vin = df_expanded.groupBy("vin").agg(F.first("cat_soh").alias("cat_soh"))

corr_df = dbeaver_spark.join(cat_soh_by_vin, "vin", "inner")
# NOTE(review): the SQL query that builds dbeaver_df does not select `start_date`
# (nor the `odometer`, `battery_id` and `autonomy` columns used in the next cells) —
# confirm where these columns are expected to come from.
corr_df = corr_df.withColumn("start_date", F.col("start_date").cast("timestamp"))

In [None]:
# Per-VIN vehicle attributes for the "soh < 90" category (Spark aggregation).
# Functions are reached through the `F` alias used in the rest of the notebook; bare
# `first` / `last` / `col` are not imported here.
# NOTE(review): F.first / F.last depend on row order inside each group — confirm the
# attributes are constant per VIN or order the rows explicitly.
info_90_spark = corr_df.filter(F.col("cat_soh") == "soh < 90").groupBy("vin").agg(
    F.first("battery_id").alias("battery_id"),
    F.first("version").alias("version"),
    F.first("autonomy").alias("autonomy"),
    F.first("type").alias("type"),
    F.first("model_name").alias("model_name"),
    F.last("start_date").alias("start_date"),
    F.last("odometer").alias("odometer")
)

# Collect to pandas for the descriptive analysis
info_90 = info_90_spark.toPandas()
if not info_90.empty:
    info_90 = info_90.set_index("vin")

In [None]:
# Per-VIN vehicle attributes for the "90 < soh < 95" category (Spark aggregation).
# Functions are reached through the `F` alias used in the rest of the notebook; bare
# `first` / `last` / `col` are not imported here.
# NOTE(review): F.first / F.last depend on row order inside each group — confirm the
# attributes are constant per VIN or order the rows explicitly.
info_95_spark = corr_df.filter(F.col("cat_soh") == "90 < soh < 95").groupBy("vin").agg(
    F.first("battery_id").alias("battery_id"),
    F.first("version").alias("version"),
    F.first("autonomy").alias("autonomy"),
    F.first("type").alias("type"),
    F.first("model_name").alias("model_name"),
    F.last("start_date").alias("start_date"),
    F.last("odometer").alias("odometer")
)

# Collect to pandas for the descriptive analysis
info_95 = info_95_spark.toPandas()
if not info_95.empty:
    info_95 = info_95.set_index("vin")

In [None]:
# Per-VIN vehicle attributes for the "95 < soh < 99" category (Spark aggregation).
# Functions are reached through the `F` alias used in the rest of the notebook; bare
# `first` / `last` / `col` are not imported here.
# NOTE(review): F.first / F.last depend on row order inside each group — confirm the
# attributes are constant per VIN or order the rows explicitly.
info_99_spark = corr_df.filter(F.col("cat_soh") == "95 < soh < 99").groupBy("vin").agg(
    F.first("battery_id").alias("battery_id"),
    F.first("version").alias("version"),
    F.first("autonomy").alias("autonomy"),
    F.first("type").alias("type"),
    F.first("model_name").alias("model_name"),
    F.last("start_date").alias("start_date"),
    F.last("odometer").alias("odometer")
)

# Collect to pandas for the descriptive analysis
info_99 = info_99_spark.toPandas()
if not info_99.empty:
    info_99 = info_99.set_index("vin")

In [None]:
# Per-VIN vehicle attributes for the "soh > 99" category (Spark aggregation).
# Functions are reached through the `F` alias used in the rest of the notebook; bare
# `first` / `last` / `col` are not imported here.
# NOTE(review): F.first / F.last depend on row order inside each group — confirm the
# attributes are constant per VIN or order the rows explicitly.
info_100_spark = corr_df.filter(F.col("cat_soh") == "soh > 99").groupBy("vin").agg(
    F.first("battery_id").alias("battery_id"),
    F.first("version").alias("version"),
    F.first("autonomy").alias("autonomy"),
    F.first("type").alias("type"),
    F.first("model_name").alias("model_name"),
    F.last("start_date").alias("start_date"),
    F.last("odometer").alias("odometer")
)

# Collect to pandas for the descriptive analysis
info_100 = info_100_spark.toPandas()
if not info_100.empty:
    info_100 = info_100.set_index("vin")

#### Odometer value

In [None]:
# Descriptive statistics (pandas describe) of the odometer column for each of the
# four SoH groups, printed one after the other.
print(f"""odometer infos: under 90% SoH:\n {info_90.odometer.describe()}, 
      between 90-95% SoH:\n {info_95.odometer.describe()},
      between 95-99% SoH: \n{info_99.odometer.describe()}, 
      over 99% SoH: \n{info_100.odometer.describe()}""")


Sans surprise, les véhicules avec les plus gros odomètres sont les véhicules avec le moins bon SoH.
En revanche, on trouve un véhicule avec un SoH supérieur à 99 % qui a parcouru 287 000 km.



In [None]:
# Descriptive statistics (pandas describe) of the start_date column for each of the
# four SoH groups, printed one after the other.
print(f"""start_date infos: under 90% SoH:\n {info_90['start_date'].describe()}, 
      between 90-95% SoH:\n {info_95['start_date'].describe()},
      between 95-99% SoH: \n{info_99['start_date'].describe()}, 
      over 99% SoH: \n{info_100['start_date'].describe()}""")


Comme pour l'odomètre, les véhicules avec une batterie plus ancienne ont les moins bons SoH.

In [None]:
# For each of the four SoH groups: number of distinct model names and the
# normalised share of the five most frequent ones.
print(f"""model infos: under 90% SoH:\n number of different model: {info_90.model_name.nunique()}\n model répartition: {info_90.model_name.value_counts(normalize=True).head(5)}, 
      between 90-95% SoH:\n number of different model: {info_95.model_name.nunique()}\n model répartition: {info_95.model_name.value_counts(normalize=True).head(5)},
      between 95-99% SoH: \n number of different model: {info_99.model_name.nunique()}\n model répartition: {info_99.model_name.value_counts(normalize=True).head(5)}, 
      over 99% SoH: \n number of different model: {info_100.model_name.nunique()}\n model répartition: {info_100.model_name.value_counts(normalize=True).head(5)}""")


La proportion de Model Y augmente à mesure que le SoH augmente.
Y a-t-il une corrélation entre l'ancienneté des véhicules et le modèle ? La flotte a peut-être commencé avec des Model 3, puis des Model Y.

In [None]:
# For each of the four SoH groups: number of distinct versions and the normalised
# share of the five most frequent ones. The labels now say "version" and "top 5";
# they were copied from the model cell and read "model" / "top 3" while the code
# prints `version` with head(5).
print(f"""version infos: under 90% SoH:\n number of different versions: {info_90.version.nunique()}\n top 5 versions: {info_90.version.value_counts(normalize=True).head(5)}, 
      between 90-95% SoH:\n number of different versions: {info_95.version.nunique()}\n top 5 versions: {info_95.version.value_counts(normalize=True).head(5)},
      between 95-99% SoH: \n number of different versions: {info_99.version.nunique()}\n top 5 versions: {info_99.version.value_counts(normalize=True).head(5)}, 
      over 99% SoH: \n number of different versions: {info_100.version.nunique()}\n top 5 versions: {info_100.version.value_counts(normalize=True).head(5)}""")


Pas de surreprésentation d'un tesla_code en particulier dans aucun des groupes.