## EDA : Charging Status Harmonization Analysis

### Purpose
This notebook aims to harmonize the charging_status column across different OEMs by establishing standardized rules to accurately determine charging, discharging, and idle phases.

### Overview
Below, you'll be able to plot the charging status for a selected VIN to verify that it properly represents the different charging phases across all vehicle manufacturers.

## Imports, Spark session & Config

In [None]:
from core.spark_utils import create_spark_session
from core.s3.s3_utils import S3Service
from core.s3.settings import S3Settings

# Read the S3 credentials from the project settings and hand them to the
# Spark session factory.
settings = S3Settings()
spark = create_spark_session(settings.S3_KEY, settings.S3_SECRET)

# S3 client used by the cells below to read the parquet datasets.
s3 = S3Service()

In [None]:
# Per-make multiplier that compute_charge_idx applies to the raw `soc` column.
# NOTE(review): presumably 100 marks feeds reporting SoC as a 0-1 fraction and
# 1 marks feeds already in percent -- confirm against the raw data.
SCALE_SOC = dict(
    [
        ("tesla-fleet-telemetry", 1),
        ("mercedes-benz", 100),
        ("volvo-cars", 100),
        ("kia", 100),
        ("renault", 100),
        ("ford", 100),
        ("stellantis", 100),
        ("bmw", 1),
        ("volkswagen", 1),
    ]
)

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, lead, sum, when, col, signum, dense_rank
from pyspark.sql.types import DoubleType
from pyspark.sql import DataFrame as DF

def compute_energy_added(tss: DF) -> DF:
    """Add a `charge_energy_added` column combining the DC and AC counters.

    The DC value is used when it is non-null and strictly positive; in every
    other case the AC value is used as is.

    Args:
        tss: DataFrame with `dc_charge_energy_added` and
            `ac_charge_energy_added` columns.

    Returns:
        The input DataFrame with the extra `charge_energy_added` column.
    """
    dc_energy = col("dc_charge_energy_added")
    dc_is_usable = dc_energy.isNotNull() & (dc_energy > 0)
    energy_added = when(dc_is_usable, dc_energy).otherwise(
        col("ac_charge_energy_added")
    )
    return tss.withColumn("charge_energy_added", energy_added)

def _reassign_short_phases(df, min_duration_minutes=3):
    """
    Fold every phase whose `total_phase_time` is below `min_duration_minutes`
    into the most recent earlier phase that is long enough, then renumber the
    phases of each VIN from 0.

    Args:
        df (DataFrame): Spark DataFrame with columns `vin`, `date`, `phase_id`
            and `total_phase_time`
        min_duration_minutes (float): Smallest `total_phase_time` for which a
            phase keeps its own id

    Returns:
        DataFrame: Same rows with `phase_id` replaced by the renumbered id
        (the column ends up last in the schema)
    """
    keeps_own_id = F.col("is_valid_phase") == 1

    # Running frame: everything from the start of the VIN up to the current row.
    up_to_current_row = (
        Window.partitionBy("vin")
        .orderBy("date")
        .rowsBetween(Window.unboundedPreceding, 0)
    )
    renumbering = Window.partitionBy("vin").orderBy("phase_id_updated")

    # Id of the latest long-enough phase seen so far (null before the first one).
    latest_valid_id = F.last(
        F.when(keeps_own_id, F.col("phase_id")),
        ignorenulls=True,
    ).over(up_to_current_row)

    return (
        df.withColumn(
            "is_valid_phase",
            F.when(
                F.col("total_phase_time") >= min_duration_minutes, 1
            ).otherwise(0),
        )
        .withColumn("last_valid_phase_id", latest_valid_id)
        .withColumn(
            "phase_id_updated",
            F.when(keeps_own_id, F.col("phase_id")).otherwise(
                F.col("last_valid_phase_id")
            ),
        )
        # dense_rank starts at 1; shift so that phase ids start at 0.
        .withColumn("phase_id_final", F.dense_rank().over(renumbering) - 1)
        .drop("phase_id", "last_valid_phase_id", "is_valid_phase", "phase_id_updated")
        .withColumnRenamed("phase_id_final", "phase_id")
    )

def _ffill_then_bfill(column_name, ordered_window):
    """Build an expression that fills nulls of `column_name` along a window.

    Each null takes the latest earlier non-null value; rows that have no
    earlier value (leading nulls) take the nearest later non-null value.

    Args:
        column_name: Name of the column to fill.
        ordered_window: WindowSpec partitioned and ordered chronologically.

    Returns:
        A Column expression holding the filled values.
    """
    previous_known = F.last(column_name, ignorenulls=True).over(ordered_window)
    # Explicit forward-looking frame: with a descending order and the default
    # frame, `first` would return the *latest* value of the whole partition
    # instead of the nearest following one.
    next_known = F.first(column_name, ignorenulls=True).over(
        ordered_window.rowsBetween(Window.currentRow, Window.unboundedFollowing)
    )
    return F.coalesce(previous_known, next_known)


def compute_charge_idx(
    tss: DF, make, total_soc_diff_threshold: float = 0.05
) -> DF:
    """Label each row of a SoC time series as charging, discharging or idle.

    The SoC is rescaled with `SCALE_SOC[make]`, null SoC values are filled,
    and the series is cut into phases wherever the sign of the SoC variation
    changes. Each phase is labelled from its cumulated SoC variation, and a
    running index (`charging_status_idx`) is incremented at every status
    change.

    Args:
        tss: DataFrame with at least `vin`, `date` and `soc` columns.
        make: Key of `SCALE_SOC` identifying the manufacturer; the
            `'tesla-fleet-telemetry'` value also switches on the short-phase
            merge and keeps the `idle` status.
        total_soc_diff_threshold: Minimum absolute cumulated SoC variation
            (in scaled SoC units) for a phase to be labelled charging or
            discharging.

    Returns:
        The input DataFrame with the intermediate columns (`soc_diff`,
        `time_gap_minutes`, `direction`, `phase_id`, `total_soc_diff`, ...)
        plus `charging_status`, `charging_status_change`, `next_soc_diff`
        and `charging_status_idx`. The column list is also printed.

    Raises:
        KeyError: If `make` is not a key of `SCALE_SOC`.
    """
    tss = tss.withColumn("soc", F.col("soc") * SCALE_SOC[make])

    w = Window.partitionBy("vin").orderBy("date")
    w_running = w.rowsBetween(Window.unboundedPreceding, 0)

    # Fill missing SoC values (forward fill, then backward fill for leading nulls).
    tss = tss.withColumn("soc", _ffill_then_bfill("soc", w))

    tss = tss.withColumn(
        "soc_diff",
        F.col("soc") - F.lag("soc").over(w),
    )

    df = tss.withColumn("prev_date", lag("date").over(w))
    df = df.withColumn(
        "time_gap_minutes",
        (F.unix_timestamp("date") - F.unix_timestamp("prev_date")) / 60,
    )

    # Sign of the SoC variation: -1, 0 or 1 (null on the first row of a VIN).
    df = df.withColumn(
        "direction_raw",
        F.when(col("soc_diff").isNull(), None).otherwise(F.signum("soc_diff")),
    )

    # Direction forward fill
    df = df.withColumn(
        "direction",
        F.last("direction_raw", ignorenulls=True).over(w_running),
    )

    df = df.withColumn(
        "direction_change",
        F.when(F.col("direction") != F.lag("direction").over(w), 1).otherwise(0),
    )

    # Raw phase id: running count of direction changes.
    df = df.withColumn(
        "phase_id",
        F.sum("direction_change").over(w_running),
    )

    w_phase = Window.partitionBy("vin", "phase_id")

    df = df.withColumn("total_phase_time", F.sum("time_gap_minutes").over(w_phase))

    if make == 'tesla-fleet-telemetry':
        # Reassign short phases to the previous valid phase (especially
        # useful for tesla-fleet-telemetry noise).
        df = _reassign_short_phases(df)

    # Rebuilt because `phase_id` may have been replaced just above.
    w_phase = Window.partitionBy("vin", "phase_id")

    df = df.withColumn("total_soc_diff", F.sum("soc_diff").over(w_phase))

    df = df.withColumn("prev_phase", F.lag("direction").over(w)).withColumn(
        "next_phase", F.lead("direction").over(w)
    )

    # A phase is charging/discharging when its cumulated SoC variation passes
    # the threshold; otherwise it inherits the status of its neighbours when
    # both go the same way, and is idle in every other case.
    df = df.withColumn(
        "charging_status",
        F.when(F.col("total_soc_diff") > total_soc_diff_threshold, "charging")
        .when(F.col("total_soc_diff") < -total_soc_diff_threshold, "discharging")
        .when(
            (F.col("prev_phase") == F.col("next_phase"))
            & (F.col("prev_phase") > 0),
            "charging",
        )
        .when(
            (F.col("prev_phase") == F.col("next_phase"))
            & (F.col("prev_phase") < 0),
            "discharging",
        )
        .otherwise("idle"),
    )

    df = df.withColumn(
        "next_status",
        F.lead("charging_status").over(w)
    )

    # Set the row before a charging/discharging phase as part of that phase to
    # get the correct SOC diff
    df = df.withColumn(
        "charging_status",
        F.when(F.col("next_status") == 'charging', "charging")
        .when(F.col("next_status") == "discharging", "discharging")
        .otherwise(col('charging_status'))
    )

    # Clean the charging status for non tesla-fleet-telemetry: idle rows take
    # the status of the surrounding charging/discharging rows.
    if make != 'tesla-fleet-telemetry':
        df = df.withColumn(
            "charging_status",
            F.when(F.col("charging_status") == "idle", None).otherwise(F.col("charging_status"))
        )

        df = df.withColumn(
            "charging_status",
            _ffill_then_bfill("charging_status", w),
        )

    df = df.withColumn(
        "charging_status_change",
        F.when(
            F.col("charging_status") != F.lag("charging_status").over(w), 1
        ).otherwise(0),
    )

    # Time gap between two charging phases.
    # NOTE(review): despite its name, `next_soc_diff` holds the *previous*
    # row's soc_diff (lag); confirm whether lead was intended. Left unchanged
    # because the plotting cells read this column.
    df = df.withColumn("next_soc_diff", F.lag("soc_diff").over(w))

    # Corrected change flag: also start a new phase when the status is
    # unchanged but a small SoC move follows a gap of more than 60 minutes and
    # `next_soc_diff` is larger than 1 in absolute value.
    df = df.withColumn(
        "charging_status_change",
        F.when(
            (F.col("charging_status_change") == 0) &
            (F.abs(F.col("soc_diff")) <= 1) &
            (F.col("time_gap_minutes") > 60) &
            (F.abs(F.col("next_soc_diff")) > 1),
            F.lit(1)
        ).otherwise(F.col("charging_status_change"))
    )

    df = df.withColumn(
        "charging_status_idx",
        F.sum("charging_status_change").over(w_running),
    )

    print(df.columns)

    return df

## Plots by OEMs

### MERCEDES-BENZ

In [None]:
# Load the Mercedes-Benz raw time series and expose `battery_level` as `soc`,
# both on the full dataset and on the single-VIN extract.
rss = s3.read_parquet_df_spark(spark, 'raw_ts/mercedes-benz/time_series/raw_ts_spark.parquet')
print(rss.columns)
vin = 'W1V9106331P360398'
rss = rss.withColumnRenamed("battery_level", "soc")
rss_by_vin = rss.filter(rss['vin'] == vin)
rss_by_vin.show()

In [None]:
# Compute the charging status for the selected VIN and cache the result; the
# count runs the computation so the plotting cell reads from the cache.
rss_by_vin_with_idx = compute_charge_idx(rss_by_vin, 'mercedes-benz').cache()
rss_by_vin_with_idx.count()

In [None]:
# List the VINs present in the loaded dataset.
distinct_vins = rss.select('vin').distinct()
distinct_vins.show()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go

"""
WIP : Discussions avec HM
W1N2437011J000869 peu datapoint
W1N9N0CB8SJ120589 peu datapoint
 W1V44760313781151 fréquence attendue
"""

# Collect the plotted columns to pandas.
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff', 'charging_status_idx', 'soc_diff', 'next_soc_diff').toPandas()

# Clean-up: typed dates, no missing SoC, float SoC.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# The line trace joins points in row order and the collected rows are not
# guaranteed to be chronological, so sort explicitly.
pdf = pdf.sort_values("date")

# Colour of each point according to its charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

fig = go.Figure()

# 1. Continuous black SoC line.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>"
))

# 2. Status-coloured points carrying the detailed hover information.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False,
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<br><b>Status:</b> %{customdata[0]}<br><b>Index:</b> %{customdata[1]}<br><b>Soc Diff:</b> %{customdata[2]:.2f}<br><b>next Soc Diff:</b> %{customdata[3]:.2f}<extra></extra>",
    customdata=list(zip(pdf["charging_status"], pdf["charging_status_idx"], pdf["soc_diff"], pdf["next_soc_diff"]))
))

# Layout.
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()

### BMW

In [None]:
# Load the BMW raw time series and keep one VIN, with `soc_hv_header` exposed
# as `soc`.
rss = s3.read_parquet_df_spark(spark, 'raw_ts/bmw/time_series/raw_ts_spark.parquet')
print(rss.columns)
vin = 'WBY1Z21000V306668'
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed("soc_hv_header", "soc")
rss_by_vin.show()

In [None]:
# Compute the charging status for the selected VIN and cache the result; the
# count runs the computation so the plotting cell reads from the cache.
rss_by_vin_with_idx = compute_charge_idx(rss_by_vin, 'bmw').cache()
rss_by_vin_with_idx.count()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go

# Step 1: collect the plotted columns to pandas (no sampling is applied here).
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff', 'charging_status_idx', 'soc_diff', 'next_soc_diff').toPandas()

# Clean-up: typed dates, no missing SoC, float SoC.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# The line trace joins points in row order and the collected rows are not
# guaranteed to be chronological, so sort explicitly.
pdf = pdf.sort_values("date")

# Colour of each point according to its charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

fig = go.Figure()

# 1. Continuous black SoC line.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>"
))

# 2. Status-coloured points carrying the detailed hover information.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False,
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<br><b>Status:</b> %{customdata[0]}<br><b>Index:</b> %{customdata[1]}<br><b>Soc Diff:</b> %{customdata[2]:.2f}<br><b>next Soc Diff:</b> %{customdata[3]:.2f}<extra></extra>",
    customdata=list(zip(pdf["charging_status"], pdf["charging_status_idx"], pdf["soc_diff"], pdf["next_soc_diff"]))
))

# Layout.
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()


### RENAULT

In [None]:
# Load the Renault raw time series and keep one VIN, with `battery_level`
# exposed as `soc`.
rss = s3.read_parquet_df_spark(spark, 'raw_ts/renault/time_series/raw_ts_spark.parquet')
print(rss.columns)
vin = 'VF1RCB00168403987'
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed("battery_level", "soc")
rss_by_vin.show()

In [None]:
# Compute the charging status for the selected VIN and cache the result; the
# count runs the computation so the plotting cell reads from the cache.
rss_by_vin_with_idx = compute_charge_idx(rss_by_vin, 'renault').cache()
rss_by_vin_with_idx.count()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go

# Step 1: collect the plotted columns to pandas (no sampling is applied here).
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff').toPandas()

# Clean-up: typed dates, no missing SoC, float SoC.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# The line trace joins points in row order and the collected rows are not
# guaranteed to be chronological, so sort explicitly.
pdf = pdf.sort_values("date")

# Colour of each point according to its charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

fig = go.Figure()

# 1. Continuous black SoC line.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)"
))

# 2. Status-coloured points.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False
))

# Layout.
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()


### STELLANTIS

In [None]:
# Load the Stellantis raw time series and keep one VIN, with
# `electricity.level.percentage` exposed as `soc`.
make = 'stellantis'
rss = s3.read_parquet_df_spark(spark, f'raw_ts/{make}/time_series/raw_ts_spark.parquet')
print(rss.columns)
vin = 'VR3UHZKXZLT073946'
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed('electricity.level.percentage', 'soc')
rss_by_vin.show()

In [None]:
# Compute the charging status for the selected VIN and cache the result; the
# count runs the computation so the plotting cell reads from the cache.
rss_by_vin_with_idx = compute_charge_idx(rss_by_vin, make).cache()
rss_by_vin_with_idx.count()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go

# Step 1: collect the plotted columns to pandas (no sampling is applied here).
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff').toPandas()

# Clean-up: typed dates, no missing SoC, float SoC.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# The line trace joins points in row order and the collected rows are not
# guaranteed to be chronological, so sort explicitly.
pdf = pdf.sort_values("date")

# Colour of each point according to its charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

fig = go.Figure()

# 1. Continuous black SoC line.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)"
))

# 2. Status-coloured points.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False
))

# Layout.
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()


### TESLA-FLEET-TELEMETRY

In [None]:
# Load the Tesla fleet-telemetry raw time series and keep one VIN.
# NOTE(review): the renamed column is the same `electricity.level.percentage`
# used in the Stellantis cell; confirm it exists in this dataset, since
# withColumnRenamed silently does nothing when the column is absent.
make = 'tesla-fleet-telemetry'
rss = s3.read_parquet_df_spark(spark, f'raw_ts/{make}/time_series/raw_ts_spark.parquet')
print(rss.columns)
vin = '5YJSA7E5XRF541915'
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed('electricity.level.percentage', 'soc')
rss_by_vin.show()

In [None]:
# Compute the charging status for the selected VIN and cache the result; the
# count runs the computation so the plotting cell reads from the cache.
rss_by_vin_with_idx = compute_charge_idx(rss_by_vin, make).cache()
rss_by_vin_with_idx.count()

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, lead, sum, when, col, signum, dense_rank
from pyspark.sql.types import DoubleType
from pyspark.sql import DataFrame as DF
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Collect the plotted columns to pandas.
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff').toPandas()

# Clean-up: typed dates, no missing SoC, float SoC.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# The line trace joins points in row order and the collected rows are not
# guaranteed to be chronological, so sort explicitly.
pdf = pdf.sort_values("date")

# Colour of each point according to its charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

fig = go.Figure()

# 1. Continuous black SoC line.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)"
))

# 2. Status-coloured points.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False
))

# Layout.
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()

### VOLKSWAGEN

In [None]:
# Load the Volkswagen raw time series and keep one VIN, with
# `energyLevelPrimaryEnginePercentage` exposed as `soc`.
make = 'volkswagen'
rss = s3.read_parquet_df_spark(spark, f'raw_ts/{make}/time_series/raw_ts_spark.parquet')
print(rss.columns)
vin = 'WVGZZZE20RE040540'
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed('energyLevelPrimaryEnginePercentage', 'soc')
rss_by_vin.show()

In [None]:
# Compute the charging status for the selected VIN and cache the result; the
# count runs the computation so the plotting cell reads from the cache.
rss_by_vin_with_idx = compute_charge_idx(rss_by_vin, make).cache()
rss_by_vin_with_idx.count()

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, lead, sum, when, col, signum, dense_rank
from pyspark.sql.types import DoubleType
from pyspark.sql import DataFrame as DF
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Collect the plotted columns to pandas.
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff').toPandas()

# Clean-up: typed dates, no missing SoC, float SoC.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# The line trace joins points in row order and the collected rows are not
# guaranteed to be chronological, so sort explicitly.
pdf = pdf.sort_values("date")

# Colour of each point according to its charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

fig = go.Figure()

# 1. Continuous black SoC line.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)"
))

# 2. Status-coloured points.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False
))

# Layout.
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()

# SANDBOX

In [None]:
def compute_charge_idx(
    tss: DF, make, total_soc_diff_threshold: float = 0.5, phase_delimiter_mn: int = 45
) -> DF:
    """Sandbox variant: label charging/discharging rows and describe each phase.

    Differences with the earlier definition visible in the code: rows with a
    null SoC are dropped instead of filled, idle rows are dropped once
    identified, `_reassign_short_phases` is not applied, and per-phase
    statistics (first/last SoC and date, point count, cumulated SoC
    variation) are joined back on every row.

    Args:
        tss: DataFrame with at least `vin`, `date` and `soc` columns.
        make: Key of `SCALE_SOC` identifying the manufacturer.
        total_soc_diff_threshold: Minimum absolute cumulated SoC variation
            (in scaled SoC units) for a raw phase to be labelled charging or
            discharging.
        phase_delimiter_mn: Time gap, in minutes, above which two consecutive
            discharging rows are split into separate phases (only applied for
            'bmw', 'tesla-fleet-telemetry' and 'renault').

    Returns:
        The non-idle rows with `charging_status`, `charging_status_idx` and
        the per-phase columns `first_soc`, `adjusted_last_soc`, `first_date`,
        `adjusted_last_date`, `count_points`, `total_soc_diff` and
        `is_usable_phase`.

    Raises:
        KeyError: If `make` is not a key of `SCALE_SOC`.
    """
    # Window partitioned by VIN and ordered by date.
    w = Window.partitionBy("vin").orderBy("date")
    w_running = w.rowsBetween(Window.unboundedPreceding, 0)

    # Bring every SoC to the 0-100 % scale.
    tss = tss.withColumn("soc", F.col("soc") * SCALE_SOC[make])

    # Drop rows with a null SoC; no forward fill, so that real idle periods
    # remain distinguishable.
    tss = tss.na.drop(subset=["soc"])

    # SoC variation at each point.
    tss = tss.withColumn(
        "soc_diff",
        F.col("soc") - F.lag("soc").over(w),
    )

    # Time elapsed since the previous point.
    df = tss.withColumn("prev_date", lag("date").over(w)).withColumn(
        "time_gap_minutes",
        (F.unix_timestamp("date") - F.unix_timestamp("prev_date")) / 60,
    )

    # Naive direction: sign of the SoC variation.
    df = df.withColumn(
        "direction_raw",
        F.when(col("soc_diff").isNull(), None).otherwise(F.signum("soc_diff")),
    )

    # Forward fill of the direction.
    df = df.withColumn(
        "direction",
        F.last("direction_raw", ignorenulls=True).over(w_running),
    )

    # Rows where the direction changes.
    df = df.withColumn(
        "direction_change",
        F.when(F.col("direction") != F.lag("direction").over(w), 1).otherwise(0),
    )

    # Raw phases: one per run of identical direction, whatever the SoC gained
    # or lost and whatever the time spent in the phase.
    df = df.withColumn(
        "phase_id",
        F.sum("direction_change").over(w_running),
    )

    w_phase = Window.partitionBy("vin", "phase_id")

    # The total phase time and the short-phase reassignment
    # (`_reassign_short_phases`, useful for tesla-fleet-telemetry noise) are
    # intentionally not applied in this variant.

    # SoC gained or lost over the raw phase.
    df = df.withColumn("total_soc_diff", F.sum("soc_diff").over(w_phase))

    # Direction of the previous and next rows.
    df = df.withColumn("prev_phase", F.lag("direction").over(w)).withColumn(
        "next_phase", F.lead("direction").over(w)
    )

    # Core rule: a phase is charging/discharging only when it brings a minimum
    # SoC gain/loss. Otherwise it takes the status of its neighbours when both
    # go the same way, and is idle in every other case.
    df = df.withColumn(
        "charging_status",
        F.when((F.col("total_soc_diff") > total_soc_diff_threshold), "charging")
        .when((F.col("total_soc_diff") < -total_soc_diff_threshold), "discharging")
        .when(
            (F.col("prev_phase") == F.col("next_phase"))
            & (F.col("prev_phase") > 0),
            "charging",
        )
        .when(
            (F.col("prev_phase") == F.col("next_phase"))
            & (F.col("prev_phase") < 0),
            "discharging",
        )
        .otherwise("idle"),
    )

    # Make each phase start at the right row.
    df = df.withColumn(
        "next_status",
        F.lead("charging_status").over(w)
    )

    # Set the row before a charging/discharging phase as part of that phase to
    # get the correct SOC diff
    df = df.withColumn(
        "charging_status",
        F.when(F.col("next_status") == 'charging', "charging")
        .when(F.col("next_status") == "discharging", "discharging")
        .otherwise(col('charging_status'))
    )

    # Drop idle rows: they are not needed downstream and removing them gives
    # clean phase numbers. They still had to be identified first, so that
    # charging/discharging phases are not stretched over idle time.
    df = df.withColumn(
        "charging_status",
        F.when(F.col("charging_status") == "idle", None).otherwise(F.col("charging_status"))
    )
    df = df.na.drop(subset=["charging_status"])

    # Recompute the gap to the previous row now that idle rows are gone; it is
    # used to split successive phases of the same status.
    df = df.withColumn("prev_date", lag("date").over(w))
    df = df.withColumn(
        "time_gap_minutes",
        (F.unix_timestamp("date") - F.unix_timestamp("prev_date")) / 60,
    )

    df = df.withColumn(
        "charging_status_change",
        F.when(
            F.col("charging_status") != F.lag("charging_status").over(w), 1
        ).otherwise(0),
    )

    # Corrected change flag: only relevant when the data frequency is high
    # enough.
    if make in ('bmw', 'tesla-fleet-telemetry', 'renault'):
        df = df.withColumn(
            "charging_status_change",
            F.when(
                (F.col("charging_status_change") == 0) &
                # This restriction can be removed if the data frequency while
                # charging is high enough.
                (F.col("charging_status") == 'discharging') &
                (F.col("time_gap_minutes") > phase_delimiter_mn),
                F.lit(1)
            ).otherwise(F.col("charging_status_change"))
        )

    df = df.withColumn(
        "charging_status_idx",
        F.sum("charging_status_change").over(w_running),
    )

    # The aggregation below is done from raw_results in the aggregation step.
    w_phase_static = Window.partitionBy("vin").orderBy("first_date")

    # 1. Per-phase aggregation. The first/last SoC are taken as the SoC at the
    # min/max date through a (date, soc) struct: `first`/`last` after an
    # `orderBy` + `groupBy` depend on row order, which the shuffle does not
    # preserve.
    dated_soc = F.struct("date", "soc")
    phase_stats = (
        df
        .groupBy("vin", "charging_status_idx")
        .agg(
            F.min(dated_soc).getField("soc").alias("first_soc"),
            F.max(dated_soc).getField("soc").alias("last_soc"),
            F.min("date").alias("first_date"),
            F.max("date").alias("last_date"),
            F.count("date").alias("count_points"),
            F.sum("soc_diff").alias("total_soc_diff"),
            # Every row of a phase carries the same status by construction.
            F.first("charging_status").alias("charging_status")
        )
        .withColumn("next_dt_begin", F.lead("first_date").over(w_phase_static))
        .withColumn("next_first_soc", F.lead("first_soc").over(w_phase_static))
    )

    # 2. Extend a phase to the start of the next one when the SoC kept moving
    # in the phase's direction in between.
    condition = (
        ((F.col("last_soc") < F.col("next_first_soc")) & (F.col("charging_status") == "charging")) |
        ((F.col("last_soc") > F.col("next_first_soc")) & (F.col("charging_status") == "discharging"))
    )

    phase_stats = phase_stats.withColumn(
        "adjusted_last_date",
        F.when(condition, F.col("next_dt_begin")).otherwise(F.col("last_date"))
    ).withColumn(
        "adjusted_last_soc",
        F.when(condition, F.col("next_first_soc")).otherwise(F.col("last_soc"))
    ).withColumn(
        "is_usable_phase",
        F.when(F.col("count_points") > 1, F.lit(1)).otherwise(F.lit(0))
    )

    # 3. Join the stats back on the row-level DataFrame. The raw-phase
    # `total_soc_diff` is dropped first: keeping it would leave two columns
    # with the same name after the join, making any reference ambiguous.
    df = df.drop("total_soc_diff").join(
        phase_stats.select(
            "vin", "charging_status_idx",
            "first_soc", "adjusted_last_soc",
            "first_date", "adjusted_last_date",
            "count_points", "total_soc_diff",
            "is_usable_phase"
        ),
        on=["vin", "charging_status_idx"],
        how="left"
    )

    return df

### BMW

In [None]:
# Load the BMW raw time series and list the VINs it contains.
rss = s3.read_parquet_df_spark(spark, 'raw_ts/bmw/time_series/raw_ts_spark.parquet')
available_vins = rss.select('vin').distinct()
available_vins.show()

In [None]:
# Extract two BMW vehicles, each with `soc_hv_header` exposed as `soc`.
vin = 'WBY11HH030CS71250'
vin_2 = 'WBY8P210907F47480'
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed("soc_hv_header", "soc")
rss_by_vin_2 = rss.filter(rss['vin'] == vin_2).withColumnRenamed("soc_hv_header", "soc")

In [None]:
# Compute and cache the charging status of both vehicles; each count runs the
# computation so the plotting cells read from the cache.
rss_by_vin_with_idx = compute_charge_idx(rss_by_vin, 'bmw').cache()
rss_by_vin_with_idx_2 = compute_charge_idx(rss_by_vin_2, 'bmw').cache()

rss_by_vin_with_idx.count()
rss_by_vin_with_idx_2.count()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go


# Collect the plotted columns to pandas.
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'charging_status_idx', 'soc_diff', 'first_date', 'adjusted_last_date', 'first_soc', 'adjusted_last_soc').toPandas()

# The line trace joins points in row order and the rows come out of a join,
# so their order is not guaranteed to be chronological: sort explicitly.
pdf = pdf.sort_values("date")

# Colour of each point according to its charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

fig = go.Figure()

# 1. Continuous black SoC line.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>"
))

# 2. Status-coloured points carrying the detailed hover information.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False,
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<br><b>Status:</b> %{customdata[0]}<br><b>Index:</b> %{customdata[1]}<br><b>Soc Diff:</b> %{customdata[2]:.2f}<br><b>First Date:</b> %{customdata[3]}<br><b>Last Date:</b> %{customdata[4]}<br><b>First Soc:</b> %{customdata[5]:.2f}<br><b>Last Soc:</b> %{customdata[6]:.2f}<extra></extra>",
    customdata=list(zip(pdf["charging_status"], pdf["charging_status_idx"], pdf["soc_diff"], pdf['first_date'], pdf['adjusted_last_date'], pdf['first_soc'], pdf['adjusted_last_soc']))
))

# Layout.
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go


# Collect the plotted columns of the second vehicle to pandas.
pdf = rss_by_vin_with_idx_2.select("date", "soc", "charging_status", 'charging_status_idx', 'soc_diff', 'first_date', 'adjusted_last_date', 'first_soc', 'adjusted_last_soc').toPandas()

# The line trace joins points in row order and the rows come out of a join,
# so their order is not guaranteed to be chronological: sort explicitly.
pdf = pdf.sort_values("date")

# Colour of each point according to its charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

fig = go.Figure()

# 1. Continuous black SoC line.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>"
))

# 2. Status-coloured points carrying the detailed hover information.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False,
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<br><b>Status:</b> %{customdata[0]}<br><b>Index:</b> %{customdata[1]}<br><b>Soc Diff:</b> %{customdata[2]:.2f}<br><b>First Date:</b> %{customdata[3]}<br><b>Last Date:</b> %{customdata[4]}<br><b>First Soc:</b> %{customdata[5]:.2f}<br><b>Last Soc:</b> %{customdata[6]:.2f}<extra></extra>",
    customdata=list(zip(pdf["charging_status"], pdf["charging_status_idx"], pdf["soc_diff"], pdf['first_date'], pdf['adjusted_last_date'], pdf['first_soc'], pdf['adjusted_last_soc']))
))

# Layout.
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()


### MERCEDES-BENZ

In [None]:
# Load the Mercedes-Benz raw time series, expose `battery_level` as `soc`, and
# extract two vehicles.
rss = s3.read_parquet_df_spark(spark, 'raw_ts/mercedes-benz/time_series/raw_ts_spark.parquet')
print(rss.columns)
vin = 'W1N2437011J000869'
vin_2 = 'W1V9106331P360398'
rss = rss.withColumnRenamed("battery_level", "soc")
rss_by_vin = rss.filter(rss['vin'] == vin)
rss_by_vin_2 = rss.filter(rss['vin'] == vin_2)
rss_by_vin.show()
rss_by_vin_2.show()


In [None]:
# Compute and cache the charging status of both vehicles; each count runs the
# computation so the plotting cells read from the cache.
rss_by_vin_with_idx = compute_charge_idx(rss_by_vin, 'mercedes-benz').cache()
rss_by_vin_with_idx_2 = compute_charge_idx(rss_by_vin_2, 'mercedes-benz').cache()

rss_by_vin_with_idx.count()
rss_by_vin_with_idx_2.count()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go


# Bring the columns needed for the plot and its hover box into pandas
plot_columns = [
    "date", "soc", "charging_status", 'charging_status_idx', 'soc_diff',
    'first_date', 'adjusted_last_date', 'first_soc', 'adjusted_last_soc',
]
pdf = rss_by_vin_with_idx.select(*plot_columns).toPandas()

# One marker colour per charging status
status_color_map = {"charging": "green", "discharging": "red", "idle": "gray"}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# Columns shown in the hover box; their order defines the customdata[i] indices
hover_columns = [
    "charging_status", "charging_status_idx", "soc_diff",
    'first_date', 'adjusted_last_date', 'first_soc', 'adjusted_last_soc',
]

# 1. Continuous black SoC line
soc_line = go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line={"color": "black"},
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>",
)

# 2. Status-coloured markers carrying the detailed hover information
status_markers = go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker={"color": pdf["color"], "size": 6},
    name="Status (points)",
    showlegend=False,
    hovertemplate=(
        "<b>Date:</b> %{x}<br>"
        "<b>SoC:</b> %{y:.1f}%<br>"
        "<b>Status:</b> %{customdata[0]}<br>"
        "<b>Index:</b> %{customdata[1]}<br>"
        "<b>Soc Diff:</b> %{customdata[2]:.2f}<br>"
        "<b>First Date:</b> %{customdata[3]}<br>"
        "<b>Last Date:</b> %{customdata[4]}<br>"
        "<b>First Soc:</b> %{customdata[5]:.2f}<br>"
        "<b>Last Soc:</b> %{customdata[6]:.2f}"
        "<extra></extra>"
    ),
    customdata=list(zip(*(pdf[name] for name in hover_columns))),
)

# Assemble the figure
fig = go.Figure()
fig.add_trace(soc_line)
fig.add_trace(status_markers)

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified",
)

fig.show()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go


# Bring the columns needed for the plot and its hover box into pandas
plot_columns = [
    "date", "soc", "charging_status", 'charging_status_idx', 'soc_diff',
    'first_date', 'adjusted_last_date', 'first_soc', 'adjusted_last_soc',
]
pdf = rss_by_vin_with_idx_2.select(*plot_columns).toPandas()

# One marker colour per charging status
status_color_map = {"charging": "green", "discharging": "red", "idle": "gray"}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# Columns shown in the hover box; their order defines the customdata[i] indices
hover_columns = [
    "charging_status", "charging_status_idx", "soc_diff",
    'first_date', 'adjusted_last_date', 'first_soc', 'adjusted_last_soc',
]

# 1. Continuous black SoC line
soc_line = go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line={"color": "black"},
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>",
)

# 2. Status-coloured markers carrying the detailed hover information
status_markers = go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker={"color": pdf["color"], "size": 6},
    name="Status (points)",
    showlegend=False,
    hovertemplate=(
        "<b>Date:</b> %{x}<br>"
        "<b>SoC:</b> %{y:.1f}%<br>"
        "<b>Status:</b> %{customdata[0]}<br>"
        "<b>Index:</b> %{customdata[1]}<br>"
        "<b>Soc Diff:</b> %{customdata[2]:.2f}<br>"
        "<b>First Date:</b> %{customdata[3]}<br>"
        "<b>Last Date:</b> %{customdata[4]}<br>"
        "<b>First Soc:</b> %{customdata[5]:.2f}<br>"
        "<b>Last Soc:</b> %{customdata[6]:.2f}"
        "<extra></extra>"
    ),
    customdata=list(zip(*(pdf[name] for name in hover_columns))),
)

# Assemble the figure
fig = go.Figure()
fig.add_trace(soc_line)
fig.add_trace(status_markers)

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified",
)

fig.show()


### RENAULT

In [None]:
# Load the Renault raw time series from S3
rss = s3.read_parquet_df_spark(
    spark,
    'raw_ts/renault/time_series/raw_ts_spark.parquet',
)

In [None]:
# Two sample vehicles to inspect
vin = 'VF1AG000865756685'
vin_2 = 'VF1AG000X63197233'

# Keep one frame per VIN and expose `battery_level` under the name `soc`
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed("battery_level", "soc")
rss_by_vin_2 = rss.filter(rss['vin'] == vin_2).withColumnRenamed("battery_level", "soc")
rss = rss.withColumnRenamed("battery_level", "soc")

In [None]:
# Compute the charging phases for both VINs and mark the results for caching
rss_by_vin_with_idx = compute_charge_idx(rss_by_vin, 'renault').cache()
rss_by_vin_with_idx_2 = compute_charge_idx(rss_by_vin_2, 'renault').cache()

# count() is an action: it triggers the computation of each cached frame
rss_by_vin_with_idx.count()
rss_by_vin_with_idx_2.count()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go


# Bring the columns needed for the plot and its hover box into pandas
plot_columns = [
    "date", "soc", "charging_status", 'charging_status_idx', 'soc_diff',
    'first_date', 'adjusted_last_date', 'first_soc', 'adjusted_last_soc',
]
pdf = rss_by_vin_with_idx.select(*plot_columns).toPandas()

# One marker colour per charging status
status_color_map = {"charging": "green", "discharging": "red", "idle": "gray"}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# Columns shown in the hover box; their order defines the customdata[i] indices
hover_columns = [
    "charging_status", "charging_status_idx", "soc_diff",
    'first_date', 'adjusted_last_date', 'first_soc', 'adjusted_last_soc',
]

# 1. Continuous black SoC line
soc_line = go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line={"color": "black"},
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>",
)

# 2. Status-coloured markers carrying the detailed hover information
status_markers = go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker={"color": pdf["color"], "size": 6},
    name="Status (points)",
    showlegend=False,
    hovertemplate=(
        "<b>Date:</b> %{x}<br>"
        "<b>SoC:</b> %{y:.1f}%<br>"
        "<b>Status:</b> %{customdata[0]}<br>"
        "<b>Index:</b> %{customdata[1]}<br>"
        "<b>Soc Diff:</b> %{customdata[2]:.2f}<br>"
        "<b>First Date:</b> %{customdata[3]}<br>"
        "<b>Last Date:</b> %{customdata[4]}<br>"
        "<b>First Soc:</b> %{customdata[5]:.2f}<br>"
        "<b>Last Soc:</b> %{customdata[6]:.2f}"
        "<extra></extra>"
    ),
    customdata=list(zip(*(pdf[name] for name in hover_columns))),
)

# Assemble the figure
fig = go.Figure()
fig.add_trace(soc_line)
fig.add_trace(status_markers)

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified",
)

fig.show()



In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go


# Bring the columns needed for the plot and its hover box into pandas
plot_columns = [
    "date", "soc", "charging_status", 'charging_status_idx', 'soc_diff',
    'first_date', 'adjusted_last_date', 'first_soc', 'adjusted_last_soc',
    'is_usable_phase',
]
pdf = rss_by_vin_with_idx_2.select(*plot_columns).toPandas()

# One marker colour per charging status
status_color_map = {"charging": "green", "discharging": "red", "idle": "gray"}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# Columns shown in the hover box; their order defines the customdata[i] indices
hover_columns = [
    "charging_status", "charging_status_idx", "soc_diff",
    'first_date', 'adjusted_last_date', 'first_soc', 'adjusted_last_soc',
    'is_usable_phase',
]

# 1. Continuous black SoC line
soc_line = go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line={"color": "black"},
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>",
)

# 2. Status-coloured markers carrying the detailed hover information
status_markers = go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker={"color": pdf["color"], "size": 6},
    name="Status (points)",
    showlegend=False,
    hovertemplate=(
        "<b>Date:</b> %{x}<br>"
        "<b>SoC:</b> %{y:.1f}%<br>"
        "<b>Status:</b> %{customdata[0]}<br>"
        "<b>Index:</b> %{customdata[1]}<br>"
        "<b>Soc Diff:</b> %{customdata[2]:.2f}<br>"
        "<b>First Date:</b> %{customdata[3]}<br>"
        "<b>Last Date:</b> %{customdata[4]}<br>"
        "<b>First Soc:</b> %{customdata[5]:.2f}<br>"
        "<b>Last Soc:</b> %{customdata[6]:.2f}<br>"
        "<b>Is Usable Phase:</b> %{customdata[7]}"
        "<extra></extra>"
    ),
    customdata=list(zip(*(pdf[name] for name in hover_columns))),
)

# Assemble the figure
fig = go.Figure()
fig.add_trace(soc_line)
fig.add_trace(status_markers)

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified",
)

fig.show()

### BMW

In [None]:
# Load the BMW raw time series from S3
rss = s3.read_parquet_df_spark(
    spark,
    'raw_ts/bmw/time_series/raw_ts_spark.parquet',
)

# Display the distinct VINs present in the dataset
vin_listing = rss.select('vin').distinct()
vin_listing.show()

In [None]:
# Two sample vehicles to inspect
vin = 'WBY11HH030CS71250'
vin_2 = 'WBY8P210907F47480'

# Keep one frame per VIN and expose `soc_hv_header` under the name `soc`
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed("soc_hv_header", "soc")
rss_by_vin_2 = rss.filter(rss['vin'] == vin_2).withColumnRenamed("soc_hv_header", "soc")

In [None]:
# Compute the charging phases for both VINs and mark the results for caching
rss_by_vin_with_idx = compute_charge_idx(rss_by_vin, 'bmw').cache()
rss_by_vin_with_idx_2 = compute_charge_idx(rss_by_vin_2, 'bmw').cache()

# count() is an action: it triggers the computation of each cached frame
rss_by_vin_with_idx.count()
rss_by_vin_with_idx_2.count()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go


# Bring the columns needed for the plot and its hover box into pandas
pdf = rss_by_vin_with_idx.select(
    "date", "soc", "charging_status", 'total_soc_diff', 'charging_status_idx',
    'soc_diff', 'prev_phase', 'next_phase', 'direction', 'direction_change',
    'charging_status_change', 'time_gap_minutes',
).toPandas()

# One marker colour per charging status
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray",
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# (label, column, format suffix) for every entry of the hover box.
# The customdata index of each entry is derived from its position in this
# list, so a label can never point at a neighbouring column or at an index
# past the end of the customdata tuple.
hover_fields = [
    ("Status", "charging_status", ""),
    ("Index", "charging_status_idx", ""),
    ("Soc Diff", "soc_diff", ":.2f"),
    ("Prev Phase", "prev_phase", ""),
    ("Next Phase", "next_phase", ""),
    ("Total Soc Diff", "total_soc_diff", ""),
    ("Direction", "direction", ""),
    ("Direction Change", "direction_change", ""),
    ("Charging Status Change", "charging_status_change", ""),
    ("Time Gap", "time_gap_minutes", ""),
]
status_hovertemplate = (
    "<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%"
    + "".join(
        f"<br><b>{label}:</b> %{{customdata[{position}]{fmt}}}"
        for position, (label, _, fmt) in enumerate(hover_fields)
    )
    + "<extra></extra>"
)
# One tuple per row, in the same order as hover_fields
status_customdata = list(zip(*(pdf[column] for _, column, _ in hover_fields)))

# Figure
fig = go.Figure()

# 1. Continuous black SoC line
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>"
))

# 2. Status-coloured markers carrying the detailed hover information
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False,
    hovertemplate=status_hovertemplate,
    customdata=status_customdata
))

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()


In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go


# Bring the columns needed for the plot and its hover box into pandas
plot_columns = [
    "date", "soc", "charging_status", 'total_soc_diff',
    'charging_status_idx', 'soc_diff', 'next_soc_diff',
]
pdf = rss_by_vin_with_idx_2.select(*plot_columns).toPandas()

# One marker colour per charging status
status_color_map = {"charging": "green", "discharging": "red", "idle": "gray"}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# Columns shown in the hover box; their order defines the customdata[i] indices
hover_columns = ["charging_status", "charging_status_idx", "soc_diff", "next_soc_diff"]

# 1. Continuous black SoC line
soc_line = go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line={"color": "black"},
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>",
)

# 2. Status-coloured markers carrying the detailed hover information
status_markers = go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker={"color": pdf["color"], "size": 6},
    name="Status (points)",
    showlegend=False,
    hovertemplate=(
        "<b>Date:</b> %{x}<br>"
        "<b>SoC:</b> %{y:.1f}%<br>"
        "<b>Status:</b> %{customdata[0]}<br>"
        "<b>Index:</b> %{customdata[1]}<br>"
        "<b>Soc Diff:</b> %{customdata[2]:.2f}<br>"
        "<b>next Soc Diff:</b> %{customdata[3]:.2f}"
        "<extra></extra>"
    ),
    customdata=list(zip(*(pdf[name] for name in hover_columns))),
)

# Assemble the figure
fig = go.Figure()
fig.add_trace(soc_line)
fig.add_trace(status_markers)

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified",
)

fig.show()


### STELLANTIS

In [None]:
# Load the Stellantis raw time series from S3
rss = s3.read_parquet_df_spark(
    spark,
    'raw_ts/stellantis/time_series/raw_ts_spark.parquet',
)

# Display the distinct VINs present in the dataset
vin_listing = rss.select('vin').distinct()
vin_listing.show()

In [None]:
# Two sample vehicles to inspect
vin = 'VR3UKZKXZLJ782214'
vin_2 = 'VR3UHZKXZLT129329'

# Keep one frame per VIN and expose `electricity.level.percentage` under the name `soc`
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed("electricity.level.percentage", "soc")
rss_by_vin_2 = rss.filter(rss['vin'] == vin_2).withColumnRenamed("electricity.level.percentage", "soc")

In [None]:
# Compute the charging phases for both VINs and mark the results for caching
rss_by_vin_with_idx = compute_charge_idx(rss_by_vin, 'stellantis').cache()
rss_by_vin_with_idx_2 = compute_charge_idx(rss_by_vin_2, 'stellantis').cache()

# count() is an action: it triggers the computation of each cached frame
rss_by_vin_with_idx.count()
rss_by_vin_with_idx_2.count()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go


# Bring the columns needed for the plot and its hover box into pandas
pdf = rss_by_vin_with_idx.select(
    "date", "soc", "charging_status", 'total_soc_diff', 'charging_status_idx',
    'soc_diff', 'prev_phase', 'next_phase', 'direction', 'direction_change',
    'charging_status_change', 'time_gap_minutes',
).toPandas()

# One marker colour per charging status
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray",
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# (label, column, format suffix) for every entry of the hover box.
# The customdata index of each entry is derived from its position in this
# list, so a label can never point at a neighbouring column or at an index
# past the end of the customdata tuple.
hover_fields = [
    ("Status", "charging_status", ""),
    ("Index", "charging_status_idx", ""),
    ("Soc Diff", "soc_diff", ":.2f"),
    ("Prev Phase", "prev_phase", ""),
    ("Next Phase", "next_phase", ""),
    ("Total Soc Diff", "total_soc_diff", ""),
    ("Direction", "direction", ""),
    ("Direction Change", "direction_change", ""),
    ("Charging Status Change", "charging_status_change", ""),
    ("Time Gap", "time_gap_minutes", ""),
]
status_hovertemplate = (
    "<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%"
    + "".join(
        f"<br><b>{label}:</b> %{{customdata[{position}]{fmt}}}"
        for position, (label, _, fmt) in enumerate(hover_fields)
    )
    + "<extra></extra>"
)
# One tuple per row, in the same order as hover_fields
status_customdata = list(zip(*(pdf[column] for _, column, _ in hover_fields)))

# Figure
fig = go.Figure()

# 1. Continuous black SoC line
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>"
))

# 2. Status-coloured markers carrying the detailed hover information
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False,
    hovertemplate=status_hovertemplate,
    customdata=status_customdata
))

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()


In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go


# Bring the columns needed for the plot and its hover box into pandas
pdf = rss_by_vin_with_idx_2.select(
    "date", "soc", "charging_status", 'total_soc_diff', 'charging_status_idx',
    'soc_diff', 'prev_phase', 'next_phase', 'direction', 'direction_change',
    'charging_status_change', 'time_gap_minutes',
).toPandas()

# One marker colour per charging status
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray",
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# (label, column, format suffix) for every entry of the hover box.
# The customdata index of each entry is derived from its position in this
# list, so a label can never point at a neighbouring column or at an index
# past the end of the customdata tuple.
hover_fields = [
    ("Status", "charging_status", ""),
    ("Index", "charging_status_idx", ""),
    ("Soc Diff", "soc_diff", ":.2f"),
    ("Prev Phase", "prev_phase", ""),
    ("Next Phase", "next_phase", ""),
    ("Total Soc Diff", "total_soc_diff", ""),
    ("Direction", "direction", ""),
    ("Direction Change", "direction_change", ""),
    ("Charging Status Change", "charging_status_change", ""),
    ("Time Gap", "time_gap_minutes", ""),
]
status_hovertemplate = (
    "<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%"
    + "".join(
        f"<br><b>{label}:</b> %{{customdata[{position}]{fmt}}}"
        for position, (label, _, fmt) in enumerate(hover_fields)
    )
    + "<extra></extra>"
)
# One tuple per row, in the same order as hover_fields
status_customdata = list(zip(*(pdf[column] for _, column, _ in hover_fields)))

# Figure
fig = go.Figure()

# 1. Continuous black SoC line
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)",
    hovertemplate="<b>Date:</b> %{x}<br><b>SoC:</b> %{y:.1f}%<extra></extra>"
))

# 2. Status-coloured markers carrying the detailed hover information
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False,
    hovertemplate=status_hovertemplate,
    customdata=status_customdata
))

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()
