## EDA : Charging Status Harmonization Analysis

### Purpose
This notebook aims to harmonize the charging_status column across different OEMs by establishing standardized rules to accurately determine charging, discharging, and idle phases.

### Overview
Below, you'll be able to plot the charging status for a selected VIN to verify that it properly represents the different charging phases across all vehicle manufacturers.

## Imports, Spark session & Config

In [None]:
from core.spark_utils import create_spark_session
from core.s3.s3_utils import S3Service
from core.s3.settings import S3Settings


# Build the S3 settings once so the key and the secret passed to Spark come
# from the same settings object instead of two separate instantiations.
s3_settings = S3Settings()
spark = create_spark_session(
    s3_settings.S3_KEY,
    s3_settings.S3_SECRET,
)

# S3 helper used by the cells below to read the raw time-series parquet files.
s3 = S3Service()

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, lead, sum, when, col, signum, dense_rank
from pyspark.sql.types import DoubleType
from pyspark.sql import DataFrame as DF
from transform.processed_tss.config import SCALE_SOC

def _reassign_short_phases(df, min_duration_minutes=3):
    """
    Fold phases that are too short into the preceding long-enough phase.

    A phase keeps its own id when its `total_phase_time` is at least
    `min_duration_minutes`. Rows of any other phase inherit the id of the
    latest kept phase seen so far (ordered by `date`) for the same `vin`.
    The resulting ids are finally renumbered per `vin` with a dense rank.

    Args:
        df (DataFrame): Spark DataFrame with columns `vin`, `phase_id`,
            `date` and `total_phase_time`.
        min_duration_minutes (float): Minimum duration to keep a phase
            (in minutes).

    Returns:
        DataFrame: The input rows with `phase_id` replaced by the merged and
        renumbered id; the temporary helper columns are dropped.
    """
    per_vin = Window.partitionBy("vin")
    # Every row of the vehicle up to and including the current one.
    history = per_vin.orderBy("date").rowsBetween(Window.unboundedPreceding, 0)

    # The phase id, but only on rows that belong to a long-enough phase.
    own_id_if_valid = F.when(F.col("is_valid_phase") == 1, F.col("phase_id"))

    flagged = df.withColumn(
        "is_valid_phase",
        F.when(F.col("total_phase_time") >= min_duration_minutes, 1).otherwise(0),
    )

    # Carry the most recent valid phase id forward over the short phases.
    carried = flagged.withColumn(
        "last_valid_phase_id",
        F.last(own_id_if_valid, ignorenulls=True).over(history),
    )

    merged = carried.withColumn(
        "phase_id_updated",
        own_id_if_valid.otherwise(F.col("last_valid_phase_id")),
    )

    # Merging leaves gaps in the numbering; dense_rank closes them.
    renumbered = merged.withColumn(
        "phase_id_final",
        F.dense_rank().over(per_vin.orderBy("phase_id_updated")) - 1,
    )

    helper_columns = (
        "phase_id",
        "last_valid_phase_id",
        "is_valid_phase",
        "phase_id_updated",
    )
    return renumbered.drop(*helper_columns).withColumnRenamed(
        "phase_id_final", "phase_id"
    )

def compute_charge_idx(
    make, tss: DF, total_soc_diff_threshold: float = 0.05
) -> DF:
    """
    Derive a harmonized `charging_status` and a running segment index from SoC.

    All steps are computed per `vin`, ordered by `date`:

    1. Missing `soc` values take the previous known value; rows before the
       first known value take the next known value.
    2. `soc_diff` / `time_gap_minutes` are the SoC change and the elapsed
       minutes since the previous row; `direction` is the sign of `soc_diff`.
    3. A new `phase_id` starts on every change of `direction`. For
       'tesla-fleet-telemetry' short phases are merged by
       `_reassign_short_phases`.
    4. A phase whose summed `soc_diff` exceeds the threshold is "charging",
       below the negative threshold "discharging". Otherwise a row whose
       previous and next rows share the same non-zero direction takes that
       direction's status, else "idle". The row right before a
       charging/discharging row then takes that status as well.
    5. For every make except 'tesla-fleet-telemetry', "idle" rows are
       replaced by the previous non-idle status (the next one for leading
       rows).
    6. `charging_status_idx` increases by one each time `charging_status`
       changes.

    Args:
        make: OEM identifier; only 'tesla-fleet-telemetry' is treated
            specially.
        tss: Spark DataFrame with at least the columns `vin`, `date`, `soc`.
        total_soc_diff_threshold: Minimum absolute SoC change over a phase
            for it to count as charging/discharging, in the units of `soc`.

    Returns:
        The input DataFrame with `soc` filled and the intermediate and final
        columns added (`soc_diff`, `time_gap_minutes`, `direction`,
        `phase_id`, `total_phase_time`, `total_soc_diff`, `charging_status`,
        `charging_status_change`, `charging_status_idx`, ...).
    """
    w = Window.partitionBy("vin").orderBy("date")
    # Every row of the vehicle up to and including the current one.
    w_cumulative = w.rowsBetween(Window.unboundedPreceding, 0)
    # The current row and everything after it; with first(ignorenulls=True)
    # this yields the NEXT known value. (A descending window with the default
    # frame would instead return the latest value of the whole series.)
    w_following = w.rowsBetween(Window.currentRow, Window.unboundedFollowing)

    # Forward fill SoC, falling back to a backward fill for leading nulls.
    tss = tss.withColumn(
        "soc",
        F.coalesce(
            F.last("soc", ignorenulls=True).over(w),
            F.first("soc", ignorenulls=True).over(w_following),
        ),
    )

    # SoC change relative to the last known SoC strictly before this row.
    tss = tss.withColumn(
        "soc_diff",
        F.when(
            F.col("soc").isNotNull(),
            F.col("soc")
            - F.last("soc", ignorenulls=True).over(
                w.rowsBetween(Window.unboundedPreceding, -1)
            ),
        ).otherwise(None),
    )

    df = tss.withColumn("prev_date", F.lag("date").over(w))
    df = df.withColumn(
        "time_gap_minutes",
        (F.unix_timestamp("date") - F.unix_timestamp("prev_date")) / 60,
    )

    df = df.withColumn(
        "direction_raw",
        F.when(F.col("soc_diff").isNull(), None).otherwise(F.signum("soc_diff")),
    )

    # Direction forward fill (only null directions are filled; 0 is kept).
    df = df.withColumn(
        "direction",
        F.last("direction_raw", ignorenulls=True).over(w_cumulative),
    )

    df = df.withColumn(
        "direction_change",
        F.when(F.col("direction") != F.lag("direction").over(w), 1).otherwise(0),
    )

    # Running count of direction changes = id of the current phase.
    df = df.withColumn(
        "phase_id",
        F.sum("direction_change").over(w_cumulative),
    )

    w_phase = Window.partitionBy("vin", "phase_id")

    df = df.withColumn("total_phase_time", F.sum("time_gap_minutes").over(w_phase))

    if make == 'tesla-fleet-telemetry':
        # Reassign short phases to the previous valid phase (especially
        # useful for tesla-fleet-telemetry noise).
        # NOTE(review): total_phase_time is not recomputed after the merge.
        df = _reassign_short_phases(df)

    # phase_id may have been rewritten above, so aggregate over it only now.
    w_phase = Window.partitionBy("vin", "phase_id")

    df = df.withColumn("total_soc_diff", F.sum("soc_diff").over(w_phase))

    # Direction of the neighbouring rows (row-level, not phase-level).
    df = df.withColumn("prev_phase", F.lag("direction").over(w)).withColumn(
        "next_phase", F.lead("direction").over(w)
    )

    df = df.withColumn(
        "charging_status",
        F.when(F.col("total_soc_diff") > total_soc_diff_threshold, "charging")
        .when(F.col("total_soc_diff") < -total_soc_diff_threshold, "discharging")
        .when(
            (F.col("prev_phase") == F.col("next_phase"))
            & (F.col("prev_phase") > 0),
            "charging",
        )
        .when(
            (F.col("prev_phase") == F.col("next_phase"))
            & (F.col("prev_phase") < 0),
            "discharging",
        )
        .otherwise("idle"),
    )

    df = df.withColumn("next_status", F.lead("charging_status").over(w))

    # Give the row right before a charging/discharging row the same status so
    # that the segment includes its starting SoC and yields the correct SoC
    # diff.
    df = df.withColumn(
        "charging_status",
        F.when(F.col("next_status") == 'charging', "charging")
        .when(F.col("next_status") == "discharging", "discharging")
        .otherwise(F.col("charging_status")),
    )

    # Clean the charging status for non tesla-fleet-telemetry makes: idle rows
    # are blanked and then filled from the neighbouring non-idle status.
    if make != 'tesla-fleet-telemetry':
        df = df.withColumn(
            "charging_status",
            F.when(F.col("charging_status") == "idle", None).otherwise(
                F.col("charging_status")
            ),
        )

        df = df.withColumn(
            "charging_status",
            F.coalesce(
                F.last("charging_status", ignorenulls=True).over(w),
                F.first("charging_status", ignorenulls=True).over(w_following),
            ),
        )

    df = df.withColumn(
        "charging_status_change",
        F.when(
            F.col("charging_status") != F.lag("charging_status").over(w), 1
        ).otherwise(0),
    )

    # Running count of status changes = index of the current status segment.
    df = df.withColumn(
        "charging_status_idx",
        F.sum("charging_status_change").over(w_cumulative),
    )

    return df

## Plots by OEMs

### MERCEDES-BENZ

In [None]:
# Load the raw Mercedes-Benz time series and keep the rows of a single vehicle.
vin = 'W1N9N0CB8SJ120589'
rss = s3.read_parquet_df_spark(spark, 'raw_ts/mercedes-benz/time_series/raw_ts_spark.parquet')
print(rss.columns)

# Rename the source column "battery_level" to "soc", the name compute_charge_idx reads.
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed("battery_level", "soc")
rss_by_vin.show()

In [None]:
# compute_charge_idx is declared as (make, tss, ...); pass both by keyword so
# the OEM name and the DataFrame cannot be swapped.
rss_by_vin_with_idx = compute_charge_idx(make='mercedes-benz', tss=rss_by_vin)
# Cache the result and trigger an action so the following cells reuse it.
rss_by_vin_with_idx = rss_by_vin_with_idx.cache()
rss_by_vin_with_idx.count()

In [None]:
# Inspect the 22 most recent rows with every intermediate column used to
# derive the charging status.
debug_columns = [
    'vin', 'date', 'odometer', 'soc', 'soc_diff', 'time_gap_minutes',
    'charging_status', 'direction', "direction_change", 'direction_raw',
    'total_phase_time', 'total_soc_diff', 'prev_phase', 'next_phase',
    'phase_id', 'charging_status_change', 'charging_status_idx',
]
rss_by_vin_with_idx.toPandas().sort_values(by='date')[debug_columns].tail(22)

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go

# Step 1: collect the columns needed for the plot into pandas. Every row of
# the selected VIN is collected, so keep the Spark frame filtered to one vehicle.
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff').toPandas()

# Cleaning: parse dates, drop rows without SoC and make SoC numeric.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# Spark does not guarantee the row order of a collected DataFrame; sort
# chronologically so the line trace never jumps back in time.
pdf = pdf.sort_values("date")

# Colour mapping for each charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# Plot
fig = go.Figure()

# 1. Continuous black line for the SoC curve.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)"
))

# 2. Markers coloured by charging status.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False
))

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()


### BMW

In [None]:
# Load the raw BMW time series and keep the rows of a single vehicle.
vin = 'WBY1Z21000V306668'
rss = s3.read_parquet_df_spark(spark, 'raw_ts/bmw/time_series/raw_ts_spark.parquet')
print(rss.columns)

# Rename the source column "soc_hv_header" to "soc", the name compute_charge_idx reads.
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed("soc_hv_header", "soc")
rss_by_vin.show()

In [None]:
# compute_charge_idx is declared as (make, tss, ...); pass both by keyword so
# the OEM name and the DataFrame cannot be swapped.
rss_by_vin_with_idx = compute_charge_idx(make='bmw', tss=rss_by_vin)
# Cache the result and trigger an action so the following cells reuse it.
rss_by_vin_with_idx = rss_by_vin_with_idx.cache()
rss_by_vin_with_idx.count()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go

# Step 1: collect the columns needed for the plot into pandas. Every row of
# the selected VIN is collected, so keep the Spark frame filtered to one vehicle.
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff').toPandas()

# Cleaning: parse dates, drop rows without SoC and make SoC numeric.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# Spark does not guarantee the row order of a collected DataFrame; sort
# chronologically so the line trace never jumps back in time.
pdf = pdf.sort_values("date")

# Colour mapping for each charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# Plot
fig = go.Figure()

# 1. Continuous black line for the SoC curve.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)"
))

# 2. Markers coloured by charging status.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False
))

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()


### RENAULT

In [None]:
# Load the raw Renault time series and keep the rows of a single vehicle.
vin = 'VF1RCB00168403987'
rss = s3.read_parquet_df_spark(spark, 'raw_ts/renault/time_series/raw_ts_spark.parquet')
print(rss.columns)

# Rename the source column "battery_level" to "soc", the name compute_charge_idx reads.
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed("battery_level", "soc")
rss_by_vin.show()

In [None]:
# compute_charge_idx is declared as (make, tss, ...); pass both by keyword so
# the OEM name and the DataFrame cannot be swapped.
rss_by_vin_with_idx = compute_charge_idx(make='renault', tss=rss_by_vin)
# Cache the result and trigger an action so the following cells reuse it.
rss_by_vin_with_idx = rss_by_vin_with_idx.cache()
rss_by_vin_with_idx.count()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go

# Step 1: collect the columns needed for the plot into pandas. Every row of
# the selected VIN is collected, so keep the Spark frame filtered to one vehicle.
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff').toPandas()

# Cleaning: parse dates, drop rows without SoC and make SoC numeric.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# Spark does not guarantee the row order of a collected DataFrame; sort
# chronologically so the line trace never jumps back in time.
pdf = pdf.sort_values("date")

# Colour mapping for each charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# Plot
fig = go.Figure()

# 1. Continuous black line for the SoC curve.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)"
))

# 2. Markers coloured by charging status.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False
))

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()


### STELLANTIS

In [None]:
# Load the raw time series of the selected make and keep the rows of a single vehicle.
make = 'stellantis'
vin = 'VR3UHZKXZLT073946'
rss = s3.read_parquet_df_spark(spark, f'raw_ts/{make}/time_series/raw_ts_spark.parquet')
print(rss.columns)

# Rename the source SoC column to "soc", the name compute_charge_idx reads.
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed('electricity.level.percentage', 'soc')
rss_by_vin.show()

In [None]:
# compute_charge_idx is declared as (make, tss, ...); pass both by keyword so
# the OEM name and the DataFrame cannot be swapped.
rss_by_vin_with_idx = compute_charge_idx(make=make, tss=rss_by_vin)
# Cache the result and trigger an action so the following cells reuse it.
rss_by_vin_with_idx = rss_by_vin_with_idx.cache()
rss_by_vin_with_idx.count()

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go

# Step 1: collect the columns needed for the plot into pandas. Every row of
# the selected VIN is collected, so keep the Spark frame filtered to one vehicle.
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff').toPandas()

# Cleaning: parse dates, drop rows without SoC and make SoC numeric.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# Spark does not guarantee the row order of a collected DataFrame; sort
# chronologically so the line trace never jumps back in time.
pdf = pdf.sort_values("date")

# Colour mapping for each charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# Plot
fig = go.Figure()

# 1. Continuous black line for the SoC curve.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)"
))

# 2. Markers coloured by charging status.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False
))

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()


### TESLA-FLEET-TELEMETRY

In [None]:
# Load the raw time series of the selected make and keep the rows of a single vehicle.
make = 'tesla-fleet-telemetry'
vin = '5YJSA7E69RF532230'
rss = s3.read_parquet_df_spark(spark, f'raw_ts/{make}/time_series/raw_ts_spark.parquet')
print(rss.columns)

# Rename the source SoC column to "soc", the name compute_charge_idx reads.
# NOTE(review): this is the same column name as in the Stellantis cell -
# confirm it exists in the Tesla schema (the rename is a no-op otherwise).
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed('electricity.level.percentage', 'soc')
rss_by_vin.show()

In [None]:
# compute_charge_idx is declared as (make, tss, ...); pass both by keyword so
# the OEM name and the DataFrame cannot be swapped.
rss_by_vin_with_idx = compute_charge_idx(make=make, tss=rss_by_vin)
# Cache the result and trigger an action so the following cells reuse it.
rss_by_vin_with_idx = rss_by_vin_with_idx.cache()
rss_by_vin_with_idx.count()

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, lead, sum, when, col, signum, dense_rank
from pyspark.sql.types import DoubleType
from pyspark.sql import DataFrame as DF
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Collect the columns needed for the plot into pandas. Every row of the
# selected VIN is collected, so keep the Spark frame filtered to one vehicle.
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff').toPandas()

# Cleaning: parse dates, drop rows without SoC and make SoC numeric.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# Spark does not guarantee the row order of a collected DataFrame; sort
# chronologically so the line trace never jumps back in time.
pdf = pdf.sort_values("date")

# Colour mapping for each charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# Plot
fig = go.Figure()

# 1. Continuous black line for the SoC curve.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)"
))

# 2. Markers coloured by charging status.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False
))

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()

### VOLKSWAGEN

In [None]:
# Load the raw time series of the selected make and keep the rows of a single vehicle.
make = 'volkswagen'
vin = 'WVGZZZE20RE040540'
rss = s3.read_parquet_df_spark(spark, f'raw_ts/{make}/time_series/raw_ts_spark.parquet')
print(rss.columns)

# Rename the source SoC column to "soc", the name compute_charge_idx reads.
rss_by_vin = rss.filter(rss['vin'] == vin).withColumnRenamed('energyLevelPrimaryEnginePercentage', 'soc')
rss_by_vin.show()

In [None]:
# compute_charge_idx is declared as (make, tss, ...); pass both by keyword so
# the OEM name and the DataFrame cannot be swapped.
rss_by_vin_with_idx = compute_charge_idx(make=make, tss=rss_by_vin)
# Cache the result and trigger an action so the following cells reuse it.
rss_by_vin_with_idx = rss_by_vin_with_idx.cache()
rss_by_vin_with_idx.count()

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, lead, sum, when, col, signum, dense_rank
from pyspark.sql.types import DoubleType
from pyspark.sql import DataFrame as DF
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Collect the columns needed for the plot into pandas. Every row of the
# selected VIN is collected, so keep the Spark frame filtered to one vehicle.
pdf = rss_by_vin_with_idx.select("date", "soc", "charging_status", 'total_soc_diff').toPandas()

# Cleaning: parse dates, drop rows without SoC and make SoC numeric.
pdf["date"] = pd.to_datetime(pdf["date"])
pdf = pdf.dropna(subset=["soc"])
pdf["soc"] = pdf["soc"].astype(float)
# Spark does not guarantee the row order of a collected DataFrame; sort
# chronologically so the line trace never jumps back in time.
pdf = pdf.sort_values("date")

# Colour mapping for each charging status.
status_color_map = {
    "charging": "green",
    "discharging": "red",
    "idle": "gray"
}
pdf["color"] = pdf["charging_status"].map(status_color_map)

# Plot
fig = go.Figure()

# 1. Continuous black line for the SoC curve.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="lines",
    line=dict(color="black"),
    name="SoC (ligne)"
))

# 2. Markers coloured by charging status.
fig.add_trace(go.Scatter(
    x=pdf["date"],
    y=pdf["soc"],
    mode="markers",
    marker=dict(color=pdf["color"], size=6),
    name="Status (points)",
    showlegend=False
))

# Layout
fig.update_layout(
    title="Évolution du SoC avec statut de charge",
    xaxis_title="Date",
    yaxis_title="SoC (%)",
    hovermode="x unified"
)

fig.show()