# SoH estimation experiments on Ford vehicles

Method used: `SoH = battery_energy / (SoC * capacity)`



In [None]:
from core.s3.s3_utils import S3Service
from core.s3.settings import S3Settings
from core.spark_utils import create_spark_session
import pandas as pd
import plotly.express as px
from core.stats_utils import *
import numpy as np


In [None]:
# S3 credentials are read from the project settings object.
settings = S3Settings()

# Spark session created with the S3 key/secret taken from the settings.
spark = create_spark_session(settings.S3_KEY, settings.S3_SECRET)

# Service object used for the S3 reads in this notebook.
s3 = S3Service()


## Data import

In [None]:
from core.sql_utils import get_connection

# Model name, version and battery capacity of every vehicle whose make is 'ford'.
# NOTE(review): the query starts from vehicle_data; if that table can hold
# several rows per vehicle, db_info will contain duplicated VINs and the later
# merge on 'vin' will multiply rows — confirm against the schema.
with get_connection() as con:
    cursor = con.cursor()
    try:
        # DB-API `execute` takes (operation[, parameters]); the connection must
        # not be passed as the second argument.
        cursor.execute("""SELECT v.vin, vm.model_name, vm.version, CAST(b.capacity AS float) FROM vehicle_data vd
        left join vehicle v
        on v.id = vd.vehicle_id
        left join vehicle_model vm
        on vm.id=vehicle_model_id
        left join battery b
        on b.id=vm.battery_id
        left join make m
        on m.id=vm.make_id
        where m.make_name='ford';""")
        db_info = pd.DataFrame(cursor.fetchall(), columns=["vin", "model_name", "version", "capacity"])
    finally:
        # Release the cursor even if the query or the fetch fails.
        cursor.close()


In [None]:
# Raw Ford time series, loaded from S3 as a Spark DataFrame.
df_ford = s3.read_parquet_df_spark(
    spark,
    'raw_ts/ford/time_series/raw_ts_spark.parquet',
)

In [None]:
# Hand-picked VINs; currently only referenced by the commented-out filter below.
vin_list = [
    'WF0AXXTTRBPU07135',
    'WF0AXXTTRBPU07175',
    'WF0AXXTTRBPU07329',
    'WF0AXXTTRBPU07408',
    'WF0AXXTTRBPU07427',
    'WF0AXXTTRBPU07195',
    'WF0AXXTTRBPU07256',
    'WF0AXXTTRBPK70767',
    'WF0AXXTTRBPK69618',
    'WF0AXXTTRBPK69633',
    'WF0AXXTTRBPK69642',
    'WF0AXXTTRBPK69652',
    'WF0AXXTTRBPK69622',
    'WF0AXXTTRBPK69637',
    'WF0AXXTTRBPK69598',
    'WF0AXXTTRBPK69958',
]
# df_transit = df_ford.where(df_ford["vin"].isin(vin_list)).toPandas()
# df_transit = df_transit.merge(db_info, on='vin', how='left')

In [None]:
# Bring the Spark frame into pandas, attach the per-VIN database info and
# normalise the odometer column name.
raw_ford = (
    df_ford.toPandas()
    .merge(db_info, on='vin', how='left')
    .rename(columns={"odometer": "ODOMETER"})
)
# Cast the measurement columns to float.
raw_ford = raw_ford.astype(
    dict.fromkeys(['battery_energy', 'battery_level', 'ODOMETER', 'capacity'], float)
)

In [None]:
# `sanity_check` is not defined in this notebook; presumably it comes from the
# `core.stats_utils` star import — verify what it reports.
sanity_check(raw_ford)

In [None]:
## compute charging
def detect_charging(df, level_col="battery_level", ts_col="date"):
    """Flag each row of a time series as charging or not from the level trend.

    Rows are sorted by ``ts_col`` and the index is reset. A rise in
    ``level_col`` relative to the previous row switches the state to charging,
    a drop switches it off, and an unchanged (or missing) level keeps the
    previous state. The state before the first change is "not charging".

    The diff is taken between consecutive rows of the whole frame, so ``df``
    should hold a single vehicle's series; rows from different vehicles would
    otherwise be compared with each other.

    Args:
        df: Frame containing ``level_col`` and ``ts_col``. Not modified.
        level_col: Column holding the battery level.
        ts_col: Column used to order the rows.

    Returns:
        A sorted copy of ``df`` with two extra columns: ``delta`` (row-to-row
        level change, 0 for the first row) and ``charging`` (bool).
    """
    ordered = df.sort_values(ts_col).reset_index(drop=True)
    ordered["delta"] = ordered[level_col].diff().fillna(0)

    # +1 on a rise, -1 on a drop; flat rows become NaN so that the last known
    # direction can be carried forward instead of looping row by row.
    direction = np.sign(ordered["delta"])
    carried = direction.where(direction != 0).ffill()

    # Rows before the first level change have no direction yet: not charging.
    ordered["charging"] = carried.fillna(-1).gt(0)
    return ordered

def add_phase_id(df):
    """Number consecutive runs of identical ``charging`` values.

    A new phase starts on every row whose ``charging`` value differs from the
    previous row's; the first row always opens phase 1.

    Args:
        df: Frame with a ``charging`` column. Not modified.

    Returns:
        A copy of ``df`` with an integer ``phase_id`` column.
    """
    flags = df['charging']
    starts_new_phase = flags.ne(flags.shift(1))
    return df.assign(phase_id=starts_new_phase.cumsum())

# Charging detection diffs consecutive rows, so it must run on one vehicle's
# series at a time: on the whole frame, rows of different VINs get interleaved
# by date and their battery levels are compared with each other.
# phase_id therefore restarts at 1 for each VIN; use it together with "vin".
raw_ford = pd.concat(
    [
        add_phase_id(detect_charging(vehicle_rows))
        for _, vehicle_rows in raw_ford.groupby("vin", sort=False, dropna=False)
    ],
    ignore_index=True,
)

In [None]:
# Preview the first rows of the frame.
raw_ford.head()

## Time Series

### Battery energy EDA
We will use the battery energy to estimate the SoH.  
Let's visualize the battery energy to understand it better.

In [None]:
# Distribution of the battery energy at each battery level, coloured by capacity.
px.box(raw_ford, x="battery_level", y="battery_energy", color="capacity")

The plot above shows that the battery energy does not start at 0 kWh.  
To properly estimate the SoH as `battery_energy / expected_battery_energy`, we first need to estimate the expected battery energy.  
We will express it as the maximum battery energy recorded at a given SoC.  
This effectively assumes that, at each SoC, at least one of the tracked vehicles has a battery at 100% SoH, so that the expected energy was actually observed.  
This should become more accurate as more vehicles are added to the dataset.

In [None]:
# Reference energy per (capacity, battery level): the absolute maximum and the
# 95th percentile of the recorded battery energy.
max_energy = (
    raw_ford
    .groupby(["capacity", "battery_level"])
    .agg(
        max_battery_energy=pd.NamedAgg(column="battery_energy", aggfunc="max"),
        # The column is named *_095, so take the 0.95 quantile (it was 0.9).
        max_battery_energy_095=pd.NamedAgg(column="battery_energy", aggfunc=lambda x: x.quantile(0.95)),
    )
    .reset_index(drop=False)
)
max_energy

In [None]:
# Reference (quantile-based) energy as a function of the battery level.
px.scatter(max_energy, x="battery_level", y="max_battery_energy_095", color="capacity")

In [None]:
# VIN with the most rows; `idxmax` already finds it, no sort needed.
most_common_vin = raw_ford.groupby("vin").size().idxmax()
# A boolean mask avoids interpolating the VIN into a query string, and the
# explicit copy lets columns be added to `ts` without touching raw_ford.
ts = raw_ford[raw_ford["vin"] == most_common_vin].copy()

In [None]:
# Battery energy over time for the selected vehicle.
px.scatter(ts, x='date', y='battery_energy')

In [None]:
# px.scatter(ts, x="date", y="soc", title=f"{vin}")

In [None]:
# px.scatter(ts, x="date", y="estimated_range", title=f"{vin}")

In [None]:
# px.scatter(ts, x="date", y="max_range", title=f"{most_common_vin}")

In [None]:
selected_column = "battery_energy"

# Correlation of every numeric column with the selected one, strongest first.
corr = raw_ford.corr(numeric_only=True)
selected_corr = corr[[selected_column]].sort_values(by=selected_column, ascending=False)

# Heat map of that single correlation column.
px.imshow(selected_corr, title=f"Correlation Matrix for {selected_column}")


## Reducing dependency factors


In [None]:
# Keep only the rows whose odometer reading is not 0.
tss = raw_ford[raw_ford["ODOMETER"] != 0]
ts = ts[ts["ODOMETER"] != 0]

In [None]:
# SoH = battery_energy / SoC / capacity. `assign` returns a new frame instead of
# writing into the filtered frames through `.loc`, which raised
# SettingWithCopyWarning.
tss = tss.assign(SOH=tss['battery_energy'] / tss['battery_level'] / tss['capacity'])
ts = ts.assign(SOH=ts['battery_energy'] / ts['battery_level'] / ts['capacity'])

In [None]:
fig = px.scatter(
    # `sample` raises when asked for more rows than available, so cap the size.
    ts.sample(min(5000, len(ts))),
    x="battery_level",
    y="SOH",
    color="capacity",
    height=600,
    # The x axis is the battery level, not the mileage.
    title="State-of-Health (SoH) vs battery level",
    trendline="ols",
    trendline_scope="overall",
    hover_data=["vin"]
)

fig.show()

In [None]:
# SoH vs mileage, restricted to mid-range battery levels.
# The sample size is capped because `sample` raises when the frame is smaller.
px.scatter(
    ts.sample(min(5000, len(ts))).query("battery_level > 0.45").query("battery_level < 0.95"),
    x="ODOMETER",
    y="SOH",
)

### Dependency on charging?


In [None]:
# Same view as above without sampling, coloured by the charging flag.
px.scatter(
    ts.query("battery_level > 0.45").query("battery_level < 0.95"),
    x="ODOMETER",
    y="SOH",
    color="charging",
)

## SoH estimation

In [None]:
raw_tss_soh = (
    raw_ford
    # NOTE(review): presumably attaches the two reference-energy columns of
    # max_energy, matched on (capacity, battery_level) — confirm left_merge's contract.
    .pipe(left_merge, max_energy, ["capacity", "battery_level"], ["capacity", "battery_level"], ["max_battery_energy", "max_battery_energy_095"])
    # Direct estimate, in percent: energy over (SoC * capacity). The SoC term
    # was missing, which made `soh` scale with the battery level.
    .eval("soh = battery_energy / (capacity * battery_level) * 100")
    # Estimate relative to the reference energy for the same capacity and SoC.
    .eval("soh_095 = battery_energy / max_battery_energy_095 * 100")
)
raw_tss_soh.head(5)

In [None]:
def _most_frequent(values):
    """Return the most frequent value, or the first value when there is no mode."""
    modes = values.mode()
    return values.iloc[0] if modes.empty else modes.iloc[0]


# One row per (vin, phase): median SoH estimates, last odometer reading and the
# dominant model name / version / capacity.
soh_per_vehicle = (
    raw_tss_soh
    .groupby(["vin", "phase_id"])
    .agg({
        "soh": "median",
        "soh_095": "median",
        "ODOMETER": "last",
        "model_name": _most_frequent,
        "version": _most_frequent,
        "capacity": _most_frequent,
    })
    .reset_index()
)



In [None]:
# (rows, columns) of the aggregated frame.
soh_per_vehicle.shape

In [None]:
fig = px.scatter(
    # `sample` raises when asked for more rows than available, so cap the size.
    soh_per_vehicle.sample(min(10000, len(soh_per_vehicle))),
    x="ODOMETER",
    y="soh_095",
    color="capacity",
    height=600,
    title="Average State-of-Health (SoH) vs Mileage",
    trendline="ols",
    trendline_scope="overall",
    hover_data=["vin"]
)
fig.update_layout(
    xaxis_title="Latest mileage (km)",
    yaxis_title="SoH (%)",
    # Points are coloured by capacity, not by model.
    legend_title="Capacity",
)
fig.update_traces(line=dict(color='black', dash='dash'))

fig.show()

In [None]:
raw_soh_filtered = (
    raw_ford
    # Keep mid-range battery levels only.
    .query("battery_level > 0.45 and battery_level < 0.95")
    .eval("SOH = battery_energy / (capacity * battery_level)")
    # Number of remaining rows in each (vin, phase); short phases are dropped.
    .assign(charge_size=lambda df: df.groupby(["vin", "phase_id"]).transform("size"))
    .query("charge_size > 10")
)

In [None]:
# One row per VIN. The aggregated columns must use the names that exist in
# raw_soh_filtered ("SOH", "ODOMETER", "model_name"); the previous keys
# ("soh", "odometer", "model") are not columns of that frame.
soh_per_vehicle = (
    raw_soh_filtered
    .groupby("vin")
    .agg({
        "SOH": "mean",
        "ODOMETER": "max",
        # `mode` can return several values, so reduce it to a single one.
        "model_name": lambda names: names.mode().iloc[0] if not names.mode().empty else names.iloc[0],
        "date": "max",
        "battery_energy": "max",
    })
    .reset_index()
)

In [None]:
# Pairwise correlations between the float columns of the filtered frame.
raw_soh_filtered.select_dtypes(float).corr()

In [None]:
# Rows of a single, hard-coded VIN for a closer look.
soh_vin = raw_soh_filtered.loc[raw_soh_filtered['vin'].eq('WF0TK3SU4MMA37317')]

In [None]:

# SoH over time for that vehicle, split by charging state.
px.scatter(
    soh_vin,
    x='date',
    y='SOH',
    color='charging',
)

It seems we can use both charging and discharging phases to compute the SoH.   
An accurate battery capacity is essential: an incorrect value yields SoH estimates that are too high or too low.

## Processed SoH

In [None]:
def make_charge_levels_presentable(results):
    """Clamp negative charge levels to 0 in whichever level columns exist.

    Only the columns among ``level_1``, ``level_2`` and ``level_3`` that are
    present in ``results`` are processed, so a frame holding a subset of them
    no longer raises ``KeyError``.

    Args:
        results: Frame that may contain charge-level columns. Modified in
            place when at least one level column is present.

    Returns:
        The same ``results`` object.
    """
    level_columns = [col for col in ("level_1", "level_2", "level_3") if col in results.columns]
    # If none of the level columns exist, return the results as is.
    if not level_columns:
        return results
    negative_charge_levels = results[level_columns].lt(0)
    nb_negative_levels = negative_charge_levels.sum().sum()
    if nb_negative_levels > 0:
        # The percentage is relative to the number of rows, printed with two decimals.
        print(f"There are {nb_negative_levels}({100*nb_negative_levels/len(results):.2f}%) negative charge levels, setting them to 0.")
    results[level_columns] = results[level_columns].mask(negative_charge_levels, 0)
    return results

In [None]:
UPDATE_FREQUENCY = pd.Timedelta(days=7)

def agg_results_by_update_frequency(results:DF) -> DF:
    """Aggregate results into one row per vehicle and ``UPDATE_FREQUENCY`` bucket.

    Each ``date`` is parsed, floored to ``UPDATE_FREQUENCY``, stripped of its
    timezone and reduced to a calendar date. Rows are then grouped by
    ``vin`` and that bucketed date.

    Args:
        results: Frame with ``vin``, ``date``, ``ODOMETER`` and ``SOH``
            columns. It is not modified; the bucketed dates are written to a
            copy.

    Returns:
        A frame with columns ``vin``, ``date`` (bucket), ``ODOMETER`` (last
        value of the bucket, in the input's row order) and ``SOH`` (median of
        the bucket).
    """
    bucket_start = (
        pd.to_datetime(results["date"], format='mixed')
        .dt.floor(UPDATE_FREQUENCY)
        .dt.tz_localize(None)
        .dt.date
        .astype('datetime64[ns]')
    )
    # The former level_1/2/3 defaults were dropped: the aggregation below only
    # keeps ODOMETER and SOH, so they never reached the output.
    return (
        results
        .assign(date=bucket_start)
        .groupby(["vin", "date"], observed=True, as_index=False)
        .agg(
            ODOMETER=pd.NamedAgg("ODOMETER", "last"),
            SOH=pd.NamedAgg("SOH", "median"),
        )
    )

In [None]:
def make_soh_presentable_per_vehicle(df):
    """Clean one vehicle's SoH series: drop outliers, then apply ``force_decay``.

    - All SoH values missing: the frame is returned untouched.
    - More than 3 SoH values: only the rows selected by
      ``mask_out_outliers_by_interquartile_range`` are kept (in a copy).
    - At least 2 SoH values left: ``SOH`` is replaced by the output of
      ``force_decay`` applied to the ``SOH`` and ``ODOMETER`` columns.

    NOTE(review): both helpers are defined outside this notebook; the mask is
    used as "True = keep the row" — confirm against their implementation.

    Raises:
        AssertionError: if the outlier mask selects no row at all.
    """
    soh_values = df["SOH"]
    if soh_values.isna().all():
        return df

    if soh_values.count() > 3:
        keep_mask = mask_out_outliers_by_interquartile_range(soh_values)
        assert keep_mask.any(), f"There seems to be only outliers???:\n{df['SOH']}."
        df = df[keep_mask].copy()

    # Re-count after filtering: fewer than 2 values are left as they are.
    if df["SOH"].count() >= 2:
        df["SOH"] = force_decay(df[["SOH", "ODOMETER"]])
    return df



In [None]:

df_soh_final = (
    soh_per_vehicle
    # Infinite SoH values are treated as missing.
    .assign(SOH=lambda df: df["SOH"].replace([np.inf, -np.inf], np.nan))
    .sort_values(["vin", "date"])
    .pipe(make_charge_levels_presentable)
    .pipe(agg_results_by_update_frequency)
    # Per-vehicle clean-up; "vin" comes back as a column via reset_index.
    .groupby('vin', observed=True)
    .apply(make_soh_presentable_per_vehicle, include_groups=False)
    .reset_index(level=0)
    .sort_values(["vin", "date"])
)

In [None]:
# Final SoH against mileage, one colour per vehicle.
px.scatter(
    df_soh_final,
    x='ODOMETER',
    y='SOH',
    color='vin',
)