# soh estimation experimentation of Mercedes vehicles
In this notebook, we try to express the SoH at any point in time as the energy the battery actually holds divided by the energy it would hold at the same charge level if it had 100% SoH.  
```
soh = charging.battery_energy / (charging.battery_level * model_battery_capacity) 
```

This method is based on the assumption that the variable `charging.battery_energy` represents the actual energy present in the battery rather than simply `charging.battery_level * model_battery_capacity`.

## Imports

In [None]:
import logging
from datetime import datetime as DT
from datetime import timedelta as TD
from dateutil import parser
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from dotenv import load_dotenv
import os
import random

import numpy as np
from rich import print
import pandas as pd
from pandas import Series
from pandas import DataFrame as DF
import plotly.express as px

from core.s3_utils import S3_Bucket
from jobs.base_jobs.job_interval import Jobinterval
from core.constants import *
from core.time_series_processing import preprocess_date
from jobs.high_mobility.constants import *

## Setup

In [None]:
# Versions (the "Type" column of the fleet info) for which a battery capacity is known.
KNOW_MODEL_TYPES = [
    "R90 Life (batterijkoop) 5d",
    "R135 Edition One (batterijkoop) 5d",
    "R135 Intens (batterijkoop) 5d",
    "R135",
]
# Battery capacity in kWh, looked up as [model][version]; every known ZOE version shares the same value.
KWH_BATTERY_CAPCITY_DICT = {
    "ZOE": dict.fromkeys(KNOW_MODEL_TYPES, 52),
}

In [None]:
# Fleet description: one row per vehicle, indexed by VIN (the "vin" column is kept as well).
fleet_info = pd.read_csv(
    "../ayvens/fleet_info.csv",
    usecols=["VIN", "Make", "Model", "Type", "Autonomie"],
    dtype={"Make": "string"},
)
lowercase_make = fleet_info["Make"].str.lower()
fleet_info = (
    fleet_info
    .rename(columns={"VIN": "vin"})
    # Makes are lowercased and "mercedes" is spelled "mercedes-benz"; other makes are only lowercased.
    .assign(Make=lowercase_make.map({"mercedes": "mercedes-benz"}).fillna(lowercase_make))
    .set_index("vin", drop=False)
)

# Dummy soh parameters: one random draw per make / per (make, version) group, broadcast to its vehicles.
# A dedicated seeded generator keeps the draws identical across "Restart & Run All"
# without touching the state of the global `random` module.
DUMMY_SOH_SEED = 42
dummy_rng = random.Random(DUMMY_SOH_SEED)
fleet_info["maker_offset"] = (
    fleet_info
    .groupby("Make")["vin"]
    .transform(lambda vins: dummy_rng.uniform(-1, 0.1))
)
fleet_info["model_offset"] = (
    fleet_info
    .groupby(["Make", "Type"])["vin"]
    .transform(lambda vins: dummy_rng.uniform(-1, 0.1))
)
fleet_info["model_slope"] = (
    fleet_info
    .groupby(["Make", "Type"])["vin"]
    .transform(
        lambda vins: dummy_rng.uniform(
            SOH_LOST_PER_KM_DUMMY_RATIO - 0.00001,
            SOH_LOST_PER_KM_DUMMY_RATIO + 0.00001,
        )
    )
)

fleet_info["Make"].unique()

In [None]:
bucket = S3_Bucket()

def get_mercedes_raw_ts(vin:str, brand:str) -> DF:
    """Read the raw time series of one vehicle from the bucket.

    Args:
        vin: VIN of the vehicle; it is the name of the parquet file.
        brand: Brand folder under ``raw_ts/`` in which the file is looked up.

    Returns:
        The content of ``raw_ts/{brand}/time_series/{vin}.parquet`` indexed by
        its "date" column (which is also kept as a column) and sorted by that index.
    """
    raw_ts = bucket.read_parquet_df(f"raw_ts/{brand}/time_series/{vin}.parquet")
    date_indexed_ts = raw_ts.set_index("date", drop=False)
    return date_indexed_ts.sort_index()

# Maps raw column names to the short column names used after renaming.
# NOTE(review): "diagnostics.odometer" and "odometer.value" both map to "odometer"; a frame
# holding both would end up with two "odometer" columns — confirm they never co-occur.
RENAME_COLS_DICT = {
    "diagnostics.odometer": "odometer",
    "odometer.value": "odometer",
    "charging.battery_energy": "battery_energy",
    "charging.battery_level": "battery_level",
    "charging.estimated_range": "estimated_range",
}

# Build one raw time series per vehicle, tagged with the vehicle's static fleet information,
# then stack them into a single frame whose outer index level is the vin.
tss_by_vin = {}
skipped_vins = {}
for vin, vehicle_info in fleet_info.iterrows():
    # Battery capacity for known model/version pairs; NaN when the pair is unknown.
    default_100_soc_energy = (
        KWH_BATTERY_CAPCITY_DICT
        .get(vehicle_info["Model"], {})
        .get(vehicle_info["Type"], np.nan)
    )
    try:
        raw_ts = get_mercedes_raw_ts(vin, vehicle_info["Make"])
    except Exception as error:
        # Best effort: a vehicle whose time series cannot be read is left out,
        # but remembered so that the gap is reported below instead of passing silently.
        skipped_vins[vin] = repr(error)
        continue
    tss_by_vin[vin] = (
        raw_ts
        .assign(
            vin=vin,
            maker=vehicle_info["Make"],
            model=vehicle_info["Model"],
            version=vehicle_info["Type"],
            autonomie=vehicle_info["Autonomie"],
            dummy_soh_slope=vehicle_info["model_slope"],
            dummy_soh_offset=vehicle_info["maker_offset"] + vehicle_info["model_offset"],
            default_100_soc_energy=default_100_soc_energy,
        )
        .rename(columns=RENAME_COLS_DICT)
    )
if skipped_vins:
    logging.warning(
        "Could not load the raw time series of %d vehicle(s): %s",
        len(skipped_vins),
        skipped_vins,
    )
tss = pd.concat(tss_by_vin, axis="index", keys=tss_by_vin.keys(), names=["vin"])

Visualize the sparsity of the data necessary to compute the soh.

In [None]:
# Variables that must all be present on a row to compute the soh
SOH_VARS = ["estimated_range", "battery_level", "autonomie"]
mercedes_tss = tss.query("maker == 'mercedes-benz'")
soh_vars_notna = mercedes_tss[SOH_VARS].notna()

notna_summary = (
    # Per vin: how many non-missing values each variable has ...
    mercedes_tss.groupby(level=0)[SOH_VARS].count()
    .add_prefix("nb_notna_")
    # ... and how many rows have all the variables at once
    .assign(
        nb_rows_can_compute_soh=soh_vars_notna.all(axis="columns").groupby(level=0).sum()
    )
    .sort_values(by="nb_rows_can_compute_soh", ascending=False)
)

# Count the vins whose summary holds a zero in the relevant column(s)
summary_is_zero = notna_summary.eq(0)
nb_vins_without_soh_rows = int(summary_is_zero["nb_rows_can_compute_soh"].sum())
nb_vins_missing_a_var = int(
    summary_is_zero[["nb_notna_" + var for var in SOH_VARS]].any(axis="columns").sum()
)
nb_vins_without_range = int(summary_is_zero["nb_notna_estimated_range"].sum())

display("not na summary:")
display(notna_summary)
display(f"Number of vins that cannot compute soh: {nb_vins_without_soh_rows}")
display(f"Number of vins that have zero notna of at least one variable necessary to compute the soh: {nb_vins_missing_a_var}")
display(f"Number of vins that have zero notna of estimated_range: {nb_vins_without_range}")

In [None]:
# Vins that report every soh variable at least once, yet never all of them on the same row.
reports_every_var = (
    notna_summary[["nb_notna_estimated_range", "nb_notna_battery_level", "nb_notna_autonomie"]]
    .ne(0)
    .all(axis="columns")
)
never_on_same_row = notna_summary["nb_rows_can_compute_soh"].eq(0)
notna_summary[reports_every_var & never_on_same_row]

In [None]:
# Highest estimated range recorded for each Sprinter, largest first.
sprinter_rows = mercedes_tss[mercedes_tss["version"] == "Sprinter"]
(
    sprinter_rows
    .groupby(level=0)["estimated_range"]
    .max()
    .sort_values(ascending=False)
)

In [None]:
# Highest estimated range recorded for each Vito, largest first.
# To compare with the default range, pair it with fleet_info.loc[max_ranges.index, "Autonomie"].
vito_rows = mercedes_tss.loc[mercedes_tss["version"].eq("Vito")]
max_ranges = vito_rows.groupby(level=0)["estimated_range"].max().sort_values(ascending=False)
max_ranges

## SOH

Let's visualize the soh estimation.

In [None]:
# Mercedes soh: estimated range per unit of battery level, relative to the default range ("autonomie").
mercedes_soh_mask = tss["maker"] == "mercedes-benz"
tss.loc[mercedes_soh_mask, "soh"] = (
    tss.loc[mercedes_soh_mask]
    .eval("estimated_range / battery_level / autonomie * 100")
    # A zero battery level (or default range) yields +/-inf, which is not removed by dropna
    # and would turn the per-vin mean into inf; treat such rows as missing instead.
    .replace([np.inf, -np.inf], np.nan)
)

# Average soh, last odometer reading and max estimated range for each vin.
# The index is dropped first because "vin" is both an index level and a column.
mercedes_df = (
    tss[mercedes_soh_mask]
    .reset_index(drop=True)
    .groupby("vin")
    .agg({
        "soh": "mean",
        "odometer": "last",
        "model": "first",
        "date": "last",
        "estimated_range": "max",
    })
    .reset_index()
)

# Ensure odometer and soh are numeric
mercedes_df['odometer'] = pd.to_numeric(mercedes_df['odometer'], errors='coerce')
mercedes_df['soh'] = pd.to_numeric(mercedes_df['soh'], errors='coerce')

# Remove any rows with NaN values
mercedes_df = mercedes_df.dropna(subset=['odometer', 'soh'])

# Scatter plot with a single OLS trendline fitted over all the models
fig = px.scatter(
    mercedes_df,
    x="odometer",
    y="soh",
    color="model",
    height=600,
    title="Average State-of-Health (SoH) vs Mileage",
    trendline="ols",
    trendline_scope="overall",
)

fig.update_layout(
    xaxis_title="Latest mileage (km)",
    yaxis_title="SoH (%)",
    legend_title="Model",
)
fig.update_traces(line=dict(color='black', dash='dash'))

fig.show()
# write_html saves the figure to disk; to_html only returns the HTML as a string
# and would interpret the path as its `config` argument.
fig.write_html("soh_plot.html")

We can see that the SoH estimates for the Vitos and Sprinters are off.  
Let's try to divide their default range by 2.  

In [None]:
# Instead of dividing the default range by 2 we multiply the soh by 2 to preserve the default range.
# The corrected values go into a new frame so that re-running this cell does not double them again.
DEFAULT_RANGE_CORRECTION_FACTOR = 2
is_sprinter_or_vito = mercedes_df["model"].isin(["Sprinter", "Vito"])
mercedes_df_corrected = mercedes_df.assign(
    soh=mercedes_df["soh"].where(
        ~is_sprinter_or_vito,
        mercedes_df["soh"] * DEFAULT_RANGE_CORRECTION_FACTOR,
    )
)
fig = px.scatter(
    mercedes_df_corrected,
    x="odometer",
    y="soh",
    trendline="ols",
    color="model",
    trendline_scope="overall",
    height=600,
    title="Average State-of-Health (SoH) vs Mileage, Sprinter and Vito SoH doubled",
)
fig.update_layout(
    xaxis_title="Latest mileage (km)",
    yaxis_title="SoH (%)",
    legend_title="Model",
)
fig.update_traces(line=dict(color='black', dash='dash'))

The resulting SoH values follow the overall trend, which makes a lot more sense than the previous results.  
We can assume that the default ranges given in the fleet info are wrong.

## Conclusion

SoH computed from the estimated range seems promising and could be used as our final results for Ayvens.  
We would, however, need to improve the accuracy of the estimator.  