# soh estimation experimentation of Mercedes vehicles
In this notebook, we will try to express the soh at any point as the energy the battery actually holds divided by the energy it would hold if it had 100% soh.  
```
soh = charging.battery_energy / (charging.battery_level * model_battery_capacity) 
```

This method is based on the assumption that the variable `charging.battery_energy` represents the actual energy present in the battery rather than simply `charging.battery_level * model_battery_capacity`.

## Setup

In [None]:
# Shell escape: create the `data_cache` directory; `-p` makes the call succeed when it already exists.
! mkdir -p data_cache

### Imports

In [None]:
import plotly.express as px
from scipy import stats
import numpy as np
from core.stats_utils import *
from core.pandas_utils import *
from core.config import valid_soh_points
from core.stats_utils import filter_results_by_lines_bounds
from transform.fleet_info.main import fleet_info
from transform.processed_tss.ProcessedTimeSeries import ProcessedTimeSeries

### Data extraction

In [None]:
# Build the processed time series object for the "mercedes-benz" make.
# NOTE(review): force_update=False presumably reuses previously processed data — confirm in ProcessedTimeSeries.
tss = ProcessedTimeSeries("mercedes-benz", force_update=False)

In [None]:
# Display the columns available in the processed time series.
tss.columns

In [None]:
# Fill the gaps of the slowly varying signals vehicle by vehicle: forward-fill
# first, then back-fill what is still missing at the start of each VIN.
#
# `transform` returns a result aligned on the index of `tss`, so every filled
# value stays on its own row. The previous `apply(...).reset_index(drop=True)`
# renumbered the rows in group order before assigning them back, which could
# attach values to the wrong rows whenever `tss` was not already ordered by VIN
# with a 0..n-1 index.
for column in ("odometer", "soc", "estimated_range"):
    tss[column] = tss.groupby("vin")[column].transform(
        lambda values: values.ffill().bfill()
    )

In [None]:
# Run the project's sanity checks on the time series.
# NOTE(review): `sanity_check` presumably comes from one of the `core.*` star imports — confirm.
sanity_check(tss)

In [None]:
# Distribution of the "range" values among Mercedes-Benz vehicles, missing values included, most frequent first.
fleet_info.query("make == 'mercedes-benz'")["range"].value_counts(dropna=False, sort=True, ascending=False)

In [None]:
# Count the (model, version) pairs of Mercedes-Benz vehicles whose "range" is missing, ordered by (model, version).
fleet_info.query("make == 'mercedes-benz' & range.isna()")[["model", "version"]].value_counts(sort=True, ascending=False).sort_index()

In [None]:
# Share of vehicles with a known "range" value, per Mercedes-Benz model.
mercedes_fleet = fleet_info.query("make == 'mercedes-benz'")
unique_models = mercedes_fleet["model"].unique()

# Group the Mercedes-Benz rows once instead of re-querying `fleet_info` twice
# per model:
# - the counts now stay inside the make the model list was built from (the
#   previous per-model queries also counted vehicles of other makes);
# - rows without a model are skipped by `groupby`, so `nb_total` is never 0
#   and the ratio cannot raise ZeroDivisionError;
# - model names are no longer interpolated into a query string, so a name
#   containing a quote cannot break the expression.
# `observed=True` prevents empty groups when "model" is categorical.
for model, model_rows in mercedes_fleet.groupby("model", sort=False, observed=True):
    nb_total = len(model_rows)
    nb_with_range = int(model_rows["range"].notna().sum())
    print(f"Model: {model}")
    print(f"Number of {model} with range: {nb_with_range}")
    print(f"Number of {model}: {nb_total}")
    print(f"ratio: {nb_with_range / nb_total:.2f}")
    print()

## Time series

In [None]:
# VIN with the largest number of rows. `idxmax` already finds the maximum, so
# the intermediate sort was unnecessary; the bare `most_common_vin` expression
# was dropped because only the last expression of a cell is displayed.
most_common_vin = tss.groupby("vin").size().idxmax()

# The single-vehicle analysis below uses this hand-picked VIN, not `most_common_vin`.
vin = "W1K2938901F016704"
ts = tss.query(f"vin == '{vin}'")

In [None]:
# Normalise the estimated range by the state of charge, for the selected
# vehicle and for the whole fleet.
ts = ts.eval("estimated_range_by_soc = estimated_range/ soc")
tss = tss.eval("estimated_range_by_soc = estimated_range / soc")

# Only the last expression of a cell is rendered automatically, so the figure
# has to be shown explicitly; without `.show()` it was built and discarded.
px.scatter(ts, x="date", y="estimated_range_by_soc", color="in_discharge_idx", title=f"{vin}").show()

# Reminder of the "range" value distribution for the Mercedes-Benz fleet.
fleet_info.query("make == 'mercedes-benz'")["range"].value_counts(dropna=False, sort=True, ascending=False)

In [None]:
# Preview the first rows of the fleet-wide time series.
tss.head()

In [None]:
# Rank every numeric column of the fleet-wide time series by its correlation
# with the "range" column.
selected_column = "range"
corr = tss.corr(numeric_only=True)
selected_corr = corr[[selected_column]].sort_values(
    by=selected_column,
    ascending=False,
)

# Render that single-column slice of the correlation matrix as a heat map.
px.imshow(selected_corr, title=f"Correlation Matrix for {selected_column}")


#### Rolling variance


In [None]:
# Rolling variance of the SoC-normalised range, computed per VIN over windows
# of 3 consecutive rows (the first two rows of each VIN are therefore NaN).
range_by_soc_per_vin = tss.groupby("vin")["estimated_range_by_soc"]
tss["rolling_variance"] = range_by_soc_per_vin.transform(
    lambda values: values.rolling(window=3).var()
)

# Rows for which the rolling variance is defined.
var = tss.dropna(subset=['rolling_variance'])


In [None]:
# Heat map of the pairwise correlations between the columns of interest.
columns_of_interest = ['soc', 'estimated_range', 'estimated_range_by_soc', 'rolling_variance']

# Correlation matrix restricted to those columns.
correlation_matrix = tss[columns_of_interest].corr()

# Plot options, kept separate from the call for readability.
heatmap_options = dict(
    labels=dict(x="Variables", y="Variables", color="Correlation"),
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    title="Heatmap de la Corrélation",
)
fig = px.imshow(correlation_matrix, **heatmap_options)

# Display the figure.
fig.show()

### Individual

In [None]:
# Selected vehicle only: keep rows with 0.7 < soc < 0.98 that are flagged by
# `in_discharge_perf_mask` and belong to a (vin, in_discharge_idx) group of
# more than 10 rows, then plot the SoC-normalised range over time.
individual_points = (
    ts.assign(discharge_size=lambda df: df.groupby(["vin", "in_discharge_idx"]).transform("size"))
    .query("soc > 0.7 & soc < 0.98 & discharge_size > 10 & in_discharge_perf_mask")
)
px.scatter(
    individual_points,
    x="date",
    y="estimated_range_by_soc",
    color="vin",
)

## Reducing dependencies on factors


### Dependency on soc -> Not much to do 

In [None]:
import plotly.express as px
import numpy as np

ts_fil = ts #.query('soc > 0.7')

# Base scatter plot: SoC against the SoC-normalised estimated range.
fig = px.scatter(ts_fil,
                 x="estimated_range_by_soc",
                 y="soc",
                 color="in_discharge_idx")

# Fit the trendline on the same variables as the scatter plot. The fit used to
# run on `estimated_range`, whose scale differs from the x axis of the figure,
# so the dashed line did not describe the plotted points.
# Keep finite pairs only: NaN as well as the +/-inf produced by a zero SoC in
# `estimated_range / soc` would make the least-squares fit fail.
x_all = ts_fil["estimated_range_by_soc"].astype(float)
y_all = ts_fil["soc"].astype(float)
mask = np.isfinite(x_all) & np.isfinite(y_all)
x = x_all[mask]
y = y_all[mask]

try:
    # Degree-1 (linear) least-squares fit; raises LinAlgError when the solver
    # does not converge and TypeError when no usable point is left.
    z = np.polyfit(x, y, 1)
except (np.linalg.LinAlgError, TypeError):
    print("Impossible de calculer la trendline - problème avec les données")
else:
    p = np.poly1d(z)

    # Evaluate the fitted line on a regular grid spanning the observed x values.
    x_range = np.linspace(x.min(), x.max(), 100)

    # Add the trendline to the figure.
    fig.add_traces(
        px.line(
            x=x_range,
            y=p(x_range)
        ).update_traces(
            line=dict(color='black', dash='dash'),
            name='Trendline'
        ).data
    )

    # Printed only when the fit succeeded: `z` and `p` do not exist otherwise.
    print(z,p)

fig.show()

### Dependency on discharging -> Much more stable in charge, we will keep only the values in charge

In [None]:
# SoC of the selected vehicle over time, coloured by the `in_discharge_perf_mask` flag.
px.scatter(ts, 
           x="date",
           y="soc"  , 
           color="in_discharge_perf_mask")

### Correction per model


#### Vito -> Multiplier x 1/0.96

In [None]:
# Mercedes SoH, Vito only: the SoC-normalised estimated range divided by the
# reference range, then by the 0.96 Vito correction.
# NOTE(review): the final estimation cell divides Vito values by 0.97 instead
# of 0.96 — confirm which correction factor is intended.
vito_discharge_rows = (
    tss.query("model == 'vito'")
    .query("in_discharge_perf_mask")
    .eval("soh = estimated_range / soc / range/ 0.96 ")
)

# discharge_size = number of remaining rows in each (vin, in_discharge_idx)
# group; only groups with more than 10 rows are kept.
vito_discharge_rows = vito_discharge_rows.assign(
    discharge_size=vito_discharge_rows.groupby(["vin", "in_discharge_idx"]).transform("size")
)
tss_filtered: DF = vito_discharge_rows.query("discharge_size > 10")

# Outlier filtering is currently disabled:
# tss_filtered = filter_results_by_lines_bounds(tss_filtered, valid_soh_points)

# One row per VIN: median SoH, highest odometer reading, most frequent model,
# latest date and highest estimated range.
soh_per_vehicle = (
    tss_filtered
    .groupby("vin")
    .agg(
        soh=("soh", "median"),
        odometer=("odometer", "max"),
        model=("model", Series.mode),
        date=("date", "max"),
        estimated_range=("estimated_range", "max"),
    )
    .reset_index()
)


In [None]:
# Preview the first rows of the filtered time series.
tss_filtered.head()

In [None]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Scatter plot: one point per vehicle, SoH against the highest odometer reading.
fig = px.scatter(
    soh_per_vehicle,
    x="odometer",
    y="soh",
    labels={"soh": "SoH", 'vin': 'VIN'},
    color="model",
    title="Average State-of-Health (SoH) vs Mileage",
    hover_data={"vin": True}
)

# Add a linear trendline for each model.
for model_name, group in soh_per_vehicle.groupby('model'):
    # The least-squares fit cannot cope with NaN or inf, so drop vehicles
    # whose odometer or SoH is not a finite number before fitting.
    points = group[["odometer", "soh"]].astype(float)
    points = points[np.isfinite(points).all(axis=1)]

    # A line needs at least two distinct x positions.
    if len(points) < 2 or points["odometer"].nunique() < 2:
        continue

    slope, intercept = np.polyfit(points["odometer"], points["soh"], 1)  # Linear fit (degree 1)

    # A straight line is fully described by its two end points.
    x_ends = np.array([points["odometer"].min(), points["odometer"].max()])

    # Add the trendline to the plot using go.Scatter
    trendline_trace = go.Scatter(
        x=x_ends,
        y=slope * x_ends + intercept,
        mode='lines',
        name=f'Trendline {model_name}',
        line=dict(dash='dash')  # Same dashed style for every model
    )

    fig.add_trace(trendline_trace)

fig.show()

##  Final SOH

### Estimation

In [None]:
# Preview the first rows that have an estimated range, a SoC and a reference range.
tss.dropna(subset=["estimated_range", "soc", "range"]).head()

In [None]:
# Mercedes SoH for all models: the SoC-normalised estimated range divided by
# the reference range.
# discharge_size = number of rows in each (vin, in_discharge_idx) group,
# computed before any filtering.
with_discharge_size = tss.assign(
    discharge_size=lambda df: df.groupby(["vin", "in_discharge_idx"]).transform("size")
)
tss_filtered: DF = (
    with_discharge_size
    .query("soc > 0.7 & discharge_size > 10 & in_discharge_perf_mask")
    .eval("soh = estimated_range / soc / range")
    .sort_values(["vin", "date"])
)

# Per-VIN maximum of every column.
# NOTE(review): this result is not used anywhere below. The original comment
# announced a filter on VINs whose maximum in_charge_idx is > 3, but no such
# filter is applied — confirm whether it should be implemented or removed.
valid_vins = tss_filtered.groupby('vin').max().reset_index()

# Model-specific correction: divide the SoH by a fixed factor per model.
# NOTE(review): the Vito-only cell divides by 0.96 whereas 0.97 is used here —
# confirm which value is intended.
for corrected_model, correction in (("vito", 0.97), ("sprinter", 0.98)):
    mask = tss_filtered['model'] == corrected_model
    tss_filtered.loc[mask, 'soh'] = tss_filtered.loc[mask, 'soh'] / correction

# Outlier filtering is currently disabled:
# tss_filtered = filter_results_by_lines_bounds(tss_filtered, valid_soh_points)

# One row per VIN: mean SoH, highest odometer reading, first model value,
# latest date and highest estimated range.
soh_per_vehicle = (
    tss_filtered
    .groupby("vin")
    .agg(
        soh=("soh", "mean"),
        odometer=("odometer", "max"),
        model=("model", "first"),
        date=("date", "max"),
        estimated_range=("estimated_range", "max"),
    )
    .reset_index()
)

In [None]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# One point per vehicle (SoH against the highest odometer reading), coloured
# by model, with a single ordinary-least-squares trendline fitted over all
# models. The figure that used to be built first was overwritten immediately
# and never displayed, so only this one is created.
fig = (
    px.scatter(
        soh_per_vehicle,
        x="odometer",
        y="soh",
        color="model",
        hover_data={"vin": True},
        title="Average State-of-Health (SoH) vs Mileage",
        trendline="ols",
        trendline_scope="overall",
    )
    .update_traces(
        # Draw line-type traces (the trendline) dashed.
        line=dict(dash='dash')
    )
)
fig.show()

In [None]:
# Round the SoH to two decimals (in place), then export the Vito and Sprinter
# rows to `soh_per_vehicle.csv` without the index column.
soh_per_vehicle['soh'] = soh_per_vehicle['soh'].round(2)
export_columns = ['vin', 'model', 'soh', 'odometer']
vito_and_sprinter = soh_per_vehicle[export_columns].query('model == "vito" | model == "sprinter"')
vito_and_sprinter.to_csv("soh_per_vehicle.csv", index=False)



In [None]:
# Look up the aggregated result of one specific vehicle.
soh_per_vehicle.query("vin == 'W1V44760313886610'")

## Visualization

In [None]:
# Rank every numeric column of the selected vehicle's time series by its
# correlation with the "range" column.
# NOTE(review): `ts` holds a single VIN, so "range" is presumably constant and
# its correlations would be NaN — confirm this plot is informative.
selected_column = "range"
corr = ts.corr(numeric_only=True)
selected_corr = corr[[selected_column]].sort_values(
    by=selected_column,
    ascending=False,
)

# Render that single-column slice of the correlation matrix as a heat map.
px.imshow(selected_corr, title=f"Correlation Matrix for {selected_column}")

In [None]:
# Fetch the time series returned by `get_raw_tss` for "mercedes-benz" and list its columns.
# NOTE(review): force_update=False presumably reuses previously extracted data — confirm in get_raw_tss.
from transform.raw_tss.main import get_raw_tss
raw_tss = get_raw_tss("mercedes-benz", force_update=False)
raw_tss.columns

The resulting sohs follow the overall trend, which makes a lot more sense than the previous results.  
We can assume that the default ranges provided in fleet info are wrong.

## Conclusion

Soh from estimated range seems promising and could be used as our final results for Ayvens.  
We would, however, need to improve the accuracy of the estimator.  