# SoH estimation experiments on Mercedes vehicles
In this notebook, we try to express the SoH at any point as the energy the battery actually holds divided by the energy it would hold at the same charge level if it had 100% SoH.  
```
soh = charging.battery_energy / (charging.battery_level * model_battery_capacity) 
```

This method is based on the assumption that the variable `charging.battery_energy` represents the actual energy present in the battery rather than simply `charging.battery_level * model_battery_capacity`.

## Setup

In [None]:
# IPython shell escape: create the `data_cache` directory; `-p` makes this a no-op if it already exists.
! mkdir -p data_cache

### Imports

In [None]:
import plotly.express as px
from scipy import stats
import numpy as np

from core.pandas_utils import *
from transform.fleet_info.main import fleet_info
from transform.processed_tss.main import get_processed_tss

### Data extraction

In [None]:
# Load the processed time series for the "mercedes-benz" make.
# NOTE(review): force_update=False presumably reuses a previously cached result — confirm in get_processed_tss.
tss = get_processed_tss("mercedes-benz", force_update=False)

In [None]:
# Run the `sanity_check` helper on the loaded frame.
# NOTE(review): it is not imported by name, so it presumably comes from the star import of core.pandas_utils — confirm.
sanity_check(tss)

In [None]:
# Distribution of the `range` values among the "mercedes-benz" rows of the
# fleet info, counting missing values as their own bucket.
is_mercedes = fleet_info["make"] == "mercedes-benz"
fleet_info.loc[is_mercedes, "range"].value_counts(dropna=False, sort=True, ascending=False)

In [None]:
# "mercedes-benz" rows without a `range`, counted per (model, version) pair and
# listed in index order.
missing_range = fleet_info.query("make == 'mercedes-benz' & range.isna()")
(
    missing_range[["model", "version"]]
    .value_counts(sort=True, ascending=False)
    .sort_index()
)

In [None]:
# Share of vehicles that have a `range`, per "mercedes-benz" model.
mercedes_fleet = fleet_info.query("make == 'mercedes-benz'")
unique_models = mercedes_fleet["model"].unique()

# Group once instead of re-querying the whole fleet for every model:
# - counting inside the Mercedes subset keeps same-named models of other makes out;
# - grouping on the values avoids building query strings from model names, which
#   breaks on quotes and never matches a missing model (-> division by zero);
# - sort=False keeps the order of first appearance, observed=True skips unused
#   categories, dropna=False keeps vehicles whose model is missing.
ranges_by_model = mercedes_fleet.groupby(
    "model", sort=False, dropna=False, observed=True
)["range"]
nb_with_range_by_model = ranges_by_model.count()  # rows with a non-null range
nb_total_by_model = ranges_by_model.size()  # all rows of the model

for model, nb_with_range, nb_total in zip(
    nb_total_by_model.index, nb_with_range_by_model, nb_total_by_model
):
    print(f"Model: {model}")
    print(f"Number of {model} with range: {nb_with_range}")
    print(f"Number of {model}: {nb_total}")
    # Every group holds at least one row, so nb_total is never zero here.
    print(f"ratio: {nb_with_range / nb_total:.2f}")
    print()

## Time series

In [None]:
# VIN with the most rows in the processed time series.
# `idxmax` already locates the maximum, so no prior sort is needed.
most_common_vin = tss.groupby("vin").size().idxmax()

# The vehicle studied in the following cells is hard-coded; `ts` holds only its rows.
vin = "W1K2938901F017998"
ts = tss.query(f"vin == '{vin}'")

In [None]:
# `soc` over time for the selected vehicle, titled with its VIN.
px.scatter(
    ts,
    x="date",
    y="soc",
    title=vin,
)

In [None]:
# `estimated_range` over time for the selected vehicle, titled with its VIN.
px.scatter(
    ts,
    x="date",
    y="estimated_range",
    title=vin,
)

In [None]:
# `max_range` over time for the selected vehicle.
# The title uses `vin` because `ts` is filtered on `vin`, not on `most_common_vin`.
px.scatter(ts, x="date", y="max_range", title=vin)

In [None]:
# Estimated range divided by state of charge, with gaps forward-filled first.
# `ts` holds a single vehicle, so a plain forward-fill is safe there.
ts = ts.eval("estimated_range_by_soc = estimated_range.ffill() / soc.ffill()")

# `tss` mixes several vehicles: forward-fill within each VIN so that the first
# rows of a vehicle never inherit the last values of the previous vehicle.
readings_by_vin = tss.groupby("vin", observed=True)
tss = tss.assign(
    estimated_range_by_soc=(
        readings_by_vin["estimated_range"].ffill() / readings_by_vin["soc"].ffill()
    )
)

# The title uses `vin` because `ts` is filtered on `vin`, not on `most_common_vin`.
px.scatter(ts, x="date", y="estimated_range_by_soc", title=vin)

In [None]:
# Correlation of every numeric column of the fleet-wide frame with one selected
# column, highest correlation first.
selected_column = "max_range"
corr = tss.corr(numeric_only=True)
selected_corr = corr.loc[:, [selected_column]].sort_values(
    by=selected_column,
    ascending=False,
)

# One-column heat map of those correlations.
px.imshow(
    selected_corr,
    title=f"Correlation Matrix for {selected_column}",
)


#### Rolling variance


In [None]:
# Rolling variance of `soh` over windows of 3 consecutive rows, per vehicle.
# NOTE(review): in this notebook `soh` is only computed in the "Final SOH"
# section further down, so this cell relies on `tss` already having that column.
def _rolling_soh_variance(soh_of_one_vin):
    return soh_of_one_vin.rolling(window=3).var()


tss["rolling_variance"] = tss.groupby("vin")["soh"].transform(_rolling_soh_variance)

# Only the rows for which a rolling variance could be computed.
var = tss.dropna(subset=["rolling_variance"])


In [None]:
# Heat map of the pairwise correlations between a few columns of interest.
columns_of_interest = ["soc", "estimated_range", "soh", "rolling_variance"]

# Correlation matrix restricted to those columns.
correlation_matrix = tss.loc[:, columns_of_interest].corr()

# Render the matrix as a labelled heat map.
fig = px.imshow(
    correlation_matrix,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    labels={"x": "Variables", "y": "Variables", "color": "Correlation"},
    title="Heatmap de la Corrélation",
)

# Display the figure.
fig.show()

## Reducing dependencies on factors


### Dependency on soc -> Not much to do 

In [None]:
# `soc` plotted against `estimated_range_by_soc` for the selected vehicle.
px.scatter(
    ts,
    x="estimated_range_by_soc",
    y="soc",
    title="soc",
)

### Dependency on discharging -> Nothing to conclude

In [None]:
# `estimated_range_by_soc` over time for the selected vehicle, coloured by the
# `in_charge_perf_mask` column.
px.scatter(
    ts,
    x="date",
    y="estimated_range_by_soc",
    color="in_charge_perf_mask",
)

##  Final SOH

### Estimation

In [None]:
# Mercedes SoH: estimated range divided by the state of charge and by the
# vehicle's `range` column, scaled by 100.
def _single_mode(values):
    """Return one mode of *values* (the first one on ties), or NaN if there is none."""
    modes = values.mode()
    return modes.iat[0] if len(modes) else np.nan


tss: DF = (
    tss
    .eval("soh = estimated_range / soc / range * 100")
    # Forward-fill the odometer within each vehicle only, so that the first rows
    # of a vehicle never inherit the mileage of the previous vehicle in the frame.
    .assign(odometer=lambda df: df.groupby("vin", observed=True)["odometer"].ffill())
)
# NOTE: the x2 correction for the Vito/Sprinter models is applied later in the
# notebook, on `soh_per_vehicle`, not on the row-level `soh` computed here.

# One row per VIN: average SoH, last odometer reading and date, model, and
# highest estimated range.
soh_per_vehicle = (
    tss
    .reset_index(drop=True)
    .groupby("vin")
    .agg({
        "soh": "mean",
        "odometer": "last",
        # A plain `Series.mode` yields several values on ties; keep a single
        # value so that `model` stays usable as a colour key.
        "model": _single_mode,
        "date": "last",
        "estimated_range": "max",
    })
    .reset_index()
)

In [None]:
import numpy as np
import plotly.graph_objects as go

# Scatter plot of the per-vehicle average SoH against the last odometer reading.
fig = px.scatter(
    soh_per_vehicle,
    x="odometer",
    y="soh",
    labels={"soh": "SoH", 'vin': 'VIN'},
    color="model",
    title="Average State-of-Health (SoH) vs Mileage"
)

# Points usable for the fit: np.polyfit cannot handle NaN/inf, and vehicles
# without a range or an odometer reading end up with a missing value here.
# Sorting by mileage makes the fitted line run left to right.
fit_points = (
    soh_per_vehicle[["odometer", "soh"]]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .astype(float)
    .sort_values("odometer")
)
x = fit_points["odometer"]
y = fit_points["soh"]

# A straight line needs at least two points; otherwise only the scatter is shown.
if len(fit_points) >= 2:
    coefficients = np.polyfit(x, y, 1)  # Linear fit (degree 1)
    trendline = np.polyval(coefficients, x)

    # Add the trendline to the plot using go.Scatter
    trendline_trace = go.Scatter(
        x=x,
        y=trendline,
        mode='lines',
        name='Trendline',
        line=dict(color='black', dash='dash')
    )
    fig.add_trace(trendline_trace)

fig.show()

In [None]:
# Average `soh` over all rows of the selected vehicle.
soh_of_selected_vin = tss.query(f"vin == '{vin}'")["soh"]
soh_of_selected_vin.mean()

## Visualization

In [None]:
# Correlation of every numeric column of the selected vehicle's frame with one
# selected column, highest correlation first.
selected_column = "max_range"
corr = ts.corr(numeric_only=True)
selected_corr = corr.loc[:, [selected_column]].sort_values(
    by=selected_column,
    ascending=False,
)

# One-column heat map of those correlations.
px.imshow(
    selected_corr,
    title=f"Correlation Matrix for {selected_column}",
)

In [None]:
# Every row that has both an odometer and a SoH value, coloured by the
# concatenation of model and VIN, with one OLS trend line over all rows.
# NOTE(review): the title says "Average" although individual rows are plotted.
# An optional `soh > 70` filter and a fixed y/x axis ratio were tried here and
# are currently disabled.
samples = (
    tss
    .dropna(subset=["odometer", "soh"])
    .assign(model_vin=lambda df: df["model"].astype("string") + df["vin"])
)

fig = px.scatter(
    samples,
    x="odometer",
    y="soh",
    color="model_vin",
    height=1000,
    title="Average State-of-Health (SoH) vs Mileage",
    trendline="ols",
    trendline_scope="overall",
)
# No selector is given, so the dashed black line style is set on every trace.
fig = fig.update_traces(line={"color": "black", "dash": "dash"})

fig.show()

In [None]:
# Load the raw time series for the "mercedes-benz" make.
# NOTE(review): force_update=False presumably reuses a previously cached result — confirm in get_raw_tss.
from transform.raw_tss.main import get_raw_tss
raw_tss = get_raw_tss("mercedes-benz", force_update=False)
# Last expression of the cell: displays the columns of the raw frame.
raw_tss.columns

In [None]:
# Display the columns currently present in the processed frame `tss`.
tss.columns

In [None]:
# Correlation of every numeric column with `soh`, highest correlation first.
soh_correlations = tss.corr(numeric_only=True)["soh"]
soh_correlations.sort_values(ascending=False)

We can see that the SoH estimates of the Vitos and Sprinters are off.  
Let's try to divide their default range by 2.  

In [None]:
# Instead of dividing the default range by 2, multiply the SoH of the Sprinter
# and Vito vehicles by 2, which leaves the default range untouched.

# Keep the uncorrected values: the correction is always derived from them, so
# re-running this cell does not double the SoH a second time.
if "soh_uncorrected" not in soh_per_vehicle.columns:
    soh_per_vehicle["soh_uncorrected"] = soh_per_vehicle["soh"]

# Case-insensitive match, because the notebook spells these model names both
# capitalised ('Sprinter', 'Vito') and in lower case ('vito').
needs_doubling = (
    soh_per_vehicle["model"].astype("string").str.lower().isin(["sprinter", "vito"])
)
soh_per_vehicle["soh"] = soh_per_vehicle["soh_uncorrected"].where(
    ~needs_doubling, soh_per_vehicle["soh_uncorrected"] * 2
)

# NOTE(review): this filter drops the lower-case 'vito' model right after its SoH
# was corrected — confirm whether excluding it is still intended.
fig = px.scatter(soh_per_vehicle.query("model != 'vito' & soh > 70"),
    x="odometer",
    y="soh",
    trendline="ols",
    color="model",
    trendline_scope="overall"
)
# No selector is given, so the dashed black line style is set on every trace.
fig.update_traces(line=dict(color='black', dash='dash'))

The resulting SoH values follow the overall trend, which makes a lot more sense than the previous results.  
We can assume that the default ranges recorded in fleet info are wrong.

## Conclusion

SoH from estimated range seems promising and could be used as our final results for Ayvens.  
We would, however, need to improve the accuracy of the estimator.  