# Evaluation of our soh estimation
The goal of this notebook is to establish a way of evaluating our own estimation.  
Given that we don't have a ground truth to compare our estimation with, we have to get creative.  

## Setup

### Imports

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import linregress

from core.pandas_utils import floor_to
from core.plt_utils import plt_3d_df
from transform.watea.soh_estimation import get_processed_cluster, get_soh_per_charges

### Extraction of the result

In [None]:
# Load the two inputs used throughout the notebook; both helpers are imported
# from transform.watea.soh_estimation.
# NOTE(review): presumably `processed_cluster` holds one row per sample and
# `charges` one row per charge -- confirm against the helpers.
processed_cluster = get_processed_cluster()
charges = get_soh_per_charges()

## Visualization

In [None]:
# Overview of the estimated soh against the odometer, one colour per vehicle id.
px.scatter(
    data_frame=processed_cluster,
    x="odometer",
    y="soh",
    color="id",
).update_xaxes(matches=None)

Let's visualize a few vehicles to see how our estimations look.  

In [None]:
# Pick the vehicles to inspect: the N_TOP ids with the most rows plus N_RANDOM
# ids drawn from the remaining ones.
N_TOP = 2
N_RANDOM = 2
RANDOM_STATE = 42  # fixed seed so the same vehicles are plotted on every run

# value_counts() sorts by descending count by default.
ids = processed_cluster["id"].value_counts().index.to_series()
remaining_ids = ids.iloc[N_TOP:]
ids_to_plot = [
    *ids.iloc[:N_TOP],
    # Never ask for more ids than are left, otherwise sample() raises.
    *remaining_ids.sample(n=min(N_RANDOM, len(remaining_ids)), random_state=RANDOM_STATE),
]

ids_to_plot

In [None]:
# One box-plot row per selected vehicle; rows follow the order of `ids_to_plot`
# and each facet keeps its own x axis range.
px.box(
    data_frame=processed_cluster.set_index("id", drop=False).loc[ids_to_plot],
    x="odometer",
    y="soh",
    facet_col="id",
    facet_col_wrap=1,
    height=700,
).update_xaxes(matches=None)

### soh over input features

In [None]:
# Bucket the current with `floor_to` (second argument 1) so that the soh can be
# shown as one box per bucket.
# NOTE: this adds the `floored_current` column to `processed_cluster` in place.
processed_cluster["floored_current"] = floor_to(processed_cluster["current"], 1)
px.box(
    data_frame=processed_cluster,
    x="floored_current",
    y="soh",
)

In [None]:
# soh against voltage, coloured per vehicle, with a single rolling trendline
# (window of 100 points) computed over all vehicles.
px.scatter(
    data_frame=processed_cluster,
    x="voltage",
    y="soh",
    color="id",
    opacity=0.25,
    trendline="rolling",
    trendline_options={"window": 100},
    trendline_scope="overall",
).update_traces(line={"color": "red"})

In [None]:
# soh against temperature, coloured per vehicle, with a single rolling
# trendline (window of 100 points) computed over all vehicles.
px.scatter(
    data_frame=processed_cluster,
    x="temperature",
    y="soh",
    color="id",
    opacity=0.25,
    trendline="rolling",
    trendline_options={"window": 100},
    trendline_scope="overall",
).update_traces(line={"color": "red"})

In [None]:
# soh against the regime separation feature, coloured per vehicle, with a
# single rolling trendline (window of 100 points) computed over all vehicles.
px.scatter(
    data_frame=processed_cluster,
    x="regime_seperation_feature",
    y="soh",
    color="id",
    opacity=0.25,
    trendline="rolling",
    trendline_options={"window": 100},
    trendline_scope="overall",
).update_traces(line={"color": "black"})

Looking at the soh vs features plots, it seems like the current could be a source of noise.  
Let's see what the estimation would look like without this feature.  

In [None]:
from pandas import DataFrame as DF
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.pipeline import Pipeline

FEATURES = ["voltage", "temperature", "current"]

def estimate_soh(cluster:DF, features=None, degree:int=10) -> tuple[DF, Pipeline]:
    """Fit a polynomial regression of `energy_added` and derive a soh from it.

    Steps:
      1. Fit PolynomialFeatures + LinearRegression on `cluster[features]` to
         predict `energy_added` (stored as `general_energy_added`).
      2. Shift the regressor intercept by the mean residual of the rows flagged
         `is_default_100_soh`, so that the model is centred on those rows.
      3. Predict again (`default_100_energy_added`) and compute
         `soh = 100 * energy_added / default_100_energy_added` and
         `residual = |default_100_energy_added - energy_added|`.

    Args:
        cluster: Frame holding the feature columns plus `energy_added` and
            `is_default_100_soh`. It is NOT modified.
        features: Names of the input columns. Defaults to the notebook-level
            `FEATURES` list, read at call time.
        degree: Degree of the polynomial expansion (default 10, as before).

    Returns:
        A tuple `(estimated_cluster, soh_estimator)`: a new frame with the
        columns described above, and the fitted pipeline whose intercept has
        been adjusted.

    Raises:
        ValueError: if no row is flagged `is_default_100_soh`; the intercept
            could not be calibrated.
    """
    features = list(FEATURES if features is None else features)
    # Plain arrays are used for fit AND predict so that scikit-learn does not
    # warn about feature names seen at predict time but not at fit time.
    x = cluster[features].values
    y = cluster["energy_added"].values
    soh_estimator = (
        Pipeline([
            ('poly_features', PolynomialFeatures(degree=degree)),
            ('regressor', LinearRegression())
        ])
        .fit(X=x, y=y)
    )
    # Prediction of the raw fit, before the intercept is re-centred.
    general_energy_added = soh_estimator.predict(X=x).squeeze()

    reference = cluster.query("is_default_100_soh")
    if reference.empty:
        raise ValueError(
            "estimate_soh needs at least one row flagged is_default_100_soh "
            "to calibrate the intercept."
        )
    residuals = reference["energy_added"] - soh_estimator.predict(reference[features].values)
    regressor = soh_estimator.named_steps['regressor']
    regressor.intercept_ = regressor.intercept_ + residuals.mean()

    # `assign` returns a new frame: the caller's DataFrame is left untouched.
    estimated_cluster = (
        cluster
        .assign(
            general_energy_added=general_energy_added,
            default_100_energy_added=soh_estimator.predict(x),
        )
        .eval("soh = 100 * energy_added / default_100_energy_added")
        .eval("residual = default_100_energy_added - energy_added")
        .assign(residual=lambda df: df["residual"].abs())
    )

    return estimated_cluster, soh_estimator

reprocessed_cluster, soh_estimator = estimate_soh(processed_cluster)


In [None]:
import numpy as np

# The grid only spans voltage and temperature, but the estimator expects one
# column per entry of FEATURES. Any other feature is held constant at its
# median so that `soh_estimator.predict` accepts the grid.
GRID_FEATURES = ["voltage", "temperature"]

mins = processed_cluster[GRID_FEATURES].min().values
maxs = processed_cluster[GRID_FEATURES].max().values

# Create a 1D array for each grid feature using np.arange with a step of 1
voltage_range = np.arange(mins[0], maxs[0] + 1, 1)  # Add 1 to include the max
temperature_range = np.arange(mins[1], maxs[1] + 1, 1)

# Create a 2D grid of all combinations of voltage and temperature
voltage_grid, temperature_grid = np.meshgrid(voltage_range, temperature_range)

# Flatten the grids into an array of shape (n_points, len(FEATURES)), with the
# columns in the same order as FEATURES.
grid_columns = {"voltage": voltage_grid.ravel(), "temperature": temperature_grid.ravel()}
decision_boundry_input = np.column_stack([
    grid_columns[feature]
    if feature in grid_columns
    else np.full(voltage_grid.size, processed_cluster[feature].median())
    for feature in FEATURES
])


In [None]:
# Evaluate the fitted pipeline on the flattened grid and check the output shape.
# The input needs one column per feature the pipeline was fitted on.
z = soh_estimator.predict(decision_boundry_input)
z.shape

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.pipeline import Pipeline
from plotly.graph_objects import Figure

# Assume you already have FEATURES, processed_cluster, and soh_estimator defined.
# FEATURES is deliberately NOT reassigned here: the estimator was fitted on it,
# and rebinding it would silently change `estimate_soh` on a later re-run.
GRID_FEATURES = ["voltage", "temperature"]

# Step 1: Generate the grid of input points
mins = processed_cluster[GRID_FEATURES].min().values
maxs = processed_cluster[GRID_FEATURES].max().values

# Create a 1D array for each grid feature using np.arange with a step of 1
voltage_range = np.arange(mins[0], maxs[0] + 1, 1)  # Voltage range
temperature_range = np.arange(mins[1], maxs[1] + 1, 1)  # Temperature range

# Create a 2D grid of all combinations of voltage and temperature
voltage_grid, temperature_grid = np.meshgrid(voltage_range, temperature_range)

# Flatten the grids into an array of shape (n_points, len(FEATURES)). Features
# that are not part of the grid are held constant at their median so that the
# column count and order match what the estimator was fitted on.
grid_columns = {"voltage": voltage_grid.ravel(), "temperature": temperature_grid.ravel()}
decision_boundry_input = np.column_stack([
    grid_columns[feature]
    if feature in grid_columns
    else np.full(voltage_grid.size, processed_cluster[feature].median())
    for feature in FEATURES
])

# Step 2: Predict the z-values using the 'soh_estimator' pipeline
z_pred = soh_estimator.predict(decision_boundry_input)

# Step 3: Reshape the predicted values back into the grid shape
z_grid = z_pred.reshape(voltage_grid.shape)


def plt_3d_df(
        df: DF,
        x:str,
        y:str,
        z:str,
        color:str=None,
        opacity=0.5,
        colorscale='Rainbow',
        size=3,
        width=1500,
        height=1000,
        hover_name=None,
    ) -> Figure:
    """Build a 3D scatter figure of `df` with an orthographic camera.

    Args:
        df: Data to plot.
        x, y, z: Names of the columns mapped to the three axes.
        color: Optional column used to colour the markers.
        opacity: Marker opacity.
        colorscale: Continuous colour scale passed to plotly express.
        size: Marker size value, repeated for every row of `df`.
        width, height: Figure dimensions.
        hover_name: Optional column shown as hover title.

    Returns:
        The plotly figure, with marker outlines removed and grid lines shown
        on the three axes.
    """
    # Every point gets the same size value.
    marker_sizes = [size] * len(df)

    scene_layout = dict(
        # Orthographic projection keeps the projection consistent.
        camera=dict(projection=dict(type='orthographic')),
        # White z background; grid lines help with depth perception.
        zaxis=dict(backgroundcolor="white", showgrid=True, showspikes=False),
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True),
    )

    figure = px.scatter_3d(
        data_frame=df,
        x=x,
        y=y,
        z=z,
        color=color,
        opacity=opacity,
        width=width,
        height=height,
        hover_name=hover_name,
        size=marker_sizes,
        color_continuous_scale=colorscale,
    )
    figure.update_traces(marker=dict(line=dict(width=0)))
    figure.update_layout(scene=scene_layout)
    return figure

# Scatter of the measured energy_added for the rows with odometer <= 3000,
# coloured per vehicle id.
low_mileage_cluster = processed_cluster.query("odometer <= 3000")
fig = plt_3d_df(low_mileage_cluster, "voltage", "temperature", "energy_added", "id", opacity=1)

# Overlay the pipeline prediction over the voltage/temperature grid as a surface.
predicted_surface = go.Surface(
    x=voltage_grid,
    y=temperature_grid,
    z=z_grid,
    colorscale="Rainbow",
    showscale=False,
)
fig.add_trace(predicted_surface)

fig.show()


In [None]:
# List the columns available in the `charges` frame.
charges.columns

In [None]:
# Re-estimated soh, aggregated per charge (median odometer and soh, first id).
px.scatter(
    data_frame=reprocessed_cluster.groupby("charge_id").agg(
        {"odometer": "median", "soh": "median", "id": "first", "charge_id": "first"}
    ),
    x="odometer",
    y="soh",
    color="id",
)

In [None]:
# Original soh, aggregated per charge (median odometer and soh, first id).
px.scatter(
    data_frame=processed_cluster.groupby("charge_id").agg(
        {"odometer": "median", "soh": "median", "id": "first", "charge_id": "first"}
    ),
    x="odometer",
    y="soh",
    color="id",
)

Looking closely we can see that the prediction is actually better with current and more degrees of freedom.  
The difference is subtle however.  
Let's visualize with an arrow plot.  

In [None]:
# Per-charge aggregates (median odometer and soh, first id) before and after
# the re-estimation. Both frames are indexed by charge_id.
old_charges = processed_cluster.groupby("charge_id").agg({"odometer": "median", "soh":"median", "id":"first", "charge_id":"first"})
new_charges = reprocessed_cluster.groupby("charge_id").agg({"odometer": "median", "soh":"median", "id":"first", "charge_id":"first"})

# Both sets of points stacked in one frame.
old_and_new_charges = pd.concat((old_charges, new_charges))
# Long frame with, for each charge, three consecutive rows: the old soh, the
# new soh and an empty (NA) value, all sharing the old median odometer.
# NOTE(review): the NA row is presumably there to break the line between two
# charges when the frame is drawn as a single line trace -- confirm.
arrow_df = (
    old_charges
    # Aligned on the charge_id index.
    .assign(new_soh=new_charges['soh'])
    .assign(empty_col=pd.NA)
    .loc[:, ["odometer", "id", "soh", "new_soh", "empty_col"]]
    # Keep odometer and id as index levels so they survive the reshape.
    .set_index(["odometer", "id"], append=True)
    # Transpose then unstack: one row per (charge, value kind), in the column
    # order soh, new_soh, empty_col.
    .T
    .unstack()
    .to_frame()
    .rename(columns={0: "soh"})
    .reset_index()
)
arrow_df

In [None]:
MARKER_SIZE = 8

# Arrow-shaped markers oriented from the previous point of the trace.
arrow_marker = dict(
    symbol="arrow",
    color="royalblue",
    size=MARKER_SIZE,
    angleref="previous",
    standoff=MARKER_SIZE / 2,
)
arrow_trace = go.Scatter(
    x=arrow_df["odometer"],
    y=arrow_df["soh"],
    mode="markers+lines",
    marker=arrow_marker,
)

# Old and new per-charge soh as points, with the arrow trace on top.
fig = px.scatter(
    data_frame=old_and_new_charges,
    x="odometer",
    y="soh",
    color="id",
)
fig.add_trace(arrow_trace)
fig.show()

Great, now we have a (more) convenient way to visualize the impact of the changes on our soh estimation.  
Unfortunately, manually tweaking the soh estimation pipeline is too inefficient.  
Let's try to programmatically search for a better soh estimation through hyperparameter tuning.  
To do so we will need a reward/loss function.  
Since this is an unsupervised regression task, we will need to get creative...  
We will try to use the mean of the values returned by `scipy.stats.linregress` per vehicle (as we are (almost) sure that the soh should be monotonically decreasing per vehicle).  

In [None]:
# Focus on the vehicle with the most rows in the re-estimated cluster.
most_common_id = reprocessed_cluster["id"].value_counts().index[0]
# A boolean mask compares against the actual id value, so it works whatever the
# id dtype is and whatever characters the id contains (no string interpolation
# into a query expression).
single_vehicle_cluster = reprocessed_cluster.loc[reprocessed_cluster["id"] == most_common_id]

plt_3d_df(single_vehicle_cluster, "soc", "current", "energy_added", "charge_id").show()
px.box(
    data_frame=single_vehicle_cluster,
    x="odometer",
    y="soh",
)

In [None]:
# Attributes of the linregress result to report.
LINREGRESS_FIELDS = [
    "slope",
    "intercept",
    "rvalue",
    "pvalue",
    "stderr",
    "intercept_stderr",
]

# Linear fit of the soh against the odometer for the selected vehicle.
test = linregress(single_vehicle_cluster["odometer"], single_vehicle_cluster["soh"])

# Display the fitted values by name (a bare list of names would only echo the
# names themselves).
{field: getattr(test, field) for field in LINREGRESS_FIELDS}