# SoH estimation improvement
We received a "ground truth" dataset from Ayven's Sloan containing the Aviloo score and the (Tesla's?) SoH readout for a few Teslas.   
Some cars show a large difference between the BIB SoH and the SoH readout.  

The goal of this notebook is to improve the SoH estimation by using this ground truth dataset. 

## Setup

In [None]:
# Create the local `data_cache` directory that the cached extraction cells below read from and write to (`-p`: no error if it already exists).
! mkdir -p data_cache

### Imports

In [None]:
import plotly.express as px

from core.stats_utils import *
from core.sql_utils import *
from core.pandas_utils import *
from core.plt_utils import *
from core.caching_utils import cache_result
from transform.fleet_info.main import fleet_info
from transform.processed_tss.ProcessedTimeSeries import ProcessedTimeSeries

### Data extraction

My computer's performance is just enough to run the code below, so I need to cache the results to avoid losing too much time. 

In [None]:
@cache_result("data_cache/tesla_tss.parquet", on='local_storage')
def get_tss() -> DF:
    """Return the processed time series for the ``"tesla"`` make.

    The call is wrapped by ``cache_result``, configured with the
    ``data_cache/tesla_tss.parquet`` path on ``local_storage``.
    """
    tesla_tss = ProcessedTimeSeries("tesla")
    return tesla_tss

@cache_result("data_cache/tesla_charges.parquet", on='local_storage')
def get_charges() -> DF:
    """Aggregate the Tesla time series into one row per charging session.

    Rows where ``trimmed_in_charge`` is true are grouped by ``vin`` and
    ``trimmed_in_charge_idx`` and summarised. A per-charge ``soh`` is then
    derived as the energy added divided by the energy implied by the SoC
    change (``soc_diff / 100 * capacity``).

    Returns:
        One row per ``(vin, trimmed_in_charge_idx)`` pair holding the
        aggregated columns plus ``soh`` and ``model_version``.
    """
    # Output column -> (source column, aggregation). The insertion order of
    # this mapping is the column order of the aggregated frame.
    charge_aggregations = {
        "energy_added": ("charge_energy_added", series_start_end_diff),
        "soc_diff": ("soc", series_start_end_diff),
        "soc_start": ("soc", "first"),
        "soc_end": ("soc", "last"),
        "inside_temp": ("inside_temp", "mean"),
        "outside_temp": ("outside_temp", "mean"),
        "capacity": ("capacity", "first"),
        "odometer": ("odometer", "first"),
        "fast_charger_type": ("fast_charger_type", "first"),
        "size": ("soc", "size"),
        "model": ("model", "first"),
        "version": ("version", "first"),
        "date": ("date", "first"),
        "charge_rate": ("charge_rate", "median"),
        "fast_charger_present": ("fast_charger_present", "median"),
        "charge_current_request": ("charge_current_request", "median"),
        "tesla_code": ("tesla_code", "first"),
        "battery_heater": ("battery_heater", "median"),
    }

    # Kept as a single chain so intermediate frames are not held in memory.
    return (
        get_tss()
        # Drop the fleet columns already present on the time series (all of
        # ``fleet_info`` except the ``vin`` join key) and re-attach them from
        # ``fleet_info`` itself.
        .drop(columns=fleet_info.drop(columns=["vin"]).columns)
        .merge(fleet_info, on="vin", how="left")
        .query("trimmed_in_charge")
        .groupby(["vin", "trimmed_in_charge_idx"])
        .agg(**charge_aggregations)
        .reset_index()
        .eval("soh = energy_added / (soc_diff / 100.0 * capacity)")
        .eval("model_version = model + version")
    )

In [None]:
# Load the ground-truth sheet, forcing the dtype of every column listed here.
ground_truth = pd.read_csv(
    "data_cache/ground_truth.csv",
    dtype={
        "VIN": "string",
        "Brand (FlashTest)": "string",
        "Model Group (FlashTest)": "string",
        "Score Aviloo": "int64",
        "SoH Readout": "float64",
        "BIB SOH": "float64",
        "Mileage": "float64",
    },
)
# Rename the join key to `vin` and the readout to `ground_truth_soh`, the
# names the rest of the notebook uses.
ground_truth = ground_truth.rename(
    columns={"VIN": "vin", "SoH Readout": "ground_truth_soh"}
)

In [None]:
@cache_result("data_cache/tesla_sub_tss.parquet", on='local_storage')
def get_sub_tss() -> DF:
    """Return the time-series rows whose ``vin`` appears in ``ground_truth``."""
    ground_truth_only = get_tss().query("vin in @ground_truth.vin")
    return ground_truth_only

In [None]:
# Time series restricted to the ground-truth vehicles.
# `force_update=False` is forwarded to the cached function — presumably it reuses an existing cache file; confirm in `cache_result`.
tss = get_sub_tss(force_update=False)

In [None]:
# Read the per-charge table straight from the parquet cache file.
# NOTE(review): nothing visible in this notebook calls `get_charges()`, so the file must already exist on disk — confirm, or call `get_charges()` here instead.
charges = pd.read_parquet("data_cache/tesla_charges.parquet")

In [None]:
# Count the vehicles that have at least one charge with a known capacity
# (True) versus the vehicles that have none (False).
# Reducing each vehicle with `any()` avoids picking the "True" column by
# position after an unstack, which raises IndexError whenever the data holds
# only nulls or only non-nulls (a single column exists in that case).
(
    charges["capacity"]
    .notna()
    .groupby(charges["vin"])
    .any()
    .value_counts()
)

In [None]:
# Inspect the column dtypes of the per-charge table.
charges.dtypes

In [None]:
# Inspect the column dtypes of the ground-truth time-series subset.
tss.dtypes

## Ground truth and current SoH estimation comparison

In [None]:
# Display the ground-truth table as loaded from the CSV.
ground_truth

In [None]:
# Fleet metadata rows for the vehicles present in the ground-truth table.
fleet_info.query("vin in @ground_truth.vin")

In [None]:
# Attach each vehicle's ground-truth SoH to its charges, divide it by 100
# (presumably percent -> fraction, matching `soh` — confirm), then compute the
# signed and absolute residuals of the estimate.
charges: DF = (
    left_merge(
        charges,
        ground_truth,
        left_on="vin",
        right_on="vin",
        src_dest_cols=["ground_truth_soh"],
    )
    .eval("ground_truth_soh = ground_truth_soh / 100.0")
    .eval("soh_residual = ground_truth_soh - soh")
    .eval("abs_soh_residual = soh_residual.abs()")
)
charges

Now that we have a ground truth, we will try to identify the factors that explain the difference between the SoH estimation and the ground truth.  

## SoH estimation EDA
It has been some time since I've worked on the SoH estimation, so let's see if there is any improvement that can be made.  

In [None]:
# Number of distinct vehicles left after dropping the charges where BOTH `date` and `soh` are missing (`how="all"`).
# NOTE(review): if the intent is "has a date and an SoH", `how="any"` would be needed — confirm.
charges.dropna(subset=["date", "soh"], how="all")['vin'].nunique()

In [None]:
# Per-charge SoH against `inside_temp` for charges with soc_diff > 20 and a
# non-null SoH, coloured by vehicle, with one OLS trend line over all points.
# Filters tried during exploration and currently disabled:
# soh.between(0.75, 1.2), sample(n=5000), tesla_code == 'MT311'.
px.scatter(
    (
        charges
        .query("soc_diff > 20")
        .dropna(subset=["soh"])
        .eval("outside_to_inside_temp_diff = outside_temp - inside_temp")
        .eval("tesla_code = tesla_code.fillna('Unknown')")
    ),
    x="inside_temp",
    y="soh",
    color="vin",
    opacity=0.5,
    trendline="ols",
    trendline_scope="overall",
).update_layout(autosize=False, width=750, height=750, showlegend=False)

We can see that there is a small correlation between the SoH estimation and the inside temperature.  
This is most likely due to the heating system running, which makes the battery lose energy while charging, so the energy added is higher for the same SoC difference.  
However, the SoH is actually higher when the inside temperature is lower.    

In [None]:
# Training subset: charges with an SoH between 0.75 and 1.2 and soc_diff > 20.
# Inside `query`, `&` has the precedence of `and`, so the filter reads
# `soh.between(0.75, 1.2) and (soc_diff > 20)`.
train_charges = charges.query("soh.between(0.75, 1.2) & soc_diff > 20")
# Mean SoH of the subset; the next cell adds it back after removing the temperature trend.
mean_soh = train_charges["soh"].mean()
# Fit of `soh` on `inside_temp`; the next cell reads its 'slope' and 'intercept' entries.
inside_temp_soh_lr = lr_params_as_series(train_charges, "inside_temp", "soh")
inside_temp_soh_lr

In [None]:
# Remove the fitted temperature trend from each SoH and add back the training
# mean: offseted_soh = soh - (slope * inside_temp + intercept) + mean_soh.
train_charges = (
    train_charges
    .eval("soh_offset_pred = inside_temp * @inside_temp_soh_lr['slope'] + @inside_temp_soh_lr['intercept']")
    .eval("offseted_soh = soh - soh_offset_pred + @mean_soh")
)
# Same scatter as before, on the corrected SoH, with one OLS trend line over all points.
px.scatter(
    train_charges,
    x="inside_temp",
    y="offseted_soh",
    trendline="ols",
    trendline_scope="overall",
    color="vin",
)

In [None]:
# Distribution of (corrected SoH - ground truth) per `tesla_code`, limited to
# the vehicles present in the ground-truth table.
px.box(
    train_charges.query("vin in @ground_truth.vin").eval("new_residual_soh = offseted_soh - ground_truth_soh"),
    x="tesla_code",
    y="new_residual_soh",
    color="tesla_code",
)


In [None]:
# Keep only the vehicles with a ground truth, then compute signed and absolute
# residuals for both the corrected SoH (`new_*`) and the raw SoH.
train_charges = (
    train_charges
    .query("vin in @ground_truth.vin")
    .eval("new_residual_soh = offseted_soh - ground_truth_soh")
    .eval("residual_soh = soh - ground_truth_soh")
    .assign(
        abs_new_residual_soh=lambda df: df["new_residual_soh"].abs(),
        abs_residual_soh=lambda df: df["residual_soh"].abs(),
    )
)
# Summary statistics of the two absolute residuals, side by side.
train_charges[["abs_new_residual_soh", "abs_residual_soh"]].describe()

In [None]:
# Per-vehicle summary: number of charges, median/mean of the raw and corrected
# SoH, and the residual of each median against the ground truth.
# `abs_residual_improvement` is positive when the corrected median is closer
# to the ground truth than the raw median.
agg_train_charges = (
    train_charges
    .groupby("vin")
    .agg(
        nb_charges=("capacity", "size"),
        soh_median=("soh", "median"),
        soh_mean=("soh", "mean"),
        offseted_soh_median=("offseted_soh", "median"),
        offseted_soh_mean=("offseted_soh", "mean"),
    )
    .reset_index()
    .pipe(
        left_merge,
        ground_truth,
        left_on="vin",
        right_on="vin",
        src_dest_cols={
            "ground_truth_soh": "ground_truth_soh",
            "Mileage": "ground_truth_odometer",
        },
    )
    # The ground-truth table is merged in afresh here, so the readout is
    # divided by 100 again before comparing it with the SoH medians.
    .eval("ground_truth_soh = ground_truth_soh / 100.0")
    .eval("offseted_median_residual_soh = offseted_soh_median - ground_truth_soh")
    .eval("median_residual_soh = soh_median - ground_truth_soh")
    .assign(
        abs_offseted_median_residual_soh=lambda df: df["offseted_median_residual_soh"].abs(),
        abs_median_residual_soh=lambda df: df["median_residual_soh"].abs(),
    )
    .sort_values("nb_charges", ascending=False)
    .eval("abs_residual_improvement = abs_median_residual_soh - abs_offseted_median_residual_soh")
)
display(agg_train_charges)
display(agg_train_charges.describe())

In [None]:
# Per-charge absolute residuals of the corrected and the raw SoH, by vehicle.
# The corrected-SoH column is created as `abs_new_residual_soh` earlier in the
# notebook; selecting `new_abs_residual_soh` raised a KeyError.
train_charges[["vin", "abs_new_residual_soh", "abs_residual_soh"]]

In [None]:
# NOTE(review): unfinished cell — `[""]` selects a column named "" from the
# groupby; no such column is created anywhere in this notebook, so this
# presumably raises a KeyError. Decide what to plot per vehicle, or remove the cell.
px.scatter(
    train_charges.groupby("vin")[""],
)

In [None]:
# Display the training charges with the residual columns added above.
train_charges

In [None]:
import matplotlib.pyplot as plt

# One panel per vehicle: per-charge SoH against odometer, with the
# ground-truth readout overlaid in red.
unique_vins = train_charges["vin"].unique()

# `squeeze=False` always returns a 2-D array of axes. Without it a single
# vehicle yields one bare Axes object, which cannot be zipped over.
fig, axes = plt.subplots(
    len(unique_vins), 1, figsize=(10, 15), sharex=False, squeeze=False
)

for ax, vin in zip(axes[:, 0], unique_vins):
    vin_data = train_charges[train_charges["vin"] == vin]
    ground_truth_data = ground_truth[ground_truth["vin"] == vin]

    # The plotted series is the raw `soh`, so it is labelled as such (it was
    # previously labelled "offseted_soh").
    ax.scatter(vin_data["odometer"], vin_data["soh"], label=f"{vin} - soh")
    # To show the temperature-corrected estimate as well, uncomment:
    # ax.scatter(vin_data["odometer"], vin_data["offseted_soh"], label=f"{vin} - offseted_soh")
    # `ground_truth` still holds the raw readout, so it is divided by 100 here,
    # as is done wherever else the notebook compares it with `soh`.
    ax.scatter(
        ground_truth_data["Mileage"],
        ground_truth_data["ground_truth_soh"] / 100,
        label=f"{vin} - ground_truth_soh",
        color='red',
    )

    ax.set_title(f"VIN: {vin}")
    ax.set_ylabel("SOH")
    ax.legend()
    ax.set_xlabel("Odometer")

plt.tight_layout()
plt.show()

In [None]:
# Inspect the column dtypes of the ground-truth table.
ground_truth.dtypes

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Vehicles in a fixed order. The same list is given to Plotly Express as the
# facet order so that each vehicle's subplot row can be computed below.
facet_vins = list(train_charges["vin"].unique())
n_rows = len(facet_vins)

# One facet row per vehicle: corrected SoH against odometer.
fig = px.scatter(
    train_charges,
    x="odometer",
    y="offseted_soh",
    color="vin",
    facet_row="vin",
    category_orders={"vin": facet_vins},
    facet_row_spacing=0.03,
    facet_col_spacing=0.03,
)

# Overlay each vehicle's ground-truth readout on its own facet.
for position, vin in enumerate(facet_vins):
    extra_points = ground_truth[ground_truth["vin"] == vin]

    fig.add_trace(
        go.Scatter(
            x=extra_points["Mileage"],
            # `ground_truth` holds the raw readout; divide by 100 as the rest
            # of the notebook does before comparing it with an SoH estimate.
            y=extra_points["ground_truth_soh"] / 100,
            mode="markers",
            marker=dict(color="red", size=10),
            name=f"Extra points for {vin}",
        ),
        # Plotly Express lays facet rows out from the bottom, with the first
        # category at the top, so the first vehicle sits on row `n_rows`.
        # NOTE(review): confirm visually that each red point lands on its own
        # vehicle's facet.
        row=n_rows - position,
        col=1,
    )

fig.update_layout(height=1500)
fig.show()