# SoH estimation improvement
We received a "ground truth" dataset from Ayven's Sloan containing the Aviloo score and the (Tesla's?) SoH readout of a few Teslas.  
Some cars show a large difference between the BIB SoH and the SoH readout.  

The goal of this notebook is to improve the SoH estimation by using this ground truth dataset. 

## Setup

In [None]:
! mkdir -p data_cache

### Imports

In [None]:
import plotly.express as px

from core.sql_utils import *
from core.pandas_utils import *
from core.plt_utils import *
from core.caching_utils import cache_result
from transform.fleet_info.main import fleet_info
from transform.processed_tss.ProcessedTimeSeries import ProcessedTimeSeries

### Data extraction

My computer is only just powerful enough to run the code below, so I cache the results to avoid losing too much time.

In [None]:
@cache_result("data_cache/tesla_tss.parquet", on='local_storage')
def get_tss() -> DF:
    """Return the processed time series built by ``ProcessedTimeSeries("tesla")``.

    The call is wrapped by ``cache_result`` with the path
    ``data_cache/tesla_tss.parquet``; presumably the result is stored there
    and reused on later calls (confirm against ``cache_result``).
    """
    tesla_tss = ProcessedTimeSeries("tesla")
    return tesla_tss

#@cache_result("data_cache/tesla_charges.parquet", on='local_storage')
def get_charges(tss:DF) -> DF:
    """Aggregate in-charge samples into one row per charge and estimate the SoH.

    Args:
        tss: Time series containing at least ``vin``, ``trimmed_in_charge``,
            ``trimmed_in_charge_idx`` and the aggregated columns listed below,
            plus every ``fleet_info`` column (they are dropped and re-merged).

    Returns:
        One row per ``(vin, trimmed_in_charge_idx)`` with the aggregated
        columns, plus ``soh = energy_added / (soc_diff / 100.0 * capacity)``
        and ``model_version = model + version``.
    """
    # Drop every fleet_info column except the join key, so that the merge
    # re-attaches fleet_info's values without creating suffixed duplicates.
    fleet_columns = fleet_info.drop(columns=["vin"]).columns
    charging_samples = (
        tss
        .drop(columns=fleet_columns)
        .merge(fleet_info, on="vin", how="left")
        .query("trimmed_in_charge")
    )

    # Named aggregation written as (column, aggfunc) tuples.
    # series_start_end_diff presumably returns last - first of the series;
    # confirm against its definition.
    per_charge = (
        charging_samples
        .groupby(["vin", "trimmed_in_charge_idx"])
        .agg(
            energy_added=("charge_energy_added", series_start_end_diff),
            soc_diff=("soc", series_start_end_diff),
            soc_start=("soc", "first"),
            soc_end=("soc", "last"),
            temp=("inside_temp", "mean"),
            capacity=("capacity", "first"),
            odometer=("odometer", "first"),
            fast_charger_type=("fast_charger_type", "first"),
            size=("soc", "size"),
            model=("model", "first"),
            version=("version", "first"),
            date=("date", "first"),
            charge_rate=("charge_rate", "median"),
            fast_charger_present=("fast_charger_present", "median"),
            charge_current_request=("charge_current_request", "median"),
            tesla_code=("tesla_code", "first"),
        )
        .reset_index(drop=False)
    )

    # soc_diff is divided by 100 before scaling the capacity (presumably soc
    # is expressed in percent, TODO confirm).
    return (
        per_charge
        .eval("soh = energy_added / (soc_diff / 100.0 * capacity)")
        .eval("model_version = model + version")
    )

In [None]:
# Load the ground-truth CSV with explicit column dtypes.
# NOTE(review): a plain "int64" column cannot hold missing values, so this
# read fails if "Score Aviloo" has blank cells - confirm the file is complete.
ground_truth = pd.read_csv(
    "data_cache/ground_truth.csv",
    dtype={
        "Score Aviloo": "int64",
        "SoH Readout": "float64",
        "VIN": "string",
        "BIB SOH": "float64",
        "Brand (FlashTest)": "string",
        "Model Group (FlashTest)": "string",
        "Mileage": "float64",
    },
)
# Rename to the column names the rest of the notebook uses.
ground_truth = ground_truth.rename(
    columns={"VIN": "vin", "SoH Readout": "ground_truth_soh"}
)

In [None]:
@cache_result("data_cache/tesla_sub_tss.parquet", on='local_storage')
def get_sub_tss() -> DF:
    """Return the rows of ``get_tss()`` whose vin appears in ``ground_truth.vin``.

    NOTE(review): the result is wrapped by ``cache_result`` under a fixed
    path; presumably a stale cache must be removed by hand when
    ``ground_truth`` changes - confirm against ``cache_result``.
    """
    all_tss = get_tss()
    return all_tss.query("vin in @ground_truth.vin")

In [None]:
# Per-charge table restricted to the vehicles present in the ground-truth dataset.
charges = get_charges(get_sub_tss())

In [None]:
# Keep the ground-truth subset of the time series under the name `tss` for the cells below.
tss = get_sub_tss()

## Ground truth and current SoH estimation comparison

In [None]:
# Display the ground-truth table loaded from data_cache/ground_truth.csv.
ground_truth

In [None]:
# Show the fleet_info rows of the vehicles present in the ground-truth dataset.
fleet_info.query("vin in @ground_truth.vin")

In [None]:
# Attach the ground-truth SoH to every charge and derive the residuals.
# left_merge presumably copies the src_dest_cols columns from ground_truth
# onto the rows with a matching vin - confirm against its definition.
charges:DF = (
    left_merge(
        get_charges(tss),
        ground_truth,
        left_on="vin",
        right_on="vin",
        src_dest_cols=["ground_truth_soh"],
    )
    # Divide the readout by 100 so it can be compared with the estimated soh.
    .eval("ground_truth_soh = ground_truth_soh / 100.0")
    # Signed residual, then its magnitude.
    .eval("soh_residual = ground_truth_soh - soh")
    .eval("abs_soh_residual = soh_residual.abs()")
)
charges

Now that we have a ground truth, we will try to identify the factors that explain the difference between the SoH estimation and the ground truth.  

## SoH estimation residual correlation EDA
Let's try to find the factors that explain the difference between the SoH estimation and the ground truth.  

In [None]:
# Summary statistics of the SoH estimate and its residuals for one hard-coded VIN.
charges.query("vin == '5YJ3E7EA6LF558840'")[["soh", "ground_truth_soh", "soh_residual", "abs_soh_residual"]].describe()

In [None]:
# Collapse the per-charge table to one row per vehicle.
charges1 = charges.groupby("vin").agg({
    'soh': 'mean',
    'soh_residual': 'mean',
    'abs_soh_residual': 'mean',
    'date': 'last',
    'tesla_code': 'first',
    # Presumably kept so that vin is also available as a regular column,
    # in addition to being the group index.
    'vin': 'first',
    # NOTE(review): 'count' makes this column the number of non-null capacity
    # values per vin, not a battery capacity - the resulting name is misleading.
    'capacity': 'count'
})

In [None]:
# Box plot of the SoH residual for each capacity value, coloured by tesla_code,
# with every individual charge drawn as a point.
px.box(
    charges,
    points="all",
    x="capacity",
    y="soh_residual",
    color="tesla_code",
)

In [None]:
# Correlation of every numeric column with the residuals and the estimated SoH,
# restricted to charges whose absolute residual is below 0.1.
(
    charges
    .query("abs_soh_residual < 0.1")
    .corr(numeric_only=True)
    .loc[:, ["soh_residual", "abs_soh_residual", "soh"]]
)

In [None]:
# Per-vehicle scatter of the mean residual, for vehicles whose mean absolute
# residual is below 0.1.
# NOTE(review): in charges1 the `capacity` column is a per-vin count (it is
# aggregated with 'count'), so the x axis is not a battery capacity.
px.scatter(
    charges1.query("abs_soh_residual < 0.1"),
    x="capacity",
    y="soh_residual",
    # trendline="ols",
    hover_data=["vin"],
).update_layout(
    autosize=False,
    width=750,
    height=750,
)

In [None]:
# SoH residual against the mean inside temperature of each charge
# (absolute residual below 0.1), with an OLS trendline.
px.scatter(
    charges.query("abs_soh_residual < 0.1"),
    x="temp",
    y="soh_residual",
    trendline="ols",
).update_layout(
    autosize=False,
    width=750,
    height=750,
)

In [None]:
# SoH residual against the median charge_current_request of each charge
# (absolute residual below 0.1), with an OLS trendline.
px.scatter(
    charges.query("abs_soh_residual < 0.1"),
    x="charge_current_request",
    y="soh_residual",
    trendline="ols",
).update_layout(
    autosize=False,
    width=750,
    height=750,
)

In [None]:
# SoH residual against the number of samples in each charge (`size`)
# (absolute residual below 0.1), with an OLS trendline.
px.scatter(
    charges.query("abs_soh_residual < 0.1"),
    x="size",
    y="soh_residual",
    trendline="ols",
).update_layout(
    autosize=False,
    width=750,
    height=750,
)


In [None]:
# SoH residual against the SoC difference covered by each charge
# (absolute residual below 0.1), with an OLS trendline.
px.scatter(
    charges.query("abs_soh_residual < 0.1"),
    x="soc_diff",
    y="soh_residual",
    trendline="ols",
).update_layout(
    autosize=False,
    width=750,
    height=750,
)

### Conclusion

Unfortunately, there doesn't seem to be an obvious/simple pattern that explains the difference between the SoH estimation and the ground truth.  

## SoH oscillation

In [None]:
# Recompute the per-charge table on the full Tesla time series; this replaces
# the ground-truth-only `charges` (and its residual columns) used above.
charges = get_charges(get_tss())

In [None]:
# (rows, columns) of the charge table once rows missing a date or an SoH estimate are dropped.
charges.dropna(subset=["date", "soh"]).shape

In [None]:
# Share of non-null values in each fleet_info column, for Tesla vehicles only.
(
    fleet_info
    .query("make == 'tesla'")
    .pipe(lambda teslas: teslas.count() / teslas.shape[0])
)

In [None]:
# Flag, for each distinct VIN in charges, whether it exists in the `vehicle` SQL table.
vin_in_vehicle_table = (
    charges["vin"]
    .drop_duplicates(keep="first")
    # Only the vin column is needed for the membership test, so select it at
    # read time instead of loading every column of the table.
    .isin(pd.read_sql_table("vehicle", con, columns=["vin"])["vin"])
)
# Number of VINs found (True) and not found (False) in the vehicle table.
vin_in_vehicle_table.value_counts()

In [None]:
# VINs that appear in charges but not in the vehicle table.
vins_missing = (
    charges["vin"]
    .drop_duplicates(keep="first")
    # The mask was built from this same de-duplicated series, so its index
    # lines up with the rows selected here.
    .loc[~vin_in_vehicle_table]
)

In [None]:
# Share of non-null values in each column of charges.
charges.count() / charges.shape[0]

In [None]:
# Estimated SoH against the mean inside temperature, one colour per vehicle,
# for charges with soc_diff above 20 that have both a date and an SoH.
px.scatter(
    (
        charges
        .query("soc_diff > 20")
        # NOTE(review): floored_date is computed here but is not referenced
        # by any argument of this plot.
        .dropna(subset=["date", "soh"]).eval("floored_date = date.dt.floor('D')")
    ),
    x="temp",
    y="soh",
    opacity=0.5,
    color="vin",
).update_layout(
    autosize=False,
    width=750,
    height=750,
)


In [None]:
# Distribution of the SoH estimate per temperature bin: temp is floored to a
# multiple of 5, and every charge is also drawn as a point.
px.box(
    charges.query("soc_diff > 20").dropna(subset=["date", "soh"]).eval("floored_temp = (temp // 5) * 5"),
    points="all",
    x="floored_temp",
    y="soh",
    #color="vin",
)