# Ituran Second Response charging points SoH estimation

In this notebook we will handle the processed time series data to compute the SoH from the charging points (like we did with Watea).  
This corresponds to the result/soh_estimation step in our pipeline.  

## Setup
Please run the `ituran_second_response_tss_EDA.ipynb` notebook before running this one.  

### Import

In [None]:
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import plotly.express as px
import plotly.graph_objects as go
from plotly.colors import qualitative

from core.pandas_utils import *
from core.plt_utils import plt_3d_df

### Data Extraction

We will simply load the processed time series data computed in the previous notebook.

In [None]:
# Processed time series cached by the previous (EDA) notebook.
TSS_CACHE_PATH = "./data_cache/ituran_tss.parquet"
tss = pd.read_parquet(TSS_CACHE_PATH)

## SoH estimation  
To estimate the SoH, we will use the charging points' energy_added.  
In this context a charging point is a statistical description of a period of charging from one soc to the next.  
We consider the energy added to be the energy that the battery required to gain that soc.  
We will estimate/express the SoH of the battery as the energy required by the battery to gain a certain soc divided by the energy required by a 100%SoH battery to gain the same soc.  
Since the added energy that a battery requires to gain a certain soc depends on multiple factors, we will try to capture as many factors as possible.  
To estimate the SoH all we really need to estimate is the energy required by a 100%SoH battery to gain a certain soc based on the charging point's factors.  
Then we just need to divide the energy required by the battery to gain a certain soc by the estimated energy required by a 100%SoH battery to gain the same soc in the same conditions.  

### Charging points SoH

#### Data extraction

In [None]:
def compute_first_charge_soc(tss:DF) -> DF:
    """Return ``tss`` with a ``first_charge_soc`` column added.

    For every ``(vehicle_id, trimmed_in_charge_idx)`` group the first non-null
    ``soc`` of the group is broadcast to all of its rows. Rows where
    ``trimmed_in_charge`` is falsy get a missing value instead.

    The input frame is left untouched: a new frame is returned, so the
    function can be used safely inside a ``.pipe`` chain.

    Args:
        tss: Time series with at least the ``vehicle_id``,
            ``trimmed_in_charge_idx``, ``soc`` and ``trimmed_in_charge`` columns.

    Returns:
        A new DataFrame with the extra ``first_charge_soc`` column.
    """
    first_soc_of_group = (
        tss
        .groupby(["vehicle_id", "trimmed_in_charge_idx"])
        ["soc"]
        .transform("first")
    )
    # Only rows flagged as charging keep a starting SoC.
    masked_first_soc = first_soc_of_group.where(tss["trimmed_in_charge"], pd.NA)
    return tss.assign(first_charge_soc=masked_first_soc)

# Step passed to `floor_to` when binning the SoC into charging points
# (presumably expressed in SoC percentage points -- TODO confirm).
CHARGING_POINT_QUANTIZATION = 3

def floor_soc(tss:DF) -> DF:
    """Return a copy of ``tss`` with a ``floored_soc`` column.

    ``floored_soc`` is the result of the project helper ``floor_to`` applied to
    the ``soc`` column with ``CHARGING_POINT_QUANTIZATION`` as its second
    argument (presumably flooring to the nearest lower multiple -- confirm
    against ``core.pandas_utils``).
    """
    quantized_soc = floor_to(tss["soc"], CHARGING_POINT_QUANTIZATION)
    return tss.assign(floored_soc=quantized_soc)

# Build one row ("charging point") per vehicle, charge and floored SoC bin for
# the 'geometry c' model, keeping only the rows flagged as charging.
charging_points:DF = (
    tss
    .pipe(compute_first_charge_soc)
    .pipe(floor_soc)
    .query("vehicle_model == 'geometry c' & trimmed_in_charge")
    .groupby(["vehicle_id", "trimmed_in_charge_idx", "floored_soc"], as_index=False, observed=True)
    .agg(
        energy_added_at_start=pd.NamedAgg(column="cum_energy_added", aggfunc="first"),
        energy_added_at_end=pd.NamedAgg(column="cum_energy_added", aggfunc="last"),
        ac_mode_mean=pd.NamedAgg(column="charging_ac_mode", aggfunc="mean"),
        dc_mode_mean=pd.NamedAgg(column="charging_dc_mode", aggfunc="mean"),
        current=pd.NamedAgg(column="ffilled_charging_current", aggfunc="median"),
        voltage=pd.NamedAgg(column="ffilled_charging_voltage", aggfunc="median"),
        estimated_range=pd.NamedAgg(column="ffilled_estimated_range", aggfunc="median"),
        time_remaining_for_charge=pd.NamedAgg(column="ffilled_time_remaining_for_charge", aggfunc="median"),
        model=pd.NamedAgg(column="vehicle_model", aggfunc="first"),
        first_charge_soc=pd.NamedAgg(column="first_charge_soc", aggfunc="first"),
        duration=pd.NamedAgg(column="date", aggfunc=series_start_end_diff),
        date=pd.NamedAgg(column="date", aggfunc="first"),
        nb_points=pd.NamedAgg(column="soc", aggfunc="size"),
        odometer=pd.NamedAgg(column="odometer", aggfunc="first"),
    )
    .rename(columns={"floored_soc": "soc"})
    # Energy gained inside the bin: last minus first cumulative energy.
    .eval("energy_added=energy_added_at_end - energy_added_at_start")
    # SoC gained since the beginning of the charge this bin belongs to.
    .eval("soc_added = soc - first_charge_soc")
    .eval("power = current * voltage")
    # A bin counts as AC (resp. DC) when the mean of the mode flag exceeds 0.3.
    .eval("in_ac = ac_mode_mean > 0.3")
    .eval("in_dc = dc_mode_mean > 0.3")
    .eval("sec_duration = duration.dt.total_seconds()")
    .eval("energy_added_per_point = energy_added / nb_points")
)

sanity_check(charging_points)

#### EDA

In [None]:
# Absolute correlation of every numeric column with `energy_added`, strongest
# first; the leading entry is skipped (expected to be `energy_added` itself).
(
    charging_points
    .corr(numeric_only=True)["energy_added"]
    .abs()
    .sort_values(ascending=False)
    .iloc[1:]
)

In [None]:
# For each `nb_points` value: how many charging points have it and how many
# distinct vehicles they come from.
points_by_size = charging_points.groupby("nb_points")
pd.concat(
    [
        points_by_size["energy_added"].count().sort_values(ascending=False),
        points_by_size["vehicle_id"].nunique(),
    ],
    axis=1,
)

In [None]:
# Energy added against the number of samples of each charging point,
# restricted to the 26e3..245e3 energy_added band, coloured by vehicle.
energy_band_points = charging_points.query("energy_added.between(26e3, 245e3)")
px.scatter(energy_band_points, x="nb_points", y="energy_added", color="vehicle_id")

In [None]:
# Number of charging points per vehicle, most represented vehicle first.
charging_points["vehicle_id"].value_counts()

In [None]:
# Charging points of vehicle 27 above 13e3 energy_added: date x duration x
# energy_added (log scale), coloured by the number of samples.
vehicle_27_points = charging_points.query("vehicle_id == 27 & energy_added > 13e3")
plt_3d_df(
    vehicle_27_points,
    x="date",
    y="duration",
    z="energy_added",
    color="nb_points",
    log_z=True,
)

In [None]:
# DC charging points in the 26e3..245e3 energy_added band:
# power x soc x energy_added (log scale), coloured by vehicle.
# Optional narrower filters to chain on the selection below:
#   .query("nb_points == 4")
#   .query("nb_points.between(3, 6)")
dc_points = (
    charging_points
    .query("energy_added.between(26e3, 245e3)")
    .query("in_dc")
)
# Optional extra keyword arguments: log_y=True, symbol="in_dc"
plt_3d_df(
    dc_points,
    x='power',
    y="soc",
    z="energy_added",
    color="vehicle_id",
    opacity=0.25,
    size=3,
    width=1500,
    height=1000,
    log_z=True,
)

We can see a clear correlation between the charging points' energy added and their "soc_added" and "power" features.  
However, the relation seems to be some sort of modulo relation then a polynomial relation.  
This is either due to some error in the tss processing or an actual physical phenomenon.  

In [None]:
# DC charging points in the 26e3..100e3 energy_added band.
# Optional extra filter: .query("vehicle_id == 27")
charging_points_to_plot = (
    charging_points
    .query("energy_added.between(26e3, 100e3)")
    .query("in_dc")
)

# Columns mapped onto the three axes.
X = 'sec_duration'
Y = 'soc'
Z = 'energy_added'

# One 3D line per (vehicle, charge) pair.
fig = go.Figure()
fig.add_traces([
    go.Scatter3d(
        x=charge_points[X],
        y=charge_points[Y],
        z=charge_points[Z],
        mode='lines',
        name=f'Group {charge_idx} (vehicle_id: {vid})',
    )
    for (vid, charge_idx), charge_points
    in charging_points_to_plot.groupby(["vehicle_id", "trimmed_in_charge_idx"])
])

# Log-scaled z axis, orthographic camera, fixed canvas size.
fig.update_layout(
    title="Charges time series",
    scene={
        "xaxis_title": X,
        "yaxis_title": Y,
        "zaxis_title": Z,
        "zaxis": {"type": 'log'},
        "camera": {"projection": {"type": 'orthographic'}},
    },
    width=1500,
    height=1000,
    showlegend=True,
)

fig.show()

In [None]:
# DC charging points in the 26e3..100e3 energy_added band that are made of
# exactly 4 samples.
charging_points_to_plot = (
    charging_points
    .query("energy_added.between(26e3, 100e3)")
    .query("in_dc")
    .query("nb_points == 4")
)

# Columns mapped onto the three axes.
X = 'power'
Y = 'soc_added'
Z = 'energy_added'

# One 3D line per (vehicle, charge) pair.
fig = go.Figure()
fig.add_traces([
    go.Scatter3d(
        x=charge_points[X],
        y=charge_points[Y],
        z=charge_points[Z],
        mode='lines',
        name=f'Group {charge_idx} (vehicle_id: {vid})',
    )
    for (vid, charge_idx), charge_points
    in charging_points_to_plot.groupby(["vehicle_id", "trimmed_in_charge_idx"])
])

# Log-scaled z axis, orthographic camera, fixed canvas size.
fig.update_layout(
    title="Charges time series",
    scene={
        "xaxis_title": X,
        "yaxis_title": Y,
        "zaxis_title": Z,
        "zaxis": {"type": 'log'},
        "camera": {"projection": {"type": 'orthographic'}},
    },
    width=1500,
    height=1000,
    showlegend=True,
)

fig.show()

## Default Energy added fitting
We will try to fit a model to estimate the default (brand-new vehicle) energy added required for a charging point to complete, based on its power, soc and nb_points.  
Then we express the SoH as the ratio between the energy that the charging point actually required and this estimated default energy added.

Let's find the vehicles with the lowest distance traveled to use as a reference point for the default energy added.

In [None]:
# Lowest odometer reading seen for each vehicle, sorted ascending.
# Note: There is only one odo value per vehicle (>_<)
odometer_by_vehicle = charging_points.groupby("vehicle_id", observed=True, as_index=False)["odometer"]
vehicle_od = odometer_by_vehicle.min().sort_values("odometer")
display(vehicle_od)
# vehicle_id is cast to string before being used as the bar chart x axis.
px.bar(vehicle_od.astype({"vehicle_id": "string"}), x="vehicle_id", y="odometer")

This actually seems like a pretty good distribution of odometers for that small of a sample size. :thumbsup:

In [None]:
# Inspect the column dtypes of the charging points table.
charging_points.dtypes

In [None]:
# Model inputs and regression target (the target is kept as a one-column list,
# so `y` and the predictions are 2D column vectors).
features = ["power", "soc", "nb_points", "voltage", "current", "soc_added"]
target = ["energy_added"]

# Only rows where every feature and the target are present can be fitted.
no_nan_charging_points = charging_points.dropna(subset=[*features, *target]).copy()

X = no_nan_charging_points[features].to_numpy()
y = no_nan_charging_points[target].to_numpy()

# Degree of the polynomial expansion.
# NOTE(review): degree 10 over 6 features expands to 8008 polynomial terms;
# confirm this is intended, as it is prone to overfitting.
DEGREE = 10

# Exponential feature transformation, currently not part of the pipeline.
exp_transformer = FunctionTransformer(np.exp, feature_names_out="one-to-one")

# Standardize -> polynomial expansion -> ordinary least squares.
# To try exponential features, insert ('exp_features', exp_transformer)
# right after the scaler.
model = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('polynomial_features', PolynomialFeatures(DEGREE)),
    ('linear_regression', LinearRegression()),
])
model.fit(X, y)

# In-sample predictions and residuals (prediction minus observed value).
no_nan_charging_points["fit_energy_added"] = model.predict(X)
no_nan_charging_points["residual"] = (
    no_nan_charging_points["fit_energy_added"] - no_nan_charging_points["energy_added"]
)
no_nan_charging_points["residual_abs"] = no_nan_charging_points["residual"].abs()
no_nan_charging_points[["residual", "residual_abs"]].describe()

In [None]:
# Long-format table holding both the fitted and the observed energy_added so
# that they can be drawn on the same 3D plot with different symbols.
# Optional extra filters to chain before the melt:
#   .query("vehicle_id == 27")
#   .query("nb_points.between(3, 6)")
melt_id_columns = ["power", "soc", "vehicle_id", "nb_points", "duration", "sec_duration"]
melt_value_columns = ["fit_energy_added", "energy_added"]
plt_df = (
    no_nan_charging_points
    .query("energy_added.between(26e3, 245e3)")
    .eval("sec_duration = duration.dt.total_seconds()")
    .melt(id_vars=melt_id_columns, value_vars=melt_value_columns)
)
# Optional extra keyword arguments: log_z=True, log_y=True
plt_3d_df(
    plt_df.astype({"vehicle_id": "category"}),
    x='power',
    y="soc",
    z="value",
    color="vehicle_id",
    symbol="variable",
    opacity=0.25,
    size=3,
    width=1500,
    height=1000,
)