# SoH estimation experiments on Renault vehicles
Two methods of calculation for the SoH: 
Based on the battery level 
```
soh = charging.battery_energy / (charging.battery_level * model_battery_capacity) 
```
Based on the estimated range 
```
soh = estimated_range / (soc * model_battery_range)
```
The best estimate is probably a combination of the two.

## Imports

In [None]:
import logging
from datetime import datetime as DT
from datetime import timedelta as TD
from dateutil import parser
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import numpy as np
from rich import print
import pandas as pd
from pandas import Series
from pandas import DataFrame as DF
import plotly.express as px
import plotly.graph_objects as go
from scipy.optimize import curve_fit
from core.s3_utils import S3_Bucket
from core.constants import *
from core.time_series_processing import preprocess_date

## Setup

We must ensure that the data points of the time series can be compared together.  
To do this, we will extract the car model of each vehicle from `fleet_info.csv` ("List finale des vin a activer" on the drive).

In [None]:
# Load only the columns needed to identify each vehicle and its variant.
# Alternative input file: pd.read_csv("fleet_info.csv")
fleet_info = pd.read_csv(
    "../ayvens/fleet_info1.csv",
    usecols=["VIN", "Make", "Model", "Type"],
    dtype={"Make": "string"},
)
print(fleet_info.columns)

# Normalise the column name and the make's casing before filtering.
fleet_info = fleet_info.rename(columns={"VIN": "vin"})
fleet_info["Make"] = fleet_info["Make"].str.lower()

# Keep Renault vehicles only; the VIN is both the index and a regular column.
fleet_info = fleet_info.query("Make == 'renault'").set_index("vin", drop=False)

# Cell output: number of vehicles per (Model, Type) pair.
fleet_info[["Model", "Type"]].value_counts()

Then we will use data found online to get the default battery capacity of each model.  
Note: *Here a model is a combination of the `Model` and `Type` fleet_info variables, since cars of the same model with different types can have different battery capacities*.

In [None]:
# Default battery capacity (kWh per the constant's name), keyed by Model and
# then by Type.
KWH_BATTERY_CAPCITY_DICT = {
    "ZOE": {
        "R90 Life (batterijkoop) 5d": 41,
        "R135 Edition One (batterijkoop) 5d": 52,
        "R135 Intens (batterijkoop) 5d": 52,
        "R135": 52,
    },
}

# Every Type that has a known capacity, flattened from the table above so the
# two can never drift apart (same values, same order as the table).
KNOW_MODEL_TYPES = [
    model_type
    for types_of_model in KWH_BATTERY_CAPCITY_DICT.values()
    for model_type in types_of_model
]

Let's remove the VINs for which we don't have a known default battery capacity.

In [None]:
# Keep only the vehicles whose Type is listed in KNOW_MODEL_TYPES.
# NOTE(review): only `Type` is checked here, while the capacity table is keyed
# by `Model` first — a row whose Model is absent from the table would still
# pass this filter. Confirm that cannot happen in the fleet file.
has_known_capcity = fleet_info["Type"].isin(KNOW_MODEL_TYPES)
fleet_info = fleet_info[has_known_capcity]
# Cell output: first 10 remaining vehicles.
fleet_info.head(10)


Let's extract the raw time series of all the cars we have into a multi-indexed df.

In [None]:
bucket = S3_Bucket()


def get_renault_raw_ts(vin: str) -> DF:
    """Load the raw time series of one vehicle, indexed and sorted by date.

    Reads ``raw_ts/renault/time_series/{vin}.parquet`` through the module-level
    ``bucket``. The ``date`` column is used as the index and is also kept as a
    regular column.
    """
    parquet_key = f"raw_ts/renault/time_series/{vin}.parquet"
    raw_ts = bucket.read_parquet_df(parquet_key)
    indexed_by_date = raw_ts.set_index("date", drop=False)
    return indexed_by_date.sort_index()

# Load the raw time series of every vehicle and tag each row with the VIN, the
# Type and the default full-charge energy of the vehicle's model.
raw_tss = {}
failed_vins = []  # VINs whose time series could not be loaded
for vin, vehicle_info in fleet_info.iterrows():
    default_100_soc_energy = KWH_BATTERY_CAPCITY_DICT[vehicle_info["Model"]][vehicle_info["Type"]]
    try:
        vehicle_ts = get_renault_raw_ts(vin)
    except Exception as e:
        # Best effort: one unreadable series must not abort the whole run, but
        # the failure is reported instead of being silently dropped.
        logging.warning("Could not load the raw time series of VIN %s: %r", vin, e)
        failed_vins.append(vin)
        continue
    raw_tss[vin] = (
        vehicle_ts
        .assign(default_100_soc_energy=default_100_soc_energy)
        .assign(vin=vin)
        .assign(type=vehicle_info["Type"])
    )
count = len(failed_vins)

if not raw_tss:
    # pd.concat would fail with an opaque "No objects to concatenate" message.
    raise ValueError(
        f"No time series could be loaded ({count} VIN(s) failed); see the warnings logged above."
    )

# One frame for the whole fleet, with the VIN as the outer index level.
raw_tss = pd.concat(raw_tss, axis="index", keys=raw_tss.keys(), names=["vin"])
print("Le nombre de VIN qui ont eu un problème est de : ", count)
# Cell output: the vehicle types actually present in the loaded data.
raw_tss["type"].unique()


In [None]:
# Count the number of distinct VINs in the raw time series
nombre_vin_uniques = raw_tss['vin'].nunique()

print(f"Le nombre de VIN différents dans tss est : {nombre_vin_uniques}")

**Note**: *There are only R135 models.*

### Time series processing
Let's implement a naive soh estimation pipeline.  

In [None]:
# Naive SoH pipeline: shorten the raw column names, then derive `soc`,
# `expected_battery_energy` and `soh` for every row.
tss:DF = (
    raw_tss
    .rename(columns={"charging.battery_energy": "battery_energy", "diagnostics.odometer": "odometer", "charging.battery_level": "battery_level","charging.estimated_range": "estimated_range"})
    .eval("soc = battery_level * 100")
    .eval("expected_battery_energy = default_100_soc_energy * battery_level")
    # NOTE(review): this is expected energy / measured energy, i.e. the inverse
    # of the ratio given in the introduction
    # (battery_energy / (battery_level * model_battery_capacity)) — confirm
    # which direction is intended. The origin of the 115 factor is not visible
    # in this notebook.
    .eval("soh = 100 * expected_battery_energy / battery_energy / 115") # the division of 115 is to normalize the battery capacity
)
# Cell output: resulting column names.
tss.columns

In [None]:
# tss[tss['vin']=='VF1AG000366046670'].tail(10).head(25)
# columns_of_interest = ['vin', 'soc', 'battery_energy']  # Replace with your desired columns
# value_counts_specific = tss[columns_of_interest].agg('value_counts')
# print(value_counts_specific)

## EDA

## Assumption verification
First, we will verify that the `soc` and `battery_energy` are two "real" variables.  
That is, none of them is calculated from the other.

In [None]:
# Count the number of distinct VINs in the processed time series
nombre_vin_uniques = tss['vin'].nunique()

print(f"Le nombre de VIN différents dans tss est : {nombre_vin_uniques}")


In [None]:
# Battery energy against SoC, one colour per VIN.
px.scatter(tss, x="soc", y="battery_energy", color="vin")


Looking at this scatter plot we can see that:
- The two variables are in fact two real variables instead of one being a synthetic variable calculated from the other.  
- The difference is much more important at high `soc` values.

Let's verify that the `soh` is not correlated with the `soc` or `odometer`.

In [None]:
# SoH against SoC, one colour per VIN.
px.scatter(tss, x="soc", y="soh", color="vin")

In [None]:
# SoH against odometer, one colour per VIN.
px.scatter(tss, x="odometer", y="soh", color="vin")


# Adding filters SOC > 40 & Battery_energy > 0


In [None]:
# Keep only rows with SoC above 40 and a strictly positive battery energy.
# The index is then dropped, so 'vin' only exists as a regular column
# (presumably to keep the groupby below unambiguous).
tss_filtered = (
    tss
    .query("soc > 40")
    .query("battery_energy > 0")
    .reset_index(drop=True)
)

# Per-VIN rolling mean of `soh`. With min_periods=1 and a 1000000-row window
# this is a running mean from the first row for any vehicle that has fewer
# rows than the window.
tss_filtered['soh_mean'] = tss_filtered.groupby('vin')['soh'].transform(lambda x: x.rolling(window=1000000, min_periods=1).mean())
px.scatter(tss_filtered, x="odometer", y="soh_mean", color="vin")
# px.scatter(tss_filtered[tss_filtered['vin']=='VF1AG000366046670'], x="date", y="odometer", color="vin")

In [None]:
# One row per VIN: mean SoH over all filtered rows and last odometer reading,
# ordered by odometer.
soh_mean_by_vin = (
    tss_filtered
    .groupby('vin')
    .agg(mean_soh=('soh', 'mean'), odometer=('odometer', 'last'))
    .reset_index()
    .sort_values('odometer')
)

# Mean SoH against last odometer reading; the VIN is shown on hover.
axis_labels = {'mean_soh': 'Mean SoH', 'odometer': 'Last Odometer Reading'}
fig = px.scatter(
    soh_mean_by_vin,
    x='odometer',
    y='mean_soh',
    hover_data=['vin'],
    title='Mean SoH by VIN (Last Odometer Reading)',
    labels=axis_labels,
)
fig.update_layout(xaxis_title="Last Odometer Reading", yaxis_title="Mean SoH")
fig.show()

# px.scatter(tss_filtered[tss_filtered['vin']=='VF1AG000964802627'], x="odometer", y="soh_mean", color="vin")


In [None]:
# Inspect a single vehicle: smoothed SoH (`soh_mean`) and SoC against the odometer.
filtered_data = tss_filtered[tss_filtered['vin'] == 'VF1AG000X64802717']

# The two series are drawn on separate y-axes.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Primary axis: smoothed SoH
fig.add_trace(
    go.Scatter(x=filtered_data["odometer"], y=filtered_data["soh_mean"], name="SOH Mean"),
    secondary_y=False,
)

# Secondary axis: SoC
fig.add_trace(
    go.Scatter(x=filtered_data["odometer"], y=filtered_data["soc"], name="SoC"),
    secondary_y=True,
)

# Title and axis labels describe what is actually plotted (SoC, not battery
# energy or estimated range).
fig.update_layout(
    title_text="SOH Mean and SoC vs Odometer",
    xaxis_title="Odometer",
)
fig.update_yaxes(title_text="SOH Mean", secondary_y=False)
fig.update_yaxes(title_text="SoC", secondary_y=True)

# Show the plot
fig.show()

# Trying with the estimated range


In [None]:
# Second SoH estimate, derived from the estimated range instead of the battery
# energy.
# NOTE(review): 350 appears to play the role of `model_battery_range` in the
# formula from the introduction (presumably a nominal range in km — confirm);
# it is applied to every vehicle regardless of its Type.
tss_filtered:DF = (
    tss_filtered
    .eval("soh2 = estimated_range/350/soc*100")
)
px.scatter(tss_filtered, x="soc", y="soh2", color="vin")

In [None]:
# Per-VIN rolling mean of `soh2` over up to 1000 rows (min_periods=1, so the
# first rows of each vehicle also get a value).
# NOTE(review): `soh_mean` is smoothed with a 1000000-row window while this
# series uses 1000 — confirm the difference is intended, since both are
# averaged into `soh_tot` later.
tss_filtered['soh_mean2'] = tss_filtered.groupby('vin')['soh2'].transform(lambda x: x.rolling(window=1000, min_periods=1).mean())
px.scatter(tss_filtered, x="odometer", y="soh_mean2", color="vin")

In [None]:
# px.scatter(tss_filtered, x="odometer", y="soh_mean2", color="vin")
# Inspect a single vehicle: smoothed range-based SoH (`soh_mean2`) and SoC over time.
filtered_data = tss_filtered[tss_filtered['vin'] == 'VF1AG000X64802717']

# The two series are drawn on separate y-axes.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Primary axis: smoothed range-based SoH
fig.add_trace(
    go.Scatter(x=filtered_data["date"], y=filtered_data["soh_mean2"], name="SOH Mean"),
    secondary_y=False,
)

# Secondary axis: SoC
fig.add_trace(
    go.Scatter(x=filtered_data["date"], y=filtered_data["soc"], name="SoC"),
    secondary_y=True,
)

# Title and axis labels describe what is actually plotted: the x-axis is the
# date, and the secondary series is SoC.
fig.update_layout(
    title_text="SOH Mean and SoC vs Date",
    xaxis_title="Date",
)
fig.update_yaxes(title_text="SOH Mean", secondary_y=False)
fig.update_yaxes(title_text="SoC", secondary_y=True)

# Show the plot
fig.show()


# Comparing the 2 SoH

In [None]:
# Compare the two smoothed SoH estimates of a single vehicle over time.
filtered_data_2 = tss_filtered[tss_filtered['vin'] == 'VF1AG000366046670']

# Each estimate gets its own y-axis, so the two curves are NOT on the same scale.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Primary axis: energy-based estimate
fig.add_trace(
    go.Scatter(x=filtered_data_2["date"], y=filtered_data_2["soh_mean"], name="soh_mean"),
    secondary_y=False,
)

# Secondary axis: range-based estimate
fig.add_trace(
    go.Scatter(x=filtered_data_2["date"], y=filtered_data_2["soh_mean2"], name="soh_mean2"),
    secondary_y=True,
)

# The x-axis is the date (not the odometer); label each y-axis with its series.
fig.update_layout(
    title_text="Comparing the two SOH decay",
    xaxis_title="Date",
)
fig.update_yaxes(title_text="soh_mean", secondary_y=False)
fig.update_yaxes(title_text="soh_mean2", secondary_y=True)

# Show the plot
fig.show()

Using SoH based on `battery_energy` and SoH2 based on `estimated_range`

In [None]:
# Combined estimate: plain average of the two smoothed SoH series.
tss_filtered['soh_tot'] = (tss_filtered['soh_mean'] + tss_filtered['soh_mean2']) / 2

# Inspect a single vehicle: combined SoH and charging rate against the odometer.
filtered_data_3 = tss_filtered[tss_filtered['vin'] == 'VF1AG000X63197233']

# The two series are drawn on separate y-axes.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Primary axis: combined SoH
fig.add_trace(
    go.Scatter(x=filtered_data_3["odometer"], y=filtered_data_3["soh_tot"], name="soh_tot"),
    secondary_y=False,
)

# Secondary axis: charging rate. The legend entry names the column that is
# actually plotted (it was previously labelled "soc").
fig.add_trace(
    go.Scatter(x=filtered_data_3["odometer"], y=filtered_data_3["charging.charging_rate"], name="charging_rate"),
    secondary_y=True,
)

# Update layout
fig.update_layout(
    title_text="Charging rate and SoH tot",
    xaxis_title="Odometer",
)

# Show the plot
fig.show()

In [None]:
# Combined SoH against odometer, one colour per VIN.
px.scatter(tss_filtered, x="odometer", y="soh_tot", color="vin")

In [None]:
# Count occurrences over the selected columns and print the result.
columns_of_interest = ['vin', 'soc', 'charging.charging_rate']  # Replace with your desired columns
value_counts_specific = tss_filtered[columns_of_interest].agg('value_counts')
print(value_counts_specific)

In [None]:
# One row per VIN: mean combined SoH and last odometer reading.
soh_mean_by_vin = tss_filtered.groupby('vin').agg({
    'soh_tot': 'mean',
    'odometer': 'last'
}).reset_index()

# Rename columns for clarity
soh_mean_by_vin = soh_mean_by_vin.rename(columns={'soh_tot': 'mean_soh_tot'})

# Drop vehicles with a missing or infinite value in any column, so they
# cannot distort the fit below.
soh_mean_by_vin = soh_mean_by_vin.dropna()
soh_mean_by_vin = soh_mean_by_vin[~soh_mean_by_vin.isin([np.inf, -np.inf]).any(axis=1)]

# Sort by odometer
soh_mean_by_vin = soh_mean_by_vin.sort_values('odometer')

# Least-squares line constrained to pass through (0, 1): shift y by the fixed
# intercept, fit a line through the origin, then add the intercept back.
x = soh_mean_by_vin['odometer']
y = soh_mean_by_vin['mean_soh_tot']
y_shifted = y - 1

# Closed-form slope of a line through the origin: sum(x*y) / sum(x^2)
slope = np.sum(x * y_shifted) / np.sum(x**2)

# Fitted line as a callable polynomial: slope * x + 1
p = np.poly1d([slope, 1])  # [slope, intercept]

# Scatter of the per-VIN points, with the VIN shown on hover.
# (The px.scatter figure that used to be built here was overwritten before
# being shown, so it has been removed.)
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=soh_mean_by_vin['odometer'], 
    y=soh_mean_by_vin['mean_soh_tot'],
    mode='markers',
    name='Data points',
    text=soh_mean_by_vin['vin'],
    hovertemplate='<b>VIN:</b> %{text}<br><b>Odometer:</b> %{x}<br><b>Mean SoH:</b> %{y:.2f}'
))

# Trendline evaluated from 0 to the largest odometer reading
x_trend = np.linspace(0, soh_mean_by_vin['odometer'].max(), 100)
fig.add_trace(go.Scatter(
    x=x_trend,
    y=p(x_trend),
    mode='lines',
    name='Trendline',
    line=dict(color='red')
))

# NOTE(review): the y-axis is labelled "SoH (%)" while the fit pins the
# intercept at 1, which suggests a 0-1 scale — confirm the unit.
fig.update_layout(
    title='Average State-of-Health (SoH) vs Mileage',
    xaxis_title="Latest mileage (km)",
    yaxis_title="SoH (%)",
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
)

# Show the plot
fig.show()

# Print the equation of the line
print(f"Equation of the line: y = {slope:.6f}x + 1")

# #### Exponential decay
# Define the exponential decay function
# def exp_decay(x, a, b):
#     return a * np.exp(-b * x) + (1 - a)

# # Group by VIN and calculate mean SoH and get the last odometer reading
# soh_mean_by_vin = tss_filtered.groupby('vin').agg({
#     'soh_tot': 'mean',
#     'odometer': 'last'
# }).reset_index()

# # Rename columns for clarity
# soh_mean_by_vin = soh_mean_by_vin.rename(columns={'soh_tot': 'mean_soh'})

# # Sort by odometer
# soh_mean_by_vin = soh_mean_by_vin.sort_values('odometer')

# # Fit the exponential decay function
# popt, _ = curve_fit(exp_decay, soh_mean_by_vin['odometer'], soh_mean_by_vin['mean_soh'], p0=[0.2, 1e-6])
# a, b = popt

# # Generate points for the trendline
# x_trend = np.linspace(0, soh_mean_by_vin['odometer'].max(), 500)
# y_trend = exp_decay(x_trend, a, b)

# # Create the scatter plot
# fig = px.scatter(soh_mean_by_vin, x='odometer', y='mean_soh', 
#                  hover_data=['vin'],
#                  labels={'odometer': 'Last Odometer Reading', 'mean_soh': 'Mean SoH'},
#                  title='Mean SoH by VIN (Last Odometer Reading) with Exponential Decay Trendline')

# # Add the trendline
# fig.add_scatter(x=x_trend, y=y_trend, mode='lines', name='Trendline (Exponential Decay)',
#                 line=dict(color='red', dash='dash'))

# # Update layout
# fig.update_layout(
#     xaxis_title="Last Odometer Reading",
#     yaxis_title="Mean SoH",
#     legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
#     yaxis_range=[0.75, 1]  # Adjust this range as needed
# )

# # Show the plot
# fig.show()

# # Print the decay parameters
# print(f"Exponential decay parameters: a = {a:.4f}, b = {b:.8f}")

#### Degree-2 polynomial

# px.scatter(tss_filtered[tss_filtered['vin']=='VF1AG000964802627'], x="odometer", y="soh_mean", color="vin")

In [None]:
# Build the export table: one row per VIN with integer mileage and mean SoH.
# `assign` returns a new frame, so the rounded mileage is never written into a
# slice of `soh_mean_by_vin` (which would trigger SettingWithCopyWarning).
selected_data = (
    soh_mean_by_vin[['vin', 'odometer', 'mean_soh_tot']]
    # Round the mileage to the nearest integer
    .assign(odometer=lambda df: df['odometer'].round().astype(int))
    # Rename the columns for the exported file
    .rename(columns={
        'vin': 'VIN',
        'odometer': 'Mileage',
        'mean_soh_tot': 'SoH'
    })
)

# Save to CSV
selected_data.to_csv('Data_renault.csv', index=False)