# SoH estimation experimentation for Renault

Two methods of calculation for the SoH: 
Based on the battery level 
```
soh = charging.battery_energy / (charging.battery_level * model_battery_capacity) 
```
Based on the estimated range 
```
soh = estimated_range / (soc * model_battery_range)
```
The best result is probably a combination of the two.

## Setup

In [None]:
# IPython shell escape: create the local `data_cache` directory (no error if it already exists).
! mkdir -p data_cache

### Imports

In [None]:
import plotly.express as px
import numpy as np
from core.stats_utils import *
from core.pandas_utils import *
from transform.fleet_info.main import fleet_info
from core.s3.s3_utils import S3Service
from core.s3.settings import S3Settings
from core.spark_utils import create_spark_session
from core.stats_utils import *
from core.sql_utils import *
import numpy as np
# Project settings object; its S3_KEY / S3_SECRET attributes feed the Spark session below.
settings = S3Settings()

# Spark session used to read the parquet time series.
spark = create_spark_session(settings.S3_KEY, settings.S3_SECRET)

# S3 helper used for the data extraction.
s3 = S3Service()

# Manufacturer studied in this notebook.
company = "renault"

### Data extraction

In [None]:
# Load the raw time series of `company` and convert it to a pandas DataFrame.
# The path is built from `company` so the manufacturer is defined in one place.
# NOTE(review): toPandas() presumably collects the whole Spark DataFrame in
# local memory — confirm the data set is small enough.
raw_tss = s3.read_parquet_df_spark(
    spark, f"raw_ts/{company}/time_series/raw_ts_spark.parquet"
).toPandas()


In [None]:
# Count the distinct (non-null) VINs present in the raw time series.
distinct_vins = raw_tss["vin"].dropna().unique()
nombre_vin_uniques = len(distinct_vins)

print(f"Le nombre de VIN différents dans tss est : {nombre_vin_uniques}")

In [None]:

from core.sql_utils import *

# Fetch, for every vehicle, its model name, type, autonomy and the net capacity
# of the battery attached to the model. LEFT JOINs keep vehicles that have no
# model or battery row (their model/battery columns are then NULL).
with get_connection() as con:
    cursor = con.cursor()
    cursor.execute("""SELECT vm.model_name, vm.type, vm.autonomy, v.vin, b.net_capacity FROM vehicle v 
                    left JOIN vehicle_model vm
                    ON v.vehicle_model_id = vm.id
                    left JOIN battery b
                    on b.id=vm.battery_id""")
    # Read rows AND column metadata while the connection context is still
    # open; the cursor should not be relied upon once the context has exited.
    rows = cursor.fetchall()
    column_names = [desc[0] for desc in cursor.description]

# One row per vehicle returned by the query, columns named after the SELECT list.
dbeaver_df = pd.DataFrame(rows, columns=column_names)




In [None]:
# Attach the vehicle/model/battery attributes to every time-series row.
# A left join keeps rows whose VIN is absent from the database extract.
raw_tss = pd.merge(raw_tss, dbeaver_df, how="left", on="vin")

In [None]:
def detect_charging_bis(df, level_col="battery_level", ts_col="date", min_consecutive=2):
    """
    Flag charging phases from the variations of the battery level (SoC).

    The frame is sorted by `ts_col` and re-indexed from 0, then walked once.
    The state switches to charging after `min_consecutive` increases that are
    not interrupted by a decrease, and back to not charging after
    `min_consecutive` decreases that are not interrupted by an increase.
    Samples with an unchanged level leave both streak counters untouched.

    Args:
        df: samples of a single vehicle; must contain `level_col` and `ts_col`.
        level_col: name of the battery-level column.
        ts_col: name of the column used for chronological ordering.
        min_consecutive: streak length required to switch state.

    Returns:
        A new, sorted DataFrame with two extra columns: `delta` (level
        difference with the previous sample, 0 where undefined) and
        `charging` (bool). The input frame is not modified.
    """
    ordered = df.sort_values(ts_col).reset_index(drop=True)
    ordered["delta"] = ordered[level_col].diff().fillna(0)

    rises = 0
    drops = 0
    is_charging = False
    flags = []

    for step in ordered["delta"]:
        if step > 0:
            rises, drops = rises + 1, 0
        elif step < 0:
            rises, drops = 0, drops + 1
        # step == 0: flat sample, neither streak is reset

        # The rising streak is tested first, exactly one branch can fire.
        if rises >= min_consecutive:
            is_charging = True
        elif drops >= min_consecutive:
            is_charging = False

        flags.append(is_charging)

    ordered["charging"] = flags
    return ordered
def add_phase_id(df):
    """
    Number the consecutive runs of identical `charging` values.

    A new phase starts on every row whose `charging` value differs from the
    previous row (the first row always starts phase "1"). Rows are compared in
    their current order over the whole frame.

    Args:
        df: DataFrame with a `charging` column.

    Returns:
        A copy of `df` with a `phase_id` column holding the run number as a
        string.
    """
    flags = df["charging"]
    starts_new_phase = flags.ne(flags.shift(1))
    return df.assign(phase_id=starts_new_phase.cumsum().astype(str))

# Detect the charging state vehicle by vehicle, then number the resulting phases.
# Each group comes back sorted by date with an index restarting at 0, so index
# labels repeat across VINs in the concatenated result.
per_vin = raw_tss.groupby("vin", group_keys=False)
raw_tss = add_phase_id(
    per_vin.apply(
        detect_charging_bis,
        level_col="battery_level",
        ts_col="date",
        min_consecutive=2,
    )
)

In [None]:
# Target dtype of every column used by the analysis (each column listed once).
# NOTE(review): casting to plain `bool` turns missing values into True, and
# casting to `str` turns them into literal strings — confirm that `plugged_in`,
# `status`, `battery_charge_type` and `vin` contain no nulls, or switch to the
# nullable "boolean" / "string" dtypes.
types = {
    "vin": str,
    "date": "datetime64[ns]",
    "battery_energy": float,
    "battery_level": float,
    "estimated_range": float,
    "odometer": float,
    "battery_charge_type": str,
    "charging_rate": float,
    "distance_to_complete_charge": float,
    "outside_temperature": float,
    "plugged_in": bool,
    "status": str,
    "net_capacity": float,
}
raw_tss = raw_tss.astype(types)


In [None]:
# Run the project's sanity-check helper on the typed time series.
# NOTE(review): `sanity_check` comes from one of the star imports; what it reports is not visible here.
sanity_check(raw_tss)

## Time series

In [None]:
# VIN with the largest number of samples. idxmax needs no prior sorting.
# Kept for reference only: the analysis below uses the hard-coded VIN.
most_common_vin = raw_tss.groupby("vin").size().idxmax()

# Vehicle inspected in the single-vehicle plots.
vin = "VF1AG000064475468"
# `@vin` lets query read the variable directly instead of splicing it into the expression.
ts = raw_tss.query("vin == @vin")

In [None]:
# Odometer over time for the selected vehicle.
px.scatter(
    ts,
    x="date",
    y="odometer",
    title=f"{vin}",
)

In [None]:
# Pairwise correlation of all numeric columns.
corr = raw_tss.corr(numeric_only=True)

# Keep the single column of interest, strongest positive correlation first.
selected_column = "battery_energy"
selected_corr = corr[selected_column].sort_values(ascending=False).to_frame()

# One-column heat map of the correlations with `selected_column`.
px.imshow(selected_corr, title=f"Correlation Matrix for {selected_column}")


#### Rolling variance


In [None]:
# Rolling variance of battery_energy, computed per vehicle over 3 consecutive rows.
energy_by_vin = raw_tss.groupby("vin")["battery_energy"]
raw_tss["rolling_variance"] = energy_by_vin.transform(
    lambda group: group.rolling(window=3).var()
)

# Rows for which a variance could actually be computed.
var = raw_tss[raw_tss["rolling_variance"].notna()]


In [None]:
# Correlation heat map restricted to the columns of interest.
columns_of_interest = ['battery_level', 'battery_energy', 'estimated_range', 'rolling_variance']

# Pairwise correlation between the selected columns.
correlation_matrix = raw_tss.loc[:, columns_of_interest].corr()

# Render the matrix; both axes carry the variable names.
axis_labels = {"x": "Variables", "y": "Variables", "color": "Correlation"}
fig = px.imshow(
    correlation_matrix,
    labels=axis_labels,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    title="Heatmap de la Corrélation",
)

fig.show()

## First filtering 

In [None]:
# Discard the samples whose odometer reads exactly 0, for the single vehicle
# (`ts`) and for the whole fleet (`tss`). Rows with a missing odometer are kept.
ts = ts[ts["odometer"] != 0]
tss = raw_tss[raw_tss["odometer"] != 0]

## Reducing dependencies to factors


In [None]:
# Battery-level based SoH: measured energy divided by the energy expected at
# that battery level for the model's net capacity.
# `assign` builds new frames instead of writing into the filtered frames
# produced by `query`, which avoids pandas' chained-assignment warnings.
tss = tss.assign(soh=tss["battery_energy"] / tss["battery_level"] / tss["net_capacity"])
ts = ts.assign(soh=ts["battery_energy"] / ts["battery_level"] / ts["net_capacity"])


### Dependency on SoC -> we keep only values above 0.4

In [None]:
# Battery energy against battery level for the whole fleet, one colour per vehicle.
px.scatter(
    tss,
    x="battery_level",
    y="battery_energy",
    color="vin",
)


In [None]:
# SoH against battery level for the whole fleet, coloured by net capacity,
# with a single OLS trendline fitted over all points.
# The title now names the plotted x axis (battery level, not mileage).
fig = px.scatter(
    tss,
    x="battery_level",
    y="soh",
    color="net_capacity",
    height=600,
    title="State-of-Health (SoH) vs Battery Level",
    trendline="ols",
    trendline_scope="overall",
    hover_data=["vin"],
)

fig.show()

### Dependency on discharging -> many more values in discharge; the values in charge are also good, so we keep them


In [None]:
# SoH over time for the selected vehicle, restricted to 0.4 < battery_level < 0.95
# and coloured by the detected charging state.
ts_mid_soc = ts.query("battery_level > 0.4 and battery_level < 0.95")
px.scatter(ts_mid_soc, x="date", y="soh", color="charging")

### Estimated range -> no clear insight from this value. `estimated_range` is a recalculated value, so this is not much of a surprise




In [None]:
# Fleet-wide SoH against the estimated range, restricted to
# 0.4 < battery_level < 0.95 and coloured by the detected charging state.
tss_mid_soc = tss.query("battery_level > 0.4 and battery_level < 0.95")
px.scatter(tss_mid_soc, x="estimated_range", y="soh", color="charging")

### Outside temp

In [None]:
# Per-VIN availability of the outside temperature: number of samples, number of
# missing values and share of missing values (in percent).
# Named aggregation on a boolean "is missing" flag replaces the dict-returning
# lambda + `.apply(pd.Series)` expansion and keeps the counts as integers.
temp_analysis = (
    tss.assign(temperature_is_null=tss["outside_temperature"].isnull())
    .groupby("vin")["temperature_is_null"]
    .agg(total_records="size", null_count="sum", null_percentage="mean")
    # The mean of the boolean flag is the missing fraction; scale it to percent.
    .assign(null_percentage=lambda stats: stats["null_percentage"] * 100)
)

# Show the result, most complete vehicles first.
print("Analyse des données de température par VIN :")
print(temp_analysis.sort_values('null_percentage'))

In [None]:
# Fleet-wide SoH against battery level, restricted to
# 0.4 < battery_level < 0.95 and coloured by the outside temperature.
tss_mid_soc = tss.query("battery_level > 0.4 and battery_level < 0.95")
px.scatter(tss_mid_soc, x="battery_level", y="soh", color="outside_temperature")

##  Final SOH

### Estimation

In [None]:
# Renault SoH: keep the samples with 0.5 < battery_level < 0.97, recompute the
# battery-level based SoH, and keep only the phases holding more than 10 such samples.
in_soc_window = tss.query("battery_level > 0.5 and battery_level < 0.97")
with_soh = in_soc_window.eval("soh = battery_energy / battery_level / net_capacity ")
phase_sizes = with_soh.groupby(["vin", "phase_id"]).transform("size")
tss_filtered: DF = with_soh.assign(charge_size=phase_sizes).query("charge_size > 10")

# VINs having at least 3 distinct phases in the unfiltered data.
# NOTE(review): `valid_vins` is computed but never applied to `tss_filtered` —
# confirm whether this filter is still wanted.
phase_counts = tss.groupby("vin")["phase_id"].nunique()
valid_vins = phase_counts[phase_counts >= 3].index

# Outlier filtering is currently disabled:
# tss_filtered = filter_results_by_lines_bounds(tss_filtered, valid_soh_points)

# One row per vehicle: mean SoH, highest odometer, most frequent type,
# latest date and highest battery energy.
per_vin_aggregations = {
    "soh": "mean",
    "odometer": "max",
    "type": Series.mode,
    "date": "max",
    "battery_energy": "max",
}
soh_per_vehicle = tss_filtered.groupby("vin").agg(per_vin_aggregations).reset_index()

In [None]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Vehicles with a usable SoH; computed once and shared by the scatter and the fits.
plotted = soh_per_vehicle.dropna(subset=['soh'])

# Average SoH of each vehicle against its highest odometer reading.
fig = px.scatter(
    plotted,
    x="odometer",
    y="soh",
    labels={"soh": "SoH", 'vin': 'VIN'},
    color="type",
    title="Average State-of-Health (SoH) vs Mileage",
    hover_data={"vin": True}
)

# Add one linear trendline per vehicle type.
for version, group in plotted.groupby('type'):
    # polyfit cannot handle NaN; sorting makes the line trace run left to right.
    group = group.dropna(subset=["odometer"]).sort_values("odometer")
    # A line needs at least two distinct x values to be determined.
    if group["odometer"].nunique() < 2:
        continue

    x = group["odometer"]
    y = group["soh"]
    coefficients = np.polyfit(x, y, 1)  # Linear fit (degree 1)
    trendline = np.polyval(coefficients, x)

    # All trendlines share the same dashed style; the legend name identifies the type.
    fig.add_trace(
        go.Scatter(
            x=x,
            y=trendline,
            mode='lines',
            name=f'Trendline {version}',
            line=dict(dash='dash'),
        )
    )

fig.show()

In [None]:
# soh_per_vehicle['soh'] = soh_per_vehicle['soh'].round(2)
# soh_per_vehicle[['vin', 'model', 'soh','odometer']].query('model == "vito" | model == "sprinter"').to_csv("soh_per_vehicle.csv", index=False)



In [None]:
# soh_per_vehicle.query("vin == 'W1V44760313886610'")

## Visualization

In [None]:
# SoH samples of one vehicle against its mileage, coloured by battery level.
single_vehicle = tss_filtered.query("vin == 'VF1AG000666731648'")
axis_labels = {
    "odometer": "Kilométrage (km)",
    "soh": "State of Health (%)",
    "battery_level": "State of Charge (%)",
}
fig = px.scatter(
    single_vehicle,
    x="odometer",
    y="soh",
    color="battery_level",
    title="Evolution du battery_energy en fonction du kilométrage",
    labels=axis_labels,
)

# Optional extra styling: these titles are set after the figure is built, so
# they are the ones shown on the axes and on the colour bar.
fig.update_layout(
    xaxis_title="Kilométrage (km)",
    yaxis_title="battery_energy normalized",
    coloraxis_colorbar_title="State of Charge (%)",
)

fig.show()
