# SoH estimation experiments on Renault vehicles


## Setup

In [None]:
# Create the local cache directory; `-p` makes this a no-op when it already exists.
! mkdir -p data_cache

### Imports

In [None]:
import plotly.express as px
from scipy import stats
import numpy as np
from core.stats_utils import *
from core.pandas_utils import *
from core.config import valid_soh_points
from core.stats_utils import filter_results_by_lines_bounds
from transform.fleet_info.main import fleet_info
from transform.processed_tss.main import ProcessedTimeSeries
# Make analysed in this notebook; it is passed to ProcessedTimeSeries when loading the data.
company = "renault"

### Data extraction

In [None]:
# Load the processed time series for `company`.
# NOTE(review): force_update=True presumably bypasses any cached result and
# recomputes on every run — confirm this is still wanted.
tss = ProcessedTimeSeries(company, force_update=True)

In [None]:
# Inspect which columns are available in the dataset.
tss.columns

In [None]:
# Run the project's sanity_check helper on the full dataset.
# NOTE(review): sanity_check is not defined in this notebook; presumably it comes
# from one of the star-imported core modules — confirm what it reports.
sanity_check(tss)

In [None]:
# Distribution of the `range` values stored in fleet_info for the analysed make,
# most frequent first, with missing values counted as their own bucket.
# `@company` references the notebook variable; a quoted 'company' would compare
# `make` against the literal word "company" instead.
# NOTE(review): assumes fleet_info stores `make` with the same casing as `company` — confirm.
fleet_info.query("make == @company")["range"].value_counts(dropna=False, sort=True, ascending=False)


## Time series

In [None]:
# VIN with the largest number of rows. idxmax already returns the label of the
# biggest group, so sorting beforehand is unnecessary.
most_common_vin = tss.groupby("vin").size().idxmax()
# Printed explicitly: only the last expression of a cell is displayed automatically.
print(f"VIN with the most records: {most_common_vin}")

# Hand-picked vehicle used for the single-vehicle plots of this notebook.
vin = "VF1AG000064475468"
# `@vin` passes the variable to query() without building the expression by string formatting.
ts = tss.query("vin == @vin")

In [None]:
# Odometer readings over time for the selected vehicle; the VIN is used as the title.
px.scatter(
    ts,
    x="date",
    y="odometer",
    title=f"{vin}",
)

In [None]:
# Preview the first rows of the full dataset.
tss.head()

In [None]:
# Rank every numeric column by its correlation with one reference column.
selected_column = "battery_energy"
corr = tss.corr(numeric_only=True)
selected_corr = corr.loc[:, [selected_column]].sort_values(
    by=selected_column,
    ascending=False,
)

# Single-column heat map of those correlations, highest value first.
px.imshow(selected_corr, title=f"Correlation Matrix for {selected_column}")


#### Rolling variance


In [None]:
def _rolling_variance(values):
    """Return the variance of *values* over a sliding window of 3 rows."""
    return values.rolling(window=3).var()


# Per-vehicle rolling variance of battery_energy, computed in the current row order.
# The first two rows of each VIN have no complete window and therefore stay NaN.
tss["rolling_variance"] = tss.groupby("vin")["battery_energy"].transform(_rolling_variance)
# Rows for which a rolling variance could be computed.
var = tss.dropna(subset=["rolling_variance"])


In [None]:
# Heat map of the pairwise correlations between a few columns of interest.
columns_of_interest = ["soc", "battery_energy", "estimated_range", "rolling_variance"]

# Correlation matrix restricted to those columns.
correlation_matrix = tss[columns_of_interest].corr()

# Axis ticks are taken from the matrix itself so they always match its contents.
heatmap_options = dict(
    labels=dict(x="Variables", y="Variables", color="Correlation"),
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    title="Heatmap de la Corrélation",
)
fig = px.imshow(correlation_matrix, **heatmap_options)

# Render the figure.
fig.show()

## First filtering 

In [None]:
# Drop every row whose odometer reads exactly 0, both for the single-vehicle
# frame and for the full dataset.
non_zero_odometer = "odometer != 0"
ts, tss = (frame.query(non_zero_odometer) for frame in (ts, tss))

## Reducing dependencies on factors


In [None]:
# Instantaneous SoH estimate: battery_energy divided by soc, then by capacity.
# NOTE(review): assumes soc is a 0-1 fraction and that battery_energy and capacity
# share the same unit — confirm.
# `assign` returns new frames, which avoids the SettingWithCopyWarning raised when a
# column is set in place on the query() results that `tss` and `ts` hold at this point.
# Rows with soc == 0 produce inf or NaN here.
tss = tss.assign(soh=tss["battery_energy"] / tss["soc"] / tss["capacity"])
ts = ts.assign(soh=ts["battery_energy"] / ts["soc"] / ts["capacity"])


### Dependency on SoC -> we only keep values above 0.4

In [None]:
# SoH against SoC for every reading, coloured by battery capacity, with a single
# OLS trendline fitted over all points to expose any dependency of SoH on SoC.
fig = px.scatter(
    tss,
    x="soc",
    y="soh",
    color="capacity",
    height=600,
    # The title describes what is actually plotted (SoC on x, SoH on y).
    title="State-of-Health (SoH) vs State-of-Charge (SoC)",
    trendline="ols",
    trendline_scope="overall",
    hover_data=["vin"]
)

fig.show()

### Dependency on discharging -> Many more values in discharge; the values in charge are also good, so we keep them


In [None]:
# SoH over time for the selected vehicle, restricted to 0.4 < soc < 0.95 and
# coloured by charging state.
ts_mid_soc = ts.query("soc > 0.4 and soc < 0.95")
px.scatter(ts_mid_soc, x="date", y="soh", color="in_charge")

### Estimated range -> No clear insight from this value. `estimated_range` is a recalculated value, so this is not much of a surprise




In [None]:
# SoH against estimated_range for all vehicles, restricted to 0.4 < soc < 0.95
# and coloured by charging state.
tss_mid_soc = tss.query("soc > 0.4 and soc < 0.95")
px.scatter(tss_mid_soc, x="estimated_range", y="soh", color="in_charge")

### Outside temp

In [None]:
# Per-VIN completeness of the outside_temp signal: number of rows, number of
# missing readings and share of missing readings in percent.
# Named aggregation builds the columns directly, instead of returning a dict from
# the aggregation function and expanding it row by row afterwards.
temp_analysis = (
    tss
    .groupby("vin")["outside_temp"]
    .agg(
        total_records="size",
        null_count=lambda temps: temps.isna().sum(),
    )
    .assign(null_percentage=lambda df: df["null_count"] / df["total_records"] * 100)
)

# Show the results, vehicles with the most complete temperature data first.
print("Analyse des données de température par VIN :")
print(temp_analysis.sort_values('null_percentage'))

In [None]:
# SoH against SoC for all vehicles, restricted to 0.4 < soc < 0.95 and coloured
# by outside temperature.
tss_mid_soc = tss.query("soc > 0.4 and soc < 0.95")
px.scatter(tss_mid_soc, x="soc", y="soh", color="outside_temp")

#### Discharge loss

In [None]:
# Odometer distance lost against SoC lost, with one OLS trendline over all points.
fig = px.scatter(
    tss,
    x="soc_discharge_loss",
    y="odometer_discharge_loss",
    trendline="ols",
    trendline_scope="overall",
    title="Odometer loss vs soc loss",
)

fig.show()

# Plotly Express exposes the fitted model through get_trendline_results, so the
# coefficients are read from the fit itself rather than re-fitted through the
# plotted trendline points (which also depended on the trendline being trace 1).
# With trendline_scope="overall" there is a single fit, stored in the first row.
ols_fit = px.get_trendline_results(fig).px_fit_results.iloc[0]
# The model is y ~ const + x, so params holds the intercept first, then the slope.
intercept, slope = ols_fit.params

# Print the trendline equation
print(f"Trendline equation: odometer_discharge_loss = {slope:.2f} * soc_discharge_loss + {intercept:.2f}")

##  Final SOH

### Estimation

In [None]:
def _first_mode(values):
    """Return the most frequent value of *values*, or None when all are missing.

    Series.mode returns every tied value; keeping only the first one guarantees a
    single scalar per vehicle, so the column can later be used as a grouping key.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# SoH readings kept for the final estimate: 0.5 < soc < 0.97, soh recomputed from
# battery_energy / soc / capacity, and only (vin, in_charge_idx) segments that
# contain more than 10 rows.
tss_filtered:DF = (
    tss
    .query('soc > 0.5')
    .query('soc < 0.97')
    .eval("soh = battery_energy / soc / capacity ")
    .assign(charge_size = lambda df: df.groupby(["vin", "in_charge_idx"]).transform("size"))
    .query("charge_size > 10")
)

# Filter on the number of charges:
# 1. Find the VINs whose maximum in_charge_idx is greater than 3.
valid_vins = (
    tss_filtered.groupby('vin')['in_charge_idx']
    .max()
    .reset_index()
    .query('in_charge_idx > 3')
    ['vin']
)
# 2. Keep only those VINs, so the filter announced above is actually applied.
tss_filtered = tss_filtered[tss_filtered["vin"].isin(valid_vins)]

# Outlier filter, currently disabled:
# tss_filtered = filter_results_by_lines_bounds(tss_filtered, valid_soh_points)

# One row per VIN: mean SoH, highest odometer reading, most frequent version,
# latest date and highest battery_energy.
soh_per_vehicle = (
    tss_filtered
    .groupby("vin")
    .agg({
        "soh": "mean",
        "odometer": "max",
        "version": _first_mode,
        "date": "max",
        "battery_energy": "max",
    })
    .reset_index()
)

In [None]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Average SoH against mileage, one point per vehicle, coloured by version.
fig = px.scatter(
    soh_per_vehicle,
    x="odometer",
    y="soh",
    labels={"soh": "SoH", 'vin': 'VIN'},
    color="version",
    title="Average State-of-Health (SoH) vs Mileage",
    hover_data={"vin": True}
)

# Add a least-squares linear trendline for each version.
for version, group in soh_per_vehicle.groupby('version'):
    # np.polyfit cannot handle missing values, so fit on complete points only;
    # sorting by mileage makes the line trace run left to right.
    points = group[["odometer", "soh"]].dropna().sort_values("odometer")
    # A line is only determined by at least two distinct mileages.
    if points["odometer"].nunique() < 2:
        continue

    coefficients = np.polyfit(points["odometer"], points["soh"], 1)  # degree-1 fit
    fig.add_trace(
        go.Scatter(
            x=points["odometer"],
            y=np.polyval(coefficients, points["odometer"]),
            mode='lines',
            name=f'Trendline {version}',
            line=dict(dash='dash'),  # all trendlines share the same dashed style
        )
    )

fig.show()

In [None]:
# soh_per_vehicle['soh'] = soh_per_vehicle['soh'].round(2)
# soh_per_vehicle[['vin', 'model', 'soh','odometer']].query('model == "vito" | model == "sprinter"').to_csv("soh_per_vehicle.csv", index=False)



In [None]:
# soh_per_vehicle.query("vin == 'W1V44760313886610'")

## Visualization

In [None]:
# SoH against mileage for one hand-picked vehicle, coloured by state of charge.
single_vehicle = tss_filtered.query("vin == 'VF1AG000666731648'")
axis_labels = {
    "odometer": "Kilométrage (km)",
    "soh": "State of Health (%)",
    "soc": "State of Charge (%)",
}
fig = px.scatter(
    single_vehicle,
    x="odometer",  # mileage on the x axis
    y="soh",  # SoH on the y axis
    color="soc",  # colour by SoC
    title="Evolution du battery_energy en fonction du kilométrage",
    labels=axis_labels,
)

# Optional extra customisation: set the axis and colour-bar titles explicitly.
layout_titles = dict(
    xaxis_title="Kilométrage (km)",
    yaxis_title="battery_energy normalized",
    coloraxis_colorbar_title="State of Charge (%)",
)
fig.update_layout(**layout_titles)

fig.show()


The resulting SoH values follow the overall trend, which makes a lot more sense than the previous results.  
We can assume that the default ranges recorded in fleet info are wrong.

## Conclusion

SoH from estimated range seems promising and could be used as our final results for Ayvens.  
We would, however, need to improve the accuracy of the estimator.  