# BMW 
The goal of this notebook is to develop a method for estimating the state of health (SoH) of the batteries in the BMW fleet.  

## Setup

### Imports

In [None]:
from core.s3.s3_utils import S3Service
from core.s3.settings import S3Settings
from core.spark_utils import create_spark_session
import plotly.express as px
from core.stats_utils import *
from core.sql_utils import *
import numpy as np
# Project settings object; it exposes the S3 key pair used below.
settings = S3Settings()

# Spark session created with the S3 key and secret.
spark = create_spark_session(settings.S3_KEY, settings.S3_SECRET)

# S3 service used for the parquet reads of this notebook.
s3 = S3Service()


### Data extraction

In [None]:
# Read the raw BMW time series, expose "soc_hv_header" under the short name
# "soc", and bring the result into pandas.
tss = (
    s3.read_parquet_df_spark(spark, "raw_ts/bmw/time_series/raw_ts_spark.parquet")
    .withColumnRenamed("soc_hv_header", "soc")
    .toPandas()
)

# Back-fill missing mileage per vehicle, following chronological order.
mileage_by_vin = tss.sort_values("date").groupby("vin")["mileage"]
tss["mileage"] = mileage_by_vin.transform(lambda mileage: mileage.bfill())

tss = tss.rename(columns={"kombi_remaining_electric_range": "remaining_range"})


In [None]:

from core.sql_utils import *
# Fetch the static vehicle attributes (model, type, autonomy, net battery
# capacity) for every VIN known to the database.
with get_connection() as con:
    cursor = con.cursor()
    cursor.execute("""SELECT vm.model_name, vm.type, vm.autonomy, v.vin, b.net_capacity FROM vehicle v 
                    left JOIN vehicle_model vm
                    ON v.vehicle_model_id = vm.id
                    left JOIN battery b
                    on b.id=vm.battery_id""")
    vehicle_rows = cursor.fetchall()
    # Read the column names while the connection is still in scope: whether
    # the cursor metadata survives leaving the `with` block depends on what
    # get_connection() does on exit (NOTE(review): confirm).
    vehicle_columns = [desc[0] for desc in cursor.description]

# One row per vehicle returned by the query; kept under the historical name
# used by the rest of the notebook.
dbeaver_df = pd.DataFrame(vehicle_rows, columns=vehicle_columns)




In [None]:
# Attach the vehicle attributes to every time-series row.
# A VIN appearing more than once on the right side would silently duplicate
# time-series rows, so keep one attribute row per VIN and let pandas verify
# the many-to-one relationship.
vehicle_attributes = dbeaver_df.drop_duplicates(subset="vin")
tss = tss.merge(vehicle_attributes, on="vin", how="left", validate="many_to_one")

In [None]:
# Explicit dtype for every column used in the analysis, grouped by target type.
float_columns = [
    "avg_electric_range_consumption",
    "charging_ac_ampere",
    "charging_ac_voltage",
    "current_remaining_fuel_range",
    "hv_state_of_health",
    "remaining_range",
    "mileage",
    "remaining_fuel",
    "soc",
    "autonomy",
    "net_capacity",
]
text_columns = ["model", "vin", "model_name"]

column_dtypes = {
    "date": "datetime64[ns]",
    "charging_status": "object",
    **dict.fromkeys(float_columns, "float64"),
    **dict.fromkeys(text_columns, "str"),
}
tss = tss.astype(column_dtypes)

In [None]:
## compute charging
def detect_charging(df, level_col="battery_level", ts_col="date"):
    """Flag charging rows from the direction of battery-level changes.

    The frame is sorted by ``ts_col`` and re-indexed from 0. A row is marked
    as charging from the first rise of ``level_col`` until the next drop;
    rows where the level is unchanged keep the previous state. The state
    before any change is observed is ``False``.

    Args:
        df: Time series of a single vehicle; the input frame is not modified.
        level_col: Column holding the battery level.
        ts_col: Column used to order the rows chronologically.

    Returns:
        A sorted copy of ``df`` with two extra columns: ``delta`` (change of
        the level versus the last known level) and ``charging`` (bool).
    """
    df = df.sort_values(ts_col).reset_index(drop=True)

    # Difference against the last *known* level: a plain diff() returns NaN on
    # both sides of a missing sample, which would hide real level changes in
    # sparsely populated series.
    df["delta"] = df[level_col].ffill().diff().fillna(0)

    # Keep only the non-zero moves and carry the last one forward, so that
    # flat rows inherit the previous state; rows before the first move stay
    # NaN and therefore compare as not charging.
    last_move = df["delta"].where(df["delta"] != 0).ffill()
    df["charging"] = last_move.gt(0)
    return df
def detect_charging_bis(df, level_col="battery_level", ts_col="date", min_consecutive=2):
    """Detect charging phases from battery-level (SoC) variations, with debouncing.

    The state only switches after at least ``min_consecutive`` rises (switch
    to charging) or drops (switch to not charging) that are not interrupted
    by a move in the opposite direction. Flat rows neither count nor reset
    the counters. The initial state is ``False``.

    Args:
        df: Time series of a single vehicle; the input frame is not modified.
        level_col: Column holding the battery level.
        ts_col: Column used to order the rows chronologically.
        min_consecutive: Number of same-direction moves required to switch state.

    Returns:
        A sorted copy of ``df`` (index reset) with two extra columns:
        ``delta`` (change versus the last known level) and ``charging`` (bool).
    """
    df = df.sort_values(ts_col).reset_index(drop=True)

    # Difference against the last *known* level: a plain diff() returns NaN on
    # both sides of a missing sample, which would hide real level changes in
    # sparsely populated series.
    df["delta"] = df[level_col].ffill().diff().fillna(0)

    charging = False
    states = []
    rises, drops = 0, 0

    for delta in df["delta"]:
        if delta > 0:
            rises += 1
            drops = 0
        elif delta < 0:
            drops += 1
            rises = 0
        # delta == 0: stable level, counters are deliberately left untouched.

        if rises >= min_consecutive:
            # Enough rises in a row: charging detected.
            charging = True
        elif drops >= min_consecutive:
            # Enough drops in a row: charging has ended.
            charging = False

        states.append(charging)

    df["charging"] = states
    return df
def add_phase_id(df):
    """Return a copy of ``df`` with a ``phase_id`` column.

    The identifier is a counter, stored as a string, that increases by one on
    every row whose ``charging`` value differs from the previous row; the
    first row always opens phase "1".
    """
    state = df["charging"]
    is_new_phase = state.ne(state.shift(1))
    return df.assign(phase_id=is_new_phase.cumsum().astype(str))

def _charging_phases_for_vehicle(vehicle_ts):
    """Detect charging and number the charging / non-charging phases of one vehicle."""
    with_charging = detect_charging_bis(
        vehicle_ts, level_col="soc", ts_col="date", min_consecutive=2
    )
    return add_phase_id(with_charging)


# Both steps run per vehicle. Numbering the phases on the concatenated frame
# would merge the last phase of one vehicle with the first phase of the next
# whenever their charging state is the same. As a result, `phase_id` is only
# unique within a VIN.
# Each per-vehicle frame comes back indexed from 0, so the index is rebuilt to
# avoid duplicated labels in the combined frame.
tss = (
    tss.groupby("vin", group_keys=False)
    .apply(_charging_phases_for_vehicle)
    .reset_index(drop=True)
)

tss = tss.astype({"date" :"datetime64[ns]",
"avg_electric_range_consumption":"float64",
"charging_ac_ampere":"float64",
"charging_ac_voltage":"float64",
"charging_status":"object",
"current_remaining_fuel_range":"float64",
"hv_state_of_health":"float64",
"remaining_range":"float64",
"mileage":"float64",
"remaining_fuel":"float64",
"soc":"float64",
"model":"str",
"vin":"str",
"model_name":"str",
"autonomy":"float64",
"net_capacity":"float64",
"phase_id":"str"})

## SoH calculation

### First method on the estimated range


In [None]:
# SoH estimate: displayed remaining range divided by (soc * autonomy), times 100.
# NOTE(review): soc is presumably expressed in percent, which would make the
# result a ratio around 1 — confirm the unit.
# Rows where soc * autonomy is not strictly positive are masked to NaN: they
# would otherwise produce +/-inf, which dropna() does not remove and which
# breaks the linear fits further down.
soc_times_autonomy = (tss["soc"] * tss["autonomy"]).where(lambda product: product > 0)
tss["soh"] = tss["remaining_range"] / soc_times_autonomy * 100


In [None]:
# The five vehicles with the most rows in the time series.
tss["vin"].value_counts().iloc[:5]

In [None]:
# Single vehicle used for the per-vehicle plots below.
EXAMPLE_VIN = "WBY11CF000CJ80264"
tss_unique = tss.loc[tss["vin"].eq(EXAMPLE_VIN)]

#### Few graphs before calculation 


In [None]:
# Remaining range against SoC, one colour per vehicle.
px.scatter(data_frame=tss, x="soc", y="remaining_range", color="vin")

In [None]:
# Remaining range against SoC, coloured by the detected charging state.
px.scatter(data_frame=tss, x="soc", y="remaining_range", color="charging")


-> No correlation between the charging state (`charging`) and the estimated remaining range (`remaining_range`).

In [None]:
# SoC over time for the example vehicle, one colour per detected phase.
px.scatter(data_frame=tss_unique, x="date", y="soc", color="phase_id")


### SoH visualisation


#### SoC / SOH 
-> The SoH is correlated with the SoC, so it needs to be corrected.


In [None]:
# SoC against the raw SoH estimate, one colour per vehicle.
px.scatter(data_frame=tss, x="soh", y="soc", color="vin")

In [None]:

import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Keep only the rows where both SoC and SoH are known and finite: np.polyfit
# cannot handle NaN or +/-inf.
tss_clean = tss.dropna(subset=['soc', 'soh'])
tss_clean = tss_clean[np.isfinite(tss_clean[['soc', 'soh']]).all(axis=1)]

# Scatter plot of SoH against SoC, one colour per vehicle.
fig = px.scatter(tss_clean, 
                 x="soc", 
                 y="soh", 
                 color="vin",
                 title='Scatter plot with Trendline')

# Fit a straight line soh = slope * soc + intercept.
x = tss_clean['soc']
y = tss_clean['soh']
coefficients = np.polyfit(x, y, 1)
trendline = np.poly1d(coefficients)

# Extract the slope and intercept
slope, intercept = coefficients

# A straight line only needs its two end points; tracing it through every
# (unsorted) sample adds one vertex per data point for the same picture.
x_bounds = np.array([x.min(), x.max()])
fig.add_trace(go.Scatter(
    x=x_bounds,
    y=trendline(x_bounds),
    mode='lines',
    name='Trendline'
))

# Annotate the right-hand end of the line with its equation. The y position
# is the fitted value at max(x); the maximum of the fitted values sits at the
# other end of the line when the slope is negative.
equation_text = f"y = {slope:.10f}x + {intercept:.2f}"
fig.add_annotation(
    x=x.max(),
    y=trendline(x.max()),
    text=equation_text,
    showarrow=False,
    font=dict(size=12, color="black"),
    xanchor='right'
)

# Show the plot
fig.show()
print(equation_text)

#### In charge / SOH
-> No correlation between the charging state and the SoH.


In [None]:
# SoH over time, coloured by the detected charging state.
px.scatter(data_frame=tss, x="date", y="soh", color="charging")

#### Charging 
-> No correlation between the charging-plug status (`charging_status`) and the SoH.

In [None]:
# SoH over time for the rows detected as charging, coloured by the reported
# charging status.
charging_rows = tss.loc[tss["charging"] == True]
px.scatter(data_frame=charging_rows, x="date", y="soh", color="charging_status")

#### SOH / odometer


In [None]:
# SoH against mileage, one colour per vehicle.
px.scatter(data_frame=tss, x="mileage", y="soh", color="vin")

-> TODO: find a better fill method for the odometer (`mileage`).

#### Heat map on the variance
-> Dependency on the SoH


In [None]:
# Rolling variance of the SoH over 10 consecutive rows, computed per vehicle.
soh_by_vin = tss.groupby("vin")["soh"]
tss["rolling_variance"] = soh_by_vin.transform(
    lambda soh: soh.rolling(window=10).var()
)

# First ten non-charging rows that have a rolling variance.
rows_with_variance = tss.dropna(subset=["rolling_variance"])
var = rows_with_variance.query("charging == False").head(10)


In [None]:
# Show the columns available in the enriched time series.
tss.columns

In [None]:
# Heat map
# Columns included in the correlation analysis.
columns_of_interest = [
    'soc',
    'remaining_range',
    'soh',
    'charging_ac_voltage',
    'charging_ac_ampere',
    'rolling_variance',
]

# Pairwise correlation matrix of the selected columns.
correlation_matrix = tss[columns_of_interest].corr()

# Render the correlation matrix as a heat map.
heatmap_labels = {"x": "Variables", "y": "Variables", "color": "Correlation"}
fig = px.imshow(
    correlation_matrix,
    labels=heatmap_labels,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    title="Heatmap de la Corrélation",
)

# Display the figure.
fig.show()

#### soc + estimated_range


In [None]:
# Remaining range against SoC for the rows not detected as charging.
not_charging_rows = tss.loc[tss["charging"] == False]
px.scatter(data_frame=not_charging_rows, x="soc", y="remaining_range", color="vin")

In [None]:
####

### Improving the calculation 
- Adding filtering on the number of point
- Adding filtering on the outliers  
- Adding filtering on the soc 

In [None]:
# Keep only the vehicles that have enough rows with a known remaining range
# (the SoH estimate is derived from that column).
MIN_POINTS_PER_VIN = 100

non_null_estimated_range = tss.dropna(subset=['remaining_range'])
vin_counts = non_null_estimated_range['vin'].value_counts()
vins_with_enough_points = vin_counts[vin_counts >= MIN_POINTS_PER_VIN].index
# Legacy alias: the historical name says "10" although the threshold is 100.
vins_with_at_least_10_non_null = vins_with_enough_points

filtered_tss = tss[tss['vin'].isin(vins_with_enough_points)]
print(len(vins_with_enough_points))


In [None]:
### Filtering the outliers - TODO: could Mauro's outlier function be used here instead?
# Upper bound above which a SoH estimate is discarded. Rows with a missing
# SoH are dropped too, since NaN never satisfies the comparison.
MAX_PLAUSIBLE_SOH = 1.2

# .copy() makes the result an independent frame: later cells assign columns
# into it, which on a bare slice triggers SettingWithCopyWarning.
filtered_tss = filtered_tss[filtered_tss['soh'] < MAX_PLAUSIBLE_SOH].copy()

In [None]:
### Correcting the SoH for its dependency on the SoC
# Linear relation fitted between soc and SoH in an earlier run, as recorded in
# the original note: y = 0.0010918648x + 0.75
# NOTE(review): the coefficients applied below (slope 0.00127771, offset 0.13)
# do not match that note — confirm which fit they come from.
SOC_CORRECTION_SLOPE = 0.00127771
SOH_CORRECTION_OFFSET = 0.13

# Always start from the uncorrected estimate, so that re-running this cell
# does not apply the correction a second time. `soh_raw` is created on the
# first run and reused afterwards.
soh_uncorrected = (
    filtered_tss["soh_raw"] if "soh_raw" in filtered_tss.columns else filtered_tss["soh"]
)
filtered_tss = filtered_tss.assign(
    soh_raw=soh_uncorrected,
    soh=soh_uncorrected - SOC_CORRECTION_SLOPE*filtered_tss['soc'] + SOH_CORRECTION_OFFSET,
)

#### Impact of the filtering of the soc


In [None]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Keep only the filtered rows where both SoC and SoH are known and finite:
# np.polyfit cannot handle NaN or +/-inf.
tss_clean = filtered_tss.dropna(subset=['soc', 'soh'])
tss_clean = tss_clean[np.isfinite(tss_clean[['soc', 'soh']]).all(axis=1)]

# Scatter plot of the corrected SoH against SoC, one colour per vehicle.
fig = px.scatter(tss_clean, 
                 x="soc", 
                 y="soh", 
                 color="vin",
                 title='Scatter plot with Trendline')

# Fit a straight line soh = slope * soc + intercept.
x = tss_clean['soc']
y = tss_clean['soh']
coefficients = np.polyfit(x, y, 1)
trendline = np.poly1d(coefficients)

# Extract the slope and intercept
slope, intercept = coefficients

# A straight line only needs its two end points; tracing it through every
# (unsorted) sample adds one vertex per data point for the same picture.
x_bounds = np.array([x.min(), x.max()])
fig.add_trace(go.Scatter(
    x=x_bounds,
    y=trendline(x_bounds),
    mode='lines',
    name='Trendline'
))

# Annotate the right-hand end of the line with its equation. The y position
# is the fitted value at max(x); the maximum of the fitted values sits at the
# other end of the line when the slope is negative.
equation_text = f"y = {slope:.8f}x + {intercept:.2f}"
fig.add_annotation(
    x=x.max(),
    y=trendline(x.max()),
    text=equation_text,
    showarrow=False,
    font=dict(size=12, color="black"),
    xanchor='right'
)

# Show the plot
fig.show()
print(equation_text)

### Final SoH 

In [None]:
# Show the columns available in the filtered time series.
filtered_tss.columns

In [None]:
# One row per vehicle: averages of the numeric signals, first value of the
# identifiers, last date seen.
mean_columns = ["soc", "remaining_range", "autonomy", "mileage"]
per_vehicle_aggregations = {
    **dict.fromkeys(mean_columns, "mean"),
    "vin": "first",
    "type": "first",
    "date": "last",
    "soh": "mean",
}
aggregated_tss = filtered_tss.groupby(["vin"]).agg(per_vehicle_aggregations)

# Mean SoH against mean mileage, one point per vehicle.
px.scatter(data_frame=aggregated_tss, x="mileage", y="soh", color="vin")

### Improvement SoH

In [None]:
# Rolling variance of the corrected SoH over 10 consecutive rows, per vehicle.
filtered_soh_by_vin = filtered_tss.groupby("vin")["soh"]
filtered_tss["rolling_variance"] = filtered_soh_by_vin.transform(
    lambda soh: soh.rolling(window=10).var()
)

# First ten non-charging rows that have a rolling variance.
filtered_rows_with_variance = filtered_tss.dropna(subset=["rolling_variance"])
var = filtered_rows_with_variance.query("charging == False").head(10)

In [None]:
# Heat map
# Columns included in the correlation analysis.
columns_of_interest = [
    'soc',
    'remaining_range',
    'soh',
    'charging_ac_voltage',
    'charging_ac_ampere',
    'rolling_variance',
]

# Pairwise correlation matrix of the selected columns, on the filtered data.
correlation_matrix = filtered_tss[columns_of_interest].corr()

# Render the correlation matrix as a heat map.
heatmap_labels = {"x": "Variables", "y": "Variables", "color": "Correlation"}
fig = px.imshow(
    correlation_matrix,
    labels=heatmap_labels,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    title="Heatmap de la Corrélation",
)

# Display the figure.
fig.show()

## Second method on the charging (charging_ac_voltage / charging_ac_ampere)

### Few graphs

In [None]:
# AC charging current over time, one colour per vehicle.
px.scatter(data_frame=tss, x="date", y="charging_ac_ampere", color="vin")

In [None]:
# AC charging voltage over time, one colour per vehicle.
px.scatter(data_frame=tss, x="date", y="charging_ac_voltage", color="vin")