In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler, RobustScaler



In [None]:
from  transform.processed_tss.ProcessedTimeSeries import TeslaProcessedTimeSeries
from core.caching_utils import cache_result 
from core.pandas_utils import series_start_end_diff
from transform.raw_results.config import *
# NOTE(review): `tss` is not referenced again in the visible part of this
# notebook; get_raw_results() builds its own TeslaProcessedTimeSeries.
# Confirm whether this instantiation is still needed.
tss = TeslaProcessedTimeSeries(make="tesla")

# Load data

In [None]:
# Columns requested from the processed time series (order preserved).
USE_COLS = [
    # vehicle id and charge-session markers
    "vin", "trimmed_in_charge_idx", "trimmed_in_charge",
    # per-sample measurements
    "charge_energy_added", "soc", "outside_temp", "capacity", "odometer",
    # vehicle description and timestamps
    "model", "date", "tesla_code",
    # charging conditions
    "battery_heater", "charging_power",
    # vehicle metadata
    "version", "start_date", "battery_chemistry", "activation_status",
]

In [None]:
# Thresholds on the median charging power of a charge, used by
# get_raw_results() to attribute the charge to level 1, 2 or 3.
# NOTE(review): units are presumably kW — confirm against the telemetry source.
LEVEL_1_MAX_POWER = 8   # strictly below: level 1
LEVEL_2_MAX_POWER = 45  # between both thresholds (inclusive): level 2; strictly above: level 3

In [None]:
# get from Mauro
@cache_result("data_cache/tesla_results_4.parquet", "local_storage")
def get_raw_results():
    """Build one row per charging session with an SoH estimate.

    Reads the processed Tesla time series restricted to ``USE_COLS``, keeps the
    rows flagged ``trimmed_in_charge`` and aggregates them per
    ``(vin, trimmed_in_charge_idx)``. For each charge it derives:

    * ``energy_added``: last minus minimum ``charge_energy_added``;
    * ``soc_diff``: last minus minimum ``soc``;
    * ``soh``: ``energy_added / (soc_diff / 100 * capacity)``;
    * ``level_1`` / ``level_2`` / ``level_3``: ``soc_diff / 100`` attributed to
      the level selected by the median ``charging_power``, 0 for the others.

    Only charges with ``soc_diff > 40`` and ``soh`` between 0.75 and 1.05 are
    kept; the result is sorted by ``tesla_code``, ``vin`` and ``date``.

    NOTE(review): the ``cache_result`` decorator presumably persists the result
    at the given parquet path and accepts the ``force_update`` keyword passed by
    the caller — confirm in ``core.caching_utils``.
    """
    return (
        TeslaProcessedTimeSeries("tesla", use_cols=USE_COLS)
        .query("trimmed_in_charge")                 # Keep only the time-series rows flagged as in charge.
        .groupby(["vin", "trimmed_in_charge_idx"])  # One group per vehicle and charge index.
        .agg(
            energy_added_min=pd.NamedAgg("charge_energy_added", "min"),
            energy_added_end=pd.NamedAgg("charge_energy_added", "last"),
            soc_end=pd.NamedAgg("soc", "last"),
            soc_min=pd.NamedAgg("soc", "min"),
            soc_diff=pd.NamedAgg("soc", series_start_end_diff),  # NOTE(review): overwritten by the soc_diff eval below.
            outside_temp=pd.NamedAgg("outside_temp", "mean"),
            capacity=pd.NamedAgg("capacity", "first"),
            odometer=pd.NamedAgg("odometer", "first"),
            version=pd.NamedAgg("version", "first"),
            size=pd.NamedAgg("soc", "size"),  # number of time-series rows in the charge
            model=pd.NamedAgg("model", "first"),
            date=pd.NamedAgg("date", "first"),
            charging_power=pd.NamedAgg("charging_power", "median"),
            tesla_code=pd.NamedAgg("tesla_code", "first"),
            start_date=pd.NamedAgg("start_date", "first"),
            battery_chemistry=pd.NamedAgg("battery_chemistry", "first"),
            activation_status=pd.NamedAgg("activation_status", "first"),
        )
        .reset_index(drop=False)
        # Energy and SoC gained over the charge, measured from the minimum to the last sample.
        .eval("energy_added = energy_added_end - energy_added_min")
        .eval("soc_diff = soc_end - soc_min")
        # SoH = energy actually added / energy expected for that SoC gain at nominal capacity.
        .eval("soh = energy_added / (soc_diff / 100.0 * capacity)")
        # Attribute the SoC gain (as a fraction) to exactly one charging level; the boolean acts as 0/1.
        .eval("level_1 = soc_diff * (charging_power < @LEVEL_1_MAX_POWER) / 100")
        .eval("level_2 = soc_diff * (charging_power.between(@LEVEL_1_MAX_POWER, @LEVEL_2_MAX_POWER)) / 100")
        .eval("level_3 = soc_diff * (charging_power > @LEVEL_2_MAX_POWER) / 100")
        # Keep only charges with a large enough SoC gain and an SoH inside the accepted band.
        .query("soc_diff > 40 & soh.between(0.75, 1.05)")
        # Disabled experiment: per-model SoH correction for tesla_code 'MTY13'.
        #.eval("bottom_soh = soh.between(0.75, 0.9)")
        #.eval("fixed_soh_min_end = soh.mask(tesla_code == 'MTY13', soh / 0.96)")
        #.eval("fixed_soh_min_end = fixed_soh_min_end.mask(bottom_soh & tesla_code == 'MTY13', fixed_soh_min_end + 0.08)")
        .sort_values(["tesla_code", "vin", "date"])
    )

In [None]:
# force_update=True is forwarded to the cache_result wrapper.
# NOTE(review): presumably this recomputes the aggregation instead of reading
# the cached parquet — confirm, and consider turning it off for routine re-runs.
results = get_raw_results(force_update=True)

In [None]:
# Per-charge SoH estimates.
results["soh"]

In [None]:
# Number of distinct vehicles with at least one retained charge.
results["vin"].nunique()

## Tesla battery label & chemistry

In [None]:
# build life battery in month
def compute_life_battery(start_date, last_date):
    """Return the whole number of months in ``start_date - last_date``.

    The difference is ``relativedelta(start_date, last_date)``, so the result
    is positive when ``start_date`` is the later of the two dates. Despite the
    parameter names, the caller in this notebook passes the most recent charge
    date first and the vehicle start date second.

    Args:
        start_date: Date the difference is measured to (the minuend).
        last_date: Date the difference is measured from (the subtrahend).

    Returns:
        ``years * 12 + months`` of the relativedelta, or 0 when the delta
        cannot be computed (for example with non-date inputs).
    """
    try:
        # Build the delta once instead of once per component.
        delta = relativedelta(start_date, last_date)
        return delta.years * 12 + delta.months
    except Exception:
        # Best effort: unusable inputs count as a zero-month battery life.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long row-wise run can still be interrupted.
        return 0
    
# Battery life in months = latest charge date of the vehicle - its start date.
# transform('max') broadcasts the latest date to every row of the vehicle, which
# keeps this cell idempotent: merging an aggregate back created
# life_battery_x / life_battery_y (and then a KeyError) when the cell was re-run.
results = results.reset_index(drop=True)  # same RangeIndex the former merge produced
last_charge_date = results.groupby('vin', observed=True)['date'].transform('max')
results['life_battery'] = [
    compute_life_battery(latest, start)
    for latest, start in zip(last_charge_date, results['start_date'])
]

In [None]:
# Bucket each charge by vehicle mileage (left-closed intervals):
# below 50k, [50k, 80k), [80k, 120k) and 120k or more.
# pd.cut is vectorised and, unlike the former chained conditional, leaves a
# missing odometer as NaN instead of silently labelling it "120k >".
results['odometer_interval'] = pd.cut(
    results['odometer'],
    bins=[-np.inf, 50_000, 80_000, 120_000, np.inf],
    labels=["< 50k", "50k-80k", "80-120", "120k >"],
    right=False,
).astype(object)  # plain strings, so later groupbys only see observed labels

In [None]:
# One row per vehicle: chemistry, SoH summary, mileage, battery life and
# mean outside temperature over its charges.
vin_soh = (
    results
    .groupby('vin', as_index=False, observed=True)
    .agg(
        chemistry=pd.NamedAgg(column='battery_chemistry', aggfunc='first'),
        soh_mean=pd.NamedAgg(column='soh', aggfunc='mean'),
        soh_median=pd.NamedAgg(column='soh', aggfunc='median'),
        odometer_interval=pd.NamedAgg(column='odometer_interval', aggfunc='last'),
        odometer=pd.NamedAgg(column='odometer', aggfunc='max'),
        life_battery=pd.NamedAgg(column='life_battery', aggfunc='max'),
        tesla_code=pd.NamedAgg(column='tesla_code', aggfunc='first'),
        outside_temp=pd.NamedAgg(column='outside_temp', aggfunc='mean'),
    )
)

# EDA

## Comparing the impact of odometer and battery life on SoH

In [None]:
# Fleet-level averages of the per-vehicle figures.
vin_soh.loc[:, ['odometer', 'soh_median', 'life_battery']].mean()

In [None]:
# Vehicles whose tesla_code is MT336.
vin_soh.loc[vin_soh['tesla_code'].eq('MT336')]

In [None]:
# Correlation of the median SoH with mileage and battery life (single-column heatmap).
px.imshow(
    vin_soh[['soh_median', 'odometer', 'life_battery']].corr()[['soh_median']],
    text_auto=True,
    color_continuous_scale='viridis',
)

In [None]:
# Median SoH against battery life, restricted to vehicles with a positive battery life.
px.scatter(
    vin_soh.loc[
        vin_soh['life_battery'].gt(0),
        ['odometer', 'soh_median', 'life_battery', 'chemistry'],
    ],
    x='life_battery',
    y='soh_median',
    title="Impact of battery life on SoH",
    trendline='ols',
)

In [None]:
# Median SoH against mileage, restricted to vehicles with a positive battery life.
px.scatter(
    vin_soh.loc[
        vin_soh['life_battery'].gt(0),
        ['odometer', 'soh_median', 'life_battery', 'chemistry'],
    ],
    x='odometer',
    y='soh_median',
    title="Impact of mileage on SoH",
    trendline='ols',
)

## Chemistry impact 

The chemistry impacts the SoH, but we want to know **_how much_**.

Chemistry repartition by vin

In [None]:
# Number of vehicles per chemistry.
vin_soh.value_counts(subset='chemistry')

In [None]:
# Share of vehicles per chemistry.
vin_soh.value_counts(subset='chemistry', normalize=True)

In [None]:
# Vehicles per chemistry, and per (chemistry, mileage bucket).
chemistry_rep_v = (
    vin_soh.value_counts(['chemistry'])
    .reset_index()
    .rename(columns={'count': 'total_vehicles_chemistry'})
)
chemistry_odo_rep_v = (
    vin_soh.value_counts(['chemistry', 'odometer_interval'])
    .reset_index()
    .rename(columns={'count': 'total_vehicles_chemistry_odometer'})
)

# Share of each mileage bucket inside its chemistry.
rep = chemistry_rep_v.merge(chemistry_odo_rep_v, on='chemistry')
rep['proportion_chemistry'] = (
    rep['total_vehicles_chemistry_odometer']
    .div(rep['total_vehicles_chemistry'])
    .round(2)
)
rep

In [None]:
# Median SoH against mileage, coloured by chemistry.
px.scatter(
    vin_soh,
    x='odometer',
    y='soh_median',
    color="chemistry",
    hover_data="tesla_code",
)

In [None]:
# Average of the per-vehicle median SoH for each mileage bucket and chemistry.
(
    vin_soh
    .groupby(["odometer_interval", "chemistry"])[['soh_median']]
    .mean()
    .sort_index()
)

In [None]:
# Same table without the MT336 vehicles: average of the per-vehicle mean and
# median SoH for each mileage bucket and chemistry.
(
    vin_soh[~vin_soh['tesla_code'].isin(['MT336'])]
    .groupby(['odometer_interval', 'chemistry'])[['soh_mean', 'soh_median']]
    .mean()
    .sort_index()
)

**General**   
SOH decreases with higher mileage across all chemistries (LFP, NCA, NMC).  
LFP batteries degrade faster compared to NCA and NMC.  
NMC generally has the highest SOH retention across odometer intervals.  

**<50k km:**    
Highest SOH for all chemistries, with NMC (~0.989) and NCA (~0.987) having the best retention.  
LFP shows slightly lower SOH (~0.975).

**50k-80k km:**  
Moderate degradation.  
NMC (~0.958-0.964) still leads, followed by NCA (~0.943-0.960), while LFP drops further (~0.924-0.891).

**80k-120k km:**  
Noticeable SOH decline, especially for LFP (~0.895-0.864).  
NCA (~0.923-0.968) still holds up better than LFP.  
NMC (~0.939-0.944) continues to perform well.  

**120k+ km:**  
Highest degradation.    
LFP SOH drops significantly (~0.881-0.837).   
NCA maintains a better SOH (~0.906-0.928), showing better long-term durability.  
NMC retains the best SOH (~0.928-0.933).  
 

**Conclusion:**  
NMC is the best performer in SOH retention across mileage.   
NCA is slightly behind NMC but still holds well.  
LFP degrades the fastest, especially beyond 80k km.   
The only drawback here is that we don't have the same number of vin for NMC(759), NCA (1678) and LFP (2998).  
Also we only have information on tesla model 3.

### Study of MT336

In [None]:
# SoH against mileage for LFP vehicles, coloured by tesla_code.
# vin_soh already holds one row per vehicle (max odometer, median SoH, first
# tesla_code), so it is plotted directly. The former re-aggregation asked for a
# 'soh' column that vin_soh does not have, which raises a KeyError.
px.scatter(
    vin_soh[vin_soh['chemistry'] == 'LFP'],
    x='odometer',
    y='soh_median',
    color='tesla_code',
    labels={'soh_median': 'soh'},  # keep the axis label the original cell intended
    title="SoH VS odometer for LFP chemistry",
)

In [None]:
# Per-vehicle summary of the MT336 charges (indexed by vin); vehicles with any
# missing aggregate are dropped.
mt336_df = results[(results['tesla_code']=="MT336")].groupby('vin', observed=True).agg(
    nbr_charges=('trimmed_in_charge_idx', 'count'),
    energy_added_min = ("energy_added_min", "min"),
    energy_added_mean_end = ('energy_added_end', 'mean'),
    soc_mean_end =('soc_end', 'mean'),
    soc_mean_min = ('soc_min', 'mean'),
    soc_diff = ("soc_diff", 'mean'),
    outside_temp_mean = ('outside_temp', 'mean'),
    capacity = ('capacity', 'max'),
    odometer = ('odometer', 'max'),
    start_date = ('start_date', 'min'),
    energy_added = ("energy_added", "mean"),
    soh_mean = ('soh', 'mean'),
    # Fixed: this was aggregated with 'mean', making it a copy of soh_mean.
    soh_median = ('soh', 'median'),
    soh_min = ('soh', 'min'),
    soh_max = ('soh', 'max'),
    # Total SoC fraction charged at each level.
    level_1_total = ('level_1', 'sum'),
    level_2_total = ('level_2', 'sum'),
    level_3_total = ('level_3', 'sum'),
    life_battery = ("life_battery", "max"),
    odometer_interval = ("odometer_interval", 'last')
).dropna()

In [None]:
# Correlation matrix of the numeric per-vehicle aggregates for MT336.
px.imshow(
    mt336_df.drop(columns=["odometer_interval", "start_date"]).corr(),
    text_auto=True,
    height=1000,
    width=1000,
)

In [None]:
# Store the start date as text.
mt336_df = mt336_df.assign(start_date=mt336_df["start_date"].astype('str'))

In [None]:
# List the aggregate columns available for the clustering step.
mt336_df.columns

### find cluster 

In [None]:

# Scale every numeric per-vehicle aggregate with RobustScaler.
sc = RobustScaler()
# The two non-numeric columns are dropped once and the result reused.
mt336_numeric = mt336_df.drop(columns=['start_date', 'odometer_interval'])
mt336_df_sc = pd.DataFrame(
    sc.fit_transform(mt336_numeric),
    columns=mt336_numeric.columns,
    # Keep the vin index so scaled rows (and later cluster labels) can be
    # traced back to vehicles; it was previously replaced by a RangeIndex.
    index=mt336_numeric.index,
)


In [None]:
# Discard any row that still contains a missing value after scaling.
mt336_df_sc = mt336_df_sc.dropna()

In [None]:
# Features used to fit the cluster centroids.
# The SoH columns are left out on purpose: SoH is the pseudo-target.
columns_to_keep = [
    'nbr_charges',
    'energy_added_min', 'energy_added_mean_end',
    'soc_mean_end', 'soc_mean_min', 'soc_diff',
    'outside_temp_mean',
    'capacity', 'odometer', 'energy_added',
    'level_1_total', 'level_2_total', 'level_3_total',
    'life_battery',
]

In [None]:
# Elbow method: fit KMeans for increasing k and plot the inertia.
inertias = []
# KMeans requires n_clusters <= n_samples, so cap k at the number of vehicles.
ks = range(1, min(20, len(mt336_df_sc) + 1))

for k in ks:
    # Fixed seed: without it the centroids (and the curve) change on every run.
    km_test = KMeans(n_clusters=k, random_state=42).fit(mt336_df_sc[columns_to_keep])
    inertias.append(km_test.inertia_)

plt.plot(ks, inertias)
plt.ylabel('inertia')
plt.xlabel('k cluster number')

In [None]:
# Fit the final 2-cluster model on the scaled features.
# Fixed seed so the cluster labels are reproducible across kernel restarts.
cluster_model = KMeans(n_clusters=2, random_state=42)
cluster_model.fit(mt336_df_sc[columns_to_keep])

In [None]:
# Attach the cluster id as text so plotly treats it as a discrete colour.
mt336_df_sc['labels'] = cluster_model.labels_.astype(str)

In [None]:
# # Gaussian mixture test
# from sklearn.mixture import GaussianMixture
# gm_model = GaussianMixture(n_components=2,).fit(mt336_df_sc[columns_to_keep])
# mt336_df['labels'] = gm_model.predict(mt336_df_sc[columns_to_keep])
# mt336_df['labels'] = mt336_df['labels'].astype(str)

In [None]:
# Scaled median SoH against scaled mileage, coloured by cluster.
px.scatter(
    mt336_df_sc,
    x='odometer',
    y='soh_median',
    color='labels',
)

Can't see anything; try with a different number of clusters.

Arbitrarily cut the data in two at an SoH of 0.87.

In [None]:
# MT336 vehicles with a mean SoH below 0.87.
px.scatter(
    mt336_df.loc[mt336_df['soh_mean'].lt(.87)],
    x='odometer',
    y='soh_mean',
    trendline='ols',
    size='nbr_charges',
)

In [None]:
# MT336 vehicles with a mean SoH above 0.87.
px.scatter(
    mt336_df.loc[mt336_df['soh_mean'].gt(.87)],
    x='odometer',
    y='soh_mean',
    trendline='ols',
    size='nbr_charges',
    title='Impact of odometer on SoH for MT336 with more than .87 of SoH',
)

## Temperature impact analysis

In [None]:
# Distribution of the mean charging temperature per chemistry.
(
    vin_soh
    .groupby('chemistry')['outside_temp']
    .describe()
)

No significant temperature difference between each chemistry during charging

I decided not to take the chemistry into account for the following study.








In [None]:
# Per-vehicle temperature summary (indexed by vin).
outside_temp = results.groupby('vin', observed=True).agg({
    "outside_temp": 'mean',
    "soh": "median",
    # "last" matches the other per-vehicle aggregations of this notebook.
    # "max" compared the labels as strings, so "< 50k" won over every other
    # bucket whenever a vehicle had at least one charge below 50k.
    "odometer_interval": "last",
    "life_battery": "max",
    'battery_chemistry': "last"
})

In [None]:
# Median SoH against mean charging temperature, for vehicles whose mean
# temperature is above 0.
px.scatter(
    outside_temp.loc[outside_temp['outside_temp'].gt(0)],
    x='outside_temp',
    y='soh',
    color='odometer_interval',
    title="soh distribution over outside temperature during charging",
)

We can't see an impact of the temperature on the SoH with that graph.  

In [None]:
# Temperature rounded to 0.1 and to the nearest integer, for binned plots.
outside_temp = outside_temp.assign(
    outside_temp_round_1=outside_temp['outside_temp'].round(1),
    outside_temp_round=outside_temp['outside_temp'].round(),
)


In [None]:
# Median SoH per 1-degree temperature bin and mileage bucket.
# Fixed: the grouping used outside_temp_round_1 (the 0.1 rounding), which
# contradicted the "step 1°C" title; the title also said "Mean" although the
# SoH is aggregated with a median.
px.scatter(
    outside_temp
    .groupby(['outside_temp_round', 'odometer_interval'], as_index=False)
    .agg({'outside_temp': 'mean', 'soh': 'median'}),
    x='outside_temp', y='soh', color='odometer_interval',
    title='Median SoH over Temperature (step 1°C)',
)


In [None]:
# SoH against temperature with an OLS trend line per chemistry.
# 'vin' is one of the grouping keys, so every group holds a single vehicle.
px.scatter(
    outside_temp
    .groupby(
        ['outside_temp_round_1', 'odometer_interval', 'vin', 'battery_chemistry'],
        as_index=False,
        observed=True,
    )
    .mean(),
    x='outside_temp',
    y='soh',
    color='battery_chemistry',
    title='Mean SoH over Temperature (step 0.1°C)',
    trendline='ols',
)


Can't see any differences in SOH between a battery charged at high, medium, or low outside temperatures.

In [None]:
# Same scatter, coloured by battery life.
px.scatter(
    outside_temp,
    x='outside_temp',
    y='soh',
    color='life_battery',
)


There is no difference in SoH between a battery charged at high, medium, or low outside temperatures.

There is a spread effect close to 0°C and for temperatures above 10°C. (More data might reveal something.)

**Conclusion**

I didn’t find any direct impact of temperature on SoH during charging.

There is a spread when the temperature is below 2°C and above 10°C, but this could simply be due to the data distribution.

Additionally, we cannot compare lower temperatures to higher ones, as all cars in the dataset are based in Europe from November to March, meaning there is no available data for comparison.

Need more data over more times.

## Charge types

The goal is to see whether charging at level 1/2/3 has an impact on the SoH.   
We need to compare cars with a similar number of charges, mileage, battery life and chemistry.


In [None]:
# 1/0 flag per charge: did it contribute to the given charging level?
# Vectorised comparison instead of a per-row Python lambda; the values are
# unchanged (a NaN level still maps to 0).
results['is_level_1'] = results['level_1'].gt(0).astype(int)
results['is_level_2'] = results['level_2'].gt(0).astype(int)
results['is_level_3'] = results['level_3'].gt(0).astype(int)

In [None]:
# Per-vehicle charging profile: SoC fraction and number of charges per level,
# total number of charges, mileage and SoH summary.
charges_vin = (
    results
    .groupby(['vin'], as_index=False, observed=True)
    .agg(
        total_level_1=pd.NamedAgg(column="level_1", aggfunc="sum"),
        total_level_2=pd.NamedAgg(column="level_2", aggfunc="sum"),
        total_level_3=pd.NamedAgg(column="level_3", aggfunc="sum"),
        nbr_charge_level_1=pd.NamedAgg(column="is_level_1", aggfunc='sum'),
        nbr_charge_level_2=pd.NamedAgg(column="is_level_2", aggfunc='sum'),
        nbr_charge_level_3=pd.NamedAgg(column="is_level_3", aggfunc='sum'),
        nbr_charge=pd.NamedAgg(column='trimmed_in_charge_idx', aggfunc="count"),
        odometer=pd.NamedAgg(column="odometer", aggfunc="max"),
        odometer_interval=pd.NamedAgg(column="odometer_interval", aggfunc="last"),
        soh_mean=pd.NamedAgg(column="soh", aggfunc="mean"),
        soh_median=pd.NamedAgg(column="soh", aggfunc="median"),
        soh_min=pd.NamedAgg(column="soh", aggfunc="min"),
        soh_max=pd.NamedAgg(column="soh", aggfunc="max"),
    )
    .copy()
)

In [None]:
# How many vehicles have a given number of charges.
charges_vin.value_counts('nbr_charge').plot.bar()

In [None]:

# Number of level 1/2/3 charges, summed over the vehicles sharing the same
# total number of charges.
(
    charges_vin
    .groupby("nbr_charge")[["nbr_charge_level_1", "nbr_charge_level_2", "nbr_charge_level_3"]]
    .sum()
    .plot(kind='bar', title='distribution du nombre de type de charge')
)

In [None]:
# Average number of charges per vehicle (about 4.6).
charges_vin["nbr_charge"].mean()

**With the chemistry:**

The **_mean charging number_** by vin is **4.4**. 
*More than half* of the vin has **_4 charging or less_**. 50 vin have more than 15 charging (less than 1%). 

**If we don't take care of the chemistry:**

The **_mean charging number_** by vin is **4.6**. 
*58%* have **4 or less** charging. 216 vin have 14 or more charging (2.3%).


Based on that, the following findings should be treated cautiously; they could be avenues for future work.

In [None]:
# Vehicles with some level 3 charging and more than 4 charges.
px.scatter(
    charges_vin.query("total_level_3 > 0 and nbr_charge > 4"),
    x='total_level_3',
    y="soh_mean",
    color='odometer_interval',
    size='nbr_charge',
    trendline="ols",
    hover_data='soh_max',
    title="Impact of level 3 charging on SoH",
)

In [None]:
# Vehicles with some level 2 charging and more than 4 charges.
px.scatter(
    charges_vin.query("total_level_2 > 0 and nbr_charge > 4"),
    x='total_level_2',
    y="soh_mean",
    color='odometer_interval',
    size='nbr_charge',
    trendline="ols",
    title="Impact of level 2 charging on SoH",
)


In [None]:
# Vehicles with some level 1 charging and more than 4 charges.
px.scatter(
    charges_vin.query("total_level_1 > 0 and nbr_charge > 4"),
    x='total_level_1',
    y="soh_mean",
    color='odometer_interval',
    size='nbr_charge',
    trendline="ols",
    title="Impact of level 1 charging on SoH",
)

## distribution de la distance parcouru sur la période 

In [None]:
# Distance covered by each vehicle between its lowest and highest odometer reading.
distance_ = (
    results
    .groupby('vin', as_index=False, observed=True)
    .agg(
        max_odo=pd.NamedAgg(column='odometer', aggfunc="max"),
        min_odo=pd.NamedAgg(column='odometer', aggfunc="min"),
    )
    .assign(distance=lambda per_vin: per_vin['max_odo'] - per_vin['min_odo'])
)

In [None]:
# Histogram of the distance covered per vehicle, in bins of 400.
# The last edge now reaches past the maximum: range(0, int(max), 400) stopped
# short of it, so the farthest vehicles were dropped from the plot, and it
# produced a single edge (an error) when the maximum was below 400.
DISTANCE_BIN_WIDTH = 400
distance_upper = max(distance_.distance.max(), DISTANCE_BIN_WIDTH)
distance_bin_edges = np.arange(0, distance_upper + DISTANCE_BIN_WIDTH, DISTANCE_BIN_WIDTH)
plt.hist(distance_.distance, bins=distance_bin_edges);

## vin with more than 5 charges

In [None]:
# Vehicles with strictly more than 5 retained charges.
charges_per_vin = results.groupby('vin', observed=True, as_index=False)[['trimmed_in_charge_idx']].count()
vin_with_more_than_5_charges = charges_per_vin.loc[
    charges_per_vin['trimmed_in_charge_idx'] > 5, 'vin'
].unique()

In [None]:
# Number of vehicles with more than 5 charges.
len(vin_with_more_than_5_charges)

In [None]:
# Charges belonging to the vehicles with more than 5 charges.
df_more_5_charges = results[results['vin'].isin(vin_with_more_than_5_charges)]

In [None]:
# Correlation of the per-vehicle mean SoH with mileage and battery life,
# restricted to vehicles with more than 5 charges and a positive battery life.
px.imshow(
    df_more_5_charges[df_more_5_charges['life_battery'] > 0]
    .groupby("vin", observed=True)[['soh', 'odometer', 'life_battery']]
    .mean()
    .corr()[['soh']],
    text_auto=True,
    color_continuous_scale='viridis',
)

# mean discharging impact

In [None]:
# Display the per-charge table before adding the SoC flags.
# NOTE(review): this renders the whole frame; results.head() may be enough.
results

In [None]:
# 1/0 flags on the minimum SoC of each charge: below 20, 20 to 70 (inclusive),
# above 70. Vectorised comparisons instead of per-row Python lambdas; the values
# are unchanged (a NaN soc_min still maps to 0 in all three columns).
results["low_soc"] = results["soc_min"].lt(20).astype(int)
results["mid_soc"] = results["soc_min"].between(20, 70).astype(int)
results["hight_soc"] = results["soc_min"].gt(70).astype(int)

In [None]:
# Check that the three SoC flag columns were added.
results.columns

In [None]:
# Per-vehicle discharge profile (indexed by vin): how often a charge started
# from a low / mid / high SoC, plus SoH, mileage, chemistry and dates.
discharge_df = results.groupby('vin', observed=True).agg(
    low_soc=("low_soc", "sum"),
    mid_soc=("mid_soc", "sum"),
    hight_soc=("hight_soc", "sum"),
    soc_min=("soc_min", "mean"),
    nbr_charges=("capacity", "count"),  # non-null capacity rows, used as the charge count
    soh=("soh", "median"),
    odometer=("odometer", 'last'),
    battery_chemistry=("battery_chemistry", 'last'),
    start_date=("start_date", "first"),
    date=("date", 'last'),
)

In [None]:
# Share of each vehicle's charges that started from a low / mid SoC.
discharge_df['proportion_low'] = discharge_df['low_soc'].div(discharge_df['nbr_charges'])
discharge_df['proportion_mid'] = discharge_df['mid_soc'].div(discharge_df['nbr_charges'])

In [None]:
discharge_df['date'] = pd.to_datetime(discharge_df['date'])
discharge_df['start_date'] = pd.to_datetime(discharge_df['start_date'])

# Month difference computed in a vectorised way: 12 * year gap + month gap,
# taken in absolute value, with missing results counted as 0.
year_gap = discharge_df['date'].dt.year - discharge_df['start_date'].dt.year
month_gap = discharge_df['date'].dt.month - discharge_df['start_date'].dt.month
discharge_df['life_battery'] = (year_gap * 12 + month_gap).abs().fillna(0)

In [None]:
# Mileage rounded to the nearest 20,000, stored as text so plotly uses discrete colours.
discharge_df['round_odometer'] = (
    discharge_df['odometer'].div(20000).round().mul(20000).astype(str)
)


In [None]:
# Median SoH against the mean starting SoC of each vehicle.
px.scatter(
    discharge_df,
    x='soc_min',
    y='soh',
    color='battery_chemistry',
)

In [None]:
# Vehicles with at least one charge started from a low SoC.
px.scatter(
    discharge_df.loc[discharge_df['proportion_low'].gt(0)],
    x='proportion_low',
    y='soh',
    color='battery_chemistry',
    size='nbr_charges',
)

In [None]:
# Median SoH for each distinct low-SoC proportion, with an OLS trend line.
px.scatter(
    discharge_df[["proportion_low", "nbr_charges", "soh"]]
    .groupby('proportion_low', as_index=False)
    .median()
    .sort_values("proportion_low"),
    x="proportion_low",
    y="soh",
    trendline='ols',
)

In [None]:
# Vehicles with more than 10 charges.
px.scatter(
    discharge_df.loc[discharge_df['nbr_charges'].gt(10)],
    x='soc_min',
    y='soh',
    trendline='ols',
    size='nbr_charges',
    color='round_odometer',
)

In [None]:
# Vehicles with more than 5 charges whose battery_chemistry is not '7C'.
px.scatter(
    discharge_df.loc[
        discharge_df['nbr_charges'].gt(5)
        & discharge_df['battery_chemistry'].ne('7C')
    ],
    x='soc_min',
    y='soh',
    trendline='ols',
    size='nbr_charges',
    color='round_odometer',
)

In [None]:
# Same selection, further restricted to vehicles with at least one low-SoC charge.
px.scatter(
    discharge_df.loc[
        discharge_df['nbr_charges'].gt(5)
        & discharge_df['battery_chemistry'].ne('7C')
        & discharge_df["proportion_low"].gt(0)
    ],
    x='proportion_low',
    y='soh',
    trendline='ols',
    size='nbr_charges',
    color='round_odometer',
)