# Correlation Check

The goal is to ensure that the calculated SoH (State of Health) values are not correlated with the measured SoC (State of Charge):
- at the start of the charging session,
- at the end of the charging session,
- as the average over the entire charging session.

This ensures that the SoH estimation remains independent of the battery's charge level.

## Load data

In [None]:
from core.s3.s3_utils import S3Service
from core.spark_utils import create_spark_session
import os
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

In [None]:
# Build the Spark session from the S3 credentials held in the environment.
# `os.environ.get` yields None for a variable that is not set.
spark_session = create_spark_session(
    os.environ.get('S3_KEY'),
    os.environ.get('S3_SECRET'),
)

In [None]:
# Project S3 helper; the cells below use it to read the parquet inputs.
s3 = S3Service()


#### Raw Results

In [None]:
# Raw SoH results, one parquet file per make, read in the order listed.
# NOTE: the Spark-based reader (`s3.read_parquet_df_spark`) is not used here.
raw_res_bmw, raw_res_mercedes, raw_res_tesla = map(
    s3.read_parquet_df,
    (
        "raw_results/bmw.parquet",
        "raw_results/mercedes-benz.parquet",
        "raw_results/tesla.parquet",
    ),
)

##### Processed time series

In [None]:
# Step used to floor each timestamp into a 7-day bucket (`floored_date`).
# NOTE(review): flooring with a fixed 7-day step presumably anchors the buckets
# at the Unix epoch rather than on calendar weeks - confirm this is intended.
floor_date = pd.Timedelta(days=7)

# The BMW processed time series are currently not loaded (kept for reference):
# prce_tss_bmw = s3.read_parquet_df("processed_ts/bmw/time_series/processed_tss.parquet")
# prce_tss_bmw = prce_tss_bmw.eval("floored_date = date.dt.floor(@floor_date)")

# Read each make's processed time series and add `floored_date` in one chain.
prce_tss_mercedes = s3.read_parquet_df(
    "processed_ts/mercedes-benz/time_series/processed_tss.parquet"
).eval("floored_date = date.dt.floor(@floor_date)")

prce_tss_tesla = s3.read_parquet_df(
    "processed_ts/tesla/time_series/processed_tss.parquet"
).eval("floored_date = date.dt.floor(@floor_date)")

Merge the processed time series with the raw results so that the SoC and the computed SoH are available in the same DataFrame.

In [None]:
# Tesla: keep the session-related columns of the processed time series
# (including `soc`), attach `soc_diff` and `soh` from the raw results on
# matching (date, vin) pairs, and drop rows that have no SoH value.
df_soh_tesla = (
    prce_tss_tesla[
        [
            'date', 'soc', 'charging_status', 'odometer', 'charging_rate', 'vin',
            'in_charge', 'in_discharge', 'in_charge_idx', 'in_discharge_idx',
            'trailing_soc', 'leading_soc', 'trimmed_in_charge',
            'trimmed_in_discharge', 'trimmed_in_charge_idx',
            'trimmed_in_discharge_idx', 'status', 'model', 'version',
            'net_capacity',
        ]
    ]
    .merge(
        raw_res_tesla[['soc_diff', 'soh', 'date', 'vin']],
        on=['date', 'vin'],
        how='inner',
    )
    .dropna(subset='soh')
)

In [None]:
# Mercedes-Benz: keep the session-related columns of the processed time series,
# attach `soc` and `soh` from the raw results on matching (date, vin) pairs,
# and drop rows that have no SoH value. Unlike the Tesla frame, `soc` comes
# from the raw results here, not from the processed time series.
df_soh_mercedes = (
    prce_tss_mercedes[
        [
            'date', 'charging_status', 'odometer', 'charging_rate',
            'estimated_range', 'max_range', 'vin',
            'in_charge', 'in_discharge', 'in_charge_idx', 'in_discharge_idx',
            'trailing_soc', 'leading_soc', 'trimmed_in_charge',
            'trimmed_in_discharge', 'trimmed_in_charge_idx',
            'trimmed_in_discharge_idx', 'status', 'model', 'version',
            'net_capacity',
        ]
    ]
    .merge(
        raw_res_mercedes[['soc', 'soh', 'date', 'vin']],
        on=['date', 'vin'],
        how='inner',
    )
    .dropna(subset='soh')
)

## Correlation SoH/SoC

In [None]:
# One row per `in_charge_idx` group, holding the SoC statistics to compare
# against SoH:
#   - start_soc: lowest SoC seen in the group (taken as the session start),
#   - end_soc:   highest SoC seen in the group (taken as the session end),
#   - mean_soc:  average SoC over the group (the "average over the session"
#                check; it was missing from the aggregation),
#   - soh:       median SoH of the group,
#   - diff_soc:  end_soc - start_soc.
# NOTE(review): min/max stand in for start/end, which presumably relies on SoC
# rising during a charge - confirm.
# NOTE(review): the grouping key is `in_charge_idx` alone; if that index
# restarts for every vehicle, sessions from different VINs are pooled together.
# Confirm whether `vin` should be part of the key.
correlation_df = (
    df_soh_mercedes
    .groupby("in_charge_idx")
    .agg(
        start_soc=('soc', 'min'),
        end_soc=('soc', 'max'),
        mean_soc=('soc', 'mean'),
        soh=("soh", "median"),
    )
    .eval('diff_soc = end_soc - start_soc')
)

In [None]:
# Column whose correlations with every other numeric column are displayed.
selected_column = "soh"

# Pairwise correlation of the numeric columns, reduced to the selected column
# and ordered from the strongest positive correlation downwards.
corr = correlation_df.corr(numeric_only=True)
selected_corr = (
    corr.loc[:, [selected_column]]
    .sort_values(selected_column, ascending=False)
)

# Heat map of the selected correlation column (last expression, so the
# notebook renders the figure).
px.imshow(selected_corr, title=f"Correlation Matrix for {selected_column}")

In [None]:
# Scatter plot of the per-group SoH against the SoC span (end_soc - start_soc).
px.scatter(
    correlation_df,
    x="soh",
    y="diff_soc",
)