# Notebook pour tester la correction du SoH avec différentes features

Pour l'instant, chez Kia, le `SoH` est simplement corrigé par le `SoC` (cf. notebook `soh_estimation_kia.ipynb`).

In [None]:
import numpy as np
import plotly.express as px

from core.pandas_utils import *
from core.s3.s3_utils import S3Service
from core.s3.settings import S3Settings
from core.spark_utils import create_spark_session
from core.sql_utils import *
from core.stats_utils import *
from pyspark.sql import functions as F

# Configuration: make under study, S3 settings, Spark session and S3 client.
company = "kia"
settings = S3Settings()
spark = create_spark_session(settings.S3_KEY, settings.S3_SECRET)
s3 = S3Service()

In [None]:
# (raw signal name, short column name) pairs; the short names are the ones
# used by every later cell of this notebook.
_SIGNAL_TO_COLUMN = (
    ("Vehicle_Drivetrain_Odometer", "odometer"),
    ("Vehicle_Green_BatteryManagement_BatteryCapacity_Value", "max_capacity_kj"),
    ("Vehicle_Green_BatteryManagement_BatteryConditioning", "conditioning_status"),
    ("Vehicle_Green_BatteryManagement_BatteryPreCondition_Status", "pre_conditioning_status"),
    ("Vehicle_Green_BatteryManagement_BatteryPreCondition_TemperatureLevel", "pre_conditioning_temperature_level"),
    ("Vehicle_Green_BatteryManagement_BatteryRemain_Ratio", "soc"),
    ("Vehicle_Green_BatteryManagement_BatteryRemain_Value", "remaining_capacity_kj"),
    ("Vehicle_Green_BatteryManagement_SoH_Ratio", "soh_oem"),
    ("Vehicle_Green_ChargingInformation_Charging_RemainTime", "charging_time_remaining"),
    ("Vehicle_Green_ChargingInformation_ConnectorFastening_State", "is_charger_connected"),
    ("Vehicle_Green_ChargingInformation_ElectricCurrentLevel_State", "electric_current_level_state"),
)

# Mapping passed to `withColumnsRenamed`; insertion order follows the pairs above.
rename = dict(_SIGNAL_TO_COLUMN)

## Calcul du SoH de base suivant `KiaRawTsToProcessedPhases`

In [None]:
from transform.processed_phases.config import (
            SOC_DIFF_THRESHOLD,
        )
from transform.processed_phases.providers.kia import (
    KiaRawTsToProcessedPhases,
)
from core.spark_utils import (
    get_spark_available_cores,
    safe_astype_spark_with_error_handling,
)

# Provider transformer; its bucket, logger and helper methods are reused below.
kia = KiaRawTsToProcessedPhases(make="kia", spark=spark)

# Read the raw time series parquet of this make from the transformer's bucket.
raw_ts_key = f"raw_ts/{kia.make.value}/time_series/raw_ts_spark.parquet"
tss = kia.bucket.read_parquet_df_spark(spark=spark, key=raw_ts_key)

# Partition count suggested by the transformer for the available cores
# (the second returned value is not needed here).
available_cores = get_spark_available_cores(spark, kia.logger)
optimal_partitions_nb, _ = kia._set_optimal_spark_parameters(tss, available_cores)

# count() is an action: it forces the cached DataFrame to be materialised now.
tss = tss.coalesce(optimal_partitions_nb)
tss = tss.cache()
tss.count()



# Switch to the short column names, then keep only the renamed signals plus
# the vehicle identifier and the timestamp.
tss = tss.withColumnsRenamed(rename)
selected_columns = [
    "odometer",
    "max_capacity_kj",
    "conditioning_status",
    "pre_conditioning_status",
    "pre_conditioning_temperature_level",
    "soc",
    "remaining_capacity_kj",
    "soh_oem",
    "charging_time_remaining",
    "is_charger_connected",
    "electric_current_level_state",
    "vin",
    "date",
]
tss_filtered = tss.select(*selected_columns)

# Re-run the provider pipeline step by step, using the transformer's own
# helpers, up to the per-phase features (`full_data`).
tss_filtered = safe_astype_spark_with_error_handling(tss_filtered)
tss_filtered = kia._normalize_units_to_metric(tss_filtered)
# NOTE(review): presumably the forward fill relies on rows being ordered
# per vehicle and in time -- confirm against `fill_forward`.
tss_filtered = tss_filtered.orderBy(["vin", "date"])
tss_fullfill = kia.fill_forward(tss_filtered)
tss_phase_idx = kia.compute_charge_idx(
    tss_fullfill, SOC_DIFF_THRESHOLD[kia.make.value]
)
tss_phase_idx = tss_phase_idx.cache()
phases = kia.generate_phase(tss_phase_idx)
# Spark DataFrames are immutable: repartition()/cache() return a new
# DataFrame. It must be assigned back, otherwise count() below and the join
# after it would run on the original, un-cached, un-repartitioned `phases`.
phases = phases.repartition(optimal_partitions_nb).cache()
# count() is an action: it materialises the cache.
phases.count()

phase_tss = kia.join_metrics_to_phase(phases, tss_fullfill)
full_data = kia.compute_specific_features_before_aggregation(phase_tss)

## Correction avec le SoC

Modèle de régression linéaire simple pour chaque VIN

In [None]:
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
from pyspark.ml.feature import VectorAssembler


In [None]:
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import StructType, StructField, DoubleType, StringType

# Output schema of the per-VIN regression: one row per VIN holding the slope
# on soc and the intercept (all fields nullable, the default).
schema = (
    StructType()
    .add("VIN", StringType())
    .add("coef_soc", DoubleType())
    .add("intercept", DoubleType())
)

In [None]:
def linear_reg_soc_soh_bis(key, pdf: pd.DataFrame) -> pd.DataFrame:
    """Fit ``soh = coef_soc * soc + intercept`` by least squares for one group.

    The signature is the one of a grouped pandas function: ``key`` is the
    tuple of group-by values and ``pdf`` holds the rows of that group.
    NOTE(review): presumably meant for ``groupBy("VIN").applyInPandas(...)``;
    it is not called in the visible cells -- confirm before relying on it.

    Parameters
    ----------
    key : tuple
        Group-by key values; the first element is reported as the VIN.
    pdf : pd.DataFrame
        Rows of the group; must contain the ``soc`` and ``soh`` columns.

    Returns
    -------
    pd.DataFrame
        A single row with columns ``VIN``, ``coef_soc`` and ``intercept``.
        Both coefficients are ``None`` when fewer than 3 complete
        (soc, soh) pairs are available or when ``soc`` is constant.
    """
    # key is a tuple of the groupBy keys
    vin = key[0]

    # Keep only complete pairs: a missing value would otherwise reach polyfit.
    # This mirrors the dropna applied before the Spark-side aggregation.
    points = pdf[["soc", "soh"]].dropna()
    x = points["soc"].to_numpy(dtype=float)
    y = points["soh"].to_numpy(dtype=float)

    # A slope needs at least 3 points here and some spread in soc. Comparing
    # max and min is exact for a constant column, whereas a standard
    # deviation can come out as a tiny non-zero value because of rounding.
    if len(x) < 3 or x.max() == x.min():
        coef, intercept = None, None
    else:
        slope, offset = np.polyfit(x, y, 1)
        coef, intercept = float(slope), float(offset)

    return pd.DataFrame({
        "VIN": [vin],
        "coef_soc": [coef],
        "intercept": [intercept],
    })


In [None]:

# Rows usable for the fit: VIN, soc and soh must all be present.
base = full_data.select("VIN", "soc", "soh").dropna(subset=["VIN", "soc", "soh"])

# Per-VIN statistics from which the least-squares line is derived.
stats = base.groupBy("VIN").agg(
    F.count("*").alias("n"),
    F.avg("soc").alias("x_mean"),
    F.avg("soh").alias("y_mean"),
    F.var_samp("soc").alias("x_var"),
    F.covar_samp("soc", "soh").alias("xy_covar"),
)

# Column expressions (evaluated lazily, when the columns are added below).
null_double = F.lit(None).cast("double")
can_fit = (F.col("n") >= 3) & (F.col("x_var") > 0)
slope = F.col("xy_covar") / F.col("x_var")
offset = F.col("y_mean") - F.col("coef_soc") * F.col("x_mean")

# slope = cov(soc, soh) / var(soc); intercept = mean(soh) - slope * mean(soc).
# Both stay null when there are fewer than 3 rows or soc has no variance.
coef_df = (
    stats
    .withColumn("coef_soc", F.when(can_fit, slope).otherwise(null_double))
    .withColumn(
        "intercept",
        F.when(F.col("coef_soc").isNotNull(), offset).otherwise(null_double),
    )
    .select("VIN", "coef_soc", "intercept")
)

In [None]:
# Left join: every row of full_data is kept, with null coefficients for VINs absent from coef_df.
full_data_with_coef = full_data.join(coef_df, "VIN", "left")

In [None]:
# Divide the measured soh by the value of the per-VIN fitted line at the same
# soc; rows without a fitted slope keep their original soh.
fitted_soh = F.col("coef_soc") * F.col("soc") + F.col("intercept")
full_data_with_coef = full_data_with_coef.withColumn(
    "soh_updated",
    F.when(F.col("coef_soc").isNotNull(), F.col("soh") / fitted_soh)
    .otherwise(F.col("soh")),
)

In [None]:
# Bring only the columns needed for plotting and aggregation to the driver as pandas.
export_columns = [
    "VIN",
    "date",
    "coef_soc",
    "intercept",
    "soh",
    "soh_updated",
    "odometer",
    'PHASE_INDEX',
    'net_capacity',
]
df_pd = full_data_with_coef.select(*export_columns).toPandas()

In [None]:
# Preview: first rows, ordered by VIN, where both soh values are present.
df_pd.sort_values("VIN").dropna(subset=["soh", "soh_updated"]).head()

In [None]:
# Corrected SoH against odometer, one colour per VIN.
px.scatter(data_frame=df_pd, x="odometer", y="soh_updated", color="VIN")

## Aggregation 

In [None]:
# One row per (phase, VIN): output column -> (source column, aggregation).
phase_aggregations = {
    "soh": ("soh_updated", "median"),
    "count": ("soh_updated", "count"),
    "odometer": ("odometer", "max"),
    "net_capacity": ("net_capacity", "first"),
    "date": ("date", "max"),
}
soh_phases = df_pd.groupby(["PHASE_INDEX", "VIN"], as_index=False).agg(**phase_aggregations)



In [None]:
UPDATE_FREQUENCY = pd.Timedelta(days=7)

# Round each phase date down to a multiple of UPDATE_FREQUENCY, drop the
# timezone, and keep only the calendar day (stored back as datetime64[ns]).
phase_dates = pd.to_datetime(soh_phases["date"], format='mixed')
bucket_start = phase_dates.dt.floor(UPDATE_FREQUENCY).dt.tz_localize(None)
soh_phases["date"] = bucket_start.dt.date.astype('datetime64[ns]')

In [None]:
# One row per (date bucket, VIN): output column -> (source column, aggregation).
bucket_aggregations = {
    "soh": ("soh", "median"),
    "count": ("soh", "count"),
    "odometer": ("odometer", "max"),
    "net_capacity": ("net_capacity", "first"),
}
result = soh_phases.groupby(['date', 'VIN'], as_index=False).agg(**bucket_aggregations)

In [None]:
# Aggregated SoH against odometer, one colour per VIN.
px.scatter(data_frame=result, x="odometer", y="soh", color="VIN")