In [None]:
%load_ext autoreload
%autoreload 2

## Notebook pour tester le pipe de fleet-telemetry

### Raw tss

In [None]:
from core.s3_utils import *
from transform.raw_tss.fleet_telemetry_raw_tss import *
import pandas as pd
import plotly.express as px
from transform.processed_tss.config import *
from core.constants import *
from transform.raw_tss.config import *
# from transform.processed_tss.ProcessedTimeSeries import ProcessedTimeSeries
from transform.raw_results.tesla_fleet_telemetry import get_results as get_results_origin
from transform.processed_results.main import get_processed_results
from datetime import timedelta

import numpy as np

In [None]:
# Template of the S3 key (relative to the bucket root) holding the raw time series of a brand.
S3_RAW_TSS_KEY_FORMAT = "raw_ts/{brand}/time_series/raw_tss.parquet"
# Key of the raw time series built from the Tesla fleet-telemetry responses.
FLEET_TELEMETRY_RAW_TSS_KEY = S3_RAW_TSS_KEY_FORMAT.format(brand="tesla-fleet-telemetry")

In [None]:
FLEET_TELEMETRY_RAW_TSS_KEY

In [None]:
s3 = S3_Bucket()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as spark_max, substring
from typing import List
from pyspark.sql import SparkSession, DataFrame as SparkDF
from pyspark.sql.functions import col, pandas_udf
from typing import Iterator
import pandas as pd

def get_response_keys_to_parse_with_spark(bucket: S3_Bucket, spark: SparkSession) -> "pyspark.sql.DataFrame":
    """List the fleet-telemetry response files that have not been parsed yet.

    A response file is kept when its VIN has no parsed data at all, or when the
    date encoded in its file name is later than the last parsed date of that VIN.

    Args:
        bucket: Bucket wrapper used to test for the raw time series and to list
            the response keys.
        spark: Active Spark session.

    Returns:
        Spark DataFrame of the response keys joined with `last_parsed_date`,
        with an extra `date` timestamp column parsed from the `file` column.
    """
    # Local import: `expr` is not imported by the cell that defines this function.
    from pyspark.sql.functions import expr

    if bucket.check_file_exists(FLEET_TELEMETRY_RAW_TSS_KEY):
        # Spark needs a fully qualified s3a URI, not the bare bucket-relative key.
        raw_tss_df = spark.read.parquet(
            f"s3a://bib-platform-prod-data/{FLEET_TELEMETRY_RAW_TSS_KEY}"
        ).select("vin", "readable_date")
    else:
        # NOTE(review): DEFAULT_TESLA_RAW_TSS_DF is assumed to be a pandas frame that
        # Spark can convert (schema inference fails on an empty frame) - confirm.
        raw_tss_df = spark.createDataFrame(DEFAULT_TESLA_RAW_TSS_DF)

    # Last parsed date per VIN.
    last_parsed_df = raw_tss_df.groupBy("vin").agg(
        spark_max("readable_date").alias("last_parsed_date")
    )

    # Candidate response files.
    # NOTE(review): assumes `list_responses_keys_of_brand` supports `as_spark`/`spark` - confirm.
    responses_df = bucket.list_responses_keys_of_brand("tesla-fleet-telemetry", as_spark=True, spark=spark)

    # The date is the file name without its 5-character extension (e.g. ".json").
    # The previous Column.substr(...)[:-5] expression mixed int and Column arguments
    # and is rejected by PySpark; SQL `substring` expresses the same intent.
    responses_df = responses_df.withColumn(
        "date", expr("substring(file, 1, length(file) - 5)").cast("timestamp")
    )

    # Attach the last parsed date of each VIN to its response files.
    merged_df = responses_df.join(last_parsed_df, on="vin", how="outer")

    # Keep the files of never-parsed VINs and the files newer than the last parsed date.
    return merged_df.filter(
        col("last_parsed_date").isNull() | (col("date") > col("last_parsed_date"))
    )


def get_raw_tss_from_keys_with_spark(keys: SparkDF, bucket: S3_Bucket, spark: SparkSession, batch_size: int = 500, schema: "str | None" = None) -> SparkDF:
    """Parse the response files listed in `keys` into a raw time-series DataFrame.

    Args:
        keys: Spark DataFrame with at least the `key` and `date` columns.
        bucket: Bucket wrapper used on the executors to read each JSON response.
            NOTE(review): it is captured by the mapped function, so it must be
            picklable - confirm.
        spark: Active Spark session (unused here, kept for interface compatibility).
        batch_size: Approximate number of keys handled per partition.
        schema: Output schema of the parsed frames (DDL string or StructType).
            It must describe what `explode_data` returns; the previous placeholder
            ("vin string, timestamp timestamp, ...") is not a valid schema.

    Returns:
        Lazily evaluated Spark DataFrame of the parsed responses.

    Raises:
        ValueError: If `schema` is not provided or `batch_size` is not positive.
    """
    if schema is None:
        raise ValueError(
            "`schema` is required: pass the schema of the frames returned by explode_data."
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}.")

    # Ceil division so that a partition never holds much more than `batch_size` keys.
    num_partitions = max(1, -(-keys.count() // batch_size))
    # Range partitioning keeps keys of neighbouring dates together; a plain
    # orderBy followed by repartition would shuffle the order away again.
    keys = keys.repartitionByRange(num_partitions, "date")

    # mapInPandas takes a plain Python function over an iterator of pandas frames;
    # it must not be wrapped in pandas_udf.
    def parse_partition(batch_iter: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
        for pdf in batch_iter:
            parsed_dfs = []
            for key in pdf["key"]:
                try:
                    response = bucket.read_json(key)
                    parsed_dfs.append(explode_data(pd.DataFrame.from_records(response)))
                except Exception as e:
                    # Best effort: one unreadable file must not fail the whole batch.
                    logger.warning(f"Failed to parse {key}: {e}")
            if parsed_dfs:
                yield pd.concat(parsed_dfs, ignore_index=True)

    return keys.mapInPandas(parse_partition, schema=schema)

def get_raw_tss_with_spark(bucket: S3_Bucket, spark: SparkSession) -> "pyspark.sql.DataFrame":
    """Return the raw time series: the stored ones plus the newly parsed responses.

    Args:
        bucket: Bucket wrapper used to test for the stored raw time series and
            to read the responses.
        spark: Active Spark session.

    Returns:
        The union (by column name) of the stored raw time series and the newly
        parsed ones, or only the newly parsed ones when nothing is stored yet.
    """
    logger.debug("Getting raw TSS from responses provided by Tesla fleet telemetry.")
    keys_df = get_response_keys_to_parse_with_spark(bucket, spark)
    new_raw_tss_df = get_raw_tss_from_keys_with_spark(keys_df, bucket, spark)

    if not bucket.check_file_exists(FLEET_TELEMETRY_RAW_TSS_KEY):
        return new_raw_tss_df

    # The key is relative to the bucket: "s3://" + key had no bucket name and did
    # not use the s3a connector configured on the session.
    raw_tss_df = spark.read.parquet(
        f"s3a://bib-platform-prod-data/{FLEET_TELEMETRY_RAW_TSS_KEY}"
    )
    return raw_tss_df.unionByName(new_raw_tss_df)


In [None]:
S3_Bucket().get_creds_from_dot_env()#.read_json_file("response/tesla-fleet-telemetry/7SAXCCE67RF445292/2025-04-02.json")

In [None]:
import os

# Credentials must never be written in the notebook: they end up in version control
# and in shared outputs. The keys previously hard-coded here must be considered
# leaked and rotated.
# NOTE(review): SCW_ACCESS_KEY / SCW_SECRET_KEY are the standard Scaleway variable
# names - align them with the names used in the project's .env if they differ.
access_key = os.environ["SCW_ACCESS_KEY"]
secret_key = os.environ["SCW_SECRET_KEY"]

In [None]:
from pyspark.sql import SparkSession
import os
from transform.raw_tss.fleet_telemetry_raw_tss import *
# Ask PySpark to fetch the hadoop-aws package when the JVM starts.
# This must be set before the session below is created.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages org.apache.hadoop:hadoop-aws:3.3.4 pyspark-shell"

)
# Spark session configured to read the S3-compatible storage through the s3a
# connector (custom endpoint, path-style access, static access/secret keys).
# `getOrCreate` returns the already running session if there is one.
spark = SparkSession.builder \
    .appName("Scaleway S3 Read JSON") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4") \
    .config("spark.hadoop.fs.s3a.endpoint", "https://s3.fr-par.scw.cloud") \
    .config("spark.hadoop.fs.s3a.access.key", access_key) \
    .config("spark.hadoop.fs.s3a.secret.key", secret_key) \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .config("spark.executor.memory", "12g") \
    .config("spark.driver.memory", "12g") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()

In [None]:
s3.check_spark_file_exists("raw_ts/tesla-fleet-telemetry/time_series/spark_raw_tss.parquet/")

In [None]:
df = spark.read.parquet("s3a://bib-platform-prod-data/raw_ts/tesla-fleet-telemetry/time_series/spark_raw_tss.parquet")

In [None]:
from transform.raw_tss.spark_raw_tss import get_raw_tss as get_raw_tss_spark

In [None]:
import argparse
from logging import getLogger
import os
import shutil

from scipy.integrate import cumulative_trapezoid
import pandas as pd
from core.constants import *

from core.spark_utils import *
from core.pandas_utils import *
from core.caching_utils import CachedETLSpark
from core.logging_utils import set_level_of_loggers_with_prefix
from core.console_utils import main_decorator
from transform.processed_tss.config import *
from transform.raw_tss.main import get_raw_tss
from transform.fleet_info.main import fleet_info
from pyspark.sql import DataFrame as DF, Window
from pyspark.sql.functions import (
    col, lag, unix_timestamp, when, lit, last, 
    expr, coalesce, sum as spark_sum
)
from pyspark.sql.types import FloatType, TimestampType, DoubleType
from transform.fleet_info.main import fleet_info
from core.spark_utils import *
from transform.processed_tss.config import *
from core.s3_utils import S3_Bucket


# Here we have implemented the ETL as a class as most raw time series go through the same processing step.
# To have a processing step specific to a data provider/manufacturer, simply implement a subclass of ProcessedTimeSeries and update update_all_tss.
class ProcessedTimeSeries(CachedETLSpark):
    """Spark ETL that turns the raw time series of one make into processed time series.

    `run` is the ETL entry point; per the note on `run`, it is triggered by the
    parent class constructor. Subclasses must implement
    `compute_charge_n_discharge_vars`, which `run` calls.
    """
    # Declare that the following variable names are not dataframe(parent class) columns
    _metadata = ['make', "logger", "id_col", "max_td"]

    def __init__(self, make:str, id_col:str="vin", log_level:str="INFO", max_td:TD=MAX_TD, force_update:bool=False, spark: SparkSession = None,  **kwargs):
        """Configure the ETL for `make` and hand over to the caching parent class.

        Args:
            make: Make whose raw time series are processed.
            id_col: Column identifying a vehicle.
            log_level: Level applied to the loggers of this ETL.
            max_td: Largest gap between two points of the same period.
            force_update: Forwarded to the parent class.
            spark: Spark session; when None the active session is used by `run`.
            **kwargs: Forwarded to the parent class.
        """
        self.make = make
        logger_name = f"transform.processed_tss.{make}"
        self.logger = getLogger(logger_name)
        set_level_of_loggers_with_prefix(log_level, logger_name)
        self.id_col = id_col
        self.max_td = max_td
        self.spark = spark
        super().__init__(S3_PROCESSED_TSS_KEY_FORMAT.format(make=make), "s3", force_update=force_update, spark=spark, **kwargs)

    def _get_spark(self):
        """Return the session given at construction, falling back to the active one."""
        if self.spark is not None:
            return self.spark
        active = SparkSession.getActiveSession()
        if active is None:
            raise RuntimeError("No SparkSession was provided and none is active.")
        return active

    # No need to call run, it will be called in CachedETL init.
    def run(self):
        """Build the processed time series and return them as a Spark DataFrame."""
        # Use the injected session instead of whatever global `spark` happens to exist.
        spark = self._get_spark()
        self.logger.info(f"{'Processing ' + self.make + ' raw tss.':=^{50}}")
        tss = get_raw_tss_spark(spark)
        self.logger.info("load data")
        tss = rename_and_select(tss, rename_spark_column, col_to_select)
        tss = safe_astype_spark(tss)
        tss = self.normalize_units_to_metric(tss)
        tss = tss.orderBy(["vin", "date"])
        tss = self.compute_date_vars(tss)
        tss = self.compute_charge_n_discharge_vars(tss)
        tss = tss.join(spark.createDataFrame(fleet_info), 'vin', 'left')
        self.logger.info("process done")
        return tss

    def compute_charge_n_discharge_vars(self, tss):
        """Compute the charge/discharge variables; must be provided by subclasses."""
        raise NotImplementedError(
            f"{type(self).__name__} must implement compute_charge_n_discharge_vars."
        )

    def normalize_units_to_metric(self, tss):
        """Scale `odometer` by 1.609 (miles to kilometres factor)."""
        tss = tss.withColumn("odometer", col("odometer") * 1.609)
        return tss

    def compute_cum_var(self, tss, var_col: str, cum_var_col: str):
        """Add `cum_var_col`, the per-vehicle cumulative trapezoid integral of `var_col` over `date`.

        The integral is taken over time in seconds and multiplied by KJ_TO_KWH.
        `tss` is returned unchanged when `var_col` is missing.
        """
        if var_col not in tss.columns:
            self.logger.debug(f"{var_col} not found, not computing {cum_var_col}.")
            return tss

        # Local import: these types are not imported by the defining cell.
        from pyspark.sql.types import StructField, StructType

        self.logger.debug(f"Computing {cum_var_col} from {var_col} using Arrow + Pandas UDF.")

        # Build a new schema: StructType.add would mutate the schema object of `tss`.
        out_schema = StructType(list(tss.schema.fields) + [StructField(cum_var_col, DoubleType(), True)])

        def integrate_trapezoid(df: pd.DataFrame) -> pd.DataFrame:
            # The integration requires the points of the vehicle in chronological order.
            df = df.sort_values("date").copy()
            # Nanosecond timestamps -> seconds.
            x = df["date"].astype('int64') // 10**9
            y = df[var_col].fillna(0).astype("float64")
            # `initial=0` makes the series start at zero.
            df[cum_var_col] = cumulative_trapezoid(y=y.values, x=x.values, initial=0) * KJ_TO_KWH
            return df

        # applyInPandas replaces pandas_udf(..., functionType="grouped_map"), whose
        # string function type is not accepted by PySpark.
        return tss.groupBy(self.id_col).applyInPandas(integrate_trapezoid, schema=out_schema)

    def compute_date_vars(self, tss: DF) -> DF:
        """Add `prev_date` and `sec_time_diff` (seconds since the previous point of the vin)."""
        window_spec = Window.partitionBy("vin").orderBy("date")
        tss = tss.withColumn("prev_date", lag(col("date")).over(window_spec))
        tss = tss.withColumn(
            "sec_time_diff",
            (unix_timestamp(col("date")) - unix_timestamp(col("prev_date"))).cast("double")
        )
        return tss

    def compute_charge_n_discharge_masks(self, tss:DF, in_charge_vals:list, in_discharge_vals:list) -> DF:
        """Computes the `in_charge` and `in_discharge` masks either from the charging_status column or from the evolution of the soc over time."""
        self.logger.debug("Computing charging and discharging masks.")
        if self.make in CHARGE_MASK_WITH_CHARGING_STATUS_MAKES:
            return self.charge_n_discharging_masks_from_charging_status(tss, in_charge_vals, in_discharge_vals)
        if self.make in CHARGE_MASK_WITH_SOC_DIFFS_MAKES:
            return self.charge_n_discharging_masks_from_soc_diff(tss)
        raise ValueError(MAKE_NOT_SUPPORTED_ERROR.format(make=self.make))

    def charge_n_discharging_masks_from_soc_diff(self, tss):
        """Derive `in_charge` / `in_discharge` from the sign of the soc variations.

        A point is in charge (resp. discharge) when both the previous and the next
        known soc variations are positive (resp. negative).
        """
        by_date = Window.partitionBy(self.id_col).orderBy("date")
        fill_forward = by_date.rowsBetween(Window.unboundedPreceding, 0)
        fill_backward = (
            Window.partitionBy(self.id_col)
            .orderBy(col("date").desc())
            .rowsBetween(Window.unboundedPreceding, 0)
        )

        # Forward fill soc.
        tss = tss.withColumn("soc_ffilled", last("soc", ignorenulls=True).over(fill_forward))

        soc_delta = col("soc_ffilled") - lag("soc_ffilled").over(by_date)
        # Sign of the variation: 0 when there is no previous point, +/-1 on a change,
        # and null on a plateau so that the fills below propagate the neighbouring
        # signs. Written with `when` because the builtin `abs` does not accept a Column.
        soc_sign = (
            when(soc_delta.isNull(), lit(0.0))
            .when(soc_delta > 0, lit(1.0))
            .when(soc_delta < 0, lit(-1.0))
        )
        tss = tss.withColumn("soc_diff", soc_sign)

        # Forward and backward fills of the sign.
        tss = tss.withColumn("soc_diff_ffill", last("soc_diff", ignorenulls=True).over(fill_forward))
        tss = tss.withColumn("soc_diff_bfill", last("soc_diff", ignorenulls=True).over(fill_backward))

        tss = tss.withColumn("in_charge", (col("soc_diff_ffill") > 0) & (col("soc_diff_bfill") > 0))
        tss = tss.withColumn("in_discharge", (col("soc_diff_ffill") < 0) & (col("soc_diff_bfill") < 0))

        return tss

    def charge_n_discharging_masks_from_charging_status(self, tss:DF, in_charge_vals:list, in_discharge_vals:list) -> DF:
        """Derive `in_charge` / `in_discharge` from the membership of `charging_status`.

        Raises:
            AssertionError: If `tss` has no `charging_status` column.
        """
        self.logger.debug(f"Computing charging and discharging vars using charging status dictionary.")
        assert "charging_status" in tss.columns, NO_CHARGING_STATUS_COL_ERROR
        # `DataFrame.eval` is a pandas API; Spark needs column expressions.
        # A null status yields False, as in TeslaProcessedTimeSeries.
        return (
            tss
            .withColumn("in_charge", when(col("charging_status").isin(in_charge_vals), lit(True)).otherwise(lit(False)))
            .withColumn("in_discharge", when(col("charging_status").isin(in_discharge_vals), lit(True)).otherwise(lit(False)))
        )

    def trim_leading_n_trailing_soc_off_masks(self, tss:DF, masks:list[str]) -> DF:
        """Add `trimmed_<mask>` columns: the mask minus the points at the first/last soc of its period.

        Requires the `<mask>_idx` columns (see `compute_idx_from_masks`). The
        `trailing_soc` and `leading_soc` helper columns of the last mask are kept.
        NOTE(review): ported from item-assignment code that Spark DataFrames do not
        support; rows whose soc and period bounds are all null give False here - confirm.
        """
        # Local import: `first` is not imported by the defining cell.
        from pyspark.sql.functions import first

        self.logger.debug(f"Computing trimmed masks of{masks}.")
        for mask in masks:
            period = (
                Window.partitionBy("vin", f"{mask}_idx")
                .orderBy("date")
                .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
            )
            # soc where the mask holds, null elsewhere.
            masked_soc = when(col(mask), col("soc"))
            tss = (
                tss
                .withColumn("trailing_soc", first(masked_soc, ignorenulls=True).over(period))
                .withColumn("leading_soc", last(masked_soc, ignorenulls=True).over(period))
            )
            # Null-safe comparison so that a null on one side counts as "different".
            tss = tss.withColumn(
                f"trimmed_{mask}",
                col(mask)
                & ~col("soc").eqNullSafe(col("trailing_soc"))
                & ~col("soc").eqNullSafe(col("leading_soc")),
            )
        return tss

    def compute_idx_from_masks(self, tss, masks: list[str]):
        """
        Spark version of compute_idx_from_masks.

        A new period starts when the mask changes value, at the first point of a
        vehicle, or (when `max_td` is set) when the gap to the previous point
        exceeds `max_td`.

        Args:
            tss (DataFrame): Spark DataFrame.
            masks (list): List of boolean column names to compute idx on.

        Returns:
            DataFrame: Transformed Spark DataFrame with one `<mask>_idx` column per mask.
        """
        self.logger.info(f"Computing {masks} idx from masks.")

        # The time columns of this class are `date` / `sec_time_diff`; the previous
        # `time` / `time_diff` names do not exist in the frames built by `run`.
        if self.max_td is not None and "sec_time_diff" not in tss.columns:
            tss = self.compute_date_vars(tss)
        if self.max_td is not None:
            # NOTE(review): assumes max_td is timedelta-like or a number of seconds - confirm.
            max_td_seconds = (
                self.max_td.total_seconds() if hasattr(self.max_td, "total_seconds") else float(self.max_td)
            )

        by_date = Window.partitionBy(self.id_col).orderBy("date")
        for mask in masks:
            previous_mask = lag(col(mask), 1).over(by_date)
            new_period_start_mask = previous_mask.isNull() | (previous_mask != col(mask))
            if self.max_td is not None:
                new_period_start_mask = new_period_start_mask | (col("sec_time_diff") > lit(max_td_seconds))

            # The period index is the running count of period starts.
            tss = tss.withColumn(
                "new_period_start_mask", when(new_period_start_mask, lit(1)).otherwise(lit(0))
            )
            tss = tss.withColumn(
                f"{mask}_idx", spark_sum("new_period_start_mask").over(by_date)
            ).drop("new_period_start_mask")

        return tss

    def compute_status_col(self, tss):
        """Add `status`: charging, moving, idle_discharging or unknown (null `in_charge`)."""
        self.logger.debug("Computing status column.")

        by_date = Window.partitionBy("vin").orderBy("date")
        delta_odo = col("odometer") - lag("odometer").over(by_date)

        # Not charging: "moving" when the odometer increased, "idle_discharging" otherwise.
        status = when(col("in_charge") == True, lit("charging")) \
                .when(col("in_charge") == False,
                    when(delta_odo > 0, lit("moving"))
                    .otherwise(lit("idle_discharging"))
                    ) \
                .otherwise(lit("unknown"))

        return tss.withColumn("status", status)

    @classmethod
    def update_all_tss(cls, **kwargs):
        """Force the update of the processed time series of every make in ALL_MAKES."""
        # The misspelled "tesla-fleet-telemery" is still accepted for backward compatibility.
        tesla_makes = ("tesla", "tesla-fleet-telemetry", "tesla-fleet-telemery")
        for make in ALL_MAKES:
            etl_cls = TeslaProcessedTimeSeries if make in tesla_makes else ProcessedTimeSeries
            etl_cls(make, force_update=True, **kwargs)
            
            
class TeslaProcessedTimeSeries(ProcessedTimeSeries):
    """Processed time series of Tesla vehicles: charging-status masks and soc-trend sessions."""

    # NOTE(review): the default make "tesla-fleet-telemery" looks like a typo of
    # "tesla-fleet-telemetry"; it is kept because it drives the cache key - confirm.
    def __init__(self, make:str="tesla-fleet-telemery", id_col:str="vin", log_level:str="INFO", max_td:TD=MAX_TD, force_update:bool=False, spark=None, **kwargs):
        self.logger = getLogger(make)
        set_level_of_loggers_with_prefix(log_level, make)
        super().__init__(make, id_col, log_level, max_td, force_update, spark=spark, **kwargs)

    def compute_charge_n_discharge_vars(self, tss:DF) -> DF:
        """Add the charge/discharge masks, `charge_energy_added` and `in_charge_idx`."""
        tss = self.compute_charge_n_discharge_masks(tss, IN_CHARGE_CHARGING_STATUS_VALS, IN_DISCHARGE_CHARGING_STATUS_VALS)
        tss = self.compute_charge_idx_bis(tss)
        return tss

    def compute_charge_n_discharge_masks(self, tss:DF, in_charge_vals:list, in_discharge_vals:list) -> DF:
        """Computes the `in_charge` and `in_discharge` masks from the charging_status column.

        Raises:
            ValueError: If the make is not configured for charging-status masks.
        """
        if self.make in CHARGE_MASK_WITH_CHARGING_STATUS_MAKES:
            return self.charge_n_discharging_masks_from_charging_status(tss, in_charge_vals, in_discharge_vals)
        # Fail here rather than returning None and crashing later on `None.withColumn`.
        raise ValueError(MAKE_NOT_SUPPORTED_ERROR.format(make=self.make))

    def charge_n_discharging_masks_from_charging_status(self, tss: DF, in_charge_vals: list, in_discharge_vals: list) -> DF:
        """Add boolean `in_charge` / `in_discharge` columns; a null status gives False."""
        assert "charging_status" in tss.columns, NO_CHARGING_STATUS_COL_ERROR

        tss = tss.withColumn(
            "in_charge",
            when(col("charging_status").isin(in_charge_vals), lit(True)).otherwise(lit(False))
        )

        tss = tss.withColumn(
            "in_discharge",
            when(col("charging_status").isin(in_discharge_vals), lit(True)).otherwise(lit(False))
        )

        return tss

    def compute_energy_added(self, tss: DF) -> DF:
        """Add `charge_energy_added`: the DC value when it is positive, the AC value otherwise."""
        tss = tss.withColumn(
            "charge_energy_added",
            when(
                col("dc_charge_energy_added").isNotNull() & (col("dc_charge_energy_added") > 0),
                col("dc_charge_energy_added")
            ).otherwise(col("ac_charge_energy_added"))
        )
        return tss

    def compute_charge_idx_bis(self, tss: DF) -> DF:
        """Add `soc_diff` and `in_charge_idx`, a session index based on soc trend changes.

        A new session starts when the soc trend changes after two identical trends,
        or after a gap of more than 60 minutes between two points with a soc.
        `odometer` and `in_charge_idx` are forward filled per vin.
        """
        tss = self.compute_energy_added(tss)

        # 1. Only the points with a soc take part in the trend computation.
        tss_na = tss.filter(col("soc").isNotNull())

        # 2. Chronological window per vin.
        vin_window = Window.partitionBy("vin").orderBy("date")

        # 3. Trend of the soc and detection of the trend changes.
        tss_na = tss_na \
            .withColumn("soc_diff", col("soc") - lag("soc", 1).over(vin_window)) \
            .withColumn("trend", when(col("soc_diff") > 0, lit(1))
                                .when(col("soc_diff") < 0, lit(-1))
                                .otherwise(lit(0))) \
            .withColumn("prev_trend", lag("trend", 1).over(vin_window)) \
            .withColumn("prev_prev_trend", lag("trend", 2).over(vin_window)) \
            .withColumn("prev_date", lag("date", 1).over(vin_window)) \
            .withColumn("time_diff_min",
                        (unix_timestamp(col("date")) - unix_timestamp(col("prev_date"))) / 60) \
            .withColumn("time_gap", col("time_diff_min") > 60) \
            .withColumn("trend_change",
                        when(
                            ((col("trend") != col("prev_trend")) &
                            (col("prev_trend") == col("prev_prev_trend"))) |
                            col("time_gap"),
                            lit(1)
                        ).otherwise(lit(0)))

        # 4. A point sharing its timestamp with the previous one never starts a session.
        #    (The first points of a vin are already 0: their lagged values are null.)
        tss_na = tss_na.withColumn(
            "trend_change",
            when(col("date") == lag("date", 1).over(vin_window), lit(0)).otherwise(col("trend_change"))
        )

        # 5. Session index = running sum of the trend changes.
        tss_na = tss_na.withColumn("in_charge_idx", spark_sum("trend_change").over(vin_window.rowsBetween(Window.unboundedPreceding, 0)))

        # 6. Bring the index back on every point. The right side is de-duplicated on
        #    the join keys, otherwise duplicated points would multiply the rows.
        tss = tss.join(
            tss_na.select("vin", "date", "soc", "soc_diff", "in_charge_idx")
                  .dropDuplicates(["vin", "date", "soc"]),
            on=["vin", "date", "soc"],
            how="left"
        )

        # 7. Forward fill `odometer` and `in_charge_idx` per vin.
        fill_window = Window.partitionBy("vin").orderBy("date").rowsBetween(Window.unboundedPreceding, 0)
        tss = tss \
            .withColumn("odometer", coalesce(col("odometer"), expr("last(odometer, true)").over(fill_window))) \
            .withColumn("in_charge_idx", coalesce(col("in_charge_idx"), expr("last(in_charge_idx, true)").over(fill_window)))

        return tss

In [None]:
# Mapping from the raw fleet-telemetry column names to the names used by the
# processing code (read as a global by ProcessedTimeSeries.run).
rename_spark_column = {
        "readable_date": "date",
        "Odometer" : "odometer",
        "ACChargingEnergyIn": "ac_charge_energy_added",
        "Soc": "soc",
        "CarType": 'model',
        "DCChargingEnergyIn": "dc_charge_energy_added",
        "BatteryLevel": "battery_level",
        "ACChargingPower": "ac_charging_power",
        "DCChargingPower": "dc_charging_power",
        "DetailedChargeState": "charging_status",
        
}
# Columns kept after renaming (read as a global by ProcessedTimeSeries.run).
# Note that 'model' is renamed above but not selected here.
col_to_select = [
    'vin', 'date', 'odometer', 'soc', 
    "battery_level",
    "ac_charge_energy_added",
    "dc_charge_energy_added",
    "ac_charging_power",
    "dc_charging_power",
    "charging_status"]

In [None]:
res = TeslaProcessedTimeSeries(make='tesla-fleet-telemetry', force_update=False, spark=spark)

In [None]:
res.data.select('vin').distinct().count()

In [None]:
# NOTE(review): this replaces the `vin` column by a boolean flag (True for the
# vehicle below) and collects every row to the driver; if only that vehicle is
# needed, a `.filter(...)` before `.toPandas()` would avoid both - confirm intent.
df_pd = res.data.withColumn("vin", col("vin") == "5YJSA7E52RF541858").toPandas()

In [None]:
df_pd.columns

In [None]:
import plotly.express as px

In [None]:
df_pd.dropna(subset='soc').shape

In [None]:
df_pd.columns

In [None]:
df_pd['in_charge_idx'] = df_pd['in_charge_idx'].astype('str')

In [None]:
df_pd[df_pd["vin"]==True]

In [None]:
# Redefinition of the rename mapping (raw fleet-telemetry name -> processed name).
rename_spark_column = {
        "readable_date": "date",
        "Odometer" : "odometer",
        "ACChargingEnergyIn": "ac_charge_energy_added",
        "Soc": "soc",
        "CarType": 'model',
        "DCChargingEnergyIn": "dc_charge_energy_added",
        "BatteryLevel": "battery_level",
        "ACChargingPower": "ac_charging_power",
        "DCChargingPower": "dc_charging_power",
        "DetailedChargeState": "charging_status",
}
# Redefinition of the columns kept after renaming ('model' is not selected).
col_to_select = [
    'vin', 'date', 'odometer', 'soc', 
    "battery_level",
    "ac_charge_energy_added",
    "dc_charge_energy_added",
    "ac_charging_power",
    "dc_charging_power",
    "charging_status"]

In [None]:
ProcessedTimeSeries.update_all_tss()

In [None]:
df = res.run()

In [None]:
df.select('vin').distinct().count()

In [None]:
df

In [None]:
res.vin.nunique()

In [None]:
from transform.raw_results.raw_results_spark import *

In [None]:
raw_res = get_results(df)

In [None]:
raw_res.count()

In [None]:
raw_res_oandas = raw_res.toPandas()

In [None]:
raw_actual = s3.read_parquet_df('raw_results/tesla-fleet-telemetry.parquet')

In [None]:
raw_actual[["vin", "date", "soh", "version", ]].sort_values(['vin', 'date'])

In [None]:
raw_res_oandas[["vin", "date", "soh", "version","in_charge_idx"]].sort_values(['vin', 'date'])

In [None]:
raw_res_oandas

In [None]:
raw_res_oandas.columns

In [None]:
raw_res_oandas[["vin", "date", "soh"]].sort_values(['vin', 'date']).dropna()

In [None]:
df_Pandas = df.toPandas()

In [None]:
def create_spark_seesion(access_key, secret_key):
    """Create (or reuse) a Spark session configured for the Scaleway S3 endpoint.

    Args:
        access_key: Access key given to the s3a connector.
        secret_key: Secret key given to the s3a connector.

    Returns:
        The session returned by `getOrCreate`.
    """
    # Must be set before the session is created so that hadoop-aws is downloaded.
    os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages org.apache.hadoop:hadoop-aws:3.3.4 pyspark-shell"

    # Options applied in this order on the builder.
    session_options = [
        ("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4"),
        ("spark.hadoop.fs.s3a.endpoint", "https://s3.fr-par.scw.cloud"),
        ("spark.hadoop.fs.s3a.access.key", access_key),
        ("spark.hadoop.fs.s3a.secret.key", secret_key),
        ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
        ("spark.hadoop.fs.s3a.path.style.access", "true"),
        ("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"),
    ]

    session_builder = SparkSession.builder.appName("Scaleway S3 Read JSON")
    for option_name, option_value in session_options:
        session_builder = session_builder.config(option_name, option_value)
    return session_builder.getOrCreate()

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType , MapType
from pyspark.sql.functions import col, max as spark_max, to_timestamp, expr, udf,  explode, when
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, IntegerType, ArrayType
from functools import reduce


In [None]:
def read_parquet_spark(spark, key: str, columns: list[str] | None = None):
        full_path = f"s3a://bib-platform-prod-data/{key}"
        df = spark.read.parquet(full_path)
        if columns is not None:
            df = df.select(*columns)
        return df
    
def get_response_keys_to_parse_spark(spark, bucket:S3_Bucket):
    """Return the fleet-telemetry response keys that still have to be parsed.

    A key is kept when its vin has no parsed data, or when the date taken from
    its file name is later than the last parsed date of that vin.
    """
    if not bucket.check_file_exists(FLEET_TELEMETRY_RAW_TSS_KEY):
        # Nothing parsed yet: empty frame with the two columns used below.
        empty_schema = StructType([
            StructField("vin", StringType(), True),
            StructField("readable_date", TimestampType(), True),
        ])
        parsed_subset = spark.createDataFrame([], empty_schema)
    else:
        parsed_subset = read_parquet_spark(spark, FLEET_TELEMETRY_RAW_TSS_KEY, columns=["vin", "readable_date"])

    # Most recent parsed date of each vin.
    latest_per_vin = parsed_subset.groupby(["vin"]).agg({"readable_date": "max"})
    latest_per_vin = latest_per_vin.withColumnRenamed("max(readable_date)", "last_parsed_date")

    listed_keys = spark.createDataFrame(bucket.list_responses_keys_of_brand("tesla-fleet-telemetry"))
    # The file name without its last 5 characters is read as a timestamp.
    file_date = to_timestamp(expr("substring(file, 1, length(file) - 5)"))
    never_parsed = col("last_parsed_date").isNull()
    newer_than_parsed = col("date") > col("last_parsed_date")

    dated_keys = listed_keys.withColumn("date", file_date)
    joined = dated_keys.join(latest_per_vin, on="vin", how="outer")
    return joined.filter(never_parsed | newer_than_parsed)

In [None]:
path = f"s3a://bib-platform-prod-data/response/tesla-fleet-telemetry/5YJ3E7EB1KF334219/2025-05-21.json"
path_parquet = "s3a://bib-platform-prod-data/raw_ts/tesla-fleet-telemetry/time_series/raw_tss.parquet"
# df = spark.read.parquet(path_parquet)


In [None]:
response = get_response_keys_to_parse_spark(spark, s3)

In [None]:
resp_pandas = response.toPandas()

In [None]:
resp_pandas

In [None]:
def explode_data_spark(response: dict):
    """Flatten one fleet-telemetry response into a Spark DataFrame of string columns.

    Each entry of the response becomes one row with `vin`, `timestamp`,
    `readable_date` and one column per telemetry key found in its `data` list.

    Args:
        response: A response entry, or a list of entries.

    Returns:
        Spark DataFrame whose columns are all nullable strings. Booleans are
        written "true"/"false". Keys flagged `{"invalid": true}` and keys absent
        from an entry are null (previously "true" and "nan"). An empty response
        gives an empty frame with the three base columns.
    """
    # Local import keeps the function usable whatever cell imported the types.
    from pyspark.sql.types import StringType, StructField, StructType

    def _as_text(value):
        # None stays null; booleans use the lowercase spelling expected downstream.
        if value is None:
            return None
        if isinstance(value, bool):
            return "true" if value else "false"
        return str(value)

    columns = ["vin", "timestamp", "readable_date"]
    known_columns = set(columns)
    rows = []
    for entry in response if isinstance(response, list) else [response]:
        if not isinstance(entry, dict):
            # e.g. a response that could not be read.
            continue
        row = {name: entry.get(name) for name in ("vin", "timestamp", "readable_date")}
        for item in entry.get("data") or []:
            key = item.get("key")
            value_dict = item.get("value") or {}
            # `{"invalid": true}` marks a value the vehicle could not report: it is
            # a missing value, not the value True.
            if key is None or not value_dict or value_dict.get("invalid"):
                continue
            # The single entry of the dict holds the value whatever its type tag.
            row[key] = next(iter(value_dict.values()))
            if key not in known_columns:
                known_columns.add(key)
                columns.append(key)
        rows.append(row)

    # Explicit schema: inference fails on empty input and on all-null columns.
    schema = StructType([StructField(name, StringType(), True) for name in columns])
    records = [tuple(_as_text(row.get(name)) for name in columns) for row in rows]
    return spark.createDataFrame(records, schema)

In [None]:
def get_raw_tss_from_keys_spark(keys: DataFrame, bucket: S3_Bucket) -> DataFrame:
    """
    keys: Spark DataFrame with columns 'key', 'vin', 'date' (where 'date' is a timestamp)
    bucket: S3_Bucket with method read_multiple_json_files(keys: List[str], ...) -> List[dict]
    Returns a Spark DataFrame with parsed and exploded telemetry data of every week
    (an empty Spark DataFrame when there is nothing to parse).
    """
    # Local imports: these names are not imported before the defining cell.
    from concurrent.futures import ThreadPoolExecutor
    from rich.progress import track

    # The keys are grouped on the driver with pandas.
    keys_pd = keys.select("key", "vin", "date").toPandas()
    if keys_pd.empty:
        return spark.createDataFrame([], StructType([]))

    # Frames of all the weeks; the previous code only kept those of the last week.
    all_raw_tss = []
    grouped_items = list(keys_pd.groupby(pd.Grouper(key="date", freq="W-MON")))
    for week, week_keys in track(grouped_items, description="Processing weekly groups"):
        if week_keys.empty:
            # The grouper also yields the weeks without any key.
            continue
        week_date = week.date().strftime('%Y-%m-%d')
        logger.debug(f"Parsing the responses of the week {week_date}:")
        logger.debug(f"{len(week_keys)} keys to parse for {week_keys['vin'].nunique()} vins.")
        logger.debug(f"This represents {round(len(week_keys) / len(keys_pd) * 100)}% of the total keys to parse.")

        # Read the JSON responses from S3.
        responses = bucket.read_multiple_json_files(week_keys["key"].tolist(), max_workers=64)
        logger.debug(f"Read the responses.")

        # explode_data_spark returns one Spark DataFrame per response.
        with ThreadPoolExecutor(max_workers=64) as executor:
            all_raw_tss.extend(executor.map(explode_data_spark, responses))

    if not all_raw_tss:
        return spark.createDataFrame([], StructType([]))
    # The responses do not all carry the same telemetry keys, so the union must be
    # done by name and tolerate missing columns.
    return reduce(lambda left, right: left.unionByName(right, allowMissingColumns=True), all_raw_tss)

In [None]:
test_response = [{"vin":"5YJ3E7EB1KF334219","timestamp":1748318343277,"readable_date":"2025-05-27 03:59:03",
                  "data":[{"key":"InsideTemp","value":{"stringValue":"19.100000880658627"}},
                          {"key":"FastChargerType","value":{"invalid":True}},
                          {"key":"PackCurrent","value":{"stringValue":"0"}},
                          {"key":"ModuleTempMax","value":{"stringValue":"26"}},
                          {"key":"ChargeCurrentRequest","value":{"stringValue":"16"}},
                          {"key":"ChargePortColdWeatherMode","value":{"stringValue":"false"}},
                          {"key":"DCChargingEnergyIn","value":{"stringValue":"17.419999610632658"}},
                          {"key":"DCDCEnable","value":{"stringValue":"false"}},
                          {"key":"ChargerPhases","value":{"invalid":True}},
                          {"key":"PreconditioningEnabled","value":{"stringValue":"false"}},
                          {"key":"ChargeCurrentRequestMax","value":{"stringValue":"16"}},
                          {"key":"BatteryLevel","value":{"stringValue":"49.2429022082019"}},
                          {"key":"ACChargingPower","value":{"stringValue":"0"}},
                          {"key":"EstBatteryRange","value":{"stringValue":"113.16217657792016"}},
                          {"key":"BmsFullchargecomplete","value":{"stringValue":"false"}},
                          {"key":"ChargeAmps","value":{"stringValue":"0"}},
                          {"key":"LifetimeEnergyUsed","value":{"stringValue":"26604.12326362799"}},
                          {"key":"HvacACEnabled","value":{"booleanValue":False}},
                          {"key":"BatteryHeaterOn","value":{"stringValue":"false"}},
                          {"key":"IsolationResistance","value":{"stringValue":"4280"}},
                          {"key":"DetailedChargeState","value":{"detailedChargeStateValue":"DetailedChargeStateDisconnected"}},
                          {"key":"BrickVoltageMin","value":{"stringValue":"3.836000182200223"}},
                          {"key":"BrickVoltageMax","value":{"stringValue":"3.838000182295218"}},
                          {"key":"EstimatedHoursToChargeTermination","value":{"invalid":True}},
                          {"key":"ChargePort","value":{"stringValue":"CCS"}},
                          {"key":"ChargeState","value":{"stringValue":"Idle"}},
                          {"key":"HvacPower","value":{"hvacPowerValue":"HvacPowerStateOff"}},
                          {"key":"EfficiencyPackage","value":{"stringValue":"Default"}},
                          {"key":"HvacAutoMode","value":{"hvacAutoModeValue":"HvacAutoModeStateOn"}},
                          {"key":"SentryMode","value":{"stringValue":"Off"}},
                          {"key":"HvacFanSpeed","value":{"intValue":2}},
                          {"key":"BMSState","value":{"stringValue":"Standby"}},
                          {"key":"FastChargerPresent","value":{"stringValue":"false"}},
                          {"key":"ModuleTempMin","value":{"stringValue":"24.5"}},
                          {"key":"Odometer","value":{"stringValue":"64016.35690902214"}},
                          {"key":"Soc","value":{"stringValue":"48.958990536277604"}},
                          {"key":"DefrostMode","value":{"defrostModeValue":"DefrostModeStateOff"}},
                          {"key":"ChargeEnableRequest","value":{"stringValue":"true"}},
                          {"key":"ACChargingEnergyIn","value":{"stringValue":"17.83427804352963"}},
                          {"key":"ChargeRateMilePerHour","value":{"doubleValue":0}},
                          {"key":"ChargingCableType","value":{"invalid":True}},
                          {"key":"VehicleSpeed","value":{"invalid":True}},
                          {"key":"OutsideTemp","value":{"stringValue":"12"}},
                          {"key":"RatedRange","value":{"stringValue":"126.69387471919158"}},
                          {"key":"EuropeVehicle","value":{"booleanValue":True}},
                          {"key":"PackVoltage","value":{"stringValue":"368.1199917718768"}},
                          {"key":"IdealBatteryRange","value":{"stringValue":"126.69387471919158"}},
                          {"key":"ClimateKeeperMode","value":{"climateKeeperModeValue":"ClimateKeeperModeStateOff"}},
                          {"key":"RearDefrostEnabled","value":{"booleanValue":False}},
                          {"key":"CarType","value":{"stringValue":"Model3"}},
                          {"key":"DefrostForPreconditioning","value":{"booleanValue":False}},
                          {"key":"EnergyRemaining","value":{"stringValue":"32.65999926999211"}},
                          {"key":"ChargeLimitSoc","value":{"stringValue":"62"}},
                          {"key":"DCChargingPower","value":{"stringValue":"0"}}],"meta":{},"createdAt":"2025-05-27T03:59:03.277232687Z"},
                 {"vin":"5YJ3E7EB1KF334219","timestamp":1748318344277,"readable_date":"2025-05-27 03:59:04",
                  "data":[{"key":"ChargerVoltage","value":{"doubleValue":1.716}}],"meta":{},"createdAt":"2025-05-27T03:59:04.277224934Z"}]

In [None]:
from pyspark.sql.functions import to_date, trunc, date_format
from rich.progress import track

def get_raw_tss_from_keys_spark(keys: DataFrame, bucket: S3_Bucket) -> DataFrame:
    """Download the responses listed in ``keys`` week by week and union them.

    Args:
        keys: Spark DataFrame with (at least) a ``key`` column holding the S3
            object keys and a ``date`` column used to bucket keys by week.
        bucket: Bucket wrapper used to download the JSON responses.

    Returns:
        The union by column name (missing columns allowed) of everything
        ``explode_data_spark`` produced, or an empty Spark DataFrame when
        there was nothing to parse, so the return type is always Spark.
    """
    from pyspark.sql.functions import expr
    from pyspark.sql.types import StructType

    # Spark's trunc(date, 'week') already snaps to the Monday of the week.
    # The column is only used as a grouping key, so no extra offset is needed.
    df = keys.withColumn("week_start", expr("trunc(date, 'week')"))
    weeks = df.select("week_start").distinct().orderBy("week_start").collect()

    all_raw = []
    for row in track(weeks, description="Processing weekly groups"):
        week_keys = df.filter(df.week_start == row["week_start"])
        key_list = [r["key"] for r in week_keys.select("key").distinct().collect()]
        responses = bucket.read_multiple_json_files(key_list, max_workers=64)
        with ThreadPoolExecutor(max_workers=64) as executor:
            # extend() avoids rebuilding the accumulator list on every week.
            all_raw.extend(executor.map(explode_data_spark, responses))

    if not all_raw:
        # Stay in Spark-land: downstream cells call Spark methods on the result.
        return keys.sparkSession.createDataFrame([], StructType([]))
    return reduce(lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True), all_raw)


In [None]:
# Parse every listed response into a single raw time-series DataFrame.
raw_tss = get_raw_tss_from_keys_spark(response, s3)

In [None]:
# Collects the whole Spark DataFrame to the driver as pandas for inspection.
raw_tss.toPandas()

### Process TSS Spark

In [None]:
# Maps raw fleet-telemetry field names to the column names used downstream.
rename_spark_column = dict(
    readable_date="date",
    Odometer="odometer",
    ACChargingEnergyIn="ac_charge_energy_added",
    Soc="soc",
    CarType="model",
    DCChargingEnergyIn="dc_charge_energy_added",
    BatteryLevel="battery_level",
    ACChargingPower="ac_charging_power",
    DCChargingPower="dc_charging_power",
    DetailedChargeState="charging_status",
)

In [None]:
# Columns kept after renaming, in output order.
col_to_select = [
    "vin",
    "date",
    "odometer",
    "soc",
    "battery_level",
    "ac_charge_energy_added",
    "dc_charge_energy_added",
    "ac_charging_power",
    "dc_charging_power",
    "charging_status",
]

In [None]:
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, TimestampType
def rename_and_select(tss, rename_col, col_to_select):
    """Rename columns using the ``rename_col`` mapping, then keep only ``col_to_select``."""
    renamed = tss.withColumnsRenamed(rename_col)
    return renamed.select(col_to_select)

def safe_astype(tss):
    """Cast the numeric measurement columns to float and ``date`` to timestamp."""
    float_columns = (
        "odometer",
        "soc",
        "battery_level",
        "ac_charge_energy_added",
        "dc_charge_energy_added",
        "ac_charging_power",
        "dc_charging_power",
    )
    for float_column in float_columns:
        tss = tss.withColumn(float_column, col(float_column).cast(FloatType()))
    return tss.withColumn("date", col("date").cast(TimestampType()))

In [None]:
# Same casts as safe_astype, applied inline: float measurements, timestamp date.
raw_tss_filter_type = raw_tss_filter
for _float_column in (
    "odometer",
    "soc",
    "battery_level",
    "ac_charge_energy_added",
    "dc_charge_energy_added",
    "ac_charging_power",
    "dc_charging_power",
):
    raw_tss_filter_type = raw_tss_filter_type.withColumn(_float_column, col(_float_column).cast(FloatType()))
raw_tss_filter_type = raw_tss_filter_type.withColumn("date", col("date").cast(TimestampType()))

In [None]:
def normalize_units_to_metric(tss):
    """Return ``tss`` with ``odometer`` multiplied by 1.609 (approximate miles-to-km factor)."""
    odometer_km = col("odometer") * 1.609
    return tss.withColumn("odometer", odometer_km)

In [None]:
# Spark DataFrames are immutable: normalize_units_to_metric returns a new frame,
# so its result must be kept, otherwise the odometer conversion is silently lost.
raw_tss_filter_type_sorted = normalize_units_to_metric(raw_tss_filter_type).sort(['vin', "date"])

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import col, lag, unix_timestamp
def compute_date_vars(tss: DF) -> DF:
    """Add ``prev_date`` (previous row's date for the same vin) and
    ``sec_time_diff`` (seconds elapsed since that previous row, as a double)."""
    per_vin_by_date = Window.partitionBy("vin").orderBy("date")
    with_prev_date = tss.withColumn("prev_date", lag(col("date")).over(per_vin_by_date))
    elapsed_seconds = unix_timestamp(col("date")) - unix_timestamp(col("prev_date"))
    return with_prev_date.withColumn("sec_time_diff", elapsed_seconds.cast("double"))

In [None]:
# Add prev_date / sec_time_diff to the sorted, typed frame.
raw_tss_filter_type_sorted_date_vars = compute_date_vars(raw_tss_filter_type_sorted)

In [None]:
from pyspark.sql import DataFrame as DF, Window
from pyspark.sql.functions import (
    col, lag, unix_timestamp, when, lit,
    expr, coalesce, sum as _sum
)

In [None]:
# charging_status values (lower-case) treated as "vehicle is charging".
IN_CHARGE_CHARGING_STATUS_VALS = [
    # Tesla ('nopower' is currently disabled)
    "charging",
    "chargingactive",
    "slow_charging",
    "fast_charging",
    "initialization",
    "in-progress",
    # fleet-telemetry
    "detailedchargestatecharging",
    "detailedchargestatestarting",
]

# charging_status values (lower-case) treated as "vehicle is not charging".
IN_DISCHARGE_CHARGING_STATUS_VALS = [
    "charging_error",
    "nocharging",
    "chargingerror",
    "cable_unplugged",
    # Tesla
    "disconnected",
    # fleet-telemetry
    "detailedchargestatedisconnected",
    "detailedchargestatenopower",
    "detailedchargestatestopped",
    "detailedchargestatecomplete",
]

In [None]:
def charge_n_discharging_masks_from_charging_status(tss: DF, in_charge_vals: list, in_discharge_vals: list) -> DF:
    """Add boolean ``in_charge`` / ``in_discharge`` columns from ``charging_status``.

    A row is flagged True when its ``charging_status`` is in the corresponding
    list of values, and False otherwise.
    """
    assert "charging_status" in tss.columns, NO_CHARGING_STATUS_COL_ERROR

    def _status_in(values: list):
        # True when the status is one of `values`, False in every other case.
        return when(col("charging_status").isin(values), lit(True)).otherwise(lit(False))

    return (
        tss
        .withColumn("in_charge", _status_in(in_charge_vals))
        .withColumn("in_discharge", _status_in(in_discharge_vals))
    )

def compute_charge_n_discharge_masks(tss:DF, in_charge_vals:list, in_discharge_vals:list) -> DF:
    """Compute the `in_charge` and `in_discharge` masks from the charging_status column.

    Only the charging-status strategy is implemented in this Spark port, and the
    make is fixed to "tesla-fleet-telemetry".

    Raises:
        ValueError: if that make is not listed in CHARGE_MASK_WITH_CHARGING_STATUS_MAKES
            (previously the function silently returned None in that case).
    """
    make = "tesla-fleet-telemetry"
    if make not in CHARGE_MASK_WITH_CHARGING_STATUS_MAKES:
        raise ValueError(
            f"Make {make!r} is not in CHARGE_MASK_WITH_CHARGING_STATUS_MAKES; "
            "cannot compute charge masks from charging_status."
        )
    return charge_n_discharging_masks_from_charging_status(tss, in_charge_vals, in_discharge_vals)

In [None]:
def compute_energy_added(tss: DF) -> DF:
    """Add ``charge_energy_added``: the DC value when it is non-null and positive,
    otherwise the AC value."""
    dc_energy = col("dc_charge_energy_added")
    has_dc_energy = dc_energy.isNotNull() & (dc_energy > 0)
    charge_energy = when(has_dc_energy, dc_energy).otherwise(col("ac_charge_energy_added"))
    return tss.withColumn("charge_energy_added", charge_energy)

In [None]:


def compute_charge_idx_bis(tss: DF) -> DF:
    """Derive a per-vin session index (``in_charge_idx``) from the evolution of the SoC.

    Adds ``charge_energy_added`` (via ``compute_energy_added``), ``soc_diff`` and
    ``in_charge_idx`` to ``tss``. The index is a running count, per vin, of
    "trend change" events computed on the rows whose ``soc`` is not null.
    """
    
    tss = compute_energy_added(tss)
    
    # 1. Keep only the rows where soc is not null.
    tss_na = tss.filter(col("soc").isNotNull())

    # 2. Window over each vin, ordered by date.
    vin_window = Window.partitionBy("vin").orderBy("date")

    # 3. Row-to-row differences:
    #    - trend is the sign of soc_diff (+1 / -1 / 0),
    #    - time_gap flags more than 60 minutes since the previous row,
    #    - trend_change is 1 when the trend differs from the previous trend while
    #      the two previous trends agree, or when there is a time gap.
    #    Comparisons against a missing lag value are null, so they fall into the
    #    otherwise(0) branch.
    tss_na = tss_na \
        .withColumn("soc_diff", col("soc") - lag("soc", 1).over(vin_window)) \
        .withColumn("trend", when(col("soc_diff") > 0, lit(1))
                              .when(col("soc_diff") < 0, lit(-1))
                              .otherwise(lit(0))) \
        .withColumn("prev_trend", lag("trend", 1).over(vin_window)) \
        .withColumn("prev_prev_trend", lag("trend", 2).over(vin_window)) \
        .withColumn("prev_date", lag("date", 1).over(vin_window)) \
        .withColumn("time_diff_min", 
                    (unix_timestamp(col("date")) - unix_timestamp(col("prev_date"))) / 60) \
        .withColumn("time_gap", col("time_diff_min") > 60) \
        .withColumn("trend_change",
                    when(
                        ((col("trend") != col("prev_trend")) & 
                         (col("prev_trend") == col("prev_prev_trend"))) | 
                        col("time_gap"), 
                        lit(1)
                    ).otherwise(lit(0)))

    # 4. Reset trend_change to 0 on rows whose date equals the previous row's date.
    #    NOTE(review): the original comment said "initialise the first rows to 0",
    #    but the first row of a vin has no previous date, so this condition is null
    #    there and the value from step 3 is kept — confirm the intent.
    tss_na = tss_na.withColumn(
        "trend_change",
        when(col("date") == lag("date", 1).over(vin_window), lit(0)).otherwise(col("trend_change"))
    )

    # 5. Cumulative sum of trend_change per vin gives the session index.
    tss_na = tss_na.withColumn("in_charge_idx", _sum("trend_change").over(vin_window.rowsBetween(Window.unboundedPreceding, 0)))

    # 6. Join back onto the full frame; rows with a null soc get null soc_diff / in_charge_idx.
    #    NOTE(review): if (vin, date, soc) is not unique this join multiplies rows — verify.
    tss = tss.join(
        tss_na.select("vin", "date", "soc", "soc_diff", "in_charge_idx"),
        on=["vin", "date", "soc"],
        how="left"
    )

    # 7. Forward-fill `odometer` and `in_charge_idx` per vin using
    #    last(..., ignorenulls=true) over all rows up to the current one.
    fill_window = Window.partitionBy("vin").orderBy("date").rowsBetween(Window.unboundedPreceding, 0)
    tss = tss \
        .withColumn("odometer", coalesce(col("odometer"), expr("last(odometer, true)").over(fill_window))) \
        .withColumn("in_charge_idx", coalesce(col("in_charge_idx"), expr("last(in_charge_idx, true)").over(fill_window)))

    return tss


In [None]:
def compute_charge_n_discharge_vars(tss:DF) -> DF:
    """Add the charge/discharge masks, then the SoC-trend based ``in_charge_idx``."""
    with_masks = compute_charge_n_discharge_masks(
        tss,
        IN_CHARGE_CHARGING_STATUS_VALS,
        IN_DISCHARGE_CHARGING_STATUS_VALS,
    )
    return compute_charge_idx_bis(with_masks)
        # .pipe(self.compute_idx_from_masks, ["in_charge"])
        # .pipe(self.trim_leading_n_trailing_soc_off_masks, ["in_charge", "in_discharge"])
        # # .pipe(self.compute_idx_from_masks, ["trimmed_in_charge", "trimmed_in_discharge"])


In [None]:
# Add the charge/discharge masks and the session index to the date-enriched frame.
process_tss = compute_charge_n_discharge_vars(raw_tss_filter_type_sorted_date_vars)

In [None]:
def run():
    """Placeholder left from drafting; the working pipeline is the ``run(tss)`` defined later.

    The original cell had no body, which is a SyntaxError and stops Run All.
    """
    raise NotImplementedError("Use run(tss); this zero-argument stub has no implementation.")

In [None]:
from transform.fleet_info.main import fleet_info


In [None]:
def run(tss) -> DF:
    """Spark port of the processing pipeline: rename/select, cast, convert units,
    order, add date and charge variables, then left-join the fleet info on ``vin``.

    Relies on the notebook globals ``spark`` and ``fleet_info``.
    Not ported from the pandas pipeline (left disabled in the original cell):
    the ``str_lower_columns`` step and the ``age`` column.
    """
    selected = rename_and_select(tss, rename_spark_column, col_to_select)
    typed = safe_astype(selected)
    metric = normalize_units_to_metric(typed)
    ordered = metric.orderBy(["vin", "date"])
    with_date_vars = compute_date_vars(ordered)
    with_charge_vars = compute_charge_n_discharge_vars(with_date_vars)
    fleet_info_sdf = spark.createDataFrame(fleet_info)
    return with_charge_vars.join(fleet_info_sdf, 'vin', 'left')

In [None]:


#response.write.mode("overwrite").parquet("s3a://bib-platform-prod-data/raw_ts/tesla-fleet-telemetry/time_series/test.parquet")

In [None]:
# Run the Spark pipeline defined above on the raw time series.
run(raw_tss)

# pass

### Processed tss

In [None]:
from transform.processed_tss.ProcessedTimeSeries import ProcessedTimeSeries

In [None]:
def compute_charge_n_discharge_masks(tss:DF) -> DF:
        """Compute boolean ``in_charge`` / ``in_discharge`` columns from ``charging_status``.

        Unknown statuses are filled when they are surrounded by identical known
        statuses and the gap is at most ``MAX_CHARGE_TD``; rows whose ``soc`` is within
        3 points of ``charge_limit_soc`` are set back to unknown. ``tss`` is modified
        in place and a copy without the helper columns is returned.
        NOTE(review): the helper column ``nan_date`` is not in the final drop list.
        """
        # We use a nullable boolean Series to represent the rows where:
        tss["nan_charging"] = (
            Series(pd.NA, index=tss.index, dtype="boolean")# We are not sure of anything.
            .mask(tss["charging_status"].isin(IN_CHARGE_CHARGING_STATUS_VALS), True)# We are sure that the vehicle is in charge.
            .mask(tss["charging_status"].isin(IN_DISCHARGE_CHARGING_STATUS_VALS), False)# We are sure that the vehicle is not in charge.
        )
        # If a period of uncertainty (NaN) is surrounded by equal periods of certainty (True-NaN-True or False-NaN-False),
        # we fill it with the value of these certainties.
        # However there are edge cases with uncertainty periods lasting multiple days (I can't find the VIN but I'm sure you can ;-) )
        # Interestingly enough the charge_energy_added variable does not get forwarded that far and gets reset to zero.
        # This would create outliers in our charge SoH estimation as we estimate the energy_gained as the diff between the last(0) and first value of charge_energy_added.
        # So we set a maximal uncertainty period duration over which we don't fill it.
        # nan_date holds the date only on rows whose status is known.
        tss["nan_date"] = tss["date"].mask(tss["nan_charging"].isna())
        tss[["ffill_charging", "ffill_date"]] = tss.groupby("vin", observed=True)[["nan_charging", "nan_date"]].ffill()
        tss[["bfill_charging", "bfill_date"]] = tss.groupby("vin", observed=True)[["nan_charging", "nan_date"]].bfill()
        nan_period_duration:Series = tss.eval("bfill_date - ffill_date")
        fill_unknown_period = tss.eval("ffill_charging.eq(bfill_charging) & @nan_period_duration.le(@MAX_CHARGE_TD)")
        tss["nan_charging"] = tss["nan_charging"].mask(fill_unknown_period, tss["ffill_charging"])
        # As mentioned before, the SoC oscillates in [charge_limit_soc - ~3%, charge_limit_soc] so we set these periods to NaN as well.
        tss["nan_charging"] = tss["nan_charging"].mask(tss["soc"] >= (tss["charge_limit_soc"] - 3))
        # Then we separate the Series into two, more explicit, columns.
        tss["in_charge"] = tss.eval("nan_charging.notna() & nan_charging")
        tss["in_discharge"] = tss.eval("nan_charging.notna() & ~nan_charging")
        return tss.drop(columns=["nan_charging", "ffill_charging", "bfill_charging", "ffill_date", "bfill_date"])
    
def compute_energy_added(tss:DF) -> DF:
    """Set ``charge_energy_added`` in place: the DC value when it is non-null and
    positive, otherwise the AC value. Returns the same frame."""
    dc_energy = tss['dc_charge_energy_added']
    has_dc_energy = dc_energy.notnull() & (dc_energy > 0)
    tss['charge_energy_added'] = dc_energy.where(has_dc_energy, tss['ac_charge_energy_added'])
    return tss
    
def compute_charge_idx_bis(tss):
        """Derive ``in_charge_idx`` (a per-vin running count of SoC trend changes).

        Works on the rows with a non-null ``soc``, then merges ``soc_diff``,
        ``in_charge_idx`` and the trend helper columns back onto ``tss`` on
        (soc, date, vin), and forward-fills ``odometer`` and ``in_charge_idx``.
        """

        tss = tss.pipe(compute_energy_added)
        tss_na = tss.dropna(subset=['soc']).copy()
        tss_na['soc_diff'] = tss_na.groupby('vin', observed=True)['soc'].diff()
        # NOTE(review): the rolling mean and the ffill below run over the whole column,
        # not per vin, so values can carry across vehicle boundaries — confirm intended.
        tss_na['soc_diff_rolling'] = tss_na['soc_diff'].rolling(window=5, min_periods=1).mean()
        # Determine trend: +1 when the smoothed diff is positive, -1 when negative,
        # NaN otherwise (then filled with the previous trend).
        tss_na['trend'] = tss_na['soc_diff_rolling'].apply(lambda x: 1 if x > 0 else -1 if x < 0 else np.nan)
        tss_na['trend'] = tss_na['trend'].ffill()

        def detect_trend_change(group):
            """Flag, within one vin, the rows where a new period starts."""

            group['prev_trend'] = group['trend'].shift(1)
            group['prev_prev_trend'] = group['trend'].shift(2)
            
            group['prev_date'] = group['date'].shift(1)
            group['time_diff_min'] = (group['date'] - group['prev_date']).dt.total_seconds() / 60
            # More than 60 minutes since the previous row counts as a gap.
            group['time_gap'] = group['time_diff_min'] > 60  

            # TODO (original note): split this into a charge_idx and a discharge_idx.
            # A change is flagged when the trend differs from the previous one while the
            # two previous trends agree, or when there is a time gap.
            group['trend_change'] = (
                (((group['trend'] != group['prev_trend']) & 
                  (group['prev_trend'] == group['prev_prev_trend']) ) |
                group['time_gap'])
            )
            # The first two rows of each vin never start a new period.
            group.loc[group.index[0:2], 'trend_change'] = False
            return group


        tss_na = tss_na.groupby('vin', observed=True).apply(detect_trend_change).reset_index(drop=True)
        
        # Compute charge id
        tss_na['in_charge_idx'] = tss_na.groupby('vin',  observed=True)['trend_change'].cumsum()
        tss = tss.merge(tss_na[["soc", "date", "vin", 'soc_diff', 'in_charge_idx', 'trend', 'prev_trend', 'prev_prev_trend', 'trend_change',]], 
                        on=["soc", "date", "vin"], how="left")
        # NOTE(review): this forward fill is not grouped by vin.
        tss[["odometer","in_charge_idx"]] = tss[["odometer", "in_charge_idx"]].ffill()
        return tss

In [None]:
from core.pandas_utils import safe_locate, safe_astype, str_lower_columns
from transform.fleet_info.main import fleet_info 

In [None]:
make = "tesla-fleet-telemetry"
def run(tss):
    """pandas processing pipeline: rename, select and cast columns, convert units,
    sort, add date and charge variables, merge the fleet info and compute ``age``."""
    return (
        tss
        .rename(columns=RENAME_COLS_DICT, errors="ignore")
        .pipe(safe_locate, col_loc=list(COL_DTYPES.keys()), logger=logger)
        .pipe(safe_astype, COL_DTYPES, logger=logger)
        .pipe(normalize_units_to_metric)
        .sort_values(by=["vin", "date"])
        .pipe(str_lower_columns, COLS_TO_STR_LOWER)
        .pipe(compute_date_vars)
        .pipe(compute_charge_n_discharge_vars)
        .merge(fleet_info, on="vin", how="left")
        .eval("age = date.dt.tz_localize(None) - start_date.dt.tz_localize(None)")
        # reset_index does not seem to restore "vin" as a category when the groupby key
        # was categorical, so the dtype is re-applied in case it should be categorical.
        .astype({"vin": COL_DTYPES["vin"]})
    )

def compute_charge_n_discharge_vars(tss:DF) -> DF:
    """Add masks, period indices, trimmed masks, cumulative energies and the status column."""
    # In-charge / in-discharge masks.
    tss = compute_charge_n_discharge_masks(tss, IN_CHARGE_CHARGING_STATUS_VALS, IN_DISCHARGE_CHARGING_STATUS_VALS)
    # Matching period indices, used for split-apply-combine operations.
    tss = compute_idx_from_masks(tss, ["in_charge", "in_discharge"])
    # Trim the points carrying the first and last soc of each period to reduce
    # measurement noise, then index the trimmed masks as well.
    tss = trim_leading_n_trailing_soc_off_masks(tss, ["in_charge", "in_discharge"])
    tss = compute_idx_from_masks(tss, ["trimmed_in_charge", "trimmed_in_discharge"])
    tss = compute_cum_var(tss, "power", "cum_energy")
    tss = compute_cum_var(tss, "charger_power", "cum_charge_energy_added")
    return compute_status_col(tss)

def normalize_units_to_metric( tss:DF) -> DF:
    """Scale ``odometer`` in place by the factor ``ODOMETER_MILES_TO_KM`` holds for
    the global ``make`` (1 when the make has no entry)."""
    odometer_factor = ODOMETER_MILES_TO_KM.get(make, 1)
    tss["odometer"] = tss["odometer"] * odometer_factor
    return tss
from scipy.integrate import cumulative_trapezoid
def compute_cum_var( tss: DF, var_col:str, cum_var_col:str) -> DF:
    """Integrate ``var_col`` over time (trapezoid rule) into ``cum_var_col``.

    Returns ``tss`` untouched when ``var_col`` is missing. The result is scaled by
    ``KJ_TO_KWH`` and shifted so that every vin starts at zero.
    """
    if var_col not in tss.columns:
        logger.debug(f"{var_col} not found, not computing {cum_var_col}.")
        return tss
    logger.debug(f"Computing {cum_var_col} from {var_col}.")
    # Dates are converted to second resolution before being cast to int.
    seconds = tss["date"].dt.as_unit("s").astype(int)
    values = tss[var_col].fillna(0).values
    # Keyword arguments on purpose: the positional order is (y, x), not (x, y).
    integrated = cumulative_trapezoid(y=values, x=seconds, initial=0)
    tss[cum_var_col] = integrated.astype("float32")
    tss[cum_var_col] *= KJ_TO_KWH # Convert from kj to kwh
    # Subtracting each vin's first value restarts the integral per vehicle; this is
    # cheaper than a groupby.apply around cumulative_trapezoid.
    first_per_vin = tss.groupby("vin", observed=True)[cum_var_col].transform("first")
    tss[cum_var_col] -= first_per_vin
    return tss

def compute_date_vars( tss:DF) -> DF:
    """Add ``time_diff`` (timedelta to the previous row of the same vin) and
    ``sec_time_diff`` (the same value in seconds) in place."""
    time_since_previous = tss.groupby("vin", observed=False)["date"].diff()
    tss["time_diff"] = time_since_previous
    tss["sec_time_diff"] = tss["time_diff"].dt.total_seconds()
    return tss

def compute_charge_n_discharge_masks(tss:DF, in_charge_vals:list, in_discharge_vals:list) -> DF:
    """Computes the `in_charge` and `in_discharge` masks either from the charging_status column or from the evolution of the soc over time.

    The strategy is picked from the global ``make``; an unsupported make raises ValueError.
    """
    if make in CHARGE_MASK_WITH_CHARGING_STATUS_MAKES:
        masked = charge_n_discharging_masks_from_charging_status(tss, in_charge_vals, in_discharge_vals)
    elif make in CHARGE_MASK_WITH_SOC_DIFFS_MAKES:
        masked = charge_n_discharging_masks_from_soc_diff(tss)
    else:
        raise ValueError(MAKE_NOT_SUPPORTED_ERROR.format(make=make))
    return masked

def charge_n_discharging_masks_from_soc_diff( tss:DF) -> DF:
    """Compute ``in_charge`` / ``in_discharge`` from the sign of the per-vin SoC variation.

    A row is in charge (resp. discharge) when both the last known and the next known
    SoC variation signs are positive (resp. negative). ``tss`` is modified in place.
    """
    # The groupby object is created once and reused after new columns are added to tss.
    tss_grp = tss.groupby("vin", observed=True)
    tss["soc_ffilled"] = tss_grp["soc"].ffill()
    tss["soc_diff"] = tss_grp["soc_ffilled"].diff()
    # Reduce the diff to its sign (+1 / -1); a zero diff becomes NaN (0 / 0).
    tss["soc_diff"] /= tss["soc_diff"].abs()
    soc_diff_ffilled = tss_grp["soc_diff"].ffill()
    soc_diff_bfilled = tss_grp["soc_diff"].bfill()
    tss["in_charge"] = soc_diff_ffilled.gt(0, fill_value=False) & soc_diff_bfilled.gt(0, fill_value=False)
    tss["in_discharge"] = soc_diff_ffilled.lt(0, fill_value=False) & soc_diff_bfilled.lt(0, fill_value=False)
    return tss

def charge_n_discharging_masks_from_charging_status( tss:DF, in_charge_vals:list, in_discharge_vals:list) -> DF:
    """Return a copy of ``tss`` with boolean ``in_charge`` / ``in_discharge`` columns
    telling whether ``charging_status`` belongs to the corresponding list of values."""
    assert "charging_status" in tss.columns, NO_CHARGING_STATUS_COL_ERROR
    in_charge_expr = f"in_charge = charging_status in {in_charge_vals}"
    in_discharge_expr = f"in_discharge = charging_status in {in_discharge_vals}"
    with_in_charge = tss.eval(in_charge_expr)
    return with_in_charge.eval(in_discharge_expr)

def trim_leading_n_trailing_soc_off_masks( tss:DF, masks:list[str]) -> DF:
    """For each mask, add ``trimmed_<mask>``: the mask minus the rows whose soc equals
    the first or the last masked soc of their (vin, ``<mask>_idx``) period.

    Also leaves ``trailing_soc`` (first soc of the period) and ``leading_soc`` (last soc)
    from the last processed mask on the frame.
    """
    for mask in masks:
        # soc restricted to the rows where the mask is set.
        tss["naned_soc"] = tss["soc"].where(tss[mask])
        period_soc = tss.groupby(["vin", f"{mask}_idx"], observed=True)["naned_soc"]
        first_soc = period_soc.transform("first")
        last_soc = period_soc.transform("last")
        tss["trailing_soc"] = first_soc
        tss["leading_soc"] = last_soc
        differs_from_edges = (tss["soc"] != first_soc) & (tss["soc"] != last_soc)
        tss[f"trimmed_{mask}"] = tss[mask] & differs_from_edges
    return tss.drop(columns="naned_soc")
# Maximum gap between two rows of the same period; a larger gap starts a new one.
max_td = TD(hours=1, minutes=30)
def compute_idx_from_masks( tss: DF, masks:list[str]) -> DF:
    """For each mask, add ``<mask>_idx``: a per-vin counter (uint16) that increases
    whenever the mask value changes or, when ``max_td`` is set, the time gap exceeds it."""
    for mask in masks:
        previous_mask = tss.groupby("vin", observed=True)[mask].shift(fill_value=False)
        tss["new_period_start_mask"] = previous_mask.ne(tss[mask])
        if max_td is not None:
            tss["new_period_start_mask"] |= (tss["time_diff"] > max_td)
        period_counter = tss.groupby("vin", observed=True)["new_period_start_mask"].cumsum()
        tss[f"{mask}_idx"] = period_counter.astype("uint16")
        del tss["new_period_start_mask"]
    return tss

def compute_status_col( tss:DF) -> DF:
    """Add a ``status`` column in place: the label mapped from ``in_charge``, replaced
    on non-charging rows by "moving" when the per-vin odometer increased and by
    "idle_discharging" otherwise."""
    odometer_increased = tss.groupby("vin", observed=True)["odometer"].diff() > 0
    not_charging_label = np.where(odometer_increased, "moving", "idle_discharging")
    charge_label = tss["in_charge"].map({True: "charging", False:"discharging", pd.NA:"unknown"})
    is_not_charging = tss["in_charge"].eq(False, fill_value=True)
    tss["status"] = charge_label.mask(is_not_charging, not_charging_label)
    return tss

In [None]:
# Run the pandas pipeline defined in the previous cell.
processed_tss = run(raw_tss)

In [None]:

class TeslaProcessedTimeSeries(ProcessedTimeSeries):
    """Tesla-specific overrides of the charge/discharge steps of ``ProcessedTimeSeries``."""

    def __init__(self, make:str="tesla", id_col:str="vin", log_level:str="INFO", max_td:TD=MAX_TD, force_update:bool=False, **kwargs):
        """Set up the per-make logger, then delegate to ``ProcessedTimeSeries.__init__``."""
        self.logger = getLogger(make)
        set_level_of_loggers_with_prefix(log_level, make)
        super().__init__(make, id_col, log_level, max_td, force_update, **kwargs)

    def compute_charge_n_discharge_vars(self, tss:DF) -> DF:
        """Add the Tesla charge/discharge masks, then the SoC-trend based ``in_charge_idx``."""
        return (
            tss
            .pipe(self.compute_charge_n_discharge_masks)
            .pipe(self.compute_charge_idx_bis)
            # .pipe(self.compute_idx_from_masks, ["in_charge"])
            # .pipe(self.trim_leading_n_trailing_soc_off_masks, ["in_charge", "in_discharge"])
            # # .pipe(self.compute_idx_from_masks, ["trimmed_in_charge", "trimmed_in_discharge"])
        )

    def compute_charge_n_discharge_masks(self, tss:DF) -> DF:
        """Compute boolean ``in_charge`` / ``in_discharge`` columns from ``charging_status``.

        Unknown statuses are filled when surrounded by identical known statuses and the
        gap is at most ``MAX_CHARGE_TD``; rows whose ``soc`` is within 3 points of
        ``charge_limit_soc`` are set back to unknown.
        NOTE(review): the helper column ``nan_date`` is not in the final drop list.
        """
        self.logger.debug("Computing tesla specific charge and discharge masks")
        # We use a nullable boolean Series to represent the rows where:
        tss["nan_charging"] = (
            Series(pd.NA, index=tss.index, dtype="boolean")# We are not sure of anything.
            .mask(tss["charging_status"].isin(IN_CHARGE_CHARGING_STATUS_VALS), True)# We are sure that the vehicle is in charge.
            .mask(tss["charging_status"].isin(IN_DISCHARGE_CHARGING_STATUS_VALS), False)# We are sure that the vehicle is not in charge.
        )
        # If a period of uncertainty (NaN) is surrounded by equal periods of certainty (True-NaN-True or False-NaN-False),
        # we fill it with the value of these certainties.
        # However there are edge cases with uncertainty periods lasting multiple days (I can't find the VIN but I'm sure you can ;-) )
        # Interestingly enough the charge_energy_added variable does not get forwarded that far and gets reset to zero.
        # This would create outliers in our charge SoH estimation as we estimate the energy_gained as the diff between the last(0) and first value of charge_energy_added.
        # So we set a maximal uncertainty period duration over which we don't fill it.
        tss["nan_date"] = tss["date"].mask(tss["nan_charging"].isna())
        tss[["ffill_charging", "ffill_date"]] = tss.groupby("vin", observed=True)[["nan_charging", "nan_date"]].ffill()
        tss[["bfill_charging", "bfill_date"]] = tss.groupby("vin", observed=True)[["nan_charging", "nan_date"]].bfill()
        nan_period_duration:Series = tss.eval("bfill_date - ffill_date")
        fill_unknown_period = tss.eval("ffill_charging.eq(bfill_charging) & @nan_period_duration.le(@MAX_CHARGE_TD)")
        tss["nan_charging"] = tss["nan_charging"].mask(fill_unknown_period, tss["ffill_charging"])
        # As mentioned before, the SoC oscillates in [charge_limit_soc - ~3%, charge_limit_soc] so we set these periods to NaN as well.
        tss["nan_charging"] = tss["nan_charging"].mask(tss["soc"] >= (tss["charge_limit_soc"] - 3))
        # Then we separate the Series into two, more explicit, columns.
        tss["in_charge"] = tss.eval("nan_charging.notna() & nan_charging")
        tss["in_discharge"] = tss.eval("nan_charging.notna() & ~nan_charging")
        return tss.drop(columns=["nan_charging", "ffill_charging", "bfill_charging", "ffill_date", "bfill_date"])
    
    def compute_charge_n_discharge_masks_bis(self, tss:DF) -> DF:
        """Alternative masks based on the sign of the per-vin SoC difference.

        ``in_charge`` is True where the trend is +1 and ``in_discharge`` where it is -1.
        """
        self.logger.debug("Computing tesla specific charge and discharge masks")

        tss_na = tss.dropna(subset=['soc']).copy()

        tss_na['soc_diff'] = tss_na.groupby('vin', observed=True)['soc'].diff()

        tss_na['trend'] = tss_na['soc_diff'].apply(lambda x: 1 if x > 0 else -1 if x < 0 else 0)

        #tss_na['trend_change'] = tss_na.groupby('vin', observed=True)['trend'].transform(lambda x: x != x.shift())
        tss = tss.merge(tss_na[["soc", "date", "vin", 'soc_diff', 'trend']], 
                        on=["soc", "date", "vin"], how="left")
        # bfill(inplace=True) on a column selection fills a temporary copy and leaves
        # tss untouched, so the filled values are assigned back explicitly.
        fill_cols = ["trend", "soc", "odometer"]
        tss[fill_cols] = tss[fill_cols].bfill()
        tss["in_charge"] = tss.eval('trend==1')
        tss["in_discharge"] = tss.eval('trend==-1')
        return tss
    
    
    def compute_energy_added(self, tss:DF) -> DF:
        """Set ``charge_energy_added``: the DC value when non-null and positive, else the AC value."""
        tss['charge_energy_added'] = tss['dc_charge_energy_added'].where(
            tss['dc_charge_energy_added'].notnull() & 
            (tss['dc_charge_energy_added'] > 0), 
            tss['ac_charge_energy_added'])
        return tss
    
    # def compute_charge_idx(self, tss:DF) -> DF:
    #     self.logger.debug("Computing tesla specific charge index.")
    #     if self.make == 'tesla-fleet-telemetry':
    #         tss = tss.pipe(self.compute_energy_added)
    #     tss_grp = tss.groupby("vin", observed=False)
    #     tss["charge_energy_added"] = tss_grp["charge_energy_added"].ffill()
    #     energy_added_over_time = tss_grp['charge_energy_added'].diff().div(tss["sec_time_diff"].values)
    #     # charge_energy_added is cummulative and forward filled, 
    #     # We check that the charge_energy_added decreases too fast to make sure that  correctly indentify two charging periods before and after a gap as two separate charging periods.
    #     new_charge_mask = energy_added_over_time.lt(MIN_POWER_LOSS, fill_value=0) 
    #     # For the same reason, we ensure that there are no gaps bigger than MAX_CHARGE_TD in between to rows of the same charging period.
    #     new_charge_mask |= tss["time_diff"].gt(MAX_CHARGE_TD) 
    #     # And of course we also check that there is no change of status. 
    #     new_charge_mask |= (~tss_grp["in_charge"].shift().bfill() & tss["in_charge"]) 
    #     tss["in_charge_idx"] = new_charge_mask.groupby(tss["vin"], observed=True).cumsum()
    #     print(tss["in_charge_idx"].count() / len(tss))
    #     tss["in_charge_idx"] = tss["in_charge_idx"].fillna(-1).astype("uint16")
    #     return tss
    
    def compute_charge_idx_bis(self, tss):
        """Derive ``in_charge_idx`` (a per-vin running count of SoC trend changes).

        Works on the rows with a non-null ``soc``, then merges the result back onto
        ``tss`` on (soc, date, vin) and forward-fills ``odometer`` and ``in_charge_idx``.
        """

        if self.make == 'tesla-fleet-telemetry':
            tss = tss.pipe(self.compute_energy_added)
        tss_na = tss.dropna(subset=['soc']).copy()
        tss_na['soc_diff'] = tss_na.groupby('vin', observed=True)['soc'].diff()
        # NOTE(review): the rolling mean and the ffill below run over the whole column,
        # not per vin, so values can carry across vehicle boundaries — confirm intended.
        tss_na['soc_diff_rolling'] = tss_na['soc_diff'].rolling(window=5, min_periods=1).mean()
        # Determine trend
        tss_na['trend'] = tss_na['soc_diff_rolling'].apply(lambda x: 1 if x > 0 else -1 if x < 0 else np.nan)
        tss_na['trend'] = tss_na['trend'].ffill()

        def detect_trend_change(group):
            """Flag, within one vin, the rows where a new period starts."""

            group['prev_trend'] = group['trend'].shift(1)
            group['prev_prev_trend'] = group['trend'].shift(2)
            
            group['prev_date'] = group['date'].shift(1)
            group['time_diff_min'] = (group['date'] - group['prev_date']).dt.total_seconds() / 60
            # More than 60 minutes since the previous row counts as a gap.
            group['time_gap'] = group['time_diff_min'] > 60  

            # TODO (original note): split this into a charge_idx and a discharge_idx.
            group['trend_change'] = (
                (((group['trend'] != group['prev_trend']) & 
                  (group['prev_trend'] == group['prev_prev_trend']) ) |
                group['time_gap'])
            )
            # The first two rows of each vin never start a new period.
            group.loc[group.index[0:2], 'trend_change'] = False
            return group


        tss_na = tss_na.groupby('vin', observed=True).apply(detect_trend_change).reset_index(drop=True)
        
        # Compute charge id
        tss_na['in_charge_idx'] = tss_na.groupby('vin',  observed=True)['trend_change'].cumsum()
        tss = tss.merge(tss_na[["soc", "date", "vin", 'soc_diff', 'in_charge_idx', 'trend', 'prev_trend', 'prev_prev_trend', 'trend_change',]], 
                        on=["soc", "date", "vin"], how="left")
        tss[["odometer","in_charge_idx"]] = tss[["odometer", "in_charge_idx"]].ffill()
        return tss


In [None]:
# Rebuild processed_tss through the class-based pipeline (this rebinds the name set by run()).
processed_tss = TeslaProcessedTimeSeries("tesla-fleet-telemetry", force_update=True)
# Store the session index as a string.
processed_tss['in_charge_idx'] = processed_tss['in_charge_idx'].astype(str)


### raw results

In [None]:
from core.stats_utils import series_start_end_diff

In [None]:
# One row per (vin, in_charge_idx) period, with the energy added, the SoC variation
# and a raw SoH estimate derived from them.
raw_results = (processed_tss.groupby(["vin", "in_charge_idx"], observed=True, as_index=False).agg(
            ac_energy_added_min=pd.NamedAgg("ac_charge_energy_added", "min"),
            dc_energy_added_min=pd.NamedAgg("dc_charge_energy_added", "min"),
            ac_energy_added_end=pd.NamedAgg("ac_charge_energy_added", "last"),
            dc_energy_added_end=pd.NamedAgg("dc_charge_energy_added", "last"),
            soc_diff=pd.NamedAgg("soc", series_start_end_diff),
            inside_temp=pd.NamedAgg("inside_temp", "mean"),
            net_capacity=pd.NamedAgg("net_capacity", "first"),
            range=pd.NamedAgg("range", "first"),
            odometer=pd.NamedAgg("odometer", "first"),
            version=pd.NamedAgg("version", "first"),
            size=pd.NamedAgg("soc", "size"),
            model=pd.NamedAgg("model", "first"),
            date=pd.NamedAgg("date", "first"),
            ac_charging_power=pd.NamedAgg("ac_charging_power", "median"),
            dc_charging_power=pd.NamedAgg("dc_charging_power", "median"),
            tesla_code=pd.NamedAgg("tesla_code", "first"),
        )
        .eval("charging_power = ac_charging_power + dc_charging_power")
        # Energy added over the period = last value minus minimum value of the counter.
        .eval("ac_energy_added = ac_energy_added_end  - ac_energy_added_min")
        .eval("dc_energy_added = dc_energy_added_end  - dc_energy_added_min")
        # NOTE(review): np.maximum propagates NaN, so energy_added is NaN whenever either
        # the AC or the DC value is NaN (np.fmax would ignore the NaN) — confirm intended.
        .assign(energy_added=lambda df: np.maximum(df["ac_energy_added"], df["dc_energy_added"]))
        # SoH = energy added / (fraction of SoC gained * net capacity).
        .eval("soh = energy_added / (soc_diff / 100.0 * net_capacity)")
        # Share of SoC gained per charging level, split on charging_power at 8 and 45
        # (presumably kW — TODO confirm the unit).
        .eval("level_1 = soc_diff * (charging_power < 8) / 100")
        .eval("level_2 = soc_diff * (charging_power.between(8, 45)) / 100")
        .eval("level_3 = soc_diff * (charging_power > 45) / 100")
        .sort_values(["tesla_code", "vin", "date"]))

In [None]:
# Manual check with hard-coded values; it has the same shape as the soh formula,
# energy_added / (soc_diff / 100 * net_capacity) — presumably one row of raw_results.
25.788389 / (40.029564 / 100 * 60)

In [None]:
# Inspect the SoH inputs and outputs for a single vehicle.
raw_results[(raw_results['vin']=="LRWYGCFS6PC992837")][['soh', 'odometer', 'soc_diff', "energy_added"]]

In [None]:
# pour repartir de ce qui est stocké
#raw_results_origin = get_results_origin(force_update=False)

### Processed results

In [None]:
from transform.processed_results.main import *

In [None]:
# One DataFrame.eval expression per variant; the variants differ only in the
# minimum soc_diff a row needs for its soh to be kept. The soh must also lie
# in [0.75, 1.05]; rows failing either condition get a NaN soh.
# With DataFrame.eval's default pandas parser, `&` has the precedence of `and`,
# so each expression reads as (soc_diff > N) and soh.between(0.75, 1.05).
SOH_FILTER_EVAL = {
    f"tesla-fleet-telemetry-{min_soc_diff}": (
        f"soh = soh.where(soc_diff > {min_soc_diff} & soh.between(0.75, 1.05))"
    )
    for min_soc_diff in (30, 25, 20, 15, 8)
}


In [None]:
def get_processed_results(brand:str) -> DF:
    """Build processed results from the notebook-level ``raw_results`` frame.

    Args:
        brand: Key into ``SOH_FILTER_EVAL`` selecting the soh filter expression.

    Returns:
        The processed frame sorted by ``vin`` and ``date``, with ``soh`` and
        ``odometer`` forward-filled then back-filled within each ``vin``.

    Raises:
        KeyError: If ``brand`` is not a key of ``SOH_FILTER_EVAL``.
    """
    banner = 'Processing ' + brand + ' results.'
    logger.info(f"{banner:=^{50}}")

    # Raw estimations may contain inf values, which would make
    # mask_out_outliers_by_interquartile_range and force_monotonic_decrease fail,
    # so they are turned into NaNs before anything else.
    finite_soh = raw_results.assign(
        soh=lambda frame: frame["soh"].replace([np.inf, -np.inf], np.nan)
    )

    per_vehicle = (
        finite_soh
        .sort_values(["vin", "date"])
        .pipe(make_charge_levels_presentable)
        .eval(SOH_FILTER_EVAL[brand])
        .pipe(agg_results_by_update_frequency)
        .groupby('vin', observed=True)
        .apply(make_soh_presentable_per_vehicle, include_groups=False)
    )

    # The line-bounds filter is currently commented out:
    # .pipe(filter_results_by_lines_bounds, VALID_SOH_POINTS_LINE_BOUNDS, logger=logger)
    results = per_vehicle.reset_index(level=0).sort_values(["vin", "date"])

    # Fill gaps per vehicle: forward first, then backward for leading NaNs.
    for column in ("soh", "odometer"):
        results[column] = results.groupby("vin", observed=True)[column].ffill()
        results[column] = results.groupby("vin", observed=True)[column].bfill()
    return results

In [None]:
# One processed-results frame per soc_diff threshold, computed in the listed order.
(
    processed_results_30,
    processed_results_25,
    processed_results_20,
    processed_results_15,
    processed_results_8,
) = [
    get_processed_results(f"tesla-fleet-telemetry-{min_soc_diff}")
    for min_soc_diff in (30, 25, 20, 15, 8)
]

In [None]:
# Raw soh against odometer, one colour per vin, keeping only 0.7 < soh < 1.05.
px.scatter(
    raw_results
    .loc[lambda frame: (frame["soh"] > 0.7) & (frame["soh"] < 1.05)]
    .dropna(subset="soh"),
    x="odometer",
    y="soh",
    color="vin",
)

In [None]:
# Processed soh (threshold 30 variant) against odometer, one colour per vin,
# keeping only 0.75 < soh < 1.05.
px.scatter(
    processed_results_30
    .loc[lambda frame: (frame["soh"] > 0.75) & (frame["soh"] < 1.05)]
    .dropna(subset="soh"),
    x="odometer",
    y="soh",
    color="vin",
)