In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from core.s3_utils import *
import os
import boto3
import json
from transform.raw_tss.tesla_raw_tss import *

In [None]:
from pandas import DataFrame as DF

# Key template for a brand's raw time-series parquet file; `{brand}` is filled in with str.format.
S3_RAW_TSS_KEY_FORMAT = "raw_ts/{brand}/time_series/raw_tss.parquet"
# The template instantiated for the "tesla" brand.
TESLA_RAW_TSS_KEY = S3_RAW_TSS_KEY_FORMAT.format(brand="tesla")
# Empty frame holding only the "vin" and "readable_date" columns.
# NOTE(review): presumably a fallback for when no raw time series exists yet; not used in the visible cells.
DEFAULT_TESLA_RAW_TSS_DF = DF(columns=["vin", "readable_date"])

In [None]:
s3 = S3_Bucket()

In [None]:

def get_response_keys_to_parse(bucket:S3_Bucket) -> DF:
    """List the "tesla-fleet-telemetry" response keys and add a parsed ``date`` column.

    The ``date`` column is derived from the ``file`` column: its last five
    characters are dropped (the ``.json`` suffix of names such as
    ``2025-04-14.json``) and the remainder is cast to ``datetime64[ns]``.

    Args:
        bucket: object exposing ``list_responses_keys_of_brand``.

    Returns:
        The frame returned by the bucket, with the extra ``date`` column.
    """
    response_keys = bucket.list_responses_keys_of_brand("tesla-fleet-telemetry")
    file_stems = response_keys["file"].str[:-5]
    return response_keys.assign(date=file_stems.astype("datetime64[ns]"))

In [None]:
parse_key  = get_response_keys_to_parse(s3)

In [None]:
def flatten_metrics(metrics_list):
    """Flatten a list of ``{"key": ..., "value": {...}}`` items into a single dict.

    Every entry of an item's ``value`` mapping becomes one output entry named
    ``"<key>_<subkey>"``. On name collisions the later item wins.

    Args:
        metrics_list: list of dict items. Anything that is not a list yields ``{}``.

    Returns:
        dict mapping ``"<key>_<subkey>"`` to the corresponding sub-value.
    """
    if not isinstance(metrics_list, list):
        return {}
    flat = {}
    for item in metrics_list:
        # Malformed entries (non-dict items, missing/null/non-dict values) are
        # skipped instead of aborting the whole row.
        if not isinstance(item, dict):
            continue
        value_dict = item.get('value')
        if not isinstance(value_dict, dict):
            continue
        key = item.get('key')
        for subkey, subval in value_dict.items():
            flat[f"{key}_{subkey}"] = subval
    return flat

In [None]:
def explode_data(df):
    """Expand the nested ``metrics`` and ``data`` columns of ``df`` into flat columns.

    * ``metrics`` (one dict per row) is expanded with ``pd.json_normalize``.
    * ``data`` (one list of key/value items per row) is expanded with
      ``flatten_metrics``.

    The original columns are always kept; whichever of the two nested columns
    is present is expanded and merged on the index. A frame with neither
    column is returned unchanged.

    Args:
        df: input frame.

    Returns:
        ``df`` itself when there is nothing to expand, otherwise a new frame
        holding the original columns plus the expanded ones.
    """
    exploded = df
    if "metrics" in df.columns:
        metrics = pd.json_normalize(df["metrics"])
        # json_normalize yields one row per record; realign it on df's index so
        # the index merge below also works when df has a non-default index.
        metrics.index = df.index
        exploded = pd.merge(exploded, metrics, left_index=True, right_index=True)
    if "data" in df.columns:
        data_df = df["data"].apply(flatten_metrics).apply(pd.Series)
        exploded = pd.merge(exploded, data_df, left_index=True, right_index=True)
    return exploded

In [None]:
parse_key= pd.read_csv('pars_key.csv', index_col=0)
parse_key['date'] = pd.to_datetime(parse_key['date'])

In [None]:
parse_key

In [None]:
response = s3.read_json_file("response/tesla-fleet-telemetry/LRWYGCFS6PC992837/2025-04-14.json")

In [None]:
# Parse a single response file: build the one-row key table passed to
# get_raw_tss_from_keys (the scalar values are broadcast against the one-element 'key' list).
raw_tss = get_raw_tss_from_keys(pd.DataFrame({'key':["response/tesla-fleet-telemetry/LRWYGCFS6PC992837/2025-04-14.json"],
              'dtype_folder': 'response',
              # Must match the brand folder used in the key above.
              'brand': 'tesla-fleet-telemetry',
              'vin': 'LRWYGCFS6PC992837',
              'file': '2025-04-14.json',
              'is_valid_file': True,
              'date':pd.to_datetime('2025-04-14')}), s3)

In [None]:
from transform.processed_tss.config import *
from transform.fleet_info.main import fleet_info
from scipy.integrate import cumulative_trapezoid
from core.constants import *


In [None]:
def charge_n_discharging_masks_from_soc_diff(tss:DF) -> DF:
    """Derive ``in_charge`` / ``in_discharge`` masks from the sign of the soc variation.

    Works per ``vin``. The soc is forward filled, differenced and reduced to
    its sign (+1 / -1, NaN where the soc did not change). A row is flagged
    ``in_charge`` when both the previous and the next known sign are positive,
    and ``in_discharge`` when both are negative.

    The columns ``soc_ffilled``, ``soc_diff``, ``in_charge`` and
    ``in_discharge`` are written on ``tss`` in place; ``tss`` is returned.
    """
    def per_vin(col):
        return tss.groupby('vin', observed=True)[col]

    tss["soc_ffilled"] = per_vin("soc").ffill()
    soc_step = per_vin("soc_ffilled").diff()
    # x / |x| keeps only the sign; a zero step gives 0 / 0 = NaN on purpose.
    tss["soc_diff"] = soc_step / soc_step.abs()

    previous_sign = per_vin("soc_diff").ffill()
    next_sign = per_vin("soc_diff").bfill()
    rising = previous_sign.gt(0, fill_value=False) & next_sign.gt(0, fill_value=False)
    falling = previous_sign.lt(0, fill_value=False) & next_sign.lt(0, fill_value=False)
    tss["in_charge"] = rising
    tss["in_discharge"] = falling
    return tss

def charge_n_discharging_masks_from_charging_status(tss:DF, in_charge_vals:list, in_discharge_vals:list) -> DF:
    """Derive ``in_charge`` / ``in_discharge`` masks from the ``charging_status`` column.

    Args:
        tss: time series holding a ``charging_status`` column.
        in_charge_vals: statuses that count as charging.
        in_discharge_vals: statuses that count as discharging.

    Returns:
        A new frame with the boolean ``in_charge`` and ``in_discharge`` columns
        added; ``tss`` itself is left untouched.

    Raises:
        AssertionError: if ``tss`` has no ``charging_status`` column.
    """
    assert "charging_status" in tss.columns, NO_CHARGING_STATUS_COL_ERROR
    charging_status = tss["charging_status"]
    # Membership is tested with isin rather than by pasting the list repr into
    # an eval string, so it does not depend on the values having an
    # eval-parsable repr.
    return tss.assign(
        in_charge=charging_status.isin(in_charge_vals),
        in_discharge=charging_status.isin(in_discharge_vals),
    )
def compute_charge_n_discharge_masks(tss:DF, make, in_charge_vals:list, in_discharge_vals:list) -> DF:
    """Compute the ``in_charge`` / ``in_discharge`` masks with the strategy configured for ``make``.

    Makes listed in ``CHARGE_MASK_WITH_CHARGING_STATUS_MAKES`` use the
    ``charging_status`` column; makes listed in
    ``CHARGE_MASK_WITH_SOC_DIFFS_MAKES`` use the sign of the soc variation.
    The charging-status list is checked first.

    Args:
        tss: time series to annotate.
        make: vehicle make used to pick the strategy.
        in_charge_vals: statuses that count as charging (charging-status strategy only).
        in_discharge_vals: statuses that count as discharging (charging-status strategy only).

    Raises:
        ValueError: if ``make`` is in neither list.
    """
    if make in CHARGE_MASK_WITH_CHARGING_STATUS_MAKES:
        return charge_n_discharging_masks_from_charging_status(tss, in_charge_vals, in_discharge_vals)
    if make in CHARGE_MASK_WITH_SOC_DIFFS_MAKES:
        return charge_n_discharging_masks_from_soc_diff(tss)
    raise ValueError(MAKE_NOT_SUPPORTED_ERROR.format(make=make))

def trim_leading_n_trailing_soc_off_masks(tss:DF, masks:list[str]) -> DF:
    """Add ``trimmed_<mask>`` columns that drop the first/last soc plateau of each period.

    For every mask, periods are identified by (``vin``, ``<mask>_idx``). Inside
    a period, rows whose soc equals the first or the last masked soc value of
    that period are excluded from the trimmed mask.

    Args:
        tss: time series holding ``vin``, ``soc``, each mask and its ``<mask>_idx`` column.
        masks: names of the boolean mask columns to trim. May be empty.

    Returns:
        A copy of ``tss`` without the ``naned_soc`` scratch column. Note that the
        other columns (``trimmed_<mask>``, ``trailing_soc``, ``leading_soc``) are
        also written on the input frame.
    """
    for mask in masks:
        # soc restricted to the masked rows, so that "first"/"last" skip the rest.
        tss["naned_soc"] = tss["soc"].where(tss[mask])
        soc_grp = tss.groupby(["vin", mask + "_idx"], observed=True)["naned_soc"]
        first_soc = soc_grp.transform("first")
        last_soc = soc_grp.transform("last")
        # Column names kept as in the original output: "trailing_soc" holds the
        # first value of the period and "leading_soc" the last one.
        tss["trailing_soc"] = first_soc
        tss["leading_soc"] = last_soc
        tss[f"trimmed_{mask}"] = tss[mask] & (tss["soc"] != first_soc) & (tss["soc"] != last_soc)
    # errors="ignore": with an empty `masks` list the scratch column was never created.
    return tss.drop(columns="naned_soc", errors="ignore")
def compute_idx_from_masks( tss: DF, masks:list[str]) -> DF:
    """Number the consecutive periods of each mask, per ``vin``.

    For every mask a ``<mask>_idx`` column (``uint16``) is added. The index is
    incremented each time the mask value differs from the previous row of the
    same ``vin`` and, when ``MAX_TD`` is set, each time ``time_diff`` exceeds
    ``MAX_TD``.

    ``tss`` is modified in place and returned.
    """
    scratch_col = "new_period_start_mask"
    for mask in masks:
        previous_value = tss.groupby("vin", observed=True)[mask].shift(fill_value=False)
        tss[scratch_col] = previous_value.ne(tss[mask])
        if MAX_TD is not None:
            # A gap longer than MAX_TD also opens a new period.
            tss[scratch_col] |= (tss["time_diff"] > MAX_TD)
        period_number = tss.groupby("vin", observed=True)[scratch_col].cumsum()
        tss[f"{mask}_idx"] = period_number.astype("uint16")
        tss.drop(columns=[scratch_col], inplace=True)
    return tss

def compute_status_col( tss:DF) -> DF:
    """Add a ``status`` column built from ``in_charge`` and the odometer variation.

    ``in_charge`` is first mapped to "charging" / "discharging" / "unknown".
    Rows where ``in_charge`` equals False are then overwritten with "moving"
    when the odometer increased since the previous row of the same ``vin``,
    and with "idle_discharging" otherwise. Missing ``in_charge`` values are
    filled with True for that comparison, so those rows are not overwritten.

    ``tss`` is modified in place and returned.
    """
    odometer_increased = tss.groupby("vin", observed=True)["odometer"].diff() > 0
    not_charging_status = np.where(odometer_increased, "moving", "idle_discharging")
    base_status = tss["in_charge"].map({True: "charging", False:"discharging", pd.NA:"unknown"})
    is_not_charging = tss["in_charge"].eq(False, fill_value=True)
    tss["status"] = base_status.mask(is_not_charging, not_charging_status)
    return tss

def compute_cum_var(tss: DF, var_col:str, cum_var_col:str) -> DF:
    """Integrate ``var_col`` over time into ``cum_var_col`` (trapezoidal rule).

    Returns ``tss`` untouched when ``var_col`` is missing. Otherwise the
    integral is computed over the whole frame, scaled by ``KJ_TO_KWH`` and
    shifted so that each vehicle's series starts at zero.

    ``tss`` is modified in place and returned.
    """
    if var_col not in tss.columns:
        return tss
    # Timestamps must be expressed in seconds before the conversion to int.
    elapsed_seconds = tss["date"].dt.as_unit("s").astype(int)
    samples = tss[var_col].fillna(0).values
    # Keyword arguments on purpose: the positional order is (y, x), not (x, y).
    integral = cumulative_trapezoid(y=samples, x=elapsed_seconds, initial=0)
    tss[cum_var_col] = integral.astype("float32")
    tss[cum_var_col] *= KJ_TO_KWH # Convert from kj to kwh
    # Subtracting each vehicle's first value resets its series to zero; this
    # avoids a groupby.apply around cumulative_trapezoid.
    first_per_vin = tss.groupby('vin', observed=True)[cum_var_col].transform("first")
    tss[cum_var_col] -= first_per_vin
    return tss
    
def compute_date_vars(tss:DF) -> DF:
    """Add the per-``vin`` time step between consecutive rows.

    ``time_diff`` holds the difference of ``date`` with the previous row of the
    same ``vin`` and ``sec_time_diff`` the same quantity in seconds.

    ``tss`` is modified in place and returned.
    """
    step = tss.groupby('vin', observed=False)["date"].diff()
    tss["time_diff"] = step
    tss["sec_time_diff"] = tss["time_diff"].dt.total_seconds()
    return tss

In [None]:
def normalize_units_to_metric(tss):
    """Scale ``odometer`` by the "tesla" entry of ``ODOMETER_MILES_TO_KM`` (factor 1 if absent).

    ``tss`` is modified in place and returned.
    """
    conversion_factor = ODOMETER_MILES_TO_KM.get("tesla", 1)
    tss["odometer"] = tss["odometer"] * conversion_factor
    return tss

In [None]:
# Preprocess the raw time series: rename and select the configured columns,
# cast them, convert the odometer, lower-case the string columns and add the
# time-step columns.
raw = (
    raw_tss
    .rename(columns=RENAME_COLS_DICT, errors="ignore")
    .pipe(safe_locate, col_loc=list(COL_DTYPES.keys()))
    .pipe(safe_astype, COL_DTYPES)
    .pipe(normalize_units_to_metric)
    .pipe(str_lower_columns, COLS_TO_STR_LOWER)
    .pipe(compute_date_vars)
)

In [None]:
def compute_charge_n_discharge_vars(tss:DF) -> DF:
    """Run the full charge/discharge feature pipeline for Tesla time series.

    Steps, in order: charge/discharge masks, period indices, trimmed masks,
    period indices of the trimmed masks, cumulative energies, status column.
    """
    raw_masks = ["in_charge", "in_discharge"]
    trimmed_masks = ["trimmed_in_charge", "trimmed_in_discharge"]

    # Compute the in_charge and in_discharge masks.
    tss = compute_charge_n_discharge_masks(tss, 'tesla', IN_CHARGE_CHARGING_STATUS_VALS, IN_DISCHARGE_CHARGING_STATUS_VALS)
    # Compute the corresponding indices to perform split-apply-combine operations.
    tss = compute_idx_from_masks(tss, raw_masks)
    # Recompute the masks without the points holding the first and last soc values,
    # which reduces the output noise caused by measurement noise.
    tss = trim_leading_n_trailing_soc_off_masks(tss, raw_masks)
    tss = compute_idx_from_masks(tss, trimmed_masks)
    tss = compute_cum_var(tss, "power", "cum_energy")
    tss = compute_cum_var(tss, "charger_power", "cum_charge_energy_added")
    return compute_status_col(tss)

In [None]:
# Add the charge/discharge masks, period indices, cumulative energies and status column.
raw = raw.pipe(compute_charge_n_discharge_vars)

# Attach the fleet information per vin; the left join keeps every time-series row.
raw = raw.merge(fleet_info, on="vin", how="left")
# age = date - start_date, both made timezone-naive first.
# NOTE(review): start_date presumably comes from fleet_info — confirm.
raw = raw.eval("age = date.dt.tz_localize(None) - start_date.dt.tz_localize(None)")
# reset_index does not seem to restore the id column to a categorical dtype when the groupby key was categorical,
# so astype is called again on the id column in case it is supposed to be categorical.
processed_tss = raw.astype({'vin': COL_DTYPES['vin']})

In [None]:
raw_tss['ACChargingPower_stringValue']

In [None]:
# One row per (vin, trimmed charging period): aggregate the period, then derive
# the energy added, the soh estimate and the share of soc gained per charging level.
raw_result = (raw.groupby(["vin", "trimmed_in_charge_idx"], observed=True, as_index=False)
        .agg(
            energy_added_min=pd.NamedAgg("charge_energy_added", "min"),
            energy_added_end=pd.NamedAgg("charge_energy_added", "last"),
            soc_diff=pd.NamedAgg("soc", series_start_end_diff),
            inside_temp=pd.NamedAgg("inside_temp", "mean"),
            net_capacity=pd.NamedAgg("net_capacity", "first"),
            range=pd.NamedAgg("range", "first"),
            odometer=pd.NamedAgg("odometer", "first"),
            version=pd.NamedAgg("version", "first"),
            size=pd.NamedAgg("soc", "size"),
            model=pd.NamedAgg("model", "first"),
            date=pd.NamedAgg("date", "first"),
            charging_power=pd.NamedAgg("charging_power", "median"),
            tesla_code=pd.NamedAgg("tesla_code", "first"),
        )
        # Energy added over the period: last reading minus the minimum reading.
        .eval("energy_added = energy_added_end - energy_added_min")
        # soh = energy added / energy expected for that soc gain (soc_diff is in percent).
        .eval("soh = energy_added / (soc_diff / 100.0 * net_capacity)")
        #.query("soc_diff > 40 & soh.between(0.75, 1.05)")
        # The soc gain (as a fraction) is attributed to exactly one level based on the median charging power:
        # below 8 -> level_1, 8 to 45 inclusive -> level_2, above 45 -> level_3.
        # NOTE(review): the power unit is presumably kW — confirm.
        .eval("level_1 = soc_diff * (charging_power < 8) / 100")
        .eval("level_2 = soc_diff * (charging_power.between(8, 45)) / 100")
        .eval("level_3 = soc_diff * (charging_power > 45) / 100")
        # .eval("bottom_soh = soh.between(0.75, 0.9)")
        # .eval("fixed_soh_min_end = soh.mask(tesla_code == 'MTY13', soh / 0.96)")
        # .eval("fixed_soh_min_end = fixed_soh_min_end.mask(bottom_soh & tesla_code == 'MTY13', fixed_soh_min_end + 0.08)")
        # .eval("soh = fixed_soh_min_end")
        .sort_values(["tesla_code", "vin", "date"])
    )

In [None]:
# Per-brand DataFrame.eval expression applied to the raw results to blank out
# (set to NaN) the soh values that should not be trusted. "soh = soh" is a no-op.
# NOTE(review): the tesla expression relies on the pandas eval parser giving `&`
# the precedence of `and`, i.e. (soc_diff > 40) & soh.between(...) — confirm.
SOH_FILTER_EVAL_STRINGS: dict[str, str] = {
    "tesla": "soh = soh.where(soc_diff > 40 & soh.between(0.75, 1.05))",
    "volvo": "soh = soh.where(soc > 0.7)",
    "renault": "soh = soh.where(soc > 0.5)",
    "ford": "soh = soh",
    "mercedes-benz": "soh = soh",
    "bmw": "soh = soh",
    "kia": "soh = soh",
    "stellantis": "soh = soh",
}

def make_charge_levels_presentable(results:DF) -> DF:
    """Clamp negative charge levels to 0.

    Only the ``level_1`` / ``level_2`` / ``level_3`` columns actually present
    in ``results`` are processed; a frame with none of them is returned as is.
    A warning is logged when negative values are found.

    Args:
        results: results frame, modified in place.

    Returns:
        The same ``results`` object.
    """
    level_columns = ["level_1", "level_2", "level_3"]
    existing_level_columns = [col for col in level_columns if col in results.columns]
    # If none of the level columns exist, return the results as is.
    if not existing_level_columns:
        return results
    # Only index the columns that exist: a frame holding a subset of the level
    # columns must not raise a KeyError.
    negative_charge_levels = results[existing_level_columns].lt(0)
    nb_negative_levels = negative_charge_levels.sum().sum()
    if nb_negative_levels > 0:
        logger.warning(f"There are {nb_negative_levels}({100*nb_negative_levels/len(results):.2f}%) negative charge levels, setting them to 0.")
    results[existing_level_columns] = results[existing_level_columns].mask(negative_charge_levels, 0)
    return results
# Width of the time buckets the results are aggregated into.
UPDATE_FREQUENCY = pd.Timedelta(days=7)
def agg_results_by_update_frequency(results:DF) -> DF:
    """Aggregate the results per ``vin`` and per ``UPDATE_FREQUENCY`` time bucket.

    The ``date`` column of ``results`` is overwritten in place with the start
    of its bucket (timezone-naive, midnight). Each (vin, bucket) group then
    keeps the last odometer, the median soh, the first model and version, and
    the sum of each charge level. Missing level columns count as 0.

    Returns:
        A new frame with one row per (vin, date) bucket.
    """
    bucket_start = pd.to_datetime(results["date"], format='mixed').dt.floor(UPDATE_FREQUENCY)
    results["date"] = bucket_start.dt.tz_localize(None).dt.date.astype('datetime64[ns]')

    # Setting level columns to 0 if they don't exist.
    level_values = {level: results.get(level, 0) for level in ("level_1", "level_2", "level_3")}
    per_vin_bucket = (
        results
        .assign(**level_values)
        .groupby(["vin", "date"], observed=True, as_index=False)
    )
    return per_vin_bucket.agg(
        odometer=("odometer", "last"),
        soh=("soh", "median"),
        model=("model", "first"),
        version=("version", "first"),
        level_1=("level_1", "sum"),
        level_2=("level_2", "sum"),
        level_3=("level_3", "sum"),
    )
from core.stats_utils import *
def make_soh_presentable_per_vehicle(df:DF) -> DF:
    """Clean the soh series of a single vehicle.

    * All-NaN soh: ``df`` is returned unchanged.
    * More than 3 soh points: only the rows selected by
      ``mask_out_outliers_by_interquartile_range`` are kept (on a copy).
    * At least 2 remaining soh points: soh is replaced by the output of
      ``force_decay`` applied to the ``soh`` / ``odometer`` columns.
    """
    soh = df["soh"]
    if soh.isna().all():
        return df
    if soh.count() > 3:
        outliser_mask = mask_out_outliers_by_interquartile_range(soh)
        assert outliser_mask.any(), f"There seems to be only outliers???:\n{df['soh']}."
        df = df[outliser_mask].copy()
    enough_points_to_fit = df["soh"].count() >= 2
    if enough_points_to_fit:
        df["soh"] = force_decay(df[["soh", "odometer"]])
    return df

In [None]:

# Two straight lines in the (odometer, soh) plane, each defined by its end points A and B:
#   "max" line: A = (20_000, 1.0), B = (200_000, 0.95)
#   "min" line: A = (0, 0.9),      B = (200_000, 0.5)
# Indexed by (bound, point).
# NOTE(review): presumably filter_results_by_lines_bounds keeps only the points lying between the two lines — confirm.
VALID_SOH_POINTS_LINE_BOUNDS = DF({
  "odometer": [20_000, 200_000, 0, 200_000],
  "soh": [1.0, 0.95, 0.9, 0.5],
  "point": ["A", "B", "A", "B"],
  "bound": ["max", "max", "min", "min"]
}).set_index(["bound", "point"])

In [None]:
# Same post-processing chain as get_processed_results, applied to raw_result with the "tesla" soh filter.
# Infinite soh values are replaced by NaN first.
processed_result = (raw_result.assign(soh=lambda df: df["soh"].replace([np.inf, -np.inf], np.nan))
        .sort_values(["vin", "date"])
        .pipe(make_charge_levels_presentable)
        .eval(SOH_FILTER_EVAL_STRINGS['tesla'])
        .pipe(agg_results_by_update_frequency)
        # Per-vehicle soh clean-up; reset_index(level=0) brings "vin" back as a column.
        .groupby('vin', observed=True)
        .apply(make_soh_presentable_per_vehicle, include_groups=False)
        .reset_index(level=0)
        .pipe(filter_results_by_lines_bounds, VALID_SOH_POINTS_LINE_BOUNDS, logger=logger)
        .sort_values(["vin", "date"])
    )

In [None]:
processed_result

In [None]:
def get_processed_results(brand:str) -> DF:
    """Build the presentable results of ``brand`` from its raw results.

    The raw results come from ``GET_RESULTS_FUNCS[brand]``. They are cleaned,
    filtered with the brand's ``SOH_FILTER_EVAL_STRINGS`` expression,
    aggregated per update period, cleaned per vehicle and filtered by
    ``VALID_SOH_POINTS_LINE_BOUNDS``. Remaining gaps in ``soh`` and
    ``odometer`` are then filled per vehicle, forward first and backward next.
    """
    logger.info(f"{'Processing ' + brand + ' results.':=^{50}}")
    raw_results = GET_RESULTS_FUNCS[brand]()
    # Raw estimations may hold inf values, which would make the per-vehicle
    # clean-up (outlier masking and forced decay) fail, so they are replaced by NaNs.
    finite_results = raw_results.assign(soh=lambda df: df["soh"].replace([np.inf, -np.inf], np.nan))
    aggregated = (
        finite_results
        .sort_values(["vin", "date"])
        .pipe(make_charge_levels_presentable)
        .eval(SOH_FILTER_EVAL_STRINGS[brand])
        .pipe(agg_results_by_update_frequency)
    )
    per_vehicle = (
        aggregated
        .groupby('vin', observed=True)
        .apply(make_soh_presentable_per_vehicle, include_groups=False)
        .reset_index(level=0)
    )
    results = (
        per_vehicle
        .pipe(filter_results_by_lines_bounds, VALID_SOH_POINTS_LINE_BOUNDS, logger=logger)
        .sort_values(["vin", "date"])
    )
    for col in ("soh", "odometer"):
        results[col] = results.groupby("vin", observed=True)[col].ffill()
        results[col] = results.groupby("vin", observed=True)[col].bfill()
    return results

In [None]:
from transform.raw_tss.fleet_telemetry_raw_tss import *

In [None]:
res = get_raw_tss()

In [None]:
res_test = res.rename(columns=RENAME_COLS_DICT, errors="ignore")

In [None]:
tss = res_test.pipe(safe_locate, col_loc=list(COL_DTYPES.keys()))


In [None]:
tss = tss.pipe(safe_astype, COL_DTYPES)


In [None]:
def normalize_units_to_metric(tss):
        """Scale ``odometer`` by the "tesla" entry of ``ODOMETER_MILES_TO_KM`` (factor 1 if absent).

        Same body as the earlier definition of this name in the notebook, which
        this one shadows. ``tss`` is modified in place and returned.
        """
        tss["odometer"] = tss["odometer"] * ODOMETER_MILES_TO_KM.get("tesla", 1)
        return tss

In [None]:
tss = tss.pipe(normalize_units_to_metric)

In [None]:
tss = tss.pipe(str_lower_columns, COLS_TO_STR_LOWER)


In [None]:
from transform.processed_tss.config import *

In [None]:
def compute_charge_n_discharge_vars(tss:DF) -> DF:
    """Run the full charge/discharge feature pipeline for Tesla time series.

    This is a module-level function used as ``tss.pipe(compute_charge_n_discharge_vars)``,
    so it takes the frame as its only argument (no ``self``).

    Args:
        tss: preprocessed time series.

    Returns:
        The time series with the charge/discharge masks, their period indices,
        the trimmed masks, the cumulative energies and the status column.
    """
    return (
        tss
        # Compute the in_charge and in_discharge masks.
        .pipe(compute_charge_n_discharge_masks, 'tesla', IN_CHARGE_CHARGING_STATUS_VALS, IN_DISCHARGE_CHARGING_STATUS_VALS)
        # Compute the corresponding indices to perform split-apply-combine operations.
        .pipe(compute_idx_from_masks, ["in_charge", "in_discharge"])
        # Recompute the masks by trimming off the points that have the first and last soc values.
        # This reduces the noise in the output caused by measurement noise.
        .pipe(trim_leading_n_trailing_soc_off_masks, ["in_charge", "in_discharge"])
        .pipe(compute_idx_from_masks, ["trimmed_in_charge", "trimmed_in_discharge"])
        .pipe(compute_cum_var, "power", "cum_energy")
        .pipe(compute_cum_var, "charger_power", "cum_charge_energy_added")
        .pipe(compute_status_col)
    )

In [None]:
tss.pipe(compute_charge_n_discharge_vars)

In [None]:
charge_n_discharging_masks_from_charging_status(tss, IN_CHARGE_CHARGING_STATUS_VALS, IN_DISCHARGE_CHARGING_STATUS_VALS)

In [None]:
from transform.processed_tss.ProcessedTimeSeries import ProcessedTimeSeries
from transform.processed_tss.config import *

In [None]:
r = ProcessedTimeSeries('fleet-telemetry')

In [None]:
r.columns


In [None]:
res[["vin", "date", "vehicle"]]

In [None]:
res.columns