# Improve time series processing
Most of the current code for processing time series is legacy code from watea POC.  
It works but is not optimized and is not scalable.  
This is because it was conceived to be used on a single time series at a time.  
We need to change this to be able to process multiple time series in a single DataFrame.  
This will hopefully improve the performance and scalability of the code.  

## Setup

### Imports

In [None]:
import random
from pandas import Timedelta as TD
from rich.progress import Progress
import plotly.express as px

from core.logging_utils import set_level_of_loggers_with_prefix
from core.pandas_utils import *
from core.time_series_processing import *
from transform.raw_tss.main import get_raw_tss
from transform.processed_tss.config import *
from transform.fleet_info.main import fleet_info

# Module-level logger used by the legacy functions in this notebook
# (NewTimeSeriesProcessing builds its own logger from its `name`).
logger = getLogger("transform.processed_tss.tesla")
# NOTE(review): the prefix is tesla-specific while TEST_BRAND may be another brand;
# presumably loggers of other brands are not switched to DEBUG by this call — confirm.
set_level_of_loggers_with_prefix("DEBUG", "transform.processed_tss.tesla")

### Data extraction

In [None]:
# Brand whose raw time series are loaded for this experiment.
# "tesla" uses a fixed list of VINs, any other brand uses a random sample of VINs.
TEST_BRAND = "renault"

In [None]:
# Load every raw time series of the brand, then keep a handful of VINs to iterate quickly.
raw_tss = get_raw_tss(TEST_BRAND)

if TEST_BRAND == "tesla":
    # Fixed VINs so that tesla runs are comparable from one execution to the next.
    VINS = [
        "LRW3E7EK5PC797921",
        "5YJ3E7EA9LF751886",
        "XP7YGCEL2RB413022",
        "LRW3E7FR7NC480876",
        "XP7YGCES9RB442881",
    ]
else:
    all_vins = raw_tss["vin"].unique()
    # random.sample raises ValueError when asked for more items than the population holds,
    # so cap the sample size for brands that have fewer than 5 VINs.
    VINS = random.sample(list(all_vins), min(5, len(all_vins)))
raw_tss_test = raw_tss.query("vin in @VINS")
display(raw_tss_test)

### Legacy code

I copy-pasted the legacy code here for future reference, once the new implementation has replaced it.  
I also added comments to the code to explain what it does (wrong).  

In [None]:
def legacy_tesla_processed_tss(raw_tss:DF) -> DF:
    """
    ### Description:
    Legacy entry point: turns the raw tesla time series into processed time series.
    Each vin is processed separately through a groupby/apply on `legacy_tesla_process_ts`,
    with a transient progress bar advanced once per vin.
    ### Parameters:
    raw_tss: raw time series of all the vins, one row per measurement.
    ### Returns:
    The processed time series, merged with the fleet info columns and with the discharge diffs added.
    """
    with Progress(transient=True) as progress:
        task = progress.add_task("Processing VINs...", visible=False, total=raw_tss["vin"].nunique())

        # Column selection and typing.
        typed_tss = raw_tss.rename(columns=RENAME_COLS_DICT, errors="ignore")
        typed_tss = safe_locate(typed_tss, col_loc=list(COL_DTYPES.keys()), logger=logger)
        typed_tss = safe_astype(typed_tss, COL_DTYPES, logger=logger)

        # We should probably not drop duplicates as there might be multiple measurements for the same date.
        # We should probably check the responses parsing and switch from concatenating to joining/merging on date instead.
        ordered_tss = (
            typed_tss
            .drop_duplicates(subset=["vin", "date"])
            .sort_values(by=["vin", "date"])
        )

        # Charging/discharging masks first, then the per-vin processing.
        masked_tss = legacy_charge_n_discharging(ordered_tss, "vin", CHARGING_STATUS_VAL_TO_MASK, logger)
        processed_tss = (
            masked_tss
            .groupby("vin")
            .apply(legacy_tesla_process_ts, progress, task, include_groups=False)
            .reset_index(drop=False)
        )

        # Normalisation of the string columns, fleet info columns and discharge diffs.
        processed_tss = set_all_str_cols_to_lower(processed_tss, but=["vin"])
        processed_tss = left_merge(
            processed_tss,
            fleet_info.dropna(subset=["vin"]),
            "vin",
            "vin",
            COLS_TO_CPY_FROM_FLEET_INFO,
            logger,
        )
        return compute_discharge_diffs(processed_tss, DISCHARGE_VARS_TO_MEASURE, logger)

def legacy_tesla_process_ts(raw_ts: DF, progress: Progress, task) -> DF:
    """
    ### Description:
    Legacy processing of the time series of a single vin, meant to be called through groupby/apply.
    Advances the progress bar, then adds the filled/diff columns and the cumulative energy columns.
    ### Parameters:
    raw_ts: time series of one vin, the vin is read from `raw_ts.name`.
    progress: progress bar to advance by one step.
    task: id of the progress task to update.
    ### Returns:
    The time series of the vin with the extra columns, passed through `fillna_vars`.
    """
    vin = raw_ts.name
    progress.update(task, visible=True, advance=1, description=f"Processing vin {vin}...")
    if progress.finished:
        progress.update(task, visible=False)

    # We don't use any of these variables later in the pipeline so we can drop them.
    derived_cols = {
        "ffiled_outside_temp": raw_ts["outside_temp"].ffill(),
        "ffiled_inside_temp": raw_ts["inside_temp"].ffill(),
        "floored_soc": floor_to(raw_ts["soc"].ffill(), 1),
        "date_diff": raw_ts["date"].diff(),
        "soc_diff": raw_ts["soc"].diff(),
    }
    ts = raw_ts.assign(**derived_cols)

    ts = compute_cum_var(ts, power_col="power", cum_energy_col="cum_energy")
    # The only column we actually use from this function is cum_charge_energy_added from charger_power.
    # Instead of doing a groupby/apply we can perform a single compute_cum_energy call
    # and then compute some sort of energy_added_offset that resets the results to zero
    # at the start of each vin time series.
    ts = compute_cum_var(ts, power_col="charger_power", cum_energy_col="cum_charge_energy_added")

    ts = ts.assign(
        energy_added=ts["cum_charge_energy_added"].diff(),
        energy_diff=ts["cum_energy"].diff(),
    )
    return fillna_vars(ts, COLS_TO_FILL, MAX_TIME_DIFF_TO_FILL)

def legacy_charge_n_discharging(tss:DF, id_col:str=None, charging_status_val_to_mask:dict=None, logger:Logger=logger) -> DF:
    """
    ### Description:
    Legacy computation of the charging and discharging masks.
    The `charging_status` column is used when it exists and a mapping is provided,
    otherwise the computation falls back on the soc difference.
    When neither column is available, the input is returned unchanged with a warning.
    ### Parameters:
    tss: time series to process, the `in_charge`/`in_discharge` columns are written on it in the charging status case.
    id_col: optional column identifying each time series when the dataframe holds several of them.
    charging_status_val_to_mask: dict mapping charging status values to booleans to create the masks.
    logger: logger receiving the info/debug/warning messages.
    """
    logger.info("compute_charging_n_discharging_masks called.")
    columns = tss.columns

    # Case 1: an explicit charging status is available.
    if "charging_status" in columns and charging_status_val_to_mask is not None:
        logger.debug("Computing charging and discharging masks using charging status dictionary.")
        status_mask = tss["charging_status"].map(charging_status_val_to_mask)
        tss["in_charge"] = status_mask
        tss["in_discharge"] = status_mask == False
        if id_col is None or id_col not in tss.columns:
            return compute_charge_n_discharge_perf_mask_and_idx_from_masks(tss)
        per_id = tss.groupby(id_col).apply(compute_charge_n_discharge_perf_mask_and_idx_from_masks)
        return per_id.reset_index(drop=True)

    # Case 2: only the soc is available.
    if "soc" in columns:
        logger.debug("Computing charging and discharging masks using soc difference.")
        if id_col in tss.columns:
            per_id = tss.groupby(id_col).apply(low_freq_compute_charge_n_discharge_vars)
            return per_id.reset_index(drop=True)
        return low_freq_compute_charge_n_discharge_vars(tss)

    # Case 3: nothing to compute the masks from.
    logger.warning("No charging status or soc column found to compute charging and discharging masks, returning original tss.")
    return tss

## New implementation

In [None]:
# Passed as `max_time_diff` when computing the period indexes: a gap between two consecutive
# points of the same id larger than this starts a new period.
MAX_TD = TD(hours=1, minutes=30)

class NewTimeSeriesProcessing:
    """Process several time series stored in a single DataFrame.

    Every step works on the whole frame and relies on groupby operations over
    `id_col` instead of a groupby/apply per time series.
    Most steps add their columns on the DataFrame they receive and return it.
    """

    def __init__(self, name:str, id_col:str="vin"):
        """
        name: suffix of the logger name (`transform.processed_tss.{name}`).
        id_col: column identifying the time series each row belongs to.
        """
        self.name = name
        self.logger = getLogger(f"transform.processed_tss.{name}")
        self.id_col = id_col

    def new_process_raw_tss(self, raw_tss:DF) -> DF:
        """Run the whole pipeline on the raw time series and return the processed ones.

        Steps: column renaming/selection/typing, sort by id and date, date diffs,
        charge/discharge masks, period indexes, cumulative energies, merge with fleet info.
        """
        return (
            raw_tss
            .rename(columns=RENAME_COLS_DICT, errors="ignore")
            # Log through the instance logger rather than the module-level one.
            .pipe(safe_locate, col_loc=list(COL_DTYPES.keys()), logger=self.logger)
            .pipe(safe_astype, COL_DTYPES, logger=self.logger)
            # Every grouped diff/shift below assumes rows sorted by date within each id.
            .sort_values(by=[self.id_col, "date"])
            .pipe(set_all_str_cols_to_lower, but=[self.id_col])
            .pipe(self.compute_date_vars)
            .pipe(self.compute_charge_n_discharge_masks, IN_CHARGE_CHARGING_STATUS_VALS, IN_DISCHARGE_CHARGING_STATUS_VALS)
            .pipe(self.perf_masks_and_idx_from_condition_mask, ["in_charge", "in_discharge"], MAX_TD)
            .pipe(self.compute_cum_var, "power", "cum_energy")
            .pipe(self.compute_cum_var, "charger_power", "cum_charge_energy_added")
            # The fleet info is merged on its vin column.
            .merge(fleet_info, on="vin", how="left")
        )

    def compute_cum_var(self, tss: DF, var_col:str, cum_var_col:str) -> DF:
        """Integrate `var_col` over time into `cum_var_col`, restarting from zero for each id.

        The integral is computed once over the whole frame, then the value reached at the
        first row of each id is subtracted. Nothing is added if `var_col` is missing.
        """
        if var_col not in tss.columns:
            self.logger.debug(f"{var_col} not found, not computing {cum_var_col}.")
            return tss

        self.logger.debug(f"Computing {cum_var_col} from {var_col}.")
        tss[cum_var_col] = cumulative_trapezoid(
            # Leave the keywords as default order is y x not x y (-_-)
            # Make sure that date time units are in seconds before converting to int
            x=tss["date"].dt.as_unit("s").astype(int),
            y=tss[var_col].fillna(0).values,
            initial=0,
        )
        tss[cum_var_col] *= KJ_TO_KWH # Convert from kj to kwh
        # Reset value to zero at the start of each id time series
        tss[cum_var_col] -= tss.groupby(self.id_col)[cum_var_col].transform("first")
        # Cast to float32 only after removing the offset: the frame-wide running total
        # is much larger than the per-id values and would lose precision in float32.
        tss[cum_var_col] = tss[cum_var_col].astype("float32")
        return tss

    def compute_date_vars(self, tss:DF) -> DF:
        """Add `time_diff` (per-id diff of `date`) and `sec_time_diff` (same, in seconds)."""
        self.logger.debug("Computing time_diff and sec_time_diff.")
        tss["time_diff"] = tss.groupby(self.id_col)["date"].diff()
        tss["sec_time_diff"] = tss["time_diff"].dt.total_seconds()
        return tss

    def compute_charge_n_discharge_masks(self, tss:DF, in_charge_vals:list, in_discharge_vals:list) -> DF:
        """Add the `in_charge` and `in_discharge` masks.

        Only the soc difference implementation is used for now, `in_charge_vals` and
        `in_discharge_vals` are kept for the charging status implementation.
        """
        self.logger.debug("Computing charging and discharging masks.")
        # TODO: dispatch on the available columns once both implementations are validated:
        #if "charging_status" in tss.columns:
        #    return self.new_charge_n_discharging_from_charging_status(tss, in_charge_vals, in_discharge_vals)
        #elif "soc" in tss.columns:
        return self.new_charge_n_discharging_from_soc_diff(tss)
        #else:
        #    raise ValueError("No charging status or soc column found to compute charging and discharging masks.")

    def new_charge_n_discharging_from_soc_diff(self, tss:DF) -> DF:
        """Add the `in_charge` and `in_discharge` masks from the direction of the soc.

        Expects rows sorted by date within each id. The intermediate columns
        (`ffilled_soc`, `soc_dir`, `prev_dir`, `next_dir`, `value_is_spike`,
        `smoothed_soc_dir`) are left on the frame.
        """
        tss_grp = tss.groupby(self.id_col)
        tss["ffilled_soc"] = tss_grp["soc"].ffill()
        tss["soc_dir"] = tss_grp["ffilled_soc"].diff()
        tss["soc_dir"] = self.norm_soc_dir(tss["soc_dir"])

        # mitigate soc spikes effect on mask:
        # a direction surrounded by two identical opposite directions is dropped.
        tss["prev_dir"] = tss_grp["soc_dir"].ffill()
        tss["prev_dir"] = tss_grp["prev_dir"].shift()
        tss["next_dir"] = tss_grp["soc_dir"].bfill()
        tss["next_dir"] = tss_grp["next_dir"].shift(-1)
        tss["value_is_spike"] = (
            tss["next_dir"].eq(tss["prev_dir"])
            & tss["soc_dir"].ne(tss["next_dir"])
            & tss["soc_dir"].notna()
        )
        tss["soc_dir"] = tss["soc_dir"].mask(tss["value_is_spike"], np.nan)

        def rolling_mean_soc_dir(soc_dir:Series) -> Series:
            # soc_dir is indexed by date here, so the window is a time window.
            return soc_dir.rolling(window=TD(minutes=20), center=True).mean()

        # Transform the soc_dir Series of each id on a date-indexed copy: the result keeps
        # the row order of tss, so it can be written back by position.
        smoothed_soc_dir = (
            tss
            .set_index("date")
            .groupby(self.id_col)["soc_dir"]
            .transform(rolling_mean_soc_dir)
        )
        tss["smoothed_soc_dir"] = self.norm_soc_dir(smoothed_soc_dir).to_numpy()
        # Not applied yet: drop the directions that contradict the smoothed direction.
        #tss["soc_dir"] = (
        #    tss["soc_dir"]
        #    .mask(tss["smoothed_soc_dir"].gt(0) & tss["soc_dir"].lt(0), np.nan)
        #    .mask(tss["smoothed_soc_dir"].lt(0) & tss["soc_dir"].gt(0), np.nan)
        #)

        # Fill the gaps enclosed by two identical directions.
        # Filled per id so that a direction never leaks from one time series to the next.
        bfilled_dir = tss_grp["soc_dir"].bfill()
        ffilled_dir = tss_grp["soc_dir"].ffill()
        tss["soc_dir"] = tss["soc_dir"].mask(bfilled_dir == ffilled_dir, ffilled_dir)
        tss["in_discharge"] = tss["soc_dir"].eq(-1)
        tss["in_charge"] = tss["soc_dir"].eq(1)
        #tss = tss.drop(columns=["smoothed_soc_dir", "ffilled_soc", "value_is_spike"])
        return tss

    def norm_soc_dir(self, soc_dir:Series) -> Series:
        """Normalize the soc direction to -1 for negative, NaN for zero, 1 for positive."""
        return soc_dir / soc_dir.abs()

    def new_charge_n_discharging_from_charging_status(self, tss:DF, in_charge_vals:list, in_discharge_vals:list) -> DF:
        """Return tss with `in_charge`/`in_discharge` set from the membership of `charging_status` in the given values."""
        self.logger.debug("Computing charging and discharging vars using charging status dictionary.")
        return (
            tss
            .eval(f"in_charge = charging_status in {in_charge_vals}")
            .eval(f"in_discharge = charging_status in {in_discharge_vals}")
        )

    def perf_masks_and_idx_from_condition_mask(self, tss:DF, masks:list[str], max_time_diff:TD=None) -> DF:
        """For each mask, add `{mask}_idx`, `{mask}_perf` and `{mask}_perf_idx`.

        `{mask}_perf` is the mask without the rows whose soc equals the first or the
        last soc of their `{mask}_idx` period.
        """
        self.logger.info(f"Trimming off trailing soc of {masks} masks.")
        for mask in masks:
            tss = self.compute_idx_from_mask(tss, mask, max_time_diff)
            # Same grouping for both boundaries, built once.
            soc_by_period = tss.groupby([self.id_col, f"{mask}_idx"])["soc"]
            trailing_soc = soc_by_period.transform("last")
            leading_soc = soc_by_period.transform("first")
            tss[f"{mask}_perf"] = tss[mask] & (tss["soc"] != trailing_soc) & (tss["soc"] != leading_soc)
            tss = self.compute_idx_from_mask(tss, f"{mask}_perf", max_time_diff)
        return tss

    def compute_idx_from_mask(self, tss: DF, mask:str, max_time_diff:TD=None) -> DF:
        """Add `{mask}_idx`, a per-id counter incremented each time a new period starts.

        A new period starts when the mask value differs from the previous row of the same id
        (the first row is compared to False) or, if `max_time_diff` is given, when
        `time_diff` exceeds it.
        """
        self.logger.debug(f"Computing {mask}_idx from {mask} mask.")
        idx_col_name = f"{mask}_idx"
        shifted_mask = tss.groupby(self.id_col)[mask].shift(fill_value=False)
        new_period_start = shifted_mask.ne(tss[mask])
        if max_time_diff is not None:
            new_period_start |= (tss["time_diff"] > max_time_diff)
        tss[idx_col_name] = new_period_start.groupby(tss[self.id_col]).cumsum().astype("uint16")
        return tss

In [None]:
# Run the new implementation on the VINs selected in the data extraction cell.
tss_test = NewTimeSeriesProcessing(TEST_BRAND).new_process_raw_tss(raw_tss_test)

In [None]:
#tss = (
#    tss
#    .eval("charging_status = charging_status.str.lower()")
#    .eval(f"in_charge_charging_status = charging_status in {IN_CHARGE_CHARGING_STATUS_VALS}")
#    .eval(f"in_discharge_charging_status = charging_status in {IN_DISCHARGE_CHARGING_STATUS_VALS}")
#    .eval("in_charge_acc = in_charge == in_charge_charging_status")
#    .eval("in_discharge_acc = in_discharge == in_discharge_charging_status")
#)

In [None]:
# Plot the soc of each vin, coloured by the in_charge_perf mask.
# The missing smoothed directions are replaced by 0 in the column that is actually used
# as symbol (the fill was previously written to an unused, misspelt column).
# Optional extra step: .eval("charging_status = charging_status.fillna('Unknown')")
px.scatter(
    tss_test.assign(smoothed_soc_dir=lambda df: df["smoothed_soc_dir"].fillna(0)),
    x="date",
    y="soc",
    color="in_charge_perf",
    color_continuous_scale="Rainbow",
    symbol="smoothed_soc_dir",
    hover_data=["soc_dir"],
    facet_row="vin",
).update_layout(height=1000, showlegend=True)

## Conclusion
We have a new implementation that is more scalable and performant: it took ~2 minutes to process a 5-million-line DataFrame with 13k unique VINs.  
Somehow, I wasn't able to reimplement the soc NaN check that prevents the soc diff from being computed on ffilled soc values that come from a data point too far away in time.  
Also I couldn't reuse the rolling window logic from the legacy code to remove soc spikes.  
While this is not a problem for the current data, it will be a problem for future data that has more noise in it, if we deal with Watea-like data (like Ituran (-_-)).  