# Improve time series processing
Most of the current code for processing time series is legacy code from the watea POC.  
It works but is neither optimized nor scalable.  
This is because it was conceived to process a single time series at a time.  
We need to change this to be able to process multiple time series in a single DataFrame.  
This will hopefully improve the performance and scalability of the code.  

## Setup

### Imports

In [None]:
from pandas import Timedelta as TD
from rich.progress import Progress
import plotly.express as px

from core.logging_utils import set_level_of_loggers_with_prefix
from core.pandas_utils import *
from core.time_series_processing import *
from transform.raw_tss.main import get_raw_tss
from transform.processed_tss.config import *
from transform.fleet_info.main import fleet_info

# Logger shared by the cells below (used as the default `logger` argument of the functions defined in this notebook).
# The helper is called with "DEBUG" and the same logger name prefix -- presumably it switches those loggers to DEBUG level.
logger = getLogger("transform.processed_tss.tesla")
set_level_of_loggers_with_prefix("DEBUG", "transform.processed_tss.tesla")

### Data extraction

In [None]:
# Raw Tesla time series, used as the input of both the legacy and the new pipelines below.
raw_tss = get_raw_tss("tesla")
# Bare expression: shown as the output of the notebook cell.
raw_tss

### Legacy code

I copy-pasted the legacy code here for future reference, for when the new implementation has replaced it.  
I also added comments to the code to explain what it does (wrong).  

In [None]:
def legacy_tesla_processed_tss(raw_tss:DF) -> DF:
    """
    ### Description:
    Legacy pipeline producing the processed Tesla time series from the raw ones.
    Kept here for reference only: each vin is processed separately through a groupby/apply
    (see legacy_tesla_process_ts) while a transient progress bar reports the vin being processed.
    ### Parameters:
    raw_tss: raw time series of all the vehicles, must contain a "vin" column
    (and a "date" column once renamed with RENAME_COLS_DICT).
    ### Returns:
    The output of the chain below; the exact columns depend on the project helpers
    (safe_locate, safe_astype, left_merge, compute_discharge_diffs).
    """
    with Progress(transient=True) as progress:
        # One progress step per vin; the task stays hidden until the first vin is processed.
        task = progress.add_task("Processing VINs...", visible=False, total=raw_tss["vin"].nunique())
        return (
            raw_tss
            .rename(columns=RENAME_COLS_DICT, errors="ignore")
            .pipe(safe_locate, col_loc=list(COL_DTYPES.keys()), logger=logger)
            .pipe(safe_astype, COL_DTYPES, logger=logger)
            # We should probably not drop duplicates as there might be multiple measurements for the same date.
            # We should probably check the responses parsing and switch from concatenating to joining/merging on date instead.
            .drop_duplicates(subset=["vin", "date"]) 
            .sort_values(by=["vin", "date"])
            .pipe(legacy_charge_n_discharging, "vin", CHARGING_STATUS_VAL_TO_MASK, logger) 
            # Per vin processing: progress and task are forwarded to legacy_tesla_process_ts as extra arguments.
            .groupby("vin")
            .apply(legacy_tesla_process_ts, progress, task, include_groups=False)
            # drop=False turns the index levels produced by the groupby/apply back into columns.
            .reset_index(drop=False)
            .pipe(set_all_str_cols_to_lower, but=["vin"])
            # Rows of fleet_info without a vin are removed before the merge on "vin".
            .pipe(left_merge, fleet_info.dropna(subset=["vin"]), "vin", "vin", COLS_TO_CPY_FROM_FLEET_INFO, logger)
            .pipe(compute_discharge_diffs, DISCHARGE_VARS_TO_MEASURE, logger)
        )

def legacy_tesla_process_ts(raw_ts: DF, progress: Progress, task) -> DF:
    """
    ### Description:
    Legacy processing of the time series of a single vin.
    Meant to be applied on each group of a `groupby("vin")`, as done in legacy_tesla_processed_tss.
    Advances the progress bar by one step, then adds the columns ffiled_outside_temp, ffiled_inside_temp,
    floored_soc, date_diff, soc_diff, cum_energy, cum_charge_energy_added, energy_added and energy_diff
    before calling fillna_vars with COLS_TO_FILL and MAX_TIME_DIFF_TO_FILL.
    ### Parameters:
    raw_ts: time series of one vin, its `name` attribute is read as the vin.
    progress: rich Progress instance that owns `task`.
    task: id of the progress task to advance.
    """
    # groupby.apply exposes the key of the current group as the `name` attribute of the group.
    vin = raw_ts.name
    progress.update(task, visible=True, advance=1, description=f"Processing vin {vin}...")
    # Hide the task again once the progress bar reports that it is finished.
    if progress.finished:
        progress.update(task, visible=False)
    return (
        raw_ts
        .assign(
            # We don't use any of these variables later in the pipeline so we can drop them.
            ffiled_outside_temp=raw_ts["outside_temp"].ffill(),
            ffiled_inside_temp=raw_ts["inside_temp"].ffill(),
            floored_soc=floor_to(raw_ts["soc"].ffill(), 1),
            date_diff=raw_ts["date"].diff(),
            soc_diff=raw_ts["soc"].diff(),
        )
        .pipe(compute_cum_energy, power_col="power", cum_energy_col="cum_energy")
        # The only column we actually use from this function is cum_charge_energy_added from charger_power.
        # Instead of doing a groupby/apply we can perform a single compute_cum_energy call
        # and then compute some sort of energy_added_offset that resets the results to zero at the start of each vin time series.
        .pipe(compute_cum_energy, power_col="charger_power", cum_energy_col="cum_charge_energy_added")
        .assign(energy_added=lambda tss: tss["cum_charge_energy_added"].diff())
        .assign(energy_diff=lambda df: df["cum_energy"].diff())
        .pipe(fillna_vars, COLS_TO_FILL, MAX_TIME_DIFF_TO_FILL)
    )

def legacy_charge_n_discharging(tss:DF, id_col:str=None, charging_status_val_to_mask:dict=None, logger:Logger=logger) -> DF:
    """
    ### Description:
    Legacy computation of the charging and discharging masks of a time series.
    The charging_status column is used when it is present and a mapping is provided,
    otherwise the computation falls back on the soc column.
    When neither is available, a warning is logged and the input is returned untouched.
    Note that the charging_status path writes the in_charge and in_discharge columns into `tss` itself.
    ### Parameters:
    id_col: optional parameter to provide if the dataframe represents multiple time series,
    the computation is then applied separately on each group of that column.
    charging_status_val_to_mask: dict mapping charging status values to boolean values to create masks.
    """
    logger.info(f"compute_charging_n_discharging_masks called.")
    use_charging_status = "charging_status" in tss.columns and charging_status_val_to_mask is not None

    # Nothing to work with: hand the frame back as is.
    if not use_charging_status and "soc" not in tss.columns:
        logger.warning("No charging status or soc column found to compute charging and discharging masks, returning original tss.")
        return tss

    if use_charging_status:
        logger.debug(f"Computing charging and discharging masks using charging status dictionary.")
        is_charging = tss["charging_status"].map(charging_status_val_to_mask)
        tss["in_charge"] = is_charging
        # Unmapped statuses are neither charging nor discharging.
        tss["in_discharge"] = is_charging == False
        per_series_step = compute_charge_n_discharge_perf_mask_and_idx_from_masks
        split_by_id = id_col is not None and id_col in tss.columns
    else:
        logger.debug(f"Computing charging and discharging masks using soc difference.")
        per_series_step = low_freq_compute_charge_n_discharge_vars
        split_by_id = id_col in tss.columns

    if split_by_id:
        return tss.groupby(id_col).apply(per_series_step).reset_index(drop=True)
    return per_series_step(tss)

## New implementation

In [None]:
# Gap between two consecutive samples of a vin above which a new charge/discharge period is started (1h30).
MAX_TD = TD(minutes=90)

def new_process_raw_tss(raw_tss:DF, logger=logger) -> DF:
    """
    ### Description:
    New processing pipeline, applied on the time series of all the vins at once (no groupby/apply).
    Renames the columns with RENAME_COLS_DICT, passes the frame through safe_locate and safe_astype
    with the COL_DTYPES columns/dtypes, sorts by vin and date, then adds the time difference columns
    and the charge/discharge masks and period indexes.
    ### Parameters:
    raw_tss: raw time series of all the vehicles.
    logger: logger forwarded to every step.
    """
    expected_cols = list(COL_DTYPES.keys())
    renamed = raw_tss.rename(columns=RENAME_COLS_DICT, errors="ignore")
    located = safe_locate(renamed, col_loc=expected_cols, logger=logger)
    typed = safe_astype(located, COL_DTYPES, logger=logger)
    # The per vin diffs and shifts of the next steps rely on the rows being ordered by date inside each vin.
    ordered = typed.sort_values(by=["vin", "date"])
    with_time_diffs = compute_date_vars(ordered, "vin", logger)
    return new_charge_n_discharging_from_charging_status(
        with_time_diffs,
        IN_CHARGE_CHARGING_STATUS_VALS,
        IN_DISCHARGE_CHARGING_STATUS_VALS,
        MAX_TD,
        "vin",
        logger,
    )

def compute_date_vars(tss:DF, id_col:str="vin", logger:Logger=logger) -> DF:
    """
    ### Description:
    Adds the time elapsed since the previous row of the same time series.
    The difference follows the row order inside each `id_col` group, so the frame should be
    sorted by date within each id beforehand. The first row of each id gets NaT/NaN.
    ### Parameters:
    tss: time series with a "date" column and an `id_col` column, modified in place.
    id_col: column identifying the time series each row belongs to.
    ### Returns:
    The same DataFrame object with two extra columns:
    time_diff (difference of "date" with the previous row) and sec_time_diff (time_diff in seconds).
    """
    # The message names the columns that are actually created below.
    logger.debug("Computing time_diff and sec_time_diff.")
    tss["time_diff"] = tss.groupby(id_col)["date"].diff()
    tss["sec_time_diff"] = tss["time_diff"].dt.total_seconds()
    return tss

def new_charge_n_discharging_from_charging_status(tss:DF, in_charge_vals:list, in_discharge_vals:list, max_td:TD=None, id_col:str="vin", logger:Logger=logger) -> DF:
    """
    ### Description:
    Computes the in_charge and in_discharge masks from the charging_status column,
    then the in_charge_idx and in_discharge_idx period indexes with compute_idx_from_mask.
    The charging_status column is lower cased first, so the provided values must be lower case to match.
    Rows with a missing charging_status are False in both masks.
    ### Parameters:
    tss: time series with a string charging_status column (not modified, a new frame is returned).
    in_charge_vals: charging status values meaning that the vehicle is charging.
    in_discharge_vals: charging status values meaning that the vehicle is discharging.
    max_td: forwarded to compute_idx_from_mask as max_time_diff.
    id_col: column identifying the time series each row belongs to.
    """
    logger.debug(f"Computing charging and discharging vars using charging status dictionary.")
    return (
        tss
        .assign(charging_status=lambda df: df["charging_status"].str.lower())
        # isin is what the `in` operator of DataFrame.eval resolves to. Calling it directly avoids
        # pasting the repr of the values into an expression string, which only works for
        # containers whose repr is a literal the pandas expression parser accepts.
        .assign(
            in_charge=lambda df: df["charging_status"].isin(in_charge_vals),
            in_discharge=lambda df: df["charging_status"].isin(in_discharge_vals),
        )
        .pipe(compute_idx_from_mask, "in_charge", max_td, id_col, logger)
        .pipe(compute_idx_from_mask, "in_discharge", max_td, id_col, logger)
    )

def compute_idx_from_mask(tss: DF, src_mask:str, max_time_diff:TD=None, id_col:str="vin", logger:Logger=logger) -> DF:
    """
    ### Description:
    Computes the `{src_mask}_idx` column: a per `id_col` counter incremented at each period start.
    A row starts a new period when its mask value differs from the one of the previous row of the same id
    (the value before the first row of an id is taken as False), or, if max_time_diff is provided,
    when its time_diff is greater than max_time_diff.
    ### Parameters:
    tss: time series holding the `src_mask` and `id_col` columns, modified in place.
    A time_diff column is also required when max_time_diff is provided.
    src_mask: name of the boolean column to compute the index from.
    max_time_diff: optional maximal time difference between two rows of the same period.
    id_col: column identifying the time series each row belongs to.
    ### Returns:
    The same DataFrame object with the extra `{src_mask}_idx` column (uint16).
    """
    logger.debug(f"Computing {src_mask}_idx from {src_mask} mask.")
    idx_col_name = f"{src_mask}_idx"
    previous_mask = tss.groupby(id_col)[src_mask].shift(fill_value=False)
    # Kept as a local Series instead of a scratch column, so that a column of the input
    # that happens to be called new_period_start_mask is neither overwritten nor dropped.
    new_period_start = previous_mask.ne(tss[src_mask])
    if max_time_diff is not None:
        logger.debug(f"Adding max_time_diff condition to new_period_start_mask.")
        # NaT time diffs (first row of each id) compare as False and never start a period by themselves.
        new_period_start |= (tss["time_diff"] > max_time_diff)
    else:
        logger.debug(f"No max_time_diff condition added to new_period_start_mask.")
    # NOTE(review): the cast to uint16 wraps silently above 65535 periods per id -- confirm this bound is safe.
    tss[idx_col_name] = new_period_start.groupby(tss[id_col]).cumsum().astype("uint16")
    return tss

In [None]:
# Run the new pipeline on the raw time series of all the vins at once.
tss = new_process_raw_tss(raw_tss, logger=logger)

In [None]:
# Hand picked vehicles whose processed time series are inspected in the plot below.
VINS = [
    "LRW3E7EK5PC797921",
    "5YJ3E7EA9LF751886",
    "XP7YGCEL2RB413022",
    "LRW3E7FR7NC480876",
    "XP7YGCES9RB442881",
]
# Alternative kept from exploration: vin = tss["vin"].sample(3)
ts = tss[tss["vin"].isin(VINS)]

In [None]:
# soc over time, one panel per vin, colored by charge period index.
fig = px.scatter(
    # Missing charging statuses are labelled 'Unknown' for the display.
    ts.assign(charging_status=lambda df: df["charging_status"].fillna("Unknown")),
    x="date",
    y="soc",
    facet_row="vin",
    color="in_charge_idx",
    color_continuous_scale="Rainbow",
    symbol="in_charge",
    hover_data="charging_status",
)
fig.update_layout(showlegend=False, height=1000)
fig