## Setup

### Import

In [None]:
import plotly.express as px

from core.pandas_utils import *
from core.plt_utils import plt_3d_df
from transform.processed_tss.ProcessedTimeSeries import ProcessedTimeSeries

### Data Extraction

In [None]:
# Read the cached Ituran export, keeping only the columns used below.
# Identifier/label columns are typed as pandas strings; `signal_value` is left
# to pandas' dtype inference; `signal_time` is parsed as a datetime.
_raw_string_cols = ["dataran_id", "vehicle_make", "vehicle_model", "signal_name"]
raw_tss = pd.read_csv(
    "./data_cache/ituran_response.csv",
    usecols=[
        "dataran_id",
        "signal_time",
        "vehicle_make",
        "vehicle_model",
        "signal_name",
        "signal_value",
    ],
    dtype=dict.fromkeys(_raw_string_cols, "string"),
    parse_dates=["signal_time"],
).sort_values(by=["dataran_id", "signal_time"])
raw_tss

In [None]:
# Inspect the dtype pandas assigned to each loaded column.
raw_tss.dtypes

In [None]:
# Number of rows per signal (missing names included), most frequent first.
raw_tss["signal_name"].value_counts(dropna=False)

In [None]:
# Number of rows per vehicle id, most frequent first.
raw_tss["dataran_id"].value_counts()

In [None]:
# Restrict the raw data to the first NB_TEST_IDS distinct vehicle ids
# (in order of first appearance in `raw_tss`) for quick experiments.
NB_TEST_IDS = 20
ids = raw_tss["dataran_id"].unique()
test_ids = ids[:NB_TEST_IDS]
test_raw_tss = raw_tss.loc[raw_tss["dataran_id"].isin(test_ids)]
test_raw_tss

In [None]:
# Columns kept as the index when pivoting signals from long to wide format.
INDEX_COLS = ["vehicle_make", "vehicle_model", "dataran_id", "signal_time"]

# Raw Ituran signal label -> short column name.
_SIGNAL_RENAMES = {
    "Electric Data - Battery Status Of Charge - 2334": "soc",
    "Electric Data - Charging AC Mode - 2227": "charging_ac_mode",
    "Electric Data - Charging Current - 232": "charging_current",
    "Electric Data - Charging DC Mode - 9629": "charging_dc_mode",
    "Electric Data - Charging Voltage - 7C": "charging_voltage",
    "Electric Data - Ready Switch Open - 2015": "switch_open",
    "Electric Data - Time Remaining for Charge - 2291": "time_remaining_for_charge",
    "Electric Data - Vehicle Range Of Battery - 2229": "estimated_range",
}
# Raw metadata column -> name used after the pivot.
_METADATA_RENAMES = {
    "signal_time": "date",
    "dataran_id": "vehicle_id",
    "vehicle_make": "make",
}
COLUMNS_NAMES_MAP = {**_SIGNAL_RENAMES, **_METADATA_RENAMES}

# Target dtype per renamed column.
# NOTE(review): "bool" and "int" cannot represent missing values, and the pivoted
# frame presumably has gaps — confirm these casts behave as intended.
DTYPES = {
    "date": "datetime64[ns]",
    "vehicle_id": "string",
    # "make": "string" is currently disabled.
    **dict.fromkeys(
        ["soc", "charging_current", "charging_voltage", "estimated_range"],
        "float",
    ),
    **dict.fromkeys(["charging_ac_mode", "charging_dc_mode", "switch_open"], "bool"),
    "time_remaining_for_charge": "int",
}

In [None]:
# A raw measurement is identified by the index columns plus the signal name.
duplicate_key = INDEX_COLS + ["signal_name"]

# keep=False flags every row whose key is repeated;
# keep="first" flags every repeat except the first occurrence of each key.
duplicates = test_raw_tss.duplicated(subset=duplicate_key, keep=False)
first_occurrences = test_raw_tss.duplicated(subset=duplicate_key, keep="first")
no_duplicates = test_raw_tss[~duplicates]

nb_remaining_duplicates = no_duplicates.duplicated(subset=duplicate_key, keep=False).sum()
print("nb duplicates in no_duplicates:", nb_remaining_duplicates)
print("nb first duplicates:", first_occurrences.sum())

In [None]:
# Share of raw rows whose (index columns, signal name) key occurs more than once.
# Computed directly from `raw_tss` so the cell does not rely on a variable that
# no other cell defines; the mean of the boolean mask is the fraction of flagged rows.
raw_tss.duplicated(subset=INDEX_COLS + ["signal_name"], keep=False).mean()

In [None]:
def extract(nb_ids:int) -> DF:
    """Pivot the raw signals of the first `nb_ids` vehicles into a wide frame.

    Args:
        nb_ids: number of vehicle ids (taken from the start of the notebook-level
            `ids` array) whose rows are kept.

    Returns:
        One row per `INDEX_COLS` combination and one column per signal, with
        columns renamed through `COLUMNS_NAMES_MAP`. Dtypes are not cast here.
    """
    ids_to_extract = ids[:nb_ids]
    return (
        raw_tss
        .query("dataran_id in @ids_to_extract")
        # `pivot` raises on repeated (index, column) pairs, so keep the first
        # occurrence of each measurement, as HighFreqProcecssedTimeSeries.run does.
        .drop_duplicates(subset=INDEX_COLS + ["signal_name"])
        .pivot(index=INDEX_COLS, columns="signal_name", values="signal_value")
        .reset_index()
        .rename(columns=COLUMNS_NAMES_MAP, errors="ignore")
    )

In [None]:
# Small worked example of the long -> wide pivot on a random sample of raw rows.
# The sample is seeded so that the displayed frames are identical on every run.
short_raw_tss = (
    raw_tss
    .sample(n=30, random_state=42)
    .sort_values(by=["dataran_id", "signal_time"], ascending=True)
    .reset_index(drop=True)
)
display(short_raw_tss)
display(short_raw_tss.shape)
pivoted_short_raw_tss = (
    short_raw_tss
    # `pivot` raises if the sample contains the same measurement twice.
    .drop_duplicates(subset=INDEX_COLS + ["signal_name"])
    .pivot(index=INDEX_COLS, columns="signal_name", values="signal_value")
    .reset_index()
    .sort_values(by=["dataran_id", "signal_time"], ascending=True)
)
display(pivoted_short_raw_tss)
# Ratio of pivoted rows to sampled rows.
display(pivoted_short_raw_tss.shape[0] / short_raw_tss.shape[0])

In [None]:
# Number of rows per signal in the test subset (missing names included).
test_raw_tss["signal_name"].value_counts(dropna=False)

In [None]:
class HighFreqProcecssedTimeSeries(ProcessedTimeSeries):
    """Processed time series built from the long-format Ituran signals in `raw_tss`.

    Overrides `run`, `compute_charge_n_discharge_masks` and `compute_current_vars`.
    The other steps piped in `run` (`compute_date_vars`, `compute_idx_from_masks`,
    `trim_leading_n_trailing_soc_off_masks`, `compute_cum_var`) are not defined here
    and are presumably inherited from `ProcessedTimeSeries`.
    """

    def run(self) -> DF:
        """Pivot the notebook-level `raw_tss` to wide format and run the processing steps.

        Duplicated (index columns, signal name) rows are dropped first because
        `pivot` cannot reshape repeated entries.
        """
        return (
            raw_tss
            .drop_duplicates(INDEX_COLS + ["signal_name"])
            .pivot(index=INDEX_COLS, columns="signal_name", values="signal_value")
            .reset_index()
            .rename(columns=COLUMNS_NAMES_MAP, errors="ignore")
            .astype(DTYPES, errors="ignore")
            .sort_values(by=["vehicle_id", "date"], ascending=True)
            .pipe(self.compute_date_vars)
            .pipe(self.compute_charge_n_discharge_masks)
            .pipe(self.compute_current_vars)
            .pipe(self.compute_idx_from_masks, masks=["in_charge", "in_discharge"])
            .pipe(self.trim_leading_n_trailing_soc_off_masks, masks=["in_charge", "in_discharge"])
            .pipe(self.compute_idx_from_masks, masks=["trimmed_in_charge", "trimmed_in_discharge"])
        )

    def compute_charge_n_discharge_masks(self, tss:DF) -> DF:
        """Add boolean `in_charge` / `in_discharge` columns derived from the soc trend.

        Columns are added to `tss` in place and `tss` is returned. Per vehicle, the
        sign of the soc variation is averaged over a 20 minute time window; a row is
        in charge (resp. discharge) when that average, both forward- and
        backward-filled, is strictly positive (resp. negative).
        """
        tss_grp = tss.groupby(self.id_col)
        tss["soc_ffilled"] = tss_grp["soc"].ffill()
        tss["soc_diff"] = tss_grp["soc_ffilled"].diff()
        # Keep only the sign: +1 / -1, and NaN where the soc did not change (0 / 0).
        tss["soc_diff"] /= tss["soc_diff"].abs()
        tss["soc_diff_rolled_mean"] = (
            tss
            .groupby(self.id_col)
            .rolling(window=pd.Timedelta(minutes=20), on="date")
            ["soc_diff"]
            .mean()
            # The result is indexed by (group key, original row label). Dropping only
            # the group level keeps the row labels, so the assignment aligns each value
            # with its own row even when the index of `tss` is not 0..n-1 in row order.
            .droplevel(0)
        )
        soc_diff_ffilled = tss_grp["soc_diff_rolled_mean"].ffill()
        soc_diff_bfilled = tss_grp["soc_diff_rolled_mean"].bfill()
        tss["in_charge"] = soc_diff_ffilled.gt(0, fill_value=False) & soc_diff_bfilled.gt(0, fill_value=False)
        tss["in_discharge"] = soc_diff_ffilled.lt(0, fill_value=False) & soc_diff_bfilled.lt(0, fill_value=False)
        return tss

    def compute_current_vars(self, tss:DF) -> DF:
        """Add power columns and their cumulated counterparts.

        `power` is `charging_current * charging_voltage`. `charging_power` keeps it
        only on `in_charge` rows, and `power` is then blanked on those same rows.
        Both are passed to `self.compute_cum_var` to produce `cum_energy_added` and
        `cum_energy_spent`.
        """
        tss["power"] = tss.eval("charging_current * charging_voltage")
        tss["charging_power"] = tss["power"].mask(~tss["in_charge"], pd.NA)
        tss["power"] = tss["power"].mask(tss["in_charge"], pd.NA)
        tss = self.compute_cum_var(tss, var_col="charging_power", cum_var_col="cum_energy_added")
        tss = self.compute_cum_var(tss, var_col="power", cum_var_col="cum_energy_spent")
        return tss


In [None]:
# Build the processed time series for the Ituran data, forcing a recomputation
# and using debug-level logging.
tss = HighFreqProcecssedTimeSeries(
    make="ituran",
    id_col="vehicle_id",
    force_update=True,
    log_level="DEBUG",
)
tss

## EDA

### Data sparsity

In [None]:
# Uncomment the columns to inspect; one scatter figure is produced per column,
# with one facet row per vehicle.
COLS_TO_PLOT = [
    #"cum_energy_added",
    #"cum_energy_spent",
    #"soc",
    #"charging_ac_mode",
    #"charging_current",
    #"charging_dc_mode",
    #"charging_voltage",
    #"time_remaining_for_charge",
]
for col in COLS_TO_PLOT:
    # Only rows where both the date and the plotted variable are present.
    plotted_rows = (
        tss
        .dropna(subset=["date", col], how="any")
        .set_index("vehicle_id", drop=False)
    )
    fig = px.scatter(
        plotted_rows,
        x="date",
        y=col,
        color="vehicle_id",
        facet_row="vehicle_id",
    )
    fig.update_layout(height=1000)
    fig.show()

In [None]:
# Forward-filled helper column -> source column.
FFILLED_SOURCE_COLS = {
    "ffilled_estimated_range": "estimated_range",
    "ffilled_voltage": "charging_voltage",
    "ffilled_current": "charging_current",
    "ffilled_time_remaining_for_charge": "time_remaining_for_charge",
}

# One row per (vehicle, charge period, soc level) summarising the charging conditions.
charging_points:DF = (
    tss
    # Forward-fill within each vehicle so that the last value of one vehicle
    # never leaks into the first rows of the next one.
    .assign(**{
        ffilled_col: (lambda df, source_col=source_col: df.groupby("vehicle_id")[source_col].ffill())
        for ffilled_col, source_col in FFILLED_SOURCE_COLS.items()
    })
    .query("trimmed_in_charge")
    .groupby(["vehicle_id", "trimmed_in_charge_idx", "soc"])
    .agg(
        energy_added_at_start=pd.NamedAgg(column="cum_energy_added", aggfunc="first"),
        energy_added_at_end=pd.NamedAgg(column="cum_energy_added", aggfunc="last"),
        # Overwritten by the `eval` below; kept so the column position is unchanged.
        energy_added=pd.NamedAgg(column="cum_energy_added", aggfunc=series_start_end_diff),
        in_ac=pd.NamedAgg(column="charging_ac_mode", aggfunc=lambda x: x.mode().iat[0]),
        in_dc=pd.NamedAgg(column="charging_dc_mode", aggfunc=lambda x: x.mode().iat[0]),
        current=pd.NamedAgg(column="ffilled_current", aggfunc="median"),
        voltage=pd.NamedAgg(column="ffilled_voltage", aggfunc="median"),
        estimated_range=pd.NamedAgg(column="ffilled_estimated_range", aggfunc="median"),
        time_remaining_for_charge=pd.NamedAgg(column="ffilled_time_remaining_for_charge", aggfunc="median"),
    )
    .reset_index()
    .eval("energy_added=energy_added_at_end - energy_added_at_start")
    .eval("power = current * voltage")
    .astype({"in_ac": "bool", "in_dc": "bool"})
    # Keep only the points where energy was actually added.
    .query("energy_added > 0")
)
charging_points

In [None]:
# Relative frequency of each energy_added value, skipping the most frequent one.
charging_points["energy_added"].value_counts(normalize=True, dropna=False).iloc[1:]

In [None]:
# Number of charging points per AC flag, then per DC flag.
for mode_col in ("in_ac", "in_dc"):
    display(charging_points[mode_col].value_counts(dropna=False))

In [None]:
# Correlation matrix of the numeric columns, rows ordered by their
# correlation with energy_added (strongest positive first).
charging_points.corr(numeric_only=True).sort_values(by="energy_added", ascending=False)

In [None]:
# Narrow the charging points to a thin band of current values and bounded
# energy / remaining time before plotting them in 3D, coloured by the DC flag.
filtered_charging_points = charging_points.query(
    "current > 4750 & current < 4800 & energy_added < 69000 & time_remaining_for_charge < 200"
)
plt_3d_df(
    filtered_charging_points,
    x='time_remaining_for_charge',
    y="estimated_range",
    z="energy_added",
    color="in_dc",
    size=3,
    opacity=0.5,
    width=1500,
    height=1000,
)

## Conclusion
We can see that while the date range in the time series is 6 months, there are only 2 days' worth of data.  
Given the variables at hand, we *could* implement an SoH estimation similar to the one we used in watea.  
For that we would need more data and ideally the temperature.  
If we don't have the temperature, we would need to check how the models handle heat differentials (do they use a heater to compensate for low temperatures? Is the battery simply not affected by the temperature?).