# Ituran preliminary EDA
The goal of this notebook is to examine the data provided by Ituran.  
We want to know which columns we will need for the POC and their respective quality (frequency, error margin, ...).  

## Setup

### Import

In [None]:
import plotly.express as px
from pandas import Timedelta as TD
from scipy.integrate import cumulative_trapezoid

from core.pandas_utils import *
from core.plt_utils import plt_3d_df
from core.time_series_processing import compute_cum_energy, compute_charging_n_discharging

### Data Extraction

In [None]:
# Load the cached Ituran response; `signal_time` is parsed into datetimes at read time.
raw_tss = pd.read_csv(
    "./data_cache/ituran_response.csv",
    parse_dates=["signal_time"],
)
raw_tss

In [None]:
# Rows per signal, most frequent first (the default ordering); missing names are counted too.
raw_tss["signal_name"].value_counts(dropna=False)

In [None]:
# Rows per vehicle id, most frequent first (the default ordering).
raw_tss["dataran_id"].value_counts()

In [None]:
NB_TEST_VINS = 10

# value_counts orders ids by decreasing row count, so the head is the best-populated sample.
test_vins = (
    raw_tss["dataran_id"]
    .value_counts()
    .head(NB_TEST_VINS)
    .index
    .astype("string")
)

# Ids are compared as strings, so the raw frame is cast the same way before filtering.
test_raw_tss = (
    raw_tss
    .astype({"dataran_id": "string"})
    .query("dataran_id in @test_vins")
)
test_raw_tss

Need to fix data extraction:

In [None]:
# Signals kept when pivoting the raw long-format rows.
ROWS_TO_KEEP = [
    "Electric Data - Charging Voltage - 7C",
    "Electric Data - Charging Current - 232",
]


def first_vins(tss, nb_vins):
    """Return the first `nb_vins` distinct `dataran_id` values of `tss`.

    Ids are cast to strings and sorted ascending before de-duplication,
    so the selection is deterministic for a given frame.
    """
    return (
        tss["dataran_id"]
        .astype("string")
        .sort_values(ascending=True)
        .unique()[:nb_vins]
    )


def pivot_signals(tss, vins, signals):
    """Pivot the long-format rows of `tss` into one column per signal.

    Only rows whose `dataran_id` is in `vins` and whose `signal_name` is in
    `signals` are kept. The result has one row per (dataran_id, signal_time),
    sorted by those two columns.
    """
    return (
        tss
        .astype({"dataran_id": "string"})
        .query("dataran_id in @vins")
        .query("signal_name in @signals")
        .pivot_table(index=["dataran_id", "signal_time"], columns="signal_name", values="signal_value")
        .reset_index()
        .sort_values(by=["dataran_id", "signal_time"], ascending=True)
    )


def show_pivot(pivot):
    """Display a pivoted frame followed by its number of rows per vehicle."""
    display(pivot)
    display(pivot["dataran_id"].value_counts(sort=True, ascending=False, dropna=False))


# Same extraction with 10 and then 4 vehicles, to compare the outputs side by side.
test_vins_10 = first_vins(test_raw_tss, 10)
test_pivot_10 = pivot_signals(raw_tss, test_vins_10, ROWS_TO_KEEP)
show_pivot(test_pivot_10)

test_vins_4 = first_vins(test_raw_tss, 4)
test_pivot_4 = pivot_signals(raw_tss, test_vins_4, ROWS_TO_KEEP)
show_pivot(test_pivot_4)

In [None]:
# First four distinct vehicle ids in ascending order (ascending is the sort_values default).
test_raw_tss["dataran_id"].sort_values().unique()[:4]

In [None]:
# Rows per signal within the test sample, most frequent first; missing names are counted too.
test_raw_tss["signal_name"].value_counts(dropna=False)

In [None]:
# Raw columns that identify one row of the pivoted time series (order is the pivot index order).
INDEX_COLS = ["dataran_id", "signal_time", "vehicle_make", "vehicle_model"]

# Raw name -> short name used in the rest of the notebook.
COLUMNS_NAMES_MAP = {
    # Signal columns produced by the pivot.
    "Electric Data - Battery Status Of Charge - 2334": "soc",
    "Electric Data - Charging AC Mode - 2227": "charging_ac_mode",
    "Electric Data - Charging Current - 232": "charging_current",
    "Electric Data - Charging DC Mode - 9629": "charging_dc_mode",
    "Electric Data - Charging Voltage - 7C": "charging_voltage",
    "Electric Data - Ready Switch Open - 2015": "switch_open",
    "Electric Data - Time Remaining for Charge - 2291": "time_remaining_for_charge",
    "Electric Data - Vehicle Range Of Battery - 2229": "estimated_range",
    # Index columns.
    "signal_time": "date",
    "dataran_id": "vehicle_id",
    "vehicle_make": "make",
}

# Target dtype per renamed column ("make" is intentionally left out: #"make": "string").
# NOTE(review): if the pivoted columns contain NaN, casting to "bool" turns NaN into True and
# casting to "int" fails (and is skipped when astype is called with errors="ignore").
# Nullable "boolean" / "Int64" may be what is wanted here — confirm before changing.
DTYPES = dict(
    date="datetime64[ns]",
    vehicle_id="string",
    soc="float",
    charging_ac_mode="bool",
    charging_current="float",
    charging_dc_mode="bool",
    charging_voltage="float",
    switch_open="bool",
    time_remaining_for_charge="int",
    estimated_range="float",
)

In [None]:
class TimeSeriesProcessing:
    def __init__(self, name:str, id_col:str):
        self.name = name
        self.id_col = id_col

    def process_raw_tss(self, raw_tss:DF) -> DF:
        return (
            raw_tss
            .pivot_table("signal_value", INDEX_COLS, "signal_name", dropna=False)
            .reset_index()
            .rename(columns=COLUMNS_NAMES_MAP, errors="ignore")
            .sort_values(by=["vehicle_id", "date"], ascending=True)
            .astype(DTYPES, errors="ignore")
            .pipe(self.compute_charge_n_discharge_masks)
            .pipe(self.compute_current_vars)
            .pipe(self.compute_idx_from_masks, masks=["in_charge", "in_discharge"])
            .pipe(self.trim_leading_n_trailing_soc_off_masks, masks=["in_charge", "in_discharge"])
            .pipe(self.compute_idx_from_masks, masks=["trimmed_in_charge", "trimmed_in_discharge"])
        )

    def compute_charge_n_discharge_masks(self, tss:DF) -> DF:
        tss_grp = tss.groupby(self.id_col)
        tss["soc_ffilled"] = tss_grp["soc"].ffill()
        tss["soc_diff"] = tss_grp["soc_ffilled"].diff()
        tss["soc_diff"] /= tss["soc_diff"].abs()
        tss["soc_diff_rolled_mean"] = (
            tss
            .groupby("vehicle_id")
            .rolling(window=pd.Timedelta(minutes=20), on="date")
            ["soc_diff"]
            .mean()
            .reset_index(drop=True)
        )
        soc_diff_ffilled = tss_grp["soc_diff_rolled_mean"].ffill()
        soc_diff_bfilled = tss_grp["soc_diff_rolled_mean"].bfill()
        tss["in_charge"] = soc_diff_ffilled.gt(0, fill_value=False) & soc_diff_bfilled.gt(0, fill_value=False)
        tss["in_discharge"] = soc_diff_ffilled.lt(0, fill_value=False) & soc_diff_bfilled.lt(0, fill_value=False)
        return tss

    def compute_current_vars(self, tss:DF) -> DF:
        tss_grp = tss.groupby(self.id_col)
        tss["charging_power"] = tss.eval("charging_current * charging_voltage").mask(~tss["in_charge"], pd.NA)
        tss["power"] = tss.eval("charging_current * charging_voltage").mask(tss["in_charge"], pd.NA)
        tss = compute_cum_energy(tss, power_col="charging_power", cum_energy_col="cum_energy_added")
        tss = compute_cum_energy(tss, power_col="power", cum_energy_col="cum_energy_spent")
        #tss["cum_energy_added"] -= tss_grp["cum_energy_added"].transform("first")
        #tss["cum_energy_spent"] -= tss_grp["cum_energy_spent"].transform("first")
        return tss

    def trim_leading_n_trailing_soc_off_masks(self, tss:DF, masks:list[str]) -> DF:
        tss_grp = tss.groupby(self.id_col)
        for mask in masks:
            leading_soc= tss_grp[mask].transform("first")
            trailing_soc = tss_grp[mask].transform("last")
            tss[f"trimmed_{mask}"] = tss[mask] & (tss["soc"] != trailing_soc) & (tss["soc"] != leading_soc)
        return tss

    def compute_idx_from_masks(self, tss:DF, masks:list[str], max_time_diff:TD=None) -> DF:
        for mask in masks:
            if max_time_diff is not None:
                mask = mask & (tss.groupby("vehicle_id")["date"].diff().lt(max_time_diff, fill_value=False))
            tss_grp = tss.groupby("vehicle_id")
            shifted_mask = tss_grp[mask].shift(1, fill_value=False)
            tss[f"{mask}_idx"] = shifted_mask.ne(tss[mask]).cumsum()
        return tss

In [None]:
# One selected vehicle id per line.
print("\n".join(test_vins))

In [None]:
# Build the processed time series of the test vehicles and check its size.
ituran_processing = TimeSeriesProcessing(name="ituran", id_col="vehicle_id")
tss = ituran_processing.process_raw_tss(test_raw_tss)
tss.shape

For some reason, `cum_energy_added` is always 0 when we have 10 vehicles...
This is most likely due to an error in the data extraction.

In [None]:
def compute_cum_energy_added(ts:DF) -> DF:
    """Integrate `charging_power` over `date` with the trapezoidal rule.

    Dates are converted to whole seconds and missing power values count as 0.
    The result is a Series (despite the annotation) with a fresh 0..n-1 index,
    whose first value is 0.
    """
    elapsed_seconds = ts["date"].dt.as_unit("s").astype("int64")
    power = ts["charging_power"].astype("float32").fillna(0)
    return Series(cumulative_trapezoid(y=power, x=elapsed_seconds, initial=0))

# Integrate each vehicle separately, then flatten the result to inspect the value distribution.
energy_by_vehicle = tss.groupby("vehicle_id").apply(compute_cum_energy_added, include_groups=False)
cum_energy_added = energy_by_vehicle.unstack().reset_index(drop=True)
display(cum_energy_added)
cum_energy_added.value_counts(dropna=False, normalize=True)

In [None]:
# Same integration over the whole frame at once (all vehicles together), for comparison.
cum_energy_added = compute_cum_energy_added(tss)
display(cum_energy_added)
cum_energy_added.value_counts(dropna=False, normalize=True)

## EDA

### Data sparsity

In [None]:
# Uncomment the columns to inspect; one figure is produced per column.
COLS_TO_PLOT = [
    #"cum_energy_added",
    #"cum_energy_spent",
    #"soc",
    #"charging_ac_mode",
    #"charging_current",
    #"charging_dc_mode",
    #"charging_voltage",
    #"time_remaining_for_charge",
]
for col in COLS_TO_PLOT:
    # Only rows where both the date and the plotted column are present.
    col_points = tss.dropna(subset=["date", col], how="any").set_index("vehicle_id", drop=False)
    col_fig = px.scatter(
        col_points,
        x="date",
        y=col,
        color="vehicle_id",
        facet_row="vehicle_id",
    )
    col_fig.update_layout(height=1000)
    col_fig.show()

In [None]:
# SoC over time, one facet per vehicle, coloured by the `switch_open` flag.
switch_open_points = (
    tss
    .dropna(subset=["date", "switch_open"], how="any")
    .set_index("vehicle_id", drop=False)
)
switch_open_fig = px.scatter(
    switch_open_points,
    x="date",
    y="soc",
    color="switch_open",
    facet_row="vehicle_id",
)
switch_open_fig.update_layout(height=1000)

In [None]:
# Source column -> forward-filled column consumed by the aggregation below.
FFILLED_COLS = {
    "estimated_range": "ffilled_estimated_range",
    "charging_voltage": "ffilled_voltage",
    "charging_current": "ffilled_current",
    "time_remaining_for_charge": "ffilled_time_remaining_for_charge",
}
# Forward-fill within each vehicle only, so the last values of one vehicle
# never leak into the first rows of the next one.
ffilled_signals = (
    tss
    .groupby("vehicle_id")[list(FFILLED_COLS)]
    .ffill()
    .rename(columns=FFILLED_COLS)
)

# One row per (vehicle, trimmed charging period, SoC level).
charging_points:DF = (
    tss
    .join(ffilled_signals)
    .query("trimmed_in_charge")
    .groupby(["vehicle_id", "trimmed_in_charge_idx", "soc"])
    .agg(
        energy_added_at_start=pd.NamedAgg(column="cum_energy_added", aggfunc="first"),
        energy_added_at_end=pd.NamedAgg(column="cum_energy_added", aggfunc="last"),
        # Overwritten by the eval below; kept so the column keeps its position.
        energy_added=pd.NamedAgg(column="cum_energy_added", aggfunc=series_start_end_diff),
        in_ac=pd.NamedAgg(column="charging_ac_mode", aggfunc=lambda x: x.mode().iat[0]),
        in_dc=pd.NamedAgg(column="charging_dc_mode", aggfunc=lambda x: x.mode().iat[0]),
        current=pd.NamedAgg(column="ffilled_current", aggfunc="median"),
        voltage=pd.NamedAgg(column="ffilled_voltage", aggfunc="median"),
        estimated_range=pd.NamedAgg(column="ffilled_estimated_range", aggfunc="median"),
        time_remaining_for_charge=pd.NamedAgg(column="ffilled_time_remaining_for_charge", aggfunc="median"),
    )
    .reset_index()
    .eval("energy_added=energy_added_at_end - energy_added_at_start")
    .eval("power = current * voltage")
    .astype({"in_ac": "bool", "in_dc": "bool"})
)
charging_points

In [None]:
# Distribution of the energy added per charging point, most frequent first; NaN is counted too.
charging_points["energy_added"].value_counts(dropna=False)

In [None]:
# Share of charging points flagged AC, then DC.
for mode_col in ("in_ac", "in_dc"):
    display(charging_points[mode_col].value_counts(dropna=False))

In [None]:
# Correlation matrix of the numeric columns, rows ordered by correlation with `energy_added`.
# Optional filters used while exploring:
#   .query("vehicle_id == '-178342787'") before .corr, .loc[:, "energy_added"] after the sort.
charging_points_corr = charging_points.corr(numeric_only=True)
charging_points_corr.sort_values(by="energy_added", ascending=False)

In [None]:
# Narrow band of charging points kept for the 3D view (thresholds chosen by hand).
charging_points_3d = charging_points.query(
    "current > 4750 & current < 4800 & energy_added < 69000 & time_remaining_for_charge < 200"
)
plt_3d_df(
    charging_points_3d,
    x='time_remaining_for_charge',
    y="estimated_range",
    z="energy_added",
    color="in_dc",
    size=3,
    opacity=0.5,
    width=1500,
    height=1000,
)

## Conclusion
We can see that while the date range of the time series is 6 months, there are only 2 days' worth of data.  
Given the variables at hand, we *could* implement an SoH estimation similar to the one we used in watea.  
For that we would need more data and ideally the temperature.  
If we don't have the temperature, we would need to check how the models handle temperature differences (do they use a heater to compensate for low temperatures? Is the battery simply not affected by the temperature?).