# Ride distance EDA
The goal of this notebook is to extract valuable insights from descriptive statistics of the Renault Zoe rides and/or discharge cycles.  

## Setup

In [None]:
# Create the `data_cache` directory; `-p` makes this a no-op when it already exists.
! mkdir -p data_cache

### Imports

In [None]:
from datetime import timedelta as TD

import numpy as np
import pandas as pd
from pandas import Series
from pandas import DataFrame as DF
import plotly.express as px
import plotly.graph_objects as go
from ydata_profiling import ProfileReport

from core.config import *
from core.pandas_utils import series_start_end_diff
from transform.fleet_info.ayvens_fleet_info import fleet_info
from transform.raw_tss.high_mobility_raw_tss import get_raw_tss
from core.time_series_processing import low_freq_mask_in_motion_periods
from core.time_series_processing import low_freq_compute_charge_n_discharge_vars

### Data extraction

In [None]:
# Fleet-info columns copied onto the time series by the merge; "vin" is the join key.
COLS_TO_CPY_FROM_FLEET_INFO = [
    # vehicle identification
    "make", "model", "version",
    # dummy SoH parameters
    "dummy_soh_maker_offset", "dummy_soh_model_offset",
    "dummy_soh_model_slope", "dummy_soh_vehicle_offset",
    # battery / registration
    "capacity", "registration_date",
    # join key
    "vin",
]

# Maps the raw column names to the names used in the rest of the notebook.
# Several raw names map to the same target ("odometer", "soc").
# NOTE(review): if one raw frame carried two of these source columns at once, the rename
# would produce duplicated column names -- confirm against the raw data.
RENAME_COLS_DICT = {
    "date_of_value": "date",
    # odometer
    "diagnostics.odometer": "odometer",
    "odometer.value": "odometer",
    "mileage_km": "odometer",
    "mileage": "odometer",
    # charging / battery
    "charging.battery_energy": "battery_energy",
    "charging.estimated_range": "estimated_range",
    "charging.battery_level": "soc",
    "soc_hv_header": "soc",
}

# Target dtype of each column once the raw names have been normalised.
COL_DTYPES = {
    # time series measurements
    "soc": "float",
    "odometer": "float",
    "estimated_range": "float",
    "battery_energy": "float",
    # dummy SoH parameters
    "dummy_soh_maker_offset": "float",
    "dummy_soh_model_offset": "float",
    "dummy_soh_model_slope": "float",
    "dummy_soh_vehicle_offset": "float",
    # "dummy_soh_offset": "float",
    # vehicle info
    "vin": "string",
    "capacity": "float",
    "registration_date": "datetime64[ns]",
}

# Columns of interest in the time series.
# Each name appears once: selecting a frame with a repeated name would duplicate that column.
COLS_TO_KEEP = [
    "date",
    "soc",
    "odometer",
    "estimated_range",
    "battery_energy",
    "vin",
]

In [None]:
# Load the raw time series for "renault" and enrich every row with its vehicle's fleet info.
raw_tss = get_raw_tss("renault")

#hot fix before fixing fleet_info
# NOTE: this assigns into the imported `fleet_info` frame itself (not a copy).
fleet_info["registration_date"] = pd.to_datetime(fleet_info["registration_date"]) #.dt.tz_localize("UTC")


tss:DF = (
    raw_tss
    # Left join: every raw row is kept, fleet-info columns are attached by vin.
    .merge(fleet_info[COLS_TO_CPY_FROM_FLEET_INFO], on="vin", how="left")
    .rename(columns=RENAME_COLS_DICT)   
    # NOTE(review): errors="ignore" only covers failed conversions; a COL_DTYPES key that is
    # not a column after the rename presumably still raises -- confirm.
    .astype(COL_DTYPES, errors="ignore")
    .sort_values(by=["vin", "date"])
    # NOTE(review): presumably the raw soc is a 0-1 fraction turned into a percentage here -- confirm.
    .eval("soc = soc * 100")
    .eval("age = date - registration_date")
    # Whole days divided by 365 (leap days are ignored).
    .assign(age_in_years=lambda tss:tss["age"].dt.days / 365)
)
# Hot fix before fixing fleet info
# NOTE(review): these rows are selected on `model`, while the discharge analysis further down
# filters on `version` for the same 'R110'/'R135' values -- confirm which column holds them.
tss.loc[tss.eval("model == 'R110'"), "capacity"] = 41.0
tss.loc[tss.eval("model == 'R135'"), "capacity"] = 52.0

In [None]:
# Preview the merged and typed time series.
tss

## Data preprocessing

In [None]:
# Run the in-motion masking helper on each vehicle's time series separately.
# include_groups=False keeps the "vin" grouping column out of the frame handed to the helper.
tss = tss.groupby("vin").apply(low_freq_mask_in_motion_periods, include_groups=False)

In [None]:
# value_counts sorts by descending count, so the vins come out ordered from most to fewest rows.
most_common_vins = tss.value_counts("vin").index
most_common_vin = most_common_vins[0]

# Time series of the vin with the most rows, selected on the outer index level.
ts = tss.xs(most_common_vin, level=0)

In [None]:
# Visual check of the masking on the vin with the most rows (timestamps/lines).
# Each boolean mask is drawn as a shaded area under the odometer curve:
# odometer * mask is 0 wherever the mask is False, so only the True spans get filled.
fig = go.Figure()

mask_fills = [
    # (mask column, legend name, fill color)
    ("in_motion_perf_mask", "In motion", "rgba(0, 100, 80, 0.2)"),
    ("time_diff_low_enough", "time_diff_low_enough", "rgba(100, 0, 80, 0.2)"),
]
for mask_col, legend_name, fill_color in mask_fills:
    fig.add_trace(go.Scatter(
        x=ts["date"],
        y=ts["odometer"] * ts[mask_col],
        fill="tozeroy",  # shade down to y=0
        mode="none",  # area only: no markers or lines for this trace
        name=legend_name,
        fillcolor=fill_color,
    ))

# The odometer curve itself, added after the shaded areas.
fig.add_trace(go.Scatter(x=ts["date"], y=ts["odometer"], mode="lines", name="odometer"))

fig.update_layout(
    title="Time Series Plot with Filled Area",
    xaxis_title="Date",
    yaxis_title="odometer",
    template="plotly_white",
)

fig.show()

In [None]:
# Run the charge/discharge helper on each vehicle's time series separately.
tss = tss.groupby("vin").apply(low_freq_compute_charge_n_discharge_vars)

In [None]:
def compute_soh_vars(tss:DF, max_soh:float=50) -> DF:
    """Add `expected_battery_energy` and a provisional `soh` column to the time series.

    `expected_battery_energy` is `soc * capacity / 100` and `soh` is
    `100 * battery_energy / expected_battery_energy`. Any `soh` strictly greater
    than `max_soh` is replaced by a missing value.

    NOTE(review): the default cut-off of 50 is the value that was hard-coded here;
    it looks low for a percentage -- confirm it once a better SoH estimation exists.

    Args:
        tss: Time series with `soc`, `capacity` and `battery_energy` columns.
        max_soh: Upper bound above which a `soh` value is discarded.

    Returns:
        A new frame with the two added columns; the input frame is left untouched.
    """
    # eval returns a new frame, so the caller's frame is not modified by the assignment below.
    with_soh:DF = (
        tss
        .eval("expected_battery_energy = soc * capacity / 100")
        .eval("soh = 100 * battery_energy / expected_battery_energy")
    )
    # Update once we have a better soh estimation
    above_cutoff = with_soh["soh"].gt(max_soh)
    with_soh["soh"] = with_soh["soh"].mask(above_cutoff, pd.NA)

    return with_soh

# Add the expected battery energy and the provisional SoH columns to the time series.
tss = compute_soh_vars(tss)

In [None]:
def plt_soc_n_discharge(ts:DF, vin="unknown"):
    """Show the SoC samples of one vehicle with its discharge periods shaded.

    Args:
        ts: Time series of a single vehicle with `date`, `soc` and `in_discharge` columns.
        vin: Vehicle identifier, only used in the figure title.
    """
    # soc * in_discharge is 0 wherever in_discharge is False, so only the discharges get filled.
    discharge_area = go.Scatter(
        x=ts["date"],
        y=ts["soc"] * ts["in_discharge"],
        fill="tozeroy",  # shade down to y=0
        mode="none",  # area only: no markers or lines for this trace
        name="in_discharge",
        fillcolor="rgba(100, 0, 80, 0.2)",
    )
    soc_points = go.Scatter(x=ts["date"], y=ts["soc"], mode="markers", name="soc")

    figure = go.Figure()
    figure.add_trace(discharge_area)
    figure.add_trace(soc_points)
    figure.update_layout(
        title=f"Time Series Plot with Filled Area of {vin}",
        xaxis_title="Date",
        yaxis_title="soc",
        template="plotly_white",
    )
    figure.show()

# Check the discharge flag on the first four vins of most_common_vins.
for vin in most_common_vins[:4]:
    plt_soc_n_discharge(tss.xs(vin), vin)

## EDA


### Extraction of discharges

In [None]:
# One row per (vin, in_discharge_perf_idx): only the samples flagged by in_discharge_perf_mask
# are kept, then each group is summarised by its first/last values and by
# series_start_end_diff (presumably the end-minus-start difference -- see core.pandas_utils).
discharges:DF = (
    tss
    .sort_index()
    .query("in_discharge_perf_mask")
    .groupby(["vin", "in_discharge_perf_idx"])
    .agg(
        # soc
        soc_start=("soc", "first"),
        soc_end=("soc", "last"),
        soc_diff=("soc", series_start_end_diff),
        # odometer
        odometer_start=("odometer", "first"),
        odometer_end=("odometer", "last"),
        distance=("odometer", series_start_end_diff),
        # time
        duration=("date", series_start_end_diff),
        age_in_years=("age_in_years", "first"),
        # vehicle
        model=("model", "first"),
        version=("version", "first"),
    )
    .reset_index()
    .assign(duration_sec=lambda per_discharge: per_discharge["duration"].dt.total_seconds())
    # soc_diff is negative over a discharge, so both ratios are negated to come out positive.
    .eval("distance_per_soc = distance / soc_diff * -1")
    .eval("duration_per_soc = duration_sec / soc_diff * -1")
)

### Overview

In [None]:
# Preview the per-discharge summary table.
discharges

In [None]:
# Share of non-missing values in each column of the discharges table.
discharges.notna().mean()

### Correlations visualizations

In [None]:
def scatter_discharges(discharges:DF, x:str, y:str):
    """Scatter-plot column `y` against column `x` of the discharges, with an overall OLS trendline.

    Rows whose `x` or `y` value is missing or infinite are dropped first. Then the
    rows whose `version` is 'R110' and the rows whose `distance_per_soc` is not
    strictly positive are excluded, whatever `x` and `y` are.

    Args:
        discharges: Per-discharge table; needs `version`, `distance_per_soc`, `x` and `y` columns.
        x: Name of the column plotted on the x axis.
        y: Name of the column plotted on the y axis.
    """
    plotted_cols = discharges.loc[:, [x, y]]
    # Infinite values (e.g. a ratio divided by a zero soc_diff) are treated like missing ones.
    unusable = plotted_cols.isna() | plotted_cols.isin([np.inf, -np.inf])
    discharges_to_plt = discharges[~unusable.any(axis="columns")]

    px.scatter(
        discharges_to_plt.query("version != 'R110'").query("distance_per_soc > 0"),
        x=x,
        y=y,
        trendline="ols",
        trendline_scope="overall"
    ).show()

# distance_per_soc against the start odometer, the discharge index and the vehicle age.
# Each pair is plotted once.
scatter_discharges(discharges, "odometer_start", "distance_per_soc")
scatter_discharges(discharges, "in_discharge_perf_idx", "distance_per_soc")
scatter_discharges(discharges, "age_in_years", "distance_per_soc")


In [None]:
# Pairwise correlations between the numeric columns, restricted to the 'R135' version.
corr = (
    discharges
    .query("version == 'R135'")
    .corr(numeric_only=True)
)
# Correlation of every numeric column with each per-soc ratio, in ascending order.
display(
    corr["distance_per_soc"].sort_values(),
    corr["duration_per_soc"].sort_values(),
)

## Conclusion

As of right now, it is fairly difficult to infer any useful insights about the vehicles from these trip stats.  
This is due to the noisy nature of trip stats, as multiple factors external to the vehicle affect them.  
To reduce this noise we *could* use some sort of pattern matching to only compare daily trips, which are easier to compare with each other as the route is (more or less) the same.   