# BMW raw time series Exploratory Data Analysis
The goal of this notebook is to validate the integrity of the data provided by the BMW API.  

## Setup

### Imports

In [None]:
from datetime import datetime as DT
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import pytz
from pandas import DataFrame as DF

from core.s3_utils import S3_Bucket
from core.config import *
from core.pandas_utils import *
from transform.raw_tss.main import get_raw_tss

### Data extraction

In [None]:
# Load the raw BMW time series through the imported get_raw_tss helper.
raw_tss = get_raw_tss("bmw")
# Show which columns were received.
raw_tss.columns

In [None]:
# Explicit target dtype of the raw columns, grouped by kind.
_float_columns = [
    "charging_ac_ampere",
    "charging_ac_voltage",
    "coolant_temperature",
    "kombi_remaining_electric_range",
    "mileage",
    "soc_customer_target",
    "soc_hv_header",
    "soc_target_charging_time_forecast",
]
_category_columns = [
    "charging_method",
    "charging_plug_connected",
    "charging_status",
    "teleservice_status",
]
_raw_dtypes = {
    **dict.fromkeys(_float_columns, "float"),
    **dict.fromkeys(_category_columns, "category"),
    "vin": "string",
}

# Measurement timestamp: the parsed `date_of_value`, falling back to the raw
# `date` column on the rows where `date_of_value` is missing.
_measurement_date = pd.to_datetime(raw_tss["date_of_value"], format="mixed").where(
    raw_tss["date_of_value"].notna(), raw_tss["date"]
)

# Cast, attach the timestamp, drop the now redundant source column, shorten two
# column names and order the rows per vehicle and time.
tss = (
    raw_tss.astype(_raw_dtypes)
    .assign(date=_measurement_date)
    .drop(columns=["date_of_value"])
    .rename(columns={"mileage": "odometer", "soc_hv_header": "soc"})
    .sort_values(by=["vin", "date"])
)

## Pre-processing


In [None]:
# Count the non-null values of every column.
column_counts = tss.count()

# Keep only the columns that hold at least one value (count > 0).
columns_to_keep = column_counts[column_counts > 0].index
print(columns_to_keep)

# Restrict the frame to those columns. The explicit copy makes `tss` an
# independent frame instead of a selection of the previous one.
tss = tss.loc[:, columns_to_keep].copy()

# Preview the data labelled by VIN. The indexed frame is deliberately NOT assigned
# back to `tss`: the original `tss.set_index("vin", drop=False)` call discarded its
# result too, and an index level named "vin" next to the "vin" column would make
# the later `tss.groupby("vin")` ambiguous.
tss.set_index("vin", drop=False).head()

In [None]:
# Cache, for every VIN, the number of non-null values of each raw variable.
# The directory is created with pathlib rather than a `! mkdir -p` shell call so
# the cell does not depend on a POSIX shell being available.
cache_dir = Path("data_cache")
cache_dir.mkdir(parents=True, exist_ok=True)
var_counts = raw_tss.groupby("vin").count()
var_counts.to_csv(cache_dir / "var_counts_per_vin.csv")

## Time series EDA

In [None]:
# To plot a single, specific vehicle, use `tss_unique`.
# .copy() makes it an independent frame, so columns can be added to it later
# without triggering a SettingWithCopyWarning.
tss_unique = tss[tss["vin"] == "WBY1Z610407A12415"].copy()  # a car that has good data

# To plot a random sample of vehicles, use `tss_sample`.
# The generator is seeded so the same VINs are drawn on every "Restart & Run All".
vin_rng = np.random.default_rng(42)
all_vins = np.asarray(tss["vin"].unique())
# Never request more VINs than exist: sampling without replacement raises otherwise.
selected_vins = vin_rng.choice(all_vins, size=min(5, len(all_vins)), replace=False)
tss_sample = tss[tss["vin"].isin(selected_vins)].copy()

### Available data 


In [None]:
# Share of non-null values per column.
tss.count().div(len(tss))

In [None]:
# How many cars have at least one non-null coolant_temperature?
vins_with_coolant = tss.loc[tss["coolant_temperature"].notna(), "vin"]
cars_with_range = vins_with_coolant.nunique()
total_cars = tss["vin"].nunique()
print(f"We have data for {cars_with_range} out of {total_cars} cars")
# List the VINs concerned.
print(vins_with_coolant.unique())

In [None]:
# Which distinct values does teleservice_status take?
tss["teleservice_status"].unique()

In [None]:
# basic_model_range over time for the single vehicle held in tss_unique.
px.scatter(tss_unique, x="date", y="basic_model_range")

## Printing first graphs


Let's list the variables and the respective count ratio.

In [None]:
# One odometer-over-time panel per sampled VIN, stacked in a single column.
odometer_fig = px.scatter(
    tss_sample,
    x="date",
    y="odometer",
    facet_col="vin",
    facet_col_wrap=1,
    facet_row_spacing=0.01,  # Ensure the spacing is smaller than 0.025641
)
# Adjust the height to fit the rows.
odometer_fig.update_layout(height=500)
odometer_fig

In [None]:
# battery_voltage over time, one panel per sampled VIN.
px.scatter(tss_sample, x="date", y="battery_voltage", facet_col="vin", facet_col_wrap=1)

We can see that the plots seem skewed.  
Let's see why.  

In [None]:
# Count, per column, the values recorded before 1 August 2024 (UTC).
august_2024 = DT(2024, 8, 1, tzinfo=pytz.UTC)
mask = tss["date"].lt(august_2024)
tss.loc[mask].count()

In [None]:
# Distribution of the measurement dates, to spot unexpectedly early points.
px.box(tss, x="date")

We can see that there are a few points before August, which is pretty surprising given that the BMW POC started much later than this (late September).

In [None]:
# requested_vars = (
#     DF.from_dict(data=VARIABLES_THAT_WE_ASKED_FOR)
#     .drop(columns=["key_type"])
# )

# display(requested_vars)

In [None]:
# One row per column of `tss`: its name ("key_name") and its pandas dtype ("unit").
# The axis is named before reset_index() so the name column really is called
# "key_name"; the former rename of a "key" column matched nothing, because
# reset_index() never produces a column with that name here.
received_vars = (
    tss.dtypes.to_frame("unit")
    .rename_axis("key_name")
    .reset_index(drop=False)
)
display(received_vars)

In [None]:
# Raw rows whose `date_of_value` is missing.
raw_tss[raw_tss["date_of_value"].isna()]
# raw_tss.query("date_of_value == 'None'")

In [None]:
# Columns remaining after pre-processing.
tss.columns

In [None]:
# First ten rows of the pre-processed time series.
tss.head(10)

In [None]:
# Scatter plot of the battery voltage against the state of charge.
fig = px.scatter(
    tss,
    # The x axis was "date", which contradicted the title, the "soc" axis label and
    # the fact that the date is already supplied as hover information.
    x="soc",
    y="battery_voltage",
    title="Battery Voltage vs State of Charge",
    labels={
        "soc": "State of Charge (%)",
        "battery_voltage": "Battery Voltage (V)",
    },
    hover_data=["date"],  # show the measurement date on hover
)
fig.show()

## First attempt on the SoH



### Using the avg_electric_range_consumption

In [None]:
# How many cars have at least one non-null avg_electric_range_consumption?
vins_with_consumption = tss.loc[tss["avg_electric_range_consumption"].notna(), "vin"]
cars_with_range = vins_with_consumption.nunique()
total_cars = tss["vin"].nunique()
print(f"We have data for {cars_with_range} out of {total_cars} cars")
# List the VINs concerned.
print(vins_with_consumption.unique())

-> The data is only available for i4 cars. The avg_electric_range_consumption is not useful for the SoH calculation.

### Using  the kombi_remaining_electric_range


In [None]:
# How many cars have at least one non-null kombi_remaining_electric_range?
vins_with_remaining_range = tss.loc[
    tss["kombi_remaining_electric_range"].notna(), "vin"
]
cars_with_range = vins_with_remaining_range.nunique()
total_cars = tss["vin"].nunique()
print(f"We have data for {cars_with_range} out of {total_cars} cars")
# List the VINs concerned.
print(vins_with_remaining_range.unique())

-> The data is available for all cars

In [None]:
# First SoH proxy: remaining electric range per unit of state of charge.
# NOTE(review): rows where soc is 0 produce inf (or NaN) here - confirm whether
# they should be filtered out beforehand.
tss["SoH"] = tss["kombi_remaining_electric_range"] / tss["soc"]

# tss_sample and tss_unique were sliced from tss before "SoH" existed, so each one
# needs its own column. assign() returns a new frame, which avoids writing into a
# slice (SettingWithCopyWarning) and gives tss_unique the "SoH" column that the
# single-car plots read.
tss_sample = tss_sample.assign(
    SoH=tss_sample["kombi_remaining_electric_range"] / tss_sample["soc"]
)
tss_unique = tss_unique.assign(
    SoH=tss_unique["kombi_remaining_electric_range"] / tss_unique["soc"]
)

#### Study for one car


In [None]:
# SoH proxy against SoC for the single vehicle, coloured by charging method.
px.scatter(
    tss_unique,
    x="soc",
    y="SoH",
    color="charging_method",
)

-> There doesn't seem to be any difference between the charging methods.

In [None]:
# SoH proxy against SoC for the single vehicle, coloured by charging status.
px.scatter(
    tss_unique,
    x="soc",
    y="SoH",
    color="charging_status",
)

-> No difference whether the car is charging or not.


#### Study for all the cars 

In [None]:
# SoH proxy against odometer for every vehicle, one colour per VIN.
px.scatter(
    tss,
    x="odometer",
    y="SoH",
    color="vin",
)

In [None]:
# SoH proxy scaled by 100. It is kept in a local series instead of overwriting
# tss["SoH"], so the unscaled column used by the earlier SoH cells stays
# consistent when those cells are re-run. plotly.express is already imported as
# `px` in the imports cell, so it is not imported again here.
# NOTE(review): the plot labels this value "Mean SoH (%)" - confirm the unit.
soh_scaled = tss["kombi_remaining_electric_range"] / tss["soc"] * 100

# Per vehicle: mean scaled SoH and highest odometer reading.
aggregated_data = (
    tss[["vin", "odometer"]]
    .assign(SoH=soh_scaled)
    .groupby("vin")
    .agg({"SoH": "mean", "odometer": "max"})
    .reset_index()
)

# Scatter plot of the mean SoH against the maximum odometer, one point per VIN.
fig = px.scatter(
    aggregated_data,
    x="odometer",
    y="SoH",
    color="vin",
    hover_data=["vin"],
    title="Mean SoH vs Maximum Odometer per Vehicle",
    labels={"odometer": "Maximum Odometer Reading", "SoH": "Mean SoH (%)"},
)

# Show the plot
fig.show()

#### Adding filters


##### Filtering for cars with a SoC > 40%

### Using the charging 
charging_ac_ampere / charging_ac_voltage

## Data extraction pipelines comparison
Assuming that the data provided by High Mobility comes from the BMW API, we will compare these two pipelines.  
As of writing this notebook markdown cell, the two data extraction pipelines are (give or take):  
- BMW API - High Mobility - [Tom's ingestion](../../../ingestion/) - My high_mobility_raw_ts
- BMW API - Theophile's ingestion - My bmw_raw_tss - The preprocessing code cell above (unlikely to destroy or affect any values)

Let's call them long and direct pipelines.

### Long pipeline EDA
We will extract the raw time series of all the vins, even the ones we didn't pull from the BMW API.

In [None]:
# S3 bucket handle; get_bmw_hm_raw_tss uses it to list keys and read parquet files.
bucket = S3_Bucket()


def get_bmw_hm_raw_tss() -> DF:
    """Load every parquet time series listed under ``raw_ts/bmw/time_series/``.

    Keys are listed through the module-level ``bucket`` and only those ending in
    ``.parquet`` are read. Each file yields one frame, labelled by its file name
    up to the first dot (presumably the VIN, since the outer index level is
    named ``vin``).

    Returns:
        The concatenation of all frames, indexed by (``vin``, ``idx``). When no
        parquet key is found, a message is printed and an empty frame with the
        ``KEY_LIST_COLUMN_NAMES`` columns is returned instead.
    """
    parquet_keys = bucket.list_keys("raw_ts/bmw/time_series/")
    # Only retain the .parquet files.
    parquet_keys = parquet_keys[parquet_keys.str.endswith(".parquet")]
    if not len(parquet_keys):
        print("no keys found!!!!!!!!")
        return DF(None, columns=KEY_LIST_COLUMN_NAMES)
    # Split every key on "/" into named columns, keeping the full key in "key".
    # NOTE(review): assumes keys look like raw_ts/bmw/time_series/<file>.parquet
    # and that str_split_and_retain_src returns one column per name - confirm.
    key_parts = str_split_and_retain_src(
        parquet_keys,
        "/",
        col_names=["key", "dtype_folder", "brnad", "dtype_folder2", "file"],
    )
    # Read each file once, keyed by its file name without extension.
    frames_by_vin = {}
    for file_name, s3_key in zip(key_parts["file"], key_parts["key"]):
        frames_by_vin[file_name.split(".")[0]] = bucket.read_parquet_df(s3_key)
    return pd.concat(
        frames_by_vin, axis="index", keys=frames_by_vin.keys(), names=["vin", "idx"]
    )


# Load the raw time series of the long (High Mobility) pipeline.
long_raw_tss = get_bmw_hm_raw_tss()

# Display the resulting frame.
long_raw_tss

In [None]:
# Share of non-null values per column in the long pipeline's data.
long_raw_tss.count().div(len(long_raw_tss))

Looking at the variables in the long_raw_tss, or rather the lack thereof, it is pretty obvious that the direct pipeline is more appropriate.  

## Conclusion

We have a fair bit of missing values compared to the ones that we asked for in the direct data pipeline.  
The "High Mobility pipeline" is even worse, so we are already better off with the direct one.  