# Ford raw time series EDA
The goals of this notebook are to:
- Visualize the Ford data and build an intuition of what it looks like.
- Find a lead to estimate the SoH (state of health).

## Imports

In [None]:
import logging
from datetime import datetime as DT
from datetime import timedelta as TD
from dateutil import parser
from dotenv import load_dotenv
import os

from rich import print
import pandas as pd
from pandas import Series
from pandas import DataFrame as DF
import plotly.express as px

from core.s3_utils import S3_Bucket
from core.config import *


## Setup

In [None]:
# Load the fleet description and keep only the Ford vehicles whose Type is not 'x'.
# The frame is indexed by VIN, and the VIN is also kept as a regular "vin" column.
fleet_info = (
    pd.read_csv(
        "../ayvens/fleet_info.csv",
        usecols=["VIN", "Make", "Model", "Type", "Capacity"],
        dtype={"Make": "string"},
    )
    .rename(columns={"VIN": "vin"})
    # Lower-case the make so the filter below is case-insensitive.
    .assign(Make=lambda fleet: fleet["Make"].str.lower())
    .query("Make == 'ford' & Type != 'x'")
    .set_index("vin", drop=False)
)
# Number of vehicles per (model, capacity) pair.
fleet_info[["Model", "Capacity"]].value_counts()

In [None]:
# Collect the raw Ford time series stored on S3.

# S3 settings read from environment variables.
# NOTE(review): os.getenv returns None for unset variables, and load_dotenv is imported
# but not called in the visible cells -- confirm the variables are loaded elsewhere.
PROD_CREDS = {
    "bucket_name":os.getenv("PROD_S3_BUCKET"),
    "aws_access_key_id":os.getenv("PROD_S3_KEY"),
    "aws_secret_access_key":os.getenv("PROD_S3_SECRET"),
}

# NOTE(review): PROD_CREDS is not passed to S3_Bucket() here -- confirm how the bucket
# obtains its credentials, or whether PROD_CREDS is used further down.
bucket = S3_Bucket()

def get_ford_raw_ts(vin:str) -> DF:
    """Read the raw time series parquet of one Ford vehicle from the bucket.

    Parameters:
    - vin: str - VIN of the vehicle; used to build the parquet key.

    Returns:
    - DF - The time series indexed by its "date" column (which is also kept as a
      regular column), sorted on that index.
    """
    raw_ts = bucket.read_parquet_df(f"raw_ts/ford/time_series/{vin}.parquet")
    date_indexed_ts = raw_ts.set_index("date", drop=False)
    return date_indexed_ts.sort_index()
# Map the dotted raw column names to the short names used in the rest of the notebook.
RENAME_COLS_DICT = {
    "diagnostics.odometer": "odometer",
    "charging.battery_energy": "battery_energy",
    "charging.battery_level": "soc",
    "charging.status": "status",
}

# Load one time series per Ford vehicle and tag every row with the vehicle's VIN,
# type and capacity taken from fleet_info.
raw_ts_by_vin = {}
load_errors = {}  # vin -> exception raised while loading that vehicle's time series
for vin, vehicle_info in fleet_info.iterrows():
    try:
        raw_ts_by_vin[vin] = (
            get_ford_raw_ts(vin)
            .assign(
                vin=vin,
                type=vehicle_info["Type"],
                capacity=vehicle_info["Capacity"],
            )
            .rename(columns=RENAME_COLS_DICT)
        )
    except Exception as e:
        # Best effort: a vehicle without readable data must not stop the whole load,
        # but the failure is recorded instead of being silently dropped.
        load_errors[vin] = e

if load_errors:
    logging.warning(
        "Skipped %d of %d vehicles whose raw time series could not be loaded "
        "(details in `load_errors`).",
        len(load_errors),
        len(fleet_info),
    )
if not raw_ts_by_vin:
    raise ValueError("No Ford raw time series could be loaded; inspect `load_errors`.")

# Stack all vehicles into one frame; the outer index level holds the VIN.
# NOTE: "vin" is then both an index level and a column (from the assign above), which
# pandas treats as ambiguous in operations that look a key up by name.
raw_tss = pd.concat(raw_ts_by_vin, axis="index", keys=list(raw_ts_by_vin), names=["vin"])

raw_tss["type"].unique()

In [None]:
# Display the column labels of the concatenated time series.
raw_tss.columns

## Time series processing
We start by plotting a few graphs to get an overview of the available data.

In [None]:
import plotly.graph_objects as go

def twinx(df, cols_y1, cols_y2, x_col=None, title_y1="Primary Y-Axis", title_y2="Secondary Y-Axis"):
    """
    Creates a Plotly figure with two y-axes (twin y-axis plot).
    
    Parameters:
    - df: pd.DataFrame - The dataframe containing the data.
    - cols_y1: list or str - Column name(s) for the primary y-axis (left side).
    - cols_y2: list or str - Column name(s) for the secondary y-axis (right side).
    - x_col: str (optional) - The column name to be used for the x-axis. If not provided, index is used.
    - title_y1: str (optional) - Title of the primary y-axis.
    - title_y2: str (optional) - Title of the secondary y-axis.
    
    Returns:
    - fig: go.Figure - Plotly figure with dual y-axis.
    """
    # Determine the x-axis data
    x_data = df.index if x_col is None else df[x_col]

    fig = go.Figure()

    # Left-axis traces go on "y", right-axis traces on "y2"
    for axis_id, cols in (("y", cols_y1), ("y2", cols_y2)):
        # A bare string would otherwise be iterated character by character
        if isinstance(cols, str):
            cols = [cols]
        for col in cols:
            fig.add_trace(go.Scatter(x=x_data, y=df[col], name=col, yaxis=axis_id))

    # Update layout for dual y-axis.
    # The title colour is set through title.font: the older `titlefont` shorthand is
    # deprecated and rejected by recent plotly releases.
    fig.update_layout(
        yaxis=dict(title=dict(text=title_y1, font=dict(color="blue"))),
        yaxis2=dict(
            title=dict(text=title_y2, font=dict(color="red")),
            overlaying="y",
            side="right",
        ),
        xaxis=dict(title=dict(text=x_col if x_col else "Index")),
    )

    return fig


In [None]:
# Vehicle used as the worked example in the cells below.
EXAMPLE_VIN = "WF0TK1EM2MMA16501"

# Take an explicit copy: later cells add columns to `ts`, and assigning into a
# cross-section of `raw_tss` would otherwise raise SettingWithCopyWarning.
ts = raw_tss.xs(EXAMPLE_VIN, level=0).copy()

# NOTE(review): not used by the visible cells -- confirm it is needed further down.
COLS_TO_DISPLAY = [
    'battery_energy',
    'soc',
]

# Battery energy on the left axis, battery level on the right axis, over time.
twinx(ts, ['battery_energy'], ["soc"], x_col="date")

The battery energy is a direct function of the battery level.
This gives us a first lead for calculating the SoH.

In [None]:
# Inspect the "capacity" column before using it in arithmetic: its dtype, its distinct
# values and how often each one occurs. (A Series has no `.type()` method; `.dtype` is
# the attribute that holds the column's data type.)
print("Data type of 'capacity':", ts["capacity"].dtype)
print("Unique values in 'capacity':", ts["capacity"].unique())
print("Value counts of 'capacity':\n", ts["capacity"].value_counts())



In [None]:
# Build the new columns with .assign instead of assigning into `ts` in place: `ts` comes
# from a cross-section of `raw_tss`, and in-place column assignment on such a slice
# raises SettingWithCopyWarning.
# Values of "capacity" that cannot be parsed as numbers become NaN.
ts = ts.assign(capacity=pd.to_numeric(ts["capacity"], errors="coerce"))

# Rows with a battery level of 0 would make the ratio infinite; mask them so that their
# SoH is NaN instead.
positive_soc = ts["soc"].where(ts["soc"] > 0)

# SoH estimate: energy currently stored, scaled to a full battery, relative to capacity.
# NOTE(review): confirm the units of battery_energy / soc / capacity behind the * 100 factor.
ts = ts.assign(SoH=ts["battery_energy"] / positive_soc / ts["capacity"] * 100)
px.scatter(ts, x="soc", y=["SoH"]).show()

Depending on the battery level, the SoH is between 0.85 and 0.95.
We can add a filter to have a better understanding of the SoH.

## Adding a filter on the battery level

In [None]:
# Keep only the rows whose battery level (`soc`) is above 0.5, then redraw the SoH scatter.
is_high_soc = ts["soc"] > 0.5
tss_filtered = ts.loc[is_high_soc]
px.scatter(tss_filtered, x="soc", y=["SoH"]).show()

## Adding a filter on the charge/discharge status