# BMW processed time series Exploratory Data Analysis
The goal of this notebook is to develop a method for estimating the state of health (SoH) of the vehicles in the BMW fleet.

## Setup

### Imports

In [None]:
from datetime import datetime as DT

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pytz
from pandas import DataFrame as DF

from core.s3_utils import S3_Bucket
from core.config import *
from core.pandas_utils import *
from transform.processed_tss.main import get_processed_tss


### Data extraction

In [None]:
# Load the processed BMW time series.
# NOTE(review): force_update=True presumably bypasses a cached copy and recomputes the
# data on every run — confirm, and consider False for faster re-runs.
tss = get_processed_tss("bmw", force_update=True)
# Cumulative sum of `in_charge` over the whole frame in its current row order
# (it is not restarted per VIN).
tss = tss.eval("in_charge_perf_idx = in_charge.cumsum()")
# Show the available columns; this must be the last expression of the cell to be rendered.
tss.columns


In [None]:
# Preview the first ten rows of the time series.
tss.iloc[:10]

In [None]:
 # Reset the index to make it unique
# drop=True discards the previous index instead of keeping it as a column,
# leaving a fresh 0..n-1 range index.
tss = tss.reset_index(drop=True)
# Sanity check: labels that appear more than once in the new index (first occurrence excluded).
duplicate_indices = tss.index[tss.index.duplicated(keep="first")]
print("Duplicate indices:", duplicate_indices)

In [None]:
# Forward-fill missing odometer readings within each vehicle, never across VINs.
# The built-in grouped ffill is vectorised, whereas transform(lambda ...) calls Python
# once per group.
tss["odometer"] = tss.groupby("vin")["odometer"].ffill()



## Time series EDA

Let's list the variables and, for each one, the ratio of non-null values to the total number of rows.

In [None]:
# Fraction of non-null values per column (non-null count divided by the number of rows).
tss.count().div(len(tss))

In [None]:
# Single vehicle used for the per-vehicle plots.
VIN = "WBY1Z610407A12415"
# .copy() gives an independent frame, so adding columns to it later does not write into a
# slice of tss (which triggers pandas' SettingWithCopyWarning).
tss_unique = tss.query("vin == @VIN").copy()

# Seeded draw so the same vehicles are sampled after Restart & Run All.
SAMPLE_SEED = 42
SAMPLE_SIZE = 5
all_vins = tss["vin"].unique()
# choice(replace=False) raises when size exceeds the population, so cap it at the fleet size.
random_vins = np.random.default_rng(SAMPLE_SEED).choice(
    all_vins, size=min(SAMPLE_SIZE, len(all_vins)), replace=False
)

tss_sample = tss[tss["vin"].isin(random_vins)].copy()

## Print first graphs

In [None]:
# Estimated range against state of charge for the whole fleet, one colour per VIN.
px.scatter(data_frame=tss, x="soc", y="estimated_range", color="vin")

## SoH calculation

### First method on the estimated range


#### A few graphs before the calculation


In [None]:
# Same view as the first plot: estimated range versus SoC, coloured by VIN.
px.scatter(
    data_frame=tss,
    x="soc",
    y="estimated_range",
    color="vin",
)

In [None]:
# Estimated range versus SoC, coloured by the `in_charge` flag.
px.scatter(data_frame=tss, x="soc", y="estimated_range", color="in_charge")


-> No visible correlation between `in_charge` and `estimated_range`.

In [None]:
# SoC over time for the selected vehicle, coloured by the cumulative in-charge index.
px.scatter(
    data_frame=tss_unique,
    x="date",
    y="soc",
    color="in_charge_perf_idx",
)


### SoH calculation


In [None]:
def _soh_from_estimated_range(df):
    """Return ``estimated_range / (soc * range) * 100`` for every row of ``df``.

    Rows whose denominator ``soc * range`` is exactly zero yield NaN instead of
    +/-inf, so they do not distort later means or curve fits.

    Args:
        df: frame with ``estimated_range``, ``soc`` and ``range`` columns.

    Returns:
        A Series aligned on ``df.index``.
    """
    denominator = df["soc"] * df["range"]
    # where() keeps non-zero denominators and replaces zeros with NaN.
    return df["estimated_range"] / denominator.where(denominator != 0) * 100


tss["SoH"] = _soh_from_estimated_range(tss)
# tss_unique and tss_sample may be slices of tss; assign() builds a new frame instead of
# writing into the slice, which avoids pandas' SettingWithCopyWarning.
tss_unique = tss_unique.assign(SoH=_soh_from_estimated_range)
tss_sample = tss_sample.assign(SoH=_soh_from_estimated_range)


#### SoC / SoH


In [None]:
# SoC against the computed SoH for the whole fleet, one colour per VIN.
px.scatter(data_frame=tss, x="SoH", y="soc", color="vin")

In [None]:
# Keep only rows usable for the fit. dropna() removes missing values; the isfinite mask
# also removes +/-inf (e.g. an SoH computed with a zero denominator), which np.polyfit
# cannot handle.
tss_clean = tss.dropna(subset=['soc', 'SoH'])
tss_clean = tss_clean[np.isfinite(tss_clean[['soc', 'SoH']].astype(float)).all(axis=1)]

# Create a scatter plot
fig = px.scatter(tss_clean,
                 x="soc",
                 y="SoH",
                 color="vin",
                 title='Scatter plot with Trendline')

# Data for the trendline
x = tss_clean['soc'].astype(float)
y = tss_clean['SoH'].astype(float)
# A straight line needs at least two distinct x values; fail with a clear message
# rather than letting the fit error out or return a degenerate line.
if x.nunique() < 2:
    raise ValueError("Need at least two distinct 'soc' values to fit a trendline")

# Fit a linear model (least-squares polynomial of degree 1)
coefficients = np.polyfit(x, y, 1)
trendline = np.poly1d(coefficients)

# Extract the slope and intercept
slope, intercept = coefficients

# A straight line is fully defined by its two end points, so draw only those
# instead of one vertex per data row.
x_ends = np.array([x.min(), x.max()])
fig.add_trace(go.Scatter(
    x=x_ends,
    y=trendline(x_ends),
    mode='lines',
    name='Trendline'
))

# Add an annotation for the trendline equation
equation_text = f"y = {slope:.8f}x + {intercept:.2f}"
fig.add_annotation(
    x=x.max(),  # Position the annotation at the maximum x value
    # Evaluate the line at that same x; max(trendline(x)) would sit at the wrong end
    # of the line whenever the slope is negative.
    y=trendline(x.max()),
    text=equation_text,
    showarrow=False,
    font=dict(size=12, color="black"),
    xanchor='right'
)

# Show the plot
fig.show()
print(equation_text)

#### In charge / SoH

In [None]:
# SoH over time for the selected vehicle, coloured by the `in_charge` flag.
px.scatter(data_frame=tss_unique, x="date", y="SoH", color="in_charge")

#### Charging 


In [None]:
# Restrict to rows where `in_charge` is True, then plot SoH over time coloured by
# `charging_plug_connected`.
px.scatter(
    data_frame=tss.query("in_charge == True"),
    x="date",
    y="SoH",
    color="charging_plug_connected",
)

#### SoH / odometer


In [None]:
# SoH against odometer reading for the whole fleet, one colour per VIN.
px.scatter(data_frame=tss, x="odometer", y="SoH", color="vin")

-> **TODO:** find a better fill method for `odometer`.

#### Improving the calculation 


In [None]:
# Filtering on the number of point of SoH 
non_null_estimated_range = tss.dropna(subset=['estimated_range'])
vin_counts = non_null_estimated_range['vin'].value_counts()
vins_with_at_least_10_non_null = vin_counts[vin_counts >= 100].index
filtered_tss = tss[tss['vin'].isin(vins_with_at_least_10_non_null)]

### Conclusion

In [None]:
# One row per VIN: mean of the numeric signals, plus the VIN itself kept as a regular
# column (it is also the group index) so it can be used for colouring.
agg_spec = dict.fromkeys(["soc", "estimated_range", "range", "odometer"], "mean")
agg_spec["vin"] = "first"
agg_spec["SoH"] = "mean"
aggregated_tss = filtered_tss.groupby("vin").agg(agg_spec)

# Mean SoH against mean odometer, one point per vehicle.
px.scatter(data_frame=aggregated_tss, x="odometer", y="SoH", color="vin")

### Individual Study 

In [None]:
# Point the single-vehicle view at another VIN; this rebinds both VIN and tss_unique.
VIN = "WBY8P210607G05514"
tss_unique = tss.query("vin == @VIN")

# SoH over time for that vehicle, coloured by the `in_charge` flag.
px.scatter(data_frame=tss_unique, x="date", y="SoH", color="in_charge")

In [None]:
# SoH against SoC for the selected vehicle, coloured by the `in_charge` flag.
px.scatter(
    data_frame=tss_unique,
    x="soc",
    y="SoH",
    color="in_charge",
)