# Volvo processed time series: Exploratory Data Analysis
The goal of this notebook is to validate a first SoH (State of Health) estimate computed from the Volvo processed time series.

## Setup

### Imports

In [None]:
from datetime import datetime as DT

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pytz
from pandas import DataFrame as DF

from core.s3_utils import S3_Bucket
from core.config import *
from core.pandas_utils import *
from transform.processed_tss.main import get_processed_tss
from transform.raw_tss.main import get_raw_tss
from transform.fleet_info.main import fleet_info

### Data extraction

In [None]:
# List the column names of `fleet_info` (imported from transform.fleet_info.main)
# to see which fields are available.
fleet_info.columns

In [None]:
# Load the processed time series for "volvo-cars".
# NOTE(review): force_update=True presumably rebuilds the data instead of reusing a
# cached copy — confirm, and turn it off if re-running the notebook becomes slow.
tss = get_processed_tss("volvo-cars", force_update=True)
# Show which columns are available for the analysis.
tss.columns

## Time series EDA

In [None]:
# Single-vehicle view: this VIN is known to have good data.
tss_unique = tss[tss["vin"] == "YV1XZEDVEM2478472"]

# Multi-vehicle view: a random sample of VINs.
# The generator is seeded so the same vehicles are drawn on every
# Restart & Run All, and the sample size is capped at the number of available
# VINs so the draw without replacement cannot fail on a small fleet.
sample_rng = np.random.default_rng(42)
all_vins = tss["vin"].unique()
selected_vins = sample_rng.choice(all_vins, size=min(5, len(all_vins)), replace=False)
tss_sample = tss[tss["vin"].isin(selected_vins)]


### Available data 


In [None]:
# Share of non-null values per column (1.0 means the column is always filled).
tss.notna().mean()

## Printing first graphs


Let's list the variables and the respective count ratio.

In [None]:
# Estimated range over time, one colour per vehicle.
# The title and axis labels describe the columns that are actually plotted
# (x is the date, not the odometer).
fig = px.scatter(
    tss,
    x="date",
    y="estimated_range",
    title="Estimated range over time",
    color="vin",
    labels={
        "date": "Date",
        "estimated_range": "Estimated Range (km)",
    },
)
fig.show()



## First attempt on the SoH



### Using the estimated_range

In [None]:
# How many cars report at least one non-null estimated_range?
vins_with_range = tss.loc[tss["estimated_range"].notna(), "vin"]
cars_with_range = vins_with_range.nunique()
total_cars = tss["vin"].nunique()
print(f"We have data for {cars_with_range} out of {total_cars} cars")
# Also list the VINs concerned.
print(vins_with_range.unique())

-> The data is available for all cars

In [None]:
# Per-version correction factors for the SoH estimate, keyed by the value of the
# 'version' column. They are all neutral (1) at this stage and the formula below
# does not use them yet; they only matter where the coefficient is looked up
# explicitly.
# Keys are written in lower case throughout so that lookups are consistent.
version_coefficient = {
    'p8 awd r-design': 1,
    'twin pure electric pro': 1,
    'pure electric plus': 1,
    'p8 awd': 1,
    'pure electric 170 kw plus': 1,
    # Add more versions and coefficients as needed
}

# Raw SoH estimate: estimated range, divided by the state of charge, divided by
# the 'range' column.
# NOTE(review): 'range' is presumably the nominal range of the model — confirm.
tss = tss.eval("SoH = estimated_range / soc / range")
tss_sample = tss_sample.eval("SoH = estimated_range / soc / range")

In [None]:
# Normalised value distribution (missing values included) of the SoH and of
# each column that enters its formula.
for column in ("SoH", "soc", "range", "estimated_range"):
    display(tss[column].value_counts(dropna=False, normalize=True))

#### Study for one car


In [None]:
# Re-select the reference car (known to have good data) so that it carries the
# SoH column computed above.
tss_unique = tss.loc[tss["vin"] == "YV1XZEDVEM2478472"]

# SoH over time for that single car.
px.scatter(tss_unique, x="date", y="SoH", color="vin")

In [None]:
# SoH over time for the whole fleet, one colour per vehicle.
# The frame is passed as-is: only the date, SoH and vin columns are plotted, so
# no derived column is needed.
px.scatter(
    tss,
    x="date",
    y="SoH",
    color="vin",
)

In [None]:
# Discard the current index (drop=True) so the frame has a plain positional index
# before grouping.
# NOTE(review): presumably 'vin' could otherwise linger in the index — confirm
# whether this reset is still required.
tss.reset_index(drop=True, inplace=True)

# One row per vehicle: mean SoH, highest odometer reading and first version seen.
per_vin = tss.groupby('vin')
aggregated_data = per_vin.agg(
    SoH=('SoH', 'mean'),
    odometer=('odometer', 'max'),
    version=('version', 'first'),
).reset_index()

# Mean SoH against the highest odometer reading, coloured by version.
axis_labels = {
    'odometer': 'Maximum Odometer Reading',
    'SoH': 'Mean SoH (%)'
}
fig = px.scatter(
    aggregated_data,
    x='odometer',
    y='SoH',
    color="version",
    hover_data=['vin'],
    title='Mean SoH vs Maximum Odometer per Vehicle',
    labels=axis_labels,
)
fig.show()

#### Adding filters


In [None]:
# Keep only the rows flagged by in_discharge_perf_mask for the sampled vehicles,
# then plot their SoH over time, coloured by discharge index.
sample_discharges = tss_sample.query("in_discharge_perf_mask == True")
px.scatter(sample_discharges, x="date", y="SoH", color="in_discharge_perf_idx")

In [None]:
#

### Studying individually

In [None]:
# Same view for the single reference car: rows flagged by in_discharge_perf_mask,
# SoH over time, coloured by discharge index.
unique_discharges = tss_unique.query("in_discharge_perf_mask == True")
px.scatter(unique_discharges, x="date", y="SoH", color="in_discharge_perf_idx")

Depending on the number of discharges, the accuracy won't be the same.
-> Let's see what filter we need to apply.

In [None]:
# Highest discharge index reached by each vehicle, shown as a histogram to see
# how the number of discharges is distributed across the fleet.
agregate_tss = tss.groupby("vin")[["in_discharge_perf_idx"]].max()
agregate_tss.plot.hist(y="in_discharge_perf_idx")

-> We can set a minimum of 5 discharges per vehicle to be able to calculate the SoH.

In [None]:
# Step 1: highest discharge index reached by each vehicle.
max_in_discharge_perf_idx = tss.groupby('vin')['in_discharge_perf_idx'].max()

# Step 2: keep the vehicles whose highest discharge index is strictly greater than 5.
valid_vins = max_in_discharge_perf_idx[max_in_discharge_perf_idx > 5].index

# Step 3: restrict the time series to those vehicles. The explicit copy makes
# filtered_tss an independent frame, so adding a column below does not write into
# a slice of tss (SettingWithCopyWarning).
filtered_tss = tss[tss['vin'].isin(valid_vins)].copy()

# Step 4: SoH = estimated_range / soc / range, scaled by the per-version
# coefficient (1.0 for versions missing from version_coefficient). The computation
# is done column-wise instead of row by row, which also works when no row is
# selected. Rows outside the discharge mask get NaN.
in_discharge = filtered_tss["in_discharge_perf_mask"].eq(True).fillna(False).astype(bool)
coefficient = (
    filtered_tss["version"]
    .astype(object)
    .map(version_coefficient)
    .fillna(1.0)
    .astype(float)
)
raw_soh = filtered_tss["estimated_range"] / filtered_tss["soc"] / filtered_tss["range"]
filtered_tss["SoH"] = (raw_soh * coefficient).where(in_discharge)

In [None]:
# Per-vehicle summary: mean SoH, highest odometer reading and first version seen.
aggregated_data = filtered_tss.groupby('vin').agg({
    'SoH': 'mean',
    'odometer': 'max',
    'version': 'first'
}).reset_index()

# Fit the linear trend on finite points only: np.polyfit does not skip NaN or
# infinite values, so vehicles without a usable mean SoH are left out of the fit
# (they are still passed to the scatter plot below).
fit_data = (
    aggregated_data[['odometer', 'SoH']]
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .sort_values('odometer')
)
x = fit_data['odometer']
y = fit_data['SoH']

# Degree-1 least-squares fit; coefficients are ordered (slope, intercept).
coefficients = np.polyfit(x, y, 1)
trendline = np.poly1d(coefficients)
slope, intercept = coefficients

# Scatter plot of mean SoH against the highest odometer reading.
fig = px.scatter(
    aggregated_data,
    x='odometer',
    y='SoH',
    color="version",
    hover_data=['vin'],
    title='Mean SoH vs Maximum Odometer per Vehicle',
    labels={
        'odometer': 'Maximum Odometer Reading',
        'SoH': 'Mean SoH (%)'
    }
)

# Overlay the fitted line.
fig.add_trace(go.Scatter(
    x=x,
    y=trendline(x),
    mode='lines',
    name='Trendline'
))

# Annotate the line with its equation at its right-hand end. The y position is
# the trendline evaluated at the largest x, so the label sits on the line
# whatever the sign of the slope. The slope is a per-odometer-unit value and is
# therefore printed with 8 decimals; 2 decimals would round it to 0.00.
x_end = x.max()
equation_text = f"y = {slope:.8f}x + {intercept:.2f}"
fig.add_annotation(
    x=x_end,
    y=trendline(x_end),
    text=equation_text,
    showarrow=False,
    font=dict(size=12, color="black"),
    xanchor='right'
)

fig.show()

--> We need to adjust the coefficients

In [None]:
# Per-version correction factors applied to the raw SoH ratio, keyed by the value
# of the 'version' column. Versions missing from this mapping fall back to 1.0
# where the coefficient is looked up.
# NOTE(review): the values look hand-tuned after the first trend plot — record how
# they were derived.
version_coefficient = {
    'p8 awd r-design': 1.2,
    'twin pure electric pro': 1.4,
    'pure electric plus': 1.2,
    'p8 awd': 1.2, 
    'pure electric 170 kw plus': 1.1, 
    # Add more versions and coefficients as needed
}

#### Recalculating the SoH


In [None]:
# Step 1: highest discharge index reached by each vehicle.
max_in_discharge_perf_idx = tss.groupby('vin')['in_discharge_perf_idx'].max()

# Step 2: keep the vehicles whose highest discharge index is strictly greater than 5.
valid_vins = max_in_discharge_perf_idx[max_in_discharge_perf_idx > 5].index

# Step 3: restrict the time series to those vehicles and to the rows flagged by
# the discharge mask. The explicit copy makes filtered_tss an independent frame,
# so adding a column below does not trigger SettingWithCopyWarning.
filtered_tss = (
    tss[tss['vin'].isin(valid_vins)]
    .query("in_discharge_perf_mask == True")
    .copy()
)


def calculate_soh(row):
    """Return estimated_range / soc / range scaled by the version coefficient.

    Args:
        row: either a single record (as passed by ``DataFrame.apply(axis=1)``)
            or a whole DataFrame. It must provide the "version",
            "estimated_range", "soc" and "range" fields.

    Returns:
        A scalar for a single record, or a Series aligned on the frame's index
        when a DataFrame is given. Versions that are missing from
        ``version_coefficient`` use a coefficient of 1.0.
    """
    version = row["version"]
    if isinstance(version, pd.Series):
        # Whole-frame call: look every version up in one pass.
        coefficient = (
            version.astype(object).map(version_coefficient).fillna(1.0).astype(float)
        )
    else:
        coefficient = version_coefficient.get(version, 1.0)
    return (row["estimated_range"] / row["soc"] / row["range"]) * coefficient


# Step 4: compute the SoH for all selected rows at once; this avoids a slow
# row-by-row apply and also works when the selection is empty.
filtered_tss["SoH"] = calculate_soh(filtered_tss)

In [None]:
# Per-vehicle summary: mean SoH, highest odometer reading and first version seen.
aggregated_data = filtered_tss.groupby('vin').agg({
    'SoH': 'mean',
    'odometer': 'max',
    'version': 'first'
}).reset_index()

# Fit the linear trend on finite points only: np.polyfit does not skip NaN or
# infinite values, so vehicles without a usable mean SoH are left out of the fit
# (they are still passed to the scatter plot below).
fit_data = (
    aggregated_data[['odometer', 'SoH']]
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .sort_values('odometer')
)
x = fit_data['odometer']
y = fit_data['SoH']

# Degree-1 least-squares fit; coefficients are ordered (slope, intercept).
coefficients = np.polyfit(x, y, 1)
trendline = np.poly1d(coefficients)
slope, intercept = coefficients

# Scatter plot of mean SoH against the highest odometer reading.
fig = px.scatter(
    aggregated_data,
    x='odometer',
    y='SoH',
    color="version",
    hover_data=['vin'],
    title='Mean SoH vs Maximum Odometer per Vehicle',
    labels={
        'odometer': 'Maximum Odometer Reading',
        'SoH': 'Mean SoH (%)'
    }
)

# Overlay the fitted line.
fig.add_trace(go.Scatter(
    x=x,
    y=trendline(x),
    mode='lines',
    name='Trendline'
))

# Annotate the line with its equation at its right-hand end. The y position is
# the trendline evaluated at the largest x, so the label sits on the line
# whatever the sign of the slope.
x_end = x.max()
equation_text = f"y = {slope:.8f}x + {intercept:.2f}"
fig.add_annotation(
    x=x_end,
    y=trendline(x_end),
    text=equation_text,
    showarrow=False,
    font=dict(size=12, color="black"),
    xanchor='right'
)

fig.show()

In [None]:
# Round the per-vehicle summary for readability (SoH to 2 decimals, odometer to
# whole units) and export it as CSV without the index column.
aggregated_data = aggregated_data.round({'SoH': 2, 'odometer': 0})

aggregated_data.to_csv("volvo_data.csv", index=False)

### Using the charging 
charging_ac_ampere / charging_ac_voltage