In [None]:
import logging

import plotly.express as px
from core.plt_utils import basic_fig_update
import pandas as pd
from pandas import DataFrame as DF
import matplotlib.pyplot as plt
# import umap
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer

from core.plt_utils import plt_3d_df
from watea.watea_constants import *
from watea.processed_watea_ts import processed_ts_of, processed_ts_iterator
from watea.watea_fleet_info import fleet_info_df
from watea.energy_distribution import *

# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)

# Initial feature list.
# NOTE(review): it contains "energy_added", which later cells use as the
# regression target, and the name is rebound in the estimator section before
# any visible use -- confirm whether this first definition is still needed.
FEATURE_COLS = ["current", "voltage", "energy_added", "soc"]


### Setup

In [None]:
# Collect the result of compute_charging_points for every (id, time series)
# pair yielded by processed_ts_iterator. The identifier is bound to
# `vehicle_id` rather than `id` so the builtin `id()` is not shadowed for the
# rest of the notebook session, and the comprehension keeps the loop variables
# out of the notebook namespace.
fleet_energy_points_lst = [
    compute_charging_points(ts, vehicle_id)
    for vehicle_id, ts in processed_ts_iterator("has_power_during_charge")
]


In [None]:
# Stack the per-series results into a single frame that is indexed (and
# sorted) by "soc" while keeping "soc" available as a regular column too.
fleet_energy_points = pd.concat(fleet_energy_points_lst, ignore_index=True)
fleet_energy_points = fleet_energy_points.set_index("soc", drop=False).sort_index()
display(fleet_energy_points)

# Summary statistics (one row per described column) plus the share of
# non-null values relative to the total number of rows.
describe = fleet_energy_points.describe().T
describe["notna_ratio"] = describe["count"].div(len(fleet_energy_points))
display(describe)

# Histogram of energy_added over all collected points.
fleet_energy_points["energy_added"].plot.hist()

In [None]:
# Keep only points whose energy_added lies strictly between 100 and 502, then
# drop every row that has at least one missing value.
# NOTE(review): the 100 / 502 bounds are undocumented magic numbers -- confirm
# their meaning and consider naming them in a configuration cell.
fleet_energy_points_cleaned = (
    fleet_energy_points
    .query("energy_added < 502 & energy_added > 100")
    .dropna(how="any")
)
# Seeded so the 40 % subsample is the same on every Restart & Run All
# (42 matches the random_state used for KFold further down).
down_sampled_fleet_charging_points = fleet_energy_points_cleaned.sample(frac=0.4, random_state=42)

### Feature extraction

In [None]:
# Voltage/soc pairs of the cleaned points; the frame keeps whatever index
# fleet_energy_points_cleaned carries.
fleet_voltage_by_soc = (
    fleet_energy_points_cleaned
    .loc[:, ["voltage", "soc"]]
)
# NOTE: despite its name, `median` is produced by two successive centered
# 80-row rolling *minimum* passes over the de-duplicated pairs, so it is a
# smoothed lower bound of voltage along soc rather than a median.
# NOTE(review): the trailing reset_index() presumably relies on "soc" being
# the index name and not also a data column at that point (otherwise the two
# would collide) -- verify against the pandas version in use.
median = (
    fleet_voltage_by_soc
    .drop_duplicates()
    .rolling(80, center=True, on="soc")
    .min()
    .rolling(80, center=True)
    .min()
    .dropna()
    .reset_index()
)
# Scatter of the raw pairs with the rolling-minimum curve overlaid in red.
fig, ax = plt.subplots(figsize=(15, 9))
fleet_voltage_by_soc.plot.scatter("soc", "voltage", s=0.35, ax=ax)
median.plot.line(x="soc", y="voltage", color="red", ax=ax)
# Degree-4 polynomial regression; the first step reshapes the 1-D input into a
# single-feature column.
# NOTE(review): Pipeline, FunctionTransformer, PolynomialFeatures,
# LinearRegression, Series and SOC_RANGE are not imported explicitly in the
# visible cells; presumably they come from the `watea` star imports -- TODO confirm.
CHARGE_ENERGY_POINTS_TO_DIST_MODEL = Pipeline([
    ('reshape', FunctionTransformer(lambda x: x.reshape(-1, 1))),
    ('poly_features', PolynomialFeatures(degree=4)),
    ('regressor', LinearRegression())
])

# Fit voltage as a function of soc on the rolling-minimum curve, then evaluate
# the fitted polynomial over SOC_RANGE.
fitted_shape_data = (
    CHARGE_ENERGY_POINTS_TO_DIST_MODEL
    .fit(median["soc"].values, median["voltage"].values)
    .predict(SOC_RANGE)
    .squeeze()
)
# Fitted curve as a Series whose index is named "soc" and holds SOC_RANGE.
fitted_shape_series = (
    Series(data=fitted_shape_data, index=pd.Index(SOC_RANGE, name="soc"))
    # .sub(min(fitted_shape_data))
)
# Overlay the fitted polynomial in green on the same axes and render the figure.
fitted_shape_series.plot.line(ax=ax, color="green")
plt.show()

In [None]:
# Attach the fitted voltage for each row's soc as `min_voltage`, then derive
# `soc_voltage_feature` as the row's voltage minus that fitted value.
# NOTE(review): the lookup uses fleet_voltage_by_soc["soc"] from the previous
# cell and assumes every soc value is present in fitted_shape_series -- confirm.
fleet_energy_points_cleaned:DF = (
    fleet_energy_points_cleaned
    .assign(min_voltage=fitted_shape_series.loc[fleet_voltage_by_soc["soc"]])
    .eval("soc_voltage_feature = voltage - min_voltage")
)
# Seeded so the 40 % subsample is reproducible across kernel restarts
# (42 matches the random_state used for KFold further down).
down_sampled_fleet_charging_points = (
    fleet_energy_points_cleaned
    .sample(frac=0.4, random_state=42)
    .dropna(how="any")
)
# 3D view of temperature / soc / current, colored by energy_added.
plt_3d_df(fleet_energy_points_cleaned, "temperature", "soc", "current", color="energy_added", colorscale="Rainbow", size=2.5)

### Estimator implementation

In [None]:
# Input columns for the KNN estimators defined below. This rebinds the
# FEATURE_COLS from the first cell; the target column used in later calls
# ("energy_added") is not part of this list.
FEATURE_COLS = ["current", "voltage", "soc_voltage_feature", "temperature", "soc"]

def neg_median_absolute_difference(y_true, y_pred):
    """Return the negated median of the element-wise absolute error.

    The value is negated so that "greater is better", which is the convention
    scikit-learn scorers expect.

    Parameters:
        y_true (array-like): Observed target values.
        y_pred (array-like): Predicted target values, same length as ``y_true``.

    Returns:
        float: ``-median(|y_true - y_pred|)``.
    """
    # Coerce to plain arrays so the subtraction is positional: this supports
    # Python lists and prevents pandas from aligning two Series on their index
    # labels (which would yield NaN for non-matching labels).
    abs_diff = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    return -np.median(abs_diff)

# The metric above is already negated (higher is better), so the scorer is
# told not to flip its sign again.
neg_median_abs_diff_scorer = make_scorer(
    neg_median_absolute_difference,
    greater_is_better=True,
)

# Define the function for KNN cross-validation
def knn_cross_validation(df: pd.DataFrame, input_columns: list, target_column: str, n_neighbors: int = 4, cv_splits: int = 5):
    """
    Cross-validates a K-Nearest Neighbors regressor on the given DataFrame using
    the negative median absolute difference as the scoring metric.

    Folds are shuffled with a fixed random_state (42), so repeated calls on the
    same data give the same scores.

    NOTE(review): features are passed to KNN unscaled, so columns with larger
    numeric ranges weigh more in the distance -- confirm this is intended.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing the data.
        input_columns (list): Column names to be used as features.
        target_column (str): The name of the target column.
        n_neighbors (int): Number of neighbors to use in KNN. Default is 4.
        cv_splits (int): Number of cross-validation splits. Default is 5.

    Returns:
        dict: ``'mean_score'`` and ``'std_score'`` hold the mean and standard
        deviation of the per-fold scores. ``'median_score'`` is kept only for
        backward compatibility: it has always contained the *mean* of the fold
        scores, not a median across folds.
    """
    features = df[input_columns].values
    target = df[target_column].values

    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    folds = KFold(n_splits=cv_splits, shuffle=True, random_state=42)

    # One score per fold, each being the negated median absolute error.
    cv_scores = cross_val_score(model, features, target, cv=folds, scoring=neg_median_abs_diff_scorer, verbose=True)

    mean_score = cv_scores.mean()
    return {
        # Legacy, mislabeled key: this is the mean of the fold scores.
        'median_score': mean_score,
        'std_score': cv_scores.std(),
        'mean_score': mean_score,
    }



# Cross-validate on every cleaned point with the default neighbors/splits.
knn_cross_validation(
    df=fleet_energy_points_cleaned,
    input_columns=FEATURE_COLS,
    target_column="energy_added",
)

In [None]:
# Same cross-validation, restricted to rows whose odometer is below 3000.
knn_cross_validation(
    df=fleet_energy_points_cleaned.query("odometer < 3000"),
    input_columns=FEATURE_COLS,
    target_column="energy_added",
)

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer
import numpy as np

# Custom scoring function.
# NOTE: a function with this name is already defined in an earlier cell; this
# later definition is the one in effect once the notebook has run top to bottom.
def neg_median_absolute_difference(y_true, y_pred):
    """Return the negated median of the element-wise absolute error.

    Parameters:
        y_true (array-like): Observed target values.
        y_pred (array-like): Predicted target values, same length as ``y_true``.

    Returns:
        float: ``-median(|y_true - y_pred|)`` (negated so greater is better).
    """
    # Coerce to plain arrays so the subtraction is positional: this supports
    # Python lists and prevents pandas from aligning two Series on their index
    # labels (which would yield NaN for non-matching labels).
    abs_diff = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    return -np.median(abs_diff)

# Fit KNN on one frame and score it on another
def knn_train_test_evaluation(train_df: pd.DataFrame, test_df: pd.DataFrame, input_columns: list, target_column: str, n_neighbors: int = 4):
    """
    Fits a K-Nearest Neighbors regressor on ``train_df`` and scores its
    predictions for ``test_df`` with the negative median absolute difference.

    Parameters:
        train_df (pd.DataFrame): Rows used to fit the model.
        test_df (pd.DataFrame): Rows the fitted model is evaluated on.
        input_columns (list): Column names to be used as features.
        target_column (str): The name of the target column.
        n_neighbors (int): Number of neighbors to use in KNN. Default is 4.

    Returns:
        float: The score on ``test_df`` (negative median absolute difference).
    """
    # Feature matrices and target vectors as plain numpy arrays.
    features_train = train_df[input_columns].values
    target_train = train_df[target_column].values
    features_test = test_df[input_columns].values
    target_test = test_df[target_column].values

    # fit() returns the estimator itself, so construction and fitting chain.
    model = KNeighborsRegressor(n_neighbors=n_neighbors).fit(features_train, target_train)

    return neg_median_absolute_difference(target_test, model.predict(features_test))

# Train on rows with odometer below 3000 and evaluate on all remaining rows.
# The test side uses ">=" so that rows with odometer exactly 3000 are no
# longer silently left out of both sets.
knn_train_test_evaluation(
    train_df=fleet_energy_points_cleaned.query("odometer < 3000"),
    test_df=fleet_energy_points_cleaned.query("odometer >= 3000"),
    input_columns=FEATURE_COLS,
    target_column="energy_added",
)

In [None]:
# Fit KNN on reference rows and predict the target for another frame
def predict_100_soh_energy_added(train_df: pd.DataFrame, test_df: pd.DataFrame, n_neighbors: int = 4, input_columns: list = None, target_column: str = "energy_added"):
    """
    Fits a K-Nearest Neighbors regressor on ``train_df`` and returns its
    predictions of ``target_column`` for every row of ``test_df``.

    Parameters:
        train_df (pd.DataFrame): Rows used to fit the model.
        test_df (pd.DataFrame): Rows to predict for; only the feature columns are read.
        n_neighbors (int): Number of neighbors to use in KNN. Default is 4.
        input_columns (list, optional): Feature column names. Defaults to the
            notebook-level ``FEATURE_COLS`` in effect when the function is called.
        target_column (str): Column of ``train_df`` to learn. Default is "energy_added".

    Returns:
        numpy.ndarray: One prediction per row of ``test_df``, in row order.
    """
    if input_columns is None:
        # Resolved at call time so a rebound FEATURE_COLS is picked up.
        input_columns = FEATURE_COLS

    X_train = train_df[input_columns].values
    y_train = train_df[target_column].values
    X_test = test_df[input_columns].values

    knn = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions (not a score) for the rows of test_df.
    return knn.predict(X_test)

# Reference rows: points recorded with an odometer below 3000.
default_100_soh_points = fleet_energy_points_cleaned.query("odometer < 3000")

# Predict, for every row, the energy_added of the reference rows' KNN model,
# then express the observed energy_added as a percentage of that prediction.
# NOTE(review): the reference rows are themselves part of the frame being
# predicted -- confirm that this is intended.
fleet_energy_points_cleaned: DF = fleet_energy_points_cleaned.assign(
    default_100_soh_energy_added=lambda points: predict_100_soh_energy_added(
        default_100_soh_points, points
    ),
).eval("soh = 100 * energy_added / default_100_soh_energy_added")

fleet_energy_points_cleaned

In [None]:
def _resolve_color_sequence(color_scale):
    """Map a palette name to a list of colors, defaulting to the Plotly qualitative palette."""
    if not color_scale:
        return px.colors.qualitative.Plotly
    # Qualitative palettes are tried first (the original behavior); the other
    # built-in families follow so that names such as 'Viridis' also resolve.
    for palette_family in (
        px.colors.qualitative,
        px.colors.sequential,
        px.colors.diverging,
        px.colors.cyclical,
    ):
        palette = getattr(palette_family, color_scale, None)
        # Guard against non-palette attributes of the module (e.g. helper functions).
        if isinstance(palette, list):
            return palette
    raise AttributeError(f"Unknown plotly color palette: {color_scale!r}")


def plot_2d_line(df: pd.DataFrame, x_column: str, y_column: str, line_group_column: str, color: str = None, color_scale: str = None):
    """
    Builds and shows a 2D Plotly line plot with optional per-line coloring.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing the data.
        x_column (str): The column name for the x-axis.
        y_column (str): The column name for the y-axis.
        line_group_column (str): The column name for grouping the lines.
        color (str, optional): The column name to use for the line color. Default is None.
        color_scale (str, optional): Name of a built-in plotly palette used as the
            discrete color sequence (looked up in the qualitative, sequential,
            diverging and cyclical families). Only used when ``color`` is given.
            Default is None, which selects the qualitative "Plotly" palette.

    Returns:
        None: The figure is displayed with ``fig.show()`` and not returned.

    Raises:
        AttributeError: If ``color_scale`` does not name a built-in palette.
    """
    line_kwargs = {"x": x_column, "y": y_column, "line_group": line_group_column}
    if color:
        # Coloring arguments are only passed when a color column is requested.
        line_kwargs["color"] = color
        line_kwargs["color_discrete_sequence"] = _resolve_color_sequence(color_scale)

    fig = px.line(df, **line_kwargs)

    # Title and axis/legend labels derived from the column names.
    fig.update_layout(
        title=f'2D Line Plot of {y_column} vs {x_column} Grouped by {line_group_column}',
        xaxis_title=x_column,
        yaxis_title=y_column,
        legend_title=line_group_column if not color else color
    )

    # Show the plot
    fig.show()


# One row per charge_id: numeric/date columns are summarized by their median,
# identifier columns (kept for debugging) by their mode; rows are then ordered
# by the aggregated date.
agg_fleet_energy_points_cleaned = (
    fleet_energy_points_cleaned
    .groupby("charge_id")
    .agg({
        **{
            column: "median"
            for column in (
                "odometer",
                "energy_added",
                "voltage",
                "current",
                "temperature",
                "sec_duration",
                "date",
                "soc",
                "min_voltage",
                "soc_voltage_feature",
                "default_100_soh_energy_added",
                "soh",
            )
        },
        **{
            column: pd.Series.mode
            for column in ("id", "charge_idx", "charge_id")
        },
    })
    .sort_values("date")
)
# soh against odometer, one colored line per id.
px.line(
    agg_fleet_energy_points_cleaned,
    x='odometer',
    y='soh',
    color='id',
    symbol="id",
    markers=False,
)

In [None]:
# NOTE(review): only the frame is passed here, whereas the earlier plt_3d_df
# call in this notebook also supplies three column names plus styling
# options -- this looks like an unfinished cell; confirm the intended axes or
# remove it.
plt_3d_df(fleet_energy_points_cleaned)