In [None]:
from core.sql_utils import *
engine = get_sqlalchemy_engine()

# Reference table: one row per vehicle model joined with its make and its battery capacity.
# The query runs on the context-managed connection so that it is released when the block
# exits; no second, never-closed connection is opened any more.
with engine.connect() as connection:
    dbeaver_df = pd.read_sql(text("""SELECT vm.model_name, vm.id, vm.type, m.make_name, b.capacity FROM vehicle_model vm
                                  join make m on m.id=vm.make_id
                                  join battery b on b.id=vm.battery_id;"""), connection)


In [None]:
# Display the column names returned by the reference query.
dbeaver_df.columns

In [None]:
from core.gsheet_utils import *

In [None]:
# Load the "Courbes OS" worksheet of the "Courbes de tendance" spreadsheet.
df = load_excel_data(get_gspread_client(), "Courbes de tendance", "Courbes OS")
# Row 0 provides the column names, the following rows the data; only the first 8 columns are kept.
# NOTE(review): the [row, column] indexing presumably means load_excel_data returns a 2-D NumPy array — confirm.
df_sheet = pd.DataFrame(columns=df[0,:8], data=df[1:,:8])
# Remove the '%' sign and surrounding whitespace, then turn the percentage into a fraction.
df_sheet['SoH'] = df_sheet['SoH'].apply(lambda x:  x.replace('%', '').strip()).astype(float) / 100



In [None]:
from rapidfuzz import process, fuzz
from core.sql_utils import *
import re
def mapping_vehicle_type(type_car, make_name, model_name, db_df, battery_capacity=None):
    """Map a vehicle description to the closest ``type`` of the vehicle-model table.

    The candidates are narrowed step by step: rows of the requested make, then the
    rows whose model name is the fuzzy-closest one, then (when a capacity is given
    and usable) the rows whose battery capacity is nearest, and finally the
    fuzzy-closest ``type`` among what is left.

    Args:
        type_car (str): vehicle type to match.
        make_name (str): manufacturer (OEM) name.
        model_name (str): model name.
        db_df (pd.DataFrame): reference table with the columns ``make_name``,
            ``model_name``, ``type`` and ``capacity``.
        battery_capacity (str | float, optional): battery capacity, e.g. ``"50 kWh"``.
            Defaults to None.

    Returns:
        str: the ``type`` value of the best matching row, or ``"unknown"`` when no
        row of that make/model can be matched.
    """
    # str() keeps the lookup alive for missing cells (None/NaN) instead of raising on .lower().
    make_name = str(make_name).lower()
    type_car = str(type_car).lower()
    model_name = str(model_name).lower()

    # Long model names are reduced to their numeric part ("e-208" -> "208").
    # r'\d+' returns only real digit runs, so a name without digits is kept as-is
    # instead of collapsing to an empty string.
    if len(model_name) > 4:
        digit_runs = re.findall(r'\d+', model_name)
        if digit_runs:
            # Same choice as before: the lexicographically greatest run.
            model_name = max(digit_runs)

    # filter on OEM (case-insensitive on both sides)
    subset = db_df[db_df['make_name'].astype(str).str.lower() == make_name]

    # Closest model name; score_cutoff is kept very low so that a result is almost always returned.
    match_model = process.extractOne(model_name, subset['model_name'], scorer=fuzz.token_sort_ratio, score_cutoff=.1)
    if not match_model:
        return "unknown"
    subset = subset[subset['model_name'] == match_model[0]]

    # Prefer the rows whose battery capacity is closest to the requested one.
    candidates = subset
    if battery_capacity:
        try:
            battery_target = float(str(battery_capacity).lower().replace('kwh', '').strip())
            distance = (subset['capacity'] - battery_target).abs()
            nearest = subset[distance == distance.min()]
        except (TypeError, ValueError):
            # Unparseable capacity or non-numeric column: fall back to matching without battery.
            nearest = subset.iloc[0:0]
        if not nearest.empty:
            candidates = nearest

    # match on type; the third element of the result is the index label of the chosen row
    match_type = process.extractOne(type_car, candidates['type'], scorer=fuzz.token_sort_ratio)
    if match_type is None:
        return "unknown"
    return candidates.loc[match_type[2], 'type']

In [None]:
# Resolve every sheet row to the closest entry of the vehicle-model table.
# NOTE(review): mapping_vehicle_type returns the matched row's "type" value (or "unknown"),
# not its "id", although the result is stored in a column named 'model_id' — confirm this is intended.
df_sheet['model_id'] = df_sheet.apply(lambda row: mapping_vehicle_type(row['Type'], row['OEM'], 
                                                                       row['Modèle'], dbeaver_df, row['battery_capacity']), axis=1)
df_sheet['model_id'] = df_sheet['model_id'].astype(str)

In [None]:
from load.trendline.trendline_utils import *

In [None]:
import plotly.graph_objects as go
def trendline_apply(x, f):
    """Evaluate the trendline expression ``f`` for the values ``x``.

    Args:
        x: value(s) bound to the name ``x`` inside the expression.
        f (str): Python expression written in terms of ``x`` and ``np``.

    Returns:
        The result of evaluating the expression.
    """
    # The expression only sees `np` and `x`, the same namespace the other eval calls in
    # this notebook use, rather than this function's locals and every global.
    # NOTE: eval still executes arbitrary code; only pass expressions produced by trusted code.
    return eval(f, {"np": np, "x": x})

def graph(df, trendline, trendline_max, trendline_min, model):
    """Plot the SoH measurements of one model together with three trendline curves.

    Args:
        df (pd.DataFrame): data with the columns 'Odomètre (km)' and 'SoH'.
        trendline: mapping whose 'trendline' entry is the expression drawn as 'Fit'.
        trendline_max: mapping of the same shape, drawn as 'Upper'.
        trendline_min: mapping of the same shape, drawn as 'Lower'.
        model (str): label inserted in the figure title.

    Returns:
        Whatever ``fig.show()`` returns.
    """
    fig = go.Figure()

    # Raw measurements as a scatter cloud.
    odometer = df['Odomètre (km)']
    measurements = go.Scatter(
        x=odometer,
        y=df['SoH'],
        mode='markers',
        marker_color='rgba(50, 182, 193, .9)',
        name='SoH compute',
    )
    fig.add_trace(measurements)

    # The curves are evaluated on the sorted odometer values so the lines are drawn left to right.
    x_sorted = odometer.sort_values()
    curves = (
        (trendline, 'red', 'Fit'),
        (trendline_max, 'green', 'Upper'),
        (trendline_min, 'green', 'Lower'),
    )
    for curve, colour, label in curves:
        fig.add_trace(go.Scatter(
            x=x_sorted,
            y=trendline_apply(x_sorted, curve['trendline']),
            mode='lines',
            line=dict(color=colour),
            name=label,
        ))

    fig.update_layout(
        title=f"version: {model}",
        xaxis_title='Odometer',
        yaxis_title='State of Health (SoH)',
        legend_title='Légende',
        template='plotly_white',
        width=1000,
        height=600,
        xaxis=dict(range=[0, 150000]),  # adjust to the desired odometer scale
        yaxis=dict(range=[.75, 1.1]),   # adjust to the desired SoH scale
    )

    return fig.show()
    

In [None]:
# Parse the odometer readings into floats, removing the ',' thousands separators.
# str() makes the cell safe to re-run: values that are already floats are simply parsed
# again instead of raising AttributeError on .replace.
df_sheet['Odomètre (km)'] = df_sheet['Odomètre (km)'].apply(lambda x: float(str(x).replace(',', '')))

In [None]:
# Rows of the sheet for the 'e-208' model whose mapped 'model_id' equals '50 kwh'.
test_208 = df_sheet[(df_sheet['Modèle']=='e-208') & (df_sheet['model_id']=='50 kwh')]

In [None]:
# Collect the (Modèle, model_id) groups of the sheet that have enough distinct vehicles
# ('lien' values) overall and on each side of the odometer threshold.
resultats = []
for modele, group in df_sheet.groupby(["Modèle", 'model_id']):
    nb_total_vins = group['lien'].nunique()
    nb_moins_50k = group[group['Odomètre (km)'] <= 50000]['lien'].nunique()
    # NOTE(review): the variable is named "plus_80k" but the threshold is 50000, and a reading of
    # exactly 50000 is counted on both sides — confirm which threshold is intended.
    nb_plus_80k = group[group['Odomètre (km)'] >= 50000]['lien'].nunique()
    
    # Keep groups with at least 50 vehicles in total and at least 20 on each side.
    if nb_total_vins >= 50 and nb_moins_50k >= 20 and nb_plus_80k >= 20:
        resultats.append(modele)

print("Modèles qui remplissent les critères :", resultats)

In [None]:
from core.sql_utils import *
engine = get_sqlalchemy_engine()

# Production measurements: one row per vehicle_data record with the vehicle's VIN and model.
# The query runs on the context-managed connection so that it is released when the block
# exits; no second, never-closed connection is opened any more.
with engine.connect() as connection:
    prod_df = pd.read_sql(text("""SELECT v.vin, vm.model_name, vm.type, vd.odometer, vd.soh, vm.version  from vehicle_data vd
join vehicle v
on vd.vehicle_id=v.id
join vehicle_model vm
on v.vehicle_model_id=vm.id;"""), connection)


In [None]:
# Collect the (Modèle, model_id) groups of the sheet that have enough distinct vehicles
# ('lien' values) overall and on each side of the odometer threshold.
# NOTE(review): this cell repeats, line for line, an earlier cell of the notebook.
resultats = []
for modele, group in df_sheet.groupby(["Modèle", 'model_id']):
    nb_total_vins = group['lien'].nunique()
    nb_moins_50k = group[group['Odomètre (km)'] <= 50000]['lien'].nunique()
    # NOTE(review): the variable is named "plus_80k" but the threshold is 50000 — confirm.
    nb_plus_80k = group[group['Odomètre (km)'] >= 50000]['lien'].nunique()
    
    # Keep groups with at least 50 vehicles in total and at least 20 on each side.
    if nb_total_vins >= 50 and nb_moins_50k >= 20 and nb_plus_80k >= 20:
        resultats.append(modele)

print("Modèles qui remplissent les critères :", resultats)

In [None]:
# Same eligibility check on the production data: groups are (model_name, type, version)
# and vehicles are counted by distinct 'vin'.
resultats = []
for modele, group in prod_df.groupby(["model_name", 'type', 'version']):
    nb_total_vins = group['vin'].nunique()
    nb_moins_50k = group[group['odometer'] <= 50000]['vin'].nunique()
    # NOTE(review): the variable is named "plus_80k" but the threshold is 50000 — confirm.
    nb_plus_80k = group[group['odometer'] >= 50000]['vin'].nunique()
    
    # Keep groups with at least 50 vehicles in total and at least 20 on each side.
    if nb_total_vins >= 50 and nb_moins_50k >= 20 and nb_plus_80k >= 20:
        resultats.append(modele)

print("Modèles qui remplissent les critères :", resultats)

# Per-model subsets of the sheet, selected on 'model_id'.
# NOTE(review): the value after each `==` and inside each `isin([...])` is missing, so this cell is
# not valid Python as written (`==]` is a syntax error) — the model identifiers must be filled in.
df_r110 = df_sheet[df_sheet['model_id']==]
df_q210 = df_sheet[df_sheet['model_id']==]
df_spring65 = df_sheet[df_sheet['model_id'].isin([])]
df_spring45 = df_sheet[df_sheet['model_id'].isin(([]))]
df_208 = df_sheet[df_sheet['model_id']==]
df_2008 = df_sheet[df_sheet['model_id']==]
df_niro = df_sheet[df_sheet['model_id']==]
df_mini = df_sheet[df_sheet['model_id']==]
df_leaf = df_sheet[df_sheet['model_id']==]
df_r90 = df_sheet[df_sheet['model_id'].isin([])]

df_kona = df_sheet[df_sheet['model_id']==]

df_twingo = df_sheet[df_sheet['model_id']==]


In [None]:
# All per-model frames, in the order in which they are processed by the fitting loops.
list_df = [df_r110, df_q210, df_spring65, df_spring45, df_208, df_2008, df_niro, df_mini, df_leaf, df_r90, df_kona, df_twingo, ]

In [None]:
# For each per-model frame: drop outliers, fit the mean trendline and its bounds,
# optionally refit on the points above / below the mean curve, then plot.
for df in list_df:
    # Outlier removal: rows under 20000 km with SoH below 0.95, then rows with SoH below 0.8.
    df = df.drop(df[(df['Odomètre (km)'] < 20000) & (df['SoH'] < .95)].index)
    df = df.drop(df[(df['SoH'] < .8)].index)
    # NOTE(review): `id` shadows the Python builtin of the same name.
    id = df['model_id'].unique()
    # Title label: "<model_name> <type>" of the reference row whose id matches the first model_id.
    model = f"{dbeaver_df[dbeaver_df['id'].astype(str) == str(id[0])]['model_name'].values[0]} {dbeaver_df[dbeaver_df['id'].astype(str) == str(id[0])]['type'].values[0]}"
    x_data, y_data = df["Odomètre (km)"].values, df["SoH"].values
    # Append the anchor point (odometer 0, SoH 1) before fitting.
    x_data = np.hstack((x_data, np.array([0])))
    y_data = np.hstack((y_data, np.array([1])))
    sort_idx = np.argsort(x_data)
    x_sorted, y_sorted = x_data[sort_idx], y_data[sort_idx]
    # Fit log_function; the first parameter is constrained to [0.97, 1.03], the others are free.
    coef_mean, _ = curve_fit(log_function, x_sorted, y_sorted, maxfev=10000, bounds=([.97, -np.inf, -np.inf], [1.03, np.inf, np.inf]))
    y_fit = log_function(x_sorted, *coef_mean)
    y_lower, y_upper = compute_trendline_bounds(y_sorted, y_fit)
    coef_lower, coef_upper = generate_trendline_functions(coef_mean[0], x_sorted, y_lower, y_upper)
    trendlines = build_trendline_expressions(coef_mean, coef_lower, coef_upper, y_lower, y_upper)
    # Refit using only the points above the mean curve when the first upper coefficient is >= -0.01.
    if coef_upper[0] >= -0.01:
        # Evaluate the mean trendline expression at every odometer value of the frame.
        mask = eval(trendlines[0]['trendline'], {"np": np, "x": df["Odomètre (km)"]})
        test = df[df['SoH'] > mask ]
        x_data, y_data = test["Odomètre (km)"].values, test["SoH"].values
        x_data = np.hstack((x_data, np.array([0])))
        y_data = np.hstack((y_data, np.array([1])))
        sort_idx = np.argsort(x_data)
        x_sorted, y_sorted = x_data[sort_idx], y_data[sort_idx]
        coef_mean_upper, _ = curve_fit(log_function, x_sorted, y_sorted, maxfev=10000, bounds=([.97, -np.inf, -np.inf], [1.03, np.inf, np.inf]))
        y_fit = log_function(x_sorted, *coef_mean_upper)
        y_lower, y_upper = compute_trendline_bounds(y_sorted, y_fit)
        # NOTE(review): this branch passes coef_mean (the overall fit) below, whereas the lower
        # branch passes its own refit coefficients (coef_mean_new) — confirm which is intended.
        coef_lower_borne_sup, coef_upper_borne_sup = generate_trendline_functions(coef_mean[0], x_sorted, y_lower, y_upper)
        new_upper = build_trendline_expressions(coef_mean, coef_lower_borne_sup, coef_upper_borne_sup, y_lower, y_upper)
        upper_bound = new_upper[1]
    
    # Refit using only the points below the mean curve when the first lower coefficient is >= -0.01.
    if  coef_lower[0] >= -0.01:
        # NOTE(review): looks like a leftover debug print.
        print('laalddaeda')
        mask = eval(trendlines[0]['trendline'], {"np": np, "x": df["Odomètre (km)"]})
        test = df[df['SoH'] < mask ]
        x_data, y_data = test["Odomètre (km)"].values, test["SoH"].values
        x_data = np.hstack((x_data, np.array([0])))
        y_data = np.hstack((y_data, np.array([1])))
        sort_idx = np.argsort(x_data)
        x_sorted, y_sorted = x_data[sort_idx], y_data[sort_idx]
        coef_mean_new, _ = curve_fit(log_function, x_sorted, y_sorted, maxfev=10000, bounds=([.97, -np.inf, -np.inf], [1.03, np.inf, np.inf]))
        y_fit = log_function(x_sorted, *coef_mean_new)
        y_lower, y_upper = compute_trendline_bounds(y_sorted, y_fit)
        # This rebinds coef_lower / coef_upper, replacing the values of the main fit.
        coef_lower, coef_upper = generate_trendline_functions(coef_mean_new[0], x_sorted, y_lower, y_upper)
        new_lower = build_trendline_expressions(coef_mean_new, coef_lower, coef_upper, y_lower, y_upper)
        # NOTE(review): element 0 is taken here while the upper branch takes element 1 — confirm the index.
        lower_bound = new_lower[0]
        print(lower_bound)
    print(trendlines[0])
    # The plot uses the three main trendlines; upper_bound / lower_bound computed above are not passed on.
    graph(df, trendlines[0], trendlines[1], trendlines[2], model)

In [None]:
# Display the trendline expressions left over from the last iteration of the fitting loop.
trendlines

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# --- 1. Load the data ---
df = list_df[0]  # Replace with another frame if needed
df = df[df['SoH'] > .8]
x_data, y_data = df["Odomètre (km)"].values, df["SoH"].values
# Append the anchor point (odometer 0, SoH 1) for the non-linear fit only.
x_data = np.hstack((x_data, np.array([0])))
y_data = np.hstack((y_data, np.array([1])))
sort_idx = np.argsort(x_data)
x_sorted, y_sorted = x_data[sort_idx], y_data[sort_idx]
coef_mean, _ = curve_fit(log_function, x_sorted, y_sorted, maxfev=10000, bounds=([.97, -np.inf, -np.inf], [1.03, np.inf, np.inf]))
print(coef_mean)
# The regression below uses the measurements without the anchor point.
x = df["Odomètre (km)"].values
y = df["SoH"].values

# --- 2. Fixed parameter ---
# Third fitted parameter of log_function, reused as the scale inside the log transform.
a =coef_mean[2]  # May be tuned later

# --- 3. Transformation of x ---
X_trans = np.log1p(x / a)
X_design = sm.add_constant(X_trans)  # Adds the intercept

# --- 4. Linear regression ---
model = sm.OLS(y, X_design).fit()

# --- 5. Predictions and 95% confidence interval ---
x_pred = np.linspace(x.min(), x.max(), 300)
x_pred_trans = np.log1p(x_pred / a)
X_pred_design = sm.add_constant(x_pred_trans)
pred = model.get_prediction(X_pred_design)
pred_summary = pred.summary_frame(alpha=0.05)

# --- 6. Plot ---
mean = pred_summary["mean"].astype(float).values
ci_lower = pred_summary["mean_ci_lower"].astype(float).values
ci_upper = pred_summary["mean_ci_upper"].astype(float).values

plt.figure(figsize=(10, 6))
plt.scatter(x, y, label="Données réelles", alpha=0.7)
plt.plot(x_pred, mean, color="red", label="Trendline")
plt.fill_between(x_pred, ci_lower, ci_upper, color="red", alpha=0.3, label="IC 95%")
plt.xlabel("Odomètre (km)")
plt.ylabel("SoH")
plt.title("Trendline avec intervalle de confiance à 95%")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# --- 7. Print the results ---
print("Coefficients estimés :")
print(model.params)

print("\nIntervalles de confiance (95%) :")
print(model.conf_int())


In [None]:
# NOTE(review): `trendline` (singular) is not assigned by any cell of this notebook; unless one of the
# star imports provides it, this raises NameError on a fresh kernel — `trendlines` may have been meant.
trendline

In [None]:
# Fit the trendlines for the rows whose model_id is the given identifier.
df_208 = df_sheet[df_sheet['model_id']=='6fbdaf91-ff62-4617-b4bb-ca37bfa406ab']
x_data, y_data = df_208["Odomètre (km)"].values, df_208["SoH"].values
# Unlike the other fitting cells, no (0, 1) anchor point is appended here.
sort_idx = np.argsort(x_data)
x_sorted, y_sorted = x_data[sort_idx], y_data[sort_idx]
# Fit log_function; the first parameter is constrained to [0.97, 1.03], the others are free.
coef_mean, _ = curve_fit(log_function, x_sorted, y_sorted, maxfev=10000, bounds=([.97, -np.inf, -np.inf], [1.03, np.inf, np.inf]))
y_fit = log_function(x_sorted, *coef_mean)
y_lower, y_upper = compute_trendline_bounds(y_sorted, y_fit)
coef_lower, coef_upper = generate_trendline_functions(coef_mean[0], x_sorted, y_lower, y_upper)
trendlines = build_trendline_expressions(coef_mean, coef_lower, coef_upper, y_lower, y_upper)

In [None]:
def clean_battery_data(df, odometer_column='Odomètre (km)'):
    """
    Clean battery data by removing outlier rows.

    The odometer column is renamed to 'odometer', then rows are removed when
    either the odometer is below 20000 with a SoH below 0.95, or the SoH is
    below 0.8. The input frame is left untouched.

    Parameters:
    -----------
    df : pandas.DataFrame
        Battery data with an odometer column and a 'SoH' column
    odometer_column : str
        Name of the column holding the odometer reading. Defaults to the sheet's
        'Odomètre (km)'; a frame that already has an 'odometer' column and no
        column of that name is used as-is.

    Returns:
    --------
    pandas.DataFrame
        Cleaned copy with the odometer column named 'odometer'
    """
    # rename() returns a new frame and ignores a missing source column.
    df_clean = df.rename(columns={odometer_column: 'odometer'})
    # Low-mileage vehicles reporting an already degraded SoH.
    early_degraded = (df_clean['odometer'] < 20000) & (df_clean['SoH'] < .95)
    # SoH values below the accepted floor, whatever the mileage.
    below_floor = df_clean['SoH'] < .8
    return df_clean[~(early_degraded | below_floor)]


def get_model_name(df, dbeaver_df):
    """
    Build the display name of the model a frame refers to.

    Only the first distinct value of 'model_id' is considered; it is compared,
    as a string, with the 'id' column of the reference table.

    Parameters:
    -----------
    df : pandas.DataFrame
        Frame holding a 'model_id' column
    dbeaver_df : pandas.DataFrame
        Reference table with the columns 'id', 'model_name' and 'type'

    Returns:
    --------
    str
        "<model_name> <type>" of the first matching reference row
    """
    first_model_id = str(df['model_id'].unique()[0])
    matching_rows = dbeaver_df[dbeaver_df['id'].astype(str) == first_model_id]
    name_part = matching_rows['model_name'].values[0]
    type_part = matching_rows['type'].values[0]
    return f"{name_part} {type_part}"


def prepare_data_for_fitting(df):
    """
    Turn a frame into fitting inputs: anchor point appended, sorted by odometer.

    Parameters:
    -----------
    df : pandas.DataFrame
        Frame with the columns 'odometer' and 'SoH'

    Returns:
    --------
    tuple
        (x_sorted, y_sorted) - odometer and SoH arrays, including the extra
        point (0, 1), ordered by increasing odometer
    """
    # The anchor point (odometer 0, SoH 1) goes after the measured values.
    odometer_with_anchor = np.concatenate((df["odometer"].values, np.array([0])))
    soh_with_anchor = np.concatenate((df["SoH"].values, np.array([1])))
    order = np.argsort(odometer_with_anchor)
    return odometer_with_anchor[order], soh_with_anchor[order]


def compute_main_trendline(x_sorted, y_sorted):
    """
    Fit the main trendline and derive its lower and upper bounds.

    log_function is fitted with curve_fit; its first parameter is constrained
    to [0.97, 1.03] and the two others are unconstrained.

    Parameters:
    -----------
    x_sorted : numpy.array
        Sorted x values
    y_sorted : numpy.array
        y values in the same order

    Returns:
    --------
    tuple
        (coef_mean, coef_lower, coef_upper, mean, upper, lower)
    """
    parameter_floor = [.97, -np.inf, -np.inf]
    parameter_ceiling = [1.03, np.inf, np.inf]
    fit_output = curve_fit(
        log_function,
        x_sorted,
        y_sorted,
        maxfev=10000,
        bounds=(parameter_floor, parameter_ceiling),
    )
    coef_mean = fit_output[0]

    # Bounds are derived from the spread of the data around the fitted curve.
    fitted_values = log_function(x_sorted, *coef_mean)
    band_low, band_high = compute_trendline_bounds(y_sorted, fitted_values)
    coef_lower, coef_upper = generate_trendline_functions(coef_mean[0], x_sorted, band_low, band_high)

    mean, upper, lower = build_trendline_expressions(coef_mean, coef_lower, coef_upper, band_low, band_high)
    return coef_mean, coef_lower, coef_upper, mean, upper, lower


def compute_upper_bound(df, trendline, coef_mean):
    """
    Recompute the upper bound from the points lying above the mean trendline.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data with the columns 'odometer' and 'SoH'
    trendline : mapping
        Mean trendline; its 'trendline' entry is an expression written in
        terms of ``x`` and ``np``
    coef_mean : sequence
        Coefficients of the mean fit; the first one is passed to
        generate_trendline_functions and the whole sequence to
        build_trendline_expressions

    Returns:
    --------
    object
        The upper entry (index 1) of what build_trendline_expressions returns
    """
    # NOTE: eval executes arbitrary code; the expression must come from trusted code.
    mean_at_points = eval(trendline['trendline'], {"np": np, "x": df["odometer"]})
    above_mean = df[df['SoH'] > mean_at_points]
    x_sorted, y_sorted = prepare_data_for_fitting(above_mean)

    # Refit on the retained points; the first parameter is constrained to [0.97, 1.03].
    coef_mean_upper, _ = curve_fit(log_function, x_sorted, y_sorted, maxfev=10000,
                                bounds=([.97, -np.inf, -np.inf], [1.03, np.inf, np.inf]))
    y_fit = log_function(x_sorted, *coef_mean_upper)
    y_lower, y_upper = compute_trendline_bounds(y_sorted, y_fit)
    coef_lower_borne_sup, coef_upper_borne_sup = generate_trendline_functions(coef_mean[0], x_sorted, y_lower, y_upper)
    new_upper = build_trendline_expressions(coef_mean, coef_lower_borne_sup, coef_upper_borne_sup, y_lower, y_upper)
    return new_upper[1]



def compute_lower_bound(df, trendlines, coef_mean=None):
    """
    Recompute the lower bound from the points lying below the mean trendline.

    Parameters:
    -----------
    df : pandas.DataFrame
        Cleaned data with the columns 'odometer' and 'SoH'
    trendlines : mapping
        Mean trendline; its 'trendline' entry is an expression written in
        terms of ``x`` and ``np``
    coef_mean : sequence, optional
        Coefficients used to build the bound expressions. When omitted, the
        coefficients of the refit on the retained points are used, so the
        function no longer depends on a notebook-level variable.

    Returns:
    --------
    object
        The lower entry (index 2) of what build_trendline_expressions returns
    """
    # NOTE: eval executes arbitrary code; the expression must come from trusted code.
    mean_at_points = eval(trendlines['trendline'], {"np": np, "x": df["odometer"]})
    below_mean = df[df['SoH'] < mean_at_points]
    x_sorted, y_sorted = prepare_data_for_fitting(below_mean)

    # Refit on the retained points; the first parameter is constrained to [0.97, 1.03].
    coef_mean_lower, _ = curve_fit(log_function, x_sorted, y_sorted, maxfev=10000,
                                bounds=([.97, -np.inf, -np.inf], [1.03, np.inf, np.inf]))
    if coef_mean is None:
        coef_mean = coef_mean_lower
    y_fit = log_function(x_sorted, *coef_mean_lower)
    y_lower, y_upper = compute_trendline_bounds(y_sorted, y_fit)
    coef_lower_borne_inf, coef_upper_borne_inf = generate_trendline_functions(coef_mean[0], x_sorted, y_lower, y_upper)
    new_lower = build_trendline_expressions(coef_mean, coef_lower_borne_inf, coef_upper_borne_inf, y_lower, y_upper)
    # build_trendline_expressions yields (mean, upper, lower): the lower bound is entry 2,
    # not entry 1 as in the upper-bound computation this function was copied from.
    return new_lower[2]



def process_battery_data(df, dbeaver_df, odometer_column='Odomètre (km)'):
    """
    Run the whole trendline pipeline on one frame and plot the result.

    Parameters:
    -----------
    df : pandas.DataFrame
        Battery data with an odometer column, 'SoH' and 'model_id'
    dbeaver_df : pandas.DataFrame
        Reference table with the columns 'id', 'model_name', 'type'
    odometer_column : str
        Name of the odometer column in ``df``. A frame that already uses
        'odometer' and has no column of that name is accepted as-is.

    Returns:
    --------
    dict
        - 'model': model name
        - 'trendlines': [mean, upper, lower] main trendlines
        - 'upper_bound': upper bound refitted on the points above the mean
        - 'lower_bound': lower bound refitted on the points below the mean
        - 'processed_df': cleaned frame (odometer column named 'odometer')
    """
    # Step 1: cleaning (also renames the odometer column to 'odometer')
    df_clean = clean_battery_data(df, odometer_column)

    # Step 2: model name
    model = get_model_name(df_clean, dbeaver_df)

    # Step 3: fitting inputs
    x_sorted, y_sorted = prepare_data_for_fitting(df_clean)

    # Step 4: main trendline and its bounds
    coef_mean, coef_lower, coef_upper, mean, upper, lower = compute_main_trendline(x_sorted, y_sorted)

    # Step 5: refitted bounds, computed from the mean trendline of THIS frame rather than
    # from a notebook-level `trendlines` variable.
    # NOTE(review): the bounds are always recomputed here, whereas the notebook loops only do so
    # when the first bound coefficient is above a threshold — confirm which is intended.
    upper_bound = compute_upper_bound(df_clean, mean, coef_mean)
    lower_bound = compute_lower_bound(df_clean, mean)

    # Step 6: plot. graph() reads the 'Odomètre (km)' column, so the cleaned frame is
    # given that column name back for plotting only.
    graph(df_clean.rename(columns={'odometer': 'Odomètre (km)'}), mean, upper, lower, model)

    return {
        'model': model,
        'trendlines': [mean, upper, lower],
        'upper_bound': upper_bound,
        'lower_bound': lower_bound,
        'processed_df': df_clean
    }


def process_all_battery_data(list_df, dbeaver_df):
    """
    Apply process_battery_data to every frame of a list.

    Parameters:
    -----------
    list_df : list
        Frames to process, handled in list order
    dbeaver_df : pandas.DataFrame
        Reference table passed unchanged to each call

    Returns:
    --------
    list
        One result per input frame, in the same order
    """
    return [process_battery_data(frame, dbeaver_df) for frame in list_df]



In [None]:
# For each per-model frame: clean, fit the main trendline, refit a bound when its first
# coefficient is non-negative, then plot.
for df in list_df:
    # The sheet frames carry the odometer in 'Odomètre (km)'; the cleaned copy names it 'odometer'.
    df_clean = clean_battery_data(df, 'Odomètre (km)')

    model = get_model_name(df, dbeaver_df)
    x_data, y_data = prepare_data_for_fitting(df_clean)
    coef_mean, coef_lower, coef_upper, mean, upper_bound, lower_bound = compute_main_trendline(x_data, y_data)
    if coef_upper[0] >= 0:
        # Both bound helpers read the 'odometer' column, so they must receive the cleaned frame.
        upper_bound = compute_upper_bound(df_clean, mean, coef_mean)
    if coef_lower[0] >= 0:
        lower_bound = compute_lower_bound(df_clean, mean)
    # graph() reads 'Odomètre (km)', hence the original frame is the one plotted.
    graph(df, mean, upper_bound, lower_bound, model)