# Trendlines Validation Method
Objective:
Find a reliable method to validate the calculated trend curves.
Current method in production:
- Validation based on a minimum of 20 different vehicles.

Current approach being explored:
- Use at least 50 different vehicles, including:
    - 20 vehicles with more than 100,000 km
    - 20 vehicles with less than 80,000 km



In [None]:
from core.gsheet_utils import *
from rapidfuzz import process, fuzz
from core.sql_utils import *
import re
from core.sql_utils import *

### Load data

In [None]:

engine = get_sqlalchemy_engine()
# NOTE(review): this module-level connection is never closed. It is kept only
# because other cells may still reference `con`; prefer scoped connections.
con = engine.connect()

# Load the vehicle-model catalogue (model name, id, type, make and battery
# capacity). The query runs on the scoped `connection` so that the connection
# is released as soon as the `with` block exits.
with engine.connect() as connection:
    dbeaver_df = pd.read_sql(text("""SELECT vm.model_name, vm.id, vm.type, m.make_name, b.capacity FROM vehicle_model vm
                                  join make m on m.id=vm.make_id
                                  join battery b on b.id=vm.battery_id;"""), connection)


In [None]:

# Load one row per vehicle_data record with its VIN, model name, type,
# odometer, SoH and model version. The query runs on the scoped `connection`
# (not on a long-lived global one) so the connection is released when the
# `with` block exits.
with engine.connect() as connection:
    prod_df = pd.read_sql(text("""SELECT v.vin, vm.model_name, vm.type, vd.odometer, vd.soh, vm.version  from vehicle_data vd
join vehicle v
on vd.vehicle_id=v.id
join vehicle_model vm
on v.vehicle_model_id=vm.id;"""), connection)


In [None]:
# Fetch the "Courbes OS" worksheet of the "Courbes de tendance" spreadsheet.
# NOTE(review): `df` is indexed as a 2-D grid below (`df[row, col]`), so it is
# presumably a NumPy array rather than a DataFrame -- confirm.
df = load_excel_data(get_gspread_client(), "Courbes de tendance", "Courbes OS")

# First row holds the headers, remaining rows the data; only the first
# eight columns are kept.
df_sheet = pd.DataFrame(data=df[1:, :8], columns=df[0, :8])

# Turn percentage strings such as "95 %" into fractions (0.95).
df_sheet['SoH'] = (
    df_sheet['SoH']
    .map(lambda raw_soh: raw_soh.replace('%', '').strip())
    .astype(float)
    .div(100)
)

In [None]:
def mapping_vehicle_type(type_car, make_name, model_name, db_df, battery_capacity=None):
    """Map a given vehicle to the closest model identifier in the database.
    Args:
        type_car (str): type car to find match
        make_name (str): oem car
        model_name (str): model car
        db_df (pd.DataFrame): db with all the model in dbeaver
        battery_capacity (str, optional): capacity car battery. Defaults to None.

    Returns:
        str: type le plus proche présent dans la db de vehicle_model
    """
   
    make_name = make_name.lower()
    type_car = type_car.lower()
    try:
        if len(model_name) > 4:
            d = re.findall('\d*', model_name)
            d.sort()
            model_name = d[-1]
    except:
        model_name = model_name.lower()
    # filter on OEM
    subset = db_df[db_df['make_name'] == make_name].copy()
    # Find the best match

    # Returns the closest model, score_cutoff set to 0.1 for now to ensure we almost always get a result
    match_model = process.extractOne(model_name, subset['model_name'], scorer=fuzz.token_sort_ratio, score_cutoff=.1)
    if match_model :
        match_model_name, _, _ = match_model
        # filter on model name
        subset = subset[subset['model_name']==match_model_name]
        # find the battery with the closest capacity
        try:
            if battery_capacity:
                battery_target = float(battery_capacity.lower().replace('kwh', '').strip())
                subset["distance"] = (subset["capacity"] - battery_target).abs()
                closest_rows = subset[subset["distance"] == subset["distance"].min()]
            else:
                closest_rows = subset

            # match on type
            match_type = process.extractOne(type_car, closest_rows['type'], scorer=fuzz.token_sort_ratio)
            if match_type:
                _, _, index = match_type
                return closest_rows.loc[index]["type"]

        # fallback: find the closest type without battery
        except:
            match_type = process.extractOne(type_car, subset['type'], scorer=fuzz.token_sort_ratio)
            _, _, index = match_type
            return subset.loc[index, "type"]
        
    return  "unknown"

In [None]:
# Resolve, row by row, the database type closest to each sheet entry and
# store it as a string in `model_id`.
df_sheet['model_id'] = df_sheet.apply(
    lambda sheet_row: mapping_vehicle_type(
        sheet_row['Type'],
        sheet_row['OEM'],
        sheet_row['Modèle'],
        dbeaver_df,
        sheet_row['battery_capacity'],
    ),
    axis=1,
).astype(str)

In [None]:
# Odometer readings arrive as strings with "," as thousands separator
# (e.g. "120,000"); strip the separators and convert to float.
df_sheet['Odomètre (km)'] = df_sheet['Odomètre (km)'].map(
    lambda raw_km: float(raw_km.replace(',', ''))
)

## Check conditioning

In [None]:
# Validation thresholds. The high-mileage bucket previously reused the
# 50,000 km bound, which contradicted the `nb_plus_80k` name and counted a
# vehicle at exactly 50,000 km in both buckets.
# NOTE(review): confirm the intended bounds with the notebook's stated method.
LOW_KM_MAX = 50000
HIGH_KM_MIN = 80000
MIN_TOTAL_VEHICLES = 50
MIN_VEHICLES_PER_BUCKET = 20

# Collect the (model, model_id) pairs of the sheet that have enough distinct
# vehicles overall and in each mileage bucket.
# NOTE(review): `lien` is used as the per-vehicle identifier -- confirm.
resultats = []
for modele, group in df_sheet.groupby(["Modèle", 'model_id']):
    nb_total_vins = group['lien'].nunique()
    nb_moins_50k = group[group['Odomètre (km)'] <= LOW_KM_MAX]['lien'].nunique()
    nb_plus_80k = group[group['Odomètre (km)'] >= HIGH_KM_MIN]['lien'].nunique()

    if (
        nb_total_vins >= MIN_TOTAL_VEHICLES
        and nb_moins_50k >= MIN_VEHICLES_PER_BUCKET
        and nb_plus_80k >= MIN_VEHICLES_PER_BUCKET
    ):
        resultats.append(modele)

# Print every group that satisfies the conditions.
for i in resultats:
    print(i)

In [None]:
# Validation thresholds. The high-mileage bucket previously reused the
# 50,000 km bound, which contradicted the `nb_plus_80k` name and counted a
# vehicle at exactly 50,000 km in both buckets.
# NOTE(review): confirm the intended bounds with the notebook's stated method.
LOW_KM_MAX = 50000
HIGH_KM_MIN = 80000
MIN_TOTAL_VEHICLES = 50
MIN_VEHICLES_PER_BUCKET = 20

# Collect the (model_name, type, version) groups of the production data that
# have enough distinct VINs overall and in each mileage bucket.
# NOTE(review): groupby drops rows whose key is null by default, so vehicles
# without a `version` would be ignored here -- confirm this is intended.
resultats = []
for modele, group in prod_df.groupby(["model_name", 'type', 'version']):
    nb_total_vins = group['vin'].nunique()
    nb_moins_50k = group[group['odometer'] <= LOW_KM_MAX]['vin'].nunique()
    nb_plus_80k = group[group['odometer'] >= HIGH_KM_MIN]['vin'].nunique()

    if (
        nb_total_vins >= MIN_TOTAL_VEHICLES
        and nb_moins_50k >= MIN_VEHICLES_PER_BUCKET
        and nb_plus_80k >= MIN_VEHICLES_PER_BUCKET
    ):
        resultats.append(modele)

# Print every group that satisfies the conditions.
for i in resultats:
    print(i)

Conclusion