# Residual value

The goal is to estimate the residual value (resale price) of a vehicle from its characteristics, such as make, battery, state of health, odometer and year.

In [None]:
from core.gsheet_utils import load_excel_data
from core.sql_utils import get_sqlalchemy_engine
from core.spark_utils import *
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from activation.config.mappings import mapping_vehicle_type

## Data 

### Load

In [None]:
# Pull the raw sheet rows; the first row carries the column headers.
raw_rows = load_excel_data("Courbes de tendance", "Courbes OS")
header_row = raw_rows[:1][0]
data_rows = raw_rows[1:]
df_scrapping = pd.DataFrame(data=data_rows, columns=header_row).rename(columns={'OEM': 'Make'})

In [None]:
# Tag each listing with the marketplace named in its URL ('lien' column).
# Keywords are applied in order, so when a link mentions several of them the
# last one listed wins -- the same precedence as a series of independent `if`s.
source_keywords = ("aramis", "spoticar", "ev-market", "autosphere")
df_scrapping['source'] = ''
for keyword in source_keywords:
    # regex=False: plain substring test; na=False: a missing link stays untagged
    # instead of raising.
    keyword_mask = df_scrapping['lien'].str.contains(keyword, regex=False, na=False)
    df_scrapping.loc[keyword_mask, 'source'] = keyword
        

In [None]:
# Map the sheet's (partly French) headers onto the snake_case names used below.
column_aliases = {
    "Make": "make_name",
    "Odomètre (km)": "odometer",
    "Année": "year",
    "SoH": "soh",
    "Modèle": "model_name",
}
df_scrapping = df_scrapping.rename(columns=column_aliases)

In [None]:
# Vehicle models joined with their battery and make, read from the database.
vehicle_model_query = """SELECT vm.id, vm.model_name, vm.type, vm.version, vm.autonomy, b.battery_chemistry, b.capacity, b.net_capacity, m.make_name FROM vehicle_model vm
join battery b on b.id=vm.battery_id
join make m on m.id=vm.make_id"""
engine = get_sqlalchemy_engine()
df_dbeaver = pd.read_sql(vehicle_model_query, engine)

In [None]:
# Lower-case the scraped model names before they are matched against the catalogue.
# `.str.lower()` keeps missing values as NaN, whereas `str.lower(x)` raises on them.
df_scrapping['model_name'] = df_scrapping['model_name'].str.lower()

In [None]:
# Display the scraped listings to eyeball the result of the steps above.
df_scrapping

In [None]:
# Listings whose model name is absent or explicitly flagged as "unknown".
model_names = df_scrapping['model_name']
missing_model_mask = model_names.isna() | model_names.eq("unknown")
df_scrapping[missing_model_mask]

In [None]:
def safe_mapping_vehicle_type(row):
    """Look up the catalogue entry for one scraped row without ever raising.

    Args:
        row: Mapping-like row exposing 'model_name', 'Type' and 'make_name'.

    Returns:
        The value returned by ``mapping_vehicle_type`` for this row (used
        downstream as the join key against ``df_dbeaver['id']``), or NaN when
        the model name is missing, empty or "unknown", or when the lookup
        raises.
    """
    model_name = row['model_name']
    # pd.isna comes first: NaN is truthy, so `not model_name` alone would let a
    # missing name through to the lookup.
    if pd.isna(model_name) or not model_name or model_name == "unknown":
        return np.nan
    try:
        return mapping_vehicle_type(row['Type'], row['make_name'], model_name, df_dbeaver)
    except Exception as e:
        # Best-effort by design: report the failing row and keep going instead
        # of aborting the whole DataFrame.apply.
        print(f"⚠️ Erreur sur {row['make_name']} {model_name}: {e}")
        return np.nan

# Row-wise lookup; rows that cannot be mapped end up with NaN in 'type_2'.
df_scrapping['type_2'] = df_scrapping.apply(safe_mapping_vehicle_type, axis=1)

In [None]:
# Enrich every listing with the catalogue row its 'type_2' points at.
# Left join: listings without a match are kept, with NaN catalogue fields.
merged_listings = df_scrapping.merge(df_dbeaver, how='left', left_on=['type_2'], right_on=['id'])

# Keep the scraped make (_x) and the catalogue model name (_y) plus the modelling columns.
kept_columns = [
    "make_name_x", "model_name_y", "autonomy", "battery_chemistry", "capacity",
    "net_capacity", "soh", "odometer", "year", "source", "price",
]
df_info = merged_listings[kept_columns].rename(
    columns={"make_name_x": "make", "model_name_y": "model"}
)

### Preparation

In [None]:
# Relative frequency of each make among the listings (normalize=True -> fractions).
df_info.make.value_counts(normalize=True)

In [None]:
# Display the merged frame before type conversion.
df_info

In [None]:
def _parse_number(value, strip_chars=""):
    """Convert one raw sheet cell to float.

    Args:
        value: Raw cell content (string, number or missing value).
        strip_chars: Characters removed before conversion (e.g. '%' or
            thousands separators).

    Returns:
        The parsed float, or NaN when the cell is missing or blank.

    Raises:
        ValueError: If the cleaned text is not a valid number, so malformed
            cells still surface instead of being silently dropped.
    """
    if pd.isna(value):
        return np.nan
    text = str(value)
    for char in strip_chars:
        text = text.replace(char, "")
    text = text.strip()
    return float(text) if text else np.nan


# One parser for all four columns, so a missing value (NaN/None) is handled the
# same way as an empty string instead of crashing on `.replace`.
df_info['price'] = df_info['price'].apply(_parse_number)
df_info['soh'] = df_info['soh'].apply(_parse_number, strip_chars='%')
df_info['odometer'] = df_info['odometer'].apply(_parse_number, strip_chars=', ')
df_info['year'] = df_info['year'].apply(_parse_number)

# Keep only fully populated rows, then restore an integer year now that no NaN is left.
df_info = df_info.dropna().reset_index(drop=True)
df_info['year'] = df_info['year'].astype(int)

In [None]:
# (rows, columns) remaining after cleaning and dropna.
df_info.shape

## Models

In [None]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Stand-alone regressor instance; the pipeline below builds its own forest.
model = RandomForestRegressor()

# Feature groups. 'source' is currently not part of the categorical features.
categorical = ['make', 'battery_chemistry'] # source
numerical = ['odometer', 'year', 'soh', 'net_capacity']

# One-hot encode the categoricals, standardise the numericals, and pass any
# other column through untouched.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
standard_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical),
        ('num', standard_scaler, numerical),
    ],
    remainder='passthrough',
    force_int_remainder_cols=False,
)

# Seeded forest so that repeated runs give the same model.
seeded_forest = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[('preprocess', preprocessor), ('model', seeded_forest)])



### Train

In [None]:
# Target first, then the feature frame: everything except the target itself,
# the 'model' name and 'capacity'.
y = df_info['price']
X = df_info.drop(columns=["price", "model", 'capacity'])

In [None]:
# (rows, columns) of the feature frame, 'source' still included at this point.
X.shape

In [None]:
# 'source' is not a model feature, so it is removed before splitting.
model_features = X.drop(columns=['source'])

# Seeded 80/20 split, then fit the whole preprocessing + forest pipeline.
X_train, X_test, y_train, y_test = train_test_split(
    model_features, y, train_size=0.8, random_state=42
)
pipeline.fit(X_train, y_train)

## pred

In [None]:
# Predicted prices for the held-out rows, in the same order as X_test.
y_pred = pipeline.predict(X_test)

In [None]:
# Predictions next to the true prices; y_test's index is reset so both columns
# line up on the same 0..n-1 positions.
true_prices = y_test.reset_index(drop=True)
res = pd.DataFrame({'pred': y_pred}).assign(true=true_prices)


In [None]:

# Expose the row position as an 'index' column to use as the x axis.
df = res.reset_index()

fig = go.Figure()

# One marker trace per series: predictions first, then the true values.
for column, label in (("pred", "Pred"), ("true", "True")):
    fig.add_trace(go.Scatter(x=df["index"], y=df[column], mode="markers", name=label))

fig.update_layout(
    title="Predicted vs True Values",
    xaxis_title="Index",
    yaxis_title="Value",
)

## eval

In [None]:
# Mean absolute error on the held-out rows, rounded to the nearest whole unit of `price`.
round(mean_absolute_error(y_test, y_pred))

In [None]:
# Mean absolute percentage error on the held-out rows, expressed in percent with two decimals.
round((mean_absolute_percentage_error(y_test, y_pred) * 100), 2)

## Check feature importance

In [None]:
# Display the full feature frame (still including 'source').
X

In [None]:
# Refit on ALL rows so the importances below reflect the whole dataset.
# NOTE: from here on the pipeline has seen the test rows, so re-running the
# pred/eval cells without re-running the train cell no longer gives held-out metrics.
pipeline.fit(X.drop(columns=['source']), y)  # remove drop if source in pipeline / add drop(columns=['source']) if not

In [None]:
# Pair each transformed column name with the importance the fitted forest gave it.
fitted_preprocessor = pipeline.named_steps['preprocess']
fitted_forest = pipeline.named_steps['model']
feature_names = fitted_preprocessor.get_feature_names_out()
importances = fitted_forest.feature_importances_

# Most important features first.
importance_table = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_table.sort_values(by='importance', ascending=False)

In [None]:
# Same table ordered by feature name instead of by importance.
importance_df.sort_values("feature")

In [None]:
# Total importance carried by the columns whose name contains "make".
# Only the numeric column is summed: summing the whole frame would also
# concatenate every matching feature name into one long string.
make_feature_mask = importance_df["feature"].str.contains("make", case=False, na=False)
importance_df.loc[make_feature_mask, "importance"].sum()

In [None]:
# Total importance carried by the columns whose name contains "battery".
# Only the numeric column is summed: summing the whole frame would also
# concatenate every matching feature name into one long string.
battery_feature_mask = importance_df["feature"].str.contains("battery", case=False, na=False)
importance_df.loc[battery_feature_mask, "importance"].sum()