# Study battery influence on residual value



In [None]:
from core.gsheet_utils import load_excel_data, get_google_client
from core.s3.s3_utils import S3Service
from core.sql_utils import get_sqlalchemy_engine
from core.spark_utils import *
import pandas as pd
import numpy as np 
import plotly.express as px

In [None]:
# Fetch the raw worksheet content, then promote its first row to column
# headers and keep the remaining rows as records.
# NOTE(review): assumes load_excel_data returns a list of rows - confirm.
df_scrapping = load_excel_data("Courbes de tendance", "Courbes OS")
df_scrapping = pd.DataFrame(data=df_scrapping[1:], columns=df_scrapping[0])

In [None]:
# Convert the text columns coming from the sheet into floats.

# Odometer: drop the ',' characters (presumably thousands separators) first.
df_scrapping['Odom√®tre (km)'] = df_scrapping['Odom√®tre (km)'].map(
    lambda raw: float(raw.replace(',', ''))
)

# SoH: drop the '%' sign before converting.
df_scrapping['SoH'] = df_scrapping['SoH'].map(
    lambda raw: float(raw.replace('%', ''))
)

# Price: empty cells become NaN so the column can be cast to float.
df_scrapping['price'] = (
    df_scrapping['price']
    .replace('', np.nan)
    .astype(float)
)

In [None]:
# Load, for every vehicle model, its OEM, make and battery attributes
# (joined through battery_id, make_id and oem_id).
engine = get_sqlalchemy_engine()
df_dbeaver = pd.read_sql(
    sql="""SELECT o.oem_name, m.make_name, vm.model_name, vm.type, vm.version, vm.autonomy, b.battery_chemistry, b.capacity, b.net_capacity FROM vehicle_model vm
join battery b on b.id=vm.battery_id 
join make m on m.id=vm.make_id
join oem o on o.id=vm.oem_id""",
    con=engine,
)

In [None]:
# Attach the database attributes to each scraped listing (default inner join
# on model name + type), keep only the columns used by the analysis, and
# give the odometer column a plain ASCII name.
df = (
    df_scrapping
    .merge(
        df_dbeaver,
        left_on=['Mod√®le', 'Type'],
        right_on=["model_name", "type"],
    )
    [[
        'oem_name',
        'make_name',
        'model_name',
        'type',
        'version',
        'autonomy',
        'battery_chemistry',
        'capacity',
        'net_capacity',
        'SoH',
        'Ann√©e',
        'price',
        'Odom√®tre (km)',
    ]]
    .rename(columns={'Odom√®tre (km)': 'odometer'})
)

## Battery influence on residual value (VR)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix, computed on listings with a strictly positive price
corr_vars = ['price', 'autonomy', 'odometer', 'SoH', 'net_capacity', 'Ann√©e']
corr_matrix = df.loc[df['price'] > 0, corr_vars].corr()

# Display only the correlations against price and odometer
px.imshow(
    corr_matrix.loc[:, ['price', 'odometer']].round(2),
    text_auto=True,
)


Le SoH et l'âge sont corrélés au prix, mais ils sont aussi très corrélés à l'odomètre.

Deux approches sont possibles :
- Prendre en compte la capacité de la batterie et l'odomètre pour expliquer le prix, sans réduire la corrélation avec le SoH et l'année.
- Traiter la corrélation du SoH et/ou de l'année.

### Retirer l'impact du SoH et de l'année sur le prix

**Année comme variable continue**

In [None]:
import statsmodels.formula.api as smf

# Analysis sample: strictly positive price, no missing value in any variable
# used below, and the year cast to int so it enters the formula as a number.
df_analysis = (
    df.loc[df['price'] > 0]
    .dropna(subset=['price', 'autonomy', 'odometer', 'SoH', 'Ann√©e', 'net_capacity'])
    .astype({'Ann√©e': int})
)

# Linear model of price on all five explanatory variables
model = smf.ols(
    'price ~ autonomy + net_capacity + odometer + SoH + Ann√©e',
    data=df_analysis,
).fit()

# Report each fitted coefficient and the R-squared in one go
# (the odometer effect is negated and scaled to 1000 km).
print("\n".join([
    "\nüìä Interpr√©tation:",
    f"- Chaque km d'autonomie suppl√©mentaire augmente le prix de {model.params['autonomy']:.2f}‚Ç¨",
    f"- Chaque kWh de capacit√© nette suppl√©mentaire augmente le prix de {model.params['net_capacity']:.2f}‚Ç¨",
    f"- Chaque 1000 km parcourus diminue le prix de {-model.params['odometer'] * 1000:.2f}‚Ç¨",
    f"- Chaque point de SoH suppl√©mentaire augmente le prix de {model.params['SoH']:.2f}‚Ç¨",
    f"- Chaque ann√©e suppl√©mentaire diminue le prix de {model.params['Ann√©e']:.2f}‚Ç¨",
    f"- R¬≤ = {model.rsquared:.3f} ‚Üí {model.rsquared*100:.1f}% de la variance expliqu√©e",
]))




In [None]:
# Fit price on SoH and year only, then subtract from each observed price the
# fitted SoH and year effects, measured as deviations from the sample means.
model = smf.ols('price ~ SoH + Ann√©e', data=df_analysis).fit()
soh_mean = df_analysis['SoH'].mean()
year_mean = df_analysis['Ann√©e'].mean()

df_analysis['price_adjusted_continue'] = df_analysis['price'].sub(
    model.params['SoH'] * df_analysis['SoH'].sub(soh_mean)
    + model.params['Ann√©e'] * df_analysis['Ann√©e'].sub(year_mean)
)


In [None]:
# Correlation of the adjusted price with every explanatory variable,
# again restricted to strictly positive prices.
corr_matrix = df_analysis.loc[
    df_analysis['price'] > 0,
    ['price_adjusted_continue', 'SoH', 'Ann√©e', 'odometer', 'autonomy', 'net_capacity'],
].corr()
px.imshow(
    corr_matrix.loc[:, ['price_adjusted_continue']].round(2),
    text_auto=True,
)

Pour le cas continu, les coefficients de corrélation pour l'année et le SoH ont diminué mais, comme on pouvait le prévoir, celui de l'odomètre aussi.

In [None]:
# NOTE(review): several titles and output file names below mention an
# "adjusted" price, but every figure plots the raw `price` column - confirm
# which one is intended.

# Autonomy buckets: the same list labels the bins and orders the plots
category_order = ['<300 km', '300-400 km', '400-500 km', '>500 km']

# Copy of the analysis sample carrying each listing's autonomy bucket
df_cat_adj = df_analysis.assign(
    autonomy_category=pd.cut(
        df_analysis['autonomy'],
        bins=[0, 300, 400, 500, 1000],
        labels=category_order,
    )
)

# Box plot: price distribution per autonomy bucket
fig = px.box(
    df_cat_adj,
    x="autonomy_category",
    y="price",
    color="autonomy_category",
    category_orders=dict(autonomy_category=category_order),
    title="Distribution des prix par cat√©gorie d'autonomie",
)
fig.show()
fig.write_html("distrib_price_adj_by_autonomy_category.html")

# Scatter: price against odometer, one OLS trendline per autonomy bucket
fig = px.scatter(
    df_cat_adj,
    x="odometer",
    y="price",
    color="autonomy_category",
    category_orders=dict(autonomy_category=category_order),
    trendline="ols",
    title="D√©pr√©ciation selon l'odom√®tre, par cat√©gorie d'autonomie",
)
fig.show()
fig.write_html("scatter_price_adj_odometer.html")

# Scatter: observed price against net battery capacity
fig = px.scatter(
    df_analysis,
    x="net_capacity",
    y="price",
    hover_data=["model_name", "type", "SoH", "price"],
    trendline="ols",
    title="Prix ajust√© en fonction de la capacit√© nette",
)
fig.show()
fig.write_html("scatter_price_adj_net_capacity.html")

# Scatter: observed price against autonomy
fig = px.scatter(
    df_analysis,
    x="autonomy",
    y="price",
    hover_data=["model_name", "type", "SoH", "price"],
    trendline="ols",
    title="Prix ajust√© en fonction de l'autonomie",
)
fig.show()
fig.write_html("scatter_price_adj_autonomie.html")



In [None]:
# `sm` is used below, but only `statsmodels.formula.api as smf` is imported
# elsewhere in this notebook; import it here so the cell does not depend on
# a name that may not exist.
import statsmodels.api as sm

# Regress the observed price on autonomy and odometer only.
X = df_analysis[['autonomy', 'odometer']]
y = df_analysis['price']

# sm.OLS does not add an intercept by itself, hence the explicit constant.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

# The odometer effect is negated and scaled to 1000 km for readability.
print("\nüìä R√©sultat de la r√©gression:")
print(f"- Chaque km d'autonomie suppl√©mentaire augmente le prix de {model.params['autonomy']:.2f}‚Ç¨")
print(f"- Chaque 1000 km parcourus diminue le prix de {-model.params['odometer'] * 1000:.2f}‚Ç¨")
print(f"- R¬≤ = {model.rsquared:.3f} ‚Üí {model.rsquared*100:.1f}% de la variance expliqu√©e")