In [401]:
import pandas as pd
from numpy import asarray
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, SGDRegressor, Ridge
import pickle

### Modèle d'entraînement

Import

In [402]:
df = pd.read_csv('data/cars_infos_cleaned.csv')

Répartition

In [403]:
# On définit notre target
y = df['Prix($)']
# Et nos features
X = df[[
    'Marque',
    'Modèle',
    'Classe', 
    'Portes', 
    'Longueur(cm)',
    'Hauteur(cm)', 
    'Largeur(cm)', 
    'Empattement(cm)', 
    'Moteur(cm³)',
    'Poids(t)', 
    'Carburant', 
    'ConsommationVille(L/100km)',
    'ConsommationAutoroute(L/100km)',
    'PositionMoteur',
    'Transmission',
    'Chevaux',
    'Régime(tr/min)', 
    'Turbo', 
    'Cylindres', 
    'Injecteur',
    'TypeMoteur', 
    'Alésage(mm)', 
    'Piston(cm)', 
    'TauxCompression',
    # 'CoteRisque'
]]

Séparation train/test

In [404]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
    )

Répartition des features

In [405]:
numeric_features = [ 
    'Longueur(cm)',
    'Hauteur(cm)', 
    'Largeur(cm)', 
    'Empattement(cm)', 
    'Moteur(cm³)',
    'Poids(t)', 
    'ConsommationVille(L/100km)',
    'ConsommationAutoroute(L/100km)',
    'Chevaux',
    'Régime(tr/min)', 
    'Cylindres', 
    'Alésage(mm)', 
    'Piston(cm)', 
    'TauxCompression',
    # 'CoteRisque'
]

categorical_features = [
    'Marque',
    'Modèle',
    'Classe', 
    'Portes', 
    'Carburant', 
    'PositionMoteur',
    'Transmission',
    'Turbo', 
    'Injecteur',
    'TypeMoteur', 
]

Transformateurs numériques

In [406]:
numeric_transformer_minmax = Pipeline([('minmax', MinMaxScaler())])
numeric_transformer_std = Pipeline([('standard', StandardScaler())])
numeric_transformer_rbst = Pipeline([('robust', RobustScaler()),])

numeric_transformers = [
    numeric_transformer_minmax,
    numeric_transformer_std,
    numeric_transformer_rbst
]

Transformateur catégorielle

In [407]:
categorical_transformer = OneHotEncoder(sparse_output=True, handle_unknown='ignore')

Estimateurs

In [408]:
lnr = LinearRegression()
lasso = Lasso(alpha=0.1)
sgd = SGDRegressor(max_iter= 5000, tol= 1e-3)
ridge = Ridge(alpha=0.5)
randomforest = RandomForestRegressor()

Essais

In [409]:
#Dictionnaires qui permettront d'enregistrer les scores des différents essais
scores = {}
transformer_dict = {
    1 : 'minmax',
    2 : 'standart',
    3 : 'robust'  
    }

1. LinearRegression

In [410]:
counter = 0
#on itère sur les transformateurs
for transformer in numeric_transformers:
    #on définit le preprocesseur
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
    #on définit le pipeline
    pipe = Pipeline([
        ('prep', preprocessor),
        ('estimator', lnr)
    ])
    trained_pipe = pipe.fit(X_train, y_train)
    trained_pipe.predict(X_test)
    score = trained_pipe.score(X_test, y_test)
    counter += 1
    #on définit un string regroupant le nom des
    scorestring = f'LNR_{transformer_dict[counter]}'
    scores[scorestring] = score
    

2. Lasso

In [411]:
counter = 0
#on recommence en changeant l'estimateur et son nom dans l'enregistreur de score
for transformer in numeric_transformers:
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
    pipe = Pipeline([
        ('prep', preprocessor),
        ('estimator', lasso)
    ])
    trained_pipe = pipe.fit(X_train, y_train)
    trained_pipe.predict(X_test)
    score = trained_pipe.score(X_test, y_test)
    counter += 1
    scorestring = f'LASSO_{transformer_dict[counter]}'
    scores[scorestring] = score

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


3. SGDRegressor

In [412]:
counter = 0
for transformer in numeric_transformers:
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
    #on définit le pipeline
    pipe = Pipeline([
        ('prep', preprocessor),
        ('estimator', sgd)
    ])
    trained_pipe = pipe.fit(X_train, y_train)
    trained_pipe.predict(X_test)
    score = trained_pipe.score(X_test, y_test)
    counter += 1
    scorestring = f'SGD_{transformer_dict[counter]}'
    scores[scorestring] = score

4. Ridge

In [413]:
counter = 0
for transformer in numeric_transformers:
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
    #on définit le pipeline
    pipe = Pipeline([
        ('prep', preprocessor),
        ('estimator', ridge)
    ])
    trained_pipe = pipe.fit(X_train, y_train)
    trained_pipe.predict(X_test)
    score = trained_pipe.score(X_test, y_test)
    counter += 1
    scorestring = f'RIDGE_{transformer_dict[counter]}'
    scores[scorestring] = score

5. RandomForestRegressor

In [414]:
counter = 0
for transformer in numeric_transformers:
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
    #on définit le pipeline
    pipe = Pipeline([
        ('prep', preprocessor),
        ('estimator', randomforest)
    ])
    trained_pipe = pipe.fit(X_train, y_train)
    trained_pipe.predict(X_test)
    score = trained_pipe.score(X_test, y_test)
    counter += 1
    scorestring = f'RANDOMFOREST_{transformer_dict[counter]}'
    scores[scorestring] = score

In [415]:
# Aperçu des scores enregistrés
scores

{'LNR_minmax': 0.8419553939436093,
 'LNR_standart': 0.8419504135148801,
 'LNR_robust': 0.8417340562668099,
 'LASSO_minmax': 0.91185046176733,
 'LASSO_standart': 0.9133664759208652,
 'LASSO_robust': 0.9133167233652554,
 'SGD_minmax': 0.8680948843540253,
 'SGD_standart': 0.8842819041400505,
 'SGD_robust': 0.8728696177825752,
 'RIDGE_minmax': 0.8631495364809691,
 'RIDGE_standart': 0.8884534052177184,
 'RIDGE_robust': 0.8895011981926942,
 'RANDOMFOREST_minmax': 0.9519184962843126,
 'RANDOMFOREST_standart': 0.9544576069066135,
 'RANDOMFOREST_robust': 0.9551854392084838}

In [416]:
# Affichage des meilleurs paramètres

bestscore = max(scores.values())
bestcombo = [k for (k, val) in scores.items() if val == bestscore][0]
print(f"Meilleur score : {bestscore}' / Transformateur : {bestcombo.split('_')[1].upper()} / Eestimateur : {bestcombo.split('_')[0]}")

Meilleur score : 0.9551854392084838' / Transformateur : ROBUST / Eestimateur : RANDOMFOREST


Nous savons alors quel duo d'estimateur et de transformateur nous gardons pour la prédiction sur l'application