In [236]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
import pickle

Import des données

In [237]:
# Load the car dataset from the project's data directory.
df = pd.read_csv(filepath_or_buffer='data/cars_infos_cleaned.csv')

Répartition cible / features

In [238]:
# Target: the price column.
y = df['Prix($)']

# Features kept for modelling. Per the original author's note, the columns
# below were left out because they would lower the model's accuracy:
#   'Modèle', 'Hauteur(cm)', 'Empattement(cm)',
#   'ConsommationAutoroute(L/100km)', 'Régime(tr/min)', 'Injecteur',
#   'TypeMoteur', 'Alésage(mm)', 'Piston(cm)', 'TauxCompression', 'CoteRisque'
feature_columns = [
    'Marque', 'Classe', 'Portes',
    'Longueur(cm)', 'Largeur(cm)',
    'Moteur(cm³)', 'Poids(t)',
    'Carburant', 'ConsommationVille(L/100km)',
    'PositionMoteur', 'Transmission',
    'Chevaux', 'Turbo', 'Cylindres',
]
X = df[feature_columns]

Séparation train/test

In [239]:
# Keep 80% of the rows for training; the remainder becomes the test set.
# The fixed seed makes the split identical from one run to the next.
split_parts = train_test_split(X, y, train_size=0.8, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Répartition des features

In [240]:
# Columns sent through the numeric transformer.
# Numeric columns not selected: 'Hauteur(cm)', 'Empattement(cm)',
# 'ConsommationAutoroute(L/100km)', 'Régime(tr/min)', 'Alésage(mm)',
# 'Piston(cm)', 'TauxCompression', 'CoteRisque'.
numeric_features = [
    'Longueur(cm)', 'Largeur(cm)',
    'Moteur(cm³)', 'Poids(t)',
    'ConsommationVille(L/100km)',
    'Chevaux', 'Cylindres',
]

# Columns sent through the categorical transformer.
# Categorical columns not selected: 'Modèle', 'Injecteur', 'TypeMoteur'.
categorical_features = [
    'Marque', 'Classe', 'Portes',
    'Carburant', 'PositionMoteur',
    'Transmission', 'Turbo',
]

Transformateur numérique

In [241]:
# Numeric branch: standardise each column first, then expand the scaled
# columns into polynomial terms up to degree 3.
numeric_steps = [
    ('standard', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3)),
]
numeric_transformer_std = Pipeline(steps=numeric_steps)

Transformateur catégoriel

In [242]:
# Categorical branch: one-hot encode into a sparse matrix. Categories that
# were not seen during fit are ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=True,
)

In [243]:
# Route each column group to its transformer. Columns that belong to neither
# group are passed through unchanged rather than dropped.
column_branches = [
    ('num', numeric_transformer_std, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(
    transformers=column_branches,
    remainder='passthrough',
)

Estimateur

In [244]:
# Random forest regressor with default hyperparameters. The seed is fixed so
# that refitting on the same data yields the same forest, and therefore the
# same score, on every run of the notebook.
randomforest = RandomForestRegressor(random_state=42)

Pipeline

In [245]:
# Full model: the column preprocessing followed by the regressor.
pipeline_steps = [
    ('prep', preprocessor),
    ('esti', randomforest),
]
pipe = Pipeline(steps=pipeline_steps)

Score

In [246]:
# Fit the whole pipeline (preprocessing + regressor) on the training split.
trained_pipe = pipe.fit(X_train, y_train)
# Evaluate on the held-out split. `score` predicts on X_test internally, so no
# separate `predict` call is needed; being the last expression, the score is
# what the cell displays.
trained_pipe.score(X_test, y_test)

0.9495503800156135

Export

In [247]:
# Persist the fitted pipeline to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
with open('data/trained_pipe.pkl', 'wb') as model_file:
    pickle.dump(trained_pipe, model_file)