In [89]:
import pandas as pd
from numpy import asarray
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
import pickle

### Régression Linéaire

Import

In [90]:
# Load the cleaned car dataset from the project's data directory.
cars_csv_path = 'data/cars_infos_cleaned.csv'
df = pd.read_csv(cars_csv_path)

Répartition

In [91]:
# Target: the price column.
y = df['Prix($)']

# Predictors: every column listed below, in this order.
# 'CoteRisque' is not selected (it was commented out of the list).
feature_columns = [
    'Marque', 'Modèle', 'Classe', 'Portes',
    'Longueur(cm)', 'Hauteur(cm)', 'Largeur(cm)', 'Empattement(cm)',
    'Moteur(cm³)', 'Poids(t)', 'Carburant',
    'ConsommationVille(L/100km)', 'ConsommationAutoroute(L/100km)',
    'PositionMoteur', 'Transmission', 'Chevaux', 'Régime(tr/min)',
    'Turbo', 'Cylindres', 'Injecteur', 'TypeMoteur',
    'Alésage(mm)', 'Piston(cm)', 'TauxCompression',
]
X = df[feature_columns]

Préparation du pipeline

In [92]:
# Split features and target together: 80% of the rows go to training,
# with a fixed seed so the split is reproducible.
split_parts = train_test_split(X, y, train_size=0.8, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [93]:
# Columns routed to the numeric scaler of the preprocessing step.
# 'CoteRisque' is not included (it was commented out of the list).
numeric_features = [
    'Longueur(cm)', 'Hauteur(cm)', 'Largeur(cm)', 'Empattement(cm)',
    'Moteur(cm³)', 'Poids(t)',
    'ConsommationVille(L/100km)', 'ConsommationAutoroute(L/100km)',
    'Chevaux', 'Régime(tr/min)', 'Cylindres',
    'Alésage(mm)', 'Piston(cm)', 'TauxCompression',
]

# Columns routed to the one-hot encoder of the preprocessing step.
categorial_features = [
    'Marque', 'Modèle', 'Classe', 'Portes', 'Carburant',
    'PositionMoteur', 'Transmission', 'Turbo', 'Injecteur', 'TypeMoteur',
]

In [94]:
# Numeric transformers: one single-step pipeline per scaling strategy.
numeric_transformer_minmax = Pipeline([('minmax', MinMaxScaler())])
numeric_transformer_std = Pipeline([('standard', StandardScaler())])
# The step was previously labelled 'standard' (copy-paste from the line
# above) although it wraps a RobustScaler; the label now matches the scaler.
# The step name only affects parameter paths (get_params / set_params),
# not the transformation itself.
numeric_transformer_rbst = Pipeline([('robust', RobustScaler())])

In [95]:
# One-hot encoder for the categorical columns: produces a sparse matrix and
# ignores categories that were not seen during fit instead of raising.
categorial_transformer = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=True,
)

In [96]:
def _build_preprocessor(numeric_transformer):
    """Build an unfitted ColumnTransformer for one numeric scaling strategy.

    Parameters
    ----------
    numeric_transformer : estimator
        Transformer applied to the columns listed in ``numeric_features``.

    Returns
    -------
    ColumnTransformer
        Applies ``numeric_transformer`` to ``numeric_features`` and
        ``categorial_transformer`` to ``categorial_features``; columns in
        neither list are passed through unchanged.
    """
    return ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorial_transformer, categorial_features),
        ],
        remainder='passthrough',
    )


# One preprocessor per scaling strategy. They differ only in the numeric
# transformer, so all three come from the same helper instead of three
# copy-pasted ColumnTransformer definitions.
preprocessor_minmax = _build_preprocessor(numeric_transformer_minmax)
preprocessor_std = _build_preprocessor(numeric_transformer_std)
preprocessor_rbst = _build_preprocessor(numeric_transformer_rbst)

Estimateur

In [97]:
# Final estimator of the pipeline: ordinary least squares linear regression
# (fit_intercept=True is the library default, spelled out here).
lnr = LinearRegression(fit_intercept=True)

In [101]:
# Chain the robust-scaling preprocessor with the linear regression estimator.
pipe = Pipeline([
    ('prep', preprocessor_rbst),
    ('lnr', lnr)
])

# fit() returns the fitted pipeline, which is kept under its own name.
trained_pipe = pipe.fit(X_train, y_train)

# Keep the held-out predictions in a variable: the bare predict() call was
# computed and then discarded, since only the cell's last expression is shown.
y_pred = trained_pipe.predict(X_test)

# Last expression of the cell: the pipeline's score on the test split
# (for scikit-learn regressors this is the R² coefficient).
trained_pipe.score(X_test, y_test)

0.8417340562668099

In [102]:
# Persist the fitted pipeline to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling fails; the previous
# bare open(...) call left the handle open.
with open('data/trained_pipe.pkl', 'wb') as model_file:
    pickle.dump(trained_pipe, model_file)