In [1]:
import pandas as pd
# Notebook-wide pandas display settings: never truncate rows, columns,
# line width or cell content when a frame is printed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
# Fully-qualified option name: the bare 'float_format' only resolves through
# pandas' pattern matching, which breaks as soon as another option name matches it.
pd.set_option('display.float_format', '{:f}'.format)
import numpy as np
import time
import os
import joblib

# model
from xgboost import XGBRegressor

# metrics and tools
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# plot lib
from matplotlib import pyplot as plt
import plotly.express as px


In [2]:
# Load the pricing dataset from the CSV file under src/ into `df`;
# the feature matrix and target are sliced out of this frame further down.
df = pd.read_csv('src/get_around_pricing_project.csv')

## Machine learning
1. Train my best model on all the data
2. Export the model for API deployment

### Preprocessing

In [3]:
# Split the dataset into the feature matrix X and the target vector Y
print("Separating labels from features...")

target_variable = "rental_price_per_day"
features_list = [
    'model_key', 'mileage', 'engine_power', 'fuel',
    'paint_color', 'car_type',
    'private_parking_available', 'has_gps', 'has_air_conditioning',
    'automatic_car', 'has_getaround_connect', 'has_speed_regulator',
    'winter_tires',
]

# Column selection only: every row of df is kept
X = df[features_list]
Y = df[target_variable]

print("...Done.")
print()

# Quick preview of both objects (first rows only)
print('Y : ', Y.head(), '', sep='\n')
print('X :', X.head(), sep='\n')


Separating labels from features...
...Done.

Y : 
0    106
1    264
2    101
3    158
4    183
Name: rental_price_per_day, dtype: int64

X :
  model_key  mileage  engine_power    fuel paint_color     car_type  \
0   Citroën   140411           100  diesel       black  convertible   
1   Citroën    13929           317  petrol        grey  convertible   
2   Citroën   183297           120  diesel       white  convertible   
3   Citroën   128035           135  diesel         red  convertible   
4   Citroën    97097           160  diesel      silver  convertible   

   private_parking_available  has_gps  has_air_conditioning  automatic_car  \
0                       True     True                 False          False   
1                       True     True                 False          False   
2                      False    False                 False          False   
3                       True     True                 False          False   
4                       True     True     

In [4]:
# Automatically detect names of numeric/categorical columns.
# A column counts as numeric when its dtype name contains 'float' or 'int';
# every other dtype (strings, booleans, ...) is treated as categorical.
numeric_features = []
categorical_features = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
for column_name, column_dtype in X.dtypes.items():
    dtype_name = str(column_dtype)
    if ('float' in dtype_name) or ('int' in dtype_name):
        numeric_features.append(column_name)
    else:
        categorical_features.append(column_name)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)


Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


In [5]:
# Numeric branch of the preprocessing: standardise every numeric column.
# No imputation step: per the author's note the dataset has no missing values
# (add a SimpleImputer before the scaler if that ever changes).
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])


In [6]:
# Categorical branch of the preprocessing: one-hot encode every categorical column.
# drop='first' removes the first level of each feature to avoid creating
# correlations between the generated dummy columns.
# NOTE(review): handle_unknown is left at its default, so a category never seen
# during fit presumably raises at predict time — confirm this suits the API.
categorical_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(drop='first')),
    ]
)


In [7]:
# Single preprocessing object: route the numeric columns to the scaler branch
# and the categorical columns to the one-hot branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


### Train model on all data

In [8]:
# Instantiate the XGBoost regressor used as the final estimator
# (hyper-parameters presumably come from an earlier tuning step — TODO confirm)
model = XGBRegressor(
    booster='dart',
    max_depth=4,
    n_estimators=90,
    n_jobs=0,
    reg_alpha=3,
    reg_lambda=1,
)

In [9]:
# Chain the preprocessing and the regressor into one estimator, so the exported
# object accepts the raw feature columns directly.
pipe = make_pipeline(preprocessor, model)
# Fit on every row of X / Y (no train/test split in this notebook);
# left as the last expression so the fitted pipeline is displayed.
pipe.fit(X, Y)

### Export (preprocessing + model) to JOBLIB File

In [10]:
# Timestamp (YYYYMMDD-HHMMSS) used as a prefix for the exported files
timestr = time.strftime("%Y%m%d-%H%M%S")
# Name of the final pipeline step (the estimator), read through the public
# `steps` attribute rather than the instance __dict__; [-1] always targets
# the last step, whatever the number of preprocessing steps.
name_model = pipe.steps[-1][0]
# Save the whole fitted pipeline (preprocessing + model) as one joblib file;
# left as the last expression so the written path is displayed.
joblib.dump(pipe, f'src/{timestr}-{name_model}.joblib')

['src/20230109-153321-xgbregressor.joblib']

In [11]:
# Record the feature list and the target name next to the exported model.
# Built under its own name so the dataset held in `df` is not overwritten,
# which keeps the earlier cells re-runnable without reloading the CSV.
# The scalar target name is broadcast to every row, one row per feature.
features_df = pd.DataFrame({
    'feature_list': features_list,
    'target_variable': target_variable,
})
features_df.to_csv(f'src/{timestr}-features_dataframe.csv')