In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score

# models to test
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor


# Plot lib
import plotly.express as px

In [2]:
df = pd.read_csv('src/get_around_pricing_project.csv')

## Part 1 - Exploratory Data Analysis

In [3]:
# Basic stats
print("Number of rows + infos:")
print()
print("{}".format(df.shape[0]))
print()
print("{}" .format(df.info()))
print()

print("Display of dataset: ")
display(df.head(10))
print()

print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*df.isnull().sum()/df.shape[0])

Number of rows + infos:

4843

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4843 entries, 0 to 4842
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 4843 non-null   int64 
 1   model_key                  4843 non-null   object
 2   mileage                    4843 non-null   int64 
 3   engine_power               4843 non-null   int64 
 4   fuel                       4843 non-null   object
 5   paint_color                4843 non-null   object
 6   car_type                   4843 non-null   object
 7   private_parking_available  4843 non-null   bool  
 8   has_gps                    4843 non-null   bool  
 9   has_air_conditioning       4843 non-null   bool  
 10  automatic_car              4843 non-null   bool  
 11  has_getaround_connect      4843 non-null   bool  
 12  has_speed_regulator        4843 non-null   bool  
 13  winter_tires               4843 

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183
5,5,Citroën,152352,225,petrol,black,convertible,True,True,False,False,True,True,True,131
6,6,Citroën,205219,145,diesel,grey,convertible,True,True,False,False,True,True,True,111
7,7,Citroën,115560,105,petrol,white,convertible,True,True,False,False,False,True,True,78
8,8,Peugeot,123886,125,petrol,black,convertible,True,False,False,False,False,True,True,79
9,9,Citroën,139541,135,diesel,white,convertible,False,False,False,False,True,False,True,132



Basics statistics: 


Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4843.0,4843,4843.0,4843.0,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843.0
unique,,28,,,4,10,8,2,2,2,2,2,2,2,
top,,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,,969,,,4641,1633,1606,2662,3839,3865,3881,2613,3674,4514,
mean,2421.0,,140962.8,128.98823,,,,,,,,,,,121.214536
std,1398.198007,,60196.74,38.99336,,,,,,,,,,,33.568268
min,0.0,,-64.0,0.0,,,,,,,,,,,10.0
25%,1210.5,,102913.5,100.0,,,,,,,,,,,104.0
50%,2421.0,,141080.0,120.0,,,,,,,,,,,119.0
75%,3631.5,,175195.5,135.0,,,,,,,,,,,136.0



Percentage of missing values: 


Unnamed: 0                   0.0
model_key                    0.0
mileage                      0.0
engine_power                 0.0
fuel                         0.0
paint_color                  0.0
car_type                     0.0
private_parking_available    0.0
has_gps                      0.0
has_air_conditioning         0.0
automatic_car                0.0
has_getaround_connect        0.0
has_speed_regulator          0.0
winter_tires                 0.0
rental_price_per_day         0.0
dtype: float64

In [4]:
df['model_key'].value_counts()

Citroën        969
Renault        916
BMW            827
Peugeot        642
Audi           526
Nissan         275
Mitsubishi     231
Mercedes        97
Volkswagen      65
Toyota          53
SEAT            46
Subaru          44
Opel            33
Ferrari         33
PGO             33
Maserati        18
Suzuki           8
Porsche          6
Ford             5
KIA Motors       3
Alfa Romeo       3
Fiat             2
Lexus            2
Lamborghini      2
Mini             1
Mazda            1
Honda            1
Yamaha           1
Name: model_key, dtype: int64

### Cleaning : 
- deleting categorical values which appears once ('Mini, Mazda, Honda, Yamaha')
- to avoid preprocessing problem

In [5]:
df[df['model_key'] != 'Mini']

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,4838,Toyota,39743,110,diesel,black,van,False,True,False,False,False,False,True,121
4839,4839,Toyota,49832,100,diesel,grey,van,False,True,False,False,False,False,True,132
4840,4840,Toyota,19633,110,diesel,grey,van,False,True,False,False,False,False,True,130
4841,4841,Toyota,27920,110,diesel,brown,van,True,True,False,False,False,False,True,151


In [6]:
print(df.shape)
df = df[df['model_key'] != 'Mini']
df = df[df['model_key'] != 'Mazda']
df = df[df['model_key'] != 'Honda']
df = df[df['model_key'] != 'Yamaha']
print(df.shape)

(4843, 15)
(4839, 15)


## Part 2 - Machine Learning

In [7]:
df.columns

Index(['Unnamed: 0', 'model_key', 'mileage', 'engine_power', 'fuel',
       'paint_color', 'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day'],
      dtype='object')

In [8]:
# Separate target variable Y from features X
print("Separating labels from features...")
features_list = ['model_key', 'mileage', 'engine_power', 'fuel',
       'paint_color', 'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires']

target_variable = "rental_price_per_day"

X = df.loc[:,features_list]
Y = df.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())


Separating labels from features...
...Done.

Y : 
0    106
1    264
2    101
3    158
4    183
Name: rental_price_per_day, dtype: int64

X :
  model_key  mileage  engine_power    fuel paint_color     car_type  \
0   Citroën   140411           100  diesel       black  convertible   
1   Citroën    13929           317  petrol        grey  convertible   
2   Citroën   183297           120  diesel       white  convertible   
3   Citroën   128035           135  diesel         red  convertible   
4   Citroën    97097           160  diesel      silver  convertible   

   private_parking_available  has_gps  has_air_conditioning  automatic_car  \
0                       True     True                 False          False   
1                       True     True                 False          False   
2                      False    False                 False          False   
3                       True     True                 False          False   
4                       True     True     

In [9]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)


Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


In [10]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()


Dividing into train and test sets...
...Done.



In [11]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='mean')), # Not need because of missing values => None
    ('scaler', StandardScaler())
])


In [12]:
# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])


In [13]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [14]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()


Performing preprocessings on train set...
     model_key  mileage  engine_power    fuel paint_color   car_type  \
2038       BMW   102677           100  petrol       black  hatchback   
900    Peugeot   148986           100  diesel       black     estate   
933    Citroën   170500           135  diesel       black     estate   
2260       BMW   151334            85  diesel       white  hatchback   
3377   Citroën   207355           125  petrol       black      sedan   

      private_parking_available  has_gps  has_air_conditioning  automatic_car  \
2038                      False    False                 False          False   
900                        True     True                 False          False   
933                        True     True                 False          False   
2260                      False     True                 False          False   
3377                      False    False                 False          False   

      has_getaround_connect  has_speed

### Train Model

#### Test several models

In [15]:
# Below, this is the whole list of models I'll compare

linear = LinearRegression()
lr = LogisticRegression()
gradient = GradientBoostingRegressor()
elastic = ElasticNet()
ridge = Ridge()
lasso = Lasso()
xgb = XGBRegressor()

list_models = [
    linear,
    ridge,
    lasso,
    lr,
    gradient,
    elastic,
    xgb


    ]



In [16]:
scores_df = pd.DataFrame(columns = ['model', 'r2_score_train', 'r2_score_test'])
df_ytest = pd.DataFrame()
scores_df.index = scores_df.index + 1

for model in list_models:
    model = model
    name_model = str(model.__class__).split(".")[-1][:-2]
    model.fit(X_train, Y_train)
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)
    print()
    print(f"R2 score de {name_model} on training set : ", r2_score(Y_train, Y_train_pred))
    print(f"R2 score de {name_model} on test set : ", r2_score(Y_test, Y_test_pred))
    scores_df = scores_df.append({'model': name_model, \
    'r2_score_train': r2_score(Y_train, Y_train_pred), \
    'r2_score_test': r2_score(Y_test, Y_test_pred)}, \
    ignore_index = True)

scores_df = scores_df.sort_values(by='r2_score_test', ascending=False)
scores_df.head(10)


R2 score de LinearRegression on training set :  0.7161362935005403
R2 score de LinearRegression on test set :  0.685345880372676

R2 score de Ridge on training set :  0.7156871600816888
R2 score de Ridge on test set :  0.6855881173091058

R2 score de Lasso on training set :  0.6405107468673226
R2 score de Lasso on test set :  0.6066709283842178


  scores_df = scores_df.append({'model': name_model, \
  scores_df = scores_df.append({'model': name_model, \
  scores_df = scores_df.append({'model': name_model, \
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  scores_df = scores_df.append({'model': name_model, \



R2 score de LogisticRegression on training set :  0.5641132198560028
R2 score de LogisticRegression on test set :  0.4927691480259436

R2 score de GradientBoostingRegressor on training set :  0.7842335792764243
R2 score de GradientBoostingRegressor on test set :  0.7057033337673964

R2 score de ElasticNet on training set :  0.5735419669389648
R2 score de ElasticNet on test set :  0.5470373377215343


  scores_df = scores_df.append({'model': name_model, \
  scores_df = scores_df.append({'model': name_model, \



R2 score de XGBRegressor on training set :  0.9532057388190381
R2 score de XGBRegressor on test set :  0.723070435249132


  scores_df = scores_df.append({'model': name_model, \


Unnamed: 0,model,r2_score_train,r2_score_test
6,XGBRegressor,0.953206,0.72307
4,GradientBoostingRegressor,0.784234,0.705703
1,Ridge,0.715687,0.685588
0,LinearRegression,0.716136,0.685346
2,Lasso,0.640511,0.606671
5,ElasticNet,0.573542,0.547037
3,LogisticRegression,0.564113,0.492769


### Grid Search

In [17]:
# Perform grid search
print("Grid search...")
regressor = XGBRegressor()
# Grid of values to be tested
params = {
    'booster': ['dart'],
    'max_depth' : [4],
    'n_estimators': [90],
    'reg_alpha' : [3],
    'reg_lambda' : [1],
    'n_jobs' : [0]
    
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_)


Grid search...
...Done.
Best hyperparameters :  {'booster': 'dart', 'max_depth': 4, 'n_estimators': 90, 'n_jobs': 0, 'reg_alpha': 3, 'reg_lambda': 1}
Best R2 score :  0.7750721026705104


### TRAIN with XGBoost Regressor

In [18]:
# Train model
print("Train model...")
regressor = XGBRegressor(booster='dart', max_depth=4, n_estimators=90,
                        n_jobs=0, reg_alpha=3, reg_lambda=1)
regressor.fit(X_train, Y_train)
print("...Done.")


Train model...
...Done.


### Performance assessment

In [19]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()


Predictions on training set...
...Done.
[ 95.242676 102.79112   93.244064 ... 118.427124  99.54197  104.82127 ]



In [20]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
#print(Y_test_pred)
print()


Predictions on test set...
...Done.



In [21]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))


R2 score on training set :  0.8701778030830274
R2 score on test set :  0.7441566934337913
