In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [2]:
df = pd.read_csv('data/get_around_pricing_project.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [3]:
# removing outliers 3 standard deviations away from mean for mileage and rental price per day
outliers = df.loc[df['mileage'] < df['mileage'].mean() + df['mileage'].std()*3]
outliers = outliers.loc[outliers['mileage'] > df['mileage'].mean() - df['mileage'].std()*3]
outliers = outliers.loc[outliers['rental_price_per_day'] < outliers['rental_price_per_day'].mean() + outliers['rental_price_per_day'].std()*3]
outliers = outliers.loc[outliers['rental_price_per_day'] > outliers['rental_price_per_day'].mean() - outliers['rental_price_per_day'].std()*3]
outliers = outliers.loc[outliers['engine_power'] > outliers['engine_power'].mean() - outliers['engine_power'].std()*3]
df = outliers.loc[outliers['engine_power'] < outliers['engine_power'].mean() + outliers['engine_power'].std()*3]

In [4]:
target = 'rental_price_per_day'

X = df.drop(['rental_price_per_day'], axis=1)
y = df.loc[:, target]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
X.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True
5,Citroën,152352,225,petrol,black,convertible,True,True,False,False,True,True,True


In [11]:
numeric_features = [1,2]
categorical_features = [0,3,4,5,6,7,8,9,10,11,12]

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

### 1. Linear Regression

In [12]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Predictions
train_pred = lin_reg.predict(X_train)
test_pred = lin_reg.predict(X_test)
print(f"Score on train set: {r2_score(y_train, train_pred)}")
print(f"Score on test set: {r2_score(y_test, test_pred)}")

Score on train set: 0.742563206766329
Score on test set: 0.6958168005935483


In [13]:
# Obtenir les coefficients du modèle
coefficients = lin_reg.coef_

# Afficher les coefficients associés à chaque variable
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef}")

model_key: -12.042921942103332
mileage: 11.724595534958027
engine_power: -4.318482178039275
fuel: 2.5287870395512244
paint_color: -3.365703717083777
car_type: -5.678882161803463
private_parking_available: 5.339855903552572
has_gps: -40.35373087670303
has_air_conditioning: -21.776817288735728
automatic_car: 1.952561511115703
has_getaround_connect: -4.402978199679985
has_speed_regulator: 5.8007421083506845
winter_tires: -49.122287968889005


In [18]:
from sklearn.model_selection import GridSearchCV

gbr = GradientBoostingRegressor()
params = {
    'max_leaf_nodes': [20,30,40,50,60],
    'max_depth' : [6,8,10,12,16],
    'min_samples_leaf' : [2,4,6,8,10],
    'n_estimators' : [75,100,125,150]
}

gridsearch = GridSearchCV(gbr, param_grid = params, n_jobs=-1, cv = 5,scoring='r2') 
gridsearch.fit(X_train, y_train)
print("Best: %f using %s" % (gridsearch.best_score_, gridsearch.best_params_))

Best: 0.810116 using {'max_depth': 8, 'max_leaf_nodes': 40, 'min_samples_leaf': 2, 'n_estimators': 150}


In [26]:
gbr = GradientBoostingRegressor(max_depth= 8, max_leaf_nodes= 40, min_samples_leaf=2, n_estimators= 150)
gbr.fit(X_train, y_train)

train_pred = gbr.predict(X_train)
test_pred = gbr.predict(X_test)

print("score on train set : ", r2_score(y_train, train_pred))
print("score on test set : ", r2_score(y_test, test_pred))

score on train set :  0.9376804770192795
score on test set :  0.7977032062571631


### API

In [None]:
joblib.dump(gbr,'api/gbr_model.pkl')
joblib.dump(preprocessor,'api/preprocessor.pkl')

In [36]:
example_input = {
  "model_key": "Renault","mileage": 65000,"engine_power": 110,"fuel": "diesel","paint_color": "black","car_type": "SUV","private_parking_available": True,
  "has_gps": True,"has_air_conditioning": True,"automatic_car": False,"has_getaround_connect": True,"has_speed_regulator": True,"winter_tires": True
  }

In [37]:
input_df = pd.DataFrame(columns=list(example_input.keys()))
input_df.loc[0] = list(example_input.values())

In [38]:
X = input_df
X = preprocessor.transform(X)
pred = gbr.predict(X)
print('Predicted rental price per day : ', pred[0])

Predicted rental price per day :  136.52987177766812


In [None]:
import requests
url = 'http://localhost:8000/predict'
request_pred = requests.post(url, json={ "model_key": "Ford", "mileage": 50000, "engine_power": 115, "fuel": "diesel", "paint_color": "blue", "car_type": "convertible", 
"private_parking_available": True, "has_gps": True, "has_air_conditioning": True, "automatic_car": False, "has_getaround_connect": True, "has_speed_regulator": True, "winter_tires": False })
print(f"Rental price prediction for this car : {round(request_pred.json()['prediction'],2)} $")