In [58]:
import pandas as pd
import plotly.express as px 
import plotly.graph_objects as go
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')


In [59]:
# Read the Getaround pricing dataset from its S3 location, then render the
# frame (pandas truncates the display to the first and last rows).
pricing_csv_uri = "s3://getaround-bucket/get_around_pricing_project.csv"
df_pricing = pd.read_csv(pricing_csv_uri)
display(df_pricing)

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,4838,Toyota,39743,110,diesel,black,van,False,True,False,False,False,False,True,121
4839,4839,Toyota,49832,100,diesel,grey,van,False,True,False,False,False,False,True,132
4840,4840,Toyota,19633,110,diesel,grey,van,False,True,False,False,False,False,True,130
4841,4841,Toyota,27920,110,diesel,brown,van,True,True,False,False,False,False,True,151


In [3]:
# Summarize the frame: index range, column names, non-null counts and dtypes.
df_pricing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4843 entries, 0 to 4842
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 4843 non-null   int64 
 1   model_key                  4843 non-null   object
 2   mileage                    4843 non-null   int64 
 3   engine_power               4843 non-null   int64 
 4   fuel                       4843 non-null   object
 5   paint_color                4843 non-null   object
 6   car_type                   4843 non-null   object
 7   private_parking_available  4843 non-null   bool  
 8   has_gps                    4843 non-null   bool  
 9   has_air_conditioning       4843 non-null   bool  
 10  automatic_car              4843 non-null   bool  
 11  has_getaround_connect      4843 non-null   bool  
 12  has_speed_regulator        4843 non-null   bool  
 13  winter_tires               4843 non-null   bool  
 14  rental_p

In [4]:
# Remove the index columns that earlier `to_csv` round-trips left in the file
# ('Unnamed: 0', and 'Unnamed: 0.1' when the file was saved twice).
# Selecting the columns that are actually present and reassigning (instead of
# dropping a hard-coded name with inplace=True) keeps this cell re-runnable:
# a second execution finds nothing to drop rather than raising a KeyError.
index_artifact_columns = [
    col for col in df_pricing.columns if str(col).startswith('Unnamed:')
]
df_pricing = df_pricing.drop(columns=index_artifact_columns)

MODELS


In [6]:
# Continuous inputs.
numeric_features = ['mileage', 'engine_power']

# Discrete inputs: string-valued columns followed by the boolean equipment flags.
categorical_features = [
    'model_key',
    'fuel',
    'paint_color',
    'car_type',
    'private_parking_available',
    'has_gps',
    'has_air_conditioning',
    'automatic_car',
    'has_getaround_connect',
    'has_speed_regulator',
    'winter_tires',
]

# Every model input, in the column order used when slicing the raw frame.
all_features = [
    'model_key',
    'mileage',
    'engine_power',
    'private_parking_available',
    'has_gps',
    'fuel',
    'paint_color',
    'car_type',
    'has_air_conditioning',
    'automatic_car',
    'has_getaround_connect',
    'has_speed_regulator',
    'winter_tires',
]

In [7]:
# Name of the column the models are trained to predict.
y_target = "rental_price_per_day"

# Feature matrix (columns listed in all_features) and target vector.
X = df_pricing[all_features]
Y = df_pricing[y_target]


In [8]:
# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=5
)

# Numeric columns are standardized.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical and boolean columns are one-hot encoded; drop='first' removes one
# level per feature.
# NOTE(review): the encoder keeps the default handle_unknown='error', so a
# category that only appears in the test split makes transform() raise.
categorical_transformer = Pipeline(
    steps=[('encoder', OneHotEncoder(drop='first'))]
)

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Learn the scaling/encoding on the training split only, then apply the same
# fitted transformation to the test split.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


linear regression

In [60]:
# Baseline: ordinary least squares on the preprocessed training split.
lin_reg = LinearRegression().fit(X_train, Y_train)

# In-sample and held-out predictions.
Y_train_pred_lr = lin_reg.predict(X_train)
Y_test_pred_lr = lin_reg.predict(X_test)

# r2-score of each of the 10 cross-validation folds taken from the training
# split; its standard deviation is reported below as a stability measure.
cv_r2_lr = cross_val_score(lin_reg, X_train, Y_train, cv=10, scoring='r2')

print("r2-score on train set           :", r2_score(Y_train, Y_train_pred_lr))
print("Standard deviation on train set :", cv_r2_lr.std())
print("r2-score on test set            :", r2_score(Y_test, Y_test_pred_lr))

r2-score on train set           : 0.7115141026470525
Standard deviation on train set : 0.056299027756351754
r2-score on test set            : 0.7085807828114719


decision tree regressor

In [61]:
# NOTE(review): this import belongs with the other imports in the first cell.
from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree with a fixed seed.
model_dt = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X_train, Y_train)

# In-sample and held-out predictions.
Y_train_pred_dt = model_dt.predict(X_train)
Y_test_pred_dt = model_dt.predict(X_test)

# r2-score of each of the 10 cross-validation folds taken from the training
# split; its standard deviation is reported below as a stability measure.
cv_r2_dt = cross_val_score(model_dt, X_train, Y_train, cv=10, scoring='r2')

print("r2-score on train set           :", r2_score(Y_train, Y_train_pred_dt))
print("Standard deviation on train set :", cv_r2_dt.std())
print("r2-score on test set            :", r2_score(Y_test, Y_test_pred_dt))

r2-score on train set           : 0.8577817304549757
Standard deviation on train set : 0.06541055340747982
r2-score on test set            : 0.6865506226193911


xgb regressor

In [55]:
# Tune an XGBoost regressor with an exhaustive grid search on the training split.
xgboost = xgb.XGBRegressor()

# Grid of values to be tested (5 * 6 * 6 = 180 candidates).
# NOTE(review): the recorded best values for min_child_weight (10) and
# n_estimators (12) are the largest ones in this grid, so the optimum may lie
# outside it; consider widening both ranges.
params = {
    'max_depth': [6, 8, 10, 12, 14],
    'min_child_weight': [1, 2, 4, 6, 8, 10],
    'n_estimators': [2, 4, 6, 8, 10, 12]
}

# cv: the number of folds used for cross-validation.
# scoring='r2' spells out the metric used to rank candidates (the regressor's
# default score), matching the scoring used for the other models.
gridsearch_xgb = GridSearchCV(
    xgboost, param_grid=params, scoring='r2', cv=3, verbose=1
)
gridsearch_xgb.fit(X_train, Y_train)

print("Best hyperparameters : ", gridsearch_xgb.best_params_)
# best_score_ is the mean cross-validated r2 of the best candidate; this is a
# regression task, so it is not an accuracy.
print("Best cross-validated r2 score : ", gridsearch_xgb.best_score_)

# Predictions of the refitted best estimator on the train and test sets
Y_train_pred_xgb = gridsearch_xgb.predict(X_train)
Y_test_pred_xgb = gridsearch_xgb.predict(X_test)

print("r2 score on train set : ", r2_score(Y_train, Y_train_pred_xgb))
print("r2 score on test set : ", r2_score(Y_test, Y_test_pred_xgb))



Fitting 3 folds for each of 180 candidates, totalling 540 fits
Best hyperparameters :  {'max_depth': 10, 'min_child_weight': 10, 'n_estimators': 12}
Best validation accuracy :  0.7251482716205183
r2 score on train set :  0.8659716352807807
r2 score on test set :  0.7836270772184544


- Linear Regression performs decently, with a comparable R2-score on both training and test sets, indicating no evident overfitting but potential for improvement.
- Decision Tree shows significantly better performance on the training set compared to the test set, suggesting overfitting.
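- XGBoost Regressor outperforms both Linear Regression and Decision Tree on the test set (R2-score of 0.78 versus 0.71 and 0.69). Its train/test gap (0.87 vs 0.78) is much smaller than the Decision Tree's (0.86 vs 0.69) but larger than Linear Regression's, so it still overfits to some degree.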
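- The XGBoost hyperparameters were selected by grid search on the mean cross-validated R2-score (0.73 for the best combination, over 3 folds), so the chosen configuration is the one that generalized best across validation folds rather than the one that fits the training set best.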


In summary, the XGBoost Regressor appears to be the preferred model due to its superior ability to generalize on unseen data and its higher performance on the test set compared to the other two models.