In [2]:
#import streamlit as st
import pandas as pd
import plotly.express as px 
import plotly.graph_objects as go
import numpy as np

DATA_URL = './get_around_pricing_project.csv'

def load_data(): 
    data = pd.read_csv(DATA_URL)
    return data

data = load_data()

In [3]:
data = data.iloc[:,1:]
target = 'rental_price_per_day'

num_features = [c for c in data.columns if c != target]

def display_distribution(c):
    fig = px.scatter(data, c)
    fig.update_layout(width=700,height=500)
    fig.show()

# for c in num_features:
#     display_distribution(c)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import joblib

In [7]:
data.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [8]:
data['model_key'].value_counts()

model_key
Citroën        969
Renault        916
BMW            827
Peugeot        642
Audi           526
Nissan         275
Mitsubishi     231
Mercedes        97
Volkswagen      65
Toyota          53
SEAT            46
Subaru          44
Opel            33
Ferrari         33
PGO             33
Maserati        18
Suzuki           8
Porsche          6
Ford             5
KIA Motors       3
Alfa Romeo       3
Fiat             2
Lexus            2
Lamborghini      2
Mini             1
Mazda            1
Honda            1
Yamaha           1
Name: count, dtype: int64

In [9]:
model_counts = data['model_key'].value_counts()
data['model_key'] = data['model_key'].apply(lambda x: 'other' if model_counts[x] < 10 else x)

In [10]:
data['car_type'].unique()

array(['convertible', 'coupe', 'estate', 'hatchback', 'sedan',
       'subcompact', 'suv', 'van'], dtype=object)

In [12]:
target = 'rental_price_per_day'
features = [c for c in data.columns if c != target]

# Define categorical and numerical features
categorical_features = [
    'model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 
    'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 
    'has_speed_regulator', 'winter_tires'
]

numerical_features = ['mileage', 'engine_power']

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(drop="first")

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [13]:
a=False
if a:
    # Define models
    regressors = {
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'XGBoost': XGBRegressor()
    }
    params_rf = {
        'regressor__n_estimators': [100, 200],
        'regressor__max_depth': [10, 20],
    }

    params_gb = {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.1, 0.05],
        'regressor__max_depth': [3, 5]
    }

    params_xgb = {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.1, 0.05],
        'regressor__max_depth': [3, 5]
    }

    param_grids = {
        'Random Forest': params_rf,
        'Gradient Boosting': params_gb,
        'XGBoost': params_xgb
    }

    # Split data into train and test sets
    X = data[features]
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Evaluate each model using GridSearchCV
    results = {}
    for name, model in regressors.items():
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', model)])
        grid_search = GridSearchCV(pipeline, param_grid=param_grids[name], cv=5, scoring='neg_mean_absolute_error',verbose=2)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        results[name] = {'MAE': mae, 'MSE': mse, 'R2': r2}

    results_df = pd.DataFrame(results).T
    print(results_df)


In [14]:
regressors = {
    'XGBoost': XGBRegressor()
}

params_xgb = {
    'regressor__n_estimators': [500],
    'regressor__learning_rate': [0.0443],
    'regressor__max_depth': [5]
}
param_grids = {
    'XGBoost': params_xgb
}

# Split data into train and test sets
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to find best parameters
def find_best_params(regressors, param_grids, X_train, y_train, X_test, y_test):
    results = {}
    best_params = {}
    for name, model in regressors.items():
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', model)])
        grid_search = GridSearchCV(pipeline, param_grid=param_grids[name], cv=5, scoring='r2', verbose=0)
        # grid_search = GridSearchCV(pipeline, param_grid=param_grids[name], cv=5, scoring='neg_mean_absolute_error', verbose=2)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        joblib.dump(best_model, f'best_model_{name}.pkl')

        y_pred = best_model.predict(X_test)
        
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        results[name] = {'MAE': mae, 'MSE': mse, 'R2': r2}
        best_params[name] = grid_search.best_params_
    
    return results, best_params

# Find best parameters and results
results, best_params = find_best_params(regressors, param_grids, X_train, y_train, X_test, y_test)

# Display results
results_df = pd.DataFrame(results).T
print(results_df)

# Display best parameters
print("Best Parameters:")
for name, params in best_params.items():
    print(f"{name}: {params}")


              MAE         MSE        R2
XGBoost  10.37208  259.220416  0.753879
Best Parameters:
XGBoost: {'regressor__learning_rate': 0.0443, 'regressor__max_depth': 5, 'regressor__n_estimators': 500}


In [15]:
data.columns

Index(['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color',
       'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day'],
      dtype='object')

In [16]:
import joblib

loaded_model = joblib.load('GetAroundApi/best_model_XGBoost.pkl')
random_row = data.sample(n=1, random_state=0)
actual_value = random_row[target].values[0]
new_data = random_row.drop(columns=[target])
new_predictions = loaded_model.predict(new_data)
print(f"Prediction for the selected row: {new_predictions[0]:.2f} $/day")
print(f"Actual value for the selected row: {actual_value} $/day")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


AttributeError: Can't get attribute '_RemainderColsList' on <module 'sklearn.compose._column_transformer' from 'c:\\Users\\antoi\\Documents\\Work_Learn\\JEDHA\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py'>

In [1]:
import requests

# Define the URL of the FastAPI endpoint
# url = "http://localhost:4000/predict"
url = "https://2nzi-getaroundapi.hf.space/predict"

# Define the input data as a dictionary
input_data = {
    "brand": "Renault",
    "mileage": 10000,
    "engine_power": 100,
    "fuel": "diesel",
    "paint_color": "black",
    "car_type": "sedan",
    "private_parking_available": True,
    "has_gps": True,
    "has_air_conditioning": True,
    "automatic_car": False,
    "has_getaround_connect": True,
    "has_speed_regulator": True,
    "winter_tires": False
}

# Make a POST request to the endpoint with the input data
response = requests.post(url, params=input_data)

# Print the response from the server
print(response.json())


{'prediction': 160.41680908203125}
