In [201]:
# Create a fast API
# https://app.jedha.co/course/serve-your-model-with-api-ft/fastapi-basics-ft

import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import re
import seaborn as sns

In [202]:
# Load the pricing data; the first CSV column is used as the row index.
dataset = pd.read_csv('src/get_around_pricing_project.csv', index_col=0)

In [203]:
# Preview the first rows of the loaded data.
dataset.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [204]:
# Inspect each column's dtype; the plotting cells branch on it (object / bool / other).
dataset.dtypes

model_key                    object
mileage                       int64
engine_power                  int64
fuel                         object
paint_color                  object
car_type                     object
private_parking_available      bool
has_gps                        bool
has_air_conditioning           bool
automatic_car                  bool
has_getaround_connect          bool
has_speed_regulator            bool
winter_tires                   bool
rental_price_per_day          int64
dtype: object

In [205]:
# Data-quality checks, currently disabled by being wrapped in a string literal:
#   - percentage of missing values per column
#   - number of distinct values per column, printed next to the row count
# Original author's note: no column has missing values or a single unique value
# (TODO confirm by re-enabling the checks; they are not executed in this run).
'''
100*dataset.isnull().sum()/dataset.shape[0]
#
for cols in dataset.columns:
    print(f'{cols} : {dataset[cols].value_counts().count()}/{len(dataset)}')
'''

"\n100*dataset.isnull().sum()/dataset.shape[0]\n#\nfor cols in dataset.columns:\n    print(f'{cols} : {dataset[cols].value_counts().count()}/{len(dataset)}')\n"

In [206]:
def plot_feature_distributions(data, target_column):
    """Show one plotly figure per explanatory column of ``data``.

    Every column except ``target_column`` is plotted:

    * boolean columns get a bar chart of the *summed* target for each of the
      column's values;
    * all other columns (categorical ``object`` columns as well as numeric
      ones) get a histogram of their values, with the legend hidden.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the explanatory columns and the target column.
    target_column : str
        Name of the target column; it is aggregated in the boolean bar charts
        and is not plotted on its own.

    Returns
    -------
    None
        Figures are displayed with ``fig.show()`` as a side effect.
    """
    for column in data.drop(columns=[target_column]).columns:
        title = column.replace('_', ' ')

        if data[column].dtypes == bool:
            # Boolean feature: total of the target for each of its values.
            target_by_flag = data.groupby(column)[target_column].sum()
            fig = px.bar(
                x=target_by_flag.index,
                y=target_by_flag,
                labels=dict(x="", y=""),
            )
            fig.update_layout(title=title)
        else:
            # Categorical (object) or numeric feature: distribution of values.
            fig = px.histogram(data[column])
            fig.update_layout(title=title, showlegend=False)

        fig.show()


# Same module-level names as before, so later cells that read them still work.
dataset_plot = dataset
target = 'rental_price_per_day'
explain_values = dataset_plot.drop(columns=[target])

plot_feature_distributions(dataset_plot, target)

In [207]:
def limitValueCategoriel(dataset, column_name, number_choises):
    """Keep only the rows whose category is among the most frequent ones.

    The categories of ``dataset[column_name]`` are ranked by frequency. The
    count of the category at rank ``number_choises`` (0-based) is used as a
    threshold, and only categories that occur strictly more often than that
    threshold are kept. Without ties this keeps exactly the ``number_choises``
    most frequent categories; categories tied with the threshold are dropped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter; it is not modified.
    column_name : str
        Name of the categorical column to rank.
    number_choises : int
        Maximum number of categories to keep. When it is greater than or equal
        to the number of distinct categories, every category is kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataset`` whose ``column_name`` value is a kept category.
        Rows with a missing value in that column are never kept, because
        ``value_counts`` ignores them.
    """
    # value_counts() already returns the counts sorted in descending order.
    count_values = dataset[column_name].value_counts()

    if number_choises >= len(count_values):
        # Fewer distinct categories than requested: nothing to cut off.
        kept = count_values
    else:
        # .iloc makes the lookup positional even when the categories themselves
        # are integers (a plain [] lookup would then be label-based).
        threshold = count_values.iloc[number_choises]
        kept = count_values[count_values > threshold]

    keep_values = kept.index.tolist()

    return dataset[dataset[column_name].isin(keep_values)]

In [208]:
# NOTE: this cell overwrites `dataset`; re-running it without re-running the
# load cell applies the filters to already-filtered data.
dataset = (
    dataset
    # Keep only the dominant categories; each step works on the output of the
    # previous one, so the order matters.
    .pipe(limitValueCategoriel, 'model_key', 12)
    .pipe(limitValueCategoriel, 'fuel', 2)
    .pipe(limitValueCategoriel, 'paint_color', 6)
    .pipe(limitValueCategoriel, 'car_type', 5)
    # Keep rows with mileage strictly below 300_000 ...
    .loc[lambda frame: frame['mileage'] < 300_000]
    # ... and engine_power strictly between 50 and 250.
    .loc[lambda frame: (frame['engine_power'] > 50) & (frame['engine_power'] < 250)]
)

In [209]:
# Plot every explanatory column of the current `dataset`.
dataset_plot = dataset
target = 'rental_price_per_day'
explain_values = dataset_plot.drop(columns=[target])

for column in explain_values.columns:
    if dataset_plot[column].dtypes == bool:
        # Boolean feature: bar chart of the summed target for each value.
        cat_data = dataset_plot.groupby(column)[target].sum()
        fig = px.bar(x=cat_data.index, y=cat_data, labels=dict(x="", y=""))
        fig.update_layout(title=f"{column.replace('_', ' ')}")
    else:
        # Object (categorical) and numeric features share the same histogram.
        fig = px.histogram(dataset_plot[column])
        fig.update_layout(title=f"{column.replace('_', ' ')}", showlegend=False)

    fig.show()

In [210]:
# Save the filtered dataset; index=False means the row index is not written to the file.
dataset.to_csv(r'src/get_around_pricing_project_clean.csv', index=False)