In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print them one per line (same traversal order as os.walk).
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Predicting Used Car Prices: A Data Science Exploration

## Introduction

Welcome to this interactive exploration in our Jupyter notebook where we will traverse the intricate landscape of used car prices. As data scientists, our task is to scrutinize, analyze, and interpret complex datasets, teasing out the hidden patterns and underlying structures within the data.

In this notebook, we'll dive into a dataset that captures a diverse spectrum of used car features along with their associated prices. From brand, model, and age to more granular details like mileage and condition, this dataset provides an exciting opportunity to investigate the multifaceted world of used cars.

Our objective here is twofold. Firstly, we aim to gain a deep understanding of our data - its characteristics, distributions, and inherent relationships. Secondly, we aspire to leverage this understanding to build a predictive model that can accurately estimate the price of a used car based on its features.

This journey will involve a variety of techniques and processes central to the field of data science, including but not limited to:

- **Data Understanding and Exploration:** Through Exploratory Data Analysis (EDA), we will familiarize ourselves with the dataset, identify patterns, and generate hypotheses about potential relationships in the data.

- **Data Preparation:** We will clean the data, handle missing values, and perform necessary transformations to prepare it for modeling. This step may also include feature engineering, where we create new variables from existing ones to enhance our model's predictive power.

- **Modeling:** We'll apply suitable machine learning algorithms to the data to construct a predictive model. We will iterate over different models, tune hyperparameters, and assess their performance to select the best model.

- **Evaluation and Validation:** The performance of our model will be evaluated using suitable metrics and validation techniques to ensure its reliability and robustness.

It's essential to remember that data science is not a linear process, but rather an iterative one. Throughout our journey, we may loop back to earlier steps based on what we learn from subsequent stages.

So, without further ado, let's dive into this data science expedition, and uncover the story that lies within our used car prices dataset!

Let's read the data

In [None]:
df = pd.read_csv('/kaggle/input/used-cars-dataset/cars.csv')

Generally, the first step is to take a look at the dataset to get an idea of what the features are and what the data looks like.

In [None]:
df.sample(10)

## Exploratory Data Analysis - EDA

First, some easy checks to inspect the values

In [None]:
# Summary statistics of the numeric columns, one row per column, rendered as
# strings with two decimals.
# Each column is formatted with Series.map rather than DataFrame.applymap:
# applymap is deprecated since pandas 2.1, while Series.map works on both old
# and new pandas versions and produces the same table.
df.describe().T.apply(lambda column: column.map('{:.2f}'.format))

In [None]:
df.select_dtypes(include='object').describe().T

In [None]:
# Fraction of missing values per column (mean of the boolean null mask).
df.isnull().mean()

The most striking thing is undoubtedly the enormous dispersion in the price, our "main" variable. It seems to have some incorrectly loaded or erroneous values (especially due to that value of 1,000,000,000).

Let's check the tail of the distribution

In [None]:
# Print the upper percentiles of the price to see where the tail takes off.
for q in (95, 99, 99.3, 99.5, 99.8, 99.9):
    print(f"Percentil {q}: {np.percentile(df['price'], q=q)}")

Now, if we filter the dataset, we see

In [None]:
# Price at the 99.9th percentile; rows at or above it are excluded below.
upper_limit = np.percentile(df['price'], q=99.9)

# Summary statistics of the numeric columns for rows strictly below the limit,
# rendered as strings with two decimals.
# Series.map is applied per column instead of DataFrame.applymap, which is
# deprecated since pandas 2.1; the resulting table is the same.
(
    df[df['price'] < upper_limit]
    .describe()
    .T
    .apply(lambda column: column.map('{:.2f}'.format))
)

In [None]:
df[df['price']<upper_limit].select_dtypes(include='object').describe().T

We still see a very high dispersion, so we can plot to understand a little more. Even though we're only looking at the price, remember that it's our main variable.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Price cut-offs used to slice the distribution.
price_90, price_95, price_99_9 = (
    np.percentile(df['price'], q=q) for q in (90, 95, 99.9)
)

# Bulk of the distribution: prices strictly below the 95th percentile.
data_95 = df.loc[df['price'] < price_95, ['price']].reset_index(drop=True)

# Tail of the distribution: prices between the 90th and 99.9th percentiles (inclusive).
data_tail = (
    df.loc[df['price'].between(price_90, price_99_9), ['price']]
    .reset_index(drop=True)
)

fig, axes = plt.subplots(1, 2, figsize=(22, 8))

# One histogram per panel: (data, title).
panels = (
    (data_95, 'Price distribution - 95 percentile'),
    (data_tail, 'Price distribution - Tail'),
)
for panel_ax, (panel_data, panel_title) in zip(axes, panels):
    sns.histplot(data=panel_data, x='price', ax=panel_ax, bins=30)
    panel_ax.set_title(panel_title, fontsize=16)

plt.tight_layout()
plt.show()

So, we are going to work only with prices below 150,000.

In [None]:
# Keep only listings priced strictly below 150000 and renumber the rows from 0.
df = df[df['price'] < 150000].reset_index(drop=True)

In [None]:
# Pairwise correlations between all non-object columns.
corr = df.select_dtypes(exclude='object').corr()

# Boolean mask hiding the upper triangle (and diagonal), since the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Diverging palette so that positive and negative correlations get different hues.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

fig, ax = plt.subplots(figsize=(10, 8))

# Annotated heatmap of the lower triangle, drawn on the axes created above.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    ax=ax,
)
ax.set_title("Correlation Matrix")

plt.show()

Now, we can explore the categorical features. First, check the most frequent values of each categorical column.

In [None]:
# For every text column: share of rows covered by its 15 most frequent values,
# followed by the counts of those values.
for column in df.select_dtypes(include='object').columns:
    shares = df[column].value_counts(normalize=True)
    top_share = round(shares[:15].sum(), 4)
    print(f'Column: {column}, total in the first 15 values: {top_share}', '\n', '')
    print(df[column].value_counts()[:15])
    print('************', '\n')

## First Model

Transformation of some columns

In [None]:
df['drivetrain'].value_counts()

In [None]:
# Raw spellings found in the data, grouped by the canonical label they map to.
_drivetrain_spellings = {
    'Front Wheel Driver': ('Front-wheel Drive', 'FWD', 'Front-Wheel Drive'),
    'Four Wheel Driver': ('Four-wheel Drive', 'Four-Wheel Drive', 'Four Wheel Drive', '4WD'),
    'All Wheel Driver': ('All-wheel Drive', 'All-Wheel Drive', 'AWD'),
    'Rear Wheel Driver': ('Rear-wheel Drive', 'RWD'),
}

# Flat lookup: raw spelling -> canonical label.
dive_train_map = {
    spelling: label
    for label, spellings in _drivetrain_spellings.items()
    for spelling in spellings
}

# Preview the share of each canonical label; unmapped values are shown as 'Other'.
df['drivetrain'].map(dive_train_map).fillna(value='Other').value_counts(normalize=True)

In [None]:
# Replace raw drivetrain spellings with their canonical label.
# Values that are already canonical are kept as they are, so re-running this
# cell does not collapse every row into the fallback bucket (the canonical
# labels are not keys of dive_train_map, so a plain .map() would turn them
# into NaN on a second run).
# Anything unmapped or missing becomes 'Other', the same fallback label used
# for the other categorical columns and in the preview of this mapping.
canonical_drivetrains = set(dive_train_map.values())
already_canonical = df['drivetrain'].isin(canonical_drivetrains)
df['drivetrain'] = (
    df['drivetrain']
    .where(already_canonical, df['drivetrain'].map(dive_train_map))
    .fillna('Other')
)

In [None]:
cp_divetrain = df['drivetrain'].unique()

#### Categorical Features

In [None]:
cat_features = ['manufacturer','transmission','drivetrain','fuel_type']

Manufacturer

In [None]:
# Keep the 20 most frequent manufacturers; everything else is labelled 'Other'.
cp_manufacturer = df['manufacturer'].value_counts().head(20).index
is_rare_manufacturer = ~df['manufacturer'].isin(cp_manufacturer)
df['manufacturer'] = df['manufacturer'].mask(is_rare_manufacturer, 'Other')


Transmission

In [None]:
# Keep the 10 most frequent transmission descriptions; everything else is labelled 'Other'.
cp_transmission = df['transmission'].value_counts().head(10).index
is_rare_transmission = ~df['transmission'].isin(cp_transmission)
df['transmission'] = df['transmission'].mask(is_rare_transmission, 'Other')

Fuel Type

In [None]:
# Fuel types kept as their own category; any other value is labelled 'Other'.
cp_fuel_type = ['Gasoline', 'Hybrid', 'Diesel', 'E85 Flex Fuel', 'Electric']
is_unlisted_fuel = ~df['fuel_type'].isin(cp_fuel_type)
df['fuel_type'] = df['fuel_type'].mask(is_unlisted_fuel, 'Other')

In [None]:
# Show how many rows fall in each category after the collapsing steps.
for column in cat_features:
    counts = df[column].value_counts()
    print(column, '\n')
    print(counts)
    print('*****')

In [None]:
# Cap every categorical feature at its 15 most frequent values.
# The fallback label is 'Other', matching the per-column collapsing of
# manufacturer, transmission and fuel_type. The previous lowercase 'others'
# created a second catch-all category next to the existing 'Other' bucket.
for col in cat_features:
    top_categories = df[col].value_counts().index[:15]
    df[col] = df[col].where(df[col].isin(top_categories), 'Other')

In [None]:
# Share of rows covered by the most frequent values of each categorical
# feature, followed by the absolute counts of its 15 most frequent values.
# top_n drives both the slice and the message: the slice previously took 11
# values while the message claimed 10.
top_n = 10
for col in cat_features:
    top_share = round(df[col].value_counts(normalize=True)[:top_n].sum(), 4)
    print(f'Column: {col}, total in the first {top_n} values: {top_share}', '\n', '')
    print(df[col].value_counts()[:15])
    print('************', '\n')

In [None]:
# Share of rows covered by the most frequent values of each categorical
# feature, followed by the relative frequency of its 15 most frequent values.
# top_n drives both the slice and the message: the slice previously took 11
# values while the message claimed 10.
top_n = 10
for col in cat_features:
    top_share = round(df[col].value_counts(normalize=True)[:top_n].sum(), 4)
    print(f'Column: {col}, total in the first {top_n} values: {top_share}', '\n', '')
    print(df[col].value_counts(normalize=True)[:15])
    print('************', '\n')

#### Numerical Features

Miles per Gallon transform

In [None]:
def get_mpg(x):
    """Convert a raw miles-per-gallon entry into a single float.

    Accepted inputs:
      * a missing value (NaN)        -> NaN
      * a single number, as text or
        as a number ('25', '100',
        '25.0', 25.5)                -> that number as a float
      * a 'low-high' range ('20-30') -> the average of the first two parts

    The function is idempotent: values it has already converted are returned
    unchanged, so the cell that maps it over the column can be re-run safely.

    Raises:
        ValueError: if the text is neither a number nor a 'low-high' range.
    """
    text = str(x).strip()
    if text == 'nan':
        return np.nan

    # Plain numbers of any length. The previous `len(x) <= 2` check sent
    # values such as '100' or '25.0' down the range branch, where they failed
    # with IndexError.
    try:
        return float(text)
    except ValueError:
        pass

    parts = text.split('-')
    if len(parts) < 2:
        raise ValueError(f'Cannot parse mpg value: {x!r}')
    return (float(parts[0]) + float(parts[1])) / 2

In [None]:
df['mpg'] = df['mpg'].map(get_mpg)

In [None]:
num_features = ['year','mileage', 'mpg', 'driver_reviews_num', 'seller_rating', 'driver_rating']

#### Dummy variables

In [None]:
dum_variables = ['accidents_or_damage', 'one_owner','personal_use_only']

### Dataset filter

In [None]:
# Every column kept for modelling: flags, categoricals, numerics, and the target last.
all_features = [*dum_variables, *cat_features, *num_features, 'price']

In [None]:
all_features

In [None]:
# Fraction of missing values in each selected column.
df[all_features].isnull().mean()

In [None]:
data = df[all_features].copy()

In [None]:
data.head()

In [None]:
# Drop every row that has a missing value in any column, then renumber the rows from 0.
data = data.dropna().reset_index(drop=True)

In [None]:
# Cast the flag columns to integers in a single pass.
data = data.astype({flag: int for flag in dum_variables})

In [None]:
data.isnull().sum()

First, check hyperparameter tuning on a small sample of the data

In [None]:
# Random subsample used for the hyperparameter search.
# The size is capped at the number of available rows: DataFrame.sample raises
# when asked for more rows than exist (without replacement), which would
# happen if fewer than 40000 rows survive the cleaning steps.
sample_size = min(40000, len(data))
data_small = data.sample(n=sample_size, random_state=23).reset_index(drop=True)

In [None]:
data_small.head()

Imports

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler, StandardScaler, OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.preprocessing import FunctionTransformer
import xgboost

In [None]:
print(xgboost.__version__)

In [None]:
class DummyEncoder:
    """Pass-through transformer that returns its input unchanged.

    `fit` records the input's column labels in `columns_` so that
    `get_feature_names` can report them. Previously `columns_` was never set
    and `get_feature_names` always raised AttributeError.

    NOTE(review): the pipelines in this notebook use FunctionTransformer for
    the flag columns, so this class appears to be unused — confirm before
    relying on it.
    """

    def fit(self, X, y=None):
        """Remember the column labels of X and return self.

        For inputs without a `columns` attribute, positional indices are used
        for 2-D inputs and an empty list otherwise.
        """
        columns = getattr(X, 'columns', None)
        if columns is not None:
            self.columns_ = list(columns)
        elif np.ndim(X) == 2:
            self.columns_ = list(range(np.shape(X)[1]))
        else:
            self.columns_ = []
        return self

    def transform(self, X):
        """Return X as-is (the very same object, no copy)."""
        return X

    def get_feature_names(self):
        """Return the column labels recorded by `fit`.

        Raises:
            AttributeError: if `fit` has not been called yet.
        """
        return self.columns_

In [None]:
# Categorical columns: one-hot encode; handle_unknown='ignore' is set so that
# categories not seen during fit do not raise at transform time.
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  
])

# Numerical columns: scale with RobustScaler.
num_pipeline = Pipeline([
    ('scaler', RobustScaler())  
])

# Flag columns: FunctionTransformer with no function given, used as a
# pass-through step (hence the step name 'identity').
dum_pipeline = Pipeline([
    ('identity', FunctionTransformer()) 
])

In [None]:
# Route each group of columns to its own sub-pipeline.
# No `remainder` is given; presumably the scikit-learn default ('drop') applies,
# so columns not listed in any group are discarded — confirm against the
# installed version.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
        ('dum', dum_pipeline, dum_variables)
    ])

In [None]:
# Full model used for the hyperparameter search: preprocessing followed by an
# XGBoost regressor. The step name 'regressor' is what the 'regressor__*'
# keys of param_grid refer to.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror'))  # squared-error regression objective
])

In [None]:
pipeline

In [None]:
# Search space for the 'regressor' step of the pipeline:
# 3 values for each of 5 hyperparameters = 243 combinations.
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 5, 7],
    'regressor__colsample_bytree': [0.5, 0.7, 1.0],
    'regressor__gamma': [0, 0.1, 0.2]
}

In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation on 4 parallel jobs.
# No `scoring` is passed, so candidates are presumably ranked by the pipeline's
# own `score` method (R^2 for scikit-learn regressors) — confirm.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, verbose=2, n_jobs=4)

#### Split the data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix and target for the hyperparameter search on the subsample.
# all_features also contains the target 'price'; it is excluded here so the
# target never travels inside X. Previously this was only harmless because
# the ColumnTransformer picks its columns by name and ignores the rest.
feature_columns = [col for col in all_features if col != 'price']
X = data_small[feature_columns].copy()
y = data_small['price'].copy()

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Run the search on the training split of the subsample.
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)

# Score the best estimator found on each split, in turn.
splits = {"Train": (X_train, y_train), "Test": (X_test, y_test)}
for split_name, (features, target) in splits.items():
    print(f"{split_name} score:", grid_search.score(features, target))

In [None]:
import pickle

Load model for the whole dataset

In [None]:
data.info()

In [None]:
# Feature matrix and target built from the whole cleaned dataset.
# all_features also contains the target 'price'; it is excluded here so the
# target never travels inside X. Previously this was only harmless because
# the ColumnTransformer picks its columns by name and ignores the rest.
feature_columns = [col for col in all_features if col != 'price']
X = data[feature_columns].copy()
y = data['price'].copy()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Final model: the same preprocessor object as before, followed by an XGBoost
# regressor with fixed hyperparameters.
# NOTE(review): these values are presumably the best parameters reported by the
# grid search on the subsample — confirm against its printed output.
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror',colsample_bytree= 0.5, gamma = 0, 
                      learning_rate = 0.2, max_depth= 7, n_estimators =300))  # fixed hyperparameters
])


In [None]:
# Optionally load a model saved by a previous session.
# This notebook never writes xgb_model_v1.pkl itself, so on a fresh kernel the
# file does not exist; the load is guarded so that "Restart & Run All" does
# not stop here with FileNotFoundError.
model_v1_path = '/kaggle/working/xgb_model_v1.pkl'
if os.path.exists(model_v1_path):
    # WARNING: pickle.load can execute arbitrary code; only load files that
    # you created yourself.
    with open(model_v1_path, 'rb') as file:
        model_v1 = pickle.load(file)
else:
    model_v1 = None
    print(f'No previous model found at {model_v1_path}; skipping load.')

In [None]:
pipeline_xgb.fit(X_train, y_train)
#pipeline_xgb.fit(X_train, y)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict prices for the held-out test features
y_pred = pipeline_xgb.predict(X_test)

# Compute the regression metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the metrics, one per line
metric_report = (
    ('Mean Squared Error (MSE):', mse),
    ('Root Mean Squared Error (RMSE):', rmse),
    ('Mean Absolute Error (MAE):', mae),
    ('R^2:', r2),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)


In [None]:
mae/data['price'].mean()

In [None]:
import pickle

# Save the fitted pipeline (preprocessing + regressor) to a file
with open('/kaggle/working/xgb_model.pkl', 'wb') as file:
    pickle.dump(pipeline_xgb, file)

## Variables for app

Categorical Features

In [None]:
{k:data[k].unique() for k in cat_features}

Numerical Features

In [None]:
{k:(data[k].min(), data[k].max()) for k in num_features}

Dummy Variables

In [None]:
{k:data[k].unique() for k in dum_variables}

In [None]:
dum_variables

In [None]:
import xgboost

In [None]:
print(xgboost.__version__)

In [None]:
import sklearn

print(sklearn.__version__)

In [None]:
print(pd.__version__)

In [None]:
!python --version

## Neural Network

In [None]:
data.head()

In [None]:
# Bin the model year into (up to) 10 quantile-based intervals.
# duplicates='drop' merges repeated bin edges: when many rows share the same
# year, several decile boundaries can coincide, and pd.qcut otherwise raises
# "Bin edges must be unique". When all edges are distinct the result is unchanged.
pd.qcut(data['year'], q=10, duplicates='drop')