## Construcción del modelo

Apalancados en la ingeniería de features, creamos un conjunto de clases a utilizar en la definición de pipelines, que nos permitan reproducir y modificar con facilidad los pasos de preprocesamiento, previos al entrenamiento de un modelo: 

In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import sys
sys.path.append('src')
from feature_engineering.name_splitter import NameSplitter
from feature_engineering.owner_mapper import OwnerMapper
from feature_engineering.seat_rounder import SeatRounder
from feature_engineering.max_power_converter import MaxPowerConverter
from feature_engineering.engine_converter import EngineConverter
from feature_engineering.multiple_interative_imputer import MultipleIterativeImputer
from feature_engineering.multiple_one_hot_encoder import MultipleOneHotEncoder
from feature_engineering.torque_standardizer import TorqueStandardizer
from feature_engineering.mileage_converter import MileageConverter

In [82]:
categorical_cols = ['fuel', 'seller_type', 'transmission', 'make']

# Definir pipelines para preprocesamiento
preprocess_pipeline = Pipeline(steps=[
    ('name_spliter', NameSplitter()),
    ('mileage_converter', MileageConverter()),
    ('engine_converter', EngineConverter()),
    ('max_power_converter', MaxPowerConverter()),
    ('torque_standardizer', TorqueStandardizer()),
    ('map_owner', OwnerMapper()),
    ('multiple_one_hot_encoder', MultipleOneHotEncoder(categorical_cols=categorical_cols)),
])

columns_to_drop_for_imputation = []

full_pipeline = Pipeline(steps=[
    ('multiple_iterative_imputer', MultipleIterativeImputer(columns_to_drop=columns_to_drop_for_imputation)),
    ('round_seats', SeatRounder()),
])

final_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('full_pipeline', full_pipeline),
])

final_pipeline_with_scaler = Pipeline(steps=[
    ('final_pipeline', final_pipeline),
    ('scaler', StandardScaler())
])

In [83]:
# Cargar y dividir los datos
data = pd.read_csv('../datasets/Car details v3.csv')
X = data.drop('selling_price', axis=1)
y = data['selling_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [84]:
# Ajustar y transformar los datos
X_train_processed = final_pipeline.fit_transform(X_train)

In [85]:
missing_values = X_train_processed.isnull().sum()

for column, missing in missing_values.items():
    print(f"Feature: {column} - Missing values: {missing}")

Feature: year - Missing values: 0
Feature: km_driven - Missing values: 0
Feature: owner - Missing values: 0
Feature: seats - Missing values: 0
Feature: mileage_kmpl - Missing values: 0
Feature: engine_cc - Missing values: 0
Feature: max_power_bhp - Missing values: 0
Feature: torque_peak_power - Missing values: 0
Feature: torque_peak_speed - Missing values: 0
Feature: fuel_Diesel - Missing values: 0
Feature: fuel_LPG - Missing values: 0
Feature: fuel_Petrol - Missing values: 0
Feature: seller_type_Individual - Missing values: 0
Feature: seller_type_Trustmark Dealer - Missing values: 0
Feature: transmission_Manual - Missing values: 0
Feature: make_Audi - Missing values: 0
Feature: make_BMW - Missing values: 0
Feature: make_Chevrolet - Missing values: 0
Feature: make_Daewoo - Missing values: 0
Feature: make_Datsun - Missing values: 0
Feature: make_Fiat - Missing values: 0
Feature: make_Force - Missing values: 0
Feature: make_Ford - Missing values: 0
Feature: make_Honda - Missing values: 0

In [86]:
# Ajustar y transformar los datos
X_train_processed = final_pipeline.fit_transform(X_train)

# Mostrar las primeras filas
print("Training data processed:")
X_train_processed

Training data processed:


Unnamed: 0,year,km_driven,owner,seats,mileage_kmpl,engine_cc,max_power_bhp,torque_peak_power,torque_peak_speed,fuel_Diesel,...,make_Mercedes-Benz,make_Mitsubishi,make_Nissan,make_Opel,make_Renault,make_Skoda,make_Tata,make_Toyota,make_Volkswagen,make_Volvo
6783,2006,120000,3,5,19.70,796.0,46.30,62.0,3000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1073,2018,100000,1,7,12.90,2755.0,174.50,450.0,2400.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7756,2017,39000,1,5,19.59,1995.0,187.74,380.0,2750.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144,2013,39000,1,5,14.28,1798.0,138.03,173.0,4000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6424,2013,70000,2,5,23.40,1248.0,74.00,190.0,2000.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,2009,120000,1,7,12.05,2179.0,120.00,290.0,2800.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5390,2014,80000,2,5,23.40,1248.0,74.00,190.0,2000.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,2016,35000,1,5,18.60,1197.0,81.83,114.7,4000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7603,2019,27000,1,5,28.40,1248.0,74.02,190.0,2000.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
X_test_processed = final_pipeline.transform(X_test)

# Mostrar las primeras filas de X_test
print("Test data processed head:")
X_test_processed

Test data processed head:




Unnamed: 0,year,km_driven,owner,seats,mileage_kmpl,engine_cc,max_power_bhp,torque_peak_power,torque_peak_speed,fuel_Diesel,...,make_Mercedes-Benz,make_Mitsubishi,make_Nissan,make_Opel,make_Renault,make_Skoda,make_Tata,make_Toyota,make_Volkswagen,make_Volvo
1971,2004,110000,3,5,12.800000,1493.000000,100.000000,128.511000,4600.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4664,2014,291977,1,7,14.000000,2179.000000,138.100000,320.000000,2700.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5448,2016,70000,1,5,23.200000,1248.000000,73.940000,190.000000,2000.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3333,2006,120000,2,5,16.900000,1497.000000,100.000000,132.435000,4800.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2316,2013,69000,2,5,22.900000,1248.000000,74.000000,190.000000,2000.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,2015,35000,1,5,18.600000,1197.000000,81.830000,114.700000,4000.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1956,2011,90000,3,6,17.317134,1907.105199,94.520595,190.366291,2652.839585,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3782,2017,20000,1,8,11.960000,1298.000000,80.000000,103.000000,4500.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
799,2018,60000,1,5,23.010000,999.000000,67.000000,91.000000,4250.000000,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [88]:
from sklearn.metrics import (mean_absolute_error, r2_score,
                             root_mean_squared_error, 
                             mean_absolute_percentage_error)


from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [89]:
ridges = Ridge()

grid = GridSearchCV(ridges,
                    {"alpha": np.linspace(0, 20, 1000)},
                    refit=True,
                    cv=5,
                    scoring='neg_mean_absolute_error')
grid.fit(X_train_processed, y_train)

In [90]:
ridge = grid.best_estimator_

In [91]:
y_pred = ridge.predict(X_test_processed)

mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE de testeo fue: {mae}")
print(f"RMSE de testeo fue: {rmse}")
print(f"MAPE de testeo fue: {mape}")
print(f"R2 de testeo fue: {r2}")

MAE de testeo fue: 163620.34962967262
RMSE de testeo fue: 337115.4783887698
MAPE de testeo fue: 0.46563888378934476
R2 de testeo fue: 0.8350220886237696


In [92]:
print(f"Mejor modelo: {ridge}")

Mejor modelo: Ridge(alpha=0.02002002002002002)


- Ridge
- Arbol regresión
- SVR
- Boost (hay 2)
- Random Forest