## Construcción del modelo

Apalancados en la ingeniería de features, creamos un conjunto de clases a utilizar en la definición de pipelines, que nos permitan reproducir y modificar con facilidad los pasos de preprocesamiento, previos al entrenamiento de un modelo: 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import sys
sys.path.append('src')
from feature_engineering.name_splitter import NameSplitter
from feature_engineering.owner_mapper import OwnerMapper
from feature_engineering.seat_rounder import SeatRounder
from feature_engineering.max_power_converter import MaxPowerConverter
from feature_engineering.engine_converter import EngineConverter
from feature_engineering.multiple_interative_imputer import MultipleIterativeImputer
from feature_engineering.multiple_one_hot_encoder import MultipleOneHotEncoder
from feature_engineering.torque_standardizer import TorqueStandardizer
from feature_engineering.mileage_converter import MileageConverter
from feature_engineering.model_dropper import ModelDropper

In [3]:
# Columns handed to MultipleOneHotEncoder.
# NOTE(review): 'make' is not read from the CSV here; presumably it is
# produced by NameSplitter -- confirm.
categorical_cols = ['fuel', 'seller_type', 'transmission', 'make']

# Stage 1: the project's column-level transformers, applied in this order.
preprocess_pipeline = Pipeline([
    ('name_spliter', NameSplitter()),
    ('mileage_converter', MileageConverter()),
    ('engine_converter', EngineConverter()),
    ('max_power_converter', MaxPowerConverter()),
    ('torque_standardizer', TorqueStandardizer()),
    ('map_owner', OwnerMapper()),
    ('model_dropper', ModelDropper()),
])

# Stage 2: stage 1 followed by one-hot encoding of the categorical columns.
preprocess_pipeline_with_one_hot_encoder = Pipeline([
    ('preprocess_pipeline', preprocess_pipeline),
    ('multiple_one_hot_encoder', MultipleOneHotEncoder(categorical_cols=categorical_cols)),
])

# Passed to the imputer as `columns_to_drop`; empty for now.
columns_to_drop_for_imputation = []

# Stage 3: imputation followed by seat rounding.
full_pipeline = Pipeline([
    ('multiple_iterative_imputer', MultipleIterativeImputer(columns_to_drop=columns_to_drop_for_imputation)),
    ('round_seats', SeatRounder()),
])

# Stages 1-3 chained together.
final_pipeline = Pipeline([
    ('preprocess_pipeline', preprocess_pipeline_with_one_hot_encoder),
    ('full_pipeline', full_pipeline),
])

# Complete preprocessing, ending with feature standardization.
final_pipeline_with_scaler = Pipeline([
    ('final_pipeline', final_pipeline),
    ('scaler', StandardScaler()),
])

In [50]:
# Read the raw listings and add a log-transformed copy of the target.
data = pd.read_csv('../datasets/Car details v3.csv')
data["selling_price_log"] = np.log(data["selling_price"])

# Features: everything except the two versions of the target.
X = data.drop(columns=['selling_price', 'selling_price_log'])
y = data['selling_price']
y_log = data['selling_price_log']

# Both splits use the same test_size and random_state, so they select the
# same rows; only the target differs (raw price vs. log price).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.3, random_state=42
)

In [61]:
# Fit the preprocessing on the training split only, then apply it to the test split.
X_train_processed = final_pipeline_with_scaler.fit_transform(X_train)
X_test_processed = final_pipeline_with_scaler.transform(X_test)

# Independent copy of the same preprocessing stack for the log-target
# experiment, built from fresh transformer instances so it is fitted separately.
preprocess_pipeline_log = Pipeline([
    ('name_spliter', NameSplitter()),
    ('mileage_converter', MileageConverter()),
    ('engine_converter', EngineConverter()),
    ('max_power_converter', MaxPowerConverter()),
    ('torque_standardizer', TorqueStandardizer()),
    ('map_owner', OwnerMapper()),
    ('model_dropper', ModelDropper()),
])

# Column-level transformers followed by one-hot encoding.
preprocess_pipeline_with_one_hot_encoder_log = Pipeline([
    ('preprocess_pipeline', preprocess_pipeline_log),
    ('multiple_one_hot_encoder', MultipleOneHotEncoder(categorical_cols=categorical_cols)),
])

# Imputation followed by seat rounding.
full_pipeline_log = Pipeline([
    ('multiple_iterative_imputer', MultipleIterativeImputer(columns_to_drop=columns_to_drop_for_imputation)),
    ('round_seats', SeatRounder()),
])

# All preprocessing stages chained together.
final_pipeline_log = Pipeline([
    ('preprocess_pipeline', preprocess_pipeline_with_one_hot_encoder_log),
    ('full_pipeline', full_pipeline_log),
])

# Complete preprocessing, ending with feature standardization.
final_pipeline_with_scaler_log = Pipeline([
    ('final_pipeline', final_pipeline_log),
    ('scaler', StandardScaler()),
])

# Same fit-on-train / transform-on-test protocol for the log experiment.
X_train_processed_log = final_pipeline_with_scaler_log.fit_transform(X_train_log)
X_test_processed_log = final_pipeline_with_scaler_log.transform(X_test_log)



Probemos un Ridge como primer modelo simple. Usaremos una búsqueda en grilla (grid search) para ajustar el hiperparámetro `alpha`:

In [62]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error


from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [63]:
# Candidate regularization strengths: 1000 evenly spaced values in [0, 20].
# NOTE(review): the grid includes alpha = 0, which the Ridge documentation
# advises against for numerical reasons -- consider a strictly positive start.
param_grid = {"alpha": np.linspace(0, 20, 1000)}

# Grid search on the raw-price target (5-fold CV, scored by negative MAE).
ridges = Ridge()
grid_original = GridSearchCV(
    estimator=ridges,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
)
grid_original.fit(X_train_processed, y_train)
ridge_original = grid_original.best_estimator_

# Same search on the log-price target; here the MAE is measured in log units.
ridges_log = Ridge()
grid_log = GridSearchCV(
    estimator=ridges_log,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
)
grid_log.fit(X_train_processed_log, y_train_log)
ridge_log = grid_log.best_estimator_

In [64]:
# Test-set predictions from the Ridge model trained on the raw price.
y_pred_original = ridge_original.predict(X_test_processed)

# Error metrics on the original price scale.
r2_original = r2_score(y_test, y_pred_original)
mape_original = mean_absolute_percentage_error(y_test, y_pred_original)
rmse_original = np.sqrt(mean_squared_error(y_test, y_pred_original))
mae_original = mean_absolute_error(y_test, y_pred_original)

# Report, one metric per line.
print(
    "Resultados sin logaritmo:",
    f"MAE de testeo fue: {mae_original}",
    f"RMSE de testeo fue: {rmse_original}",
    f"MAPE de testeo fue: {mape_original}",
    f"R2 de testeo fue: {r2_original}",
    sep="\n",
)

Resultados sin logaritmo:
MAE de testeo fue: 163006.4090899027
RMSE de testeo fue: 336923.71578199096
MAPE de testeo fue: 0.4607069575470324
R2 de testeo fue: 0.8352097251957769


In [65]:
# The log model predicts log(price).
y_pred_log = ridge_log.predict(X_test_processed_log)

# Undo the log on both the predictions and the test target so the metrics
# are on the original price scale and comparable with the raw-price model.
y_test_inv = np.exp(y_test_log)
y_pred_log_inv = np.exp(y_pred_log)

# Error metrics on the original price scale.
r2_log = r2_score(y_test_inv, y_pred_log_inv)
mape_log = mean_absolute_percentage_error(y_test_inv, y_pred_log_inv)
rmse_log = np.sqrt(mean_squared_error(y_test_inv, y_pred_log_inv))
mae_log = mean_absolute_error(y_test_inv, y_pred_log_inv)

# Report, one metric per line (the header starts with a blank line).
print(
    "\nResultados con logaritmo:",
    f"MAE de testeo fue: {mae_log}",
    f"RMSE de testeo fue: {rmse_log}",
    f"MAPE de testeo fue: {mape_log}",
    f"R2 de testeo fue: {r2_log}",
    sep="\n",
)


Resultados con logaritmo:
MAE de testeo fue: 111450.01825608018
RMSE de testeo fue: 262317.13546574564
MAPE de testeo fue: 0.19627356954087408
R2 de testeo fue: 0.9001100635393217


In [59]:
# Show floats with three decimals. This is a global pandas display option,
# so it affects every frame rendered after this cell.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Side-by-side comparison of targets and predictions for both experiments.
results_df = pd.DataFrame(data={
    # Raw-price model.
    'y_original': y_test,
    'y_pred_original': y_pred_original,
    # Log-price model, in log units.
    'y_test_log': y_test_log,
    'y_pred_log': y_pred_log,
    # Log-price model, mapped back to the price scale.
    'y_test_inv': y_test_inv,
    'y_pred_log_inv': y_pred_log_inv,
})

In [60]:
# Display the comparison table of true vs. predicted prices for both models.
results_df

Unnamed: 0,y_original,y_pred_original,y_test_log,y_pred_log,y_test_inv,y_pred_log_inv
1971,198000,-2094.704,12.196,11.748,198000.000,126507.310
4664,500000,656347.654,13.122,13.214,500000.000,547758.193
5448,425000,594697.150,12.960,13.206,425000.000,543632.482
3333,150000,62545.011,11.918,12.043,150000.000,169963.949
2316,525000,440717.910,13.171,12.804,525000.000,363494.173
...,...,...,...,...,...,...
462,600000,463286.732,13.305,12.873,600000.000,389583.806
1956,400000,681960.489,12.899,13.009,400000.000,446243.086
3782,500000,601750.901,13.122,13.261,500000.000,574296.616
799,400000,405766.891,12.899,12.983,400000.000,434997.347


In [34]:
# Inspect the raw target series (selling_price).
y

0       450000
1       370000
2       158000
3       225000
4       130000
         ...  
8123    320000
8124    135000
8125    382000
8126    290000
8127    290000
Name: selling_price, Length: 8128, dtype: int64

In [35]:
# Inspect the log-transformed target series (selling_price_log).
y_log

0       13.017003
1       12.821258
2       11.970350
3       12.323856
4       11.775290
          ...    
8123    12.676076
8124    11.813030
8125    12.853176
8126    12.577636
8127    12.577636
Name: selling_price_log, Length: 8128, dtype: float64

In [66]:
# NOTE(review): this cell rebinds `preprocess_pipeline`, `full_pipeline` and
# `final_pipeline`, which were defined in an earlier cell. Unlike the earlier
# version, this `final_pipeline` has no one-hot encoding step -- confirm that
# is intended. Pipelines built earlier keep their original step objects.

# Column-level transformers, applied in this order.
preprocess_pipeline = Pipeline([
    ('name_spliter', NameSplitter()),
    ('mileage_converter', MileageConverter()),
    ('engine_converter', EngineConverter()),
    ('max_power_converter', MaxPowerConverter()),
    ('torque_standardizer', TorqueStandardizer()),
    ('map_owner', OwnerMapper()),
    ('model_dropper', ModelDropper()),
])

# Imputation followed by seat rounding.
full_pipeline = Pipeline([
    ('multiple_iterative_imputer', MultipleIterativeImputer(columns_to_drop=columns_to_drop_for_imputation)),
    ('round_seats', SeatRounder()),
])

# Both stages chained together (no encoder, no scaler).
final_pipeline = Pipeline([
    ('preprocess_pipeline', preprocess_pipeline),
    ('full_pipeline', full_pipeline),
])

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the preprocessed features.
regresion_lineal = LinearRegression()

# X_train / X_test are the raw frames and still contain text columns, so
# fitting a StandardScaler on them directly raises. Reuse the features that
# final_pipeline_with_scaler already converted, encoded, imputed and scaled.
# `sc_X` is kept as a handle to that pipeline's fitted scaler step.
sc_X = final_pipeline_with_scaler.named_steps['scaler']
X_train_scaled = X_train_processed
X_test_scaled = X_test_processed

regresion_lineal.fit(X_train_scaled, y_train)

y_pred_lineal = regresion_lineal.predict(X_test_scaled)

# Mean absolute error on the held-out test split, in price units.
print(f"El error de testeo fue: {mean_absolute_error(y_test, y_pred_lineal)}")

- Ridge
- Arbol regresión
- SVR
- Boost (hay 2)
- Random Forest