In [11]:
import warnings
warnings.filterwarnings('ignore')


In [12]:
# Importación librerías
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [13]:
# Carga de datos de archivo .csv
dataTraining = pd.read_csv('https://raw.githubusercontent.com/davidzarruk/MIAD_ML_NLP_2023/main/datasets/dataTrain_carListings.zip')
dataTesting = pd.read_csv('https://raw.githubusercontent.com/davidzarruk/MIAD_ML_NLP_2023/main/datasets/dataTest_carListings.zip', index_col=0)

In [14]:
dataTraining.count()


Price      400000
Year       400000
Mileage    400000
State      400000
Make       400000
Model      400000
dtype: int64

In [15]:
df = dataTraining

# Separamos la variable objetivo (Price) y las características
X = df.drop('Price', axis=1)
y = df['Price']

# Dividimos el conjunto de datos en entrenamiento y prueba (80% entrenamiento, 20% prueba)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Definimos qué columnas son numéricas y cuáles son categóricas
numeric_features = ['Year', 'Mileage']
categorical_features = ['State', 'Make', 'Model']

# Creamos transformadores para escalar variables numéricas y codificar variables categóricas
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Aplicamos las transformaciones a las columnas correspondientes utilizando ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Ajustamos y transformamos el conjunto de entrenamiento
X_train_transformed = preprocessor.fit_transform(X_train)

# Transformamos el conjunto de prueba (no ajustamos para evitar filtración de datos)
X_test_transformed = preprocessor.transform(X_test)

In [16]:
X_Testing_transformed = preprocessor.transform(dataTesting)

In [24]:
# XGBoost
import xgboost as xgb

xgbr = xgb.XGBRegressor(n_estimators=500, learning_rate=0.1, max_depth=100, random_state=42)
xgbr.fit(X_train_transformed, y_train)
y_pred = xgbr.predict(X_test_transformed)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE para XGBoost:", rmse)

RMSE para XGBoost: 4077.876763392094


In [25]:
y_pred_xgbr = xgbr.predict(X_Testing_transformed)
submission_a = pd.DataFrame(y_pred_xgbr)
submission_a.to_csv('test_submission_xgbr.csv', index_label='ID')
submission_a.head()

Unnamed: 0,0
0,22177.511719
1,36017.972656
2,24499.138672
3,8778.37207
4,29882.189453


In [26]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor


# crear un modelo base utilizando un árbol de decisión
base_model = DecisionTreeRegressor()

# crear el modelo BaggingRegressor y entrenarlo
bagging_model = BaggingRegressor(base_estimator=base_model, n_estimators=10, random_state=42)
bagging_model.fit(X_train_transformed, y_train)
y_pred = bagging_model.predict(X_test_transformed)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE para BaggingRegressor:", rmse)


RMSE para BaggingRegressor: 3885.0825239775054


In [27]:
bagging = BaggingRegressor()
bagging.fit(X_train_transformed, y_train)

In [28]:
y_pred = bagging.predict(X_test_transformed)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE para Bagging:", rmse)

RMSE para Bagging: 3868.7472461537923


In [29]:
y_pred_bagging = bagging.predict(X_Testing_transformed)
submission_a = pd.DataFrame(y_pred_bagging)
submission_a.to_csv('test_submission_y_pred_bagging.csv', index_label='ID')
submission_a.head()

Unnamed: 0,0
0,21342.0
1,35541.4
2,24879.4
3,8230.9
4,29942.4
