In [4]:
# Importación librerías
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, PolynomialFeatures
from sklearn.metrics import make_scorer
from sklearn.impute import SimpleImputer
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split, RandomizedSearchCV,cross_val_score
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
# Carga de datos de archivo .csv
dataTraining = pd.read_csv('https://raw.githubusercontent.com/davidzarruk/MIAD_ML_NLP_2023/main/datasets/dataTrain_carListings.zip')
dataTesting = pd.read_csv('https://raw.githubusercontent.com/davidzarruk/MIAD_ML_NLP_2023/main/datasets/dataTest_carListings.zip', index_col=0)
duplicados = dataTraining.duplicated().sum()
df_train2 = dataTraining.drop_duplicates()
Q1 = df_train2['Price'].quantile(0.25)
Q3 = df_train2['Price'].quantile(0.75)
IQR = Q3 - Q1
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR

# Identificar valores atípicos del precio y quitandolos
valores_atipicos = df_train2[(df_train2['Price'] < limite_inferior) | (df_train2['Price'] > limite_superior)]
df_train3 = df_train2[(df_train2['Price'] >= limite_inferior) & (df_train2['Price'] <= limite_superior)]

#Identificando valores atipicos de Mileage y quitandolos 
df_train3 = df_train3[df_train3['Mileage'] <= 1500000]
print(df_train2.shape)
print(df_train3.shape)

# Supongamos que tienes tus datos en un DataFrame llamado df_train3

# División de datos en características y variable objetivo
X = df_train3.drop('Price', axis=1)
y = df_train3['Price']

#Ingenieria de caracteristicas
current_year = datetime.now().year
X['Car_Age'] = current_year - X['Year']
X['Mileage_Year'] = X['Year'] / X['Mileage']
X['Brand_Model'] = X['Make'] + '_' + X['Model']

# Listas de características numéricas y categóricas
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Definición de transformadores para características numéricas y categóricas
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# ColumnTransformer para aplicar transformadores a diferentes tipos de características
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Pipeline que incluye el preprocesamiento y el modelo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)])

# Aplicar el pipeline a los datos
X_preprocessed = pipeline.fit_transform(X)

# Division de los datos
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.33, random_state=42)
#MeTRICA
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

(399831, 6)
(388972, 6)


In [11]:
#XGBOOST
xgboost_model = XGBRegressor(random_state=12345,subsample= 0.95, reg_lambda= 1.5, n_estimators= 500, max_depth= 7, learning_rate= 0.1, gamma= 0.3, colsample_bytree=0.8)
xgboost_model.fit(X_train, y_train)
y_pred_xgb = xgboost_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred_xgb)
print(rmse)

2998.231894311593


In [75]:
#XGBOOST CON SELECCION DE CARACTERISTICAS IMPORTANTES, falta hacer el modelo
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

# Entrenar un modelo de Random Forest
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
# Obtener la importancia de las características
feature_importances = rf_model.feature_importances_
# Crear un selector de características basado en la importancia de las características
selector = SelectFromModel(rf_model, threshold='median')
# Aplicar el selector a los datos de entrenamiento
X_train_selected = selector.fit_transform(X_train, y_train)
# Aplicar el mismo selector a los datos de prueba
X_test_selected = selector.transform(X_test)
# Obtener las características seleccionadas
selected_features = X.columns[selector.get_support()]
# Imprimir las características seleccionadas
print("Características seleccionadas:", selected_features)

In [8]:
#BUSQUEDA DE HIPERPARAMETROS DE XGBOOST
# Definir el modelo
xgboost_model = XGBRegressor(random_state=12345)

# Definir la cuadrado del error medio como métrica para optimizar
scoring = make_scorer(mean_squared_error, squared=False)

# Definir el espacio de búsqueda de hiperparámetros
param_distributions = {
    'n_estimators': [400, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1,0.15, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9, 0.95],
    'reg_lambda': [1, 1.5, 2],
    'gamma': [0, 0.1, 0.3],
    'colsample_bytree':[0.7,0.8]
}

# Realizar la búsqueda aleatoria de hiperparámetros
random_search = RandomizedSearchCV(xgboost_model, param_distributions, n_iter=50, cv=5, scoring=scoring, random_state=42)
random_search.fit(X_train, y_train)

# Obtener el mejor modelo y sus hiperparámetros
best_xgb_model = random_search.best_estimator_
best_params = random_search.best_params_

# Imprimir los mejores parámetros
print("Mejores parámetros encontrados:", best_params)

# Evaluar el modelo en el conjunto de prueba
y_pred_best = best_xgb_model.predict(X_test)
rmse_best = root_mean_squared_error(y_test, y_pred_best)
print("RMSE en el conjunto de prueba con los mejores parámetros:", rmse_best)

Mejores parámetros encontrados: {'subsample': 0.7, 'reg_lambda': 2, 'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.7}
RMSE en el conjunto de prueba con los mejores parámetros: 3818.467429996832


In [13]:
#data de competencia
dataTesting['Car_Age'] = current_year - dataTesting['Year']
dataTesting['Mileage_Year'] = dataTesting['Year'] / dataTesting['Mileage']
dataTesting['Brand_Model'] = dataTesting['Make'] + '_' + dataTesting['Model']

X_testcru = pipeline.transform(dataTesting)
y_predmodelo1 = xgboost_model.predict(X_testcru)
print(y_predmodelo1[:5]) 
df = pd.DataFrame(y_predmodelo1) 
# Rename the default column (likely '0') to 'Price'
df = df.rename(columns={0: 'Price'}) 
# Save to CSV 
df.to_csv(r"C:\Users\1234\Downloads\MIAD_NLP\proyecto\modeloxgboost8.csv", index_label='ID') 
#4144 en kaggle

[20987.104 36943.902 15242.517  8612.789 30949.766]
