In [32]:
# importamos las librerias comunes
import pandas as pd
import numpy as np

# importamos las librerias de visualizacion que usaremos mas adelante 
import matplotlib.pyplot as plt
import seaborn as sns

# importamos las librerias que necesitaremos para preprocesado de nuestro modelo 

from sklearn.preprocessing import OrdinalEncoder,StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


# importamos las librerias para los diferentes modelos

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR 
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost

In [33]:
# leemos el dataset, lo asignamos a la variable df 
df = pd.read_csv('C:/Users/desardi131/Desktop/desardi131/The_Bridge_Data_Science/Python/Proyectos/ML/ML_Precio_Coches/src/dataset/coches_fil.csv', encoding='latin-1')


In [34]:
# miramos que columnas tiene el dataset

df.columns

# vamos a retirar las columnas innecesarias para el calculo del precio como son 'id', 'url', 'image_url', 'VIN','descrition', 'size', 'lat' y 'long'
# tambien retiramos las columna region ya que nos da la misma informacion que state pero teniendo muchas mas categorias unicas 

df = df.drop(['id', 'url', 'image_url', 'VIN', 'lat', 'long', 'description', 'size', 'region'],axis=1)


In [35]:
df.dtypes

price             int64
year              int64
manufacturer     object
model            object
condition        object
cylinders        object
fuel             object
odometer        float64
title_status     object
transmission     object
drive            object
type             object
paint_color      object
state            object
reliability     float64
dtype: object

In [36]:
# Miramos la informaciion del dataset

#df.info()

# Vemos que 'drive' y 'paint_color' tienen una gran cantidad de valores nulos, asi que vamos a comprobar el ratio para decidir que hacer

ratio = (len(df.drive[df['drive'].isnull()== True])/len(df.drive))*100
print('Porcentaje de valores nulos:', round(ratio,2),'%')

'''
Al ser un porcentaje alrededor del 10% convertiremos los valores nulos,
como se trata de atributos categóricos sustituiremos los valores nulos por la moda

'''
# sustituyendo los valores en columna 'drive'
mode = df.loc[:,'drive'].mode()

df['drive'].fillna(mode[0],inplace=True)

# sustituyendo los valores en columna 'paint_color'
mode2 = df.loc[:,'paint_color'].mode()

df['paint_color'].fillna(mode2[0],inplace=True)






Porcentaje de valores nulos: 10.28 %


In [37]:
# revisamos los valores nulos que nos quedan, comprobamos si existen dubplicados y eliminamos
# los valores nulos restantes ya que no tendria sentido sustuir por la moda en la columna de modelos y tenemos
#  suficientes datos aun eliminando los nulos restantes
df.drop_duplicates(inplace=True)

df = df[~df['model'].isnull()==True]
df= df[~df['transmission'].isnull()==True]

# retiramos los vehiculos mal clasificados de numero de cilindros para retirar outliers
df.loc[:,'cylinders'].drop([1479,6716,9603], axis=0,inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12188 entries, 0 to 15426
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         12188 non-null  int64  
 1   year          12188 non-null  int64  
 2   manufacturer  12188 non-null  object 
 3   model         12188 non-null  object 
 4   condition     12188 non-null  object 
 5   cylinders     12188 non-null  object 
 6   fuel          12188 non-null  object 
 7   odometer      12188 non-null  float64
 8   title_status  12188 non-null  object 
 9   transmission  12188 non-null  object 
 10  drive         12188 non-null  object 
 11  type          12188 non-null  object 
 12  paint_color   12188 non-null  object 
 13  state         12188 non-null  object 
 14  reliability   12188 non-null  float64
dtypes: float64(2), int64(2), object(11)
memory usage: 1.7+ MB


In [38]:
df.reset_index(inplace=True,drop=True)

In [39]:
'''
Ahora tenemos 2 columnas float, 2 columnas de tipo integer64 
y 12 de tipo object, estas ultimas son categoricas y necesitamos 
sustitur sus valores por numeros para poder trabajar con los modelos.

voy a usar ordinalencoder en lugar de labelencoder para poder realizarlo a 
todas las columnas a la vez y asi reducir el numero de pasos.
'''
# antes vamos a dividir el rango de kilometraje de la columna 'odometer' en 10 quantiles 
labels = [0,1,2,3,4,5,6,7,8,9]
df['odometer_lab'] = pd.qcut(x=df.loc[:,'odometer'],q=len(labels),labels=labels)

df['odometer_lab'] = df['odometer_lab'].astype(float)




In [40]:
# asignamos X e y como las features y target respectivamente
X = df.drop('price', axis=1)
y = df['price']

# dividimos nuestros datos en train y test para comenzar a trabajar con ellos 

X_train_df, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print(X_train_df.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(9141, 15)
(3047, 15)
(9141,)
(3047,)


In [41]:
# conversion de datos apoyandonos en el principio de pareto 
prueba= (X_train_df['model'].value_counts(normalize=True)*100)
prueba_df = pd.DataFrame(prueba)

# 1587 valores unicos, el 20% es 318, corrigiendo a 323 obtenemos el 80% de las apariciones de los modelos
# el resto de modelos los cambiamos a other y hemos reducido las categorias de 1587 a 324 en la columna modelos 
prueba_df.reset_index(inplace=True)

# una lista de los modelos que no van a ser modificados, siendo el 20% de valores unicos aunque utilizando 5 valores mas para poder alcanzar el 80% de los datos
lista_modelos =  prueba_df.loc[:((X_train_df.model.nunique())+5)*0.2,'index'].to_list()

# metemos en un diccionario los modelos que vamos a llamar 'other'
dic = {}
for num, i in enumerate(X_train_df.model):
    if i not in lista_modelos:
       dic[num]=i

# cambiamos los valores 
X_train_df['model'] = X_train_df['model'].replace(list(dic.values()),'other')

# comprobamos los valores de la columna model y vemos que hemos pasado de 1612 valores unicos a 325
X_train_df.nunique()

year              11
manufacturer      16
model            320
condition          3
cylinders          5
fuel               5
odometer        5370
title_status       6
transmission       3
drive              3
type               8
paint_color       12
state             51
reliability        8
odometer_lab      10
dtype: int64

In [42]:
# convertimos en lista el array de modelos unicos 
list_unicos = list(X_train_df['model'].unique())

# cada elemento dentro de la lista lo convertimos en una nueva lista para poder dividrlos en strings 
list_list = []
for i in range((len(list_unicos)+1)):
    list_list.append([])

for n,i in enumerate(list_list):
    
    i.append(list_unicos[n-1])

# dividimos cada elemento en estrings usando el espacio como separador 
list_nu = []
for n in list_list:

    list_nu.append(n[0].split(' '))
        
# nos quedamos con los primeros elementos de cada sublista ya que indica el modelo generico y asi podemos reducir el numero de valores unicos 
ult_list = []
for n in list_nu:
    if len(n)>1:

        if 'benz' in n:
            ult_list.append(n)

        elif n[0]=='3' or n[0] == '5' or n[0] == '6':
            ult_list.append(n[:2])
            
        else:
            ult_list.append([n[0]])
    else:
        ult_list.append(n)

# las sublistas que hemos dejado con mas de un elemento volvemos a unir sus strings ya que la informacion no estaba solo en el primer elemento de la sublista
ult_list2 = []
for n in ult_list:
    if len(n)>1:
        ult_list2.append([' '.join(n)])
    else:
        ult_list2.append(n)

# retiramos las sublistas y nos quedamos solo con una lista que contiene todos los elementos  
flat_list = [item for sublist in ult_list2 for item in sublist]

# ahora eleminamos los duplicados y comprobamos que nos hemos quedado con 132 elementos unicos de 325 originales
flat_list = list(dict.fromkeys(flat_list))
print('Valores unicos para  "modelo":',len(flat_list))

# a continuacion vamos a sustituir en X_train_df los antiguos valores por los nuevos ya filtrados



Valores unicos para  "modelo": 128


In [43]:
# unificamos las distintas versiones de un modelo en un solo modelo, quedandonos asi con 110 valores unicos 
for i,n in enumerate(flat_list):

    # corregir el bucle cuando tenga tiempo 
    if flat_list[i] == '3' or flat_list[i] == '5' or flat_list[i] == '6' or flat_list[i] == '7' or flat_list[i] == 'es' or flat_list[i] == 'is' or flat_list[i] == '2':
        continue
    
    elif flat_list[i] == '128i':
        X_train_df['model'] = X_train_df.model.replace(X_train_df.model[X_train_df.model.str.contains(flat_list[i])].unique(),'1 series')

    elif flat_list[i] == '320i' or flat_list[i] == '335i'   or flat_list[i] == '328xi'    or flat_list[i] == '335xi' or flat_list[i] == '328i':
        X_train_df['model'] = X_train_df.model.replace(X_train_df.model[X_train_df.model.str.contains(flat_list[i])].unique(),'3 series')

    elif flat_list[i] == '528i' or flat_list[i] == '535i'or flat_list[i] == '535xi' or flat_list[i] == '528xi':
        X_train_df['model'] = X_train_df.model.replace(X_train_df.model[X_train_df.model.str.contains(flat_list[i])].unique(),'5 series')

    else:
        X_train_df['model'] = X_train_df.model.replace(X_train_df.model[X_train_df.model.str.contains(flat_list[i])].unique(),flat_list[i])



In [44]:
# vamos a retirar la columna 'odometer' ya que son valores continuos y sus valores ya los hemos agrupado en la columna 'odometer_lab'
X_train_df.drop('odometer',axis=1, inplace=True)

In [45]:
X_train_df.columns

Index(['year', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel',
       'title_status', 'transmission', 'drive', 'type', 'paint_color', 'state',
       'reliability', 'odometer_lab'],
      dtype='object')

In [46]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
# a continuacion realizamos un onehot encoder para las coloumnas 
df_oh_train = pd.concat([X_train_df, y_train], axis= 1)

# vamos a transformar las columnas pero vamos a meter el dataframe X,y train.
transformer = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), ['manufacturer' , 'model', 'condition',
       'cylinders','fuel' ,'title_status','transmission','drive','type','paint_color', 'state']),remainder='passthrough')

transformer.fit(df_oh_train)

transformed = transformer.transform(df_oh_train).toarray()

transformed_train_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())





In [47]:
# transformacion X_test 

# vamos a utilizar la lista de modelos que obtuvimos en train para aplicarla a test
# esta lista contiene todos los modelos que mas aparecen y el resto lo marca como 'other'

lista_modelos 

# metemos en un diccionario los modelos que vamos a llamar 'other' esta vez test
dic = {}
for num, i in enumerate(X_test.model):
    if i not in lista_modelos:
       dic[num]=i

# cambiamos los valores en test
X_test['model'] = X_test['model'].replace(list(dic.values()),'other')

# el resultado nos da 105 modelos distintos 
X_train_df.nunique()

year             11
manufacturer     16
model           105
condition         3
cylinders         5
fuel              5
title_status      6
transmission      3
drive             3
type              8
paint_color      12
state            51
reliability       8
odometer_lab     10
dtype: int64

In [48]:
# no necesitamos repetir el proceso de quedarnos con modelos unicos ya que tenemos la lista de train 
# ahora esa lista la aplicamos en X_test y tendremos los valores unificados 
# unificamos las distintas versiones de un modelo en un solo modelo, quedandonos asi con 110 valores unicos 
for i,n in enumerate(flat_list):

    # corregir el bucle cuando tenga tiempo 
    if flat_list[i] == '3' or flat_list[i] == '5' or flat_list[i] == '6' or flat_list[i] == '7' or flat_list[i] == 'es' or flat_list[i] == 'is' or flat_list[i] == '2':
        continue
    
    elif flat_list[i] == '128i':
        X_test['model'] = X_test.model.replace(X_test.model[X_test.model.str.contains(flat_list[i])].unique(),'1 series')

    elif flat_list[i] == '320i' or flat_list[i] == '335i'   or flat_list[i] == '328xi'    or flat_list[i] == '335xi' or flat_list[i] == '328i':
        X_test['model'] = X_test.model.replace(X_test.model[X_test.model.str.contains(flat_list[i])].unique(),'3 series')

    elif flat_list[i] == '528i' or flat_list[i] == '535i'or flat_list[i] == '535xi' or flat_list[i] == '528xi':
        X_test['model'] = X_test.model.replace(X_test.model[X_test.model.str.contains(flat_list[i])].unique(),'5 series')

    else:
        X_test['model'] = X_test.model.replace(X_test.model[X_test.model.str.contains(flat_list[i])].unique(),flat_list[i])


In [49]:
# a continuacion realizamos un onehot encoder para las coloumnas 
df_oh_test = pd.concat([X_test, y_test], axis= 1)

transformed = transformer.transform(df_oh_test).toarray()

transformed_test_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
transformed_test_df



Unnamed: 0,onehotencoder__x0_audi,onehotencoder__x0_bmw,onehotencoder__x0_ford,onehotencoder__x0_honda,onehotencoder__x0_hyundai,onehotencoder__x0_kia,onehotencoder__x0_lexus,onehotencoder__x0_mazda,onehotencoder__x0_mercedes-benz,onehotencoder__x0_mini,...,onehotencoder__x10_va,onehotencoder__x10_vt,onehotencoder__x10_wa,onehotencoder__x10_wi,onehotencoder__x10_wv,onehotencoder__x10_wy,year,reliability,odometer_lab,price
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,80.0,0.0,11395.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2011.0,80.0,3.0,8985.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2016.0,80.0,6.0,8990.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2014.0,82.0,2.0,9500.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2013.0,80.0,5.0,13900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3042,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,87.0,2.0,13950.0
3043,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2013.0,80.0,7.0,5800.0
3044,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2015.0,82.0,4.0,14999.0
3045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2015.0,85.0,2.0,16500.0


In [50]:
X_train = transformed_train_df.drop('price',axis=1)
y_train = transformed_train_df['price']

X_test = transformed_test_df.drop('price',axis=1)
y_test = transformed_test_df['price']


In [51]:
lin_reg = Pipeline(steps=
                    [
                    ('scaler', StandardScaler()),
                    ('regressor', LinearRegression())]) # simple imputer
                    
scores = cross_val_score(lin_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=10)



In [52]:
abs(scores.mean())

2.4811022785868994e+28

In [53]:
lin_reg.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('regressor', LinearRegression())])

In [54]:
rmse = np.sqrt(mean_squared_error(y_train, lin_reg.predict(X_train)))
r2 = r2_score(y_train, lin_reg.predict(X_train))

print("The model performance for training set")
print("--------------------------------------")
print("RMSE is {}".format(rmse))
print("R2 score is {}".format(r2))
print("\n")

The model performance for training set
--------------------------------------
RMSE is 2220.4109525927133
R2 score is 0.7034536498982193




In [55]:
rmse_test = np.sqrt(mean_squared_error(y_test, lin_reg.predict(X_test)))
r2_test = r2_score(y_test, lin_reg.predict(X_test))

print("The model performance for test set")
print("--------------------------------------")
print("RMSE is {}".format(rmse_test))
print("R2 score is {}".format(r2_test))
print("\n")

The model performance for test set
--------------------------------------
RMSE is 2355.7149082470683
R2 score is 0.6565899188152515




In [57]:
poly_reg = Pipeline( steps= [
                            ('scaler', StandardScaler()),
                            ('regressor', PolynomialFeatures(degree=2))])

poly_reg.fit(X_train)

X_poly = poly_reg.transform(X_train)

pol_reg = LinearRegression()
pol_reg.fit(X_poly,y_train)

LinearRegression()

In [59]:
y_pred = pol_reg.predict(poly_reg.transform(X_test))
y_pred

array([ 1.26344623e+04,  8.13547839e+03,  7.69722827e+03, ...,
        1.57510578e+04, -7.19428385e+10,  1.28235028e+04])

In [63]:
rmse_test2 = np.sqrt(mean_squared_error(y_test, y_pred))
r2_test2 = r2_score(y_test, y_pred)

print("The model performance for training set")
print("--------------------------------------")
print("RMSE is {}".format(rmse_test2))
print("R2 score is {}".format(r2_test2))
print("\n")

The model performance for training set
--------------------------------------
RMSE is 5621657997934.274
R2 score is -1.955673818222572e+18




In [64]:
rmse_train = np.sqrt(mean_squared_error(y_train, pol_reg.predict(poly_reg.transform(X_train))))
r2_train = r2_score(y_train, pol_reg.predict(poly_reg.transform(X_train)))

print("The model performance for training set")
print("--------------------------------------")
print("RMSE is {}".format(rmse_train))
print("R2 score is {}".format(r2_train))
print("\n")

The model performance for training set
--------------------------------------
RMSE is 1248.5571957504603
R2 score is 0.9062345709540077




In [65]:
ridge_reg = Pipeline(steps=
                    [
                    ('scaler', StandardScaler()),
                    ('ridgereg', Ridge())])

ridge_params = {'ridgereg__alpha': np.arange(100,120,1)}

grid_ridge = GridSearchCV(estimator= ridge_reg, param_grid= ridge_params, n_jobs=-1, scoring='neg_mean_squared_error',cv=10, error_score='raise')

grid_ridge.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise',
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('ridgereg', Ridge())]),
             n_jobs=-1,
             param_grid={'ridgereg__alpha': array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 119])},
             scoring='neg_mean_squared_error')

In [66]:
grid_ridge.best_params_

{'ridgereg__alpha': 105}

In [67]:
grid_ridge.best_score_


-5204136.308817354

In [72]:
y_pred_gs_rid = grid_ridge.predict(X_test)

test_error_rid = np.sqrt(mean_squared_error(y_test, y_pred_gs_rid))
print("Test RMSE:", round(test_error_rid,2),'$')

Test RMSE: 2351.17 $


In [89]:
pipe_rf = Pipeline(steps=[
                    ('scaler', StandardScaler()),
    ("regressor", RandomForestRegressor())])


random_forest_params = {
    'regressor__n_estimators': [200],
    'regressor__max_features': [50]
}

grid_rf = GridSearchCV(pipe_rf,
                   random_forest_params,
                   cv = 5,
                   n_jobs=-1,
                   scoring='neg_mean_squared_error')

pipe_en = Pipeline(steps=[
                    ('scaler', StandardScaler()),
    ('regressor', ElasticNet())])

elastic_params = {
    'regressor__l1_ratio': [1],
    'regressor__alpha': np.arange(3.1, 10, 1)
}

grid_en = GridSearchCV(pipe_en,
                   elastic_params,
                   cv = 5,
                   n_jobs=-1,
                   scoring='neg_mean_squared_error')






grids = {
    "grid_rf": grid_rf,
    "grid_en": grid_en,
   
}





In [90]:
for nombre, grid_search in grids.items():
    grid_search.fit(X_train, y_train)

In [91]:
best_grids = [(i,abs( j.best_score_)) for i, j in grids.items()]

best_grids = pd.DataFrame(best_grids, columns = ["Grid", "best_loss"])
best_grids.sort_values(by = "best_loss", ascending = True)

Unnamed: 0,Grid,best_loss
0,grid_rf,4884782.0
1,grid_en,5213685.0


In [92]:
print("Best estimator:", grid_rf.best_estimator_)
print("Best params:", grid_rf.best_params_)
print("Best score:", grid_rf.best_score_)

Best estimator: Pipeline(steps=[('scaler', StandardScaler()),
                ('regressor',
                 RandomForestRegressor(max_features=50, n_estimators=200))])
Best params: {'regressor__max_features': 50, 'regressor__n_estimators': 200}
Best score: -4884782.325166544


In [93]:
y_pred_gs_rf = grid_rf.predict(X_test)
test_error_rf = np.sqrt(mean_squared_error(y_test, y_pred_gs_rf))
print("Test RMSE:", round(test_error_rf,2),'$')

Test RMSE: 2220.34 $


In [94]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [95]:
xgb_reg = xgboost.XGBRegressor(random_state=42, early_stopping_rounds = 2)
xgb_reg.fit(X_train, y_train, eval_set = [(X_val, y_val)])
y_pred = xgb_reg.predict(X_val)
val_error = mean_squared_error(y_val, y_pred)
print("Validation MSE:", val_error)

[0]	validation_0-rmse:7757.00150
[1]	validation_0-rmse:5752.72082
[2]	validation_0-rmse:4426.17114
[3]	validation_0-rmse:3591.74320
[4]	validation_0-rmse:3089.53925
[5]	validation_0-rmse:2801.37677
[6]	validation_0-rmse:2629.27742
[7]	validation_0-rmse:2528.09617
[8]	validation_0-rmse:2480.14736
[9]	validation_0-rmse:2438.47376
[10]	validation_0-rmse:2401.47930
[11]	validation_0-rmse:2381.33581
[12]	validation_0-rmse:2368.48314
[13]	validation_0-rmse:2354.51924
[14]	validation_0-rmse:2349.08611
[15]	validation_0-rmse:2337.86126
[16]	validation_0-rmse:2328.65399
[17]	validation_0-rmse:2322.73307
[18]	validation_0-rmse:2317.81370
[19]	validation_0-rmse:2316.60887
[20]	validation_0-rmse:2305.64043
[21]	validation_0-rmse:2300.97523
[22]	validation_0-rmse:2301.88851
[23]	validation_0-rmse:2297.71600
[24]	validation_0-rmse:2294.81889
[25]	validation_0-rmse:2291.84466
[26]	validation_0-rmse:2289.04679
[27]	validation_0-rmse:2282.61240
[28]	validation_0-rmse:2281.72824
[29]	validation_0-rmse:2

In [96]:
# no tieene sentido comrpobar todo con test ya que para eso se hace la validacion
# test solo debe probarse con el mejor modelo 
y_pred = xgb_reg.predict(X_test)
test_error = mean_squared_error(y_test, y_pred)
print("Test MSE:", test_error)

Test MSE: 5122060.920629907


In [103]:
best_grids
best_grids.sort_values(by = "best_loss", ascending = True)

Unnamed: 0,Grid,best_loss
3,grid_xgb,4671928.0
0,grid_rf,4692040.0
1,grid_en,8056868.0
2,grid_svr,8433718.0
