In [15]:
# ============================================
# 1. Imports y configuracion general
# ============================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    FunctionTransformer
)
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


In [16]:
# ============================================
# 2. Carga de datos
# ============================================
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

print(train.shape, test.shape)
train.head()


(79800, 21) (34200, 20)


Unnamed: 0,id,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,41996,7hUhmkALyQ8SX9mJs5XI3D,Love and Rockets,Love and Rockets,Motorcycle,22,211533,False,0.305,0.849,...,-10.795,1,0.0549,5.8e-05,0.0567,0.464,0.32,141.793,4,goth
1,76471,5x59U89ZnjZXuNAAlc8X1u,Filippa Giordano,Filippa Giordano,"Addio del passato - From ""La traviata""",22,196000,False,0.287,0.19,...,-12.03,0,0.037,0.93,0.000356,0.0834,0.133,83.685,4,opera
2,54809,70Vng5jLzoJLmeLu3ayBQq,Susumu Yokota,Symbol,Purple Rose Minuet,37,216506,False,0.583,0.509,...,-9.661,1,0.0362,0.777,0.202,0.115,0.544,90.459,3,idm
3,16326,1cRfzLJapgtwJ61xszs37b,Franz Liszt;YUNDI,Relajación y siestas,"Liebeslied (Widmung), S. 566",0,218346,False,0.163,0.0368,...,-23.149,1,0.0472,0.991,0.899,0.107,0.0387,69.442,3,classical
4,109799,47d5lYjbiMy0EdMRV8lRou,Scooter,Scooter Forever,The Darkside,27,173160,False,0.647,0.921,...,-7.294,1,0.185,0.000939,0.371,0.131,0.171,137.981,4,techno


In [17]:
# ============================================
# 3. Definicion de target, columnas numericas y categoricas
# ============================================
target_col = "popularity"

# columnas de alta cardinalidad (texto libre / ids)
high_card_cols = ["id", "track_id", "artists", "album_name", "track_name"]

# columnas categoricas "clasicas" del dataset
categorical_cols_base = ["explicit", "key", "mode", "time_signature", "track_genre"]

# todas las numericas (incluye popularity)
numeric_cols_all = train.select_dtypes(include=["int64", "float64"]).columns.tolist()
numeric_cols_all

# ============================================
# 3.1 Analisis de alta cardinalidad
# ============================================
for col in ["artists", "album_name", "track_name"]:
    nunique = train[col].nunique()
    print(f"{col}: {nunique} valores distintos")


artists: 25775 valores distintos
album_name: 37315 valores distintos
track_name: 55767 valores distintos


In [18]:
# ============================================
# 4.1 Nueva feature: duracion en minutos
# ============================================
for df in [train, test]:
    df["duration_min"] = df["duration_ms"] / 60000.0

# ============================================
# 4.2 Nueva feature: interaccion energy * danceability
# ============================================
for df in [train, test]:
    df["energy_danceability"] = df["energy"] * df["danceability"]



In [19]:
# ============================================
# 5. Gestion de outliers con IQR capping
# ============================================

# Definimos las columnas numericas para tratar (excepto target)
numeric_features = [
    col for col in numeric_cols_all
    if col != target_col
]

# Calculamos limites IQR en train
iqr_bounds = {}
for col in numeric_features:
    q1 = train[col].quantile(0.25)
    q3 = train[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    iqr_bounds[col] = (lower, upper)

# Aplicamos cap en train y test con los mismos limites
def cap_iqr(df, bounds):
    df_cap = df.copy()
    for col, (lower, upper) in bounds.items():
        if col in df_cap.columns:
            df_cap[col] = df_cap[col].clip(lower=lower, upper=upper)
    return df_cap

train = cap_iqr(train, iqr_bounds)
test = cap_iqr(test, iqr_bounds)


In [20]:
# ============================================
# 6. Seleccion final de features
# ============================================

# Partimos de todas las columnas
all_cols = train.columns.tolist()

# Eliminamos target y columnas de alta cardinalidad
drop_cols = high_card_cols + [target_col]

feature_cols = [c for c in all_cols if c not in drop_cols]

# Recalcular listas numericas y categoricas sobre feature_cols
numeric_features = train[feature_cols].select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = train[feature_cols].select_dtypes(include=["object", "bool"]).columns.tolist()

print("Features numericas:", numeric_features)
print("Features categoricas:", categorical_features)


Features numericas: ['duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'duration_min', 'energy_danceability']
Features categoricas: ['explicit', 'track_genre']


In [21]:
# ============================================
# 7. Train / Validation split
# ============================================
X = train[feature_cols].copy()
y = train[target_col].copy()

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_STATE
)

X_train.shape, X_val.shape


((63840, 17), (15960, 17))

In [22]:
# ============================================
# 8. Preprocesamiento: ColumnTransformer
# ============================================

# Transformacion log1p para numericas (evita log(0))
log_transformer = FunctionTransformer(np.log1p, validate=False)

numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)


In [23]:
# ============================================
# 9. Funcion auxiliar: entrenar + buscar hiperparametros + evaluar
# ============================================
def evaluar_modelo(nombre, estimator, param_grid=None, cv=5):
    """
    Crea un Pipeline(preprocessor + estimator),
    realiza GridSearchCV (si hay param_grid),
    entrena en X_train / y_train y evalua en X_val / y_val.
    Devuelve un dict con metricas y mejores hiperparametros.
    """
    print(f"\n=== {nombre} ===")
    
    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", estimator)
    ])
    
    if param_grid is not None and len(param_grid) > 0:
        grid = GridSearchCV(
            pipe,
            param_grid=param_grid,
            cv=cv,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1,
            verbose=1
        )
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_params = grid.best_params_
        cv_rmse = -grid.best_score_
        print(f"Mejores hiperparámetros: {best_params}")
        print(f"Mejor RMSE CV: {cv_rmse:.4f}")
    else:
        # Sin busqueda de hiperparametros (ej: Regresion Lineal)
        best_model = pipe
        best_model.fit(X_train, y_train)
        best_params = {}
        cv_rmse = np.nan
    
    # Evaluacion en el conjunto de validacion (holdout)
    y_pred = best_model.predict(X_val)
    
    mse = mean_squared_error(y_val, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    print(f"RMSE val: {rmse:.4f}")
    print(f"MSE  val: {mse:.4f}")
    print(f"MAE  val: {mae:.44f}")
    print(f"R²   val: {r2:.4f}")
    
    resultados = {
        "modelo": nombre,
        "best_params": best_params,
        "cv_rmse": cv_rmse,
        "rmse_val": rmse,
        "mse_val": mse,
        "mae_val": mae,
        "r2_val": r2,
        "best_estimator": best_model
    }
    return resultados


In [24]:
# ============================================
# 10. Modelos y grids de hiperparametros
# ============================================

modelos = []

# 1) Regresion Lineal (sin hiperpaametros relevantes)
modelos.append({
    "nombre": "LinearRegression",
    "estimator": LinearRegression(),
    "param_grid": {}  # sin hyperparam tuning
})

# 2) Arbol de Decision
modelos.append({
    "nombre": "DecisionTreeRegressor",
    "estimator": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "param_grid": {
        "model__max_depth": [None, 5, 10, 20],
        "model__min_samples_split": [2, 5, 10],
        "model__min_samples_leaf": [1, 2, 4]
    }
})

# 3) Random Forest
modelos.append({
    "nombre": "RandomForestRegressor",
    "estimator": RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1),
    "param_grid": {
        "model__n_estimators": [100, 200],
        "model__max_depth": [None, 10, 20],
        "model__min_samples_leaf": [1, 2, 4],
        "model__max_features": ["sqrt", "log2"]
    }
})

# 4) Gradient Boosting
modelos.append({
    "nombre": "GradientBoostingRegressor",
    "estimator": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "param_grid": {
        "model__n_estimators": [100, 200],
        "model__learning_rate": [0.05, 0.1],
        "model__max_depth": [2, 3],
        "model__min_samples_leaf": [1, 2]
    }
})

# 5) Red Neuronal MLP
modelos.append({
    "nombre": "MLPRegressor",
    "estimator": MLPRegressor(
        random_state=RANDOM_STATE,
        max_iter=300
    ),
    "param_grid": {
        "model__hidden_layer_sizes": [(64,), (64, 32)],
        "model__alpha": [0.0001, 0.001],
        "model__learning_rate_init": [0.001, 0.01]
    }
})


In [25]:
# ============================================
# 11. Entrenamiento, busqueda de hiperparametros y evaluacion
# ============================================
resultados = []

for m in modelos:
    res = evaluar_modelo(
        nombre=m["nombre"],
        estimator=m["estimator"],
        param_grid=m["param_grid"],
        cv=5  # Cross-Validation k-fold
    )
    resultados.append(res)

# Convertir a DataFrame para resumen
resultados_df = pd.DataFrame([{
    "modelo": r["modelo"],
    "cv_rmse": r["cv_rmse"],
    "rmse_val": r["rmse_val"],
    "mse_val": r["mse_val"],
    "mae_val": r["mae_val"],
    "r2_val": r["r2_val"],
    "best_params": r["best_params"]
} for r in resultados])

# Ordenar por RMSE de validacion
resultados_df.sort_values("rmse_val", inplace=True)
resultados_df



=== LinearRegression ===
RMSE val: 19.1222
MSE  val: 365.6591
MAE  val: 14.00852793635774062863674771506339311599731445
R²   val: 0.2604

=== DecisionTreeRegressor ===
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Mejores hiperparámetros: {'model__max_depth': 20, 'model__min_samples_leaf': 4, 'model__min_samples_split': 10}
Mejor RMSE CV: 20.6579
RMSE val: 20.3722
MSE  val: 415.0255
MAE  val: 15.92286178065897672695427900180220603942871094
R²   val: 0.1606

=== RandomForestRegressor ===
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Mejores hiperparámetros: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__n_estimators': 200}
Mejor RMSE CV: 16.2003
RMSE val: 15.6015
MSE  val: 243.4063
MAE  val: 11.05727427988402133962608786532655358314514160
R²   val: 0.5077

=== GradientBoostingRegressor ===
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Mejores hiperparámetros: {'model__learning_rate': 0.1, 'm



Unnamed: 0,modelo,cv_rmse,rmse_val,mse_val,mae_val,r2_val,best_params
2,RandomForestRegressor,16.200262,15.601484,243.406305,11.057274,0.507687,"{'model__max_depth': None, 'model__max_feature..."
4,MLPRegressor,18.632637,18.32028,335.632661,13.265748,0.321151,"{'model__alpha': 0.001, 'model__hidden_layer_s..."
0,LinearRegression,,19.122214,365.65907,14.008528,0.260419,{}
3,GradientBoostingRegressor,19.488466,19.270401,371.348362,15.105208,0.248912,"{'model__learning_rate': 0.1, 'model__max_dept..."
1,DecisionTreeRegressor,20.657926,20.372176,415.025544,15.922862,0.160571,"{'model__max_depth': 20, 'model__min_samples_l..."


In [26]:
# ============================================
# 12. Seleccion del mejor modelo (segun RMSE en validacion)
# ============================================
mejor_idx = resultados_df["rmse_val"].idxmin()
mejor_nombre = resultados_df.loc[mejor_idx, "modelo"]
print("Mejor modelo segun RMSE de validacion:", mejor_nombre)

# Recuperamos el estimator ya entrenado en train/val
# (lo tenemos en la lista 'resultados')
mejor_dict = [r for r in resultados if r["modelo"] == mejor_nombre][0]
mejor_modelo = mejor_dict["best_estimator"]
mejor_modelo


Mejor modelo segun RMSE de validacion: RandomForestRegressor


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [27]:
from sklearn.base import clone

# Volvemos a armar X_full e y_full
X_full = train[feature_cols].copy()
y_full = train[target_col].copy()   # target_col = "popularity"

# Clonamos el mejor modelo (Pipeline) y lo entrenamos con todo el train
modelo_final = clone(mejor_modelo)
modelo_final.fit(X_full, y_full)


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
# Features del set de test
X_test = test[feature_cols].copy()

# Predicciones del modelo final
test_preds = modelo_final.predict(X_test)

# Armar submission con los nombres que pide la competencia
submission = pd.DataFrame({
    "id": test["id"],          # o el nombre de ID que tenga tu test
    "popularity": test_preds   # o el nombre de la columna objetivo en sample_submission
})

display(submission.head())
print(submission.shape)
print(submission.isna().sum())

# Guardar archivo para subir a Kaggle
submission.to_csv("submission.csv", index=False)
print("Archivo 'submission.csv' generado.")


Unnamed: 0,id,popularity
0,113186,45.773333
1,42819,15.73
2,59311,2.93
3,91368,0.121796
4,61000,26.6525


(34200, 2)
id            0
popularity    0
dtype: int64
Archivo 'submission.csv' generado.


In [29]:
# ============================================
# Feature nueva: popularidad promedio por género
# ============================================

# Popularidad promedio por track_genre en TRAIN
genre_mean = train.groupby("track_genre")[target_col].mean()

global_mean = train[target_col].mean()

# Agregamos la columna a train y test (con fallback al promedio global)
train["genre_pop_mean"] = train["track_genre"].map(genre_mean).fillna(global_mean)
test["genre_pop_mean"] = test["track_genre"].map(genre_mean).fillna(global_mean)
# Volver a armar feature_cols, numeric_features y categorical_features

drop_cols = high_card_cols + [target_col]

feature_cols = [c for c in train.columns if c not in drop_cols]

numeric_features = train[feature_cols].select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = train[feature_cols].select_dtypes(include=["object", "bool"]).columns.tolist()

print("Num:", numeric_features)
print("Cat:", categorical_features)



Num: ['duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'duration_min', 'energy_danceability', 'genre_pop_mean']
Cat: ['explicit', 'track_genre']


In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

X = train[feature_cols].copy()
y = train[target_col].copy()

rf = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)

rf_pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", rf)
])

param_grid_rf = {
    "model__n_estimators": [200, 400],
    "model__max_depth": [None, 25, 35],
    "model__min_samples_leaf": [1, 2],
    "model__max_features": ["sqrt"]
}

grid_rf = GridSearchCV(
    rf_pipe,
    param_grid=param_grid_rf,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2
)

grid_rf.fit(X, y)

print("Mejores params RF:", grid_rf.best_params_)
print("Mejor RMSE CV RF:", -grid_rf.best_score_)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Mejores params RF: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__n_estimators': 400}
Mejor RMSE CV RF: 15.747570879392532


In [31]:
from sklearn.base import clone

best_rf_pipe = grid_rf.best_estimator_

# Reentrenamos en TODO el train
X_full = train[feature_cols].copy()
y_full = train[target_col].copy()

modelo_final = clone(best_rf_pipe)
modelo_final.fit(X_full, y_full)

# Predicciones para test
X_test = test[feature_cols].copy()
preds = modelo_final.predict(X_test)

submission = pd.DataFrame({
    "id": test["id"],
    "popularity": preds
})

submission.to_csv("submission_rf_genre_mean.csv", index=False)
print("Nuevo archivo generado: submission_rf_genre_mean.csv")


Nuevo archivo generado: submission_rf_genre_mean.csv


In [33]:
from sklearn.model_selection import KFold

def target_encode_oof(train, test, col, target_col, n_splits=5, smoothing=10):
    """
    Target encoding out-of-fold para la columna categórica `col`.
    Devuelve train, test con una nueva columna f"{col}_te".
    """
    global_mean = train[target_col].mean()
    
    te_col = f"{col}_te"
    train[te_col] = np.nan
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    
    for train_idx, val_idx in kf.split(train):
        tr = train.iloc[train_idx]
        val = train.iloc[val_idx]
        
        stats = tr.groupby(col)[target_col].agg(["mean", "count"])
        # smoothing
        stats["te"] = (stats["mean"] * stats["count"] + global_mean * smoothing) / (stats["count"] + smoothing)
        
        train.loc[val.index, te_col] = val[col].map(stats["te"])
    
    # Si queda algún NaN (categorías raras), usar global_mean
    train[te_col] = train[te_col].fillna(global_mean)
    
    # Para test usamos stats calculadas en TODO el train
    stats_full = train.groupby(col)[target_col].agg(["mean", "count"])
    stats_full["te"] = (stats_full["mean"] * stats_full["count"] + global_mean * smoothing) / (stats_full["count"] + smoothing)
    test[te_col] = test[col].map(stats_full["te"]).fillna(global_mean)
    
    return train, test

# Encoding por artista y álbum
train, test = target_encode_oof(train, test, "artists", target_col)
train, test = target_encode_oof(train, test, "album_name", target_col)

# Ya tenías genre_pop_mean creada antes, podés mantenerla.



In [35]:
# Volvemos a definir qué columnas droppeamos
high_card_cols = ["id", "track_id", "artists", "album_name", "track_name"]  # seguimos sacando las originales
drop_cols = high_card_cols + [target_col]

feature_cols = [c for c in train.columns if c not in drop_cols]

numeric_features = train[feature_cols].select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = train[feature_cols].select_dtypes(include=["object", "bool"]).columns.tolist()

print("Num:", numeric_features)
print("Cat:", categorical_features)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)



Num: ['duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'duration_min', 'energy_danceability', 'genre_pop_mean', 'artists_te', 'album_name_te']
Cat: ['explicit', 'track_genre']


In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

X = train[feature_cols].copy()
y = train[target_col].copy()

rf = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)

rf_pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", rf)
])

param_grid_rf = {
    "model__n_estimators": [200, 400],
    "model__max_depth": [None, 25, 35],
    "model__min_samples_leaf": [1, 2],
    "model__max_features": ["sqrt"]
}

grid_rf = GridSearchCV(
    rf_pipe,
    param_grid=param_grid_rf,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2
)

grid_rf.fit(X, y)

print("Mejores params RF:", grid_rf.best_params_)
print("Mejor RMSE CV RF:", -grid_rf.best_score_)
rf_best = grid_rf.best_estimator_


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Mejores params RF: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__n_estimators': 400}
Mejor RMSE CV RF: 8.746982414714944


In [37]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

hgb = HistGradientBoostingRegressor(random_state=RANDOM_STATE)

hgb_pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", hgb)
])

param_dist_hgb = {
    "model__learning_rate": [0.01, 0.03, 0.05, 0.1],
    "model__max_depth": [None, 6, 10],
    "model__max_leaf_nodes": [31, 63, 127],
    "model__min_samples_leaf": [20, 40, 80]
}

rand_hgb = RandomizedSearchCV(
    hgb_pipe,
    param_distributions=param_dist_hgb,
    n_iter=20,              # podés subirlo si tu PC banca
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
    random_state=RANDOM_STATE
)

rand_hgb.fit(X, y)

print("Mejores params HGBR:", rand_hgb.best_params_)
print("Mejor RMSE CV HGBR:", -rand_hgb.best_score_)
hgb_best = rand_hgb.best_estimator_


Fitting 5 folds for each of 20 candidates, totalling 100 fits


ValueError: 
All the 100 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_hist_gradient_boosting\gradient_boosting.py", line 564, in fit
    X, known_categories = self._preprocess_X(X, reset=True)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_hist_gradient_boosting\gradient_boosting.py", line 274, in _preprocess_X
    X = validate_data(self, X, **check_X_kwargs)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 2954, in validate_data
    out = check_array(X, input_name="X", **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _ensure_sparse_format(
            ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 611, in _ensure_sparse_format
    raise TypeError(
TypeError: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.


In [38]:
from sklearn.base import clone

# ============================
# Entrenar modelo final (solo Random Forest)
# ============================

X_full = train[feature_cols].copy()
y_full = train[target_col].copy()

rf_final = clone(rf_best)      # clonamos el mejor RF encontrado por GridSearchCV
rf_final.fit(X_full, y_full)   # entrenamos con TODO el train

# ============================
# Predicciones sobre el set de test
# ============================

X_test = test[feature_cols].copy()
pred_rf = rf_final.predict(X_test)

# ============================
# Generar submission para Kaggle
# ============================

submission = pd.DataFrame({
    "id": test["id"],        # columna ID
    "popularity": pred_rf    # predicción del RF final
})

submission.to_csv("submission.csv", index=False)
print("Archivo 'submission.csv' generado correctamente.")



Archivo 'submission.csv' generado correctamente.


In [39]:
print("popularity" in feature_cols)
print(target_col, "en feature_cols?", target_col in feature_cols)

False
popularity en feature_cols? False


In [40]:
len(feature_cols), feature_cols


(20,
 ['duration_ms',
  'explicit',
  'danceability',
  'energy',
  'key',
  'loudness',
  'mode',
  'speechiness',
  'acousticness',
  'instrumentalness',
  'liveness',
  'valence',
  'tempo',
  'time_signature',
  'track_genre',
  'duration_min',
  'energy_danceability',
  'genre_pop_mean',
  'artists_te',
  'album_name_te'])