In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Fixed seed reused by the train/validation split and the stochastic models.
RANDOM_STATE = 42

# Load the training and test sets; paths are relative to the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Kept as the last expression of the cell so the preview is actually rendered
# (a bare expression followed by another statement produces no output).
train.head()


In [3]:
# ============================================
# 1. Define features and target
# ============================================
target_col = "popularity"

# Columns removed from the feature matrix together with the target.
drop_cols = [
    "id",
    "track_id",
    "artists",
    "album_name",
    "track_name",
]

y = train[target_col]
X = train.drop(columns=[target_col, *drop_cols])

# Columns routed to the numeric branch of the preprocessor.
numeric_features = [
    "duration_ms",
    "danceability",
    "energy",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]

# Columns routed to the categorical branch of the preprocessor.
categorical_features = [
    "explicit",
    "key",
    "mode",
    "time_signature",
    "track_genre",
]

In [4]:
# ============================================
# 2. Train / validation split
# ============================================
# Hold out 20% of the rows for validation; the fixed seed keeps the
# partition reproducible across runs.
(X_train, X_val, y_train, y_val) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [5]:
# ============================================
# 3. Preprocessing
# ============================================

# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories not seen during fit are ignored instead of
# raising an error.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch. Columns not listed here are dropped
# (the ColumnTransformer default for `remainder`).
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [6]:
# ============================================
# 4. Regression models to evaluate
# ============================================

# Candidate estimators keyed by the label printed in the report below.
modelos = {
    "Regresión Lineal": LinearRegression(),
    "Árbol de Decisión": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest": RandomForestRegressor(
        n_estimators=200,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    ),
    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "MLPRegressor": MLPRegressor(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        max_iter=200,
        random_state=RANDOM_STATE,
    ),
}

# Collected as (model label, validation RMSE) pairs, in evaluation order.
resultados = []

for nombre, modelo in modelos.items():
    # Same preprocessing for every candidate, followed by the estimator itself.
    pipe = Pipeline([("preprocess", preprocessor), ("model", modelo)])

    # Fit on the training split and score on the held-out validation split.
    y_pred = pipe.fit(X_train, y_train).predict(X_val)

    # RMSE computed manually as the square root of the MSE.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5

    resultados.append((nombre, rmse))
    print(f"{nombre} - RMSE: {rmse:.4f}")


Regresión Lineal - RMSE: 19.1185
Árbol de Decisión - RMSE: 21.6687
Random Forest - RMSE: 15.7183
Gradient Boosting - RMSE: 19.9061
MLPRegressor - RMSE: 18.7064


