<a href="https://colab.research.google.com/github/estefaniahernandezz/PROYECTO-IA/blob/main/04_modelo_con_OneHot_y_CatBoost_SVM_alternativo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# =======================================================
# 1) Load the preprocessed training set and the test set
# =======================================================
TRAIN_CSV = "/content/drive/MyDrive/ARCHIVOS IA/train_preprocesado.csv"
TEST_CSV = "/content/test.csv"

# Read both files in the same order as before: train first, then test.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# =======================================================
# 2) Rebuild the single target column
# =======================================================
# The target is spread over four columns (presumably one-hot indicators —
# confirm against the preprocessing notebook); collapse them into one label.
target_cols = [
    "RENDIMIENTO_GLOBAL_alto",
    "RENDIMIENTO_GLOBAL_bajo",
    "RENDIMIENTO_GLOBAL_medio-alto",
    "RENDIMIENTO_GLOBAL_medio-bajo"
]

# idxmax(axis=1) gives, for each row, the name of the column holding the
# largest value; removing the shared prefix leaves just the class label.
train["RENDIMIENTO_GLOBAL"] = (
    train[target_cols]
    .idxmax(axis=1)
    .str.replace("RENDIMIENTO_GLOBAL_", "")
)

# y = target, X = every remaining column (no target information left in X).
y_full = train["RENDIMIENTO_GLOBAL"]
X_full = train.drop(columns=[*target_cols, "RENDIMIENTO_GLOBAL"])

# =======================================================
# 3) Cap the number of training rows (without distorting class proportions)
# =======================================================

LIMIT = 70000

# train_test_split rejects an integer train_size that is not smaller than the
# number of rows, and a stratified split additionally needs at least one
# left-over row per class. When fewer rows than that would remain, train on
# the whole dataset instead of crashing (this may exceed LIMIT by at most
# "number of classes - 1" rows).
n_spare = len(X_full) - LIMIT
if n_spare >= y_full.nunique():
    X, _, y, _ = train_test_split(
        X_full, y_full,
        train_size=LIMIT,
        stratify=y_full,   # keeps the class proportions of the full set
        random_state=42
    )
else:
    X, y = X_full, y_full

print("Tamaño final usado para entrenar:", X.shape)

# =======================================================
# 4) Concatenate X and test so both receive the same one-hot columns
# =======================================================
# NOTE(review): this assumes test.csv carries the same feature columns as
# train_preprocesado.csv — confirm. Columns present in only one of the two
# frames end up entirely imputed for the rows of the other frame.
ID_COL = "ID"
n_train = len(X)

# The row identifier is only needed to build the submission file; it must not
# be fed to the model as a feature, so it is left out of the feature matrix.
# errors="ignore" keeps this safe when a frame has no such column.
full = pd.concat(
    [
        X.drop(columns=[ID_COL], errors="ignore"),
        test.drop(columns=[ID_COL], errors="ignore"),
    ],
    axis=0,
)

# Fill numeric gaps with the column median. "number" selects every numeric
# dtype, not only int64/float64, so no numeric column keeps its NaNs.
for c in full.select_dtypes(include="number").columns:
    full[c] = full[c].fillna(full[c].median())

# Fill categorical gaps with an explicit "unknown" category.
for c in full.select_dtypes(include=['object']).columns:
    full[c] = full[c].fillna("Desconocido")

# One-hot encode the remaining categorical columns.
full_enc = pd.get_dummies(full, drop_first=False)

# Split back by position: the first n_train rows came from X, the rest from test.
X_enc = full_enc.iloc[:n_train]
test_enc = full_enc.iloc[n_train:]

# Drop duplicated column names (defensive).
X_enc = X_enc.loc[:, ~X_enc.columns.duplicated()]
test_enc = test_enc.loc[:, ~test_enc.columns.duplicated()]

# Force test to expose exactly the training columns, in the same order.
test_enc = test_enc.reindex(columns=X_enc.columns, fill_value=0)

# =======================================================
# 5) Encode the class labels as integers
# =======================================================
# Keep the fitted encoder around: it is needed later to turn the predicted
# integer codes back into the original label strings.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# =======================================================
# 6) Train the random forest
# =======================================================

# Hyper-parameters gathered in one place so they are easy to review and tweak.
rf_params = dict(
    n_estimators=500,
    max_depth=20,
    min_samples_leaf=3,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(**rf_params).fit(X_enc, y_enc)

# =======================================================
# 7) Kaggle predictions
# =======================================================
# Predict integer class codes and map them straight back to label strings.
test_pred = le.inverse_transform(model.predict(test_enc))

# Two-column submission frame: the test identifier plus the predicted label.
submission = test[["ID"]].assign(RENDIMIENTO_GLOBAL=test_pred)

submission.to_csv("submission.csv", index=False)
print("submission.csv generado OK.")
submission.head()


Tamaño final usado para entrenar: (70000, 1041)
