<a href="https://colab.research.google.com/github/estefaniahernandezz/PROYECTO-IA/blob/main/99_modelo_soluci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os
import re
import numpy as np
import pandas as pd

# Mount Google Drive first: the training CSV is read from a Drive folder.
from google.colab import drive

drive.mount("/content/drive")

# Base folder on Drive and the training file inside it.
BASE_PATH = "/content/drive/MyDrive/ARCHIVOS IA"
CSV_PATH = os.path.join(BASE_PATH, "train.csv")

# Let pandas show up to 120 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 120)

# low_memory=False parses the file in one pass instead of in chunks, which
# avoids mixed-type inference within a column.
df = pd.read_csv(CSV_PATH, low_memory=False)
print("Filas, columnas:", df.shape)
df.head()

# Strip stray whitespace from the column names.
df.columns = [c.strip() for c in df.columns]

# Trim leading/trailing whitespace in every text column.
# Missing cells are restored after the cast: astype(str) on its own turns NaN
# into the literal string "nan", which a later fillna() can no longer detect
# and which would end up as a category of its own when one-hot encoding.
for c in df.select_dtypes(include='object').columns:
    df[c] = df[c].astype(str).str.strip().where(df[c].notna())

df.head(2)
# === Convert Sí/No/N style answers to 1 and 0 ===
# Normalised (lower-case, accent-free) tokens read as "yes" and as "no".
_TRUE_TOKENS = frozenset({"si", "s", "yes", "y", "1", "true", "verdadero"})
_FALSE_TOKENS = frozenset({"no", "n", "0", "false", "falso"})


def to_binary(v):
    """Map a yes/no style value to 1 or 0.

    The value is converted to text, trimmed, lower-cased and stripped of the
    accents handled below before it is compared with the known tokens.

    Args:
        v: A scalar cell value (text, number, None or NaN).

    Returns:
        1 for an affirmative token, 0 for a negative token, and ``np.nan``
        when the value is missing or is not a recognised token.
    """
    if pd.isna(v):
        return np.nan
    s = str(v).strip().lower()
    # Repair mojibake before touching accents.  A UTF-8 "í" decoded as
    # Latin-1 becomes "Ã" followed by a soft hyphen (U+00AD); lower() has
    # already turned that "Ã" into "ã", so the lower-case pair is what must
    # be matched here.
    s = s.replace("\u00e3\u00ad", "i")
    # Drop the remaining accents.
    s = s.replace("í", "i").replace("á", "a").replace("ã", "a")
    if s in _TRUE_TOKENS:
        return 1
    if s in _FALSE_TOKENS:
        return 0
    return np.nan

# Convert yes/no style columns to 1/0.
# A column is converted only when every distinct non-missing value is either a
# token that to_binary() recognises or a missing-value placeholder.  Testing
# with .any() would convert a whole column because of a single "no"/"n"/"s"
# cell and turn every other category in it into NaN.
for c in df.columns:
    if pd.api.types.is_numeric_dtype(df[c]):
        continue  # numeric columns hold no text tokens
    # Work on the distinct values only, so the check stays cheap.
    distinct = pd.Series(df[c].dropna().unique(), dtype=object)
    recognised = distinct.map(to_binary).notna()
    # "nan"/"none"/"" are what missing cells look like once cast to text.
    placeholder = distinct.astype(str).str.strip().str.lower().isin(["", "nan", "none"])
    if recognised.any() and (recognised | placeholder).all():
        df[c] = df[c].map(to_binary)

df.head(3)


# Numeric columns: fill the gaps with each column's own median.
num_cols = list(df.select_dtypes(include=[np.number]).columns)
medians = df[num_cols].median()
for c in num_cols:
    df[c] = df[c].fillna(medians[c])

# Text columns: fill the gaps with the placeholder category "Desconocido".
cat_cols = list(df.select_dtypes(include='object').columns)
for c in cat_cols:
    df[c] = df[c].fillna("Desconocido")

# Share of missing values per column, ten largest first.
df.isna().mean().sort_values(ascending=False).head(10)

# One-hot encode the text columns, keeping one indicator per category
# (drop_first=False).
df_enc = pd.get_dummies(df, columns=cat_cols, drop_first=False)
print("Shape después de one-hot:", df_enc.shape)
df_enc.head(2)


from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns in two explicit steps: learn the scaling
# parameters, then apply them to the same columns.
scaler = StandardScaler()
scaler.fit(df_enc[num_cols])
df_enc[num_cols] = scaler.transform(df_enc[num_cols])
df_enc.head(2)

# =========================================================
# Quick sanity check of the preprocessed data
# =========================================================

print(f"Forma final del DataFrame: {df_enc.shape}")
print(f"- Filas: {df_enc.shape[0]}")
print(f"- Columnas: {df_enc.shape[1]}", end="\n\n")

# 1. Count the missing values left in the whole frame.
faltantes = df_enc.isna().to_numpy().sum()
print("Total de valores faltantes:", faltantes, end="\n\n")

# 2. Mean and standard deviation of the first 10 numeric columns.
print("Estadísticas de columnas numéricas (primeras 10):")
display(df_enc[num_cols].describe().loc[['mean', 'std']].T.head(10))
print()

# Columns whose name contains 'E_PRGM_DEPARTAMENTO', used by checks 3 and 4.
dept_onehot = df_enc.filter(like='E_PRGM_DEPARTAMENTO')

# 3. First rows of those columns.
print("Ejemplo de columnas One-Hot (departamentos):")
display(dept_onehot.head(5))
print()

# 4. The ten columns with the largest column sum.
print("Top 10 categorías más comunes (suma de 1's):")
display(dept_onehot.sum().sort_values(ascending=False).head(10))
print()


# 5. Reproducible random sample of 5 rows.
print("Muestra aleatoria de 5 filas:")
display(df_enc.sample(5, random_state=42))

#========================= Entrega Final =================================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# =======================================================
# 1) Load the preprocessed training set and the test set
# =======================================================
train = pd.read_csv("/content/drive/MyDrive/ARCHIVOS IA/train_preprocesado.csv")
test  = pd.read_csv("/content/test.csv")

# =======================================================
# 2) Rebuild the target
# =======================================================
# The training file carries the target as one indicator column per class.
target_cols = [
    "RENDIMIENTO_GLOBAL_alto",
    "RENDIMIENTO_GLOBAL_bajo",
    "RENDIMIENTO_GLOBAL_medio-alto",
    "RENDIMIENTO_GLOBAL_medio-bajo"
]

# Per row, take the name of the indicator column holding the largest value and
# strip the common prefix to recover the class label.  regex=False makes the
# replacement a literal one regardless of the pandas version's default.
# NOTE(review): a row with no indicator set would silently get the first
# class ("alto") -- confirm every row has exactly one indicator set.
train["RENDIMIENTO_GLOBAL"] = train[target_cols].idxmax(axis=1)
train["RENDIMIENTO_GLOBAL"] = train["RENDIMIENTO_GLOBAL"].str.replace(
    "RENDIMIENTO_GLOBAL_", "", regex=False
)

y_full = train["RENDIMIENTO_GLOBAL"]
# Besides the target columns, drop the row identifier: an ID is not a
# predictor and must not be offered to the model as a feature.
# errors="ignore" keeps this working when the file has no "ID" column.
X_full = train.drop(
    columns=target_cols + ["RENDIMIENTO_GLOBAL", "ID"],
    errors="ignore",
)

# =======================================================
# 3) LIMIT THE DATASET SIZE
# =======================================================

LIMIT = 50000    # maximum number of training rows to use (adjustable)
if len(X_full) > LIMIT:
    # Stratified subsample of exactly LIMIT rows, preserving the class mix.
    X, _, y, _ = train_test_split(
        X_full, y_full,
        train_size=LIMIT,
        stratify=y_full,
        random_state=42
    )
else:
    # train_test_split raises ValueError when train_size is not smaller than
    # the number of rows, so with LIMIT rows or fewer use everything.
    X, y = X_full, y_full
print("Tamaño final usado:", X.shape)

# =======================================================
# 4) Uniform preprocessing (X + test)
# =======================================================
# NOTE(review): the training rows come from a file that already contains
# one-hot indicator columns, while test is read from a separate CSV.  If
# test.csv is still raw, the two frames will not share the same feature
# columns after this step -- confirm test goes through the same preprocessing.
full = pd.concat([X, test], axis=0)
n_train = len(X)

# Numeric gaps -> column median.
numeric_names = full.select_dtypes(include=['int64','float64']).columns
for c in numeric_names:
    full[c] = full[c].fillna(full[c].median())

# Text gaps -> "Desconocido".
text_names = full.select_dtypes(include=['object']).columns
for c in text_names:
    full[c] = full[c].fillna("Desconocido")

# One-hot encode whatever text columns remain.
full_enc = pd.get_dummies(full, drop_first=False)

# Keep only the first occurrence of each column name (key step: the reindex
# below needs unique column labels), then split back into train and test rows
# by position.
keep = ~full_enc.columns.duplicated()
X_enc = full_enc.iloc[:n_train].loc[:, keep]
test_enc = full_enc.iloc[n_train:].loc[:, keep]

# Give test exactly the training columns; any column missing is filled with 0.
test_enc = test_enc.reindex(columns=X_enc.columns, fill_value=0)

# =======================================================
# 5) Encode the target
# =======================================================
# Learn the label <-> integer mapping, then apply it to the training labels.
le = LabelEncoder()
le.fit(y)
y_enc = le.transform(y)

# =======================================================
# 6) FINAL MODEL (tuned RandomForest)
# =======================================================

# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    class_weight="balanced",
    n_estimators=500,
    max_depth=20,
    min_samples_leaf=3,
).fit(X_enc, y_enc)

# =======================================================
# 7) Kaggle prediction
# =======================================================
# Predict the encoded classes and map them straight back to their labels.
test_pred = le.inverse_transform(model.predict(test_enc))

# Two-column submission: the test IDs next to the predicted label.
submission = test[["ID"]].assign(RENDIMIENTO_GLOBAL=test_pred)

submission.to_csv("submission.csv", index=False)
print("submission.csv generado correctamente.")
submission.head()
