# In [None]:
# =============================================
# 1. Librerías
# =============================================
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# =============================================
# 2. Load data
# =============================================
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# =============================================
# 3. Select the key socioeconomic columns
# =============================================
cols = [
    'FAMI_ESTRATOVIVIENDA',
    'FAMI_TIENEINTERNET',
    'FAMI_EDUCACIONPADRE',
    'FAMI_EDUCACIONMADRE',
]
# .copy() turns each column subset into an independent frame, so the
# per-column assignments below cannot raise SettingWithCopyWarning or
# depend on whether pandas handed back a view of the loaded data.
train = train[['ID'] + cols + ['RENDIMIENTO_GLOBAL']].copy()
test = test[['ID'] + cols].copy()

# =============================================
# 4. Clean out-of-range placeholder values
# =============================================
# These codes, whether read as strings or as integers, are replaced by NaN
# so that the imputation step below treats them as missing.
valores_raros = ['98', '99', '999', 98, 99, 999]
for col in cols:
    train[col] = train[col].replace(valores_raros, np.nan)
    test[col] = test[col].replace(valores_raros, np.nan)

# =============================================
# 5. Imputation: overall mode
# =============================================
for col in cols:
    # The mode is computed on train only and reused for test, so both
    # sets are filled with the same value and no test data is consulted.
    modo = train[col].mode()[0]
    train[col] = train[col].fillna(modo)
    test[col] = test[col].fillna(modo)

# =============================================
# 6. One-hot encoding
# =============================================
def one_hot(df, col):
    """Return a copy of *df* where column *col* is one-hot encoded.

    The original column is removed and its indicator columns, named
    ``<col>_<value>``, are appended after the remaining columns.
    *df* itself is not modified.
    """
    indicators = pd.get_dummies(df[col], prefix=col)
    remaining = df.drop(columns=[col])
    return pd.concat([remaining, indicators], axis=1)

for col in cols:
    train = one_hot(train, col)
    test = one_hot(test, col)

# Align test columns with train: categories that never appear in test
# become all-zero columns, and categories seen only in test are dropped.
test = test.reindex(columns=train.drop(columns=['RENDIMIENTO_GLOBAL']).columns, fill_value=0)

# Model inputs are every column except the row identifier and the target.
# 'ID' stays in `train`/`test` for the submission but must not reach the
# classifier: it is an arbitrary row label, not a predictor.
feature_cols = [c for c in train.columns if c not in ('ID', 'RENDIMIENTO_GLOBAL')]

# =============================================
# 7. Target encoding
# =============================================
mapa = {'bajo': 0, 'medio-bajo': 1, 'medio-alto': 2, 'alto': 3}
inv_mapa = {v: k for k, v in mapa.items()}
y = train['RENDIMIENTO_GLOBAL'].map(mapa)
X = train[feature_cols]

# =============================================
# 8. Scaling (required by the SVM)
# =============================================
# NOTE(review): the scaler is fit on all training rows before the
# hold-out split and the cross-validation below, so those scores see
# statistics from their own validation rows. Wrapping scaler + SVC in a
# sklearn Pipeline would remove that leak.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test[feature_cols])

# =============================================
# 9. Validation split
# =============================================
Xtrain, Xval, ytrain, yval = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# =============================================
# 10. Training
# =============================================
clf = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)
clf.fit(Xtrain, ytrain)

# =============================================
# 11. Evaluation
# =============================================
yval_pred = clf.predict(Xval)
acc = accuracy_score(yval, yval_pred)
print(f"Accuracy en validación: {acc:.4f}")
scores = cross_val_score(clf, X_scaled, y, cv=5)
print(f"CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

# =============================================
# 12. Final prediction
# =============================================
# Refit on the full training set before predicting the test set.
clf.fit(X_scaled, y)
y_test_pred = clf.predict(test_scaled)

# =============================================
# 13. Submission
# =============================================
submission = pd.DataFrame({
    'ID': test['ID'],
    # Reuse test's index so labels line up with IDs row by row even if
    # `test` no longer carries a default 0..n-1 index.
    'RENDIMIENTO_GLOBAL': pd.Series(y_test_pred, index=test.index).map(inv_mapa)
})
submission.to_csv("submission.csv", index=False)
print(submission.head())
