In [135]:
# ==========================
# 0) Imports y configuración
# ==========================
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import layers, models, callbacks
from sklearn.pipeline import Pipeline

In [136]:
# Reproducibilidad
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("TensorFlow:", tf.__version__)

TensorFlow: 2.19.0


In [137]:
# =====================================
# 1) Cargar dataset
# =====================================
df = pd.read_csv("/content/student_exam_scores.csv")

# Revisar columnas y forma
print(df.columns)
df.head()

Index(['student_id', 'hours_studied', 'sleep_hours', 'attendance_percent',
       'previous_scores', 'exam_score'],
      dtype='object')


Unnamed: 0,student_id,hours_studied,sleep_hours,attendance_percent,previous_scores,exam_score
0,S001,8.0,8.8,72.1,45,30.2
1,S002,1.3,8.6,60.7,55,25.0
2,S003,4.0,8.2,73.7,86,35.8
3,S004,3.5,4.8,95.1,66,34.0
4,S005,9.1,6.4,89.8,71,40.3


In [138]:
# =======================================
# 2) Selección de variables y objetivo (y)
# =======================================
# Objetivo: exam_score (sacar notas)
y_binary = (df["exam_score"].astype(int) >= 30).astype(int)

# Tomamos columnas usuales y evitamos leakage
X = df[[
    "hours_studied", "sleep_hours", "attendance_percent", "previous_scores",
]].copy()

In [139]:
# =======================================
# 3) Feature engineering liviano y limpio
# =======================================
# Relación horas de estudio y asistencia
X["study_attendance"] = X["hours_studied"] / (X["attendance_percent"] + 1e-5)

# Relación horas de estudio y horas de sueño
X["sleep_study"] = X["hours_studied"] / (X["sleep_hours"] + 1e-5)

# Promedio entre nota previa y asistencia
X["previous_attend"] = (X["previous_scores"] + X["attendance_percent"]) / 2

print(X)

     hours_studied  sleep_hours  attendance_percent  previous_scores  \
0              8.0          8.8                72.1               45   
1              1.3          8.6                60.7               55   
2              4.0          8.2                73.7               86   
3              3.5          4.8                95.1               66   
4              9.1          6.4                89.8               71   
..             ...          ...                 ...              ...   
195           10.5          5.4                94.0               87   
196            7.1          6.1                85.1               92   
197            1.6          6.9                63.8               76   
198           12.0          7.3                50.5               58   
199           10.2          6.3                97.4               68   

     study_attendance  sleep_study  previous_attend  
0            0.110957     0.909090            58.55  
1            0.021417     0

In [140]:
# =================================
# 4) Definición de columnas por tipo
# =================================
num_cols = ["hours_studied", "sleep_hours", "attendance_percent",
            "previous_scores", "study_attendance", "sleep_study", "previous_attend"]

In [141]:
# ========================================
# 5) Preprocesamiento con ColumnTransformer
# ========================================
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()) # denso; aquí sí con media
])

from sklearn.pipeline import Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
    ],
    remainder="drop"
)

In [142]:
# ================================
# 6) Split train/test estratificado
# ================================
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, random_state=SEED, stratify=y_binary
)

# Ajustar transformadores en train y transformar ambos
X_train = preprocessor.fit_transform(X_train_df)
X_test = preprocessor.transform(X_test_df)

X_train = X_train.astype("float32")
X_test = X_test.astype("float32")

print("Input dims:", X_train.shape[1])

Input dims: 7


In [143]:
# ================================
# 7) Definir y compilar el modelo
# ================================
def build_model(input_dim: int) -> tf.keras.Model:
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(16, activation="relu"),
        layers.Dropout(0.15),
        layers.Dense(8, activation="relu"),
        layers.Dropout(0.15),
        layers.Dense(1, activation="sigmoid")
    ])

    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy", tf.keras.metrics.AUC(name="auc")]
    )
    return model

model = build_model(X_train.shape[1])
model.summary()

In [144]:
# ================================
# 8) Callbacks (buenoas prácticas)
# ================================
cbs = [
    callbacks.EarlyStopping(monitor="val_loss", mode="min", patience=12, restore_best_weights=True),
    callbacks.ModelCheckpoint("best_model.keras", monitor="val_loss", mode="min", save_best_only=True),
    callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=6)
]

In [145]:
# ====================
# 9) Entrenamiento
# ====================
hist = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    callbacks=cbs,
    verbose=1
)

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 128ms/step - accuracy: 0.5625 - auc: 0.5142 - loss: 0.6954 - val_accuracy: 0.3750 - val_auc: 0.3482 - val_loss: 0.7099 - learning_rate: 0.0010
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.6135 - auc: 0.6069 - loss: 0.6655 - val_accuracy: 0.4062 - val_auc: 0.4643 - val_loss: 0.6926 - learning_rate: 0.0010
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.6646 - auc: 0.6309 - loss: 0.6549 - val_accuracy: 0.5312 - val_auc: 0.5536 - val_loss: 0.6773 - learning_rate: 0.0010
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.7260 - auc: 0.7036 - loss: 0.6369 - val_accuracy: 0.5938 - val_auc: 0.6562 - val_loss: 0.6626 - learning_rate: 0.0010
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.7115 - auc: 0.6484 - loss: 0.6379 - val_ac

In [146]:
# ==========================
# 10) Evaluación  en test
# ==========================

y_proba = model.predict(X_test).ravel()
y_pred = (y_proba > 0.5).astype(int)

print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred))
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred, digits=4))
print("\nAUC:", roc_auc_score(y_test, y_proba))


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step

Matriz de confusión:
 [[ 7  4]
 [ 3 26]]

Reporte de clasificación:
               precision    recall  f1-score   support

           0     0.7000    0.6364    0.6667        11
           1     0.8667    0.8966    0.8814        29

    accuracy                         0.8250        40
   macro avg     0.7833    0.7665    0.7740        40
weighted avg     0.8208    0.8250    0.8223        40


AUC: 0.9153605015673982


In [147]:
# ==========================
# 11) Función para predecir un estudiante
# ==========================
def predict_one(sample: dict) -> float:
    # Crear DataFrame de un solo registro
    s = pd.DataFrame([sample])

    # Crear las mismas features derivadas
    s["study_attendance"] = s["hours_studied"] / (s["attendance_percent"] + 1e-5)
    s["sleep_study"] = s["hours_studied"] / (s["sleep_hours"] + 1e-5)
    s["previous_attend"] = (s["previous_scores"] + s["attendance_percent"]) / 2

    # Escalar con el mismo preprocesador
    s_proc = preprocessor.transform(s).astype("float32")

    # Predecir probabilidad
    proba = model.predict(s_proc).item()
    return proba

# ==========================
# Ejemplo con la primera fila del dataset
# ==========================
sample = X.iloc[0].to_dict()
proba = predict_one(sample)
print(f"\nProbabilidad de aprobar: {proba:.4f}")
print("Aprobado" if proba >= 0.5 else "Reprobado")

# ==========================
# Predecir varias filas
# ==========================
for i in range(10):
    sample = X.iloc[i].to_dict()
    proba = predict_one(sample)
    print(f"Fila {i} -> Prob aprobar: {proba:.4f}, {'Aprobado' if proba>=0.5 else 'Reprobado'}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step

Probabilidad de aprobar: 0.5428
Aprobado
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Fila 0 -> Prob aprobar: 0.5428, Aprobado
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Fila 1 -> Prob aprobar: 0.4085, Reprobado
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Fila 2 -> Prob aprobar: 0.4935, Reprobado
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Fila 3 -> Prob aprobar: 0.5041, Aprobado
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Fila 4 -> Prob aprobar: 0.6229, Aprobado
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Fila 5 -> Prob aprobar: 0.6718, Aprobado
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Fila 6 -> Prob aprobar: 0.7658, Aprobado
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Fila 7 -> Prob aproba