# In [None]:
## Paso 1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



## Step 2
# Build a made-up dataset with a typical tabular structure:
# three integer features, one categorical feature and a binary target.
np.random.seed(42)  # fixed seed so the generated data is reproducible

n = 500
df = pd.DataFrame({
    "edad": np.random.randint(18, 60, size=n),           # upper bound is exclusive
    "horas_estudio": np.random.randint(0, 20, size=n),
    "nota_parcial": np.random.randint(0, 100, size=n),
    "actividad_extra": np.random.choice(["Sí", "No"], size=n),
    "abandona": np.random.choice([0, 1], size=n, p=[0.7, 0.3]),  # fictitious target
})

# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview is visible wherever this runs.
print(df.head())



## Step 3
# Quick exploratory look at the data: summary statistics, target balance
# and two plots.
# Bare expressions are discarded when this runs as a script (and in a
# notebook unless they end the cell), so the tables are printed explicitly.
print(df.describe())
print(df["abandona"].value_counts(normalize=True))

# Histogram of the age column with a kernel density estimate overlaid.
sns.histplot(df["edad"], kde=True)
plt.title("Distribución de edades")
plt.show()

# Row counts per extracurricular-activity value, split by the target.
sns.countplot(x="actividad_extra", hue="abandona", data=df)
plt.title("Abandono según actividad extra")
plt.show()



## Step 4
# Build the feature matrix and target, split them, then scale the features.

# `DataFrame.drop` returns a new frame instead of modifying `df`, so the
# result must be bound to X (previously X was never defined).
X = df.drop(columns="abandona")
y = df["abandona"]

# One-hot encode the categorical columns. Cast everything to float so the
# columns already have the dtype that the scaler produces.
X = pd.get_dummies(X, drop_first=True).astype(float)

# Split BEFORE scaling so the held-out rows cannot influence the scaler's
# mean/variance. Stratify to keep the target proportions in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and reuse that same transform on
# the test rows. Rebuilding the frames keeps column names and row indices.
# NOTE: X itself is left encoded but unscaled; only the splits are scaled.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)



## Step 5
# Train both classifiers on the training split. `fit` returns the estimator
# itself, so construction and fitting are chained.

# Logistic regression with the library's default settings.
lr = LogisticRegression().fit(X_train, y_train)

# Random forest with 100 trees and a fixed seed.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)



## Step 6
# Evaluate every fitted model on the test split and print its accuracy,
# confusion matrix and per-class classification report.
models = {"Logistic Regression": lr, "Random Forest": rf}

for name, model in models.items():
    print(f"\n{name}")
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", matrix)

    report = classification_report(y_test, y_pred)
    print("Report:\n", report)