In [16]:
# Importar las librerías necesarias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler


In [17]:
# Read the cleaned, semicolon-delimited dataset and preview its first rows
data = pd.read_csv('data.csv', delimiter=';')
data.head(n=5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


Convertimos la columna Target a binaria: Dropout = 1 y todo lo demás (Enrolled y Graduate) = 0. Después se divide el dataset en dos conjuntos, uno de entrenamiento (80 %) y otro de prueba (20 %).

In [18]:
# Binarize the target: 'Dropout' -> 1, every other label -> 0.
# A vectorized comparison replaces the row-wise apply/lambda; the explicit
# int64 cast keeps the column's dtype the same as the apply-based version.
data['Target_Binary'] = (data['Target'] == 'Dropout').astype('int64')

# Features are every column except the original and the binarized target
X = data.drop(columns=['Target', 'Target_Binary'])
y = data['Target_Binary']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

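Estandarizamos las características para poder aplicar el algoritmo de regresión logística y asegurar que el modelo converja correctamente. El escalador se ajusta solo con el conjunto de entrenamiento y después se aplica al de prueba, para no filtrar información de este último.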

In [None]:
# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted transformation is applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Se entrena el modelo de regresión logística con los datos de entrenamiento ya escalados, y se define 'max_iter' (número máximo de iteraciones) para garantizar la convergencia.

In [None]:
# Fit a logistic regression on the scaled training data; fit() returns the
# estimator itself, so construction and training are chained.
model = LogisticRegression(random_state=42, max_iter=5000).fit(X_train_scaled, y_train)

# Predicted labels for the scaled test set
y_pred = model.predict(X_test_scaled)

Métricas de evaluación:
exactitud (accuracy), F1 score, recall y la matriz de confusión para medir el rendimiento del modelo.

In [12]:
# Score the test-set predictions: the three scalar metrics are computed in
# the same order as before, followed by the confusion matrix.
accuracy, f1, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, f1_score, recall_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)

In [13]:
# Report the metrics, one item per line
print(
    f"Accuracy: {accuracy}",
    f"F1 Score: {f1}",
    f"Recall: {recall}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)


Accuracy: 0.8666666666666667
F1 Score: 0.794425087108014
Recall: 0.7215189873417721
Confusion Matrix:
[[539  30]
 [ 88 228]]


INFORME DETALLADO CON MÉTRICAS ADICIONALES PARA EVALUAR EL MODELO:

In [20]:
# Text report of per-class precision, recall, F1 and support on the test set
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90       569
           1       0.88      0.72      0.79       316

    accuracy                           0.87       885
   macro avg       0.87      0.83      0.85       885
weighted avg       0.87      0.87      0.86       885



Ahora se analizan los coeficientes del modelo para identificar qué variables tienen mayor impacto en la probabilidad de abandono.

In [15]:
# Pair each feature name with its fitted coefficient, then order the table
# from the most positive coefficient to the most negative one.
coefficients = pd.DataFrame(
    {'Feature': X.columns, 'Coefficient': model.coef_[0]}
)
coefficients = coefficients.sort_values('Coefficient', ascending=False)

print("Coeficientes del modelo:", coefficients, sep="\n")

Coeficientes del modelo:
                                           Feature  Coefficient
28             Curricular units 2nd sem (enrolled)     1.111815
27             Curricular units 2nd sem (credited)     0.468141
21             Curricular units 1st sem (credited)     0.407824
19                               Age at enrollment     0.376000
25                Curricular units 1st sem (grade)     0.293355
3                                           Course     0.253040
7                                      Nacionality     0.233738
33                               Unemployment rate     0.221303
13                                       Displaced     0.171382
15                                          Debtor     0.160049
8                           Mother's qualification     0.157016
17                                          Gender     0.103468
2                                Application order     0.084500
11                             Father's occupation     0.063543
34             