In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cluster import KMeans
from MODELING.train_models import Modeling

In [2]:
# Instantiate the project-local Modeling helper and load the merged dataset
# from the CSV path (relative to the notebook's working directory).
modeling = Modeling()
df = modeling.load_dataset("../data/OULADX.csv")
# Preview the first rows. NOTE(review): in the captured output, rows 0-4 all
# belong to the same student (id 11391), so the raw frame appears to hold
# several rows per student — confirm before using it un-aggregated.
df.head()

Unnamed: 0,code_module_x,code_presentation_x,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,...,code_module_y,code_presentation_y,assessment_type,date,weight,source_y,code_module,code_presentation,sum_click_total,count_vle_days
0,AAA,2013J,11391.0,M,East Anglian Region,HE Qualification,,55<=,0.0,240.0,...,,,,,,,AAA,2013J,922.0,40.0
1,AAA,2013J,11391.0,M,East Anglian Region,HE Qualification,,55<=,0.0,240.0,...,,,,,,,AAA,2013J,922.0,40.0
2,AAA,2013J,11391.0,M,East Anglian Region,HE Qualification,,55<=,0.0,240.0,...,,,,,,,AAA,2013J,922.0,40.0
3,AAA,2013J,11391.0,M,East Anglian Region,HE Qualification,,55<=,0.0,240.0,...,,,,,,,AAA,2013J,922.0,40.0
4,AAA,2013J,11391.0,M,East Anglian Region,HE Qualification,,55<=,0.0,240.0,...,,,,,,,AAA,2013J,922.0,40.0


In [3]:
# Build aggregated per-student features
prepared = modeling.prepare_features(df)
# Show the first 100 rows of the aggregated frame.
# NOTE(review): in the captured output some students have NaN in score_mean /
# total_clicks, and total_weight is 0.0 on every displayed row — confirm this
# is expected and not an aggregation problem inside prepare_features.
prepared.head(100)

Unnamed: 0,id_student,score_mean,n_assessments,total_weight,total_clicks,studied_credits,num_of_prev_attempts,target
0,3733.0,,0,0.0,,60.0,0.0,0
1,6516.0,61.800000,5,0.0,2715.0,60.0,0.0,1
2,8462.0,87.000000,14,0.0,634.0,90.0,0.0,0
3,11391.0,82.000000,5,0.0,922.0,240.0,0.0,1
4,23629.0,82.500000,4,0.0,153.0,60.0,2.0,0
...,...,...,...,...,...,...,...,...
95,34731.0,75.666667,3,0.0,671.0,60.0,0.0,0
96,34756.0,67.333333,3,0.0,114.0,150.0,1.0,0
97,34863.0,73.909091,11,0.0,178.0,60.0,0.0,1
98,35001.0,75.000000,2,0.0,71.0,30.0,0.0,0


In [4]:
# Split the aggregated frame into train/test features (X) and labels (y).
# NOTE(review): test size, stratification and random seed are decided inside
# Modeling.split_data and are not visible here — confirm they are fixed so the
# reported metrics are reproducible.
X_train, X_test, y_train, y_test = modeling.split_data(prepared)

In [5]:
# Baseline model: train and evaluate logistic regression through the project
# helper. The classification report and confusion matrix are printed as the
# cell output.
modeling.train_logistic_regression(X_train, X_test, y_train, y_test)


=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.60      0.71      0.65      3743
           1       0.58      0.46      0.51      3255

    accuracy                           0.59      6998
   macro avg       0.59      0.58      0.58      6998
weighted avg       0.59      0.59      0.59      6998

[[2653 1090]
 [1761 1494]]


# Logistic Regression

```plaintext
Accuracy: 59%
Precision clase 1 (Pass): 0.58
Recall clase 1 (Pass): 0.46
```

#### Bajo desempeño. El modelo identifica aceptablemente la clase 0 (no aprobados, recall 0.71), pero tiene dificultad para detectar la clase 1 (aprobados, recall 0.46).

La matriz de confusión lo confirma:

```plaintext
[[2653 1090]   ← clase 0
 [1761 1494]]  ← clase 1
```
Muchos falsos negativos: 1761 de los 3255 casos realmente positivos (54%) se predicen como negativos.

In [6]:
# Train and evaluate a random forest on the same train/test split; the
# classification report and confusion matrix are printed as the cell output.
modeling.train_random_forest(X_train, X_test, y_train, y_test)


=== Random Forest ===
              precision    recall  f1-score   support

           0       0.77      0.74      0.76      3743
           1       0.72      0.74      0.73      3255

    accuracy                           0.74      6998
   macro avg       0.74      0.74      0.74      6998
weighted avg       0.74      0.74      0.74      6998

[[2784  959]
 [ 841 2414]]


# Random Forest

```plaintext
Accuracy: 74%
Precision clase 1 (Pass): 0.72
Recall clase 1 (Pass): 0.74
```

#### Buen desempeño. Mejora bastante en comparación con Logistic Regression.

La matriz de confusión:

```plaintext
[[2784  959]
 [ 841 2414]]
```
Buen equilibrio entre precisión y recall. Detecta bien a los que pasan y no pasan.

In [7]:
# Train and evaluate gradient boosting on the same train/test split; the
# classification report and confusion matrix are printed as the cell output.
modeling.train_gradient_boosting(X_train, X_test, y_train, y_test)


=== Gradient Boosting ===
              precision    recall  f1-score   support

           0       0.83      0.71      0.77      3743
           1       0.71      0.83      0.77      3255

    accuracy                           0.77      6998
   macro avg       0.77      0.77      0.77      6998
weighted avg       0.77      0.77      0.77      6998

[[2667 1076]
 [ 562 2693]]


# Gradient Boosting

```plaintext
Accuracy: 77%
Precision clase 1 (Pass): 0.71
Recall clase 1 (Pass): 0.83
```

El mejor modelo de los tres.

Alto recall para los que pasan: detecta bien a quienes realmente aprueban.

Matriz de confusión:
```plaintext
[[2667 1076]
 [ 562 2693]]
```
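Menos falsos negativos (562) que Logistic Regression (1761) y Random Forest (841). En cambio, tiene más falsos positivos (1076) que Random Forest (959) y casi los mismos que Logistic Regression (1090): la mejora en recall de la clase 1 se paga con menor recall de la clase 0 (0.71). Buen desempeño en el conjunto de prueba; la capacidad de generalización debería confirmarse con validación cruzada.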

In [8]:
# Run the project's clustering exploration on the per-student features. The
# output shows K-Means cluster counts, per-cluster descriptive statistics and
# the frame with an added `cluster` column.
# NOTE(review): the cluster counts sum to 23326 and some index values (e.g. 0)
# are absent from the displayed frame, so rows with missing values appear to be
# excluded before clustering — confirm inside Modeling.explore_clusters.
modeling.explore_clusters(prepared)


=== K-Means Clustering ===
Cluster counts:
 cluster
1    12933
0     7010
2     3383
Name: count, dtype: int64

=== Análisis descriptivo por clúster ===
        score_mean                              total_clicks          \
              mean     median        std  count         mean  median   
cluster                                                                
0        57.668183  58.666667  16.150965   7010   800.047504   550.0   
1        79.129321  80.000000   9.206367  12933  1052.138174   839.0   
2        82.400767  83.666667   8.984635   3383  4940.860774  4331.0   

                            studied_credits                           
                 std  count            mean median        std  count  
cluster                                                               
0         744.002085   7010      103.460770  120.0  46.400531   7010  
1         785.670109  12933       60.877987   60.0  23.139485  12933  
2        2182.474732   3383       81.807567   60.0  35.631

Unnamed: 0,id_student,score_mean,n_assessments,total_weight,total_clicks,studied_credits,num_of_prev_attempts,target,cluster
1,6516.0,61.800000,5,0.0,2715.0,60.0,0.0,1,1
2,8462.0,87.000000,14,0.0,634.0,90.0,0.0,0,1
3,11391.0,82.000000,5,0.0,922.0,240.0,0.0,1,0
4,23629.0,82.500000,4,0.0,153.0,60.0,2.0,0,1
6,23698.0,74.444444,9,0.0,886.0,120.0,0.0,1,0
...,...,...,...,...,...,...,...,...,...
28775,2698251.0,58.142857,7,0.0,1500.0,60.0,0.0,0,0
28776,2698257.0,67.800000,5,0.0,723.0,120.0,0.0,1,0
28777,2698535.0,39.250000,16,0.0,3420.0,60.0,0.0,0,0
28778,2698577.0,64.400000,5,0.0,695.0,60.0,0.0,0,1


| Cluster | score\_mean        | total\_clicks    | studied\_credits | Perfil                                                                                   |
| ------- | ------------------ | ---------------- | ---------------- | ---------------------------------------------------------------------------------------- |
| 0       | Bajo (≈57.7)       | Bajo (≈800)      | Alto (≈103.5)    | Posible riesgo de abandono o bajo compromiso, pese a la mayor carga de créditos          |
| 1       | Medio-Alto (≈79.1) | Medio (≈1052)    | Bajo (≈60.9)     | Estudiantes con buen desempeño, participación moderada y carga de créditos baja          |
| 2       | Alto (≈82.4)       | Muy alto (≈4941) | Medio (≈81.8)    | Estudiantes exitosos y muy comprometidos                                                 |