In [67]:
# Set number of cpu kernels
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "4"

In [68]:
import pandas as pd

# load data
data = pd.read_csv("data.csv", delimiter=";")
X = data.drop("Target", axis=1)
y = data["Target"]

In [69]:
# Feature Engineering function
def engineer_features(X):
    X = X.copy()
    
    # Academic performance features
    X['academic_progress_rate'] = (
        X['Curricular units 1st sem (approved)'] + 
        X['Curricular units 2nd sem (approved)']
    ) / (
        X['Curricular units 1st sem (enrolled)'] + 
        X['Curricular units 2nd sem (enrolled)'] + 1e-6
    )
    
    X['overall_grade_avg'] = (
        X['Curricular units 1st sem (grade)'] + 
        X['Curricular units 2nd sem (grade)']
    ) / 2
    
    X['failure_rate'] = (
        (X['Curricular units 1st sem (enrolled)'] - 
         X['Curricular units 1st sem (approved)']) / 
        (X['Curricular units 1st sem (enrolled)'] + 1e-6)
    )
    
    # Financial/demographic features
    X['financial_risk'] = (X['Tuition fees up to date'] == 0) & (X['Scholarship holder'] == 0)
    X['age_course_interaction'] = X['Age at enrollment'] * X['Course']
    X['parent_education_max'] = X[["Mother's qualification", "Father's qualification"]].max(axis=1)
    
    # Time-based features
    X['grade_consistency'] = abs(
        X['Curricular units 1st sem (grade)'] - 
        X['Curricular units 2nd sem (grade)']
    )
    
    X['early_warning'] = (
        X['Curricular units 1st sem (approved)'] / 
        (X['Curricular units 1st sem (enrolled)'] + 1e-6)
    ) < 0.5
    
    return X

In [70]:
# Apply feature engineering
X_engineered = engineer_features(X)

In [71]:
# Data preprocessing
one_hot_columns = [
    "Marital status",
    "Application mode",
    "Course",
    "Previous qualification (grade)",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]
for col in one_hot_columns:
    data[col].convert_dtypes("category")

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_engineered, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
#from imblearn.pipeline import Pipeline
#from imblearn.over_sampling import SMOTE
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif

# Select the k best features based on ANOVA F-value
feature_selector = SelectKBest(score_func=f_classif, k=10)
# Pipeline
pipeline = Pipeline(
    [
#        ("smote", SMOTE(random_state=42)),
        ("feature_selection", feature_selector),
        ("classifier", HistGradientBoostingClassifier(
            early_stopping=True,  # Enable validation-based stopping
            scoring='f1_macro',   # Align with CV metric
            validation_fraction=0.1,
            n_iter_no_change=10,  # Stop if no improvement in 10 iterations
            random_state=42,
            class_weight="balanced",
            categorical_features="from_dtype"
            )
        )
    ]
)

In [74]:
from scipy.stats import uniform, loguniform, randint
# Parameter distribution Randomized search
param_dist = {
    "feature_selection__k": [5, 10, 15, 20],
    
    "classifier__learning_rate": loguniform(0.01, 0.2),
    "classifier__max_iter": randint(200, 800),
    "classifier__max_leaf_nodes": randint(15, 128),
    "classifier__max_depth": [None, 5, 10, 20],
    "classifier__min_samples_leaf": randint(10, 50),
    "classifier__l2_regularization": loguniform(1e-3, 1),
    "classifier__max_bins": [128, 255],
    "classifier__max_features": uniform(0.6, 0.4)
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Construct randomized search
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

random_search.fit(X_train, y_train)
best_params = random_search.best_params_

print(f"\n Best parameters found: {best_params}")
print(f"\n Best score found: {random_search.best_score_}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits

 Best parameters found: {'classifier__l2_regularization': np.float64(0.016267099368894124), 'classifier__learning_rate': np.float64(0.13702370114575005), 'classifier__max_bins': 128, 'classifier__max_depth': 5, 'classifier__max_features': np.float64(0.771564497559442), 'classifier__max_iter': 284, 'classifier__max_leaf_nodes': 62, 'classifier__min_samples_leaf': 42, 'feature_selection__k': 20}

 Best score found: 0.707919499135836


In [76]:
from sklearn.metrics import confusion_matrix

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

print(confusion_matrix(y_test, y_pred))

[[207  66  31]
 [ 20  98  40]
 [ 12  74 337]]


In [77]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Dropout       0.87      0.68      0.76       304
    Enrolled       0.41      0.62      0.49       158
    Graduate       0.83      0.80      0.81       423

    accuracy                           0.73       885
   macro avg       0.70      0.70      0.69       885
weighted avg       0.77      0.73      0.74       885

