In [3]:
import pandas as pd

In [4]:
data = pd.read_csv("data.csv", delimiter=";")
X = data.drop("Target", axis=1)
y = data["Target"]

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [6]:
# Class imbalance is mild. May need to adjust for later
data["Target"].value_counts(True)

Target
Graduate    0.499322
Dropout     0.321203
Enrolled    0.179476
Name: proportion, dtype: float64

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

one_hot_columns = [
    "Marital status",
    "Application mode",
    "Course",
    "Previous qualification (grade)",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

numeric_columns = [col for col in X_train.columns if col not in one_hot_columns]

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", encoder, one_hot_columns),
        ("num", StandardScaler(), numeric_columns)
    ]
)

In [18]:
from lightgbm import LGBMClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("classifier", LGBMClassifier(
            random_state=42,
            class_weight='balanced',  # Handles class imbalance
            n_estimators=100,
            verbosity=-1  # Silences warnings
            )
        )
    ]
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, loguniform, randint

param_dist = {
    "classifier__learning_rate": loguniform(0.01, 0.2),
    "classifier__n_estimators": randint(200, 800),
    "classifier__num_leaves": randint(15, 128),
    "classifier__max_depth": [None, 5, 10, 20],
    "classifier__min_child_samples": randint(10, 50),
    "classifier__reg_alpha": loguniform(1e-3, 1),
    "classifier__reg_lambda": loguniform(1e-3, 1),
    "classifier__feature_fraction": uniform(0.6, 0.4),
    "classifier__subsample": uniform(0.6, 1.0),
    "classifier__subsample_freq": randint(0, 10),
    "classifier__bagging_fraction": uniform(0.5, 0.5)  # samples between 0.5 and 1.0
}


random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,
    scoring='f1_macro',
    cv=2,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

In [21]:
random_search.fit(X_train, y_train)
print(random_search.best_params_)
print(random_search.best_score_)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /home/conda/feedstock_root/build_artifacts/liblightgbm_1742951514774/work/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /home/conda/feedstock_root/build_artifacts/liblightgbm_1742951514774/work/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /home/conda/feedstock_root/build_artifacts/liblightgbm_1742951514774/work/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /home/conda/feedstock_root/build_artifacts/liblightgbm_1742951514774/work/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /home/conda/feedstock_root/build_artifacts/liblightgbm_1742951514774/work/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /home/conda/feedstock_root/build_artifacts/liblightgbm_17429515147

{'classifier__feature_fraction': np.float64(0.7988994023569542), 'classifier__learning_rate': np.float64(0.02462927943506523), 'classifier__max_depth': None, 'classifier__min_child_samples': 26, 'classifier__n_estimators': 312, 'classifier__num_leaves': 16, 'classifier__reg_alpha': np.float64(0.03221343740912342), 'classifier__reg_lambda': np.float64(0.0014270403521460836), 'classifier__subsample': np.float64(0.8786464642366114), 'classifier__subsample_freq': 0}
0.7097065893287613


In [22]:
from sklearn.metrics import confusion_matrix

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

print(confusion_matrix(y_test, y_pred))

[[237  37  42]
 [ 32  63  56]
 [ 10  25 383]]




In [10]:
# Test set evaluation
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Dropout       0.85      0.73      0.79       316
    Enrolled       0.49      0.44      0.47       151
    Graduate       0.80      0.90      0.85       418

    accuracy                           0.76       885
   macro avg       0.71      0.69      0.70       885
weighted avg       0.76      0.76      0.76       885



In [9]:
from sklearn.model_selection import GridSearchCV

best_params = random_search.best_params_

# Define narrowed ranges around best values
param_grid = {
    "classifier__learning_rate": [best_params["classifier__learning_rate"] * 0.5, 
                                  best_params["classifier__learning_rate"], 
                                  best_params["classifier__learning_rate"] * 1.5],
    "classifier__max_iter": [best_params["classifier__max_iter"] - 100,
                             best_params["classifier__max_iter"],
                             best_params["classifier__max_iter"] + 100],
    "classifier__max_leaf_nodes": [best_params["classifier__max_leaf_nodes"] - 10,
                                   best_params["classifier__max_leaf_nodes"],
                                   best_params["classifier__max_leaf_nodes"] + 10],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1
)

In [10]:
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
{'classifier__learning_rate': np.float64(0.08015600033571255), 'classifier__max_iter': 673, 'classifier__max_leaf_nodes': 71}
0.7128234623498366


NameError: name 'grid_search' is not defined

In [9]:
# Test set evaluation
from sklearn.metrics import classification_report
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Dropout       0.85      0.73      0.79       316
    Enrolled       0.49      0.44      0.47       151
    Graduate       0.80      0.90      0.85       418

    accuracy                           0.76       885
   macro avg       0.71      0.69      0.70       885
weighted avg       0.76      0.76      0.76       885



In [12]:
# Test set evaluation
best_model_ = grid_search.best_estimator_
y_pred_ = best_model_.predict(X_test)
print(classification_report(y_test, y_pred_))

              precision    recall  f1-score   support

     Dropout       0.84      0.72      0.78       316
    Enrolled       0.46      0.51      0.48       151
    Graduate       0.81      0.87      0.84       418

    accuracy                           0.76       885
   macro avg       0.71      0.70      0.70       885
weighted avg       0.76      0.76      0.76       885

