In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
import pandas as pd
import xgboost as xgb


data = pd.read_csv("data_prepped_r.csv")
X = data.drop("Target", axis=1)
y = data["Target"]

from sklearn.preprocessing import LabelEncoder

# Encode target labels
label_encoder = LabelEncoder()

y = label_encoder.fit_transform(y)

# Define feature types (verify with your dataset)
one_hot_columns = [
    "Marital status",
    "Application mode",
    "Course",
    "Previous qualification (grade)",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

numeric_columns = [col for col in X.columns if col not in one_hot_columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_columns)
    ])

# Create pipeline
xgb_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False,
        tree_method="hist",
        enable_categorical=False,
        random_state=42,
        scale_pos_weight=1  # Use this if your classes are imbalanced
        )
    )
])

param_grid = {
    "classifier__learning_rate": [0.03, 0.035],       # Finer around 0.03
    "classifier__n_estimators": [400, 500],             # Around 400
    "classifier__max_depth": [6, 7],                      # Around 7
    "classifier__reg_lambda": [0.005, 0.01],           # Around 0.01
    "classifier__colsample_bytree": [0.2, 0.3],         # Around 0.3
    # 3 * 3 * 3 * 3 * 3 = 243 combinations
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=param_grid,
    cv=5,
    scoring='neg_log_loss',
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation log loss: {-grid_search.best_score_:.3f}")


Fitting 5 folds for each of 32 candidates, totalling 160 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best parameters: {'classifier__colsample_bytree': 0.2, 'classifier__learning_rate': 0.03, 'classifier__max_depth': 6, 'classifier__n_estimators': 500, 'classifier__reg_lambda': 0.01}
Best cross-validation log loss: 0.229
