In [35]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd


# Load the prepared dataset and separate the label column from the predictors.
data = pd.read_csv("data_prepped_r.csv")
y = data["Target"]               # label column
X = data.drop(columns="Target")  # every remaining column is a feature

# Columns sent through the one-hot encoder; every other column of X is sent
# through the scaler. Names must match the CSV header exactly (including the
# "Nacionality" spelling used here).
# NOTE(review): "Previous qualification (grade)" reads like a continuous score
# rather than a category -- confirm against the data that it belongs in this
# list and not among the scaled columns.
one_hot_columns = [
    "Marital status",
    "Application mode",
    "Course",
    "Previous qualification (grade)",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

# Everything not listed above, in the original column order of X.
numeric_columns = X.columns[~X.columns.isin(one_hot_columns)].tolist()

# Two-branch preprocessing: scale the "numeric" columns, one-hot encode the
# listed ones. handle_unknown="ignore" keeps transform from raising on
# categories that were absent from the data the encoder was fitted on.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_columns),
        (
            "cat",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            one_hot_columns,
        ),
    ]
)


# Modelling pipeline: the column preprocessing followed by a histogram-based
# gradient-boosting classifier. The step names ("preprocessor", "classifier")
# are the prefixes used by the hyper-parameter grid keys.
hgb_pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "classifier",
            HistGradientBoostingClassifier(
                class_weight="balanced",
                random_state=42,
                # Early-stopping configuration: a 0.2 validation fraction,
                # scored with negative log loss, with a patience of 15
                # iterations (n_iter_no_change).
                early_stopping=True,
                validation_fraction=0.2,
                scoring="neg_log_loss",
                n_iter_no_change=15,
            ),
        ),
    ]
)

# Coarser grid, kept for reference; it is not the grid handed to GridSearchCV
# in this cell.
param_grid_ = dict(
    classifier__learning_rate=[0.05, 0.1],
    classifier__max_iter=[500, 1000],
    classifier__max_leaf_nodes=[31, 63],
    classifier__l2_regularization=[0.01, 0.20],
    classifier__max_features=[0.1, 0.8],
)

# Finer grid used for the search (3 * 3 * 2 * 2 * 3 = 108 combinations).
# Keys are "<pipeline step>__<estimator parameter>".
# NOTE(review): the classifier in this cell has early stopping enabled, so the
# three max_iter values may yield identical fits if training always stops
# before 800 iterations -- confirm before paying for the extra candidates.
param_grid = dict(
    # Step sizes centred on 0.1; the coarse grid's 0.05 is not retried here.
    classifier__learning_rate=[0.075, 0.1, 0.125],
    # Upper bounds on the number of boosting iterations.
    classifier__max_iter=[800, 1000, 1200],
    # More leaves per tree than the coarse grid's smallest setting (31).
    classifier__max_leaf_nodes=[63, 127],
    # At or below the coarse grid's lightest penalty (0.01).
    classifier__l2_regularization=[0.005, 0.01],
    # Mid-range values between the coarse grid's extremes (0.1 and 0.8).
    classifier__max_features=[0.3, 0.5, 0.7],
)



# Exhaustive search over param_grid using 5-fold cross-validation, ranking
# candidates by negative log loss (higher is better). Training-fold scores
# are recorded as well (return_train_score=True).
grid_search = GridSearchCV(
    hgb_pipe,
    param_grid,
    scoring="neg_log_loss",
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y)

# best_score_ holds the negated log loss, so the sign is flipped for display.
print(
    f"Best parameters: {grid_search.best_params_}",
    f"Best cross-validation log loss: {-grid_search.best_score_:.3f}",
    sep="\n",
)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'classifier__l2_regularization': 0.01, 'classifier__learning_rate': 0.075, 'classifier__max_features': 0.3, 'classifier__max_iter': 800, 'classifier__max_leaf_nodes': 127}
Best cross-validation log loss: 0.237
