In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Load data
data = pd.read_csv("data_prepped.csv")
X = data.drop("Target", axis=1)
y = data["Target"]

# Define feature types
one_hot_columns = [
    "Marital status",
    "Application mode",
    "Course",
    "Previous qualification (grade)",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

numeric_columns = [col for col in X.columns if col not in one_hot_columns]

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_columns)
    ]
)

# Create pipeline with Random Forest
rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        random_state=42,
        class_weight='balanced',
        n_jobs=-1
    ))
])

# Define hyperparameter grid for Random Forest
param_grid = {
    "classifier__n_estimators": [700, 800],
    "classifier__max_depth": [30, 50],
    "classifier__min_samples_split": [2, 5],
    "classifier__min_samples_leaf": [2, 4],
    "classifier__max_features": ['log2', 0.5]
}

param_grid_ = {
    "classifier__n_estimators": [300, 500],
    "classifier__max_depth": [20, 30],
    "classifier__max_features": ["sqrt", 0.5]
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid,
    cv=5,
    scoring='neg_log_loss',
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)


grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation log loss: {-grid_search.best_score_:.3f}")

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters: {'classifier__max_depth': 30, 'classifier__max_features': 0.5, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 800}
Best cross-validation log loss: 0.289
