# Set up


In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("../data/engineered_features.csv")

# Choose numeric features
numeric_cols = [
    'Application order', 'Admission grade', 'Pre Qual (grade)',
    '1st - enrolled', '1st - evaluations', '1st - approved',
    '2nd - enrolled', '2nd - evaluations', '2nd - approved',
    '1st_avg_grade', '1st_approval_rate',
    '2nd_avg_grade', '2nd_approval_rate',
    'delta_approval_rate', 'delta_avg_grade',
]

# Choose categorical features
categorical_cols = [
    'Marial Status', 'Application mode', 'Course', 'Pre Qual',
    'Nationality', 'Daytime/Evening', 'Scholarship', 'Tuition fees up to date',
    'Displaced', 'Gender', 'International', 'age_group',
    "Mom's Qual", "Mom's Occupation",
    "Dad's Qual", "Dad's Occupation",
]

# Map labels
label_map = {"Graduate": 0, "Enrolled": 1, "Dropout": 2}
encoded_target = df['Target'].map(label_map)
# Series.map yields NaN for any value that is not a key of label_map; stop here
# with a clear message instead of passing NaN targets on to the split and models.
if encoded_target.isna().any():
    unmapped = sorted(df.loc[encoded_target.isna(), 'Target'].astype(str).unique())
    raise ValueError(f"Target contains values not covered by label_map: {unmapped}")
df['Target'] = encoded_target

# Model inputs: numeric columns first, then categorical columns
X = df[numeric_cols + categorical_cols]
y = df['Target']

# Train-test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on y; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Create processor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler

# Numeric columns go through RobustScaler and categorical columns through a dense
# one-hot encoder that ignores categories unseen at fit time; any column not
# listed in either group is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ],
    remainder='drop'
)

# Define Grid and Pipelines


In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, make_scorer

# Refined hyperparameter search spaces. Keys are plain estimator parameter
# names; the 'classifier__' pipeline prefix is added where each search is built.
rf_param_grid = dict(
    n_estimators=[300, 400, 500],
    max_depth=[20, 25, 30],
    min_samples_split=[5, 10, 15],
    min_samples_leaf=[2, 4, 6],
    max_features=['sqrt', 'log2'],
    class_weight=['balanced_subsample'],
    criterion=['gini', 'entropy'],
)

gb_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1, 0.15],
    max_depth=[3, 5, 7, 9],
    min_samples_split=[10, 20, 30],
    min_samples_leaf=[5, 10, 15],
    subsample=[0.8, 0.9, 1.0],
)

from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight

# Create pipelines outside of search
def create_rf_pipeline():
    """Build an unfitted preprocessing + RandomForest pipeline.

    The notebook-level ``preprocessor`` is cloned, so every pipeline returned
    here owns its own unfitted transformer. Fitting one pipeline directly can
    therefore never change the preprocessor held by another pipeline.

    Returns:
        Pipeline with the steps ``'preprocessor'`` and ``'classifier'``.
    """
    from sklearn.base import clone  # local import keeps this definition self-contained

    return Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))
    ])

def create_gb_pipeline():
    """Build an unfitted preprocessing + GradientBoosting pipeline.

    The notebook-level ``preprocessor`` is cloned, so every pipeline returned
    here owns its own unfitted transformer. Fitting one pipeline directly can
    therefore never change the preprocessor held by another pipeline.

    Returns:
        Pipeline with the steps ``'preprocessor'`` and ``'classifier'``.
    """
    from sklearn.base import clone  # local import keeps this definition self-contained

    return Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', GradientBoostingClassifier(random_state=42))
    ])
    
# Macro-averaged F1 scorer, reported by the searches next to accuracy
f1_scorer = make_scorer(f1_score, average='macro')

# Shuffled, seeded 5-fold stratified splitter used by both searches
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One 'balanced' weight per training row, derived from y_train
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# RandomForest Hyperparameter Search


In [None]:
# Prefix every grid key so it addresses the pipeline's 'classifier' step
rf_param_distributions = {
    f'classifier__{param}': choices for param, choices in rf_param_grid.items()
}
rf_scoring = {'f1_macro': f1_scorer, 'accuracy': 'accuracy'}

rf_search = RandomizedSearchCV(
    estimator=create_rf_pipeline(),
    param_distributions=rf_param_distributions,
    n_iter=50,
    scoring=rf_scoring,
    refit='accuracy',  # the best candidate is chosen and refit by accuracy
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0,
    return_train_score=True,
)
rf_search.fit(X_train, y_train)

print(f"Best RF CV Score: {rf_search.best_score_:.4f}")
print(f"Best RF Params: {rf_search.best_params_}")

=== RandomForest Hyperparameter Tuning ===
Best RF CV Score: 0.7725
Best RF Params: {'classifier__n_estimators': 400, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 25, 'classifier__criterion': 'gini', 'classifier__class_weight': 'balanced_subsample'}


# GradientBoosting Hyperparameter Search


In [5]:
# Prefix every grid key so it addresses the pipeline's 'classifier' step
gb_param_distributions = {
    f'classifier__{param}': choices for param, choices in gb_param_grid.items()
}
gb_scoring = {'f1_macro': f1_scorer, 'accuracy': 'accuracy'}

gb_search = RandomizedSearchCV(
    estimator=create_gb_pipeline(),
    param_distributions=gb_param_distributions,
    n_iter=50,
    scoring=gb_scoring,
    refit='accuracy',  # the best candidate is chosen and refit by accuracy
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0,
    return_train_score=True,
)
# The balanced per-row weights are forwarded to the 'classifier' step's fit
gb_search.fit(X_train, y_train, classifier__sample_weight=sample_weights)

print(f"Best GB CV Score: {gb_search.best_score_:.4f}")
print(f"Best GB Params: {gb_search.best_params_}")

Best GB CV Score: 0.7711
Best GB Params: {'classifier__subsample': 0.9, 'classifier__n_estimators': 200, 'classifier__min_samples_split': 20, 'classifier__min_samples_leaf': 5, 'classifier__max_depth': 9, 'classifier__learning_rate': 0.1}


# Evaluate on test set


In [6]:
def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.
        model_name: Label stored under the ``'Model'`` key of the result.

    Returns:
        dict with the keys ``'Model'``, ``'Accuracy'``, ``'F1_Macro'``,
        ``'F1_Weighted'``, ``'Precision_Macro'``, ``'Recall_Macro'`` and
        ``'ROC_AUC'`` (one-vs-rest, macro-averaged).
    """
    predicted_labels = model.predict(X_test)
    class_probabilities = model.predict_proba(X_test)

    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, predicted_labels),
        'F1_Macro': f1_score(y_test, predicted_labels, average='macro'),
        'F1_Weighted': f1_score(y_test, predicted_labels, average='weighted'),
        'Precision_Macro': precision_score(
            y_test, predicted_labels, average='macro', zero_division=0
        ),
        'Recall_Macro': recall_score(
            y_test, predicted_labels, average='macro', zero_division=0
        ),
        'ROC_AUC': roc_auc_score(
            y_test, class_probabilities, multi_class='ovr', average='macro'
        ),
    }

# Score the refit best estimator of each search on the held-out test split
rf_results = evaluate_model(
    model=rf_search.best_estimator_,
    X_test=X_test,
    y_test=y_test,
    model_name='RandomForest',
)
gb_results = evaluate_model(
    model=gb_search.best_estimator_,
    X_test=X_test,
    y_test=y_test,
    model_name='GradientBoosting',
)

In [10]:
# Stack the per-model metric dicts into one table indexed by model name
metric_rows = [rf_results, gb_results]
comparison_df = pd.DataFrame(metric_rows)
comparison_df = comparison_df.set_index('Model')

print("\n=== Final Model Comparison ===")
print(comparison_df)

# Compare each search's best CV score (accuracy, the refit metric) with test accuracy
print("\n=== Overfitting Analysis ===")
searches_by_name = {'RandomForest': rf_search, 'GradientBoosting': gb_search}
for name, search in searches_by_name.items():
    cv_score = search.best_score_
    test_score = comparison_df.loc[name, 'Accuracy']
    diff = cv_score - test_score
    # A CV score more than 0.05 above the test score is flagged
    verdict = '(HIGH OVERFITTING!)' if diff > 0.05 else '(acceptable)'
    report_lines = [
        f"{name}:",
        f"  CV Score: {cv_score:.4f}",
        f"  Test Score: {test_score:.4f}",
        f"  Difference: {diff:.4f} {verdict}",
    ]
    print("\n".join(report_lines))

# Feature importance analysis (RandomForest best estimator)
best_model = rf_search.best_estimator_
fitted_encoder = best_model.named_steps['preprocessor'].named_transformers_['cat']
encoded_names = fitted_encoder.get_feature_names_out(categorical_cols)
# Names follow the ColumnTransformer layout: numeric columns, then one-hot columns
feature_names = numeric_cols + list(encoded_names)

importances = best_model.named_steps['classifier'].feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
)
# Keep only the 20 largest importances, highest first
feature_importance_df = feature_importance_df.sort_values(
    'Importance', ascending=False
).head(20)

print("\n=== Top 20 Feature Importances ===")
feature_importance_df


=== Final Model Comparison ===
                  Accuracy  F1_Macro  F1_Weighted  Precision_Macro  \
Model                                                                
RandomForest      0.755932   0.70915     0.760996         0.710517   
GradientBoosting  0.778531   0.72830     0.778108         0.731979   

                  Recall_Macro   ROC_AUC  
Model                                     
RandomForest          0.711510  0.886413  
GradientBoosting      0.725586  0.891982  

=== Overfitting Analysis ===
RandomForest:
  CV Score: 0.7725
  Test Score: 0.7559
  Difference: 0.0166 (acceptable)
GradientBoosting:
  CV Score: 0.7711
  Test Score: 0.7785
  Difference: -0.0074 (acceptable)

=== Top 20 Feature Importances ===


Unnamed: 0,Feature,Importance
12,2nd_approval_rate,0.103534
8,2nd - approved,0.08048
11,2nd_avg_grade,0.067708
10,1st_approval_rate,0.066907
5,1st - approved,0.055955
9,1st_avg_grade,0.055553
7,2nd - evaluations,0.034803
14,delta_avg_grade,0.034007
13,delta_approval_rate,0.03249
1,Admission grade,0.031257


# Ensemble Learning


In [8]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the best RF and GB pipelines found by the searches.
# NOTE(review): this fit call passes no sample weights, whereas gb_search.fit was
# given classifier__sample_weight; confirm the GB member is meant to be trained
# unweighted inside the ensemble.
ensemble_members = [
    ('rf', rf_search.best_estimator_),
    ('gb', gb_search.best_estimator_),
]
ensemble = VotingClassifier(estimators=ensemble_members, voting='soft')
ensemble.fit(X_train, y_train)

0,1,2
,estimators,"[('rf', ...), ('gb', ...)]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,400
,criterion,'gini'
,max_depth,25
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,200
,subsample,0.9
,criterion,'friedman_mse'
,min_samples_split,20
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_depth,9
,min_impurity_decrease,0.0


In [9]:
ensemble_results = evaluate_model(
    model=ensemble, X_test=X_test, y_test=y_test, model_name='Ensemble'
)

print("\n=== Ensemble Model Performance ===")
for key, value in ensemble_results.items():
    # 'Model' holds the label string, not a metric, so it is not printed
    if key == 'Model':
        continue
    print(f"{key}: {value:.4f}")


=== Ensemble Model Performance ===
Accuracy: 0.7831
F1_Macro: 0.7300
F1_Weighted: 0.7799
Precision_Macro: 0.7372
Recall_Macro: 0.7245
ROC_AUC: 0.8906
