This modular Python code:

- Handles imbalanced classes using StratifiedKFold and f1_score.
- Compares RandomForestClassifier and XGBClassifier.
- Iteratively drops one feature at a time (feature ablation) and tracks how the macro F1-score changes relative to the all-features baseline.

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from collections import defaultdict
from tqdm import tqdm

In [None]:
# Load the processed dataset (feature columns plus the 'status_group' target).
ens_df = pd.read_csv("../data/processed/imputed_latlon_fI_pumpage_pop_gps.csv")

# Build the feature matrix and target from the frame that was just loaded.
# 'id' is dropped together with the target so it is not used as a predictor.
X = ens_df.drop(['status_group', 'id'], axis=1)
y = ens_df['status_group']

print(ens_df.shape)
print(ens_df.columns)

In [None]:
def evaluate_models(X, y, n_splits=5, random_state=42):
    """Evaluate Random Forest and XGBoost using StratifiedKFold and F1-macro.

    Both models are fitted on every training fold and scored on the matching
    validation fold; the per-fold macro F1 scores are then averaged.

    Args:
        X: Feature table. Must support positional row selection via ``.iloc``
            (e.g. a pandas DataFrame).
        y: Class labels, one per row of ``X``. Any label values are accepted
            (strings, non-contiguous integers, ...); they are mapped to
            integer codes internally.
        n_splits: Number of stratified folds.
        random_state: Seed used for the fold shuffling and for both models.

    Returns:
        Tuple ``(rf_mean_f1, xgb_mean_f1)`` with the mean macro F1 across
        folds for the Random Forest and the XGBoost model respectively.
    """
    # Encode labels as integer codes 0..n_classes-1 once, up front. Recent
    # XGBoost releases reject any other label values in XGBClassifier.fit,
    # and macro F1 does not depend on how the classes are named, so the
    # reported scores are unaffected by the encoding.
    labels = np.asarray(y).ravel()
    _, y_codes = np.unique(labels, return_inverse=True)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    rf_scores, xgb_scores = [], []

    for train_idx, val_idx in skf.split(X, y_codes):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y_codes[train_idx], y_codes[val_idx]

        # Fresh estimators per fold so nothing learned on one fold carries
        # over into the next.
        rf = RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1)
        xgb = XGBClassifier(n_estimators=100, random_state=random_state, n_jobs=-1, use_label_encoder=False, eval_metric='mlogloss')

        rf.fit(X_train, y_train)
        xgb.fit(X_train, y_train)

        rf_preds = rf.predict(X_val)
        xgb_preds = xgb.predict(X_val)

        # Macro averaging weights every class equally, regardless of how
        # many samples it has.
        rf_scores.append(f1_score(y_val, rf_preds, average='macro'))
        xgb_scores.append(f1_score(y_val, xgb_preds, average='macro'))

    return np.mean(rf_scores), np.mean(xgb_scores)


def feature_impact_analysis(X, y, n_splits=5, random_state=42):
    """Run feature ablation: remove each feature one at a time and evaluate models.

    A baseline score is computed with all columns of ``X``. Each column is
    then dropped in turn, both models are re-evaluated with
    ``evaluate_models``, and the change against the baseline is recorded.

    Args:
        X: Feature table (pandas DataFrame); every column is ablated once.
        y: Class labels, one per row of ``X``.
        n_splits: Number of stratified folds, forwarded to ``evaluate_models``.
        random_state: Seed forwarded to ``evaluate_models`` so the baseline
            and every ablation run use the same folds and model seeds.

    Returns:
        DataFrame with one row per dropped feature and the columns
        ``feature``, ``rf_f1``, ``xgb_f1``, ``rf_diff`` and ``xgb_diff``.
        The ``*_diff`` columns are "score without the feature" minus
        "baseline score", so a negative value means dropping the feature
        lowered the score. Rows are sorted ascending by ``rf_diff``.
    """
    baseline_rf, baseline_xgb = evaluate_models(
        X, y, n_splits=n_splits, random_state=random_state
    )
    print(f"Baseline RF F1: {baseline_rf:.4f}, XGB F1: {baseline_xgb:.4f}")

    rows = []
    for col in tqdm(X.columns, desc="Testing each feature"):
        rf_score, xgb_score = evaluate_models(
            X.drop(columns=[col]), y, n_splits=n_splits, random_state=random_state
        )
        rows.append({'feature': col, 'rf_f1': rf_score, 'xgb_f1': xgb_score})

    # Explicit column list keeps the column order fixed.
    df_results = pd.DataFrame(rows, columns=['feature', 'rf_f1', 'xgb_f1'])
    df_results['rf_diff'] = df_results['rf_f1'] - baseline_rf
    df_results['xgb_diff'] = df_results['xgb_f1'] - baseline_xgb
    return df_results.sort_values(by='rf_diff')  # or 'xgb_diff'

# Example usage:
# df_results = feature_impact_analysis(X, y)
# print(df_results)
