
1. [Install Treno](https://github.com/erosmontin/treno)
1. Feature-selection (FS) pipeline definition


In [30]:
!! pip install git+https://github.com/erosmontin/treno #--force-reinstall
!! pip install xgboost
!! pip install imbalanced-learn




In [31]:
from treno.utils import (
    remove_nans,
    zScoreFeatures,
    filterFeaturesByMAD,
    filterFeaturesByScore,
    filterFeaturesByCorrelation,
    rankFeaturesByRepeatedGini
)
from sklearn.ensemble import RandomForestClassifier

def _ensure_features_remain(features, message):
    """Raise ``ValueError(message)`` if ``features`` has zero columns, else return it."""
    if features.shape[1] == 0:
        raise ValueError(message)
    return features


def classification_feature_selection(
    features,
    targets,
    groups=None,
    score_threshold=0.5,
    corr_threshold=0.9,
    score_model=None,
    score_test_size=0.3,
    score_n_repeats=1,
    gini_n_repeats=10,
    gini_test_size=0.1,
    gini_random_seed=None,
    return_gini=False
):
    """
    Run the classification feature-selection pipeline and rank the survivors.

    The stages run in this order: ``remove_nans`` -> ``zScoreFeatures`` ->
    ``filterFeaturesByMAD`` -> ``filterFeaturesByScore`` ->
    ``filterFeaturesByCorrelation`` -> ``rankFeaturesByRepeatedGini``.
    The repeated-Gini ranking always runs; ``return_gini`` only decides
    whether its scores are handed back alongside the ranked features.

    Parameters:
        features: Feature table handed to ``remove_nans`` (must expose
            ``.shape`` after filtering; presumably a DataFrame - confirm).
        targets: Classification targets, cleaned together with ``features``.
        groups: Optional group labels forwarded to the score filter and to
            the Gini ranking.
            NOTE(review): ``groups`` is not passed through ``remove_nans``;
            if that helper drops rows, ``groups`` may no longer line up with
            the cleaned data - confirm.
        score_threshold: ``threshold`` given to ``filterFeaturesByScore``.
        corr_threshold: ``threshold`` given to ``filterFeaturesByCorrelation``.
        score_model: Model used by the score filter; a fresh
            ``RandomForestClassifier()`` is created when this is ``None``.
        score_test_size: ``test_size`` given to the score filter.
        score_n_repeats: ``n_repeats`` given to the score filter.
        gini_n_repeats: ``n_repeats`` given to the Gini ranking.
        gini_test_size: ``test_size`` given to the Gini ranking.
        gini_random_seed: ``random_seed`` given to the Gini ranking.
        return_gini: If True, return ``(ranked_features, gini_ranks)``;
            otherwise return only the ranked features.

    Returns:
        Whatever ``rankFeaturesByRepeatedGini`` yields for the surviving
        features: the ranked features alone, or a
        ``(ranked_features, gini_ranks)`` tuple when ``return_gini`` is True.

    Raises:
        ValueError: If no feature column is left after the MAD, score, or
            correlation filter.
    """
    # Stage 1-2: drop NaNs, then z-score the remaining features.
    cleaned, labels = remove_nans(features, targets)
    standardized = zScoreFeatures(cleaned)

    # Stage 3: MAD filter.
    after_mad = _ensure_features_remain(
        filterFeaturesByMAD(standardized),
        "No features passed MAD filter",
    )

    # Stage 4: score filter, using the caller's model when one was supplied.
    if score_model is None:
        scorer = RandomForestClassifier()
    else:
        scorer = score_model
    after_score, score_values = filterFeaturesByScore(
        after_mad,
        labels,
        groups=groups,
        threshold=score_threshold,
        return_score=True,
        model=scorer,
        test_size=score_test_size,
        n_repeats=score_n_repeats,
    )
    _ensure_features_remain(after_score, "No features passed score filter")

    # Stage 5: correlation filter; the scores from stage 4 are forwarded to it.
    after_corr = _ensure_features_remain(
        filterFeaturesByCorrelation(
            after_score,
            threshold=corr_threshold,
            score=score_values.values,
        ),
        "No features passed correlation filter",
    )

    # Stage 6: repeated-Gini ranking (always performed).
    ranking_options = dict(
        n_repeats=gini_n_repeats,
        test_size=gini_test_size,
        random_seed=gini_random_seed,
        groups=groups,
    )
    if not return_gini:
        return rankFeaturesByRepeatedGini(
            after_corr, labels, return_gini=False, **ranking_options
        )

    ranked, gini_ranks = rankFeaturesByRepeatedGini(
        after_corr, labels, return_gini=True, **ranking_options
    )
    return ranked, gini_ranks

In [32]:
from treno.utils import generate_fake_data
# Generate a synthetic classification dataset together with group labels and
# use it to exercise the pipeline.
x, y, groups = generate_fake_data(n_samples=200, n_features=30, n_groups=10, classification=True, random_state=42)


# Example: Only selected features
# `groups` is forwarded to the pipeline's `groups` argument (group labels for
# splitting) so the generated labels are actually used.
selected_features = classification_feature_selection(x, y, groups=groups)
print("Selected features after pipeline:", selected_features.columns.tolist())

# Example: Selected features and Gini scores
# Note: this re-runs the whole pipeline and rebinds `selected_features`.
selected_features, gini_scores = classification_feature_selection(x, y, groups=groups, return_gini=True)
print("Selected features after pipeline:", selected_features.columns.tolist())
print("Gini scores:\n", gini_scores)

TypeError: 'int' object is not iterable