# Santander

# 1. Import libs
[top](#Contents)

In [24]:
# pandas and numpy imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
# Suppress the specific warning
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn.metrics')

In [25]:
# Requires the imbalanced-learn package (imported below as `imblearn`); install it once with: %pip install -U imbalanced-learn

# 2. Get data
[top](#Contents)

#### Read data
[top](#Contents)

In [26]:
# Labelled training data, read from the current working directory.
bank_ds = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Separate test file, read from the current working directory.
# NOTE(review): bank_test_ds is not referenced anywhere else in the visible
# notebook — confirm whether this cell is still needed.
bank_test_ds = pd.read_csv(filepath_or_buffer='test.csv')

#### Drop an unneeded column

In [27]:
# Column removed before modelling (presumably a row identifier — confirm), so
# that it does not end up in the feature matrix built from bank_ds below.
column_to_drop = 'ID_code'

# Rebind to a new frame instead of mutating in place, and tolerate the column
# already being absent, so re-running this cell does not raise KeyError.
bank_ds = bank_ds.drop(columns=column_to_drop, errors='ignore')


### Split 70/30 (stratified sampling)

In [28]:
# Label column kept as its own Series; the split cell below passes it as the
# `stratify` key.
target = bank_ds['target']

In [29]:
# Hold out 30% of the rows for evaluation; stratifying on the label keeps the
# class ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    bank_ds.drop(columns=['target']),  # features: every column except the label
    bank_ds['target'],                 # the label on its own
    stratify=target,
    test_size=0.3,
    random_state=42,
)

X_train.shape, X_test.shape

((140000, 200), (60000, 200))

### Scaling

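Models like Logistic Regression are sensitive to feature scale and generally train better (faster, more stable convergence) on scaled inputs. Tree-based models such as Random Forest, XGBoost, and LightGBM split on feature thresholds and are largely insensitive to monotonic rescaling, so scaling is harmless for them but not required. It is applied here because the bagging ensembles below use Logistic Regression as their base estimator.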

In [30]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training split only, then apply that
# same mapping to both splits so the test split never influences the fit.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### Ensemble methods

In [32]:
from sklearn.ensemble import BaggingClassifier

from imblearn.ensemble import (
    BalancedBaggingClassifier,
    BalancedRandomForestClassifier,
    RUSBoostClassifier,
    EasyEnsembleClassifier,
)

In [33]:
# ensemble methods (with or without resampling)
ensemble_dict = {
    # balanced random forests (bagging)
    'balancedRF': BalancedRandomForestClassifier(n_estimators=20, criterion='gini', max_depth=3, sampling_strategy='auto', n_jobs=4, random_state=42),
    # bagging of Logistic regression, no resampling
    'bagging': BaggingClassifier(base_estimator=LogisticRegression(random_state=42), n_estimators=20, n_jobs=4, random_state=42),
    # bagging of Logistic regression, with resampling
    'balancedbagging': BalancedBaggingClassifier(estimator=LogisticRegression(random_state=42), n_estimators=20, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, sampling_strategy='auto', n_jobs=4, random_state=42),
    # boosting + undersampling
    'rusboost': RUSBoostClassifier(estimator=None, n_estimators=20, learning_rate=1.0, sampling_strategy='auto', random_state=42),
    # bagging + boosting + under-sammpling
    'easyEnsemble': EasyEnsembleClassifier(n_estimators=20, sampling_strategy='auto', n_jobs=4, random_state=42)
}

## Evaluation

#### Metrics which require predict_proba method
ROC-AUC and PR-AUC (average precision) are threshold-free metrics: they are computed from the predicted probability of the positive class rather than from hard class labels. These probabilities are obtained with the `predict_proba` method, which returns one probability estimate per class; the positive-class column is what is needed to trace the ROC and precision-recall curves and to compute the area under each.

In [34]:
from sklearn.metrics import average_precision_score, roc_auc_score

# One row of test-set metrics per ensemble, in ensemble_dict order.
results = []
for ensemble_name, ensemble in ensemble_dict.items():
    print(f"Training ensemble model: {ensemble_name}")

    # Fit on the training split, then score the held-out split; column 1 of
    # predict_proba holds the probability of the positive class.
    y_proba_pred = ensemble.fit(X_train, y_train).predict_proba(X_test)[:, 1]

    # Both metrics are computed from probabilities, not hard labels.
    results.append({
        'Ensemble model': ensemble_name,
        'ROC-AUC': roc_auc_score(y_test, y_proba_pred),
        'PR-AUC': average_precision_score(y_test, y_proba_pred),
    })

# Collect the rows into a table and show it as plain text.
df_results_2 = pd.DataFrame(results)
print(df_results_2)

Training model: balancedRF


  warn(
  warn(


Training model: bagging




Training model: balancedbagging
Training model: rusboost
Training model: easyEnsemble
    Ensemble model   ROC-AUC    PR-AUC
0       balancedRF  0.732446  0.256243
1          bagging  0.858818  0.498701
2  balancedbagging  0.858477  0.497667
3         rusboost  0.731738  0.272281
4     easyEnsemble  0.768610  0.319986


### Export classification metrics

In [35]:
# Write the metrics table to a CSV file in the working directory, leaving out
# the DataFrame index.
df_results_2.to_csv(
    path_or_buf='6_evaluation_results_ensemble_methods.csv',
    index=False,
)

# To remove

In [None]:
# NOTE(review): neither `lgb` nor `model` is defined or imported in the visible
# notebook (only LGBMClassifier is imported from lightgbm), so this cell would
# fail with NameError on a fresh run — confirm and remove, as the heading says.
# Requests a plot of the tree at index best_iteration-1, annotated with
# 'data_percentage'.
lgb.plot_tree(model, figsize=(20,6), tree_index=model.best_iteration-1, dpi=300, show_info='data_percentage');

In [None]:
# NOTE(review): neither `lgb` nor `model` is defined or imported in the visible
# notebook, so this cell would fail with NameError on a fresh run — confirm and
# remove, as the heading says.
# Requests a feature-importance plot limited to 50 features.
lgb.plot_importance(model, figsize=(12,10), max_num_features=50);