# Santander

# 1. Import libs
[top](#Contents)

In [10]:
# pandas and numpy imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
# Suppress the specific warning
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn.metrics')

In [11]:
# pip install -U imbalanced-learn

# 2. Get data
[top](#Contents)

#### Read data
[top](#Contents)

In [12]:
# Load the labelled training data; the relative path is resolved against the
# notebook's working directory.
bank_ds = pd.read_csv('train.csv')

In [None]:
# Load the separate test file. NOTE(review): this frame is not referenced by
# any of the later cells visible in this notebook — confirm it is still needed.
bank_test_ds = pd.read_csv('test.csv')

#### Drop an unneeded column

In [13]:
# Column the notebook treats as unneeded for modelling.
column_to_drop = 'ID_code'

# Rebind instead of mutating in place, and tolerate an already-missing column,
# so that re-running this cell does not raise KeyError after the first run.
bank_ds = bank_ds.drop(columns=column_to_drop, errors='ignore')


### Split 70/30 (stratified sampling)

In [14]:
# Label column; passed as the `stratify` key when the data is split below.
target = bank_ds['target']

In [15]:
# Hold out 30% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    bank_ds.drop(columns=['target']),  # feature columns only
    bank_ds['target'],                 # labels
    stratify=target,
    test_size=0.3,
    random_state=42,
)

X_train.shape, X_test.shape

((140000, 200), (60000, 200))

### Scaling

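Logistic Regression benefits from scaled features, and the distance-based oversamplers used below (SMOTE, ADASYN, Borderline-SMOTE) are also sensitive to feature scale. Tree-based models such as Random Forest, XGBoost, and LightGBM do not inherently require scaling, although it can still be beneficial in some cases. The scaler is fitted on the training split only and then applied to the test split, so no information from the test set leaks into preprocessing.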

In [16]:
from sklearn.preprocessing import MinMaxScaler
# Bring all features onto a common scale. The scaler learns its per-feature
# min/max from the training split and reuses them for the test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Oversampling

In [17]:
from imblearn.over_sampling import (
    RandomOverSampler,
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
    SVMSMOTE,
)
from sklearn.svm import SVC

In [18]:
# Keyword arguments shared by every oversampler below.
_shared_kwargs = {'sampling_strategy': 'auto', 'random_state': 0}
# Keyword arguments shared by both Borderline-SMOTE variants.
_borderline_kwargs = {**_shared_kwargs, 'k_neighbors': 5, 'm_neighbors': 10, 'n_jobs': 4}

# Oversampling techniques to compare, keyed by the short name used in the
# result tables.
# NOTE(review): `n_jobs` is deprecated in recent imbalanced-learn releases —
# confirm against the installed version.
oversampler_dict = {
    'random': RandomOverSampler(**_shared_kwargs),
    'smote': SMOTE(k_neighbors=5, n_jobs=4, **_shared_kwargs),
    'adasyn': ADASYN(n_neighbors=5, n_jobs=4, **_shared_kwargs),
    'border1': BorderlineSMOTE(kind='borderline-1', **_borderline_kwargs),
    'border2': BorderlineSMOTE(kind='borderline-2', **_borderline_kwargs),
}

In [19]:
# Alternative configuration that additionally includes SVM-SMOTE.
# It is kept fully commented out: previously only the opening line was
# commented, which left the indented entries and the closing brace as a syntax
# error. Uncomment every line below to replace the dictionary defined above.
# oversampler_dict = {
#     'random': RandomOverSampler(sampling_strategy='auto', random_state=0),
#     'smote': SMOTE(sampling_strategy='auto', random_state=0, k_neighbors=5, n_jobs=4),
#     'adasyn': ADASYN(sampling_strategy='auto', random_state=0, n_neighbors=5, n_jobs=4),
#     'border1': BorderlineSMOTE(sampling_strategy='auto', random_state=0, k_neighbors=5, m_neighbors=10, kind='borderline-1', n_jobs=4),
#     'border2': BorderlineSMOTE(sampling_strategy='auto', random_state=0, k_neighbors=5, m_neighbors=10, kind='borderline-2', n_jobs=4),
#     'svm': SVMSMOTE(sampling_strategy='auto', random_state=0, k_neighbors=5, m_neighbors=10, n_jobs=4, svm_estimator=SVC(kernel='linear'))
# }


## Evaluation

#### Metrics which require predict method

#### Confusion matrix, Recall, Precision, F1-score, G-mean, etc 

In [22]:
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# require scaling
from imblearn.metrics import (
    geometric_mean_score,
    make_index_balanced_accuracy,
)

def dominance(y_true, y_pred):
    """Return the dominance of a binary classifier: TPR minus TNR.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; the positive class is 1 and the negative class is 0.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        ``recall(pos_label=1) - recall(pos_label=0)``. Positive values mean the
        positive class is recognised better than the negative class.
    """
    # Use the y_true argument (not the notebook-level y_test) so the function
    # is correct for any label vector passed by the caller.
    tpr = recall_score(y_true, y_pred, pos_label=1)
    tnr = recall_score(y_true, y_pred, pos_label=0)
    return tpr - tnr

# Function to get classification metrics
def get_classification_metrics(y_true, y_pred):
    """Compute a dictionary of label-based classification metrics.

    Every metric is computed from the ``y_true`` / ``y_pred`` arguments; the
    function does not read the notebook-level ``y_test``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. The per-class entries are read from
        ``report['0']`` and ``report['1']``, so the labels are expected to be
        0 (majority) and 1 (minority).
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    dict
        Metric name -> value: accuracy, per-class recall/precision/F1,
        balanced accuracy, FPR/FNR, weighted/micro/macro averages, G-mean
        variants, index-balanced ("corrected") G-mean variants and dominance.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    accuracy          = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

    # Averaged recall / precision / F1. Without an explicit `average`, these
    # sklearn scorers default to average='binary' (positive class only).
    recall_weighted    = recall_score(y_true, y_pred, average='weighted')
    precision_weighted = precision_score(y_true, y_pred, average='weighted')
    f1_weighted        = f1_score(y_true, y_pred, average='weighted')

    recall_micro       = recall_score(y_true, y_pred, average='micro')
    precision_micro    = precision_score(y_true, y_pred, average='micro')
    f1_micro           = f1_score(y_true, y_pred, average='micro')

    recall_macro       = recall_score(y_true, y_pred, average='macro')
    precision_macro    = precision_score(y_true, y_pred, average='macro')
    f1_macro           = f1_score(y_true, y_pred, average='macro')

    # G-mean under each averaging mode.
    # NOTE(review): imblearn's geometric_mean_score defaults to
    # average='multiclass', so 'binary' is requested explicitly — confirm
    # against the installed imbalanced-learn version.
    g_mean_binary      = geometric_mean_score(y_true, y_pred, average='binary')
    g_mean_weighted    = geometric_mean_score(y_true, y_pred, average='weighted')
    g_mean_micro       = geometric_mean_score(y_true, y_pred, average='micro')
    g_mean_macro       = geometric_mean_score(y_true, y_pred, average='macro')

    # Index-balanced G-mean: the (squared) G-mean is weighted by the dominance
    # through the alpha factor.
    # NOTE(review): the index-balanced accuracy is defined as
    # (1 + alpha * (TPR - TNR)) * score, so a larger alpha rewards
    # sensitivity exceeding specificity — confirm against the imblearn docs.
    gmean = make_index_balanced_accuracy(alpha=0.5, squared=True)(geometric_mean_score)
    corrected_g_mean  = gmean(y_true, y_pred)
    # Specifying an average might not be necessary or meaningful here; kept so
    # the exported columns stay the same.
    corrected_g_mean_binary   = gmean(y_true, y_pred, average='binary')
    corrected_g_mean_weighted = gmean(y_true, y_pred, average='weighted')
    corrected_g_mean_micro    = gmean(y_true, y_pred, average='micro')
    corrected_g_mean_macro    = gmean(y_true, y_pred, average='macro')

    dominance_score   = dominance(y_true, y_pred)

    # labels=[0, 1] fixes the matrix layout so ravel() always yields
    # (tn, fp, fn, tp) in that order.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    FPR = fp / (tn + fp)  # equals 1 - TNR
    FNR = fn / (tp + fn)  # equals 1 - TPR

    return {
        'Accuracy': accuracy,
        'Recall Majority (TNR)': report['0']['recall'],
        'Recall Minority (TPR)': report['1']['recall'],
        'Balanced Accuracy': balanced_accuracy,
        'FPR': FPR,
        'FNR': FNR,

        'Precision Majority': report['0']['precision'],
        'Precision Minority': report['1']['precision'],
        'F1-Score Majority': report['0']['f1-score'],
        'F1-Score Minority': report['1']['f1-score'],

        # Weighted average: per-class scores weighted by class support.
        'Weighted Precision': precision_weighted,
        'Weighted Recall': recall_weighted,
        'Weighted F1-Score': f1_weighted,

        # Micro average: every individual prediction counts equally.
        'Micro Precision': precision_micro,
        'Micro Recall': recall_micro,
        'Micro F1-Score': f1_micro,

        # Macro average: every class counts equally, regardless of its size.
        'Macro Precision': precision_macro,
        'Macro Recall': recall_macro,
        'Macro F1-Score': f1_macro,

        # G-mean balances detecting the minority class against avoiding false
        # positives from the majority class; one entry per averaging mode.
        'G-mean-binary': g_mean_binary,
        'G-mean-weighted': g_mean_weighted,
        'G-mean-micro': g_mean_micro,
        'G-mean-macro': g_mean_macro,

        # Corrected (index-balanced) G-means
        'Corrected G-mean': corrected_g_mean,
        'Corrected G-mean-binary': corrected_g_mean_binary,
        'Corrected G-mean-weighted': corrected_g_mean_weighted,
        'Corrected G-mean-micro': corrected_g_mean_micro,
        'Corrected G-mean-macro': corrected_g_mean_macro,

        'Dominance': dominance_score
    }

# Models to evaluate, keyed by the display name used in the result table.
models_dict = {
    #'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    #'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_state=42)
}
# One result row (dict) per (oversampler, model) pair.
results = []
# Not populated in this cell.
confusion_matrices = []

# Iterate over each oversampling technique
for name, oversampler in oversampler_dict.items():
    # The message previously said "undersampling", although this loop applies
    # the oversamplers defined in oversampler_dict.
    print(f"Applying oversampling technique: {name}")

    # Resample the training split only; the test split keeps its original
    # class distribution.
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    # Iterate over each model
    for model_name, model in models_dict.items():
        print(f"Training model: {model_name} with oversampling technique: {name}")

        # Train the model on the resampled training data
        model.fit(X_resampled, y_resampled)

        # Hard label predictions on the untouched test set
        y_pred = model.predict(X_test)

        # Calculate metrics
        metrics = get_classification_metrics(y_test, y_pred)

        # Append the results
        results.append({
            'Oversampling Technique': name,
            'Model': model_name,
            **metrics
        })

# Create a DataFrame from the results
df_results_1 = pd.DataFrame(results)

# Print the DataFrame as a table
print(df_results_1)

Applying undersampling technique: random
Training model: Logistic Regression with oversampling technique: random
Training model: LightGBM with oversampling technique: random
[LightGBM] [Info] Number of positive: 125931, number of negative: 125931
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.644267 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 251862, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Applying undersampling technique: smote




Training model: Logistic Regression with oversampling technique: smote
Training model: LightGBM with oversampling technique: smote
[LightGBM] [Info] Number of positive: 125931, number of negative: 125931
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.615931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 251862, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Applying undersampling technique: adasyn




Training model: Logistic Regression with oversampling technique: adasyn
Training model: LightGBM with oversampling technique: adasyn
[LightGBM] [Info] Number of positive: 123202, number of negative: 125931
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.618503 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 249133, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494523 -> initscore=-0.021909
[LightGBM] [Info] Start training from score -0.021909
Applying undersampling technique: border1




Training model: Logistic Regression with oversampling technique: border1
Training model: LightGBM with oversampling technique: border1
[LightGBM] [Info] Number of positive: 125931, number of negative: 125931
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.542396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 251862, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Applying undersampling technique: border2




Training model: Logistic Regression with oversampling technique: border2
Training model: LightGBM with oversampling technique: border2
[LightGBM] [Info] Number of positive: 125931, number of negative: 125931
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.651417 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 251862, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
  Oversampling Technique                Model  Accuracy  \
0                 random  Logistic Regression  0.782117   
1                 random             LightGBM  0.815500   
2                  smote  Logistic Regression  0.787783   
3                  smote             LightGBM  0.858667   
4                 adasyn  Logistic Regression  0.789567   
5                 adasyn             LightGBM  0.860033   
6       

#### Metrics which require predict_proba method
The ROC AUC and PR AUC (average precision) scores are calculated from the predicted probability of the positive class, which is obtained using the `predict_proba` method. These probability estimates are needed to rank the test samples, build the ROC and precision-recall curves, and compute the area under each curve (AUC).

In [19]:
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score
)

# Models to evaluate, keyed by the display name used in the result table.
models_dict = {
    #'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    #'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_state=42)
}

# One result row (dict) per (oversampler, model) pair.
results = []
# Iterate over each oversampling technique
for name, oversampler in oversampler_dict.items():
    print(f"Applying oversampling technique: {name}")

    # Resample the training split only; the test split keeps its original
    # class distribution.
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    # Iterate over each model
    for model_name, model in models_dict.items():
        print(f"Training model: {model_name} with oversampling technique: {name}")

        # Train the model on the resampled training data
        model.fit(X_resampled, y_resampled)

        # Column 1 of predict_proba is the probability of the positive class
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Threshold-free ranking metrics
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        pr_auc = average_precision_score(y_test, y_pred_proba)

        # The key was previously 'Undersampling Technique'; these are
        # oversamplers, and the name now matches the predict-based table.
        results.append({
            'Oversampling Technique': name,
            'Model': model_name,
            'ROC-AUC': roc_auc,
            'PR-AUC': pr_auc
        })

# Create a DataFrame from the results
df_results_2 = pd.DataFrame(results)

# Print the DataFrame as a table
print(df_results_2)

Applying oversampling technique: random
Training model: CatBoost with oversampling technique: random
Learning rate set to 0.109202
0:	learn: 0.6849325	total: 356ms	remaining: 5m 55s
1:	learn: 0.6771810	total: 498ms	remaining: 4m 8s
2:	learn: 0.6709826	total: 637ms	remaining: 3m 31s
3:	learn: 0.6649726	total: 781ms	remaining: 3m 14s
4:	learn: 0.6595633	total: 921ms	remaining: 3m 3s
5:	learn: 0.6546803	total: 1.06s	remaining: 2m 56s
6:	learn: 0.6499906	total: 1.2s	remaining: 2m 50s
7:	learn: 0.6454376	total: 1.33s	remaining: 2m 45s
8:	learn: 0.6411991	total: 1.5s	remaining: 2m 45s
9:	learn: 0.6370973	total: 1.68s	remaining: 2m 46s
10:	learn: 0.6331012	total: 1.83s	remaining: 2m 44s
11:	learn: 0.6292891	total: 1.96s	remaining: 2m 41s
12:	learn: 0.6256999	total: 2.1s	remaining: 2m 39s
13:	learn: 0.6221606	total: 2.24s	remaining: 2m 37s
14:	learn: 0.6188819	total: 2.38s	remaining: 2m 36s
15:	learn: 0.6157070	total: 2.53s	remaining: 2m 35s
16:	learn: 0.6124670	total: 2.65s	remaining: 2m 33s




Training model: CatBoost with oversampling technique: smote
Learning rate set to 0.109202
0:	learn: 0.6852365	total: 143ms	remaining: 2m 22s
1:	learn: 0.6776751	total: 271ms	remaining: 2m 15s
2:	learn: 0.6703730	total: 412ms	remaining: 2m 16s
3:	learn: 0.6634960	total: 550ms	remaining: 2m 17s
4:	learn: 0.6570518	total: 684ms	remaining: 2m 16s
5:	learn: 0.6507136	total: 824ms	remaining: 2m 16s
6:	learn: 0.6448217	total: 1000ms	remaining: 2m 21s
7:	learn: 0.6392497	total: 1.13s	remaining: 2m 20s
8:	learn: 0.6337830	total: 1.26s	remaining: 2m 19s
9:	learn: 0.6285549	total: 1.4s	remaining: 2m 18s
10:	learn: 0.6234783	total: 1.55s	remaining: 2m 19s
11:	learn: 0.6185752	total: 1.68s	remaining: 2m 18s
12:	learn: 0.6137673	total: 1.81s	remaining: 2m 17s
13:	learn: 0.6091029	total: 1.95s	remaining: 2m 17s
14:	learn: 0.6045020	total: 2.1s	remaining: 2m 17s
15:	learn: 0.6000031	total: 2.24s	remaining: 2m 17s
16:	learn: 0.5955026	total: 2.37s	remaining: 2m 16s
17:	learn: 0.5912212	total: 2.5s	rema



Training model: CatBoost with oversampling technique: adasyn
Learning rate set to 0.108695
0:	learn: 0.6850438	total: 146ms	remaining: 2m 25s
1:	learn: 0.6776886	total: 270ms	remaining: 2m 14s
2:	learn: 0.6709229	total: 410ms	remaining: 2m 16s
3:	learn: 0.6642242	total: 555ms	remaining: 2m 18s
4:	learn: 0.6577394	total: 697ms	remaining: 2m 18s
5:	learn: 0.6516680	total: 835ms	remaining: 2m 18s
6:	learn: 0.6459196	total: 969ms	remaining: 2m 17s
7:	learn: 0.6402365	total: 1.1s	remaining: 2m 16s
8:	learn: 0.6347827	total: 1.25s	remaining: 2m 17s
9:	learn: 0.6295700	total: 1.38s	remaining: 2m 16s
10:	learn: 0.6245747	total: 1.53s	remaining: 2m 17s
11:	learn: 0.6195469	total: 1.67s	remaining: 2m 17s
12:	learn: 0.6147665	total: 1.81s	remaining: 2m 17s
13:	learn: 0.6100233	total: 1.95s	remaining: 2m 17s
14:	learn: 0.6055550	total: 2.08s	remaining: 2m 16s
15:	learn: 0.6009851	total: 2.22s	remaining: 2m 16s
16:	learn: 0.5968861	total: 2.35s	remaining: 2m 15s
17:	learn: 0.5927146	total: 2.49s	re



Training model: CatBoost with oversampling technique: border1
Learning rate set to 0.109202
0:	learn: 0.6837468	total: 162ms	remaining: 2m 41s
1:	learn: 0.6754558	total: 311ms	remaining: 2m 35s
2:	learn: 0.6677831	total: 459ms	remaining: 2m 32s
3:	learn: 0.6599398	total: 615ms	remaining: 2m 33s
4:	learn: 0.6526530	total: 753ms	remaining: 2m 29s
5:	learn: 0.6458925	total: 884ms	remaining: 2m 26s
6:	learn: 0.6395180	total: 1.03s	remaining: 2m 25s
7:	learn: 0.6333726	total: 1.16s	remaining: 2m 24s
8:	learn: 0.6274761	total: 1.3s	remaining: 2m 23s
9:	learn: 0.6215030	total: 1.45s	remaining: 2m 23s
10:	learn: 0.6157099	total: 1.59s	remaining: 2m 23s
11:	learn: 0.6100232	total: 1.74s	remaining: 2m 22s
12:	learn: 0.6047931	total: 1.87s	remaining: 2m 21s
13:	learn: 0.5996511	total: 2s	remaining: 2m 21s
14:	learn: 0.5945357	total: 2.13s	remaining: 2m 20s
15:	learn: 0.5896344	total: 2.26s	remaining: 2m 19s
16:	learn: 0.5848882	total: 2.4s	remaining: 2m 18s
17:	learn: 0.5804847	total: 2.54s	remai



Training model: CatBoost with oversampling technique: border2
Learning rate set to 0.109202
0:	learn: 0.6859715	total: 176ms	remaining: 2m 55s
1:	learn: 0.6799863	total: 355ms	remaining: 2m 57s
2:	learn: 0.6746217	total: 537ms	remaining: 2m 58s
3:	learn: 0.6695674	total: 714ms	remaining: 2m 57s
4:	learn: 0.6649832	total: 894ms	remaining: 2m 57s
5:	learn: 0.6605378	total: 1.06s	remaining: 2m 56s
6:	learn: 0.6564026	total: 1.21s	remaining: 2m 51s
7:	learn: 0.6523710	total: 1.38s	remaining: 2m 50s
8:	learn: 0.6485576	total: 1.53s	remaining: 2m 48s
9:	learn: 0.6446643	total: 1.69s	remaining: 2m 47s
10:	learn: 0.6410816	total: 1.85s	remaining: 2m 46s
11:	learn: 0.6375119	total: 2s	remaining: 2m 45s
12:	learn: 0.6340841	total: 2.15s	remaining: 2m 43s
13:	learn: 0.6308683	total: 2.3s	remaining: 2m 42s
14:	learn: 0.6276090	total: 2.46s	remaining: 2m 41s
15:	learn: 0.6244042	total: 2.62s	remaining: 2m 41s
16:	learn: 0.6212394	total: 2.77s	remaining: 2m 40s
17:	learn: 0.6183014	total: 2.93s	rema

### Export classification metrics

In [23]:
# Combine the predict-based metrics (df_results_1) with the probability-based
# metrics (df_results_2) by joining on the (technique, model) key. A positional
# horizontal concat would duplicate the key columns and silently misalign rows
# if the two tables were ever built in a different order.
key_columns = ['Oversampling Technique', 'Model']

# Accept the technique column under its older 'Undersampling Technique' label;
# rename() is a no-op when that column is absent.
proba_results = df_results_2.rename(
    columns={'Undersampling Technique': 'Oversampling Technique'}
)

# validate='one_to_one' raises if a (technique, model) pair appears more than
# once in either table instead of silently multiplying rows.
df_combined_results = df_results_1.merge(
    proba_results, on=key_columns, how='outer', validate='one_to_one'
)

# Export results to a CSV file
df_combined_results.to_csv('6_evaluation_results_oversampling.csv', index=False)