# Santander

# 1. Import libs
[top](#Contents)

In [1]:
# pandas and numpy imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
# Suppress the specific warning
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn.metrics')

In [2]:
# pip install -U imbalanced-learn

# 2. Get data
[top](#Contents)

#### Read data
[top](#Contents)

In [3]:
bank_ds = pd.read_csv('train.csv')

In [14]:
bank_test_ds = pd.read_csv('test.csv')

#### Drop an unneeded column

In [4]:
# Assuming 'column_to_drop' is the name of the column you want to drop
column_to_drop = 'ID_code'

# Drop the column from bank_ds
bank_ds.drop(column_to_drop, axis=1, inplace=True)


### Split 70/30 (stratified sampling)

In [5]:
target = bank_ds['target']

In [6]:
# separate dataset into train and test

X_train, X_test, y_train, y_test = train_test_split(
    bank_ds.drop(labels=['target'], axis=1),  # drop the target
    bank_ds['target'],  # just the target
    test_size=0.3,
    stratify=target,
    random_state=42)

X_train.shape, X_test.shape

((140000, 200), (60000, 200))

### Over and Under sampling

In [7]:
from imblearn.under_sampling import (
    TomekLinks,
    EditedNearestNeighbours,
)
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

In [8]:
over_under_sampler_dict = {
    'smenn': SMOTEENN(sampling_strategy='auto', random_state=0, 
                      smote=SMOTE(sampling_strategy='auto', random_state=0, k_neighbors=5), 
                      enn=EditedNearestNeighbours(sampling_strategy='auto', n_neighbors=3, kind_sel='all'), n_jobs=4),
    'smtomek': SMOTETomek(sampling_strategy='auto', random_state=0, 
                          smote=SMOTE(sampling_strategy='auto', random_state=0, k_neighbors=5), 
                          tomek=TomekLinks(sampling_strategy='all'), n_jobs=4)
}


## Evaluation

#### Metrics which require predict_proba method
The ROC AUC score is calculated based on the predicted probabilities for each class, which are obtained using the predict_proba method. This method provides the probability estimates for each class, which are necessary to compute the ROC curve and subsequently the AUC (Area Under the Curve).

Over-sampling methods should be performed on the dataset that we are going to use to train the classifier. But, the performance of the model should be determined on a portion of the data, that was not re-sampled

In [9]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import make_pipeline

# Dictionary to hold the models
models_dict = {
    #'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, solver='lbfgs', max_iter=1000, verbose=0),  # Suppress logs 
    #'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42, verbose=-1),  # Suppress LightGBM logs
    'CatBoost': CatBoostClassifier(random_state=42)
}

# Collecting results
results = []

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Custom scoring function
scoring = {
    'roc_auc': make_scorer(roc_auc_score, needs_proba=True),
    'pr_auc': make_scorer(average_precision_score, needs_proba=True)
}

# Define a scaler
scaler = StandardScaler()

# Iterate over each over_under_samplling technique
for name, over_under_sampler in over_under_sampler_dict.items():
    print(f"Applying over_under_sampling technique: {name}")
    
    # Iterate over each model
    for model_name, model in models_dict.items():
        print(f"Training model: {model_name} with over_under_sampling technique: {name}")
        
        # Set up the pipeline
        if not over_under_sampler:
            pipeline = model
        else:
            pipeline = make_pipeline(
                scaler,
                over_under_sampler,
                model
            )
        
        # Perform cross-validation
        cv_results = cross_validate(
            pipeline, 
            X_train, 
            y_train, 
            scoring=scoring, 
            cv=cv, 
            return_train_score=False
        )
        
        # Print cross-validation results
        print(f"{model_name} results:")
        print(
            'ROC-AUC: {0:.3f} +/- {1:.3f}'.format(
            cv_results['test_roc_auc'].mean(), cv_results['test_roc_auc'].std()
            )
        )
        print(
            'PR-AUC: {0:.3f} +/- {1:.3f}'.format(
            cv_results['test_pr_auc'].mean(), cv_results['test_pr_auc'].std()
            )
        )
        print()  # Add a blank line for better readability
        
        # Append the results
        results.append({
            'Oversampling Technique': name,
            'Model': model_name,
            'ROC-AUC': cv_results['test_roc_auc'].mean(),
            'ROC-AUC-std': cv_results['test_roc_auc'].std(),
            'PR-AUC': cv_results['test_pr_auc'].mean(),
            'PR-AUC-std': cv_results['test_pr_auc'].std()
        })

# Create a DataFrame from the results
df_results_2 = pd.DataFrame(results)

# Print the DataFrame as a table
print(df_results_2)

Applying over_under_sampling technique: smenn
Training model: CatBoost with over_under_sampling technique: smenn
Learning rate set to 0.091842
0:	learn: 0.6856500	total: 298ms	remaining: 4m 57s
1:	learn: 0.6789672	total: 445ms	remaining: 3m 41s
2:	learn: 0.6729439	total: 566ms	remaining: 3m 7s
3:	learn: 0.6672334	total: 677ms	remaining: 2m 48s
4:	learn: 0.6614537	total: 797ms	remaining: 2m 38s
5:	learn: 0.6558848	total: 898ms	remaining: 2m 28s
6:	learn: 0.6505872	total: 1s	remaining: 2m 22s
7:	learn: 0.6454661	total: 1.11s	remaining: 2m 17s
8:	learn: 0.6406370	total: 1.22s	remaining: 2m 14s
9:	learn: 0.6359345	total: 1.33s	remaining: 2m 11s
10:	learn: 0.6311845	total: 1.45s	remaining: 2m 10s
11:	learn: 0.6265481	total: 1.57s	remaining: 2m 9s
12:	learn: 0.6222056	total: 1.68s	remaining: 2m 7s
13:	learn: 0.6180368	total: 1.79s	remaining: 2m 6s
14:	learn: 0.6140429	total: 1.91s	remaining: 2m 5s
15:	learn: 0.6101123	total: 2.02s	remaining: 2m 3s
16:	learn: 0.6061716	total: 2.13s	remaining:

### Export classification metrics

In [19]:
# Export results to CSV files
df_results_2.to_csv('7_evaluation_results_oversampling_and_cross_validation.csv', index=False)