# Imports

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna 
import os
from xgboost import XGBClassifier
from sklearn.metrics import recall_score,f1_score,confusion_matrix

from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import KFold
from random import randint

# Create the dataset

In [10]:
features=['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

df=pd.read_csv('../creditcard.csv')
df=df.rename(columns={'Class':'target'})

optuna.logging.set_verbosity(optuna.logging.WARNING)

# Optuna tuning

In [11]:
def objective(trial: pd.DataFrame, X_train: pd.DataFrame, y_train: pd.DataFrame):
    
    params={'lambda': trial.suggest_loguniform('lambda', 1e-2, 5.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-2, 5.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01,0.012,0.014,0.016,0.018, 0.02,0.05]),
        'n_estimators': trial.suggest_int('n_estimators',50,500),
        'max_depth': trial.suggest_categorical('max_depth', [2,3,5,7,9,11]),
        'random_state': trial.suggest_categorical('random_state', [24, 48,2021]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        }
    
    f1=[]
    recall=[]
    
    kf= KFold(n_splits=5)
     
    for train_idx,test_idx in kf.split(X_train,y_train):
        
        X_train_tuning,y_train_tuning= X_train.iloc[train_idx],y_train.iloc[train_idx]

        X_test_tuning,y_test_tuning= X_train.iloc[test_idx],y_train.iloc[test_idx]
        
        model=XGBClassifier(**params,n_jobs=-1)
        
        model.fit(X_train_tuning,y_train_tuning)
        
        predictions=model.predict(X_test_tuning)
        
        f1.append(f1_score(y_test_tuning,predictions))
        
        #recall.append(recall_score(y_test_tuning,predictions))
        
    return np.mean(f1)

# XGBoost model

In [12]:
def create_xgb(X_train: pd.DataFrame, y_train: pd.DataFrame):
    
    study=optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=50)
    
    params=study.best_params
    
    model=XGBClassifier(**params,n_jobs=-1)
    
    model.fit(X_train, y_train, verbose=0)
    
    return model

# EasyEnsemble algorithm

In [13]:
def easyensemble(train: pd.DataFrame, features: list, n_estimators: int):
    
    X_train,y_train= train[features], train.target
    
    models=[]
    
    n_models=0
    for estimator in range(1,n_estimators):
        
        undersampler= RandomUnderSampler(sampling_strategy=1,random_state=randint(0,100000))
        
        X_train_rus,y_train_rus= undersampler.fit_resample(X_train, y_train)
        
        models.append(create_xgb(X_train_rus,y_train_rus))
        
        n_models+=1
        
        print(n_models)
        
        

        
    return models   

# Cross validation function

In [14]:
def cross_val(df: pd.DataFrame, features: list,n_estimators: int):
    
    train=df[:200000]
    test=df[200000:]
    
    models=easyensemble(train, features, n_estimators)
    
    X_test, y_test= test[features], test.target
    
    y_proba=0
    
    for model in models:
        
        y_proba+=model.predict_proba(X_test)[:,1]
        
    y_proba=y_proba/len(models)
        
    y_preds=(y_proba>0.5).astype(int)
    
    print('\n')
    print(recall_score(y_test,y_preds))
    print('\n')
    print(confusion_matrix(y_test,y_preds))
    
    return y_proba,models

In [15]:
y_proba, models=cross_val(df,features,10)

1
2
3
4
5
6
7
8
9


0.8785046728971962


[[82930  1770]
 [   13    94]]
