# Imports

In [1]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
from random import randint
from sklearn.metrics import f1_score,recall_score, confusion_matrix

In [3]:
features=['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

df=pd.read_csv('../creditcard.csv')
df=df.rename(columns={'Class':'target'})

In [4]:
params={'n_estimators':200}

# Create model

In [5]:
def create_xgb(params: dict,X_train: pd.DataFrame, y_train: pd.DataFrame, featureImportance: bool= False):
    
    model=XGBClassifier(**params)
    
    train=pd.concat([X_train,y_train],axis=1)
    
    for 
    
    model.fit(X_train,y_train,verbose=0)
    
    if featureImportance:
        plot_importance(model, max_num_features=6)
        plt.show()
        
    return model

# Balance Cascade algorithm

In [6]:
def balancecascade(train: pd.DataFrame,features: list, n_estimators: int):
    
    train_maj= train[train.target==0]
    train_min= train[train.target==1]
    
    model_list=[]
    
    n_maj= len(train_maj)
    n_min= len(train_min)
    
    ratio=n_min/n_maj
    
    keep_rate=np.power(ratio, 1/(n_estimators-1))
    
    
    while len(train_maj)>len(train_min):
        
        train=pd.concat([train_maj,train_min],axis=0)
        
        X_train,y_train=train[features],train.target
        
        undersampler=RandomUnderSampler(sampling_strategy=1, random_state=randint(0,10000))
        X_train_rus,y_train_rus=undersampler.fit_resample(X_train,y_train)
        
        model_list.append(create_xgb(params, X_train_rus, y_train_rus))
                          
        y_probs=0
                          
        for model in model_list:
            
            y_probs+=model.predict_proba(train_maj[features])[:,1]
        
        y_probs=y_probs/len(model_list)
                          
        train_maj['proba']=y_probs
                          
        train_maj=train_maj.sort_values('proba',ascending=False)[:int(keep_rate*len(train_maj)+1)]
        
        
    return model_list



# Cross validation strategy

In [7]:
def cross_val(df: pd.DataFrame,features: list):
    
    train=df[:200000]
    test=df[200000:]
    
    models=balancecascade(train= train, features= features, n_estimators=10)
    
    X_test,y_test= test[features],test.target
        
    y_preds_proba=0
    
    for model in models:
        
        y_preds_proba+=model.predict_proba(X_test)[:,1]
        
    y_preds_proba=y_preds_proba/len(models)
    y_preds=(y_preds_proba>0.5).astype(int)
    
    print(recall_score(y_test,y_preds))
    print('\n')
    print(confusion_matrix(y_test,y_preds))
    
    
    return models
        

In [8]:
models=cross_val(df,features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.8130841121495327


[[84644    56]
 [   20    87]]
