In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (such as the local `utils` helpers) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, auc, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline as Pipeline_skl
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.compose import ColumnTransformer
from utils import get_result, train_xgb

# Notebook-wide display defaults: seaborn's white-grid plot style, and pandas
# floats rendered with thousands separators and two decimals.
sns.set(style="whitegrid")
pd.set_option('display.float_format', '{:,.2f}'.format)

In [None]:
# Load the pickled DataFrame that every experiment below starts from, then
# preview its first rows.
# NOTE(review): unpickling can execute arbitrary code — only load a
# process_df.pickle that was produced by a trusted step of this project.
df = pd.read_pickle('process_df.pickle')
df.head()

In [None]:
# Work on a copy so that dropping columns for the first experiment leaves `df` intact.
df_1 = df.copy()

In [None]:
# First experiment: remove the nameOrig / nameDest columns from the working
# copy and preview what is left.
df_1 = df_1.drop(columns=['nameOrig', 'nameDest'])
df_1.head()

In [None]:
# Experiment 1: call utils.train_xgb with its default arguments on the frame
# that has nameOrig / nameDest removed.
# NOTE(review): train_xgb is defined in utils, so its default column lists and
# split/search settings are not visible from this notebook.
SEARCH_DICT, X_train, y_train, X_test, y_test, pipe = train_xgb(df_1)

In [None]:
# Display the pipeline object returned by the train_xgb call above.
pipe

In [None]:
# Pass the outputs of the train_xgb run above to utils.get_result.
# NOTE(review): presumably reports train/test metrics per search — confirm in utils.
get_result(SEARCH_DICT, X_train, y_train, X_test, y_test)

In [None]:
# df.drop(['step'],axis=1,inplace=True)

In [None]:
# Experiment 2: leave `step` out by listing only the amount/balance columns as
# numerical features.
# NOTE(review): this run is fed `df`, which still contains nameOrig / nameDest
# (only df_1 had them dropped) — confirm train_xgb ignores unlisted columns.
SEARCH_DICT, X_train, y_train, X_test, y_test, pipe = train_xgb(
    df,
    numerical_cols=[
        'amount',
        'oldbalanceOrg',
        'newbalanceOrig',
        'oldbalanceDest',
        'newbalanceDest',
    ],
)

In [None]:
# Display the pipeline object returned by the train_xgb call that omits `step`.
pipe

In [None]:
# Pass the outputs of the run without `step` to utils.get_result.
# NOTE(review): presumably reports train/test metrics per search — confirm in utils.
get_result(SEARCH_DICT, X_train, y_train, X_test, y_test)

# Fine-tuning: log-transformed features and hyperparameter search

In [None]:
# Amount and balance columns that get log-transformed in the next step.
log_columns = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]

# Separate copy for the log-transform experiment, so `df` stays as loaded.
df_2 = df.copy()

# Histograms of the raw values, for comparison with the transformed ones.
df_2[log_columns].hist(bins=10, figsize=(10, 10))

In [None]:
# Replace the amount/balance columns with log(1 + x) of their raw values.
# The inputs are read from `df` (which df_2 was copied from) rather than from
# df_2 itself, so re-running this cell is idempotent instead of applying the
# logarithm a second time.
df_2[log_columns] = np.log1p(df[log_columns])
df_2.head()

In [None]:
# Histograms of the same columns after the log1p transform.
df_2[log_columns].hist(bins=10, figsize=(10, 10))

In [None]:
# Experiment 3: call utils.train_xgb on the log-transformed frame, using the
# transformed amount/balance columns as the numerical features.
# NOTE(review): how train_xgb treats the remaining columns is not visible here.
SEARCH_DICT, X_train, y_train, X_test, y_test, pipe = train_xgb(df_2, numerical_cols = log_columns)

In [None]:
# Display the pipeline object returned by the train_xgb call on the log-transformed data.
pipe

In [None]:
# Pass the outputs of the log-transformed run to utils.get_result.
# NOTE(review): presumably reports train/test metrics per search — confirm in utils.
get_result(SEARCH_DICT, X_train, y_train, X_test, y_test)

In [None]:
# Separate the features from the isFraud target.
X = df_2.drop(columns='isFraud')
y = df_2['isFraud']

# 70/30 hold-out split.
# - stratify=y keeps the class proportions of isFraud the same in the train
#   and test parts, which matters when the positive class is rare.
# - random_state=42 (the seed used for the estimators and the CV splitter in
#   this notebook) makes the split, and everything fitted on it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=42
)

In [None]:
# scale_pos_weight = number of non-fraudulent transactions / number of fraudulent transactions

In [None]:
# Ratio of negative (isFraud == 0) to positive (isFraud == 1) rows in the
# training target; shown as the cell output.
n_negative = len(y_train[y_train == 0])
n_positive = len(y_train[y_train == 1])
scale_pos_weight = n_negative / n_positive
scale_pos_weight

In [None]:
# Both names are also imported in the notebook's import cell; the line is kept
# so this cell still runs on its own.
from sklearn.preprocessing import OneHotEncoder, RobustScaler

# Shuffled 3-fold stratified cross-validation with a fixed seed, shared by
# every hyper-parameter search below.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# The numerical features are the log-transformed amount/balance columns; they
# are scaled with a RobustScaler inside the preprocessing step.
numerical_cols = log_columns
robust_scaler = RobustScaler()

In [None]:
# Candidate estimators, all created with the same fixed seed.
rf_model = RandomForestClassifier(random_state=42)
lgr_model = LogisticRegression(random_state=42)
brf_model = BalancedRandomForestClassifier(sampling_strategy=0.1, random_state=42)
xgb_model = xgb.XGBClassifier(random_state=42)

# Estimators the search loop will tune. Only XGBoost is active; add rf_model,
# lgr_model or brf_model to this list to include them in the comparison.
MODELS = [xgb_model]

# Search strategy: 'RandomizedSearchCV' or 'GridSearchCV'.
SEARCH = 'RandomizedSearchCV'
# Optional resampling steps for the pipeline (SMOTE over-sampling and random
# under-sampling); both are switched off.
smote = False
under = False

# Metrics recorded for every candidate during cross-validation.
# ROC AUC has to be computed from continuous scores. make_scorer(roc_auc_score)
# would pass it the hard labels returned by predict(), which collapses the
# curve to a single operating point. The built-in 'roc_auc' scorer uses the
# estimator's decision_function / predict_proba output instead. The dict keys
# are unchanged, so the cv_results_ column names stay the same.
scoring = {
    'f1': make_scorer(f1_score),
    'roc_auc': 'roc_auc',
    'recall': make_scorer(recall_score)
}


# --- Column groups and their transformers -----------------------------------

# `type` is one-hot encoded; handle_unknown='error' makes an unseen category
# raise at transform time.
categorical_cols = ['type']
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='error')),
])

# Alternative encoders for the name columns. They are defined here but not
# wired into the preprocessor below; to try one, add an entry such as
# ('count', count_transformer, count_col) to the transformers list.
count_col = ['nameDest']
count_transformer = Pipeline([
    ('count', ce.CountEncoder()),
])

woe_col = ['nameDest']
woe_transformer = Pipeline([
    ('woe', ce.woe.WOEEncoder()),
])

target_enc_col = ['nameOrig']
target_transformer = Pipeline([
    ('target', TargetEncoder()),
])

# Combined preprocessing: one-hot for the categorical column, robust scaling
# for the numerical columns. Columns not listed here are not passed on
# (ColumnTransformer's documented default is remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', robust_scaler, numerical_cols),
    ]
)



# Fitted search object for each tuned estimator, keyed by its class name.
SEARCH_DICT = {}

for model in MODELS:
    model_name = type(model).__name__

    # Base pipeline: preprocessing followed by the classifier. When enabled,
    # SMOTE is placed after preprocessing (it needs numeric input) and the
    # random under-sampler is placed first, ahead of the preprocessor.
    STEPS = [('preprocessor', preprocessor), ("clf", model)]
    if smote:
        STEPS.insert(1, ('over', SMOTE(sampling_strategy=0.1, random_state=42)))
    if under:
        STEPS.insert(0, ('under', RandomUnderSampler(sampling_strategy=0.1, random_state=42)))

    pipe = Pipeline(steps=STEPS)

    # Hyper-parameter candidates per estimator family; the `clf__` prefix
    # routes each parameter to the pipeline's classifier step.
    if model_name == 'LogisticRegression':
        param_grid = {
            'clf__C': [1e-07, 1e-06, 1e-05],
            'clf__penalty': ['l2'],
        }
    elif model_name in ('RandomForestClassifier', 'BalancedRandomForestClassifier'):
        param_grid = {
            'clf__n_estimators': [150, 200],
            'clf__max_depth': [None, 10, 20],
            'clf__min_samples_split': [20, 25],
            'clf__min_samples_leaf': [2, 3],
            'clf__class_weight': ['balanced', 'balanced_subsample'],
        }
    elif model_name == 'XGBClassifier':
        param_grid = {
            'clf__n_estimators': [250, 350],
            # NOTE(review): hard-coded candidates — compare them with the
            # scale_pos_weight ratio computed from y_train earlier on.
            'clf__scale_pos_weight': [320, 330, 340],
        }
    else:
        # Fail loudly instead of silently reusing the previous model's grid
        # (or hitting a NameError on the first iteration).
        raise ValueError(f"No hyper-parameter grid defined for {model_name}")

    if SEARCH == 'GridSearchCV':
        search = GridSearchCV(
            pipe,
            param_grid=param_grid,
            cv=skf,
            scoring=scoring,
            refit='f1',
            return_train_score=True,
        )
    else:
        # random_state pins which candidates are sampled, so repeated runs
        # evaluate the same three parameter combinations.
        search = RandomizedSearchCV(
            pipe,
            param_distributions=param_grid,
            n_iter=3,
            scoring=scoring,
            refit='f1',
            cv=skf,
            random_state=42,
        )

    # Run the cross-validated search; with refit='f1' the best candidate by
    # F1 is refitted on the full training split.
    search.fit(X_train, y_train)
    SEARCH_DICT[model_name] = search

In [None]:
# Display the pipeline assembled in the final iteration of the model loop above.
pipe

In [None]:
# Pass the searches fitted in the loop above, plus the manual train/test split, to utils.get_result.
# NOTE(review): presumably reports train/test metrics per search — confirm in utils.
get_result(SEARCH_DICT, X_train, y_train, X_test, y_test)