# 1. Install & import lib

In [None]:
!pip install imblearn scikit-plot mlflow
!pip install eli5 shap 

Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Collecting mlflow
  Downloading mlflow-1.24.0-py3-none-any.whl (16.5 MB)
[K     |████████████████████████████████| 16.5 MB 213 kB/s 
Collecting databricks-cli>=0.8.7
  Downloading databricks-cli-0.16.4.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 7.8 MB/s 
Collecting gitpython>=2.1.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 53.2 MB/s 
[?25hCollecting docker>=4.0.0
  Downloading docker-5.0.3-py2.py3-none-any.whl (146 kB)
[K     |████████████████████████████████| 146 kB 55.6 MB/s 
Collecting alembic
  Downloading alembic-1.7.7-py3-none-any.whl (210 kB)
[K     |████████████████████████████████| 210 kB 62.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.6

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
# Later cells rely on this to show several shapes/objects from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import random

import joblib
import time, os
# Use Vietnam local time for timestamps produced with `time.strftime`
# (the saved model's file name is built from one).
os.environ['TZ'] = 'Asia/Ho_Chi_Minh'
time.tzset()  # re-reads TZ; NOTE(review): `time.tzset` is Unix-only — confirm if run outside Colab

# Auto-reload edited modules and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline 

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

from collections import Counter


from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.utils import resample

# from scikitplot.metrics import plot_roc
# from scikitplot.metrics import plot_precision_recall
# from scikitplot.metrics import plot_cumulative_gain
# from scikitplot.metrics import plot_lift_curve
import warnings
# NOTE(review): silences every warning for the rest of the session, including
# deprecation warnings raised by the pandas / scikit-learn calls further down.
warnings.filterwarnings('ignore')

In [None]:
import shap
import eli5
from eli5.sklearn import PermutationImportance

In [None]:
import sklearn
# Record the scikit-learn version used for this run.
sklearn.__version__

'1.0.2'

# 2. Data understanding & Data preparation

In [None]:
from google.colab import drive
# Mount Google Drive; every data file and the model output directory used below
# live under /content/drive.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:

# Merchant feature table used for training.
# `on_bad_lines="warn"` (pandas >= 1.3) replaces `error_bad_lines=False`, which is
# deprecated since pandas 1.3 and removed in 2.0: malformed CSV rows are skipped and
# reported instead of aborting the load.
MERCHANTDATA = pd.read_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/trainning/data_thang6.csv", on_bad_lines="warn")
# Column dictionary; `get_list_columns` reads ORG_SOURCE / USE_TO_MODEL / COLUMN_NAME from it.
MERCHANT_COLUMNS_PROPERTIES = pd.read_excel(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/column_description.xlsx")
# Merchants whose MERCHANT id marks a row of MERCHANTDATA as fraud when the labels are built.
df_fraud_06 = pd.read_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/MERCHANT_KHONG.csv")


def get_list_columns():
    """Return the names of the columns selected for modelling.

    Reads the module-level ``MERCHANT_COLUMNS_PROPERTIES`` table and keeps the rows
    whose ``ORG_SOURCE`` is one of the accepted sources and whose ``USE_TO_MODEL``
    flag equals ``"Y"``.

    Returns:
        list: the ``COLUMN_NAME`` values of the matching rows, in table order.
    """
    accepted_sources = ["CAD", "MIS_MRCH_DIM", "MIS_MR_CST_DIM", "MIS_CST_FCT", "MIS_MRCH_FCT", "EXT"]
    catalog = MERCHANT_COLUMNS_PROPERTIES
    from_accepted_source = catalog["ORG_SOURCE"].isin(accepted_sources)
    flagged_for_model = catalog["USE_TO_MODEL"] == "Y"
    return catalog.loc[from_accepted_source & flagged_for_model, "COLUMN_NAME"].tolist()
    

def process_data_type(df):
    """Expand the four date columns into numeric year/month/day features.

    ``DAY_OPEN``, ``DAY_CLOSE``, ``DAY_START`` and ``DAY_SUBM`` are parsed with
    ``pd.to_datetime``. For each of them the columns ``<NAME>_YEAR``,
    ``<NAME>_MONTH`` and ``<NAME>_DAY`` are appended, and the original date
    columns are dropped.

    Args:
        df: DataFrame containing the four date columns.

    Returns:
        A new DataFrame with the derived columns; the input frame is not modified.

    Raises:
        KeyError: if one of the date columns is missing from ``df``.
    """
    date_columns = ["DAY_OPEN", "DAY_CLOSE", "DAY_START", "DAY_SUBM"]

    # Work on a copy: callers pass slices of larger frames, and writing new columns
    # into those would mutate (or warn about) the caller's data.
    out = df.copy()

    for name in date_columns:
        parsed = pd.to_datetime(out[name])
        out[name + "_YEAR"] = parsed.dt.year
        out[name + "_MONTH"] = parsed.dt.month
        out[name + "_DAY"] = parsed.dt.day

    # `columns=` keyword: the positional axis form `drop(name, 1)` was removed in pandas 2.0.
    return out.drop(columns=date_columns)


# Reduce memory usage

In [None]:
def reduce_memory_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the saving.

    * object / pandas string columns are converted to ``category``;
    * signed and unsigned integer columns are cast to the narrowest integer type of
      the same signedness whose range contains the column's min and max (bounds
      inclusive);
    * float columns are cast to ``float16`` or ``float32`` when their min and max lie
      strictly inside that type's finite range;
    * every other dtype (bool, datetime, timedelta, category and other extension
      dtypes) is left untouched, so calling the function again on its own output
      is safe.

    Args:
        df: DataFrame to shrink. It is modified in place.
        use_float16: allow downcasting floats to ``float16`` (default, the historical
            behaviour). ``float16`` keeps only an 11-bit significand, so values are
            rounded noticeably; pass ``False`` to stop at ``float32``.

    Returns:
        The same ``df`` object, for call chaining.
    """
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if isinstance(col_type, np.dtype):
            kind = col_type.kind
        elif isinstance(col_type, pd.StringDtype):
            kind = 'O'  # treat pandas string columns like object columns
        else:
            # category, nullable and other extension dtypes: nothing to downcast
            continue

        if kind == 'O':
            df[col] = df[col].astype('category')
            continue
        if kind not in ('i', 'u', 'f'):
            # bool, datetime, timedelta, complex: casting them to float would corrupt them
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # empty or all-NaN column: no range to base a decision on
            continue

        target = None
        if kind == 'f':
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
        else:
            # Compare as Python ints so that checks against 64-bit limits cannot overflow.
            lowest, highest = int(c_min), int(c_max)
            candidates = signed_candidates if kind == 'i' else unsigned_candidates
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= lowest and highest <= limits.max:
                    target = candidate
                    break

        # Only ever shrink a column; never widen one that is already small.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

In [None]:
# Downcast MERCHANTDATA; the function modifies the frame it is given and returns it.
MERCHANTDATA = reduce_memory_usage(MERCHANTDATA)

Memory usage of dataframe is 168.3458251953125 MB
Memory usage of dataframe after reduction 74.88503074645996 MB
Reduced by 55.51714415277041 % 


# Drop columns with too many missing values

In [None]:
# Keep only the columns that have a non-missing value in at least half of the rows
# (`thresh` is the minimum number of non-NA values a column needs to be kept).
limitPer = len(MERCHANTDATA) * 0.50
MERCHANTDATA = MERCHANTDATA.dropna(thresh=limitPer, axis=1)

In [None]:
# Rows x columns remaining after the sparse columns were dropped.
MERCHANTDATA.shape

(57462, 382)

# Init dataset

In [None]:
# Label every row: merchants listed in `df_fraud_06` are fraud (1), all others non-fraud (0).
index_fraud = MERCHANTDATA['MERCHANT'].isin(df_fraud_06['MERCHANT'])
# `.assign` returns new frames, so nothing is written into a slice of MERCHANTDATA
# (the previous in-place assignment triggered SettingWithCopyWarning).
df_fraud = MERCHANTDATA[index_fraud].assign(MERCH_FR=1)  # FRAUD
df_non_fraud = MERCHANTDATA[~index_fraud].assign(MERCH_FR=0)

# `pd.concat` replaces `DataFrame.append` (deprecated in pandas 1.4, removed in 2.0);
# row order and index are the same as before: non-fraud rows first, then fraud rows.
df_org = pd.concat([df_non_fraud, df_fraud])

# Sanity check: the two parts must add up to the full table.
df_fraud.shape
df_non_fraud.shape
df_org.shape
MERCHANTDATA.shape

(752, 382)

(56710, 382)

(57462, 382)

(57462, 382)

In [None]:
# Restrict the labelled data to the columns flagged for modelling in the column
# dictionary, then replace the four date columns by year/month/day features.
list_col_name = get_list_columns()
df = df_org[list_col_name].copy()
df = process_data_type(df)


In [None]:
# Under-sample the majority class: keep a fixed-size random sample of non-fraud
# merchants (seeded for reproducibility) together with every fraud merchant.
numper_of_non_fr = 1500
df_tmp = df[df.MERCH_FR == 0]

df_sub = df_tmp.sample(numper_of_non_fr, random_state=42)
# `pd.concat` replaces `DataFrame.append` (deprecated in pandas 1.4, removed in 2.0).
df_sub = pd.concat([df_sub, df[df.MERCH_FR == 1]], ignore_index=True)
df_sub.shape

(2252, 292)

# Remove highly correlated features (correlation threshold)

In [None]:
def remove_collinear_features(x, threshold):
    '''
    Objective:
        Identify collinear features in a dataframe: a column is flagged when its
        absolute correlation with any column that precedes it in ``x`` is greater
        than or equal to the threshold. Removing collinear features can help a
        model to generalize and improves the interpretability of the model.
        The dataframe itself is NOT modified; the caller decides what to drop.

    Inputs:
        x: features dataframe (numeric columns)
        threshold: columns whose absolute correlation with an earlier column is
            greater than or equal to this value are flagged

    Output:
        list of the flagged column names, in the column order of ``x``
    '''

    # Calculate the correlation matrix
    corr_matrix = x.corr()
    abs_corr = corr_matrix.abs().to_numpy()

    # Look at every pair once: entry [j, i] with j < i compares column i against
    # an earlier column j, so the first column of a correlated group is kept.
    earlier_pairs = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    # NaN correlations compare False, so such pairs are never flagged.
    flagged = ((abs_corr >= threshold) & earlier_pairs).any(axis=0)

    # Column order (rather than set order) keeps the result deterministic between runs.
    drop_cols = [name for name, hit in zip(corr_matrix.columns, flagged) if hit]

    drops = set(drop_cols)
    print('Removed Columns {}'.format(drops))
    return drop_cols

In [None]:
# Split the columns by dtype.
# `reduce_memory_usage` converts object columns to 'category', so both dtypes are
# selected here; selecting only 'object' would miss the converted categorical columns.
object_cols = df_sub.select_dtypes(include=['object', 'category']).columns.to_list()
num_cols = df_sub.select_dtypes(include='number').columns.to_list()
# Numeric columns that are >= 0.9 correlated (in absolute value) with an earlier column.
reject_columns = remove_collinear_features(df_sub[num_cols], 0.9)
reject_columns

Removed Columns {'CST_TOT_ACR_INT_INCM_AMT_TDY_LCY', 'CST_TOT_SA_CLS_BAL_AMT_QTD_LCY', 'CST_TOT_LBY_CLS_BAL_AMT_QTD_LCY', 'CST_TOT_AST_CLS_BAL_AMT_HTD_LCY', 'CST_TOT_CA_CLS_BAL_AMT_HTD_LCY', 'CST_TOT_SA_CLS_BAL_AMT_YTD_LCY', 'CST_TOT_NBR_OF_CST_CR_QTD', 'CST_AV_LBY_BAL_AMT_R3MTH_LCY', 'CST_AV_LBY_BAL_AMT_MTD_LCY', 'CST_TOT_NBR_OF_OTHR_DB_YTD', 'CST_TOT_OTHR_DB_AMT_MTD', 'CST_AV_LBY_BAL_AMT_HTD_LCY', 'CST_TOT_CA_ACR_INT_AMT_TDY_LCY', 'CST_TOT_LOAN_CLS_BAL_AMT_LMTH_LCY', 'CST_TOT_AST_CLS_BAL_AMT_QTD_LCY', 'CST_TOT_AST_CLS_BAL_AMT_LQTR_LCY', 'CST_TOT_PASSIVE_TXN_CR_AMT_LTD_LCY', 'CST_TOT_NBR_OF_CST_TXN_QTD', 'CST_TOT_CA_ACR_INT_AMT_YTD_LCY', 'CST_TOT_CST_DB_AMT_MTD_LCY', 'CST_TOT_LOAN_CLS_BAL_AMT_LYR_LCY', 'CST_TOT_CARD_CLS_BAL_AMT_LYR_LCY', 'CST_TOT_NBR_OF_ACTV_TXN_DB_YTD', 'CST_NBR_OF_CLS_AC_YTD', 'CST_TOT_LBY_CLS_BAL_AMT_MTD_LCY', 'CST_TOT_CA_CLS_BAL_AMT_TDY_LCY', 'CST_TOT_NBR_OF_OTHR_CR_YTD', 'CST_TOT_CST_DB_AMT_LTD_LCY', 'FL_LIM2', 'CST_AV_LBY_BAL_AMT_QTD_LCY', 'CST_TOT_OTHR_DB_AMT_Y

['CST_TOT_ACR_INT_INCM_AMT_TDY_LCY',
 'CST_TOT_SA_CLS_BAL_AMT_QTD_LCY',
 'CST_TOT_LBY_CLS_BAL_AMT_QTD_LCY',
 'CST_TOT_AST_CLS_BAL_AMT_HTD_LCY',
 'CST_TOT_CA_CLS_BAL_AMT_HTD_LCY',
 'CST_TOT_SA_CLS_BAL_AMT_YTD_LCY',
 'CST_TOT_NBR_OF_CST_CR_QTD',
 'CST_AV_LBY_BAL_AMT_R3MTH_LCY',
 'CST_AV_LBY_BAL_AMT_MTD_LCY',
 'CST_TOT_NBR_OF_OTHR_DB_YTD',
 'CST_TOT_OTHR_DB_AMT_MTD',
 'CST_AV_LBY_BAL_AMT_HTD_LCY',
 'CST_TOT_CA_ACR_INT_AMT_TDY_LCY',
 'CST_TOT_LOAN_CLS_BAL_AMT_LMTH_LCY',
 'CST_TOT_AST_CLS_BAL_AMT_QTD_LCY',
 'CST_TOT_AST_CLS_BAL_AMT_LQTR_LCY',
 'CST_TOT_PASSIVE_TXN_CR_AMT_LTD_LCY',
 'CST_TOT_NBR_OF_CST_TXN_QTD',
 'CST_TOT_CA_ACR_INT_AMT_YTD_LCY',
 'CST_TOT_CST_DB_AMT_MTD_LCY',
 'CST_TOT_LOAN_CLS_BAL_AMT_LYR_LCY',
 'CST_TOT_CARD_CLS_BAL_AMT_LYR_LCY',
 'CST_TOT_NBR_OF_ACTV_TXN_DB_YTD',
 'CST_NBR_OF_CLS_AC_YTD',
 'CST_TOT_LBY_CLS_BAL_AMT_MTD_LCY',
 'CST_TOT_CA_CLS_BAL_AMT_TDY_LCY',
 'CST_TOT_NBR_OF_OTHR_CR_YTD',
 'CST_TOT_CST_DB_AMT_LTD_LCY',
 'FL_LIM2',
 'CST_AV_LBY_BAL_AMT_QTD_LCY',
 'CST_TOT

In [None]:
# Number of numeric columns flagged as collinear.
len(reject_columns)

176

In [None]:
# Columns excluded by hand. MERCH_FR is the label and must never be a feature.
# NOTE(review): MERCH_FR_TP looks label-derived as well — confirm against the column dictionary.
reject_manual_cols = ['CST_AGE_SEG', 'CPLANSTYLS', 'DBA_CITY', 'DBA_CNTRY', 'FEE_CODE', 'MERCH_FR', 'MERCH_FR_TP']
rejected_cols = reject_columns + reject_manual_cols
len(rejected_cols)

183

In [None]:
# Debug check of the container type; not needed by later cells.
type(reject_manual_cols)

list

In [None]:
# Final feature lists: everything not rejected above, split into numeric and categorical.
num_features = [col for col in num_cols if col not in rejected_cols]
cat_features = [col for col in object_cols if col not in rejected_cols]
# Categorical columns intended for ordinal encoding; the rest would be one-hot encoded.
cat_label_cols = ['CST_PERF_ST', 'CST_MKT_SEG']
cat_onehot_cols = [f for f in cat_features if f not in cat_label_cols]
features = num_features + cat_features

# Target and feature matrix. `x` is the same object as `x_raw`, not a second copy.
y = df_sub.MERCH_FR
x_raw = df_sub[features].copy()
x = x_raw
# x = x_raw.drop('MERCH_FR', axis=1)
# x = x[mst_features["Specs"].values]

In [None]:
# pd.DataFrame(num_features, columns = ['num_feature']).to_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/trainning/funix_num_feature_model_06_20210716-31032022.csv")
# pd.DataFrame(cat_features, columns = ['cat_feature']).to_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/trainning/funix_cat_feature_model_06_20210716-31032022.csv")
# pd.DataFrame(features, columns = ['feature']).to_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/trainning/funix_feature_model_06_20210716-31032022.csv")

In [None]:
# cat_label_cols + cat_onehot_cols

In [None]:
# Stratified three-way split:
#   1. hold out 15% of the rows as the test set;
#   2. keep the remaining 85% as the cross-validation pool (x_cv / y_cv);
#   3. split that pool again 75/25 into train and validation.
# x_val is therefore a subset of x_cv.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42, stratify=y)
x_cv, y_cv = x_train, y_train # data for cross validation 
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=42, stratify=y_train)

In [None]:
# 01. data summary: class counts for the train, validation, test and cross-validation sets
for split_labels in (y_train, y_val, y_test, y_cv):
    print(sorted(Counter(split_labels).items()))

[(0, 956), (1, 479)]
[(0, 319), (1, 160)]
[(0, 225), (1, 113)]
[(0, 1275), (1, 639)]


In [None]:
# Columns that still contain missing values in each split.
x_train.columns[x_train.isna().any()]
x_val.columns[x_val.isna().any()]
x_test.columns[x_test.isna().any()]

Index(['NUM_OF_TERM'], dtype='object')

Index(['NUM_OF_TERM'], dtype='object')

Index(['NUM_OF_TERM'], dtype='object')

In [None]:
# Share of rows with a missing NUM_OF_TERM in the cross-validation pool, train and test sets.
for split_frame in (x_cv, x_train, x_test):
    print(split_frame['NUM_OF_TERM'].isna().sum()/split_frame.shape[0])

0.109717868338558
0.10313588850174216
0.12130177514792899


# 3. Modeling

## 3.1. Create pipeline with resampling
Thực hiện preprocessing data ở đây, rồi apply cho val set, thực hiện kiểm tra trên test set 

In [None]:
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, StandardScaler, OneHotEncoder
from imblearn.pipeline import make_pipeline, Pipeline

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean
# import mlflow
# import mlflow.sklearn

numeric_features = num_features
# Numeric columns: fill missing values with a constant (SimpleImputer's default
# fill value for numeric data is 0), then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('scaler', StandardScaler())  # alternative: MinMaxScaler()
])

# Ordinal encoding for cat_label_cols = ['CST_PERF_ST', 'CST_MKT_SEG'].
# Defined for experimentation; not wired into the preprocessor below.
cat_ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing_val')),
    ('encoder', OrdinalEncoder())
])

# One-hot encoding for the remaining categorical columns (cat_onehot_cols).
# Defined for experimentation; not wired into the preprocessor below.
cat_onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing_val')),
    ('encoder', OneHotEncoder(handle_unknown = 'ignore', sparse=False))
])

# Over-sampling of the minority class. Seeded so that repeated runs generate the
# same synthetic samples. Alternative: ADASYN()
oversample = SMOTE(random_state=42)

# Model. Seeded so that the fitted forest (and every metric below) is reproducible.
classifier = RandomForestClassifier(random_state=42)

# Only the numeric features are used; every other column of the input is dropped.
preprocessor = ColumnTransformer(
    transformers=[
                  ('num', numeric_transformer, numeric_features),
                  #('cat_label', cat_ordinal_transformer, cat_label_cols)
                  #('cat_onehot', cat_onehot_transformer, ['CLOSE_RSN','CHIP_YN']) # ['CLOSE_RSN', 'COMM_FREQ']
                 ], remainder='drop')

# Full prediction pipeline: preprocess -> over-sample -> classify.
# It is an imblearn Pipeline, which accepts the sampler as an intermediate step.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('sampling', oversample),
                      ('classifier', classifier)])

cat_label_cols

['CST_PERF_ST', 'CST_MKT_SEG']

In [None]:
# Fit the preprocessor on its own (its repr is displayed), then fit the whole pipeline.
# The pipeline fit re-fits the same `preprocessor` object as its first step.
preprocessor.fit(x_train)
pipeline.fit(x_train, y_train)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='constant')),
                                                 ('scaler', StandardScaler())]),
                                 ['MER_TYPE', 'BRANCH', 'CURR_NUM',
                                  'DAY_REINSTATED', 'DAYS_AVGE', 'FL_LIM1',
                                  'FL_LIM3', 'FL_LIM4', 'GROSS_SALE', 'HO_RATE',
                                  'LAST_V_DAY', 'LAST_VOUCH', 'LST_COM_DY',
                                  'LST_DEP_DY', 'MP_IND', 'PAY_METHOD',
                                  'PAYMT_DAYS', 'TERM_FEE', 'TRAN_AVGE',
                                  'VOUCH_USED', 'WEEK_COUNT', 'WEEKLY_DAY',
                                  'NUM_OF_TERM', 'AVG_LBY_BAL_AMT_2YR_AGO',
                                  'AVG_LBY_BAL_AMT_3YR_AGO',
                                  'TOT_NET_INCM_2YR_AGO',
              

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['MER_TYPE', 'BRANCH',
                                                   'CURR_NUM', 'DAY_REINSTATED',
                                                   'DAYS_AVGE', 'FL_LIM1',
                                                   'FL_LIM3', 'FL_LIM4',
                                                   'GROSS_SALE', 'HO_RATE',
                                                   'LAST_V_DAY', 'LAST_VOUCH',
                                                   'LST_COM_DY', 'LST_DEP_DY',
                           

In [None]:
'''
cat_onehot_cols
#x_train['COMM_FREQ'].describe()
x_train['COMM_FREQ'].value_counts()
x_train[cat_onehot_cols].describe()
x_train['CLOSE_RSN'].value_counts()
'''

In [None]:
'''
# cat_onehot_cols
xx_train = x_train[['CLOSE_RSN', 'COMM_FREQ']].copy()
imputer = SimpleImputer(strategy='constant', fill_value='missing')
x_train_imputed = imputer.fit_transform(xx_train)
pd.DataFrame(x_train_imputed,columns=xx_train.columns).head()
# Encode training data
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
#encoder = LabelEncoder()
x_train_encoded = encoder.fit_transform(x_train_imputed)
#x_train_encoded
pd.DataFrame(x_train_encoded, columns=encoder.get_feature_names(xx_train.columns)).head()
'''

## 3.2. Base model

In [None]:
from sklearn.metrics import classification_report
# Baseline: the pipeline with default RandomForest settings, fitted on the training
# split and evaluated on the validation split.
pipeline.fit(x_train, y_train)
print(classification_report(y_val, pipeline.predict(x_val)))

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['MER_TYPE', 'BRANCH',
                                                   'CURR_NUM', 'DAY_REINSTATED',
                                                   'DAYS_AVGE', 'FL_LIM1',
                                                   'FL_LIM3', 'FL_LIM4',
                                                   'GROSS_SALE', 'HO_RATE',
                                                   'LAST_V_DAY', 'LAST_VOUCH',
                                                   'LST_COM_DY', 'LST_DEP_DY',
                           

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       319
           1       0.94      0.97      0.96       160

    accuracy                           0.97       479
   macro avg       0.96      0.97      0.97       479
weighted avg       0.97      0.97      0.97       479



## 3.3. Random grid to search for best hyperparameters

Define the parameter grid.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' was dropped: for classifiers it is the same setting as 'sqrt' (so the grid
# contained a duplicate) and it was removed in scikit-learn 1.3.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid; the `classifier__` prefix routes each parameter to the
# 'classifier' step of the pipeline.
random_grid = {'classifier__n_estimators': n_estimators,
               'classifier__max_features': max_features,
               'classifier__max_depth': max_depth,
               'classifier__min_samples_split': min_samples_split,
               'classifier__min_samples_leaf': min_samples_leaf,
               'classifier__bootstrap': bootstrap}

print(random_grid)

{'classifier__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'classifier__max_features': ['auto', 'sqrt'], 'classifier__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'classifier__min_samples_split': [2, 5, 10], 'classifier__min_samples_leaf': [1, 2, 4], 'classifier__bootstrap': [True, False]}


In [None]:
# Random search of parameters using repeated stratified 5-fold cross validation
# (3 repeats, i.e. 15 fits per candidate), 20 sampled combinations, all available cores.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
rf_random = RandomizedSearchCV(estimator = pipeline, param_distributions = random_grid, 
                               n_iter = 20, cv = cv, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model on the cross-validation pool (train + validation rows).
# rf_random.fit(x_train, y_train)
rf_random.fit(x_cv, y_cv)

Fitting 15 folds for each of 20 candidates, totalling 300 fits


RandomizedSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=42),
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='constant')),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['MER_TYPE',
                                                                                'BRANCH',
                                                                                'CURR_NUM',
                          

In [None]:
rf_random.best_params_
# The search fitted its best estimator on x_cv (train + validation rows); re-fitting
# on x_train alone keeps the validation rows out of the final model's training data.
# NOTE(review): the hyper-parameters were still chosen with folds drawn from x_cv,
# which contains x_val, so validation scores are not fully independent of the tuning.
best_random = rf_random.best_estimator_
best_random.fit(x_train, y_train)

{'classifier__bootstrap': False,
 'classifier__max_depth': 50,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 1,
 'classifier__min_samples_split': 10,
 'classifier__n_estimators': 1000}

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['MER_TYPE', 'BRANCH',
                                                   'CURR_NUM', 'DAY_REINSTATED',
                                                   'DAYS_AVGE', 'FL_LIM1',
                                                   'FL_LIM3', 'FL_LIM4',
                                                   'GROSS_SALE', 'HO_RATE',
                                                   'LAST_V_DAY', 'LAST_VOUCH',
                                                   'LST_COM_DY', 'LST_DEP_DY',
                           

In [None]:
# Validation report, then held-out test report, for the tuned pipeline.
# NOTE(review): both sets come from the under-sampled data (1500 non-fraud rows plus
# all fraud rows), so the class balance differs from the full merchant table and
# precision in particular should not be read as production precision.
print(classification_report(y_val, best_random.predict(x_val)))
print(classification_report(y_test, best_random.predict(x_test)))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       319
           1       0.95      0.97      0.96       160

    accuracy                           0.97       479
   macro avg       0.97      0.97      0.97       479
weighted avg       0.97      0.97      0.97       479

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       225
           1       0.96      0.98      0.97       113

    accuracy                           0.98       338
   macro avg       0.97      0.98      0.98       338
weighted avg       0.98      0.98      0.98       338



# 4. Check model: cross validation

In [None]:
# List the scorer names that can be passed as `scoring=`.
# NOTE(review): `SCORERS` matches the scikit-learn 1.0.2 recorded above; it was
# deprecated in later releases in favour of `sklearn.metrics.get_scorer_names()`.
from sklearn.metrics import SCORERS
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, cross_validate 
from numpy import mean

# Metrics collected per fold; each key appears in the result as 'test_<key>'.
# precision / recall / f1 use scikit-learn's default positive label (1 = fraud).
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

# Evaluate the tuned pipeline with repeated stratified 5-fold CV (3 repeats) on the
# cross-validation pool; each fold fits a fresh clone of the pipeline.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
scores = cross_validate(best_random, x_cv, y_cv, scoring=scoring, cv=cv, n_jobs=-1)
# scores = cross_val_score(pipeline, x_train, y_train, scoring=scoring, cv=cv, n_jobs=-1)


In [None]:
# Raw per-fold timings and metric arrays returned by cross_validate.
scores

{'fit_time': array([9.86289811, 9.85641599, 7.61891532, 7.6970489 , 8.06851625,
        7.72504854, 7.65173578, 7.69987249, 7.9744904 , 7.94438887,
        7.91097689, 7.83457661, 7.76003909, 7.65839386, 5.37069345]),
 'score_time': array([0.27973795, 0.28716063, 0.26268077, 0.28627062, 0.26365685,
        0.25285268, 0.24738431, 0.25354576, 0.25226331, 0.27209783,
        0.25836301, 0.27255034, 0.26620007, 0.26745534, 0.14916468]),
 'test_accuracy': array([0.96083551, 0.97911227, 0.9843342 , 0.97127937, 0.97120419,
        0.97650131, 0.97911227, 0.97911227, 0.97650131, 0.96335079,
        0.97650131, 0.98694517, 0.96083551, 0.97127937, 0.97643979]),
 'test_f1_score': array([0.94208494, 0.96875   , 0.97674419, 0.95785441, 0.95686275,
        0.96498054, 0.96946565, 0.968     , 0.96498054, 0.94573643,
        0.96470588, 0.98054475, 0.94252874, 0.95785441, 0.96498054]),
 'test_precision': array([0.93129771, 0.96875   , 0.96923077, 0.93984962, 0.953125  ,
        0.96124031, 0.94776119

In [None]:
# Average of each metric over all cross-validation folds.
for metric_key in ('test_accuracy', 'test_precision', 'test_recall', 'test_f1_score'):
    print(mean(scores[metric_key]))

0.9742229755899735
0.9546234558031693
0.9692175196850393
0.9617382511841067


In [None]:
# Persist the tuned pipeline under a timestamped file name so that earlier exports
# are not overwritten.
path = '/content/drive/MyDrive/BIDV/CARD_FRAUD/model/'
timestr = time.strftime("%Y%m%d-%H%M%S")
filename = f"{path}funix_finalized_model_06_{timestr}.sav"
joblib.dump(best_random, filename)

['/content/drive/MyDrive/BIDV/CARD_FRAUD/model/funix_finalized_model_06_20220329-163941.sav']

# 5. Check new data

In [None]:
# New data for an out-of-sample check: the list of fraud merchant ids (column MID)
# and the merchant feature table they are looked up in.
# `on_bad_lines="warn"` (pandas >= 1.3) replaces `error_bad_lines=False`, which is
# deprecated since pandas 1.3 and removed in 2.0: malformed rows are skipped and reported.
fraud_mids = pd.read_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/test_real_data/FRAUD_07.csv", on_bad_lines="warn")
fraud_data = pd.read_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/test_real_data/data_1207.csv", on_bad_lines="warn")

In [None]:
# Keep only the merchants that appear in the fraud list and label them as fraud.
index_fraud_mids = fraud_data['MERCHANT'].isin(fraud_mids['MID'])
# `.assign` returns a new frame instead of writing into a slice of `fraud_data`
# (the previous in-place assignment triggered SettingWithCopyWarning).
df_check = fraud_data[index_fraud_mids].assign(MERCH_FR=1)
df_check.shape

(216, 384)

In [None]:
# Apply the same column selection and date expansion as for the training data,
# then keep exactly the feature columns the model was trained on.
df_check = process_data_type(df_check[list_col_name])
df_fraud_new = df_check[features].copy()

In [None]:
# Predicted labels (1 = fraud) for the known-fraud merchants in the new data.
result = best_random.predict(df_fraud_new)

In [None]:
# Fraction of the known-fraud merchants that the model flags as fraud
# (all true labels are 1 here, so this is the recall on this set).
result.sum()/len(result)

0.8981481481481481

In [None]:
# Report on the known-fraud merchants only. Every true label is 1, so the class-0 row
# and the macro average carry no information; recall for class 1 is the figure to read.
print(classification_report(df_check['MERCH_FR'], best_random.predict(df_fraud_new)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.90      0.95       216

    accuracy                           0.90       216
   macro avg       0.50      0.45      0.47       216
weighted avg       1.00      0.90      0.95       216

