#### Use Case Background 

##### For financial services organizations, model development for fraud detection and for surfacing potential money laundering activity is an area of increasing interest.

##### Bespoke models may be used by banks to replace rules-based scenarios or other fraud detection activities.

##### This use case models bank account holder activity to determine the probability of a money laundering event.

##### (1) Develop a Python notebook to create the dataframe and set up the training, validation, and testing partitions.

##### Analytics base table (aml_bank_prep) has already gone through ETL process and is prepped for modeling.

##### Engines are built into Workbench to access and process data in external data sources.

In [1]:
########################
### Create Dataframe ###
########################

import pandas as pd
from pathlib import Path

# Folder and file name of the prepared analytics base table (CSV export).
workspace_dir = "/workspaces/chris_parrish/__fraud_detection_use_case__/"
data_table = "aml_bank_prep_synthetic.csv"

# Join folder and file name into one path, then load the table;
# header=0 means the first row of the file supplies the column names.
csv_path = Path(workspace_dir, data_table)
dm_inputdf = pd.read_csv(csv_path, header=0)

# Print the dtype of every column as a quick schema check.
print(dm_inputdf.dtypes)

account_id                       int64
num_transactions               float64
credit_score                   float64
marital_status_single            int64
marital_status_married           int64
marital_status_divorced          int64
analytic_partition               int64
ml_indicator                     int64
checking_only_indicator          int64
prior_ctr_indicator              int64
address_change_2x_indicator      int64
cross_border_trx_indicator       int64
in_person_contact_indicator      int64
linkedin_indicator               int64
atm_deposit_indicator            int64
trx_10ksum_indicator             int64
common_merchant_indicator        int64
direct_deposit_indicator         int64
citizenship_country_risk         int64
occupation_risk                  int64
num_acctbal_chgs_gt2000        float64
distance_to_employer           float64
distance_to_bank               float64
income                         float64
primary_transfer_cash            int64
primary_transfer_check   

In [2]:
########################
### Model Parameters ###
########################

### import python libraries
import numpy as np
from sklearn.utils import shuffle

### model manager information
# These values are not referenced by the cells shown in this part of the
# notebook; presumably they are consumed when the model is registered — TODO confirm.
# NOTE(review): the name/description/type say logistic regression, while the
# training cells fit gradient boosting classifiers — confirm which is intended.
metadata_output_dir = 'outputs'
model_name = 'logit_python_aml_bank_workbench'
project_name = 'Anti-Money Laundering'
description = 'Logistic Regression'
model_type = 'logistic_regression'
model_function = 'Classification'
predict_syntax = 'predict_proba'

### define macro variables for model
dm_dec_target = 'ml_indicator'            # target column
dm_partitionvar = 'analytic_partition'    # column holding the partition code
create_new_partition = 'no'               # 'yes' or 'no'
dm_key = 'account_id'                     # row identifier column
dm_classtarget_level = ['0', '1']         # target levels, as strings

# partition codes stored in dm_partitionvar
dm_partition_validate_val = 0
dm_partition_train_val = 1
dm_partition_test_val = 2

# share of rows per partition, used only when a new partition column is created
dm_partition_validate_perc = 0.3
dm_partition_train_perc = 0.6
dm_partition_test_perc = 0.1

### create list of regressors
# Columns kept as model inputs; rejected predictors are derived from this list
# (every column not named here), so no explicit rejected list is maintained.
keep_predictors = [
    'marital_status_single',
    'checking_only_indicator',
    'prior_ctr_indicator',
    'address_change_2x_indicator',
    'cross_border_trx_indicator',
    'in_person_contact_indicator',
    'linkedin_indicator',
    'citizenship_country_risk',
    'distance_to_employer',
    'distance_to_bank',
]

### mlflow
use_mlflow = 'no'  # 'yes' or 'no'
mlflow_run_to_use = 0
mlflow_class_labels = ['TENSOR']
mlflow_predict_syntax = 'predict'

### var to consider in bias assessment
bias_vars = ['marital_status_single']

### var to consider in partial dependency
pd_var1 = 'distance_to_employer'
pd_var2 = 'distance_to_bank'

### create partition column, if not already in dataset
if create_new_partition == 'yes':
    # Fixed seed so the same rows land in the same partition on every run.
    dm_partition_seed = 42
    dm_inputdf = shuffle(dm_inputdf, random_state=dm_partition_seed).reset_index(drop=True)

    # Row counts: validation rows come first, then training rows, and the
    # remainder is the test partition.
    n_rows = len(dm_inputdf)
    validate_rows = round(n_rows * dm_partition_validate_perc)
    train_rows = round(n_rows * dm_partition_train_perc) + validate_rows  # end position of the training rows
    test_rows = n_rows - train_rows

    # Start with every row marked as test, then overwrite the leading rows.
    # Positional (.iloc) slices are half-open, so the three ranges never overlap;
    # label-based .loc slices include both end points and would double-assign
    # the boundary rows.
    dm_inputdf[dm_partitionvar] = dm_partition_test_val
    partition_col = dm_inputdf.columns.get_loc(dm_partitionvar)
    dm_inputdf.iloc[:validate_rows, partition_col] = dm_partition_validate_val
    dm_inputdf.iloc[validate_rows:train_rows, partition_col] = dm_partition_train_val

In [3]:
##############################
### Final Modeling Columns ###
##############################

### create list of model variables
# Target, partition and key columns. Built as an explicit list so that a column
# name containing whitespace stays a single entry.
macro_vars = [dm_dec_target, dm_partitionvar, dm_key]

# Model inputs are the dataframe columns named in keep_predictors, in dataframe
# column order; every other column is rejected.
all_columns = list(dm_inputdf.columns.values)
dm_input = [col for col in all_columns if col in keep_predictors]
rejected_predictors = [col for col in all_columns if col not in keep_predictors]
rejected_vars = rejected_predictors # + macro_vars (include macro_vars if rejected_predictors are explicitly listed - not contra keep_predictors)
print(dm_input)

### create prediction variables
# One probability column name per target level ('P_' + target + level) and the
# name of the predicted-class column ('I_' + target).
dm_predictionvar = ['P_' + dm_dec_target + level for level in dm_classtarget_level[:2]]
dm_classtarget_intovar = 'I_' + dm_dec_target

##################
### Data Split ###
##################

### create train, test, validate datasets using existing partition column
# Each partition is selected by comparing the partition column with its code;
# X_* holds the model input columns and y_* the target column.
dm_traindf = dm_inputdf.loc[dm_inputdf[dm_partitionvar] == dm_partition_train_val]
X_train = dm_traindf.loc[:, dm_input]
y_train = dm_traindf[dm_dec_target]

dm_testdf = dm_inputdf.loc[dm_inputdf[dm_partitionvar] == dm_partition_test_val]
X_test = dm_testdf.loc[:, dm_input]
y_test = dm_testdf[dm_dec_target]

dm_validdf = dm_inputdf.loc[dm_inputdf[dm_partitionvar] == dm_partition_validate_val]
X_valid = dm_validdf.loc[:, dm_input]
y_valid = dm_validdf[dm_dec_target]

['marital_status_single', 'checking_only_indicator', 'prior_ctr_indicator', 'address_change_2x_indicator', 'cross_border_trx_indicator', 'in_person_contact_indicator', 'linkedin_indicator', 'citizenship_country_risk', 'distance_to_employer', 'distance_to_bank']


In [4]:
from time import time

In [5]:
# Show the (rows, columns) shape of the full input table before modeling.
print(dm_inputdf.shape)

(350000, 27)


##### (2) Train the model using sklearn and the sasviya.ml API

In [15]:
##############
### Python ###
##############

from sklearn.ensemble import GradientBoostingClassifier

### estimate & fit model
# Classifier with default settings.
dm_model = GradientBoostingClassifier()

# Time only the fit call.
start = time()
dm_model.fit(X_train, y_train)
finish = time()
time_to_complete = finish - start

# Report the model's score on each partition.
for score_label, X_part, y_part in (
    ('score_train:', X_train, y_train),
    ('score_test:', X_test, y_test),
    ('score_valid:', X_valid, y_valid),
):
    print(score_label, dm_model.score(X_part, y_part))

print("Time to complete model fit with Python:", time_to_complete)

score_train: 0.97523340310508
score_test: 0.9626098443512722
score_valid: 0.9728206256998211
Time to complete model fit with Python: 23.644811630249023


In [17]:
##################
### Python API ###
##################

from sasviya.ml.tree import GradientBoostingClassifier

### estimate & fit model
# Classifier from sasviya.ml with default settings; it is used through the same
# fit/score calls as the sklearn estimator.
dm_model = GradientBoostingClassifier()

# Time only the fit call.
start = time()
dm_model.fit(X_train, y_train)
finish = time()
time_to_complete = finish - start

# Report the model's score on each partition.
for score_label, X_part, y_part in (
    ('score_train:', X_train, y_train),
    ('score_test:', X_test, y_test),
    ('score_valid:', X_valid, y_valid),
):
    print(score_label, dm_model.score(X_part, y_part))

print("Time to complete model fit with Python API:", time_to_complete)

score_train: 0.9768803960377243
score_test: 0.9628108666934696
score_valid: 0.9726579322621087
Time to complete model fit with Python API: 1.1506941318511963
