#### Use Case Background 

##### For financial services organizations, model development for fraud detection and for surfacing potential money laundering activity is an area of increasing interest.

##### Bespoke models may be used by banks to replace rules-based scenarios or other fraud detection activities.

##### This use case models bank account holder activity to determine the probability of a money laundering event.

##### (1) Developing Python / Notebook to create dataframe and set up training and testing partitions.

##### Analytics base table (aml_bank_prep) has already gone through ETL process and is prepped for modeling.

##### Engines are built into Workbench to access and process data in external data sources.

In [15]:
########################
### Create Dataframe ###
########################

import pandas as pd
from pathlib import Path

# Workspace folder and file name of the prepared analytics base table.
workspace_dir = "/workspaces/chris_parrish/__fraud_detection_use_case__/"
data_table = "aml_bank_prep.csv"

# Read the CSV (row 0 supplies the column names) into the modeling dataframe
# and list the dtype pandas inferred for every column.
dm_inputdf = pd.read_csv(
    filepath_or_buffer=Path(workspace_dir).joinpath(data_table),
    header=0,
)
print(dm_inputdf.dtypes)

account_id                       int64
num_transactions               float64
credit_score                   float64
marital_status_single            int64
marital_status_married           int64
marital_status_divorced          int64
analytic_partition               int64
ml_indicator                     int64
checking_only_indicator          int64
prior_ctr_indicator              int64
address_change_2x_indicator      int64
cross_border_trx_indicator       int64
in_person_contact_indicator      int64
linkedin_indicator               int64
atm_deposit_indicator            int64
trx_10ksum_indicator             int64
common_merchant_indicator        int64
direct_deposit_indicator         int64
citizenship_country_risk         int64
occupation_risk                  int64
num_acctbal_chgs_gt2000        float64
distance_to_employer           float64
distance_to_bank               float64
income                         float64
primary_transfer_cash            int64
primary_transfer_check   

In [17]:
########################
### Model Parameters ###
########################

### import python libraries
import numpy as np
from sklearn.utils import shuffle

### model manager information
metadata_output_dir = "outputs"
model_name = "logit_python_aml_bank_workbench"
project_name = "Anti-Money Laundering"
description = "Logistic Regression"
model_type = "logistic_regression"
model_function = "Classification"
predict_syntax = "predict_proba"

### define macro variables for model
dm_dec_target = "ml_indicator"          # target column
dm_partitionvar = "analytic_partition"  # column holding the partition code
create_new_partition = "no"  # 'yes', 'no'
dm_key = "account_id"                   # row identifier column
dm_classtarget_level = ["0", "1"]

# codes stored in the partition column for each data split
dm_partition_validate_val = 0
dm_partition_train_val = 1
dm_partition_test_val = 2

# share of rows given to each split when a new partition is created
dm_partition_validate_perc = 0.3
dm_partition_train_perc = 0.6
dm_partition_test_perc = 0.1

### create list of regressors
keep_predictors = [
    "marital_status_single",
    "checking_only_indicator",
    "prior_ctr_indicator",
    "address_change_2x_indicator",
    "cross_border_trx_indicator",
    "in_person_contact_indicator",
    "linkedin_indicator",
    "citizenship_country_risk",
    "distance_to_employer",
    "distance_to_bank",
]
# (alternative: list rejected_predictors explicitly instead of keep_predictors)

### mlflow
use_mlflow = "no"  # 'yes', 'no'
mlflow_run_to_use = 0
mlflow_class_labels = ["TENSOR"]
mlflow_predict_syntax = "predict"

### var to consider in bias assessment
bias_vars = ["marital_status_single"]

### var to consider in partial dependency
pd_var1 = "distance_to_employer"
pd_var2 = "distance_to_bank"

### create partition column, if not already in dataset
if create_new_partition == "yes":
    # NOTE(review): shuffle() is called without random_state, so the row
    # order (and therefore the partition) differs from run to run.
    dm_inputdf = shuffle(dm_inputdf).reset_index(drop=True)

    n_rows = len(dm_inputdf)
    validate_rows = round(n_rows * dm_partition_validate_perc)
    train_rows = round(n_rows * dm_partition_train_perc) + validate_rows
    test_rows = n_rows - train_rows

    # .loc label slices include both end points, so neighbouring slices share
    # a boundary row; the later assignment wins, which leaves
    # [0, validate_rows) as validate, [validate_rows, train_rows) as train
    # and the remaining rows as test. Keep the three statements in this order.
    dm_inputdf.loc[0:validate_rows, dm_partitionvar] = dm_partition_validate_val
    dm_inputdf.loc[validate_rows:train_rows, dm_partitionvar] = dm_partition_train_val
    dm_inputdf.loc[train_rows:, dm_partitionvar] = dm_partition_test_val

In [18]:
##############################
### Final Modeling Columns ###
##############################

### create list of model variables
# macro_vars = target, partition and key columns; it is only needed when
# rejected_predictors is listed by hand rather than derived from keep_predictors.
macro_vars = " ".join([dm_dec_target, dm_partitionvar, dm_key]).split()

# Every dataframe column outside keep_predictors is rejected; the model inputs
# are the remaining columns, in dataframe column order.
all_columns = list(dm_inputdf.columns.values)
rejected_predictors = [col for col in all_columns if col not in keep_predictors]
rejected_vars = rejected_predictors
dm_input = [col for col in all_columns if col in keep_predictors]
print(dm_input)

### create prediction variables
# One "P_<target><level>" name per class level and one "I_<target>" name.
dm_predictionvar = [
    "P_" + dm_dec_target + level
    for level in (dm_classtarget_level[0], dm_classtarget_level[1])
]
dm_classtarget_intovar = "I_" + dm_dec_target

##################
### Data Split ###
##################

### create train, test, validate datasets using existing partition column
partition_codes = dm_inputdf[dm_partitionvar]

dm_traindf = dm_inputdf[partition_codes == dm_partition_train_val]
X_train, y_train = dm_traindf[dm_input], dm_traindf[dm_dec_target]

dm_testdf = dm_inputdf[partition_codes == dm_partition_test_val]
X_test, y_test = dm_testdf[dm_input], dm_testdf[dm_dec_target]

dm_validdf = dm_inputdf[partition_codes == dm_partition_validate_val]
X_valid, y_valid = dm_validdf[dm_input], dm_validdf[dm_dec_target]

['marital_status_single', 'checking_only_indicator', 'prior_ctr_indicator', 'address_change_2x_indicator', 'cross_border_trx_indicator', 'in_person_contact_indicator', 'linkedin_indicator', 'citizenship_country_risk', 'distance_to_employer', 'distance_to_bank']


##### (2) Train model using sklearn and the sasviya.ml API

In [10]:
##############################
### Training Code - Python ###
##############################

from sklearn.linear_model import LogisticRegression

### estimate & fit model
# scikit-learn logistic regression fitted on the training partition.
sklearn_logit_options = {
    'tol': 1e-8,
    'fit_intercept': True,
    'solver': 'newton-cg',
    'verbose': 0,
    'max_iter': 100,
}
dm_model = LogisticRegression(**sklearn_logit_options)
dm_model.fit(X_train, y_train)

# Report the classifier's score() on each partition, in train / test / valid order.
for _label, _features, _target in (
    ('score_train:', X_train, y_train),
    ('score_test:', X_test, y_test),
    ('score_valid:', X_valid, y_valid),
):
    print(_label, dm_model.score(_features, _target))

score_train: 0.9660878685467894
score_test: 0.9615384615384616
score_valid: 0.9638778839431368


In [None]:
####################
### Pickle Model ###
####################

import pickle

# Write the fitted model object to a pickle file in the workspace folder.
pickle_file = 'aml_bank_pickle.pkl'
dm_pklpath = Path(workspace_dir).joinpath(pickle_file)

with dm_pklpath.open(mode='wb') as pickle_out:
    pickle.dump(dm_model, pickle_out)

In [11]:
###############
### SAS API ###
###############

from sasviya.ml.linear_model import LogisticRegression

### estimate & fit model
# Same model fitted through the sasviya.ml LogisticRegression imported above.
sas_logit_options = {
    'tol': 1e-8,
    'fit_intercept': True,
    'solver': "newrap",
    'selection': None,
    'verbose': 0,
    'max_iter': None,
    'max_time': None,
}
dm_model = LogisticRegression(**sas_logit_options)
dm_model.fit(X_train, y_train)

# Report score() on each partition, in train / test / valid order.
# NOTE(review): presumably the same accuracy metric as scikit-learn's score() - confirm.
for _label, _features, _target in (
    ('score_train:', X_train, y_train),
    ('score_test:', X_test, y_test),
    ('score_valid:', X_valid, y_valid),
):
    print(_label, dm_model.score(_features, _target))

score_train: 0.9664374781493998
score_test: 0.9622377622377623
score_valid: 0.9636448380330925


In [13]:
#################################
### Save Model as Astore File ###
#################################

# Export the fitted model to an astore file in the workspace folder,
# replacing any file already there.
astore = "aml_bank_astore"
astore_path = Path(workspace_dir).joinpath(astore)
dm_model.export(file=astore_path, replace=True)

In [14]:
###########################
### Save Model in Table ###
###########################

# Save the fitted model under the workspace folder as a model table.
model_table = "aml_bank_model"
model_table_path = Path(workspace_dir).joinpath(model_table)
dm_model.save(model_table_path)

##### (4) Score deployed model

In [1]:
# CAS endpoint handed to swat.CAS(...) and base URL used for the REST calls.
hostname = "https://innovationlab.demo.sas.com/cas-shared-default-http"
session = "https://innovationlab.demo.sas.com/"

# Files holding the access token, the refresh token and the PEM certificate
# bundle for this environment.
token = "/workspaces/chris_parrish/_chris_demo/access_token.txt"
token_refresh = "/workspaces/chris_parrish/_chris_demo/refresh_token.txt"
token_pem = "/workspaces/chris_parrish/_chris_demo/innovationlab_demo_sas_com.pem"

In [3]:
import swat

# Read the access token from disk. The context manager closes the file handle
# (the bare open(...).read() left it open), and strip() drops a trailing
# newline that would otherwise become part of the credential.
with open(token, "r") as token_file:
    access_token = token_file.read().strip()

# Connect to CAS over HTTPS: the token is passed as the password and the PEM
# bundle is used as the CA list for the TLS connection.
conn = swat.CAS(
    hostname=hostname,
    username=None,
    password=access_token,
    ssl_ca_list=token_pem,
    protocol='https',
)
print(conn.serverstatus())

NOTE: Grid node action status report: 1 nodes, 9 total actions executed.
[About]

 {'CAS': 'Cloud Analytic Services',
  'CASCacheLocation': 'CAS Disk Cache',
  'CASHostAccountRequired': 'OPTIONAL',
  'Copyright': 'Copyright © 2014-2024 SAS Institute Inc. All Rights Reserved.',
  'ServerTime': '2024-10-08T21:08:09Z',
  'System': {'Hostname': 'controller.sas-cas-server-default.innovationlab.svc.cluster.local',
   'Linux Distribution': 'Red Hat Enterprise Linux release 8.10 (Ootpa)',
   'Model Number': 'x86_64',
   'OS Family': 'LIN X64',
   'OS Name': 'Linux',
   'OS Release': '5.15.0-1071-azure',
   'OS Version': '#80-Ubuntu SMP Tue Aug 6 19:27:32 UTC 2024'},
  'Transferred': 'NO',
  'Version': '4.00',
  'VersionLong': 'V.04.00M0P08122024',
  'Viya Release': '20240906.1725666769572',
  'Viya Version': 'Stable 2024.08',
  'license': {'expires': '13Sep2026:00:00:00',
   'gracePeriod': 0,
   'site': 'CIS CREATE INNOVATION LAB (ENTERPRISE-RISK-MRM)',
   'siteNum': 70180938,

[nodestatus]

 

In [4]:
###########################
### Select Target Model ###
###########################

from requests import request

# Name of the published scoring module to query.
model_name = 'logit_python_api_aml_bank_workbe' # all lower case

# Re-read the access token; the context manager closes the file handle and
# strip() removes a trailing newline, which is not valid inside an HTTP header.
with open(token, "r") as token_file:
    access_token = token_file.read().strip()

headers = {'Authorization': 'Bearer ' + access_token}

# `session` ends with '/', so trim it before appending the path to avoid a
# '//' in the request URL.
url = session.rstrip('/') + '/microanalyticScore/modules/' + model_name + '/steps'

# Verify the server certificate against the same PEM bundle the CAS connection
# uses for this host; verify=False would send the bearer token over a
# connection whose peer has not been authenticated.
r = request('GET', url, params={}, headers=headers, verify=token_pem)

# Last expression of the cell: the notebook displays the decoded JSON listing
# of the module's steps.
r.json()



{'links': [{'method': 'GET',
   'rel': 'collection',
   'href': '/microanalyticScore/modules/logit_python_api_aml_bank_workbe/steps',
   'uri': '/microanalyticScore/modules/logit_python_api_aml_bank_workbe/steps',
   'type': 'application/vnd.sas.collection'},
  {'method': 'GET',
   'rel': 'self',
   'href': '/microanalyticScore/modules/logit_python_api_aml_bank_workbe/steps?start=0&limit=20',
   'uri': '/microanalyticScore/modules/logit_python_api_aml_bank_workbe/steps?start=0&limit=20',
   'type': 'application/vnd.sas.collection'},
  {'method': 'POST',
   'rel': '/microanalyticScore/modules/logit_python_api_aml_bank_workbe/steps',
   'href': '/microanalyticScore/modules/logit_python_api_aml_bank_workbe/steps',
   'uri': '/microanalyticScore/modules/logit_python_api_aml_bank_workbe/steps',
   'type': 'application/vnd.sas.microanalytic.module.definition',
   'responseType': 'application/vnd.sas.microanalytic.module'}],
 'name': 'steps',
 'accept': 'application/vnd.sas.microanalytic.modu

In [5]:
##########################
### Score Target Model ###
##########################

import json

# Input values for the single record to score, keyed by the predictor names
# used to train the model.
score_inputs = {
    "marital_status_single": 0,
    "checking_only_indicator": 0,
    "prior_ctr_indicator": 1,
    "address_change_2x_indicator": 1,
    "cross_border_trx_indicator": 1,
    "in_person_contact_indicator": 0,
    "linkedin_indicator": 0,
    "citizenship_country_risk": 0,
    "distance_to_employer": -0.091215699,
    "distance_to_bank": -0.895717584,
}

# Serialize with json.dumps rather than a hand-written, backslash-continued
# string so the payload is always well-formed JSON.
data = json.dumps(
    {"inputs": [{"name": name, "value": value} for name, value in score_inputs.items()]}
)
headers = {'Content-Type': 'application/vnd.sas.microanalytic.module.step.input+json',
           'Authorization': 'Bearer ' + access_token}

# `session` ends with '/', so trim it before appending the path to avoid a
# '//' in the request URL.
url = session.rstrip('/') + '/microanalyticScore/modules/' + model_name + '/steps/score'

# Verify the server certificate against the PEM bundle for this host instead
# of disabling TLS verification while sending the bearer token.
r = request('POST', url, data=data, headers=headers, verify=token_pem)

# Fail with a clear HTTP error instead of a KeyError/IndexError further down
# when the service rejects the request.
r.raise_for_status()

# NOTE(review): outputs[1] is presumably the probability of the positive
# class - confirm against the module's step definition.
score_val = r.json()['outputs'][1]['value']
score_val = f"{score_val:.2%}"


print('')
print('****************************************************************************')
print("This customer has a " + str(score_val) + " probability of a money laundering event ")
print('****************************************************************************')


****************************************************************************
This customer has a 11.86% probability of a money laundering event 
****************************************************************************


