In [1]:
#####################################################
###       Train & Publish SAS gbTree Model        ###
#####################################################

###################
### Credentials ###
###################

import os
import sys
from pathlib import Path

sys.path.append('C:/Users/chparr/OneDrive - SAS/credentials')
from credentials import hostname, session, port, protocol, wd, output_dir, git_dir, token_dir, token, token_refresh, token_pem

In [2]:
#############################
### Connect with SAS Viya ###
#############################

import swat

access_token = open(token, "r").read()
conn =  swat.CAS(hostname=hostname, username=None, password=access_token, ssl_ca_list=token_pem, protocol=protocol)
print(conn.serverstatus())

NOTE: Grid node action status report: 1 nodes, 9 total actions executed.
[About]

 {'CAS': 'Cloud Analytic Services',
  'CASCacheLocation': 'CAS Disk Cache',
  'CASHostAccountRequired': 'OPTIONAL',
  'Copyright': 'Copyright © 2014-2024 SAS Institute Inc. All Rights Reserved.',
  'ServerTime': '2024-07-30T03:02:12Z',
  'System': {'Hostname': 'controller.sas-cas-server-default.innovationlab.svc.cluster.local',
   'Linux Distribution': 'Red Hat Enterprise Linux release 8.10 (Ootpa)',
   'Model Number': 'x86_64',
   'OS Family': 'LIN X64',
   'OS Name': 'Linux',
   'OS Release': '5.15.0-1064-azure',
   'OS Version': '#73-Ubuntu SMP Tue Apr 30 14:24:24 UTC 2024'},
  'Transferred': 'NO',
  'Version': '4.00',
  'VersionLong': 'V.04.00M0P06172024',
  'Viya Release': '20240717.1721252687211',
  'Viya Version': 'Stable 2024.06',
  'license': {'expires': '21Sep2024:00:00:00',
   'gracePeriod': 0,
   'site': 'CIS SSEMONTHLY INNOVATION ENTERPRISE-RISK-MRM',
   'siteNum': 70180938,

[nodestatus]

 N

In [13]:
#############################
### Identify Table in CAS ###
#############################

### caslib and table to use in modeling
caslib = 'casuser'
in_mem_tbl = 'AML_BANK_PREP'

### load table in-memory if not already exists in-memory
if conn.table.tableExists(caslib=caslib, name=in_mem_tbl).exists<=0:
    conn.table.loadTable(caslib=caslib, path=str(in_mem_tbl+str('.sashdat')), 
                         casout={'name':in_mem_tbl, 'caslib':caslib, 'promote':True})

### show table to verify
conn.table.tableInfo(caslib=caslib, wildIgnore=False, name=in_mem_tbl)

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime,TableRedistUpPolicy
0,AML_BANK_PREP,14302,27,0,utf-8,2024-07-29T18:20:15+00:00,2024-07-29T18:20:15+00:00,2024-07-30T03:50:10+00:00,UTF8,2037896000.0,0,0,AML_BANK_PREP.sashdat,CASUSER(Chris.Parrish@sas.com),0,Chris.Parrish@sas.com,,2024-04-16T21:17:04+00:00,2028921000.0,Not Specified


In [4]:
########################
### Create Dataframe ###
########################

dm_inputdf =  conn.CASTable(in_mem_tbl, caslib=caslib)

### print columns for review of model parameters
conn.table.columnInfo(table={"caslib":caslib, "name":in_mem_tbl})

Unnamed: 0,Column,Label,ID,Type,RawLength,FormattedLength,Format,NFL,NFD
0,account_id,unique identifier,1,double,8,12,,0,0
1,num_transactions,number of transactions financial services cust...,2,double,8,12,,0,0
2,credit_score,customer credit score,3,double,8,12,,0,0
3,marital_status_single,,4,double,8,12,,0,0
4,marital_status_married,,5,double,8,12,,0,0
5,marital_status_divorced,,6,double,8,12,,0,0
6,analytic_partition,,7,double,8,12,,0,0
7,ml_indicator,"indicator for money laundering event no (0), y...",8,double,8,12,,0,0
8,checking_only_indicator,indicator for whether customer only has a chec...,9,double,8,12,,0,0
9,prior_ctr_indicator,indicator for whether the customer has a curre...,10,double,8,12,,0,0


In [5]:
########################
### Model Parameters ###
########################

### import packages
conn.loadactionset('decisionTree')
conn.loadactionset('astore')
conn.loadactionset('explainModel')
conn.loadactionset('fairAITools')
conn.loadactionset('percentile')
conn.loadactionset('modelPublishing')

### model arugments
m=20
seed=12345
nTree=100
learningRate=0.1
subSampleRate=0.5
lasso=0
ridge=1
distribution="binary"
maxBranch=2
maxLevel=5
leafSize=5
missing="useinsearch"
minUseInSearch=1
nBins=50
quantileBin=True

early_stop_params = dict(
    metric="MCR",
    stagnation=5,
    tolerance=0,
    minimum=False,
    threshold=0,
    thresholdIter=0
    )

### model manager information
model_name = 'gbtree_python_api_aml_bank'
project_name = 'Anti-Money Laundering'
description = 'gbtree_python_api'
model_type = 'gradient_boost'

### define macro variables for model
dm_dec_target = 'ml_indicator'
dm_partitionvar = 'analytic_partition' 
dm_key = 'account_id' 
dm_classtarget_level = ['0', '1']
dm_partition_validate_val, dm_partition_train_val, dm_partition_test_val = [0, 1, 2]

### create list of regressors
keep_predictors = [
    'marital_status_single',
    'checking_only_indicator',
    'prior_ctr_indicator',
    'address_change_2x_indicator',
    'cross_border_trx_indicator',
    'in_person_contact_indicator',
    'linkedin_indicator',
    'citizenship_country_risk',
    'distance_to_employer',
    'distance_to_bank'
    ]
#rejected_predictors = []

### var to consider in bias assessment
bias_vars = ['marital_status_single']

### var to consider in partial dependency
pd_var1 = 'distance_to_employer'
pd_var2 = 'distance_to_bank'

NOTE: Added action set 'decisionTree'.
NOTE: Added action set 'astore'.
NOTE: Added action set 'explainModel'.
NOTE: Added action set 'fairAITools'.
NOTE: Added action set 'percentile'.
NOTE: Added action set 'modelPublishing'.


In [9]:
##############################
### Final Modeling Columns ###
##############################

### create list of model variables
dm_input = list(dm_inputdf.columns.values)
macro_vars = (dm_dec_target + ' ' + dm_partitionvar + ' ' + dm_key).split()
rejected_predictors = [i for i in dm_input if i not in keep_predictors]
rejected_vars = rejected_predictors # + macro_vars (include macro_vars if rejected_predictors are explicitly listed - not contra keep_predictors)
for i in rejected_vars:
    dm_input.remove(i)
print(dm_input)

### create prediction variables
dm_predictionvar = [str('P_') + dm_dec_target + dm_classtarget_level[0], str('P_') + dm_dec_target + dm_classtarget_level[1]]
dm_classtarget_intovar = str('I_') + dm_dec_target

### create partition objects
train_part = str(dm_partitionvar)+str('=')+str(dm_partition_train_val)
test_part = str(dm_partitionvar)+str('=')+str(dm_partition_test_val)
valid_part = str(dm_partitionvar)+str('=')+str(dm_partition_validate_val)

['marital_status_single', 'checking_only_indicator', 'prior_ctr_indicator', 'address_change_2x_indicator', 'cross_border_trx_indicator', 'in_person_contact_indicator', 'linkedin_indicator', 'citizenship_country_risk', 'distance_to_employer', 'distance_to_bank']


In [14]:
#####################
### Training Code ###
#####################

### create names of tables for action set
astore_tbl = str(in_mem_tbl+str('_astore'))
cas_score_tbl = str(in_mem_tbl+str('_score'))
cas_out_tbl = str(in_mem_tbl+str('_model'))

### estimate & fit model
dm_model = conn.decisionTree.gbtreeTrain(
    earlyStop=early_stop_params,
    table=dict(caslib=caslib, name=in_mem_tbl, where=train_part),
    target=dm_dec_target,
    nominal=dm_dec_target,
    inputs=dm_input,
    encodeName=True,
    casOut=dict(caslib=caslib, name=cas_out_tbl, replace=True),
    saveState=dict(caslib=caslib, name=astore_tbl, replace=True),
    m=m, seed=seed, nTree=nTree, learningRate=learningRate, subSampleRate=subSampleRate, 
    lasso=lasso, ridge=ridge, distribution=distribution, maxBranch=maxBranch, 
    maxLevel=maxLevel, leafSize=leafSize, missing=missing, minUseInSearch=minUseInSearch, 
    nBins=nBins, quantileBin=quantileBin 
    )

NOTE: 139802 bytes were written to the table "AML_BANK_PREP_astore" in the caslib "CASUSER(Chris.Parrish@sas.com)".


In [15]:
##################
### Score Code ###
##################

### score full data
conn.decisionTree.dtreeScore(
    modelTable=dict(caslib=caslib, name=cas_out_tbl),
    table=dict(caslib=caslib, name=in_mem_tbl), 
    copyvars=[dm_dec_target, dm_partitionvar],
    casout=dict(caslib=caslib, name=cas_score_tbl, replace=True),
    encodeName=True,
    assessOneRow=True
    )

### create score code
conn.decisionTree.gbtreeCode(
  modelTable=dict(caslib=caslib, name=cas_out_tbl),
  code=dict(casOut=dict(caslib=caslib, name='gbtree_scorecode', replace=True, promote=False))
  )

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(Chris.Parrish@sas.com),gbtree_scorecode,1,10,"CASTable('gbtree_scorecode', caslib='CASUSER(C..."


In [18]:
####################
### Assess Model ###
####################

conn.percentile.assess(
  table=dict(caslib=caslib, name=cas_score_tbl),
  event="1",
  response=dm_dec_target,
  inputs=dm_predictionvar[1],
  cutStep=0.0001,
  casOut=dict(caslib=caslib, name='gbtree_python_assess', replace=True)
  )

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(Chris.Parrish@sas.com),gbtree_python_assess,20,21,"CASTable('gbtree_python_assess', caslib='CASUS..."
1,CASUSER(Chris.Parrish@sas.com),gbtree_python_assess_ROC,10000,22,"CASTable('gbtree_python_assess_ROC', caslib='C..."


In [21]:
###################
### Assess Bias ###
###################

conn.fairAITools.assessBias(
		table = dict(caslib=caslib, name=in_mem_tbl),
		modelTable = dict(caslib=caslib, name=astore_tbl),
		modelTableType = "ASTORE",
		response = dm_dec_target,
		predictedVariables = dm_predictionvar,
		responseLevels = dm_classtarget_level,
		sensitiveVariable = bias_vars[0]
        )

NOTE: The option event is not specified. Response level '0' is used as an event.


Unnamed: 0,Metric,MetricLabel,Value,Base,Compare,Note
0,DemographicParity,Demographic Parity (Statistical Parity),0.136925,0.0,1.0,
1,PredictiveParity,Predictive Parity,0.144532,0.0,1.0,
2,EqualAccuracy,Equal Accuracy,0.051136,0.0,1.0,
3,EqualizedOdds,Equalized Odds,0.360057,0.0,1.0,The maximum FPR difference is greater than the...
4,EqualOpportunity,Equal Opportunity,0.022626,0.0,1.0,

Unnamed: 0,Group,N,TP,FP,TN,FN,TPR,FPR,TNR,FNR,cutoffKS,GAIN,LIFT,RESP,CUMRESP,CUMLIFT,INTO_EVENT,PREDICTED_EVENT,P_ml_indicator0,P_ml_indicator1
0,0.0,11260.0,11037.0,125.0,77.0,21.0,0.998101,0.618812,0.381188,0.001899,0.379289,0.018267,1.018267,5.091337,10.182673,1.018267,0.991297,0.981388,0.981388,0.018612
1,1.0,3042.0,2466.0,133.0,381.0,62.0,0.975475,0.258755,0.741245,0.024525,0.71672,0.210443,1.210443,6.052215,12.10443,1.210443,0.854372,0.836856,0.836856,0.163144

Unnamed: 0,Metric,MetricLabel,Value,Base,Compare
0,P_ml_indicator0,Average Predicted: ML_INDICATOR=0,0.144532,0.0,1.0
1,P_ml_indicator1,Average Predicted: ML_INDICATOR=1,0.144532,1.0,0.0
2,TPR,True Positive Rate,0.022626,0.0,1.0
3,FPR,False Positive Rate,0.360057,0.0,1.0
4,TNR,True Negative Rate,0.360057,1.0,0.0
5,FNR,False Negative Rate,0.022626,1.0,0.0
6,FDR,False Discovery Rate,0.039975,1.0,0.0
7,ACC,Accuracy,0.051136,0.0,1.0
8,C,Area under ROC,0.079008,1.0,0.0
9,F1,F1 Score,0.031463,0.0,1.0


In [23]:
import pandas as pd

### score full data
conn.astore.score(
    table={"name":in_mem_tbl, "caslib":caslib}, 
    copyvars=[dm_dec_target, dm_partitionvar],
    casout={"name":cas_score_tbl, "replace":True},
    rstore={"name":astore_tbl, "caslib":caslib})
score_astore = conn.CASTable(cas_score_tbl)

### create tables with predicted values
dm_scoreddf = conn.CASTable(score_astore).to_frame()
dm_scoreddf[dm_dec_target] = dm_scoreddf[dm_dec_target].astype(int)
trainData = dm_scoreddf[dm_scoreddf[dm_partitionvar]==dm_partition_train_val][[dm_dec_target, dm_predictionvar[1]]].rename(columns=lambda x:'0')
testData = dm_scoreddf[dm_scoreddf[dm_partitionvar]==dm_partition_test_val][[dm_dec_target, dm_predictionvar[1]]].rename(columns=lambda x:'0')
validData = dm_scoreddf[dm_scoreddf[dm_partitionvar]==dm_partition_validate_val][[dm_dec_target, dm_predictionvar[1]]].rename(columns=lambda x:'0')
trainData = pd.DataFrame(trainData)
testData = pd.DataFrame(testData)
validData = pd.DataFrame(validData)

### print model & results
print(dm_model)
conn.table.tableInfo(caslib=caslib, wildIgnore=False, name=astore_tbl)
print(conn.astore.describe(rstore={"name":astore_tbl, "caslib":caslib}, epcode=True).Description)
print(conn.astore.describe(rstore={"name":astore_tbl, "caslib":caslib}, epcode=True).InputVariables)
print(conn.astore.describe(rstore={"name":astore_tbl, "caslib":caslib}, epcode=True).OutputVariables)
print(conn.astore.describe(rstore={"name":astore_tbl, "caslib":caslib}, epcode=True).epcode)
model_astore = conn.CASTable(astore_tbl, caslib=caslib)

[EncodedName]

         LEVNAME  LEVINDEX          VARNAME
 0             1         0  P_ml_indicator1
 1             0         1  P_ml_indicator0

[EncodedTargetName]

   LEVNAME  LEVINDEX         VARNAME
 0                 0  I_ml_indicator

[ModelInfo]

 Gradient Boosting Tree for AML_BANK_PREP
 
                                Descr     Value
 0                    Number of Trees    100.00
 1                       Distribution      2.00
 2                      Learning Rate      0.10
 3                   Subsampling Rate      0.50
 4   Number of Selected Variables (M)     10.00
 5                     Number of Bins     50.00
 6                Number of Variables     10.00
 7           Max Number of Tree Nodes     31.00
 8           Min Number of Tree Nodes     17.00
 9             Max Number of Branches      2.00
 10            Min Number of Branches      2.00
 11              Max Number of Levels      5.00
 12              Min Number of Levels      5.00
 13              Max Number

In [26]:
from sasctl import Session
sess = Session(hostname=session, token=access_token, client_secret='access_token')

In [None]:
#######################################
### Register Model in Model Manager ###
#######################################

from sasctl import register_model, publish_model

### create session in cas
access_token = open(token, "r").read()

with sess:
    model = register_model(model=model_astore, name=model_name, project=project_name, version='latest')