##### Notes for installing the SAS Python packages
###### In SageMaker, open a terminal in Jupyter or VS Code
###### Install numactl at the prompt: `sudo apt-get install -y numactl`
###### `pip install swat`
###### `pip install sasctl`

In [None]:
### delete in SageMaker - firewall is preventing connection ###

import os
import sys
from pathlib import Path

### Ask for the folder that holds credentials.py and make it importable.
filepath = input("file path to credentials: ")
sys.path.append(filepath)
from credentials import hostname, session, port, protocol, wd, output_dir, git_dir, token_dir, token, token_refresh, token_pem

import swat

### Read the access token inside a context manager so the file handle is
### closed as soon as the text has been read.
with open(token, "r") as token_file:
    access_token = token_file.read()

### The token is passed as the password; no username is supplied.
conn = swat.CAS(hostname=hostname, username=None, password=access_token, ssl_ca_list=token_pem, protocol=protocol)
print(conn.serverstatus())

NOTE: Grid node action status report: 1 nodes, 9 total actions executed.
[About]

 {'CAS': 'Cloud Analytic Services',
  'CASCacheLocation': 'CAS Disk Cache',
  'CASHostAccountRequired': 'OPTIONAL',
  'Copyright': 'Copyright © 2014-2024 SAS Institute Inc. All Rights Reserved.',
  'ServerTime': '2024-07-25T18:53:24Z',
  'System': {'Hostname': 'controller.sas-cas-server-default.innovationlab.svc.cluster.local',
   'Linux Distribution': 'Red Hat Enterprise Linux release 8.10 (Ootpa)',
   'Model Number': 'x86_64',
   'OS Family': 'LIN X64',
   'OS Name': 'Linux',
   'OS Release': '5.15.0-1064-azure',
   'OS Version': '#73-Ubuntu SMP Tue Apr 30 14:24:24 UTC 2024'},
  'Transferred': 'NO',
  'Version': '4.00',
  'VersionLong': 'V.04.00M0P06172024',
  'Viya Release': '20240717.1721252687211',
  'Viya Version': 'Stable 2024.06',
  'license': {'expires': '21Sep2024:00:00:00',
   'gracePeriod': 0,
   'site': 'CIS SSEMONTHLY INNOVATION ENTERPRISE-RISK-MRM',
   'siteNum': 70180938,

[nodestatus]

 N

In [31]:
### delete in SageMaker - firewall is preventing connection ###

### caslib and in-memory table used for modeling
caslib = 'casuser'
in_mem_tbl = 'AML_BANK_PREP'

### load and promote the table only when tableExists reports exists <= 0
if conn.table.tableExists(caslib=caslib, name=in_mem_tbl).exists <= 0:
    conn.table.loadTable(
        caslib=caslib,
        path=in_mem_tbl + '.sashdat',
        casout={'name': in_mem_tbl, 'caslib': caslib, 'promote': True},
    )

### show table to verify -- printed explicitly, because a bare expression in
### the middle of a cell is evaluated and then discarded without being shown
print(conn.table.tableInfo(caslib=caslib, wildIgnore=False, name=in_mem_tbl))

### pull the CAS table into a local dataframe
dm_inputdf = conn.CASTable(in_mem_tbl, caslib=caslib).to_frame()

### print columns for review of model parameters
print(dm_inputdf.dtypes)

account_id                     float64
num_transactions               float64
credit_score                   float64
marital_status_single          float64
marital_status_married         float64
marital_status_divorced        float64
analytic_partition             float64
ml_indicator                   float64
checking_only_indicator        float64
prior_ctr_indicator            float64
address_change_2x_indicator    float64
cross_border_trx_indicator     float64
in_person_contact_indicator    float64
linkedin_indicator             float64
atm_deposit_indicator          float64
trx_10ksum_indicator           float64
common_merchant_indicator      float64
direct_deposit_indicator       float64
citizenship_country_risk       float64
occupation_risk                float64
num_acctbal_chgs_gt2000        float64
distance_to_employer           float64
distance_to_bank               float64
income                         float64
primary_transfer_cash          float64
primary_transfer_check   

In [1]:
########################
### Create Dataframe ###
########################

import pandas as pd
from pathlib import Path

### folder and file name of the modeling extract
workspace_dir = "/home/sagemaker-user"
data_table = "aml_bank_prep.csv"

### read the csv; the first row supplies the column names
csv_path = Path(workspace_dir).joinpath(data_table)
dm_inputdf = pd.read_csv(csv_path, header=0)

### print column types for review
print(dm_inputdf.dtypes)

account_id                       int64
num_transactions               float64
credit_score                   float64
marital_status_single            int64
marital_status_married           int64
marital_status_divorced          int64
analytic_partition               int64
ml_indicator                     int64
checking_only_indicator          int64
prior_ctr_indicator              int64
address_change_2x_indicator      int64
cross_border_trx_indicator       int64
in_person_contact_indicator      int64
linkedin_indicator               int64
atm_deposit_indicator            int64
trx_10ksum_indicator             int64
common_merchant_indicator        int64
direct_deposit_indicator         int64
citizenship_country_risk         int64
occupation_risk                  int64
num_acctbal_chgs_gt2000        float64
distance_to_employer           float64
distance_to_bank               float64
income                         float64
primary_transfer_cash            int64
primary_transfer_check   

In [42]:
########################
### Model Parameters ###
########################

### import python libraries
import numpy as np
from sklearn.utils import shuffle

### model manager information
metadata_output_dir = "outputs"
model_name = "logit_python_aml_bank_sagemaker"
project_name = "Anti-Money Laundering"
description = "Logistic Regression"
model_type = "logistic_regression"
model_function = "Classification"
predict_syntax = "predict_proba"
username = "sagemaker_user"
table_name = "aml_bank_prep"

### target, partition and key columns
dm_dec_target = "ml_indicator"
dm_partitionvar = "analytic_partition"
create_new_partition = "no"  # "yes" or "no"
dm_key = "account_id"

### target levels and the names of the prediction outputs
dm_classtarget_level = ["0", "1"]
prediction_labels = ["P_ml_indicator0", "P_ml_indicator1", "I_ml_indicator"]
event_prob_var = ["1", "0"]

### partition codes and the share of rows given to each partition
dm_partition_validate_val = 0
dm_partition_train_val = 1
dm_partition_test_val = 2
dm_partition_validate_perc = 0.3
dm_partition_train_perc = 0.6
dm_partition_test_perc = 0.1

### regressors kept in the model
keep_predictors = [
    "marital_status_single",
    "checking_only_indicator",
    "prior_ctr_indicator",
    "address_change_2x_indicator",
    "cross_border_trx_indicator",
    "in_person_contact_indicator",
    "linkedin_indicator",
    "citizenship_country_risk",
    "distance_to_employer",
    "distance_to_bank",
]
# rejected_predictors = []

### mlflow
use_mlflow = "no"  # "yes" or "no"
mlflow_run_to_use = 0
mlflow_class_labels = ["TENSOR"]
mlflow_predict_syntax = "predict"

### variables considered in bias assessment
bias_vars = ["marital_status_single"]

### variables considered in partial dependency
pd_var1 = "distance_to_employer"
pd_var2 = "distance_to_bank"

### create partition column, if not already in dataset
if create_new_partition == 'yes':
    # Fixed seed so the shuffle, and therefore the split, is the same on
    # every run of the notebook.
    dm_inputdf = shuffle(dm_inputdf, random_state=12345).reset_index(drop=True)
    total_rows = len(dm_inputdf)
    validate_rows = round(total_rows * dm_partition_validate_perc)
    # train_rows is cumulative: the position just past the last train row
    train_rows = round(total_rows * dm_partition_train_perc) + validate_rows
    test_rows = total_rows - train_rows
    # Assign partitions by half-open position ranges
    #   [0, validate_rows) -> validate, [validate_rows, train_rows) -> train,
    #   [train_rows, total_rows) -> test.
    # Label-based .loc slices include both endpoints, so positions are compared
    # directly to keep the three ranges from overlapping.
    row_position = np.arange(total_rows)
    dm_inputdf[dm_partitionvar] = np.select(
        [row_position < validate_rows, row_position < train_rows],
        [dm_partition_validate_val, dm_partition_train_val],
        default=dm_partition_test_val,
    )

In [33]:
##############################
### Final Modeling Columns ###
##############################

### create list of model variables
all_columns = list(dm_inputdf.columns.values)
macro_vars = [dm_dec_target, dm_partitionvar, dm_key]

### stop early when a requested predictor is not a column of the data;
### filtering alone would drop it silently and fit a smaller model
missing_predictors = [col for col in keep_predictors if col not in all_columns]
if missing_predictors:
    raise KeyError('keep_predictors not found in dm_inputdf: ' + ', '.join(map(str, missing_predictors)))

### split the columns into kept and rejected in one pass each;
### dm_input keeps the dataframe's column order
kept_lookup = set(keep_predictors)
rejected_predictors = [col for col in all_columns if col not in kept_lookup]
rejected_vars = rejected_predictors # + macro_vars (include macro_vars if rejected_predictors are explicitly listed - not contra keep_predictors)
dm_input = [col for col in all_columns if col in kept_lookup]
print(dm_input)

### create prediction variables: one P_ column per target level and one I_ column
dm_predictionvar = ['P_' + dm_dec_target + dm_classtarget_level[0], 'P_' + dm_dec_target + dm_classtarget_level[1]]
dm_classtarget_intovar = 'I_' + dm_dec_target

##################
### Data Split ###
##################

### create train, test, validate datasets using existing partition column
def _rows_in_partition(partition_value):
    """Return the rows of dm_inputdf whose partition column equals partition_value."""
    in_partition = dm_inputdf[dm_partitionvar] == partition_value
    return dm_inputdf.loc[in_partition]


dm_traindf = _rows_in_partition(dm_partition_train_val)
dm_testdf = _rows_in_partition(dm_partition_test_val)
dm_validdf = _rows_in_partition(dm_partition_validate_val)

### predictors (X) and target (y) for each partition
X_train, y_train = dm_traindf.loc[:, dm_input], dm_traindf[dm_dec_target]
X_test, y_test = dm_testdf.loc[:, dm_input], dm_testdf[dm_dec_target]
X_valid, y_valid = dm_validdf.loc[:, dm_input], dm_validdf[dm_dec_target]

['marital_status_single', 'checking_only_indicator', 'prior_ctr_indicator', 'address_change_2x_indicator', 'cross_border_trx_indicator', 'in_person_contact_indicator', 'linkedin_indicator', 'citizenship_country_risk', 'distance_to_employer', 'distance_to_bank']


In [34]:
##############################
### Training Code - Python ###
##############################

from sklearn.linear_model import LogisticRegression

### estimate & fit model
logit_settings = {
    'tol': 1e-8,
    'fit_intercept': True,
    'solver': 'newton-cg',
    'verbose': 0,
    'max_iter': 100,
}
dm_model = LogisticRegression(**logit_settings)
dm_model.fit(X_train, y_train)

### report the model score on each partition
for score_label, features, target in (
    ('score_train:', X_train, y_train),
    ('score_test:', X_test, y_test),
    ('score_valid:', X_valid, y_valid),
):
    print(score_label, dm_model.score(features, target))

score_train: 0.9660878685467894
score_test: 0.9615384615384616
score_valid: 0.9638778839431368


In [35]:
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

### score full data
fullX = dm_inputdf.loc[:, dm_input]
fully = dm_inputdf[dm_dec_target]
#plot_roc_curve(dm_model, fullX, fully)

### Build the prediction frames on the index of the scored rows. pd.concat with
### axis=1 aligns on index labels, so frames with a fresh 0..n-1 index would be
### matched to the wrong rows whenever dm_inputdf does not have that index.
dm_scoreddf_prob = pd.DataFrame(dm_model.predict_proba(fullX), columns=dm_predictionvar, index=fullX.index)
dm_scoreddf_class = pd.DataFrame(dm_model.predict(fullX), columns=[dm_classtarget_intovar], index=fullX.index)

### actual values of the bias variables and the target
columns_actual = bias_vars + [dm_dec_target]
dm_scoreddf_bias = pd.DataFrame(dm_inputdf, columns=columns_actual)

### predictions side by side with the actuals
dm_scoreddf = pd.concat([dm_scoreddf_prob, dm_scoreddf_class], axis=1)
scored = pd.concat([dm_scoreddf, dm_scoreddf_bias], axis=1)

### create tables with predicted values
def _actual_vs_predicted(actual, predicted_label, predicted_proba):
    """Return a frame with columns actual, predict and probability.

    The probability column is column 1 of predicted_proba; actual is
    re-indexed from 0 so the three pieces line up by position.
    """
    table = pd.concat(
        [
            actual.reset_index(drop=True),
            pd.Series(data=predicted_label),
            pd.Series(data=predicted_proba[:, 1]),
        ],
        axis=1,
    )
    table.columns = ['actual', 'predict', 'probability']
    return table


trainProba, trainProbaLabel = dm_model.predict_proba(X_train), dm_model.predict(X_train)
testProba, testProbaLabel = dm_model.predict_proba(X_test), dm_model.predict(X_test)
validProba, validProbaLabel = dm_model.predict_proba(X_valid), dm_model.predict(X_valid)

trainData = _actual_vs_predicted(y_train, trainProbaLabel, trainProba)
testData = _actual_vs_predicted(y_test, testProbaLabel, testProba)
validData = _actual_vs_predicted(y_valid, validProbaLabel, validProba)

### print model & results
predictions = dm_model.predict(X_test)
cols = X_train.columns
predictors = np.array(cols)

### Pass the fitted classes as labels so the matrix always has one row and one
### column per class, even when the test partition happens to contain a single
### class; the four-way unpack below relies on a 2x2 matrix.
tn, fp, fn, tp = confusion_matrix(y_test, predictions, labels=dm_model.classes_).ravel()

print(description)
print('model_parameters')
print(dm_model)
print(' ')
print('model_performance')
print('score_full:', dm_model.score(fullX, fully))
print('score_train:', dm_model.score(X_train, y_train))
print('score_test:', dm_model.score(X_test, y_test))
print('score_valid:', dm_model.score(X_valid, y_valid))
print('confusion_matrix:')
print('(tn, fp, fn, tp)')
print((tn, fp, fn, tp))
print('classification_report:')
print(classification_report(y_test, predictions))

### print logit odds ratios
### exp() of each fitted coefficient; coef_ is flattened so it pairs one-to-one
### with the predictor names (a length mismatch raises instead of mispairing)
orat = np.exp(dm_model.coef_)
### Build the table from named columns so odds_ratio keeps a numeric dtype;
### stacking names and numbers into one array would store both as objects.
c = pd.DataFrame({'predictors': predictors, 'odds_ratio': orat.ravel()})
print('intercept:')
print(dm_model.intercept_)
print('odds_ratios:')
print(c)

Logistic Regression
Logistic Regression
model_parameters
LogisticRegression(solver='newton-cg', tol=1e-08)
 
model_performance
score_full: 0.9649699342749266
score_train: 0.9660878685467894
score_test: 0.9615384615384616
score_valid: 0.9638778839431368
confusion_matrix:
(tn, fp, fn, tp)
(1345, 15, 40, 30)
classification_report:
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      1360
         1.0       0.67      0.43      0.52        70

    accuracy                           0.96      1430
   macro avg       0.82      0.71      0.75      1430
weighted avg       0.96      0.96      0.96      1430

intercept:
[-6.40563549]
odds_ratios:
                    predictors odds_ratio
0        marital_status_single   9.539825
1      checking_only_indicator   3.631731
2          prior_ctr_indicator   2.966805
3  address_change_2x_indicator   5.393386
4   cross_border_trx_indicator   5.111662
5  in_person_contact_indicator   0.140842
6          

In [13]:
###################################################
### Create Model Artifacts using sasctl package ###
###      Register Model in Model Manager        ###
###################################################






# target_df = y_train
# predictors = np.array(X_train.columns)
# target_event = dm_classtarget_level[1]
# num_target_categories = len(dm_classtarget_level)
# predict_method = str('{}.')+str(predict_syntax)+str('({})')


In [37]:
### delete in SageMaker ###

import json
import shutil

### create directories for metadata
output_path = Path(output_dir) / metadata_output_dir / model_name

### start from an empty folder: remove what an earlier run left behind
if output_path.is_dir():
    shutil.rmtree(output_path)

### create output path
os.makedirs(output_path)

### create python requirements file
### math and pickle are part of the Python standard library and cannot be
### installed with pip, so the command lists third-party packages only.
### NOTE(review): the pickled model is a scikit-learn estimator but scikit-learn
### is not listed here, and 'settings' is pinned as a PyPI package -- confirm
### both against the target scoring environment.
requirements = [
    {
        "step":"import math, pickle, pandas as pd, numpy as np, settings",
        "command":"pip3 install numpy==1.20.3 pandas==1.3.4 settings==0.2.2"
    }
]
with open(output_path / 'requirements.json', 'w') as outfile:
    json.dump(requirements, outfile, indent=4)

### copy this notebook to the output path
### right click script and copy path (change to forward slash)
src = str(git_dir) + '/python/logit_python/aml_bank/logit_python_aml_bank_sagemaker.ipynb'
dst = output_path
shutil.copy(src, dst)


'C:\\Users\\chparr\\OneDrive - SAS\\python\\outputs\\logit_python_aml_bank_sagemaker\\logit_python_aml_bank_sagemaker.ipynb'

In [10]:
import os
import json
import shutil

### create directories for metadata
### the SageMaker home folder is named once and reused for both paths below
sagemaker_home = Path('/home/sagemaker-user')
output_path = sagemaker_home / metadata_output_dir / model_name

### start from an empty folder: remove what an earlier run left behind
if output_path.is_dir():
    shutil.rmtree(output_path)

### create output path
os.makedirs(output_path)

### create python requirements file
### math and pickle are part of the Python standard library and cannot be
### installed with pip, so the command lists third-party packages only.
### NOTE(review): the pickled model is a scikit-learn estimator but scikit-learn
### is not listed here, and 'settings' is pinned as a PyPI package -- confirm
### both against the target scoring environment.
requirements = [
    {
        "step":"import math, pickle, pandas as pd, numpy as np, settings",
        "command":"pip3 install numpy==1.20.3 pandas==1.3.4 settings==0.2.2"
    }
]
with open(output_path / 'requirements.json', 'w') as outfile:
    json.dump(requirements, outfile, indent=4)

### copy this notebook to the output path
### right click script and copy path (change to forward slash)
src = str(sagemaker_home / 'logit_python_aml_bank_sagemaker.ipynb')
dst = output_path
shutil.copy(src, dst)


/home/sagemaker-user/logit_python_aml_bank_sagemaker.ipynb


PosixPath('/home/sagemaker-user/outputs/logit_python_aml_bank_workbench')

In [43]:
import sasctl.pzmm as pzmm

### example frames describing the model's inputs and outputs
input_df = X_train
output_vars = pd.DataFrame(data=[[0.5, 0.5, 'A']], columns=prediction_labels)

### create metadata
pzmm.PickleModel.pickle_trained_model(
    trained_model=dm_model,
    model_prefix=model_name,
    pickle_path=output_path,
)

### one variable file for the inputs, one for the outputs
for var_frame, frame_is_input in ((input_df, True), (output_vars, False)):
    pzmm.JSONFiles().write_var_json(input_data=var_frame, is_input=frame_is_input, json_path=output_path)

pzmm.JSONFiles().write_file_metadata_json(model_prefix=model_name, json_path=output_path)

### model properties, collected first and passed as keyword arguments
model_properties = {
    'model_name': model_name,
    'target_variable': dm_dec_target,
    'target_values': event_prob_var,
    'json_path': output_path,
    'model_desc': description,
    'model_algorithm': model_type,
    'model_function': model_function,
    'modeler': username,
    'train_table': table_name,
    'properties': None,
}
pzmm.JSONFiles().write_model_properties_json(**model_properties)

Model logit_python_aml_bank_sagemaker was successfully pickled and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_aml_bank_sagemaker\logit_python_aml_bank_sagemaker.pickle.
inputVar.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_aml_bank_sagemaker\inputVar.json
outputVar.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_aml_bank_sagemaker\outputVar.json
fileMetadata.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_aml_bank_sagemaker\fileMetadata.json
ModelProperties.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_aml_bank_sagemaker\ModelProperties.json


In [47]:
### Derive the pickle file name from model_name instead of repeating it as a
### literal, so renaming the model cannot leave the score code pointing at a
### stale file. The "<model_name>.pickle" form matches the file name reported
### when the model was pickled.
kwargs = {'model_file_name': model_name + '.pickle'}

### NOTE(review): [0.4, float] mixes an example value with a type -- confirm
### this is the intended description of predict_proba's two outputs.
pzmm.ScoreCode.write_score_code(
    model_prefix=model_name,
    input_data=input_df,
    predict_method=[dm_model.predict_proba, [0.4, float]],
    target_variable=dm_dec_target,
    target_values=event_prob_var,
    score_metrics=None,
    predict_threshold=None,
    model=None,
    pickle_type="pickle",
    missing_values=False,
    score_cas=True,
    score_code_path=output_path,
    target_index=None,
    **kwargs
    )

  warn(
  warn(
  warn(


In [48]:
### zip the files in output_path; the call stays the last expression of the
### cell so its return value is still displayed
zip_options = {
    'model_files': output_path,
    'model_prefix': model_name,
    'is_viya4': True,
}
pzmm.ZipModel.zip_files(**zip_options)

<_io.BytesIO at 0x2952833c8b0>