In [2]:
####################################################
###  Train & Register Python Scikit Logit Model  ###
####################################################

###################
### Credentials ###
###################

import os
import sys
from pathlib import Path

### folder that holds credentials.py; set the SAS_CREDENTIALS_DIR environment
### variable to point somewhere else without editing the notebook
credentials_dir = os.environ.get('SAS_CREDENTIALS_DIR', 'C:/Users/chparr/OneDrive - SAS/credentials')
### re-running this cell must not keep appending the same entry to sys.path
if credentials_dir not in sys.path:
    sys.path.append(credentials_dir)
from credentials import hostname, session, port, protocol, wd, output_dir, git_dir, token_dir, token, token_refresh, token_pem, username

In [3]:
#############################
### Connect with SAS Viya ###
#############################

import swat

### read the access token; the context manager closes the file handle
### (the bare open(...).read() left it open until garbage collection)
with open(token, "r") as token_file:
    access_token = token_file.read()
### token login: username stays None and the token is passed as the password
conn =  swat.CAS(hostname=hostname, username=None, password=access_token, ssl_ca_list=token_pem, protocol=protocol)
print(conn.serverstatus())

NOTE: Grid node action status report: 1 nodes, 9 total actions executed.
[About]

 {'CAS': 'Cloud Analytic Services',
  'CASCacheLocation': 'CAS Disk Cache',
  'CASHostAccountRequired': 'OPTIONAL',
  'Copyright': 'Copyright © 2014-2024 SAS Institute Inc. All Rights Reserved.',
  'ServerTime': '2024-06-07T19:14:58Z',
  'System': {'Hostname': 'controller.sas-cas-server-default.innovationlab.svc.cluster.local',
   'Linux Distribution': 'Red Hat Enterprise Linux release 8.9 (Ootpa)',
   'Model Number': 'x86_64',
   'OS Family': 'LIN X64',
   'OS Name': 'Linux',
   'OS Release': '5.15.0-1042-azure',
   'OS Version': '#49-Ubuntu SMP Tue Jul 11 17:28:46 UTC 2023'},
  'Transferred': 'NO',
  'Version': '4.00',
  'VersionLong': 'V.04.00M0P04152024',
  'Viya Release': '20240420.1713587951902',
  'Viya Version': 'Stable 2024.04',
  'license': {'expires': '06Sep2024:00:00:00',
   'gracePeriod': 0,
   'site': 'CIS SSEMONTHLY INNOVATION ENTERPRISE-RISK-MRM',
   'siteNum': 70180938,

[nodestatus]

 No

In [4]:
#############################
### Identify Table in CAS ###
#############################

### caslib and in-memory table name used for modeling
caslib = 'casuser'
in_mem_tbl = 'AML_BANK_PREP'

### load and promote the table only when it is not in memory yet
table_status = conn.table.tableExists(caslib=caslib, name=in_mem_tbl)
if table_status.exists <= 0:
    source_file = in_mem_tbl + '.sashdat'
    promoted_target = {'name': in_mem_tbl, 'caslib': caslib, 'promote': True}
    conn.table.loadTable(caslib=caslib, path=source_file, casout=promoted_target)

### display the table info as the cell result to verify
conn.table.tableInfo(caslib=caslib, wildIgnore=False, name=in_mem_tbl)

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime,TableRedistUpPolicy
0,AML_BANK_PREP,14302,27,0,utf-8,2024-04-30T14:36:02+00:00,2024-04-30T14:36:02+00:00,2024-05-23T23:01:18+00:00,UTF8,2030107000.0,0,0,AML_BANK_PREP.sashdat,CASUSER(Chris.Parrish@sas.com),0,Chris.Parrish@sas.com,,2024-04-16T21:17:04+00:00,2028921000.0,Not Specified


In [4]:
########################
### Create Dataframe ###
########################

### reference the in-memory CAS table, then materialize it as a dataframe
cas_table = conn.CASTable(in_mem_tbl, caslib=caslib)
dm_inputdf = cas_table.to_frame()

### column dtypes, for review before choosing model parameters
print(dm_inputdf.dtypes)

account_id                     float64
num_transactions               float64
credit_score                   float64
marital_status_single          float64
marital_status_married         float64
marital_status_divorced        float64
analytic_partition             float64
ml_indicator                   float64
checking_only_indicator        float64
prior_ctr_indicator            float64
address_change_2x_indicator    float64
cross_border_trx_indicator     float64
in_person_contact_indicator    float64
linkedin_indicator             float64
atm_deposit_indicator          float64
trx_10ksum_indicator           float64
common_merchant_indicator      float64
direct_deposit_indicator       float64
citizenship_country_risk       float64
occupation_risk                float64
num_acctbal_chgs_gt2000        float64
distance_to_employer           float64
distance_to_bank               float64
income                         float64
primary_transfer_cash          float64
primary_transfer_check   

In [5]:
########################
### Model Parameters ###
########################

### import python libraries
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

### keyword arguments unpacked into LogisticRegression(**logit_params) in the training cell
logit_params = dict(
    penalty='l2',
    dual=False,
    tol=0.0001,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='newton-cg',
    max_iter=100,
    multi_class='auto',
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None,
)
print(logit_params)

### model manager information
metadata_output_dir = 'outputs'
model_name = 'logit_python_amlbank'
project_name = 'Anti-Money Laundering'
description = 'Logistic Regression'
model_type = 'logistic_regression'
model_function = 'Classification'
predict_syntax = 'predict_proba'

### macro variables for the model: target, partition and key column names
dm_dec_target = 'ml_indicator'
dm_partitionvar = 'analytic_partition'
dm_key = 'account_id'
dm_classtarget_level = ['0', '1']

### 'yes' builds a fresh partition column further down; 'no' uses the existing one
create_new_partition = 'no'

### partition codes stored in dm_partitionvar
dm_partition_validate_val = 0
dm_partition_train_val = 1
dm_partition_test_val = 2

### share of rows per partition when a new partition column is created
dm_partition_validate_perc = 0.3
dm_partition_train_perc = 0.6
dm_partition_test_perc = 0.1

### regressors kept in the model; every other column is rejected
keep_predictors = [
    'marital_status_single',
    'checking_only_indicator',
    'prior_ctr_indicator',
    'address_change_2x_indicator',
    'cross_border_trx_indicator',
    'in_person_contact_indicator',
    'linkedin_indicator',
    'citizenship_country_risk',
    'distance_to_employer',
    'distance_to_bank',
]

### mlflow settings ('yes' / 'no' switch plus run, labels and predict call)
use_mlflow = 'no'
mlflow_run_to_use = 0
mlflow_class_labels = ['TENSOR']
mlflow_predict_syntax = 'predict'

### columns to consider in bias assessment
bias_vars = ['marital_status_single']

### columns to consider in partial dependency
pd_var1 = 'distance_to_employer'
pd_var2 = 'distance_to_bank'

### create partition column, if not already in dataset
if create_new_partition == 'yes':
    ### shuffle the rows, then rebuild a 0..n-1 index
    ### NOTE(review): shuffle() is called without random_state, so the partition differs between runs
    dm_inputdf = shuffle(dm_inputdf).reset_index(drop=True)
    validate_rows = round(len(dm_inputdf)*dm_partition_validate_perc)
    train_rows = round(len(dm_inputdf)*dm_partition_train_perc) + validate_rows
    ### disjoint position masks: [0, validate_rows) validate, [validate_rows, train_rows) train, rest test
    ### (the inclusive .loc label slices used before overlapped at both boundaries
    ### and were only correct because later assignments overwrote earlier ones)
    row_position = np.arange(len(dm_inputdf))
    in_validate = row_position < validate_rows
    in_train = ~in_validate & (row_position < train_rows)
    in_test = ~in_validate & ~in_train
    dm_inputdf.loc[in_validate, dm_partitionvar] = dm_partition_validate_val
    dm_inputdf.loc[in_train, dm_partitionvar] = dm_partition_train_val
    dm_inputdf.loc[in_test, dm_partitionvar] = dm_partition_test_val

{'penalty': 'l2', 'dual': False, 'tol': 0.0001, 'fit_intercept': True, 'intercept_scaling': 1, 'class_weight': None, 'random_state': None, 'solver': 'newton-cg', 'max_iter': 100, 'multi_class': 'auto', 'verbose': 0, 'warm_start': False, 'n_jobs': None, 'l1_ratio': None}


In [6]:
##############################
### Final Modeling Columns ###
##############################

### macro variables (target, partition and key column names) as a list
macro_vars = ' '.join([dm_dec_target, dm_partitionvar, dm_key]).split()

### every dataframe column that is not in keep_predictors is rejected
all_columns = list(dm_inputdf.columns.values)
rejected_predictors = [column for column in all_columns if column not in keep_predictors]
rejected_vars = rejected_predictors # + macro_vars (include macro_vars if rejected_predictors are explicitly listed - not contra keep_predictors)

### model inputs are the remaining columns, in dataframe column order
dm_input = [column for column in all_columns if column not in rejected_vars]
print(dm_input)

### names of the scored columns: one P_ column for each of the two target levels, one I_ column
dm_predictionvar = [f'P_{dm_dec_target}{dm_classtarget_level[position]}' for position in (0, 1)]
dm_classtarget_intovar = f'I_{dm_dec_target}'

['marital_status_single', 'checking_only_indicator', 'prior_ctr_indicator', 'address_change_2x_indicator', 'cross_border_trx_indicator', 'in_person_contact_indicator', 'linkedin_indicator', 'citizenship_country_risk', 'distance_to_employer', 'distance_to_bank']


In [7]:
##################
### Data Split ###
##################

### split rows by the value already stored in the partition column
partition_values = dm_inputdf[dm_partitionvar]

### training partition
dm_traindf = dm_inputdf.loc[partition_values == dm_partition_train_val]
X_train, y_train = dm_traindf[dm_input], dm_traindf[dm_dec_target]

### test partition
dm_testdf = dm_inputdf.loc[partition_values == dm_partition_test_val]
X_test, y_test = dm_testdf[dm_input], dm_testdf[dm_dec_target]

### validation partition
dm_validdf = dm_inputdf.loc[partition_values == dm_partition_validate_val]
X_valid, y_valid = dm_validdf[dm_input], dm_validdf[dm_dec_target]

In [8]:
#####################
### Training Code ###
#####################

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

### estimate & fit model
dm_model = LogisticRegression(**logit_params)
dm_model.fit(X_train, y_train)

### score full data
fullX = dm_inputdf.loc[:, dm_input]
fully = dm_inputdf[dm_dec_target]
### the score frames reuse the input index: the column-wise concat below aligns on
### index labels, so a fresh 0..n-1 index would pair scores with the wrong rows
### (or add NaN rows) whenever dm_inputdf is not indexed 0..n-1
dm_scoreddf_prob = pd.DataFrame(dm_model.predict_proba(fullX), columns=dm_predictionvar, index=fullX.index)
dm_scoreddf_class = pd.DataFrame(dm_model.predict(fullX), columns=[dm_classtarget_intovar], index=fullX.index)
### bias columns plus the actual target, taken from the input data
columns_actual = bias_vars + [dm_dec_target]
dm_scoreddf_bias = pd.DataFrame(dm_inputdf, columns=columns_actual)
### scored = probabilities, predicted class, bias columns and actual target side by side
dm_scoreddf = pd.concat([dm_scoreddf_prob, dm_scoreddf_class], axis=1)
scored = pd.concat([dm_scoreddf, dm_scoreddf_bias], axis=1)

### create tables with predicted values
def _actual_predict_probability(actual, label, proba):
    """Return a frame with the columns actual / predict / probability.

    ``actual`` is re-indexed from zero so that it lines up with the newly
    built series for the predicted label and the second ``proba`` column.
    """
    pieces = [
        actual.reset_index(drop=True),
        pd.Series(data=label),
        pd.Series(data=proba[:, 1]),
    ]
    frame = pd.concat(pieces, axis=1)
    frame.columns = ['actual', 'predict', 'probability']
    return frame

### class probabilities and predicted labels per partition
trainProba, trainProbaLabel = dm_model.predict_proba(X_train), dm_model.predict(X_train)
testProba, testProbaLabel = dm_model.predict_proba(X_test), dm_model.predict(X_test)
validProba, validProbaLabel = dm_model.predict_proba(X_valid), dm_model.predict(X_valid)

### actual vs. predicted tables per partition
trainData = _actual_predict_probability(y_train, trainProbaLabel, trainProba)
testData = _actual_predict_probability(y_test, testProbaLabel, testProba)
validData = _actual_predict_probability(y_valid, validProbaLabel, validProba)

### print model & results
predictions = dm_model.predict(X_test)
cols = X_train.columns
predictors = np.array(cols)
### ravel() flattens the confusion matrix row by row into the four counts unpacked here
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
### heading printed once (it was printed twice before)
print(description)
print('model_parameters')
print(dm_model)
print(' ')
print('model_performance')
### score() is evaluated on the full data and on each partition
print('score_full:', dm_model.score(fullX, fully))
print('score_train:', dm_model.score(X_train, y_train))
print('score_test:', dm_model.score(X_test, y_test))
print('score_valid:', dm_model.score(X_valid, y_valid))
### confusion matrix and classification report both use the test partition
print('confusion_matrix:')
print('(tn, fp, fn, tp)')
print((tn, fp, fn, tp))
print('classification_report:')
print(classification_report(y_test, predictions))

### print logit odds ratios
### odds ratio = exp(coefficient)
orat = np.exp(dm_model.coef_)
### build the table column by column so odds_ratio stays numeric; stacking the
### predictor names and the ratios into one array turned the ratios into
### objects, which printed with inconsistent precision
c = pd.DataFrame({'predictors': predictors, 'odds_ratio': orat.ravel()})
print('intercept:')
print(dm_model.intercept_)
print('odds_ratios:')
print(c)

Logistic Regression
Logistic Regression
model_parameters
LogisticRegression(solver='newton-cg')
 
model_performance
score_full: 0.9649699342749266
score_train: 0.9660878685467894
score_test: 0.9615384615384616
score_valid: 0.9638778839431368
confusion_matrix:
(tn, fp, fn, tp)
(1345, 15, 40, 30)
classification_report:
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      1360
         1.0       0.67      0.43      0.52        70

    accuracy                           0.96      1430
   macro avg       0.82      0.71      0.75      1430
weighted avg       0.96      0.96      0.96      1430

intercept:
[-6.40166319]
odds_ratios:
                    predictors odds_ratio
0        marital_status_single   9.517659
1      checking_only_indicator   3.631675
2          prior_ctr_indicator   2.963293
3  address_change_2x_indicator   5.393762
4   cross_border_trx_indicator    5.10149
5  in_person_contact_indicator   0.140815
6           linkedin_i

In [9]:
#######################################
### Register Model in Model Manager ###
## Ensure Model Does Not Exist in MM ##
##### Using PZMM Zips Up Metadata #####
#######################################

from sasctl import Session
import sasctl.pzmm as pzmm
from sasctl.services import model_repository as modelRepo 
from sasctl.tasks import register_model
import shutil
import json

### training inputs and target
input_df, target_df = X_train, y_train
predictors = np.array(X_train.columns)

### target description
target_level = 'BINARY'
target_event = dm_classtarget_level[1]
num_target_categories = len(dm_classtarget_level)

### template with two empty '{}' slots around the predict call name
predict_method = f'{{}}.{predict_syntax}({{}})'

### names of the score outputs and a one-row example frame with a value for each
prediction_labels = ['EM_CLASSIFICATION', 'EM_EVENTPROBABILITY']
output_vars = pd.DataFrame([['A', 0.5]], columns=prediction_labels)

In [10]:
### create directories for metadata
output_path = Path(output_dir) / metadata_output_dir / model_name
### remove the folder left by an earlier run so the cell starts from an empty one
if output_path.exists() and output_path.is_dir():
    shutil.rmtree(output_path)

### create output path
os.makedirs(output_path)

### create python requirements file
### math and pickle are standard-library modules, not pip packages (their old
### "versions" were Python version numbers), so they are no longer in the pip command;
### scikit-learn is added, pinned to the version that trained and pickled the model
import sklearn
requirements = [
    {
        "step":"import math, pickle, pandas as pd, numpy as np, settings",
        "command":"pip3 install numpy==1.20.3 pandas==1.3.4 scikit-learn==" + sklearn.__version__ + " settings==0.2.2"
    }
]
with open(output_path / 'requirements.json', 'w') as outfile:
    json.dump(requirements, outfile, indent=4)

### copy this notebook (.ipynb) to output path
### right click the notebook and copy path (change to forward slash)
src = str(git_dir) + str('/python/logit_python/aml_bank/logit_python_amlbank.ipynb')
print(src)
dst = output_path
shutil.copy(src, dst)
output_path

C:/Users/chparr/OneDrive - SAS/git/sas_viya/python/logit_python/aml_bank/logit_python_amlbank.ipynb


WindowsPath('C:/Users/chparr/OneDrive - SAS/python/outputs/logit_python_amlbank')

In [11]:
### create metadata
### pickle the fitted model into the output folder
pzmm.PickleModel.pickle_trained_model(pickle_path=output_path, model_prefix=model_name, trained_model=dm_model)

### one JSONFiles object serves all metadata writers below
json_writer = pzmm.JSONFiles()

### variable descriptions: model inputs first, then score outputs
json_writer.write_var_json(json_path=output_path, input_data=input_df, is_input=True)
json_writer.write_var_json(json_path=output_path, input_data=output_vars, is_input=False)

### file metadata
json_writer.write_file_metadata_json(json_path=output_path, model_prefix=model_name)

### model properties
model_properties = dict(
    model_name=model_name,
    target_variable=dm_dec_target,
    target_values=dm_classtarget_level,
    json_path=output_path,
    model_desc=description,
    model_algorithm=model_type,
    model_function=model_function,
    modeler=username,
    train_table=in_mem_tbl,
    properties=None,
)
json_writer.write_model_properties_json(**model_properties)

Model logit_python_amlbank was successfully pickled and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank\logit_python_amlbank.pickle.
inputVar.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank\inputVar.json
outputVar.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank\outputVar.json
fileMetadata.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank\fileMetadata.json
ModelProperties.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank\ModelProperties.json


In [12]:
### create a sasctl Session (not a CAS connection) on the `session` host,
### authenticating with the same access token used for the CAS connection
### NOTE(review): client_secret is the literal string 'access_token', not the
### access_token variable - confirm whether this argument is needed at all
sess = Session(hostname=session, token=access_token, client_secret='access_token')

In [13]:
### write the model statistics JSON files to output_path from the
### actual / predict / probability tables of the three partitions
### NOTE(review): prob_value=0.11 is an unexplained constant and differs from the
### cutoff=0.5 used in the bias assessment - confirm the intended threshold
pzmm.JSONFiles().calculate_model_statistics(
    target_value=int(dm_classtarget_level[1]), 
    prob_value=0.11, 
    train_data=trainData, 
    test_data=testData, 
    validate_data=validData, 
    json_path=output_path)

dmcas_fitstat.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank\dmcas_fitstat.json
dmcas_roc.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank\dmcas_roc.json
dmcas_lift.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank\dmcas_lift.json


In [14]:
### assess model bias on the scored full dataset for the columns in bias_vars;
### pred_values is None, so the probability columns and the cutoff are what is supplied
### the resulting JSON files are written to output_path
pzmm.JSONFiles().assess_model_bias(
    score_table=scored, 
    sensitive_values=bias_vars, 
    actual_values=dm_dec_target,
    pred_values=None,
    prob_values=dm_predictionvar,
    levels=dm_classtarget_level,
    cutoff=0.5,
    json_path=output_path)

  pzmm.JSONFiles().assess_model_bias(


maxDifferences.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank\maxDifferences.json
groupMetrics.json was successfully written and saved to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank\groupMetrics.json


  json_files = cls.bias_dataframes_to_json(


In [15]:
### import to model manager
### uploads the files in output_path as model `model_name` into project `project_name`
### NOTE(review): overwrite_model=False - the cell header above says to ensure the
### model does not already exist in Model Manager; confirm behavior on a re-run
### NOTE(review): predict_method declares [int, int] return types for predict_proba - confirm
pzmm.ImportModel().import_model(
    model_files=output_path, 
    model_prefix=model_name, 
    project=project_name, 
    input_data=input_df,
    predict_method=[dm_model.predict_proba, [int, int]],
    score_metrics=prediction_labels,
    pickle_type='pickle',
    project_version='latest',
    missing_values=False,
    overwrite_model=False,
    mlflow_details=None,
    predict_threshold=None,
    target_values=dm_classtarget_level,
    overwrite_project_properties=False,
    target_index=1,
    model_file_name=model_name + str('.pickle'))

  warn(


Model score code was written successfully to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank\score_logit_python_amlbank.py and uploaded to SAS Model Manager.
All model files were zipped to C:\Users\chparr\OneDrive - SAS\python\outputs\logit_python_amlbank.


  warn(


Model was successfully imported into SAS Model Manager as logit_python_amlbank with the following UUID: d3d9ba5c-5134-40e4-b47d-e2217aa2adeb.


(<class 'sasctl.core.RestObj'>(headers={'Date': 'Fri, 03 May 2024 18:07:20 GMT', 'Content-Type': 'application/vnd.sas.collection+json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Cache-Control': 'no-cache, no-store, max-age=0, must-revalidate', 'Content-Security-Policy': "default-src 'self'; object-src 'none'; frame-ancestors 'self'; form-action 'self';", 'Expires': '0', 'Pragma': 'no-cache', 'Sas-Service-Response-Flag': 'true', 'Vary': 'Origin', 'X-Content-Type-Options': 'nosniff', 'X-Xss-Protection': '1; mode=block', 'Strict-Transport-Security': 'max-age=63072000; includeSubDomains'}, data={'creationTimeStamp': '2024-05-03T18:07:19.200Z', 'createdBy': 'Chris.Parrish@sas.com', 'modifiedTimeStamp': '2024-05-03T18:07:20.358Z', 'modifiedBy': 'Chris.Parrish@sas.com', 'id': 'd3d9ba5c-5134-40e4-b47d-e2217aa2adeb', 'name': 'logit_python_amlbank', 'description': 'Logistic Regression', 'role': 'plain', 'scoreCodeType': 'python', 'algorithm': 'logistic_regressio