![](./images/Title.PNG)
<div class="alert alert-block alert-info"> <b> </b> 
</div>

# Fraud detection from transactions
![](./images/workflow_fraud.PNG)

<div class="alert alert-block alert-info"> <b> </b> 
</div>

## 1 - Connect to Vantage
<div class="alert alert-block alert-info"> <b> </b> 
</div>

![](./images/Slide32.PNG)

In [1]:
# Standard-library imports
import getpass
import json
import sys
from datetime import datetime

# Third-party imports
import teradataml as tdml

# teradataml configuration used by the rest of the notebook.
# NOTE(review): "mldb" is presumably the database where the BYOM functions are installed - confirm.
tdml.options.configure.byom_install_location = "mldb"
# NOTE(review): presumably disables echoing of the generated SQL-MR queries - confirm.
tdml.display.print_sqlmr_query = False

# Timestamp taken when the notebook starts running.
tic = datetime.now()

In [2]:
# Show the installed teradataml version so it is recorded in the cell output.
tdml.__version__

'17.20.00.04'

In [3]:
from aoa import (
    record_training_stats,
    save_plot,
    aoa_create_context,
    ModelContext
)
import os

In [4]:
# Path to the local project repository for this model demo.
# The default is machine-specific; set the MODEL_LOCAL_PATH environment variable
# to point the notebook at another checkout without editing this cell.
model_local_path = os.environ.get(
    'MODEL_LOCAL_PATH',
    'C:/Users/dm250067/OneDrive - Teradata/Documents/01 - Code Development/modelops-demo-models/model_definitions/transaction_fraud_indb'
)

# os.makedirs creates any missing parent folders and does nothing when the folder
# already exists. Unlike shelling out to `mkdir -p`, it behaves the same on Windows
# and POSIX and raises an exception if the folder cannot be created.
os.makedirs(f"{model_local_path}/model_modules", exist_ok=True)

In [5]:
# Settings shared by every Vantage system used in this demo.
# The password is not stored in clear text: the ENCRYPTED_PASSWORD(...) value points
# at two property files (alternative: prompt interactively with getpass.getpass()).
connection_defaults = {
    'user'               : 'dm250067',
    'password'           : "ENCRYPTED_PASSWORD(file:{},file:{})".format('../../PassKey.properties', '../../EncPass.properties'),
    'logmech'            : 'LDAP',
    'temp_database_name' : 'dm250067'
}

# Host and default database of each available system.
systems = {
    'tdprd2': {'host': 'tdprd2.td.teradata.com', 'database': 'ADLSLSEMEA_DEMO_BANKING'},
    'tdprd3': {'host': 'tdprd3.td.teradata.com', 'database': 'ADLDSD_CHURN'},
}

# System to connect to. This cell used to assign Param twice, the second assignment
# (tdprd3) silently overriding the first; the choice is now explicit.
active_system = 'tdprd3'

Param = {**connection_defaults, **systems[active_system]}

tdml.create_context(**Param)



Engine(teradatasql://:***@tdprd3.td.teradata.com/?DATABASE=ADLDSD_CHURN&LOGDATA=%2A%2A%2A&LOGMECH=%2A%2A%2A&USER=DM250067)

## 2 - Define Training Function

In [6]:
%%writefile "$model_local_path/model_modules/training.py"
from teradataml import (
    DataFrame,
    OneHotEncodingFit,
    OneHotEncodingTransform,
    ScaleFit,
    ScaleTransform,
    DecisionForest,
    configure
)
from aoa import (
    record_training_stats,
    save_plot,
    aoa_create_context,
    ModelContext
)
import numpy as np

# NOTE(review): presumably the database holding the Vantage Analytics Library (VAL)
# functions, which record_training_stats relies on - confirm the value for the target system.
configure.val_install_location = 'TRNG_XSP'

def plot_roc_curve(fi, img_filename):
    """Save a horizontal bar chart of the ten largest feature importances.

    Despite its name, this helper does not draw a ROC curve; the name is kept
    unchanged so that existing callers keep working.

    Args:
        fi: Feature importances in any form accepted by ``pandas.Series``
            (for example a ``{feature_name: importance}`` dict).
        img_filename: Path of the image file to write.

    Returns:
        None. The chart is written to ``img_filename``.
    """
    import pandas as pd
    import matplotlib.pyplot as plt

    # Draw on a dedicated figure so that anything left on the current pyplot
    # figure by earlier plotting code cannot end up in the saved image.
    fig, ax = plt.subplots()
    feat_importances = pd.Series(fi)
    feat_importances.nlargest(10).plot(kind='barh', ax=ax).set_title('Feature Importance')
    fig.savefig(img_filename, dpi=500)
    # Close (rather than only clear) the figure so it does not stay registered
    # with pyplot after the image has been written.
    plt.close(fig)


def train(context: ModelContext, **kwargs):
    """Train the fraud-detection decision forest with in-database functions.

    Steps, in order:
      1. optionally one-hot encode the ``type`` feature (only when it is listed
         in the feature names),
      2. scale the features with ScaleFit / ScaleTransform,
      3. train a DecisionForest classifier,
      4. record the training statistics with ``record_training_stats``.

    The fitted one-hot encoder, the fitted scaler and the trained model are each
    saved to a table whose name is derived from ``context.model_version``.

    Args:
        context: ModelContext providing ``dataset_info`` (``sql``, ``feature_names``,
            ``target_names``, ``entity_key``), ``hyperparams`` and ``model_version``.
            The hyperparameters read are ``scale_method``, ``miss_value``,
            ``global_scale``, ``multiplier``, ``intercept``, ``max_depth``,
            ``num_trees``, ``min_node_size``, ``mtry``, ``mtry_seed`` and ``seed``.
        **kwargs: Accepted for interface compatibility; not used.

    Returns:
        None.

    Raises:
        KeyError: If one of the hyperparameters listed above is missing.
    """
    aoa_create_context()

    feature_names = context.dataset_info.feature_names
    target_name   = context.dataset_info.target_names[0]
    entity_key    = context.dataset_info.entity_key

    # Training dataset as a teradataml DataFrame (no conversion to pandas happens here).
    train_df      = DataFrame.from_query(context.dataset_info.sql)

    if 'type' in feature_names:
        print ("OneHotEncoding using InDB Functions...")

        # Distinct values of "type", obtained by counting the entity key per type.
        # Only this small aggregate is pulled to the client. The values are sorted
        # so that the order of the generated columns is the same on every run.
        transaction_types = sorted(
            train_df[['type', entity_key]].groupby(['type']).count().to_pandas()['type'].values
        )

        onehot = OneHotEncodingFit(data               = train_df,
                                   is_input_dense     = True,
                                   target_column      = '"type"',
                                   categorical_values = transaction_types,
                                   other_column       = "other"
                                  )

        train_df_onehot = OneHotEncodingTransform(data           = train_df,
                                                  object         = onehot.result,
                                                  is_input_dense = True
                                                 ).result

        # NOTE(review): the "$" is a literal character in the table name (Python
        # f-strings only interpolate "{...}"). It is left unchanged because other
        # modules presumably read the tables under these exact names - confirm.
        onehot.result.to_sql(f"onehot_${context.model_version}", if_exists="replace")
        print("Saved onehot")

        # The code assumes the encoder names its output columns "type_<value>".
        category_features = ['type_'+c for c in transaction_types]
        feature_names_after_one_hot = [c for c in feature_names if c != 'type'] + category_features
    else:
        train_df_onehot = train_df
        feature_names_after_one_hot = feature_names
        category_features = []

    print ("Scaling using InDB Functions...")

    scaler = ScaleFit(
        data           = train_df_onehot,
        target_columns = feature_names_after_one_hot,
        scale_method   = context.hyperparams["scale_method"],
        miss_value     = context.hyperparams["miss_value"],
        # str() lets the hyperparameter be given as a real bool or int as well as
        # the strings "true"/"1"; calling .lower() directly would fail on those.
        global_scale   = str(context.hyperparams["global_scale"]).lower() in ['true', '1'],
        multiplier     = context.hyperparams["multiplier"],
        intercept      = context.hyperparams["intercept"]
    )

    # Keep the target and the entity key alongside the scaled features.
    scaled_train = ScaleTransform(
        data           = train_df_onehot,
        object         = scaler.output,
        accumulate     = [target_name, entity_key]
    ).result

    scaler.output.to_sql(f"scaler_${context.model_version}", if_exists="replace")
    print("Saved scaler")

    print("Starting training...")
    model = DecisionForest(
        input_columns        = feature_names_after_one_hot,
        response_column      = target_name,
        data                 = scaled_train,
        max_depth            = context.hyperparams["max_depth"],
        num_trees            = context.hyperparams["num_trees"],
        min_node_size        = context.hyperparams["min_node_size"],
        mtry                 = context.hyperparams["mtry"],
        mtry_seed            = context.hyperparams["mtry_seed"],
        seed                 = context.hyperparams["seed"],
        tree_type            = 'CLASSIFICATION'
    )

    model.result.to_sql(f"model_${context.model_version}", if_exists="replace")
    print("Saved trained model")

    # Statistics are recorded on the encoded but unscaled data.
    # Feature importance is a placeholder: every feature is reported with 0.
    record_training_stats(
        train_df_onehot,
        features    = feature_names_after_one_hot,
        targets     = [target_name],
        categorical = [target_name]+category_features,
        feature_importance = {f:0 for f in feature_names_after_one_hot},
        context     = context
    )

Overwriting C:/Users/dm250067/OneDrive - Teradata/Documents/01 - Code Development/modelops-demo-models/model_definitions/transaction_fraud_indb/model_modules/training.py


In [7]:
# Print the signature and docstring of record_training_stats (development aid).
help(record_training_stats)

Help on function record_training_stats in module aoa.stats.stats:

record_training_stats(df: teradataml.dataframe.dataframe.DataFrame, features: List[str], targets: List[str] = [], categorical: List[str] = [], context: aoa.context.model_context.ModelContext = {}, feature_importance: Dict[str, float] = {}, **kwargs) -> Dict
    Compute and record the dataset statistics used for training. This information provides ModelOps with a snapshot
    of the dataset at this point in time (i.e. at the point of training). ModelOps uses this information for data and
    prediction drift monitoring. It can also be used for data quality monitoring as all of the information which is
    captured here is available to configure an alert on (e.g. max > some_threshold).
    
    Depending on the type of variable (categorical or continuous), different statistics and distributions are computed.
    All of this is computed in Vantage via the Vantage Analytics Library (VAL).
    
    Continuous Variable:
     

In [8]:
# Define the ModelContext to test with. The ModelContext is created and managed automatically by ModelOps 
# when it executes your code via CLI / UI. However, for testing in the notebook, you can define as follows
import importlib

from aoa import ModelContext, DatasetInfo

# define the training dataset 
sql = f"""
SELECT 
*
FROM {Param['database']}.transactions
where fold = 'train'
"""

# table holding the feature statistics metadata used by record_training_stats
feature_metadata =  {
    "database": Param['database'],
    "table": "aoa_statistics_metadata"
}

hyperparams = {
    # scaler
    "scale_method":"STD",
    "miss_value":"KEEP",
    "global_scale":"False",
    "multiplier":"1",
    "intercept":"0",
    # decision forest
    "max_depth": 15, 
    "num_trees": 72,
    "min_node_size": 1,
    "mtry": 6,
    "mtry_seed": 1,
    "seed": 1,
}

entity_key    = "txn_id"
target_names  = ["isFraud"]
feature_names = ['amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'type']

dataset_info = DatasetInfo(
    sql=sql,
    entity_key=entity_key,
    feature_names=feature_names,
    target_names=target_names,
    feature_metadata=feature_metadata
)

ctx = ModelContext(
    hyperparams=hyperparams,
    dataset_info=dataset_info,
    artifact_output_path=f'{model_local_path}/model_modules/artifacts/',
    model_version="InDB_v1",
    model_table="aoa_model_indb_v1"
)

# Make the generated module importable without growing sys.path on every re-run of this cell.
modules_dir = os.path.expanduser(f"{model_local_path}/model_modules")
if modules_dir not in sys.path:
    sys.path.append(modules_dir)

# training.py may have been rewritten by the %%writefile cell since it was first
# imported. A plain `import` would return the cached module and run stale code,
# so reload it when it is already loaded.
if "training" in sys.modules:
    training = importlib.reload(sys.modules["training"])
else:
    import training

training.train(context=ctx)

OneHotEncoding using InDB Functions...
Saved onehot
Scaling using InDB Functions...
Saved scaler
Starting training...
Saved trained model


Exception: Ensure feature statistics metadata in ADLDSD_CHURN.aoa_statistics_metadata are up to date. Attempted to compute stats for ['newbalanceorig', 'newbalancedest', 'oldbalanceorig', 'amount', 'oldbalancedest'] but only found the following continuous variables dict_keys([]).

In [None]:
# List the tables of the database currently selected in Param, so this check
# follows the connection settings instead of a hard-coded schema name.
tdml.db_list_tables(schema_name=Param['database'])

In [None]:
# Inspect the continuous-variable entries of the feature statistics metadata table.
# The table is taken from feature_metadata so it is the same one the training run uses.
tdml.execute_sql(
    f"SEL * FROM {feature_metadata['database']}.{feature_metadata['table']} WHERE column_type='continuous'"
).fetchall()

In [None]:
import pandas as pd

In [None]:
 # Same metadata query, loaded into a pandas DataFrame; the table comes from
 # feature_metadata so it follows the connection settings.
 pd.read_sql(f"SEL * FROM {feature_metadata['database']}.{feature_metadata['table']} WHERE column_type='continuous'", tdml.get_context())